PyPI - mcp-vector-search - Versions diffs - 0.0.3__py3-none-any.whl - Mend

mcp-vector-search 0.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mcp-vector-search might be problematic. Click here for more details.

Files changed (35) hide show

mcp_vector_search/__init__.py +9 -0
mcp_vector_search/cli/__init__.py +1 -0
mcp_vector_search/cli/commands/__init__.py +1 -0
mcp_vector_search/cli/commands/config.py +303 -0
mcp_vector_search/cli/commands/index.py +304 -0
mcp_vector_search/cli/commands/init.py +212 -0
mcp_vector_search/cli/commands/search.py +395 -0
mcp_vector_search/cli/commands/status.py +340 -0
mcp_vector_search/cli/commands/watch.py +288 -0
mcp_vector_search/cli/main.py +117 -0
mcp_vector_search/cli/output.py +242 -0
mcp_vector_search/config/__init__.py +1 -0
mcp_vector_search/config/defaults.py +175 -0
mcp_vector_search/config/settings.py +108 -0
mcp_vector_search/core/__init__.py +1 -0
mcp_vector_search/core/database.py +431 -0
mcp_vector_search/core/embeddings.py +250 -0
mcp_vector_search/core/exceptions.py +66 -0
mcp_vector_search/core/indexer.py +310 -0
mcp_vector_search/core/models.py +174 -0
mcp_vector_search/core/project.py +304 -0
mcp_vector_search/core/search.py +324 -0
mcp_vector_search/core/watcher.py +320 -0
mcp_vector_search/mcp/__init__.py +1 -0
mcp_vector_search/parsers/__init__.py +1 -0
mcp_vector_search/parsers/base.py +180 -0
mcp_vector_search/parsers/javascript.py +238 -0
mcp_vector_search/parsers/python.py +407 -0
mcp_vector_search/parsers/registry.py +187 -0
mcp_vector_search/py.typed +1 -0
mcp_vector_search-0.0.3.dist-info/METADATA +333 -0
mcp_vector_search-0.0.3.dist-info/RECORD +35 -0
mcp_vector_search-0.0.3.dist-info/WHEEL +4 -0
mcp_vector_search-0.0.3.dist-info/entry_points.txt +2 -0
mcp_vector_search-0.0.3.dist-info/licenses/LICENSE +21 -0

mcp_vector_search/core/project.py ADDED Viewed

@@ -0,0 +1,304 @@
+"""Project detection and management for MCP Vector Search."""
+import json
+from pathlib import Path
+from typing import List, Optional, Set
+from loguru import logger
+from ..config.defaults import (
+    DEFAULT_FILE_EXTENSIONS,
+    DEFAULT_IGNORE_PATTERNS,
+    get_default_config_path,
+    get_default_index_path,
+    get_language_from_extension,
+)
+from ..config.settings import ProjectConfig
+from .exceptions import (
+    ConfigurationError,
+    ProjectInitializationError,
+    ProjectNotFoundError,
+)
+from .models import ProjectInfo
+class ProjectManager:
+    """Manages project detection, initialization, and configuration."""
+    def __init__(self, project_root: Optional[Path] = None) -> None:
+        """Initialize project manager.
+        Args:
+            project_root: Project root directory. If None, will auto-detect.
+        """
+        self.project_root = project_root or self._detect_project_root()
+        self._config: Optional[ProjectConfig] = None
+    def _detect_project_root(self) -> Path:
+        """Auto-detect project root directory."""
+        current = Path.cwd()
+        # Look for common project indicators
+        indicators = [
+            ".git",
+            ".mcp-vector-search",
+            "pyproject.toml",
+            "package.json",
+            "Cargo.toml",
+            "go.mod",
+            "pom.xml",
+            "build.gradle",
+            ".project",
+        ]
+        # Walk up the directory tree
+        for path in [current] + list(current.parents):
+            for indicator in indicators:
+                if (path / indicator).exists():
+                    logger.debug(f"Detected project root: {path} (found {indicator})")
+                    return path
+        # Default to current directory
+        logger.debug(f"Using current directory as project root: {current}")
+        return current
+    def is_initialized(self) -> bool:
+        """Check if project is initialized for MCP Vector Search."""
+        config_path = get_default_config_path(self.project_root)
+        index_path = get_default_index_path(self.project_root)
+        return config_path.exists() and index_path.exists()
+    def initialize(
+        self,
+        file_extensions: Optional[List[str]] = None,
+        embedding_model: str = "microsoft/codebert-base",
+        similarity_threshold: float = 0.75,
+        force: bool = False,
+    ) -> ProjectConfig:
+        """Initialize project for MCP Vector Search.
+        Args:
+            file_extensions: File extensions to index
+            embedding_model: Embedding model to use
+            similarity_threshold: Similarity threshold for search
+            force: Force re-initialization if already exists
+        Returns:
+            Project configuration
+        Raises:
+            ProjectInitializationError: If initialization fails
+        """
+        if self.is_initialized() and not force:
+            raise ProjectInitializationError(
+                f"Project already initialized at {self.project_root}. Use --force to re-initialize."
+            )
+        try:
+            # Create index directory
+            index_path = get_default_index_path(self.project_root)
+            index_path.mkdir(parents=True, exist_ok=True)
+            # Detect languages and files
+            detected_languages = self.detect_languages()
+            file_count = self.count_indexable_files(file_extensions or DEFAULT_FILE_EXTENSIONS)
+            # Create configuration
+            config = ProjectConfig(
+                project_root=self.project_root,
+                index_path=index_path,
+                file_extensions=file_extensions or DEFAULT_FILE_EXTENSIONS,
+                embedding_model=embedding_model,
+                similarity_threshold=similarity_threshold,
+                languages=detected_languages,
+            )
+            # Save configuration
+            self.save_config(config)
+            logger.info(
+                f"Initialized project at {self.project_root}",
+                languages=detected_languages,
+                file_count=file_count,
+                extensions=config.file_extensions,
+            )
+            self._config = config
+            return config
+        except Exception as e:
+            raise ProjectInitializationError(f"Failed to initialize project: {e}") from e
+    def load_config(self) -> ProjectConfig:
+        """Load project configuration.
+        Returns:
+            Project configuration
+        Raises:
+            ProjectNotFoundError: If project is not initialized
+            ConfigurationError: If configuration is invalid
+        """
+        if not self.is_initialized():
+            raise ProjectNotFoundError(
+                f"Project not initialized at {self.project_root}. Run 'mcp-vector-search init' first."
+            )
+        config_path = get_default_config_path(self.project_root)
+        try:
+            with open(config_path, "r") as f:
+                config_data = json.load(f)
+            # Convert paths back to Path objects
+            config_data["project_root"] = Path(config_data["project_root"])
+            config_data["index_path"] = Path(config_data["index_path"])
+            config = ProjectConfig(**config_data)
+            self._config = config
+            return config
+        except Exception as e:
+            raise ConfigurationError(f"Failed to load configuration: {e}") from e
+    def save_config(self, config: ProjectConfig) -> None:
+        """Save project configuration.
+        Args:
+            config: Project configuration to save
+        Raises:
+            ConfigurationError: If saving fails
+        """
+        config_path = get_default_config_path(self.project_root)
+        config_path.parent.mkdir(parents=True, exist_ok=True)
+        try:
+            # Convert to JSON-serializable format
+            config_data = config.dict()
+            config_data["project_root"] = str(config.project_root)
+            config_data["index_path"] = str(config.index_path)
+            with open(config_path, "w") as f:
+                json.dump(config_data, f, indent=2)
+            logger.debug(f"Saved configuration to {config_path}")
+        except Exception as e:
+            raise ConfigurationError(f"Failed to save configuration: {e}") from e
+    @property
+    def config(self) -> ProjectConfig:
+        """Get project configuration, loading if necessary."""
+        if self._config is None:
+            self._config = self.load_config()
+        return self._config
+    def detect_languages(self) -> List[str]:
+        """Detect programming languages in the project.
+        Returns:
+            List of detected language names
+        """
+        languages: Set[str] = set()
+        for file_path in self._iter_source_files():
+            language = get_language_from_extension(file_path.suffix)
+            if language != "text":
+                languages.add(language)
+        return sorted(list(languages))
+    def count_indexable_files(self, extensions: List[str]) -> int:
+        """Count files that can be indexed.
+        Args:
+            extensions: File extensions to count
+        Returns:
+            Number of indexable files
+        """
+        count = 0
+        for file_path in self._iter_source_files():
+            if file_path.suffix in extensions:
+                count += 1
+        return count
+    def get_project_info(self) -> ProjectInfo:
+        """Get comprehensive project information.
+        Returns:
+            Project information
+        """
+        config_path = get_default_config_path(self.project_root)
+        index_path = get_default_index_path(self.project_root)
+        is_initialized = self.is_initialized()
+        languages = []
+        file_count = 0
+        if is_initialized:
+            try:
+                config = self.config
+                languages = config.languages
+                file_count = self.count_indexable_files(config.file_extensions)
+            except Exception:
+                # Ignore errors when getting detailed info
+                pass
+        return ProjectInfo(
+            name=self.project_root.name,
+            root_path=self.project_root,
+            config_path=config_path,
+            index_path=index_path,
+            is_initialized=is_initialized,
+            languages=languages,
+            file_count=file_count,
+        )
+    def _iter_source_files(self) -> List[Path]:
+        """Iterate over source files in the project.
+        Returns:
+            List of source file paths
+        """
+        files = []
+        for path in self.project_root.rglob("*"):
+            if not path.is_file():
+                continue
+            # Skip ignored patterns
+            if self._should_ignore_path(path):
+                continue
+            files.append(path)
+        return files
+    def _should_ignore_path(self, path: Path) -> bool:
+        """Check if a path should be ignored.
+        Args:
+            path: Path to check
+        Returns:
+            True if path should be ignored
+        """
+        # Check if any parent directory is in ignore patterns
+        for part in path.parts:
+            if part in DEFAULT_IGNORE_PATTERNS:
+                return True
+        # Check relative path from project root
+        try:
+            relative_path = path.relative_to(self.project_root)
+            for part in relative_path.parts:
+                if part in DEFAULT_IGNORE_PATTERNS:
+                    return True
+        except ValueError:
+            # Path is not relative to project root
+            return True
+        return False

mcp_vector_search/core/search.py ADDED Viewed

@@ -0,0 +1,324 @@
+"""Semantic search engine for MCP Vector Search."""
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+from loguru import logger
+from .database import VectorDatabase
+from .exceptions import SearchError
+from .models import SearchResult
+class SemanticSearchEngine:
+    """Semantic search engine for code search."""
+    def __init__(
+        self,
+        database: VectorDatabase,
+        project_root: Path,
+        similarity_threshold: float = 0.7,
+    ) -> None:
+        """Initialize semantic search engine.
+        Args:
+            database: Vector database instance
+            project_root: Project root directory
+            similarity_threshold: Default similarity threshold
+        """
+        self.database = database
+        self.project_root = project_root
+        self.similarity_threshold = similarity_threshold
+    async def search(
+        self,
+        query: str,
+        limit: int = 10,
+        filters: Optional[Dict[str, Any]] = None,
+        similarity_threshold: Optional[float] = None,
+        include_context: bool = True,
+    ) -> List[SearchResult]:
+        """Perform semantic search for code.
+        Args:
+            query: Search query
+            limit: Maximum number of results
+            filters: Optional filters (language, file_path, etc.)
+            similarity_threshold: Minimum similarity score
+            include_context: Whether to include context lines
+        Returns:
+            List of search results
+        """
+        if not query.strip():
+            return []
+        threshold = similarity_threshold or self.similarity_threshold
+        try:
+            # Preprocess query
+            processed_query = self._preprocess_query(query)
+            # Perform vector search
+            results = await self.database.search(
+                query=processed_query,
+                limit=limit,
+                filters=filters,
+                similarity_threshold=threshold,
+            )
+            # Post-process results
+            enhanced_results = []
+            for result in results:
+                enhanced_result = await self._enhance_result(result, include_context)
+                enhanced_results.append(enhanced_result)
+            # Apply additional ranking if needed
+            ranked_results = self._rerank_results(enhanced_results, query)
+            logger.debug(f"Search for '{query}' returned {len(ranked_results)} results")
+            return ranked_results
+        except Exception as e:
+            logger.error(f"Search failed for query '{query}': {e}")
+            raise SearchError(f"Search failed: {e}") from e
+    async def search_similar(
+        self,
+        file_path: Path,
+        function_name: Optional[str] = None,
+        limit: int = 10,
+        similarity_threshold: Optional[float] = None,
+    ) -> List[SearchResult]:
+        """Find code similar to a specific function or file.
+        Args:
+            file_path: Path to the reference file
+            function_name: Specific function name (optional)
+            limit: Maximum number of results
+            similarity_threshold: Minimum similarity score
+        Returns:
+            List of similar code results
+        """
+        try:
+            # Read the reference file
+            with open(file_path, "r", encoding="utf-8") as f:
+                content = f.read()
+            # If function name is specified, try to extract just that function
+            if function_name:
+                function_content = self._extract_function_content(content, function_name)
+                if function_content:
+                    content = function_content
+            # Use the content as the search query
+            return await self.search(
+                query=content,
+                limit=limit,
+                similarity_threshold=similarity_threshold,
+                include_context=True,
+            )
+        except Exception as e:
+            logger.error(f"Similar search failed for {file_path}: {e}")
+            raise SearchError(f"Similar search failed: {e}") from e
+    async def search_by_context(
+        self,
+        context_description: str,
+        focus_areas: Optional[List[str]] = None,
+        limit: int = 10,
+    ) -> List[SearchResult]:
+        """Search for code based on contextual description.
+        Args:
+            context_description: Description of what you're looking for
+            focus_areas: Areas to focus on (e.g., ["security", "authentication"])
+            limit: Maximum number of results
+        Returns:
+            List of contextually relevant results
+        """
+        # Build enhanced query with focus areas
+        query_parts = [context_description]
+        if focus_areas:
+            query_parts.extend(focus_areas)
+        enhanced_query = " ".join(query_parts)
+        return await self.search(
+            query=enhanced_query,
+            limit=limit,
+            include_context=True,
+        )
+    def _preprocess_query(self, query: str) -> str:
+        """Preprocess search query for better results.
+        Args:
+            query: Raw search query
+        Returns:
+            Processed query
+        """
+        # Remove extra whitespace
+        query = re.sub(r"\s+", " ", query.strip())
+        # Expand common abbreviations
+        expansions = {
+            "auth": "authentication",
+            "db": "database",
+            "api": "application programming interface",
+            "ui": "user interface",
+            "util": "utility",
+            "config": "configuration",
+        }
+        words = query.lower().split()
+        expanded_words = []
+        for word in words:
+            if word in expansions:
+                expanded_words.extend([word, expansions[word]])
+            else:
+                expanded_words.append(word)
+        return " ".join(expanded_words)
+    async def _enhance_result(
+        self, result: SearchResult, include_context: bool
+    ) -> SearchResult:
+        """Enhance search result with additional information.
+        Args:
+            result: Original search result
+            include_context: Whether to include context lines
+        Returns:
+            Enhanced search result
+        """
+        if not include_context:
+            return result
+        try:
+            # Read the source file to get context
+            with open(result.file_path, "r", encoding="utf-8") as f:
+                lines = f.readlines()
+            # Get context lines before and after
+            context_size = 3
+            start_idx = max(0, result.start_line - 1 - context_size)
+            end_idx = min(len(lines), result.end_line + context_size)
+            context_before = [
+                line.rstrip() for line in lines[start_idx : result.start_line - 1]
+            ]
+            context_after = [
+                line.rstrip() for line in lines[result.end_line : end_idx]
+            ]
+            # Update result with context
+            result.context_before = context_before
+            result.context_after = context_after
+        except Exception as e:
+            logger.warning(f"Failed to get context for {result.file_path}: {e}")
+        return result
+    def _rerank_results(
+        self, results: List[SearchResult], query: str
+    ) -> List[SearchResult]:
+        """Apply additional ranking to search results.
+        Args:
+            results: Original search results
+            query: Original search query
+        Returns:
+            Reranked search results
+        """
+        # Simple reranking based on additional factors
+        query_lower = query.lower()
+        for result in results:
+            # Boost score for exact matches in function/class names
+            boost = 0.0
+            if result.function_name and query_lower in result.function_name.lower():
+                boost += 0.1
+            if result.class_name and query_lower in result.class_name.lower():
+                boost += 0.1
+            # Boost score for matches in file name
+            if query_lower in result.file_path.name.lower():
+                boost += 0.05
+            # Apply boost
+            result.similarity_score = min(1.0, result.similarity_score + boost)
+        # Re-sort by similarity score
+        results.sort(key=lambda r: r.similarity_score, reverse=True)
+        # Update ranks
+        for i, result in enumerate(results):
+            result.rank = i + 1
+        return results
+    def _extract_function_content(self, content: str, function_name: str) -> Optional[str]:
+        """Extract content of a specific function from code.
+        Args:
+            content: Full file content
+            function_name: Name of function to extract
+        Returns:
+            Function content if found, None otherwise
+        """
+        # Simple regex-based extraction (could be improved with AST)
+        pattern = rf"^\s*def\s+{re.escape(function_name)}\s*\("
+        lines = content.splitlines()
+        for i, line in enumerate(lines):
+            if re.match(pattern, line):
+                # Found function start, now find the end
+                start_line = i
+                indent_level = len(line) - len(line.lstrip())
+                # Find end of function
+                end_line = len(lines)
+                for j in range(i + 1, len(lines)):
+                    if lines[j].strip():  # Skip empty lines
+                        current_indent = len(lines[j]) - len(lines[j].lstrip())
+                        if current_indent <= indent_level:
+                            end_line = j
+                            break
+                return "\n".join(lines[start_line:end_line])
+        return None
+    async def get_search_stats(self) -> Dict[str, Any]:
+        """Get search engine statistics.
+        Returns:
+            Dictionary with search statistics
+        """
+        try:
+            db_stats = await self.database.get_stats()
+            return {
+                "total_chunks": db_stats.total_chunks,
+                "languages": db_stats.languages,
+                "similarity_threshold": self.similarity_threshold,
+                "project_root": str(self.project_root),
+            }
+        except Exception as e:
+            logger.error(f"Failed to get search stats: {e}")
+            return {"error": str(e)}