mcp-vector-search 0.7.5__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/demo.py +2 -4
- mcp_vector_search/cli/commands/index.py +130 -30
- mcp_vector_search/cli/commands/mcp.py +83 -56
- mcp_vector_search/cli/commands/status.py +23 -9
- mcp_vector_search/cli/commands/visualize.py +523 -0
- mcp_vector_search/cli/main.py +16 -13
- mcp_vector_search/core/database.py +117 -54
- mcp_vector_search/core/indexer.py +262 -16
- mcp_vector_search/core/models.py +45 -1
- mcp_vector_search/core/project.py +6 -3
- mcp_vector_search/parsers/base.py +83 -0
- mcp_vector_search/parsers/javascript.py +350 -2
- mcp_vector_search/parsers/python.py +79 -0
- mcp_vector_search/utils/gitignore.py +31 -23
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/METADATA +1 -1
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/RECORD +20 -19
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py
CHANGED

@@ -1,5 +1,6 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""

+import asyncio
 import shutil
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -369,38 +370,67 @@ class ChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e

     async def get_stats(self) -> IndexStats:
-        """Get database statistics."""
+        """Get database statistics with optimized chunked queries."""
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")

         try:
-            # Get total count
+            # Get total count (fast operation)
             count = self._collection.count()

-
-
-
+            if count == 0:
+                return IndexStats(
+                    total_files=0,
+                    total_chunks=0,
+                    languages={},
+                    file_types={},
+                    index_size_mb=0.0,
+                    last_updated="N/A",
+                    embedding_model="unknown",
+                )

-            #
-
+            # Process in chunks to avoid loading everything at once
+            batch_size_limit = 1000

-
-            language_counts = {}
-            file_type_counts = {}
+            files = set()
+            language_counts: dict[str, int] = {}
+            file_type_counts: dict[str, int] = {}

-
-
-
-
+            offset = 0
+            while offset < count:
+                # Fetch batch
+                batch_size = min(batch_size_limit, count - offset)
+                logger.debug(
+                    f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                    f"{offset}-{offset + batch_size} of {count} chunks"
+                )

-
-
-
-
-
+                results = self._collection.get(
+                    include=["metadatas"],
+                    limit=batch_size,
+                    offset=offset,
+                )

-
-
+                # Process batch metadata
+                for metadata in results.get("metadatas", []):
+                    # Language stats
+                    lang = metadata.get("language", "unknown")
+                    language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                    # File stats
+                    file_path = metadata.get("file_path", "")
+                    if file_path:
+                        files.add(file_path)
+                        ext = Path(file_path).suffix or "no_extension"
+                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                offset += batch_size
+
+                # Yield to event loop periodically to prevent blocking
+                await asyncio.sleep(0)
+
+            # Estimate index size (rough approximation: ~1KB per chunk)
+            index_size_mb = count * 0.001

             return IndexStats(
                 total_files=len(files),
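The rewritten get_stats above pages through collection metadata with limit/offset and yields to the event loop between batches rather than loading every record in one call. Below is a minimal standalone sketch of the same pattern against a bare chromadb collection; the collection name and batch size are illustrative, not taken from the package.

# Sketch: paginate chromadb metadata in fixed-size batches instead of one get().
import asyncio

import chromadb


async def count_languages(collection, batch_size: int = 1000) -> dict[str, int]:
    """Tally the "language" metadata field without loading all chunks at once."""
    counts: dict[str, int] = {}
    total = collection.count()
    offset = 0
    while offset < total:
        step = min(batch_size, total - offset)
        batch = collection.get(
            include=["metadatas"],
            limit=step,
            offset=offset,
        )
        for metadata in batch.get("metadatas", []):
            lang = metadata.get("language", "unknown")
            counts[lang] = counts.get(lang, 0) + 1
        offset += step
        await asyncio.sleep(0)  # yield to the event loop between batches
    return counts


if __name__ == "__main__":
    client = chromadb.Client()  # in-memory client for the sketch
    collection = client.get_or_create_collection("code_chunks")  # illustrative name
    print(asyncio.run(count_languages(collection)))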
@@ -408,12 +438,13 @@ class ChromaVectorDatabase(VectorDatabase):
                 languages=language_counts,
                 file_types=file_type_counts,
                 index_size_mb=index_size_mb,
-                last_updated="unknown",
-                embedding_model="unknown",
+                last_updated="unknown",
+                embedding_model="unknown",
             )

         except Exception as e:
-            logger.error(f"Failed to get
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
             return IndexStats(
                 total_files=0,
                 total_chunks=0,
@@ -768,56 +799,88 @@ class PooledChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e

     async def get_stats(self) -> IndexStats:
-        """Get database statistics
+        """Get database statistics with connection pooling and chunked queries."""
         try:
             async with self._pool.get_connection() as conn:
-                # Get total count
+                # Get total count (fast operation)
                 count = conn.collection.count()

-
-
+                if count == 0:
+                    return IndexStats(
+                        total_files=0,
+                        total_chunks=0,
+                        languages={},
+                        file_types={},
+                        index_size_mb=0.0,
+                        last_updated="N/A",
+                        embedding_model="unknown",
+                    )
+
+                # Process in chunks to avoid loading everything at once
+                batch_size_limit = 1000

-                # Analyze languages and files
-                languages = set()
                 files = set()
+                language_counts: dict[str, int] = {}
+                file_type_counts: dict[str, int] = {}
+
+                offset = 0
+                while offset < count:
+                    # Fetch batch
+                    batch_size = min(batch_size_limit, count - offset)
+                    logger.debug(
+                        f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                        f"{offset}-{offset + batch_size} of {count} chunks"
+                    )

-
-
-
-
-
+                    results = conn.collection.get(
+                        include=["metadatas"],
+                        limit=batch_size,
+                        offset=offset,
+                    )

-
-
-
+                    # Process batch metadata
+                    for metadata in results.get("metadatas", []):
+                        # Language stats
+                        lang = metadata.get("language", "unknown")
+                        language_counts[lang] = language_counts.get(lang, 0) + 1

-
-
-
-
+                        # File stats
+                        file_path = metadata.get("file_path", "")
+                        if file_path:
+                            files.add(file_path)
+                            ext = Path(file_path).suffix or "no_extension"
+                            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1

-
-
-
-
-                file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+                    offset += batch_size
+
+                    # Yield to event loop periodically to prevent blocking
+                    await asyncio.sleep(0)

-                # Estimate index size (rough approximation)
-                index_size_mb = count * 0.001
+                # Estimate index size (rough approximation: ~1KB per chunk)
+                index_size_mb = count * 0.001

                 return IndexStats(
-                    total_chunks=count,
                     total_files=len(files),
+                    total_chunks=count,
                     languages=language_counts,
                     file_types=file_type_counts,
                     index_size_mb=index_size_mb,
-                    last_updated="unknown",
-                    embedding_model="unknown",
+                    last_updated="unknown",
+                    embedding_model="unknown",
                 )

         except Exception as e:
-            logger.error(f"Failed to get database
-
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
+            return IndexStats(
+                total_files=0,
+                total_chunks=0,
+                languages={},
+                file_types={},
+                index_size_mb=0.0,
+                last_updated="error",
+                embedding_model="unknown",
+            )

     async def remove_file_chunks(self, file_path: str) -> int:
         """Remove all chunks for a specific file using pooled connection."""
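Both the plain and pooled implementations now fall back to empty IndexStats on query failures instead of raising. A hypothetical consumer is sketched below, assuming `db` is any already-initialized VectorDatabase instance from this package.

# Sketch: read the stats produced by the new chunked get_stats.
import asyncio


async def print_index_summary(db) -> None:
    stats = await db.get_stats()  # returns empty stats instead of raising on query errors
    print(f"files indexed : {stats.total_files}")
    print(f"chunks stored : {stats.total_chunks}")
    print(f"approx. size  : {stats.index_size_mb:.1f} MB")
    for language, chunk_count in sorted(stats.languages.items()):
        print(f"  {language}: {chunk_count} chunks")


# asyncio.run(print_index_summary(db))  # run with a concrete db instance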
mcp_vector_search/core/indexer.py
CHANGED

@@ -57,6 +57,11 @@ class SemanticIndexer:
             project_root / ".mcp-vector-search" / "index_metadata.json"
         )

+        # Add cache for indexable files to avoid repeated filesystem scans
+        self._indexable_files_cache: list[Path] | None = None
+        self._cache_timestamp: float = 0
+        self._cache_ttl: float = 60.0  # 60 second TTL
+
         # Initialize gitignore parser
         try:
             self.gitignore_parser = create_gitignore_parser(project_root)
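The constructor now keeps the indexable-file list in a 60-second TTL cache so repeated status or index calls skip redundant filesystem scans. A generic sketch of the same TTL pattern in isolation follows; the class name and scan callable are illustrative, not part of the package.

# Sketch: TTL-cached file list, mirroring the 60-second default added above.
import time
from pathlib import Path


class FileListCache:
    def __init__(self, ttl: float = 60.0) -> None:
        self._files: list[Path] | None = None
        self._timestamp: float = 0.0
        self._ttl = ttl

    def get(self, scan) -> list[Path]:
        """Return cached files, rescanning only when the TTL has expired."""
        now = time.time()
        if self._files is not None and now - self._timestamp < self._ttl:
            return self._files
        self._files = sorted(scan())
        self._timestamp = now
        return self._files


# cache = FileListCache()
# files = cache.get(lambda: Path(".").rglob("*.py"))  # illustrative scan callable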
@@ -290,8 +295,11 @@ class SemanticIndexer:
             logger.debug(f"No chunks extracted from {file_path}")
             return True  # Not an error, just empty file

+        # Build hierarchical relationships between chunks
+        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
         # Add chunks to database
-        await self.database.add_chunks(
+        await self.database.add_chunks(chunks_with_hierarchy)

         # Update metadata after successful indexing
         metadata = self._load_index_metadata()
@@ -334,38 +342,120 @@ class SemanticIndexer:
             return 0

     def _find_indexable_files(self) -> list[Path]:
-        """Find all files that should be indexed.
+        """Find all files that should be indexed with caching.

         Returns:
             List of file paths to index
         """
+        import time
+
+        # Check cache
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Rebuild cache using efficient directory filtering
+        logger.debug("Rebuilding indexable files cache...")
+        indexable_files = self._scan_files_sync()
+
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Rebuilt indexable files cache ({len(indexable_files)} files)")
+
+        return self._indexable_files_cache
+
+    def _scan_files_sync(self) -> list[Path]:
+        """Synchronous file scanning (runs in thread pool).
+
+        Uses os.walk with directory filtering to avoid traversing ignored directories.
+
+        Returns:
+            List of indexable file paths
+        """
         indexable_files = []

-        for
-
-
+        # Use os.walk for efficient directory traversal with early filtering
+        for root, dirs, files in os.walk(self.project_root):
+            root_path = Path(root)
+
+            # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
+            # This is much more efficient than checking every file in ignored directories
+            # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
+            dirs[:] = [d for d in dirs if not self._should_ignore_path(root_path / d, is_directory=True)]
+
+            # Check each file in the current directory
+            # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
+            for filename in files:
+                file_path = root_path / filename
+                if self._should_index_file(file_path, skip_file_check=True):
+                    indexable_files.append(file_path)

-        return
+        return indexable_files
+
+    async def _find_indexable_files_async(self) -> list[Path]:
+        """Find all files asynchronously without blocking event loop.
+
+        Returns:
+            List of file paths to index
+        """
+        import time
+        from concurrent.futures import ThreadPoolExecutor
+
+        # Check cache first
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Run filesystem scan in thread pool to avoid blocking
+        logger.debug("Scanning files in background thread...")
+        loop = asyncio.get_running_loop()
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            indexable_files = await loop.run_in_executor(
+                executor, self._scan_files_sync
+            )

-
+        # Update cache
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Found {len(indexable_files)} indexable files")
+
+        return self._indexable_files_cache
+
+    def _should_index_file(self, file_path: Path, skip_file_check: bool = False) -> bool:
         """Check if a file should be indexed.

         Args:
             file_path: Path to check
+            skip_file_check: Skip is_file() check if caller knows it's a file (optimization)

         Returns:
             True if file should be indexed
         """
-        #
-
+        # PERFORMANCE: Check file extension FIRST (cheapest operation, no I/O)
+        # This eliminates most files without any filesystem calls
+        if file_path.suffix.lower() not in self.file_extensions:
             return False

-        #
-
+        # PERFORMANCE: Only check is_file() if not coming from os.walk
+        # os.walk already guarantees files, so we skip this expensive check
+        if not skip_file_check and not file_path.is_file():
             return False

         # Check if path should be ignored
-
+        # PERFORMANCE: Pass is_directory=False to skip stat() call (we know it's a file)
+        if self._should_ignore_path(file_path, is_directory=False):
             return False

         # Check file size (skip very large files)
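The new _scan_files_sync relies on os.walk pruning: assigning to dirs[:] stops the walk from ever descending into ignored directories. A standalone sketch of that pruning follows, with an illustrative ignore set and extension filter rather than the package's real rules.

# Sketch: in-place dirs[:] pruning with os.walk (top-down walk honors the mutation).
import os
from pathlib import Path

IGNORED_DIRS = {".git", "node_modules", ".mcp-vector-search"}  # illustrative only
EXTENSIONS = {".py", ".js", ".ts"}                              # illustrative only


def scan(project_root: Path) -> list[Path]:
    found: list[Path] = []
    for root, dirs, files in os.walk(project_root):
        # Mutating `dirs` in place prevents os.walk from descending into pruned dirs.
        dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]
        for name in files:
            path = Path(root) / name
            if path.suffix.lower() in EXTENSIONS:
                found.append(path)
    return found


# print(len(scan(Path("."))))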
@@ -379,18 +469,20 @@ class SemanticIndexer:

         return True

-    def _should_ignore_path(self, file_path: Path) -> bool:
+    def _should_ignore_path(self, file_path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.

         Args:
             file_path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)

         Returns:
             True if path should be ignored
         """
         try:
             # First check gitignore rules if available
-
+            # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path, is_directory=is_directory):
                 logger.debug(f"Path ignored by .gitignore: {file_path}")
                 return True

@@ -532,8 +624,8 @@ class SemanticIndexer:
         # Get database stats
         db_stats = await self.database.get_stats()

-        # Count indexable files
-        indexable_files = self.
+        # Count indexable files asynchronously without blocking
+        indexable_files = await self._find_indexable_files_async()

         return {
             "total_indexable_files": len(indexable_files),
@@ -553,3 +645,157 @@ class SemanticIndexer:
                 "indexed_files": 0,
                 "total_chunks": 0,
             }
+
+    async def get_files_to_index(
+        self, force_reindex: bool = False
+    ) -> tuple[list[Path], list[Path]]:
+        """Get all indexable files and those that need indexing.
+
+        Args:
+            force_reindex: Whether to force reindex of all files
+
+        Returns:
+            Tuple of (all_indexable_files, files_to_index)
+        """
+        # Find all indexable files
+        all_files = await self._find_indexable_files_async()
+
+        if not all_files:
+            return [], []
+
+        # Load existing metadata for incremental indexing
+        metadata = self._load_index_metadata()
+
+        # Filter files that need indexing
+        if force_reindex:
+            files_to_index = all_files
+            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
+        else:
+            files_to_index = [
+                f for f in all_files if self._needs_reindexing(f, metadata)
+            ]
+            logger.info(
+                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
+            )
+
+        return all_files, files_to_index
+
+    async def index_files_with_progress(
+        self,
+        files_to_index: list[Path],
+        force_reindex: bool = False,
+    ):
+        """Index files and yield progress updates for each file.
+
+        Args:
+            files_to_index: List of file paths to index
+            force_reindex: Whether to force reindexing
+
+        Yields:
+            Tuple of (file_path, chunks_added, success) for each processed file
+        """
+        metadata = self._load_index_metadata()
+
+        # Process files in batches for better memory management
+        for i in range(0, len(files_to_index), self.batch_size):
+            batch = files_to_index[i : i + self.batch_size]
+
+            # Process each file in the batch
+            for file_path in batch:
+                chunks_added = 0
+                success = False
+
+                try:
+                    # Always remove existing chunks when reindexing
+                    await self.database.delete_by_file(file_path)
+
+                    # Parse file into chunks
+                    chunks = await self._parse_file(file_path)
+
+                    if chunks:
+                        # Build hierarchical relationships
+                        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+                        # Add chunks to database
+                        await self.database.add_chunks(chunks_with_hierarchy)
+                        chunks_added = len(chunks)
+                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+
+                    success = True
+
+                    # Update metadata after successful indexing
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+
+                except Exception as e:
+                    logger.error(f"Failed to index file {file_path}: {e}")
+                    success = False
+
+                # Yield progress update
+                yield (file_path, chunks_added, success)
+
+        # Save metadata at the end
+        self._save_index_metadata(metadata)
+
+    def _build_chunk_hierarchy(self, chunks: list[CodeChunk]) -> list[CodeChunk]:
+        """Build parent-child relationships between chunks.
+
+        Logic:
+        - Module chunks (chunk_type="module") have depth 0
+        - Class chunks have depth 1, parent is module
+        - Method chunks have depth 2, parent is class
+        - Function chunks outside classes have depth 1, parent is module
+        - Nested classes increment depth
+
+        Args:
+            chunks: List of code chunks to process
+
+        Returns:
+            List of chunks with hierarchy relationships established
+        """
+        if not chunks:
+            return chunks
+
+        # Group chunks by type and name
+        module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
+        class_chunks = [c for c in chunks if c.chunk_type in ("class", "interface", "mixin")]
+        function_chunks = [c for c in chunks if c.chunk_type in ("function", "method", "constructor")]
+
+        # Build relationships
+        for func in function_chunks:
+            if func.class_name:
+                # Find parent class
+                parent_class = next(
+                    (c for c in class_chunks if c.class_name == func.class_name),
+                    None
+                )
+                if parent_class:
+                    func.parent_chunk_id = parent_class.chunk_id
+                    func.chunk_depth = parent_class.chunk_depth + 1
+                    if func.chunk_id not in parent_class.child_chunk_ids:
+                        parent_class.child_chunk_ids.append(func.chunk_id)
+            else:
+                # Top-level function
+                if not func.chunk_depth:
+                    func.chunk_depth = 1
+                # Link to module if exists
+                if module_chunks and not func.parent_chunk_id:
+                    func.parent_chunk_id = module_chunks[0].chunk_id
+                    if func.chunk_id not in module_chunks[0].child_chunk_ids:
+                        module_chunks[0].child_chunk_ids.append(func.chunk_id)
+
+        for cls in class_chunks:
+            # Classes without parent are top-level (depth 1)
+            if not cls.chunk_depth:
+                cls.chunk_depth = 1
+            # Link to module if exists
+            if module_chunks and not cls.parent_chunk_id:
+                cls.parent_chunk_id = module_chunks[0].chunk_id
+                if cls.chunk_id not in module_chunks[0].child_chunk_ids:
+                    module_chunks[0].child_chunk_ids.append(cls.chunk_id)
+
+        # Module chunks stay at depth 0
+        for mod in module_chunks:
+            if not mod.chunk_depth:
+                mod.chunk_depth = 0
+
+        return chunks
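The new get_files_to_index / index_files_with_progress pair splits planning from execution, with the latter acting as an async generator that reports per-file progress. A hypothetical driver is sketched below, assuming `indexer` is an already-constructed SemanticIndexer.

# Sketch: consume the incremental-indexing API added in this hunk.
import asyncio


async def reindex_with_progress(indexer, force: bool = False) -> None:
    all_files, files_to_index = await indexer.get_files_to_index(force_reindex=force)
    print(f"{len(files_to_index)} of {len(all_files)} files need indexing")

    done = failed = chunk_total = 0
    async for file_path, chunks_added, success in indexer.index_files_with_progress(
        files_to_index, force_reindex=force
    ):
        if success:
            done += 1
            chunk_total += chunks_added
        else:
            failed += 1
        print(f"[{done + failed}/{len(files_to_index)}] {file_path} ({chunks_added} chunks)")

    print(f"indexed {done} files ({chunk_total} chunks), {failed} failures")


# asyncio.run(reindex_with_progress(indexer))  # run with a concrete indexer instance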
mcp_vector_search/core/models.py
CHANGED

@@ -21,12 +21,40 @@ class CodeChunk:
     class_name: str | None = None
     docstring: str | None = None
     imports: list[str] = None
+
+    # Enhancement 1: Complexity scoring
     complexity_score: float = 0.0

+    # Enhancement 3: Hierarchical relationships
+    chunk_id: str | None = None
+    parent_chunk_id: str | None = None
+    child_chunk_ids: list[str] = None
+    chunk_depth: int = 0
+
+    # Enhancement 4: Enhanced metadata
+    decorators: list[str] = None
+    parameters: list[dict] = None
+    return_type: str | None = None
+    type_annotations: dict[str, str] = None
+
     def __post_init__(self) -> None:
-        """Initialize default values."""
+        """Initialize default values and generate chunk ID."""
         if self.imports is None:
             self.imports = []
+        if self.child_chunk_ids is None:
+            self.child_chunk_ids = []
+        if self.decorators is None:
+            self.decorators = []
+        if self.parameters is None:
+            self.parameters = []
+        if self.type_annotations is None:
+            self.type_annotations = {}
+
+        # Generate chunk ID if not provided
+        if self.chunk_id is None:
+            import hashlib
+            id_string = f"{self.file_path}:{self.chunk_type}:{self.start_line}:{self.end_line}"
+            self.chunk_id = hashlib.sha256(id_string.encode()).hexdigest()[:16]

     @property
     def id(self) -> str:
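__post_init__ now derives a stable chunk ID from the file path, chunk type, and line span: the first 16 hex characters of a SHA-256 over "file_path:chunk_type:start_line:end_line". The same derivation outside the dataclass, with illustrative input values:

# Sketch: recompute a chunk ID the way __post_init__ does.
import hashlib


def derive_chunk_id(file_path: str, chunk_type: str, start_line: int, end_line: int) -> str:
    id_string = f"{file_path}:{chunk_type}:{start_line}:{end_line}"
    return hashlib.sha256(id_string.encode()).hexdigest()[:16]


print(derive_chunk_id("src/app.py", "function", 10, 42))
# Same inputs always yield the same ID, so an unchanged chunk keeps a stable identity
# across re-indexing runs.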
@@ -52,6 +80,14 @@ class CodeChunk:
             "docstring": self.docstring,
             "imports": self.imports,
             "complexity_score": self.complexity_score,
+            "chunk_id": self.chunk_id,
+            "parent_chunk_id": self.parent_chunk_id,
+            "child_chunk_ids": self.child_chunk_ids,
+            "chunk_depth": self.chunk_depth,
+            "decorators": self.decorators,
+            "parameters": self.parameters,
+            "return_type": self.return_type,
+            "type_annotations": self.type_annotations,
         }

     @classmethod
@@ -69,6 +105,14 @@ class CodeChunk:
             docstring=data.get("docstring"),
             imports=data.get("imports", []),
             complexity_score=data.get("complexity_score", 0.0),
+            chunk_id=data.get("chunk_id"),
+            parent_chunk_id=data.get("parent_chunk_id"),
+            child_chunk_ids=data.get("child_chunk_ids", []),
+            chunk_depth=data.get("chunk_depth", 0),
+            decorators=data.get("decorators", []),
+            parameters=data.get("parameters", []),
+            return_type=data.get("return_type"),
+            type_annotations=data.get("type_annotations", {}),
         )

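from_dict reads every new field with a .get(...) default, so chunk records serialized before 0.8.0 (without hierarchy or metadata keys) still load. A generic sketch of that pattern on a toy dataclass, not the real CodeChunk:

# Sketch: .get(...) defaults keep deserialization backward compatible.
from dataclasses import dataclass, field


@dataclass
class Record:
    name: str
    chunk_depth: int = 0
    child_chunk_ids: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        return {
            "name": self.name,
            "chunk_depth": self.chunk_depth,
            "child_chunk_ids": self.child_chunk_ids,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Record":
        return cls(
            name=data["name"],
            chunk_depth=data.get("chunk_depth", 0),           # absent in older payloads
            child_chunk_ids=data.get("child_chunk_ids", []),  # absent in older payloads
        )


old_payload = {"name": "parse_file"}  # shape a pre-0.8 record might have
print(Record.from_dict(old_payload))  # defaults fill the missing fields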