mcp-vector-search 0.7.4__py3-none-any.whl → 0.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/demo.py +2 -4
- mcp_vector_search/cli/commands/index.py +130 -30
- mcp_vector_search/cli/commands/mcp.py +673 -36
- mcp_vector_search/cli/commands/status.py +23 -9
- mcp_vector_search/cli/main.py +2 -4
- mcp_vector_search/core/database.py +117 -54
- mcp_vector_search/core/indexer.py +191 -15
- mcp_vector_search/core/project.py +6 -3
- mcp_vector_search/utils/gitignore.py +31 -23
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/METADATA +1 -1
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/RECORD +15 -15
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/cli/commands/status.py
CHANGED

@@ -102,15 +102,28 @@ def main(
         if project_root is None:
             project_root = Path.cwd()

-        … (old lines not shown)
+        async def run_status_with_timeout():
+            """Run status command with timeout protection."""
+            try:
+                await asyncio.wait_for(
+                    show_status(
+                        project_root=project_root,
+                        verbose=verbose,
+                        health_check=health_check,
+                        mcp=mcp,
+                        json_output=json_output,
+                    ),
+                    timeout=30.0,  # 30 second timeout
+                )
+            except TimeoutError:
+                logger.error("Status check timed out after 30 seconds")
+                print_error(
+                    "Status check timed out after 30 seconds. "
+                    "Try running with --verbose for more details."
+                )
+                raise typer.Exit(1)
+
+        asyncio.run(run_status_with_timeout())

     except Exception as e:
         logger.error(f"Status check failed: {e}")
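The change above wraps the status coroutine in `asyncio.wait_for` so a hung database or filesystem call cannot stall the CLI indefinitely. A minimal, self-contained sketch of the same pattern (the `slow_status` coroutine and the 2-second limit are illustrative placeholders, not package code):

```python
import asyncio


async def slow_status() -> str:
    """Stand-in for a status check that may hang (hypothetical)."""
    await asyncio.sleep(5)
    return "ok"


async def run_with_timeout() -> None:
    try:
        result = await asyncio.wait_for(slow_status(), timeout=2.0)
        print(result)
    except asyncio.TimeoutError:  # the built-in TimeoutError on Python 3.11+
        print("status check timed out after 2 seconds")


asyncio.run(run_with_timeout())
```

On Python 3.11+ `asyncio.TimeoutError` is an alias of the built-in `TimeoutError`, which is why the diff can catch `TimeoutError` directly.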
@@ -162,6 +175,7 @@ async def show_status(
         file_extensions=config.file_extensions,
     )

+    # Get indexing stats (runs async file scanning in thread pool)
     async with database:
         index_stats = await indexer.get_indexing_stats()
         db_stats = await database.get_stats()
mcp_vector_search/cli/main.py
CHANGED

@@ -39,7 +39,7 @@ unfamiliar codebases, finding similar patterns, and integrating with AI tools.
   status     📊 Show project status
   search     🔍 Search code semantically
   index      📇 Index codebase
-  mcp        🤖 MCP integration
+  mcp        🤖 MCP integration for AI tools
   config     ⚙️ Configure settings
   help       ❓ Get help
   version    ℹ️ Show version

@@ -84,7 +84,7 @@ app.add_typer(search_app, name="search", help="🔍 Search code semantically")
 app.add_typer(index_app, name="index", help="📇 Index codebase for semantic search")

 # 7. MCP - MCP integration
-app.add_typer(mcp_app, name="mcp", help="🤖 Manage …
+app.add_typer(mcp_app, name="mcp", help="🤖 Manage MCP integration for AI tools")

 # 8. CONFIG - Configuration
 app.add_typer(config_app, name="config", help="⚙️ Manage project configuration")

@@ -122,8 +122,6 @@ def deprecated_install():
     _deprecated_command("install", "init")()


-
-
 # Deprecated: find -> search
 @app.command("find", hidden=True)
 def deprecated_find():
mcp_vector_search/core/database.py
CHANGED

@@ -1,5 +1,6 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""

+import asyncio
 import shutil
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -369,38 +370,67 @@ class ChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e

     async def get_stats(self) -> IndexStats:
-        """Get database statistics."""
+        """Get database statistics with optimized chunked queries."""
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")

         try:
-            # Get total count
+            # Get total count (fast operation)
             count = self._collection.count()

-            … (old lines not shown)
+            if count == 0:
+                return IndexStats(
+                    total_files=0,
+                    total_chunks=0,
+                    languages={},
+                    file_types={},
+                    index_size_mb=0.0,
+                    last_updated="N/A",
+                    embedding_model="unknown",
+                )

-            … (old lines not shown)
+            # Process in chunks to avoid loading everything at once
+            batch_size_limit = 1000

-            … (old lines not shown)
-            language_counts = {}
-            file_type_counts = {}
+            files = set()
+            language_counts: dict[str, int] = {}
+            file_type_counts: dict[str, int] = {}

-            … (old lines not shown)
+            offset = 0
+            while offset < count:
+                # Fetch batch
+                batch_size = min(batch_size_limit, count - offset)
+                logger.debug(
+                    f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                    f"{offset}-{offset + batch_size} of {count} chunks"
+                )

-            … (old lines not shown)
+                results = self._collection.get(
+                    include=["metadatas"],
+                    limit=batch_size,
+                    offset=offset,
+                )

-            … (old lines not shown)
+                # Process batch metadata
+                for metadata in results.get("metadatas", []):
+                    # Language stats
+                    lang = metadata.get("language", "unknown")
+                    language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                    # File stats
+                    file_path = metadata.get("file_path", "")
+                    if file_path:
+                        files.add(file_path)
+                        ext = Path(file_path).suffix or "no_extension"
+                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                offset += batch_size
+
+                # Yield to event loop periodically to prevent blocking
+                await asyncio.sleep(0)
+
+            # Estimate index size (rough approximation: ~1KB per chunk)
+            index_size_mb = count * 0.001

             return IndexStats(
                 total_files=len(files),
@@ -408,12 +438,13 @@ class ChromaVectorDatabase(VectorDatabase):
                 languages=language_counts,
                 file_types=file_type_counts,
                 index_size_mb=index_size_mb,
-                last_updated="unknown",
-                embedding_model="unknown",
+                last_updated="unknown",
+                embedding_model="unknown",
             )

         except Exception as e:
-            logger.error(f"Failed to get …
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
             return IndexStats(
                 total_files=0,
                 total_chunks=0,
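Both `get_stats` implementations now page through collection metadata with `limit`/`offset` instead of fetching every record in one call, and yield to the event loop between batches. A condensed sketch of that aggregation loop against a Chroma-style collection object (the `collection` argument and the `batch_size` default are assumptions for illustration; the pooled variant below runs the same loop through a pooled connection):

```python
import asyncio
from collections import Counter


async def aggregate_stats(collection, batch_size: int = 1000) -> dict:
    """Tally languages and distinct files by paging through chunk metadata."""
    total = collection.count()
    files: set[str] = set()
    languages: Counter = Counter()

    offset = 0
    while offset < total:
        batch = collection.get(
            include=["metadatas"],  # metadata only, no embeddings or documents
            limit=min(batch_size, total - offset),
            offset=offset,
        )
        for metadata in batch.get("metadatas", []):
            languages[metadata.get("language", "unknown")] += 1
            file_path = metadata.get("file_path", "")
            if file_path:
                files.add(file_path)
        offset += batch_size
        await asyncio.sleep(0)  # yield so other tasks are not starved

    return {
        "total_chunks": total,
        "total_files": len(files),
        "languages": dict(languages),
    }
```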
@@ -768,56 +799,88 @@ class PooledChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e

     async def get_stats(self) -> IndexStats:
-        """Get database statistics …
+        """Get database statistics with connection pooling and chunked queries."""
         try:
             async with self._pool.get_connection() as conn:
-                # Get total count
+                # Get total count (fast operation)
                 count = conn.collection.count()

-                … (old lines not shown)
+                if count == 0:
+                    return IndexStats(
+                        total_files=0,
+                        total_chunks=0,
+                        languages={},
+                        file_types={},
+                        index_size_mb=0.0,
+                        last_updated="N/A",
+                        embedding_model="unknown",
+                    )
+
+                # Process in chunks to avoid loading everything at once
+                batch_size_limit = 1000

-                # Analyze languages and files
-                languages = set()
                 files = set()
+                language_counts: dict[str, int] = {}
+                file_type_counts: dict[str, int] = {}
+
+                offset = 0
+                while offset < count:
+                    # Fetch batch
+                    batch_size = min(batch_size_limit, count - offset)
+                    logger.debug(
+                        f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                        f"{offset}-{offset + batch_size} of {count} chunks"
+                    )

-                … (old lines not shown)
+                    results = conn.collection.get(
+                        include=["metadatas"],
+                        limit=batch_size,
+                        offset=offset,
+                    )

-                … (old lines not shown)
+                    # Process batch metadata
+                    for metadata in results.get("metadatas", []):
+                        # Language stats
+                        lang = metadata.get("language", "unknown")
+                        language_counts[lang] = language_counts.get(lang, 0) + 1

-                … (old lines not shown)
+                        # File stats
+                        file_path = metadata.get("file_path", "")
+                        if file_path:
+                            files.add(file_path)
+                            ext = Path(file_path).suffix or "no_extension"
+                            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1

-                … (old lines not shown)
-                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+                    offset += batch_size
+
+                    # Yield to event loop periodically to prevent blocking
+                    await asyncio.sleep(0)

-                # Estimate index size (rough approximation)
-                index_size_mb = count * 0.001
+                # Estimate index size (rough approximation: ~1KB per chunk)
+                index_size_mb = count * 0.001

                 return IndexStats(
-                    total_chunks=count,
                     total_files=len(files),
+                    total_chunks=count,
                     languages=language_counts,
                     file_types=file_type_counts,
                     index_size_mb=index_size_mb,
-                    last_updated="unknown",
-                    embedding_model="unknown",
+                    last_updated="unknown",
+                    embedding_model="unknown",
                 )

         except Exception as e:
-            logger.error(f"Failed to get database …
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
+            return IndexStats(
+                total_files=0,
+                total_chunks=0,
+                languages={},
+                file_types={},
+                index_size_mb=0.0,
+                last_updated="error",
+                embedding_model="unknown",
+            )

     async def remove_file_chunks(self, file_path: str) -> int:
         """Remove all chunks for a specific file using pooled connection."""
mcp_vector_search/core/indexer.py
CHANGED

@@ -57,6 +57,11 @@ class SemanticIndexer:
             project_root / ".mcp-vector-search" / "index_metadata.json"
         )

+        # Add cache for indexable files to avoid repeated filesystem scans
+        self._indexable_files_cache: list[Path] | None = None
+        self._cache_timestamp: float = 0
+        self._cache_ttl: float = 60.0  # 60 second TTL
+
         # Initialize gitignore parser
         try:
             self.gitignore_parser = create_gitignore_parser(project_root)
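These attributes back a simple time-based cache: the scanned file list is reused for 60 seconds before the tree is walked again. A small sketch of the idea in isolation (the `CachedScanner` class and its `rglob` stand-in scan are hypothetical, not the package's implementation):

```python
import time
from pathlib import Path


class CachedScanner:
    """Reuse the result of an expensive scan for a fixed time window."""

    def __init__(self, ttl: float = 60.0) -> None:
        self._cache: list[Path] | None = None
        self._cache_timestamp: float = 0.0
        self._ttl = ttl

    def files(self) -> list[Path]:
        now = time.time()
        if self._cache is not None and now - self._cache_timestamp < self._ttl:
            return self._cache  # still fresh: skip the filesystem walk
        self._cache = sorted(Path.cwd().rglob("*.py"))  # stand-in for the real scan
        self._cache_timestamp = now
        return self._cache
```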
@@ -334,38 +339,120 @@ class SemanticIndexer:
             return 0

     def _find_indexable_files(self) -> list[Path]:
-        """Find all files that should be indexed.
+        """Find all files that should be indexed with caching.

         Returns:
             List of file paths to index
         """
+        import time
+
+        # Check cache
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Rebuild cache using efficient directory filtering
+        logger.debug("Rebuilding indexable files cache...")
+        indexable_files = self._scan_files_sync()
+
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Rebuilt indexable files cache ({len(indexable_files)} files)")
+
+        return self._indexable_files_cache
+
+    def _scan_files_sync(self) -> list[Path]:
+        """Synchronous file scanning (runs in thread pool).
+
+        Uses os.walk with directory filtering to avoid traversing ignored directories.
+
+        Returns:
+            List of indexable file paths
+        """
         indexable_files = []

-        for … (old loop not shown)
+        # Use os.walk for efficient directory traversal with early filtering
+        for root, dirs, files in os.walk(self.project_root):
+            root_path = Path(root)
+
+            # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
+            # This is much more efficient than checking every file in ignored directories
+            # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
+            dirs[:] = [d for d in dirs if not self._should_ignore_path(root_path / d, is_directory=True)]
+
+            # Check each file in the current directory
+            # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
+            for filename in files:
+                file_path = root_path / filename
+                if self._should_index_file(file_path, skip_file_check=True):
+                    indexable_files.append(file_path)

-        return …
+        return indexable_files

-    def …
+    async def _find_indexable_files_async(self) -> list[Path]:
+        """Find all files asynchronously without blocking event loop.
+
+        Returns:
+            List of file paths to index
+        """
+        import time
+        from concurrent.futures import ThreadPoolExecutor
+
+        # Check cache first
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Run filesystem scan in thread pool to avoid blocking
+        logger.debug("Scanning files in background thread...")
+        loop = asyncio.get_running_loop()
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            indexable_files = await loop.run_in_executor(
+                executor, self._scan_files_sync
+            )
+
+        # Update cache
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Found {len(indexable_files)} indexable files")
+
+        return self._indexable_files_cache
+
+    def _should_index_file(self, file_path: Path, skip_file_check: bool = False) -> bool:
         """Check if a file should be indexed.

         Args:
             file_path: Path to check
+            skip_file_check: Skip is_file() check if caller knows it's a file (optimization)

         Returns:
             True if file should be indexed
         """
-        # … (old checks not shown)
+        # PERFORMANCE: Check file extension FIRST (cheapest operation, no I/O)
+        # This eliminates most files without any filesystem calls
+        if file_path.suffix.lower() not in self.file_extensions:
             return False

-        # … (old checks not shown)
+        # PERFORMANCE: Only check is_file() if not coming from os.walk
+        # os.walk already guarantees files, so we skip this expensive check
+        if not skip_file_check and not file_path.is_file():
             return False

         # Check if path should be ignored
-        … (old check not shown)
+        # PERFORMANCE: Pass is_directory=False to skip stat() call (we know it's a file)
+        if self._should_ignore_path(file_path, is_directory=False):
             return False

         # Check file size (skip very large files)
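The rewritten scan prunes ignored directories by mutating `dirs` in place, so `os.walk` never descends into them, and it skips per-file `is_file()` checks because `os.walk` already separates files from directories. A minimal sketch of the pruning pattern (the `IGNORED_DIRS` set stands in for the gitignore-aware check used by the package):

```python
import os
from pathlib import Path

IGNORED_DIRS = {".git", "node_modules", ".mcp-vector-search"}  # illustrative only


def scan(root: Path, extensions: set[str]) -> list[Path]:
    """Walk a tree while pruning ignored directories before descending into them."""
    found: list[Path] = []
    for dirpath, dirnames, filenames in os.walk(root):
        # In-place filtering: os.walk will not recurse into the removed entries.
        dirnames[:] = [d for d in dirnames if d not in IGNORED_DIRS]
        for name in filenames:
            path = Path(dirpath) / name
            if path.suffix.lower() in extensions:  # cheap string check, no stat()
                found.append(path)
    return found


print(len(scan(Path("."), {".py"})))
```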
@@ -379,18 +466,20 @@ class SemanticIndexer:

         return True

-    def _should_ignore_path(self, file_path: Path) -> bool:
+    def _should_ignore_path(self, file_path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.

         Args:
             file_path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)

         Returns:
             True if path should be ignored
         """
         try:
             # First check gitignore rules if available
-            … (old check not shown)
+            # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path, is_directory=is_directory):
                 logger.debug(f"Path ignored by .gitignore: {file_path}")
                 return True

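The `is_directory` hint lets the ignore check skip a `stat()` call per path when the caller already knows the answer, as `os.walk` does through its `dirs`/`files` split. A generic sketch of the hint pattern (the `is_ignored` function and its single rule are illustrative, not the package's gitignore parser):

```python
from pathlib import Path


def is_ignored(path: Path, is_directory: bool | None = None) -> bool:
    """Directory-only ignore rule; stat() only when the caller gave no hint."""
    if is_directory is None:
        is_directory = path.is_dir()  # fallback costs one filesystem call
    # Illustrative rule: ignore any directory named "build".
    return is_directory and path.name == "build"


print(is_ignored(Path("build"), is_directory=True))  # decided without touching the filesystem
print(is_ignored(Path("build")))                     # falls back to path.is_dir()
```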
@@ -532,8 +621,8 @@ class SemanticIndexer:
         # Get database stats
         db_stats = await self.database.get_stats()

-        # Count indexable files
-        indexable_files = self. …
+        # Count indexable files asynchronously without blocking
+        indexable_files = await self._find_indexable_files_async()

         return {
             "total_indexable_files": len(indexable_files),
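`get_indexing_stats` now awaits `_find_indexable_files_async`, which runs the blocking `os.walk` scan on a worker thread so the event loop stays responsive. The underlying asyncio pattern, with a placeholder blocking function:

```python
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor


def blocking_scan() -> int:
    """Stand-in for a slow, synchronous filesystem walk."""
    time.sleep(0.5)
    return 1234


async def main() -> None:
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=1) as executor:
        # The event loop keeps servicing other tasks while the scan runs in the worker thread.
        count = await loop.run_in_executor(executor, blocking_scan)
    print(f"found {count} files")


asyncio.run(main())
```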
@@ -553,3 +642,90 @@ class SemanticIndexer:
             "indexed_files": 0,
             "total_chunks": 0,
         }
+
+    async def get_files_to_index(
+        self, force_reindex: bool = False
+    ) -> tuple[list[Path], list[Path]]:
+        """Get all indexable files and those that need indexing.
+
+        Args:
+            force_reindex: Whether to force reindex of all files
+
+        Returns:
+            Tuple of (all_indexable_files, files_to_index)
+        """
+        # Find all indexable files
+        all_files = await self._find_indexable_files_async()
+
+        if not all_files:
+            return [], []
+
+        # Load existing metadata for incremental indexing
+        metadata = self._load_index_metadata()
+
+        # Filter files that need indexing
+        if force_reindex:
+            files_to_index = all_files
+            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
+        else:
+            files_to_index = [
+                f for f in all_files if self._needs_reindexing(f, metadata)
+            ]
+            logger.info(
+                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
+            )
+
+        return all_files, files_to_index
+
+    async def index_files_with_progress(
+        self,
+        files_to_index: list[Path],
+        force_reindex: bool = False,
+    ):
+        """Index files and yield progress updates for each file.
+
+        Args:
+            files_to_index: List of file paths to index
+            force_reindex: Whether to force reindexing
+
+        Yields:
+            Tuple of (file_path, chunks_added, success) for each processed file
+        """
+        metadata = self._load_index_metadata()
+
+        # Process files in batches for better memory management
+        for i in range(0, len(files_to_index), self.batch_size):
+            batch = files_to_index[i : i + self.batch_size]
+
+            # Process each file in the batch
+            for file_path in batch:
+                chunks_added = 0
+                success = False
+
+                try:
+                    # Always remove existing chunks when reindexing
+                    await self.database.delete_by_file(file_path)
+
+                    # Parse file into chunks
+                    chunks = await self._parse_file(file_path)
+
+                    if chunks:
+                        # Add chunks to database
+                        await self.database.add_chunks(chunks)
+                        chunks_added = len(chunks)
+                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+
+                    success = True
+
+                    # Update metadata after successful indexing
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+
+                except Exception as e:
+                    logger.error(f"Failed to index file {file_path}: {e}")
+                    success = False
+
+                # Yield progress update
+                yield (file_path, chunks_added, success)
+
+        # Save metadata at the end
+        self._save_index_metadata(metadata)
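`index_files_with_progress` is an async generator: it yields a `(file_path, chunks_added, success)` tuple per file so a caller can drive a progress display without waiting for the whole run to finish. A hedged sketch of how a caller might consume it (the `indexer` argument and the print-based progress handling are illustrative, not the package's actual CLI code):

```python
import asyncio
from pathlib import Path


async def run_indexing(indexer, files_to_index: list[Path]) -> None:
    """Drive the async generator and report per-file progress (illustrative caller)."""
    done = 0
    total_chunks = 0
    async for file_path, chunks_added, success in indexer.index_files_with_progress(files_to_index):
        done += 1
        total_chunks += chunks_added
        marker = "ok" if success else "FAILED"
        print(f"[{done}/{len(files_to_index)}] {marker} {file_path} (+{chunks_added} chunks)")
    print(f"Indexed {total_chunks} chunks from {done} files")
```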
mcp_vector_search/core/project.py
CHANGED

@@ -281,24 +281,27 @@ class ProjectManager:
                 continue

             # Skip ignored patterns
-            … (old check not shown)
+            # PERFORMANCE: Pass is_directory=False since we already checked is_file()
+            if self._should_ignore_path(path, is_directory=False):
                 continue

             files.append(path)

         return files

-    def _should_ignore_path(self, path: Path) -> bool:
+    def _should_ignore_path(self, path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.

         Args:
             path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)

         Returns:
             True if path should be ignored
         """
         # First check gitignore rules if available
-        … (old check not shown)
+        # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+        if self.gitignore_parser and self.gitignore_parser.is_ignored(path, is_directory=is_directory):
             return True

         # Check if any parent directory is in ignore patterns