mcp-vector-search 0.7.5__py3-none-any.whl → 0.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/demo.py +2 -4
- mcp_vector_search/cli/commands/index.py +130 -30
- mcp_vector_search/cli/commands/mcp.py +83 -56
- mcp_vector_search/cli/commands/status.py +23 -9
- mcp_vector_search/cli/main.py +0 -2
- mcp_vector_search/core/database.py +117 -54
- mcp_vector_search/core/indexer.py +191 -15
- mcp_vector_search/core/project.py +6 -3
- mcp_vector_search/utils/gitignore.py +31 -23
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/METADATA +1 -1
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/RECORD +15 -15
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py

@@ -1,5 +1,6 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""
 
+import asyncio
 import shutil
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -369,38 +370,67 @@ class ChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e
 
     async def get_stats(self) -> IndexStats:
-        """Get database statistics."""
+        """Get database statistics with optimized chunked queries."""
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")
 
         try:
-            # Get total count
+            # Get total count (fast operation)
             count = self._collection.count()
 
-
-
-
+            if count == 0:
+                return IndexStats(
+                    total_files=0,
+                    total_chunks=0,
+                    languages={},
+                    file_types={},
+                    index_size_mb=0.0,
+                    last_updated="N/A",
+                    embedding_model="unknown",
+                )
 
-            #
-
+            # Process in chunks to avoid loading everything at once
+            batch_size_limit = 1000
 
-
-            language_counts = {}
-            file_type_counts = {}
+            files = set()
+            language_counts: dict[str, int] = {}
+            file_type_counts: dict[str, int] = {}
 
-
-
-
-
+            offset = 0
+            while offset < count:
+                # Fetch batch
+                batch_size = min(batch_size_limit, count - offset)
+                logger.debug(
+                    f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                    f"{offset}-{offset + batch_size} of {count} chunks"
+                )
 
-
-
-
-
-
+                results = self._collection.get(
+                    include=["metadatas"],
+                    limit=batch_size,
+                    offset=offset,
+                )
 
-
-
+                # Process batch metadata
+                for metadata in results.get("metadatas", []):
+                    # Language stats
+                    lang = metadata.get("language", "unknown")
+                    language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                    # File stats
+                    file_path = metadata.get("file_path", "")
+                    if file_path:
+                        files.add(file_path)
+                        ext = Path(file_path).suffix or "no_extension"
+                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                offset += batch_size
+
+                # Yield to event loop periodically to prevent blocking
+                await asyncio.sleep(0)
+
+            # Estimate index size (rough approximation: ~1KB per chunk)
+            index_size_mb = count * 0.001
 
             return IndexStats(
                 total_files=len(files),
@@ -408,12 +438,13 @@ class ChromaVectorDatabase(VectorDatabase):
                 languages=language_counts,
                 file_types=file_type_counts,
                 index_size_mb=index_size_mb,
-                last_updated="unknown",
-                embedding_model="unknown",
+                last_updated="unknown",
+                embedding_model="unknown",
             )
 
         except Exception as e:
-            logger.error(f"Failed to get
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
             return IndexStats(
                 total_files=0,
                 total_chunks=0,
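The rewritten `get_stats()` replaces a single bulk metadata fetch with fixed-size batches and yields to the event loop between batches. Below is a minimal, self-contained sketch of that pattern, assuming a ChromaDB-style collection that exposes `count()` and `get(include=..., limit=..., offset=...)`; the `batched_stats` helper and its return shape are illustrative, not part of the package.

```python
import asyncio


async def batched_stats(collection, batch_size_limit: int = 1000) -> dict:
    """Aggregate per-chunk metadata in fixed-size batches (illustrative sketch)."""
    count = collection.count()                   # cheap: total number of chunks
    files: set[str] = set()
    language_counts: dict[str, int] = {}

    offset = 0
    while offset < count:
        batch_size = min(batch_size_limit, count - offset)
        results = collection.get(
            include=["metadatas"],               # metadata only, no embeddings or documents
            limit=batch_size,
            offset=offset,
        )
        for metadata in results.get("metadatas", []):
            lang = metadata.get("language", "unknown")
            language_counts[lang] = language_counts.get(lang, 0) + 1
            if file_path := metadata.get("file_path", ""):
                files.add(file_path)

        offset += batch_size
        await asyncio.sleep(0)                   # yield so other coroutines can run

    return {
        "total_chunks": count,
        "total_files": len(files),
        "languages": language_counts,
    }
```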
@@ -768,56 +799,88 @@ class PooledChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e
 
     async def get_stats(self) -> IndexStats:
-        """Get database statistics
+        """Get database statistics with connection pooling and chunked queries."""
         try:
             async with self._pool.get_connection() as conn:
-                # Get total count
+                # Get total count (fast operation)
                 count = conn.collection.count()
 
-
-
+                if count == 0:
+                    return IndexStats(
+                        total_files=0,
+                        total_chunks=0,
+                        languages={},
+                        file_types={},
+                        index_size_mb=0.0,
+                        last_updated="N/A",
+                        embedding_model="unknown",
+                    )
+
+                # Process in chunks to avoid loading everything at once
+                batch_size_limit = 1000
 
-                # Analyze languages and files
-                languages = set()
                 files = set()
+                language_counts: dict[str, int] = {}
+                file_type_counts: dict[str, int] = {}
+
+                offset = 0
+                while offset < count:
+                    # Fetch batch
+                    batch_size = min(batch_size_limit, count - offset)
+                    logger.debug(
+                        f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                        f"{offset}-{offset + batch_size} of {count} chunks"
+                    )
 
-
-
-
-
-
+                    results = conn.collection.get(
+                        include=["metadatas"],
+                        limit=batch_size,
+                        offset=offset,
+                    )
 
-
-
-
+                    # Process batch metadata
+                    for metadata in results.get("metadatas", []):
+                        # Language stats
+                        lang = metadata.get("language", "unknown")
+                        language_counts[lang] = language_counts.get(lang, 0) + 1
 
-
-
-
-
+                        # File stats
+                        file_path = metadata.get("file_path", "")
+                        if file_path:
+                            files.add(file_path)
+                            ext = Path(file_path).suffix or "no_extension"
+                            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
 
-
-
-
-
-                    file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+                    offset += batch_size
+
+                    # Yield to event loop periodically to prevent blocking
+                    await asyncio.sleep(0)
 
-                # Estimate index size (rough approximation)
-                index_size_mb = count * 0.001
+                # Estimate index size (rough approximation: ~1KB per chunk)
+                index_size_mb = count * 0.001
 
                 return IndexStats(
-                    total_chunks=count,
                     total_files=len(files),
+                    total_chunks=count,
                     languages=language_counts,
                     file_types=file_type_counts,
                     index_size_mb=index_size_mb,
-                    last_updated="unknown",
-                    embedding_model="unknown",
+                    last_updated="unknown",
+                    embedding_model="unknown",
                 )
 
         except Exception as e:
-            logger.error(f"Failed to get database
-
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
+            return IndexStats(
+                total_files=0,
+                total_chunks=0,
+                languages={},
+                file_types={},
+                index_size_mb=0.0,
+                last_updated="error",
+                embedding_model="unknown",
+            )
 
     async def remove_file_chunks(self, file_path: str) -> int:
         """Remove all chunks for a specific file using pooled connection."""
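The pooled variant acquires a connection with `async with self._pool.get_connection() as conn:` and runs the same batched loop against `conn.collection`. A hedged sketch of such a pool, built on `asyncio.Queue` and `asynccontextmanager`; the actual pool inside `PooledChromaVectorDatabase` may be implemented differently, and `make_connection` is a placeholder factory.

```python
import asyncio
from contextlib import asynccontextmanager


class SimpleConnectionPool:
    """Hand out pre-built connections via 'async with pool.get_connection()' (sketch)."""

    def __init__(self, make_connection, size: int = 4) -> None:
        self._queue: asyncio.Queue = asyncio.Queue()
        for _ in range(size):
            self._queue.put_nowait(make_connection())

    @asynccontextmanager
    async def get_connection(self):
        conn = await self._queue.get()       # block until a connection is free
        try:
            yield conn
        finally:
            self._queue.put_nowait(conn)     # always hand it back to the pool


# Usage mirroring the diffed code:
#   async with pool.get_connection() as conn:
#       count = conn.collection.count()
```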
mcp_vector_search/core/indexer.py

@@ -57,6 +57,11 @@ class SemanticIndexer:
             project_root / ".mcp-vector-search" / "index_metadata.json"
         )
 
+        # Add cache for indexable files to avoid repeated filesystem scans
+        self._indexable_files_cache: list[Path] | None = None
+        self._cache_timestamp: float = 0
+        self._cache_ttl: float = 60.0  # 60 second TTL
+
         # Initialize gitignore parser
         try:
             self.gitignore_parser = create_gitignore_parser(project_root)
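These new fields implement a 60-second TTL cache around the file scan: recompute only when the cached list is missing or stale. A generic sketch of the same pattern; the `CachedScan` name is hypothetical and not taken from the package.

```python
import time
from pathlib import Path
from typing import Callable


class CachedScan:
    """Recompute a file list only when the cached copy is missing or stale (sketch)."""

    def __init__(self, scan: Callable[[], list[Path]], ttl: float = 60.0) -> None:
        self._scan = scan
        self._ttl = ttl
        self._value: list[Path] | None = None
        self._timestamp: float = 0.0

    def get(self) -> list[Path]:
        now = time.time()
        if self._value is None or now - self._timestamp >= self._ttl:
            self._value = sorted(self._scan())   # rebuild and keep a stable order
            self._timestamp = now
        return self._value
```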
@@ -334,38 +339,120 @@ class SemanticIndexer:
         return 0
 
     def _find_indexable_files(self) -> list[Path]:
-        """Find all files that should be indexed.
+        """Find all files that should be indexed with caching.
 
         Returns:
             List of file paths to index
         """
+        import time
+
+        # Check cache
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Rebuild cache using efficient directory filtering
+        logger.debug("Rebuilding indexable files cache...")
+        indexable_files = self._scan_files_sync()
+
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Rebuilt indexable files cache ({len(indexable_files)} files)")
+
+        return self._indexable_files_cache
+
+    def _scan_files_sync(self) -> list[Path]:
+        """Synchronous file scanning (runs in thread pool).
+
+        Uses os.walk with directory filtering to avoid traversing ignored directories.
+
+        Returns:
+            List of indexable file paths
+        """
         indexable_files = []
 
-        for
-
-
+        # Use os.walk for efficient directory traversal with early filtering
+        for root, dirs, files in os.walk(self.project_root):
+            root_path = Path(root)
+
+            # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
+            # This is much more efficient than checking every file in ignored directories
+            # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
+            dirs[:] = [d for d in dirs if not self._should_ignore_path(root_path / d, is_directory=True)]
+
+            # Check each file in the current directory
+            # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
+            for filename in files:
+                file_path = root_path / filename
+                if self._should_index_file(file_path, skip_file_check=True):
+                    indexable_files.append(file_path)
 
-        return
+        return indexable_files
 
-    def
+    async def _find_indexable_files_async(self) -> list[Path]:
+        """Find all files asynchronously without blocking event loop.
+
+        Returns:
+            List of file paths to index
+        """
+        import time
+        from concurrent.futures import ThreadPoolExecutor
+
+        # Check cache first
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Run filesystem scan in thread pool to avoid blocking
+        logger.debug("Scanning files in background thread...")
+        loop = asyncio.get_running_loop()
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            indexable_files = await loop.run_in_executor(
+                executor, self._scan_files_sync
+            )
+
+        # Update cache
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Found {len(indexable_files)} indexable files")
+
+        return self._indexable_files_cache
+
+    def _should_index_file(self, file_path: Path, skip_file_check: bool = False) -> bool:
         """Check if a file should be indexed.
 
         Args:
             file_path: Path to check
+            skip_file_check: Skip is_file() check if caller knows it's a file (optimization)
 
         Returns:
             True if file should be indexed
         """
-        #
-
+        # PERFORMANCE: Check file extension FIRST (cheapest operation, no I/O)
+        # This eliminates most files without any filesystem calls
+        if file_path.suffix.lower() not in self.file_extensions:
            return False
 
-        #
-
+        # PERFORMANCE: Only check is_file() if not coming from os.walk
+        # os.walk already guarantees files, so we skip this expensive check
+        if not skip_file_check and not file_path.is_file():
            return False
 
        # Check if path should be ignored
-
+        # PERFORMANCE: Pass is_directory=False to skip stat() call (we know it's a file)
+        if self._should_ignore_path(file_path, is_directory=False):
            return False
 
        # Check file size (skip very large files)
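Two techniques carry this hunk: pruning ignored directories in place via `dirs[:]` so `os.walk` never descends into them, and offloading the blocking scan to a worker thread from async code. A minimal sketch of both, with illustrative ignore and extension sets; `asyncio.to_thread` is used here as the shorthand equivalent of the diff's `ThreadPoolExecutor` plus `run_in_executor` pairing.

```python
import asyncio
import os
from pathlib import Path

IGNORED_DIRS = {".git", "node_modules", ".venv", "__pycache__"}   # illustrative
EXTENSIONS = {".py", ".ts", ".js"}                                # illustrative


def scan_files_sync(root: Path) -> list[Path]:
    found: list[Path] = []
    for dirpath, dirs, files in os.walk(root):
        # Mutating dirs in place stops os.walk from descending into these entries.
        dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]
        for name in files:
            # Cheapest check first: the extension test needs no filesystem call.
            if Path(name).suffix.lower() in EXTENSIONS:
                found.append(Path(dirpath) / name)
    return found


async def scan_files(root: Path) -> list[Path]:
    # Offload the blocking walk so the event loop stays responsive.
    return await asyncio.to_thread(scan_files_sync, root)
```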
@@ -379,18 +466,20 @@ class SemanticIndexer:
 
         return True
 
-    def _should_ignore_path(self, file_path: Path) -> bool:
+    def _should_ignore_path(self, file_path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.
 
         Args:
             file_path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)
 
         Returns:
             True if path should be ignored
         """
         try:
             # First check gitignore rules if available
-
+            # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path, is_directory=is_directory):
                 logger.debug(f"Path ignored by .gitignore: {file_path}")
                 return True
 
@@ -532,8 +621,8 @@ class SemanticIndexer:
         # Get database stats
         db_stats = await self.database.get_stats()
 
-        # Count indexable files
-        indexable_files = self.
+        # Count indexable files asynchronously without blocking
+        indexable_files = await self._find_indexable_files_async()
 
         return {
             "total_indexable_files": len(indexable_files),
@@ -553,3 +642,90 @@ class SemanticIndexer:
             "indexed_files": 0,
             "total_chunks": 0,
         }
+
+    async def get_files_to_index(
+        self, force_reindex: bool = False
+    ) -> tuple[list[Path], list[Path]]:
+        """Get all indexable files and those that need indexing.
+
+        Args:
+            force_reindex: Whether to force reindex of all files
+
+        Returns:
+            Tuple of (all_indexable_files, files_to_index)
+        """
+        # Find all indexable files
+        all_files = await self._find_indexable_files_async()
+
+        if not all_files:
+            return [], []
+
+        # Load existing metadata for incremental indexing
+        metadata = self._load_index_metadata()
+
+        # Filter files that need indexing
+        if force_reindex:
+            files_to_index = all_files
+            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
+        else:
+            files_to_index = [
+                f for f in all_files if self._needs_reindexing(f, metadata)
+            ]
+            logger.info(
+                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
+            )
+
+        return all_files, files_to_index
+
+    async def index_files_with_progress(
+        self,
+        files_to_index: list[Path],
+        force_reindex: bool = False,
+    ):
+        """Index files and yield progress updates for each file.
+
+        Args:
+            files_to_index: List of file paths to index
+            force_reindex: Whether to force reindexing
+
+        Yields:
+            Tuple of (file_path, chunks_added, success) for each processed file
+        """
+        metadata = self._load_index_metadata()
+
+        # Process files in batches for better memory management
+        for i in range(0, len(files_to_index), self.batch_size):
+            batch = files_to_index[i : i + self.batch_size]
+
+            # Process each file in the batch
+            for file_path in batch:
+                chunks_added = 0
+                success = False
+
+                try:
+                    # Always remove existing chunks when reindexing
+                    await self.database.delete_by_file(file_path)
+
+                    # Parse file into chunks
+                    chunks = await self._parse_file(file_path)
+
+                    if chunks:
+                        # Add chunks to database
+                        await self.database.add_chunks(chunks)
+                        chunks_added = len(chunks)
+                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+
+                    success = True
+
+                    # Update metadata after successful indexing
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+
+                except Exception as e:
+                    logger.error(f"Failed to index file {file_path}: {e}")
+                    success = False
+
+                # Yield progress update
+                yield (file_path, chunks_added, success)
+
+        # Save metadata at the end
+        self._save_index_metadata(metadata)
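`index_files_with_progress()` is an async generator yielding `(file_path, chunks_added, success)` per file. A hedged sketch of how a caller might drive it; `indexer` is assumed to be a configured `SemanticIndexer`, and the progress printing stands in for whatever the CLI actually renders.

```python
import asyncio


async def run_index(indexer) -> None:
    all_files, files_to_index = await indexer.get_files_to_index(force_reindex=False)
    done = 0
    total_chunks = 0

    async for file_path, chunks_added, success in indexer.index_files_with_progress(
        files_to_index
    ):
        done += 1
        total_chunks += chunks_added
        status = "ok" if success else "FAILED"
        print(f"[{done}/{len(files_to_index)}] {status} {file_path} (+{chunks_added})")

    print(f"Indexed {total_chunks} chunks across {done} of {len(all_files)} files")


# asyncio.run(run_index(indexer))
```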
mcp_vector_search/core/project.py

@@ -281,24 +281,27 @@ class ProjectManager:
                 continue
 
             # Skip ignored patterns
-
+            # PERFORMANCE: Pass is_directory=False since we already checked is_file()
+            if self._should_ignore_path(path, is_directory=False):
                 continue
 
             files.append(path)
 
         return files
 
-    def _should_ignore_path(self, path: Path) -> bool:
+    def _should_ignore_path(self, path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.
 
         Args:
             path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)
 
         Returns:
             True if path should be ignored
         """
         # First check gitignore rules if available
-
+        # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+        if self.gitignore_parser and self.gitignore_parser.is_ignored(path, is_directory=is_directory):
             return True
 
         # Check if any parent directory is in ignore patterns
mcp_vector_search/utils/gitignore.py

@@ -102,16 +102,18 @@ class GitignoreParser:
         self._load_gitignore_files()
 
     def _load_gitignore_files(self) -> None:
-        """Load
-        # Load global .gitignore first (if exists)
-        global_gitignore = self.project_root / ".gitignore"
-        if global_gitignore.exists():
-            self._parse_gitignore_file(global_gitignore)
+        """Load .gitignore file from project root only.
 
-
-
-
-
+        Note: Only the root .gitignore is loaded to avoid performance issues
+        with rglob traversing large directory trees (e.g., node_modules with
+        250K+ files). Subdirectory .gitignore files are intentionally skipped
+        as they would add significant overhead without much benefit for
+        semantic code search indexing.
+        """
+        # Load root .gitignore only
+        root_gitignore = self.project_root / ".gitignore"
+        if root_gitignore.exists():
+            self._parse_gitignore_file(root_gitignore)
 
     def _parse_gitignore_file(self, gitignore_path: Path) -> None:
         """Parse a single .gitignore file.
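The new docstring spells out the trade-off: discovering nested `.gitignore` files (for example via `rglob`) forces a walk of the whole tree before any pattern is parsed, while the root-only check touches a single path. A small sketch of that contrast; the helper name is illustrative.

```python
from pathlib import Path


def load_root_gitignore_lines(project_root: Path) -> list[str]:
    """Read only the root .gitignore: one exists() check, no tree traversal."""
    root_gitignore = project_root / ".gitignore"
    if not root_gitignore.exists():
        return []
    return root_gitignore.read_text(encoding="utf-8", errors="ignore").splitlines()


# By contrast, collecting nested ignore files walks every directory first:
#   nested = list(project_root.rglob(".gitignore"))   # visits node_modules etc.
```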
@@ -136,32 +138,32 @@ class GitignoreParser:
             # Check for directory-only pattern
             is_directory_only = line.endswith("/")
 
-            # Create pattern
-            gitignore_dir = gitignore_path.parent
-            if gitignore_dir != self.project_root:
-                # Adjust pattern for subdirectory .gitignore files
-                relative_dir = gitignore_dir.relative_to(self.project_root)
-                if not line.startswith("/") and not is_negation:
-                    line = str(relative_dir / line)
-                elif is_negation and not line[1:].startswith("/"):
-                    line = "!" + str(relative_dir / line[1:])
-
+            # Create pattern (all patterns are from root .gitignore)
             pattern = GitignorePattern(line, is_negation, is_directory_only)
             self.patterns.append(pattern)
 
         except Exception as e:
             logger.warning(f"Failed to parse {gitignore_path}: {e}")
 
-    def is_ignored(self, path: Path) -> bool:
+    def is_ignored(self, path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored according to .gitignore rules.
 
         Args:
             path: Path to check (can be absolute or relative to project root)
+            is_directory: Optional hint if path is a directory.
+                If None, will check filesystem (slower).
+                If provided, skips filesystem check (faster).
 
         Returns:
             True if the path should be ignored
         """
         try:
+            # SHORT-CIRCUIT: If no patterns, nothing is ignored
+            # This prevents 200k+ unnecessary filesystem stat() calls on projects
+            # without .gitignore files
+            if not self.patterns:
+                return False
+
             # Convert to relative path from project root
             if path.is_absolute():
                 relative_path = path.relative_to(self.project_root)
@@ -169,7 +171,12 @@
                 relative_path = path
 
             path_str = str(relative_path).replace("\\", "/")
-
+
+            # Only check if directory when needed and not provided as hint
+            # PERFORMANCE: Passing is_directory hint from caller (e.g., os.walk)
+            # avoids hundreds of thousands of stat() calls on large repositories
+            if is_directory is None:
+                is_directory = path.is_dir() if path.exists() else False
 
             # Apply patterns in order, with later patterns overriding earlier ones
             ignored = False
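`is_ignored()` now short-circuits when no patterns were loaded, resolves the directory flag only when the caller gave no hint, and applies patterns in order with later patterns overriding earlier ones so negations can re-include paths. A hedged sketch of that evaluation order; the real `GitignorePattern` matching is not shown in this diff, so the `fnmatch`-based matching below is an assumption that only approximates gitignore semantics.

```python
from dataclasses import dataclass
from fnmatch import fnmatch
from pathlib import Path


@dataclass
class Pattern:
    glob: str
    is_negation: bool
    is_directory_only: bool


def is_ignored(path_str: str, patterns: list[Pattern], is_directory: bool | None = None) -> bool:
    if not patterns:                      # short-circuit: no .gitignore, nothing ignored
        return False
    if is_directory is None:              # only stat() when the caller gave no hint
        p = Path(path_str)
        is_directory = p.is_dir() if p.exists() else False

    ignored = False
    for pat in patterns:                  # later patterns override earlier ones
        if pat.is_directory_only and not is_directory:
            continue
        if fnmatch(path_str, pat.glob):   # assumption: stand-in for GitignorePattern
            ignored = not pat.is_negation
    return ignored
```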
@@ -216,15 +223,16 @@ def create_gitignore_parser(project_root: Path) -> GitignoreParser:
     return GitignoreParser(project_root)
 
 
-def is_path_gitignored(path: Path, project_root: Path) -> bool:
+def is_path_gitignored(path: Path, project_root: Path, is_directory: bool | None = None) -> bool:
     """Quick function to check if a path is gitignored.
 
     Args:
         path: Path to check
         project_root: Root directory of the project
+        is_directory: Optional hint if path is a directory (avoids filesystem check)
 
     Returns:
         True if the path should be ignored
     """
     parser = create_gitignore_parser(project_root)
-    return parser.is_ignored(path)
+    return parser.is_ignored(path, is_directory=is_directory)
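A short, hypothetical usage of the updated convenience function, passing the `is_directory` hint when the caller already knows the answer (the paths below are illustrative):

```python
from pathlib import Path

from mcp_vector_search.utils.gitignore import is_path_gitignored

project_root = Path("/workspace/my-project")          # hypothetical project
candidate = project_root / "dist" / "bundle.js"       # hypothetical entry from os.walk

# The caller knows this entry is a file, so pass the hint and skip the extra stat().
skip = is_path_gitignored(candidate, project_root, is_directory=False)
```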
{mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcp-vector-search
-Version: 0.7.5
+Version: 0.7.6
 Summary: CLI-first semantic code search with MCP integration
 Project-URL: Homepage, https://github.com/bobmatnyc/mcp-vector-search
 Project-URL: Documentation, https://mcp-vector-search.readthedocs.io