mcp-code-indexer 4.2.15-py3-none-any.whl → 4.2.17-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_code_indexer/database/database.py +334 -115
- mcp_code_indexer/database/database_factory.py +1 -1
- mcp_code_indexer/database/exceptions.py +1 -1
- mcp_code_indexer/database/models.py +66 -24
- mcp_code_indexer/database/retry_executor.py +15 -5
- mcp_code_indexer/file_scanner.py +107 -12
- mcp_code_indexer/main.py +43 -30
- mcp_code_indexer/server/mcp_server.py +201 -7
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
- mcp_code_indexer/vector_mode/config.py +113 -45
- mcp_code_indexer/vector_mode/const.py +24 -0
- mcp_code_indexer/vector_mode/daemon.py +860 -98
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
- mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
- mcp_code_indexer/vector_mode/services/__init__.py +9 -0
- mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
- mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
- mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
- mcp_code_indexer/vector_mode/types.py +46 -0
- mcp_code_indexer/vector_mode/utils.py +50 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/METADATA +13 -10
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/RECORD +28 -21
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/WHEEL +1 -1
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info}/entry_points.txt +0 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.17.dist-info/licenses}/LICENSE +0 -0
mcp_code_indexer/database/models.py
CHANGED

@@ -32,7 +32,9 @@ class Project(BaseModel):
     last_accessed: datetime = Field(
         default_factory=datetime.utcnow, description="Last access timestamp"
     )
-    vector_mode: bool = Field(
+    vector_mode: bool = Field(
+        default=False, description="Enable vector search for this project"
+    )
 
 
 class FileDescription(BaseModel):
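The new `vector_mode` flag follows the same `Field(...)` pattern as the rest of the model. A minimal standalone sketch of just the two fields visible in this hunk (assuming `BaseModel`/`Field` come from pydantic, as the names suggest; the real `Project` model has more fields):

```python
from datetime import datetime
from pydantic import BaseModel, Field

class Project(BaseModel):
    # Only the fields visible in the hunk; the real model has more.
    last_accessed: datetime = Field(
        default_factory=datetime.utcnow, description="Last access timestamp"
    )
    vector_mode: bool = Field(
        default=False, description="Enable vector search for this project"
    )

p = Project()
assert p.vector_mode is False        # opt-in: off unless explicitly enabled
p = Project(vector_mode=True)        # per-project opt-in
```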
@@ -189,10 +191,12 @@ class WordFrequencyResult(BaseModel):
 
 # Vector Mode Models
 
+
 class ChunkType(str, Enum):
     """Types of code chunks for semantic analysis."""
+
     FUNCTION = "function"
-    CLASS = "class"
+    CLASS = "class"
     METHOD = "method"
     IMPORT = "import"
     DOCSTRING = "docstring"
@@ -204,27 +208,32 @@ class ChunkType(str, Enum):
     NAMESPACE = "namespace"
     GENERIC = "generic"
 
+
 class NodeType(str, Enum):
     """Types of nodes in Merkle tree."""
+
     FILE = "file"
     DIRECTORY = "directory"
     PROJECT = "project"
 
+
 class SyncStatus(str, Enum):
     """Vector index synchronization status."""
+
     PENDING = "pending"
     IN_PROGRESS = "in_progress"
     COMPLETED = "completed"
     FAILED = "failed"
     PAUSED = "paused"
 
+
 class CodeChunk(BaseModel):
     """
     Represents a semantic chunk of code extracted from a file.
-
+
     Used for embedding generation and vector search operations.
     """
-
+
     id: Optional[int] = Field(None, description="Database ID")
     file_id: int = Field(..., description="Reference to FileDescription")
     project_id: str = Field(..., description="Reference to project")
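All three enums subclass both `str` and `Enum`, so members compare equal to, and JSON-serialize as, their plain string values. A quick standalone illustration using the `SyncStatus` values from the hunk:

```python
import json
from enum import Enum

class SyncStatus(str, Enum):
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"
    PAUSED = "paused"

# str-subclass enums compare equal to raw strings (handy for DB columns)...
assert SyncStatus.PENDING == "pending"
# ...round-trip cleanly from stored values...
assert SyncStatus("failed") is SyncStatus.FAILED
# ...and JSON-encode as their plain string value.
assert json.dumps(SyncStatus.COMPLETED) == '"completed"'
```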
@@ -235,17 +244,24 @@ class CodeChunk(BaseModel):
     content_hash: str = Field(..., description="SHA-256 hash of chunk content")
     embedding_id: Optional[str] = Field(None, description="Vector database ID")
     redacted: bool = Field(default=False, description="Whether content was redacted")
-    metadata: Dict[str, Any] = Field(
-
-
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict, description="Additional metadata"
+    )
+    created: datetime = Field(
+        default_factory=datetime.utcnow, description="Creation timestamp"
+    )
+    last_modified: datetime = Field(
+        default_factory=datetime.utcnow, description="Last update timestamp"
+    )
+
 
 class MerkleNode(BaseModel):
     """
     Represents a node in the Merkle tree for change detection.
-
+
     Used to efficiently detect file system changes without scanning entire directory trees.
     """
-
+
     id: Optional[int] = Field(None, description="Database ID")
     project_id: str = Field(..., description="Reference to project")
     path: str = Field(..., description="File/directory path relative to project root")
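`content_hash` is described as a SHA-256 of the chunk content, which is what makes change detection cheap: re-embedding can be skipped whenever the hash is unchanged. A hedged sketch of that check (helper names are illustrative, not from the package; UTF-8 encoding is an assumption):

```python
import hashlib
from typing import Optional

def chunk_content_hash(content: str) -> str:
    # SHA-256 over the chunk text, hex-encoded (UTF-8 encoding assumed here).
    return hashlib.sha256(content.encode("utf-8")).hexdigest()

def needs_reembedding(stored_hash: Optional[str], content: str) -> bool:
    # Skip the (expensive) embedding call when the chunk is unchanged.
    return stored_hash != chunk_content_hash(content)

h = chunk_content_hash("def add(a, b):\n    return a + b\n")
assert not needs_reembedding(h, "def add(a, b):\n    return a + b\n")
assert needs_reembedding(h, "def add(a, b):\n    return a - b\n")
```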
@@ -253,36 +269,56 @@ class MerkleNode(BaseModel):
     node_type: NodeType = Field(..., description="Type of filesystem node")
     parent_path: Optional[str] = Field(None, description="Path to parent directory")
     children_hash: Optional[str] = Field(None, description="Combined hash of children")
-    last_modified: datetime = Field(
+    last_modified: datetime = Field(
+        default_factory=datetime.utcnow, description="Last update timestamp"
+    )
+
 
 class IndexMeta(BaseModel):
     """
     Metadata about vector indexing progress and status for a project.
-
+
     Tracks indexing state, statistics, and synchronization status.
     """
-
+
     id: Optional[int] = Field(None, description="Database ID")
     project_id: str = Field(..., description="Reference to project", unique=True)
     total_chunks: int = Field(default=0, description="Total number of chunks")
-    indexed_chunks: int = Field(
+    indexed_chunks: int = Field(
+        default=0, description="Number of chunks with embeddings"
+    )
     total_files: int = Field(default=0, description="Total number of files")
     indexed_files: int = Field(default=0, description="Number of files processed")
-    last_sync: Optional[datetime] = Field(
-
+    last_sync: Optional[datetime] = Field(
+        None, description="Last successful sync timestamp"
+    )
+    sync_status: SyncStatus = Field(
+        default=SyncStatus.PENDING, description="Current sync status"
+    )
     error_message: Optional[str] = Field(None, description="Last error message")
     queue_depth: int = Field(default=0, description="Number of pending tasks")
-    processing_rate: float = Field(
-
-
-
-
+    processing_rate: float = Field(
+        default=0.0, description="Files per second processing rate"
+    )
+    estimated_completion: Optional[datetime] = Field(
+        None, description="Estimated completion time"
+    )
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict, description="Additional metadata"
+    )
+    created: datetime = Field(
+        default_factory=datetime.utcnow, description="Creation timestamp"
+    )
+    last_modified: datetime = Field(
+        default_factory=datetime.utcnow, description="Last update timestamp"
+    )
+
 
 class VectorSearchResult(BaseModel):
     """
     Represents a vector search result with similarity scoring.
     """
-
+
     file_path: str = Field(..., description="Path to the matching file")
     chunk_name: Optional[str] = Field(None, description="Name of the code chunk")
     chunk_type: ChunkType = Field(..., description="Type of code chunk")
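`processing_rate` (files per second) together with `total_files`/`indexed_files` is enough to derive `estimated_completion`. The daemon's actual logic is not part of this diff; a plausible sketch of the relationship between the fields:

```python
from datetime import datetime, timedelta
from typing import Optional

def estimate_completion(
    total_files: int, indexed_files: int, processing_rate: float
) -> Optional[datetime]:
    # No ETA until a non-zero rate has been measured.
    if processing_rate <= 0.0:
        return None
    remaining = max(total_files - indexed_files, 0)
    return datetime.utcnow() + timedelta(seconds=remaining / processing_rate)

# 900 files left at 15 files/s -> ETA roughly one minute out.
eta = estimate_completion(total_files=1200, indexed_files=300, processing_rate=15.0)
```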
@@ -291,13 +327,16 @@ class VectorSearchResult(BaseModel):
     end_line: int = Field(..., description="Ending line number")
     similarity_score: float = Field(..., description="Cosine similarity score")
     project_id: str = Field(..., description="Project identifier")
-    metadata: Dict[str, Any] = Field(
+    metadata: Dict[str, Any] = Field(
+        default_factory=dict, description="Additional metadata"
+    )
+
 
 class VectorIndexStatus(BaseModel):
     """
     Current status of vector indexing for a project.
     """
-
+
     is_indexing: bool = Field(..., description="Whether indexing is currently active")
     indexed_files: int = Field(..., description="Number of files indexed")
     total_files: int = Field(..., description="Total number of files")
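`similarity_score` is documented as cosine similarity. For reference, the standard definition in pure Python (the package presumably gets this score from its vector provider rather than computing it locally):

```python
import math
from typing import Sequence

def cosine_similarity(a: Sequence[float], b: Sequence[float]) -> float:
    # cos(theta) = a.b / (|a| |b|); 1.0 means same direction, 0.0 orthogonal.
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0

assert abs(cosine_similarity([1.0, 0.0], [1.0, 0.0]) - 1.0) < 1e-9
assert abs(cosine_similarity([1.0, 0.0], [0.0, 1.0]) - 0.0) < 1e-9
```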
@@ -307,9 +346,12 @@ class VectorIndexStatus(BaseModel):
     sync_status: SyncStatus = Field(..., description="Current sync status")
     queue_depth: int = Field(..., description="Number of pending tasks")
     processing_rate: float = Field(..., description="Processing rate")
-    estimated_completion: Optional[datetime] = Field(
+    estimated_completion: Optional[datetime] = Field(
+        None, description="Estimated completion time"
+    )
     error_message: Optional[str] = Field(None, description="Last error message")
 
+
 # Enable forward references for recursive models
 FolderNode.model_rebuild()
 CodebaseOverview.model_rebuild()
mcp_code_indexer/database/retry_executor.py
CHANGED

@@ -279,8 +279,13 @@ class RetryExecutor:
         Yields:
             Database connection
         """
+        import sys
+
+        # Store the context manager so we can properly call __aexit__
+        ctx_manager: Optional[AsyncContextManager[aiosqlite.Connection]] = None
 
         async def acquire_connection() -> aiosqlite.Connection:
+            nonlocal ctx_manager
            # This function will be retried by execute_with_retry
            # Get the async context manager and enter it
            ctx_manager = connection_factory()
@@ -288,15 +293,20 @@
             return conn
 
         # Use execute_with_retry to handle the retry logic
-        # We create a connection and store it for the context manager
         connection = await self.execute_with_retry(acquire_connection, operation_name)
 
         try:
             yield connection
-
-            #
-
-
+        except BaseException:
+            # Pass actual exception info to __aexit__ for proper rollback/cleanup
+            exc_type, exc, tb = sys.exc_info()
+            if ctx_manager is not None:
+                await ctx_manager.__aexit__(exc_type, exc, tb)
+            raise
+        else:
+            # No exception - call __aexit__ with None values
+            if ctx_manager is not None:
+                await ctx_manager.__aexit__(None, None, None)
 
     def _should_retry_exception(self, retry_state: RetryCallState) -> bool:
         """
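Taken together, the two retry_executor hunks fix the cleanup path: because the connection is acquired through a retried helper rather than `async with`, the wrapper must drive the inner context manager's `__aexit__` itself, forwarding real exception info so the underlying transaction can roll back. A self-contained sketch of the same pattern, with the retry machinery stripped out and a stand-in connection class:

```python
import asyncio
import sys
from contextlib import asynccontextmanager

class FakeConnection:
    async def rollback(self) -> None:
        print("rolled back")

@asynccontextmanager
async def connection_factory():
    conn = FakeConnection()
    try:
        yield conn
    except BaseException:
        await conn.rollback()          # inner __aexit__ sees the real exception
        raise

@asynccontextmanager
async def get_connection():
    ctx = connection_factory()
    conn = await ctx.__aenter__()      # in the real code this call is retried
    try:
        yield conn
    except BaseException:
        # Forward actual exception info so the inner manager can clean up.
        await ctx.__aexit__(*sys.exc_info())
        raise
    else:
        await ctx.__aexit__(None, None, None)

async def demo() -> None:
    try:
        async with get_connection():
            raise RuntimeError("boom")
    except RuntimeError:
        pass                           # "rolled back" was printed on the way out

asyncio.run(demo())
```

Passing `None, None, None` on the success path (instead of always forwarding `sys.exc_info()`) matters: a generator-based context manager treats a non-None exception as a request to `throw()` into its body.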
mcp_code_indexer/file_scanner.py
CHANGED
@@ -6,10 +6,12 @@ while respecting .gitignore patterns and common ignore patterns. It enables
 efficient discovery of files that need description tracking.
 """
 
+import asyncio
 import fnmatch
 import logging
+import os
 from pathlib import Path
-from typing import Dict,
+from typing import Dict, Iterator, List, Optional, Set, Union, Any, cast
 
 try:
     from gitignore_parser import parse_gitignore
@@ -150,6 +152,13 @@ class FileScanner:
         self.project_root = Path(project_root).resolve()
         self._gitignore_cache: Dict[str, Any] = {}
         self._load_gitignore_patterns()
+        # Build ignore patterns set for directory pruning
+        self.ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
+
+    @property
+    def root_path(self) -> Path:
+        """Get the root path for the scanner (alias for project_root)."""
+        return self.project_root
 
     def _load_gitignore_patterns(self) -> None:
         """Load and cache gitignore patterns from the project."""
@@ -228,6 +237,53 @@ class FileScanner:
         """Check if a file has an ignored extension."""
         return file_path.suffix.lower() in IGNORED_EXTENSIONS
 
+    def should_ignore_path(self, path: Path) -> bool:
+        """
+        Check if a path (file or directory) should be ignored based on patterns.
+
+        This is used for directory pruning during walks to skip entire subtrees
+        like node_modules, .git, etc.
+
+        Args:
+            path: Path to check (can be file or directory)
+
+        Returns:
+            True if the path should be ignored
+        """
+        try:
+            rel_path = path.relative_to(self.project_root)
+        except ValueError:
+            rel_path = path
+
+        path_str = str(rel_path)
+        path_name = path.name
+
+        # Check against ignore patterns
+        for pattern in self.ignore_patterns:
+            # Handle directory patterns (ending with /)
+            if pattern.endswith("/"):
+                pattern_no_slash = pattern.rstrip("/")
+                if path_name == pattern_no_slash:
+                    return True
+            # Handle wildcard patterns (starting with *)
+            elif pattern.startswith("*"):
+                if path_str.endswith(pattern[1:]) or path_name.endswith(pattern[1:]):
+                    return True
+            # Handle path patterns (containing / or \)
+            elif "/" in pattern or "\\" in pattern:
+                if pattern in path_str:
+                    return True
+            # Handle simple name patterns
+            else:
+                if pattern in path.parts or path_name == pattern:
+                    return True
+
+        # Also check gitignore
+        if self._is_ignored_by_gitignore(path):
+            return True
+
+        return False
+
     def should_ignore_file(self, file_path: Path) -> bool:
         """
         Determine if a file should be ignored.
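The four pattern shapes the new method distinguishes (trailing slash, leading `*`, embedded separator, bare name) can be exercised in isolation. A standalone restatement with an illustrative pattern set (`DEFAULT_IGNORE_PATTERNS`' real contents are not shown in this diff):

```python
from pathlib import Path

# Illustrative pattern set; the package's DEFAULT_IGNORE_PATTERNS will differ.
PATTERNS = {"node_modules", ".git", "*.pyc", "build/"}

def matches(path: Path, pattern: str) -> bool:
    name = path.name
    if pattern.endswith("/"):                           # directory pattern
        return name == pattern.rstrip("/")
    if pattern.startswith("*"):                         # suffix wildcard
        return name.endswith(pattern[1:])
    if "/" in pattern or "\\" in pattern:               # path fragment
        return pattern in str(path)
    return pattern in path.parts or name == pattern     # bare name

assert any(matches(Path("src/node_modules/lib.js"), p) for p in PATTERNS)
assert any(matches(Path("app/cache.pyc"), p) for p in PATTERNS)
assert not any(matches(Path("src/main.py"), p) for p in PATTERNS)
```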
@@ -246,12 +302,8 @@
         if self._is_ignored_by_extension(file_path):
             return True
 
-        # Check
-        if self.
-            return True
-
-        # Check gitignore patterns
-        if self._is_ignored_by_gitignore(file_path):
+        # Check path-based patterns
+        if self.should_ignore_path(file_path):
             return True
 
         return False
@@ -286,12 +338,27 @@
         logger.info(f"Found {len(files)} trackable files in {self.project_root}")
         return files
 
-    def _walk_directory(self) ->
-        """
+    def _walk_directory(self) -> Iterator[Path]:
+        """
+        Walk directory using os.walk with directory pruning.
+
+        This skips ignored directories entirely rather than traversing then filtering.
+        Critical for performance - avoids traversing node_modules, .git, etc.
+        """
         try:
-            for
-
-
+            for dirpath, dirnames, filenames in os.walk(self.project_root):
+                current_dir = Path(dirpath)
+
+                # Prune ignored directories in-place to prevent descending into them
+                # Modifying dirnames in-place is the documented way to prune os.walk
+                dirnames[:] = [
+                    d for d in dirnames
+                    if not self.should_ignore_path(current_dir / d)
+                ]
+
+                for filename in filenames:
+                    yield current_dir / filename
+
         except PermissionError as e:
             logger.warning(f"Permission denied accessing {e.filename}")
         except Exception as e:
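The comment in the hunk is accurate: assigning to `dirnames[:]` is the documented way to prune `os.walk`, since the walker recurses only into whatever remains in that list. A minimal standalone demonstration of the technique:

```python
import os
from pathlib import Path
from typing import Iterator

PRUNE = {"node_modules", ".git", "__pycache__"}   # illustrative ignore set

def walk_pruned(root: Path) -> Iterator[Path]:
    for dirpath, dirnames, filenames in os.walk(root):
        # Slice assignment mutates the list os.walk recurses over;
        # rebinding (dirnames = [...]) would prune nothing.
        dirnames[:] = [d for d in dirnames if d not in PRUNE]
        for filename in filenames:
            yield Path(dirpath) / filename

for f in walk_pruned(Path(".")):
    print(f)
```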
@@ -404,3 +471,31 @@
             logger.error(f"Error getting project stats: {e}")
 
         return stats
+
+    async def scan_directory_async(
+        self, max_files: Optional[int] = None
+    ) -> List[Path]:
+        """
+        Async version of scan_directory running in a thread.
+
+        Args:
+            max_files: Maximum number of files to return (None for no limit)
+
+        Returns:
+            List of file paths that should be tracked
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self.scan_directory, max_files)
+
+    async def find_missing_files_async(self, existing_paths: Set[str]) -> List[Path]:
+        """
+        Async version of find_missing_files running in a thread.
+
+        Args:
+            existing_paths: Set of relative file paths that already have descriptions
+
+        Returns:
+            List of file paths that are missing descriptions
+        """
+        loop = asyncio.get_running_loop()
+        return await loop.run_in_executor(None, self.find_missing_files, existing_paths)
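Both wrappers use the standard recipe for exposing a blocking scan to the event loop: `run_in_executor(None, fn, *args)` runs the sync method on the default thread pool and awaits the result. A minimal standalone version (positional arguments pass through directly; keyword arguments would need `functools.partial`):

```python
import asyncio
import time
from typing import List

def slow_scan(limit: int) -> List[str]:
    time.sleep(0.2)                    # stands in for blocking filesystem work
    return [f"file_{i}.py" for i in range(limit)]

async def slow_scan_async(limit: int) -> List[str]:
    loop = asyncio.get_running_loop()
    # None selects the loop's default ThreadPoolExecutor.
    return await loop.run_in_executor(None, slow_scan, limit)

print(asyncio.run(slow_scan_async(3)))   # ['file_0.py', 'file_1.py', 'file_2.py']
```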
mcp_code_indexer/main.py
CHANGED
@@ -377,6 +377,8 @@ async def handle_runcommand(args: argparse.Namespace) -> None:
         "get_word_frequency": server._handle_get_word_frequency,
         "search_codebase_overview": server._handle_search_codebase_overview,
         "check_database_health": server._handle_check_database_health,
+        "enabled_vector_mode": server._handle_enabled_vector_mode,
+        "find_similar_code": server._handle_find_similar_code,
     }
 
     if tool_name not in tool_handlers:
@@ -1017,41 +1019,49 @@ async def main() -> None:
             from .vector_mode import is_vector_mode_available, check_api_keys
             from .vector_mode.config import load_vector_config
             from .vector_mode.daemon import start_vector_daemon
-
+
             # Check if vector mode is available
             if not is_vector_mode_available():
-                logger.error(
+                logger.error(
+                    "Vector mode dependencies not found. Try reinstalling: pip install --upgrade mcp-code-indexer"
+                )
                 sys.exit(1)
-
+
             # Check API keys
             api_keys = check_api_keys()
             if not all(api_keys.values()):
                 missing = [k for k, v in api_keys.items() if not v]
-                logger.error(
+                logger.error(
+                    f"Missing API keys for vector mode: {', '.join(missing)}"
+                )
                 sys.exit(1)
-
+
             # Load vector configuration
-            vector_config_path =
+            vector_config_path = (
+                Path(args.vector_config).expanduser()
+                if args.vector_config
+                else None
+            )
             vector_config = load_vector_config(vector_config_path)
-
+
             logger.info(
-                "Vector mode enabled",
+                "Vector mode enabled",
                 extra={
                     "structured_data": {
                         "embedding_model": vector_config.embedding_model,
                         "batch_size": vector_config.batch_size,
                         "daemon_enabled": vector_config.daemon_enabled,
                     }
-                }
+                },
             )
-
+
             # Start vector daemon in background
             if vector_config.daemon_enabled:
                 vector_daemon_task = asyncio.create_task(
                     start_vector_daemon(vector_config_path, db_path, cache_dir)
                )
                 logger.info("Vector daemon started")
-
+
         except Exception as e:
             logger.error(f"Failed to initialize vector mode: {e}")
             sys.exit(1)
@@ -1100,27 +1110,26 @@ async def main() -> None:
         if args.vector and vector_daemon_task:
             # Setup signal handling for graceful shutdown
             shutdown_event = asyncio.Event()
-
+
             def signal_handler():
                 logger.info("Shutdown signal received")
                 shutdown_event.set()
-
+
             # Register signal handlers
             loop = asyncio.get_running_loop()
             for sig in [signal.SIGTERM, signal.SIGINT]:
                 loop.add_signal_handler(sig, signal_handler)
-
+
             # Run server and wait for shutdown signal
             server_task = asyncio.create_task(server.run())
             shutdown_task = asyncio.create_task(shutdown_event.wait())
-
+
             try:
                 # Wait for either server completion or shutdown signal
                 done, pending = await asyncio.wait(
-                    [server_task, shutdown_task],
-                    return_when=asyncio.FIRST_COMPLETED
+                    [server_task, shutdown_task], return_when=asyncio.FIRST_COMPLETED
                 )
-
+
                 # Cancel remaining tasks
                 for task in pending:
                     task.cancel()
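The shutdown wiring races `server.run()` against a signal-triggered event via `asyncio.wait(..., return_when=FIRST_COMPLETED)`, then cancels whichever task lost. A condensed, runnable sketch of the same pattern (Unix only; `add_signal_handler` is not implemented on Windows event loops):

```python
import asyncio
import signal

async def serve() -> None:
    await asyncio.sleep(2)             # stands in for server.run()

async def main() -> None:
    shutdown_event = asyncio.Event()
    loop = asyncio.get_running_loop()
    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, shutdown_event.set)

    server_task = asyncio.create_task(serve())
    shutdown_task = asyncio.create_task(shutdown_event.wait())
    done, pending = await asyncio.wait(
        [server_task, shutdown_task], return_when=asyncio.FIRST_COMPLETED
    )
    for task in pending:               # cancel whichever lost the race
        task.cancel()
    await asyncio.gather(*pending, return_exceptions=True)

asyncio.run(main())
```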
@@ -1128,7 +1137,7 @@
                     try:
                         await task
                     except asyncio.CancelledError:
                         pass
-
+
             except Exception as e:
                 logger.error(f"Error during server execution: {e}")
                 raise
@@ -1144,17 +1153,21 @@ async def main() -> None:
         if vector_daemon_task and not vector_daemon_task.done():
             logger.info("Cancelling vector daemon")
             vector_daemon_task.cancel()
-
+
         # Wait for vector daemon to finish
         if vector_daemon_task:
             try:
                 await vector_daemon_task
             except asyncio.CancelledError:
                 logger.info("Vector daemon cancelled successfully")
-
+
         # Clean up any remaining asyncio tasks to prevent hanging
         current_task = asyncio.current_task()
-        tasks = [
+        tasks = [
+            task
+            for task in asyncio.all_tasks()
+            if not task.done() and task is not current_task
+        ]
         if tasks:
             logger.info(f"Cancelling {len(tasks)} remaining tasks")
             for task in tasks:
@@ -1163,22 +1176,21 @@
             # Wait for cancellation but don't wait forever
             try:
                 await asyncio.wait_for(
-                    asyncio.gather(*tasks, return_exceptions=True),
-                    timeout=2.0
+                    asyncio.gather(*tasks, return_exceptions=True), timeout=2.0
                 )
             except asyncio.TimeoutError:
                 logger.warning("Some tasks did not cancel within timeout")
-
+
         # Force close any remaining connections and cleanup resources
         try:
             # Give a moment for final cleanup
             await asyncio.sleep(0.1)
-
+
             # Shutdown the event loop executor to stop any background threads
             loop = asyncio.get_running_loop()
-            if hasattr(loop,
+            if hasattr(loop, "_default_executor") and loop._default_executor:
                 loop._default_executor.shutdown(wait=False)
-
+
         except Exception as e:
             logger.warning(f"Error during final cleanup: {e}")
 
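The cleanup sequence in `main()` — enumerate stragglers, cancel them, then bound the wait with `wait_for(gather(...), timeout)` — is reusable on its own. A compact sketch under the same structure:

```python
import asyncio

async def cancel_stragglers(timeout: float = 2.0) -> None:
    current = asyncio.current_task()
    tasks = [t for t in asyncio.all_tasks() if not t.done() and t is not current]
    for t in tasks:
        t.cancel()
    try:
        # return_exceptions=True collects CancelledError per task instead of
        # aborting the gather; wait_for bounds how long shutdown can hang.
        await asyncio.wait_for(asyncio.gather(*tasks, return_exceptions=True), timeout)
    except asyncio.TimeoutError:
        print("some tasks did not cancel within timeout")

async def main() -> None:
    asyncio.create_task(asyncio.sleep(60))   # a straggler to clean up
    await cancel_stragglers()

asyncio.run(main())
```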
@@ -1202,14 +1214,15 @@ def cli_main() -> None:
|
|
|
1202
1214
|
# Force cleanup of any remaining resources to prevent hanging
|
|
1203
1215
|
import threading
|
|
1204
1216
|
import time
|
|
1205
|
-
|
|
1217
|
+
|
|
1206
1218
|
# Give main threads a moment to finish
|
|
1207
1219
|
time.sleep(0.1)
|
|
1208
|
-
|
|
1220
|
+
|
|
1209
1221
|
# Force exit if daemon threads are preventing shutdown
|
|
1210
1222
|
active_threads = threading.active_count()
|
|
1211
1223
|
if active_threads > 1: # More than just the main thread
|
|
1212
1224
|
import os
|
|
1225
|
+
|
|
1213
1226
|
os._exit(0)
|
|
1214
1227
|
|
|
1215
1228
|
|