mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff compares the contents of two package versions publicly released to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Files changed (26)
  1. mcp_code_indexer/database/models.py +125 -1
  2. mcp_code_indexer/main.py +60 -0
  3. mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
  4. mcp_code_indexer/server/mcp_server.py +3 -0
  5. mcp_code_indexer/vector_mode/__init__.py +36 -0
  6. mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
  9. mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
  10. mcp_code_indexer/vector_mode/config.py +167 -0
  11. mcp_code_indexer/vector_mode/daemon.py +335 -0
  12. mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
  13. mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
  14. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
  15. mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
  16. mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
  17. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
  18. mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
  19. mcp_code_indexer/vector_mode/security/__init__.py +11 -0
  20. mcp_code_indexer/vector_mode/security/patterns.py +297 -0
  21. mcp_code_indexer/vector_mode/security/redactor.py +368 -0
  22. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
  23. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
  24. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
  25. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
  26. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
mcp_code_indexer/database/models.py CHANGED
@@ -7,7 +7,8 @@ the database operations.
 """
 
 from datetime import datetime
-from typing import List, Optional
+from typing import List, Optional, Dict, Any
+from enum import Enum
 
 from pydantic import BaseModel, Field
 
@@ -185,6 +186,129 @@ class WordFrequencyResult(BaseModel):
     total_unique_terms: int = Field(..., description="Number of unique terms found")
 
 
+# Vector Mode Models
+
+class ChunkType(str, Enum):
+    """Types of code chunks for semantic analysis."""
+    FUNCTION = "function"
+    CLASS = "class"
+    METHOD = "method"
+    IMPORT = "import"
+    DOCSTRING = "docstring"
+    COMMENT = "comment"
+    VARIABLE = "variable"
+    INTERFACE = "interface"
+    TYPE_DEFINITION = "type_definition"
+    MODULE = "module"
+    NAMESPACE = "namespace"
+    GENERIC = "generic"
+
+class NodeType(str, Enum):
+    """Types of nodes in Merkle tree."""
+    FILE = "file"
+    DIRECTORY = "directory"
+    PROJECT = "project"
+
+class SyncStatus(str, Enum):
+    """Vector index synchronization status."""
+    PENDING = "pending"
+    IN_PROGRESS = "in_progress"
+    COMPLETED = "completed"
+    FAILED = "failed"
+    PAUSED = "paused"
+
+class CodeChunk(BaseModel):
+    """
+    Represents a semantic chunk of code extracted from a file.
+
+    Used for embedding generation and vector search operations.
+    """
+
+    id: Optional[int] = Field(None, description="Database ID")
+    file_id: int = Field(..., description="Reference to FileDescription")
+    project_id: str = Field(..., description="Reference to project")
+    chunk_type: ChunkType = Field(..., description="Type of code chunk")
+    name: Optional[str] = Field(None, description="Name of function/class/etc")
+    start_line: int = Field(..., description="Starting line number")
+    end_line: int = Field(..., description="Ending line number")
+    content_hash: str = Field(..., description="SHA-256 hash of chunk content")
+    embedding_id: Optional[str] = Field(None, description="Vector database ID")
+    redacted: bool = Field(default=False, description="Whether content was redacted")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
+    created: datetime = Field(default_factory=datetime.utcnow, description="Creation timestamp")
+    last_modified: datetime = Field(default_factory=datetime.utcnow, description="Last update timestamp")
+
+class MerkleNode(BaseModel):
+    """
+    Represents a node in the Merkle tree for change detection.
+
+    Used to efficiently detect file system changes without scanning entire directory trees.
+    """
+
+    id: Optional[int] = Field(None, description="Database ID")
+    project_id: str = Field(..., description="Reference to project")
+    path: str = Field(..., description="File/directory path relative to project root")
+    hash: str = Field(..., description="SHA-256 hash of content or children")
+    node_type: NodeType = Field(..., description="Type of filesystem node")
+    parent_path: Optional[str] = Field(None, description="Path to parent directory")
+    children_hash: Optional[str] = Field(None, description="Combined hash of children")
+    last_modified: datetime = Field(default_factory=datetime.utcnow, description="Last update timestamp")
+
+class IndexMeta(BaseModel):
+    """
+    Metadata about vector indexing progress and status for a project.
+
+    Tracks indexing state, statistics, and synchronization status.
+    """
+
+    id: Optional[int] = Field(None, description="Database ID")
+    project_id: str = Field(..., description="Reference to project", unique=True)
+    total_chunks: int = Field(default=0, description="Total number of chunks")
+    indexed_chunks: int = Field(default=0, description="Number of chunks with embeddings")
+    total_files: int = Field(default=0, description="Total number of files")
+    indexed_files: int = Field(default=0, description="Number of files processed")
+    last_sync: Optional[datetime] = Field(None, description="Last successful sync timestamp")
+    sync_status: SyncStatus = Field(default=SyncStatus.PENDING, description="Current sync status")
+    error_message: Optional[str] = Field(None, description="Last error message")
+    queue_depth: int = Field(default=0, description="Number of pending tasks")
+    processing_rate: float = Field(default=0.0, description="Files per second processing rate")
+    estimated_completion: Optional[datetime] = Field(None, description="Estimated completion time")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
+    created: datetime = Field(default_factory=datetime.utcnow, description="Creation timestamp")
+    last_modified: datetime = Field(default_factory=datetime.utcnow, description="Last update timestamp")
+
+class VectorSearchResult(BaseModel):
+    """
+    Represents a vector search result with similarity scoring.
+    """
+
+    file_path: str = Field(..., description="Path to the matching file")
+    chunk_name: Optional[str] = Field(None, description="Name of the code chunk")
+    chunk_type: ChunkType = Field(..., description="Type of code chunk")
+    code_snippet: str = Field(..., description="Original code content")
+    start_line: int = Field(..., description="Starting line number")
+    end_line: int = Field(..., description="Ending line number")
+    similarity_score: float = Field(..., description="Cosine similarity score")
+    project_id: str = Field(..., description="Project identifier")
+    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
+
+class VectorIndexStatus(BaseModel):
+    """
+    Current status of vector indexing for a project.
+    """
+
+    is_indexing: bool = Field(..., description="Whether indexing is currently active")
+    indexed_files: int = Field(..., description="Number of files indexed")
+    total_files: int = Field(..., description="Total number of files")
+    indexed_chunks: int = Field(..., description="Number of chunks indexed")
+    total_chunks: int = Field(..., description="Total number of chunks")
+    last_sync: Optional[datetime] = Field(None, description="Last sync timestamp")
+    sync_status: SyncStatus = Field(..., description="Current sync status")
+    queue_depth: int = Field(..., description="Number of pending tasks")
+    processing_rate: float = Field(..., description="Processing rate")
+    estimated_completion: Optional[datetime] = Field(None, description="Estimated completion time")
+    error_message: Optional[str] = Field(None, description="Last error message")
+
 # Enable forward references for recursive models
 FolderNode.model_rebuild()
 CodebaseOverview.model_rebuild()
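
Note: the classes above are plain Pydantic models and can be constructed directly; a minimal sketch (import path taken from the file list above, placeholder values):

from mcp_code_indexer.database.models import ChunkType, CodeChunk, SyncStatus

chunk = CodeChunk(
    file_id=1,                          # placeholder FileDescription id
    project_id="example-project",       # placeholder project id
    chunk_type=ChunkType.FUNCTION,
    name="parse_arguments",
    start_line=151,
    end_line=169,
    content_hash="0" * 64,              # placeholder SHA-256 hex digest
)
print(chunk.chunk_type.value, chunk.redacted, chunk.metadata)  # function False {}
print(SyncStatus.PENDING.value)  # "pending", the initial sync_status in index_meta
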
mcp_code_indexer/main.py CHANGED
@@ -151,6 +151,19 @@ def parse_arguments() -> argparse.Namespace:
         help="Allowed CORS origins for HTTP transport (default: allow all)",
     )
 
+    # Vector mode options
+    parser.add_argument(
+        "--vector",
+        action="store_true",
+        help="Enable vector mode with semantic search capabilities (requires vector extras)",
+    )
+
+    parser.add_argument(
+        "--vector-config",
+        type=str,
+        help="Path to vector mode configuration file",
+    )
+
     return parser.parse_args()
 
 
@@ -996,6 +1009,52 @@ async def main() -> None:
     )
 
     try:
+        # Handle vector mode initialization
+        vector_daemon_task = None
+        if args.vector:
+            try:
+                from .vector_mode import is_vector_mode_available, check_api_keys
+                from .vector_mode.config import load_vector_config
+                from .vector_mode.daemon import start_vector_daemon
+
+                # Check if vector mode is available
+                if not is_vector_mode_available():
+                    logger.error("Vector mode requires additional dependencies. Install with: pip install mcp-code-indexer[vector]")
+                    sys.exit(1)
+
+                # Check API keys
+                api_keys = check_api_keys()
+                if not all(api_keys.values()):
+                    missing = [k for k, v in api_keys.items() if not v]
+                    logger.error(f"Missing API keys for vector mode: {', '.join(missing)}")
+                    sys.exit(1)
+
+                # Load vector configuration
+                vector_config_path = Path(args.vector_config).expanduser() if args.vector_config else None
+                vector_config = load_vector_config(vector_config_path)
+
+                logger.info(
+                    "Vector mode enabled",
+                    extra={
+                        "structured_data": {
+                            "embedding_model": vector_config.embedding_model,
+                            "batch_size": vector_config.batch_size,
+                            "daemon_enabled": vector_config.daemon_enabled,
+                        }
+                    }
+                )
+
+                # Start vector daemon in background
+                if vector_config.daemon_enabled:
+                    vector_daemon_task = asyncio.create_task(
+                        start_vector_daemon(vector_config_path, db_path, cache_dir)
+                    )
+                    logger.info("Vector daemon started")
+
+            except Exception as e:
+                logger.error(f"Failed to initialize vector mode: {e}")
+                sys.exit(1)
+
         # Import and run the MCP server
         from .server.mcp_server import MCPCodeIndexServer
 
@@ -1028,6 +1087,7 @@ async def main() -> None:
             db_path=db_path,
             cache_dir=cache_dir,
             transport=transport,
+            vector_mode=args.vector,
         )
 
         # Set server instance in transport after server creation
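
Note: with these flags, vector mode is opted into at startup (`--vector`, optionally `--vector-config PATH`), presumably via the package's console script after installing the `[vector]` extra named in the error message above. A pre-flight sketch that mirrors the same gate main() applies; the function and attribute names are taken from the imports and the logging call in this hunk:

from mcp_code_indexer.vector_mode import is_vector_mode_available, check_api_keys
from mcp_code_indexer.vector_mode.config import load_vector_config

# Same checks main() performs before starting the daemon.
if not is_vector_mode_available():
    raise SystemExit("Install the vector extras: pip install mcp-code-indexer[vector]")

missing = [name for name, present in check_api_keys().items() if not present]
if missing:
    raise SystemExit(f"Missing API keys for vector mode: {', '.join(missing)}")

config = load_vector_config(None)  # None, as in main() when --vector-config is not given
print(config.embedding_model, config.batch_size, config.daemon_enabled)
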
mcp_code_indexer/migrations/006_vector_mode.sql ADDED
@@ -0,0 +1,189 @@
+-- Migration 006: Add vector mode tables and indexes
+-- This migration adds support for semantic search capabilities with embeddings
+-- Includes code chunks, Merkle tree nodes, and indexing metadata
+
+-- Ensure WAL mode is enabled for safe migrations
+PRAGMA journal_mode=WAL;
+
+-- Temporarily disable foreign key constraints for migration
+PRAGMA foreign_keys=OFF;
+
+-- Start transaction for atomic migration
+BEGIN TRANSACTION;
+
+-- Create code_chunks table for storing semantic code chunks
+CREATE TABLE code_chunks (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    file_id INTEGER NOT NULL,
+    project_id TEXT NOT NULL,
+    chunk_type TEXT NOT NULL DEFAULT 'generic', -- function, class, method, import, etc.
+    name TEXT, -- Name of function/class/etc, can be NULL for generic chunks
+    start_line INTEGER NOT NULL,
+    end_line INTEGER NOT NULL,
+    content_hash TEXT NOT NULL, -- SHA-256 hash of chunk content
+    embedding_id TEXT, -- ID in vector database (Turbopuffer)
+    redacted BOOLEAN DEFAULT FALSE, -- Whether content was redacted for security
+    metadata TEXT DEFAULT '{}', -- JSON metadata about the chunk
+    created DATETIME DEFAULT CURRENT_TIMESTAMP,
+    last_modified DATETIME DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (file_id) REFERENCES file_descriptions(id) ON DELETE CASCADE,
+    FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE
+);
+
+-- Create indexes for code_chunks table
+CREATE INDEX idx_code_chunks_file_id ON code_chunks(file_id);
+CREATE INDEX idx_code_chunks_project_id ON code_chunks(project_id);
+CREATE INDEX idx_code_chunks_chunk_type ON code_chunks(chunk_type);
+CREATE INDEX idx_code_chunks_content_hash ON code_chunks(content_hash);
+CREATE INDEX idx_code_chunks_embedding_id ON code_chunks(embedding_id);
+CREATE INDEX idx_code_chunks_last_modified ON code_chunks(last_modified);
+CREATE INDEX idx_code_chunks_redacted ON code_chunks(redacted);
+
+-- Create merkle_nodes table for efficient change detection
+CREATE TABLE merkle_nodes (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    project_id TEXT NOT NULL,
+    path TEXT NOT NULL, -- File/directory path relative to project root
+    hash TEXT NOT NULL, -- SHA-256 hash of content or children
+    node_type TEXT NOT NULL DEFAULT 'file', -- file, directory, project
+    parent_path TEXT, -- Path to parent directory, NULL for root
+    children_hash TEXT, -- Combined hash of children for directories
+    last_modified DATETIME DEFAULT CURRENT_TIMESTAMP,
+    UNIQUE(project_id, path),
+    FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE
+);
+
+-- Create indexes for merkle_nodes table
+CREATE INDEX idx_merkle_nodes_project_id ON merkle_nodes(project_id);
+CREATE INDEX idx_merkle_nodes_path ON merkle_nodes(path);
+CREATE INDEX idx_merkle_nodes_hash ON merkle_nodes(hash);
+CREATE INDEX idx_merkle_nodes_node_type ON merkle_nodes(node_type);
+CREATE INDEX idx_merkle_nodes_parent_path ON merkle_nodes(parent_path);
+CREATE INDEX idx_merkle_nodes_last_modified ON merkle_nodes(last_modified);
+
+-- Create index_meta table for tracking vector indexing progress
+CREATE TABLE index_meta (
+    id INTEGER PRIMARY KEY AUTOINCREMENT,
+    project_id TEXT NOT NULL UNIQUE,
+    total_chunks INTEGER DEFAULT 0,
+    indexed_chunks INTEGER DEFAULT 0,
+    total_files INTEGER DEFAULT 0,
+    indexed_files INTEGER DEFAULT 0,
+    last_sync DATETIME,
+    sync_status TEXT DEFAULT 'pending', -- pending, in_progress, completed, failed, paused
+    error_message TEXT,
+    queue_depth INTEGER DEFAULT 0,
+    processing_rate REAL DEFAULT 0.0, -- Files per second
+    estimated_completion DATETIME,
+    metadata TEXT DEFAULT '{}', -- JSON metadata
+    created DATETIME DEFAULT CURRENT_TIMESTAMP,
+    last_modified DATETIME DEFAULT CURRENT_TIMESTAMP,
+    FOREIGN KEY (project_id) REFERENCES projects(id) ON DELETE CASCADE
+);
+
+-- Create indexes for index_meta table
+CREATE INDEX idx_index_meta_project_id ON index_meta(project_id);
+CREATE INDEX idx_index_meta_sync_status ON index_meta(sync_status);
+CREATE INDEX idx_index_meta_last_sync ON index_meta(last_sync);
+CREATE INDEX idx_index_meta_last_modified ON index_meta(last_modified);
+
+-- Add vector_mode column to projects table to track which projects use vector search
+ALTER TABLE projects ADD COLUMN vector_mode BOOLEAN DEFAULT FALSE;
+CREATE INDEX idx_projects_vector_mode ON projects(vector_mode);
+
+-- Create triggers to maintain consistency between file_descriptions and code_chunks
+CREATE TRIGGER code_chunks_cleanup_on_file_delete
+AFTER DELETE ON file_descriptions
+BEGIN
+    DELETE FROM code_chunks WHERE file_id = OLD.id;
+END;
+
+-- Create triggers to update index_meta when chunks are added/removed
+CREATE TRIGGER update_index_meta_on_chunk_insert
+AFTER INSERT ON code_chunks
+BEGIN
+    INSERT OR REPLACE INTO index_meta (
+        project_id, total_chunks, indexed_chunks, total_files, indexed_files, last_modified
+    )
+    SELECT
+        NEW.project_id,
+        COUNT(*) as total_chunks,
+        COUNT(embedding_id) as indexed_chunks,
+        (SELECT COUNT(DISTINCT file_id) FROM code_chunks WHERE project_id = NEW.project_id) as total_files,
+        (SELECT COUNT(DISTINCT file_id) FROM code_chunks WHERE project_id = NEW.project_id AND embedding_id IS NOT NULL) as indexed_files,
+        CURRENT_TIMESTAMP
+    FROM code_chunks
+    WHERE project_id = NEW.project_id;
+END;
+
+CREATE TRIGGER update_index_meta_on_chunk_update
+AFTER UPDATE ON code_chunks
+BEGIN
+    UPDATE index_meta SET
+        indexed_chunks = (
+            SELECT COUNT(*) FROM code_chunks
+            WHERE project_id = NEW.project_id AND embedding_id IS NOT NULL
+        ),
+        indexed_files = (
+            SELECT COUNT(DISTINCT file_id) FROM code_chunks
+            WHERE project_id = NEW.project_id AND embedding_id IS NOT NULL
+        ),
+        last_modified = CURRENT_TIMESTAMP
+    WHERE project_id = NEW.project_id;
+END;
+
+CREATE TRIGGER update_index_meta_on_chunk_delete
+AFTER DELETE ON code_chunks
+BEGIN
+    UPDATE index_meta SET
+        total_chunks = (
+            SELECT COUNT(*) FROM code_chunks
+            WHERE project_id = OLD.project_id
+        ),
+        indexed_chunks = (
+            SELECT COUNT(*) FROM code_chunks
+            WHERE project_id = OLD.project_id AND embedding_id IS NOT NULL
+        ),
+        total_files = (
+            SELECT COUNT(DISTINCT file_id) FROM code_chunks
+            WHERE project_id = OLD.project_id
+        ),
+        indexed_files = (
+            SELECT COUNT(DISTINCT file_id) FROM code_chunks
+            WHERE project_id = OLD.project_id AND embedding_id IS NOT NULL
+        ),
+        last_modified = CURRENT_TIMESTAMP
+    WHERE project_id = OLD.project_id;
+END;
+
+-- Create view for vector search results with file information
+CREATE VIEW vector_search_view AS
+SELECT
+    cc.id as chunk_id,
+    cc.file_id,
+    fd.file_path,
+    cc.chunk_type,
+    cc.name as chunk_name,
+    cc.start_line,
+    cc.end_line,
+    cc.content_hash,
+    cc.embedding_id,
+    cc.redacted,
+    cc.metadata as chunk_metadata,
+    cc.project_id,
+    p.name as project_name,
+    fd.description as file_description,
+    cc.created as chunk_created,
+    cc.last_modified as chunk_modified,
+    fd.last_modified as file_modified
+FROM code_chunks cc
+JOIN file_descriptions fd ON cc.file_id = fd.id
+JOIN projects p ON cc.project_id = p.id
+WHERE cc.embedding_id IS NOT NULL
+    AND fd.to_be_cleaned IS NULL;
+
+-- Re-enable foreign key constraints
+PRAGMA foreign_keys=ON;
+
+-- Commit the migration
+COMMIT;
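
Note: once migration 006 has been applied, sync progress and indexed chunks can be inspected straight from SQLite; a read-only sketch (the database path is the default shown in mcp_server.py below, the tables and view are those created above):

import sqlite3
from pathlib import Path

# Default location used by MCPCodeIndexServer when no db_path is given.
db = sqlite3.connect(Path.home() / ".mcp-code-index" / "tracker.db")

# Per-project sync state maintained by the triggers above.
for row in db.execute(
    "SELECT project_id, sync_status, indexed_chunks, total_chunks FROM index_meta"
):
    print(row)

# Chunks that already have embeddings, via the convenience view.
for row in db.execute(
    "SELECT file_path, chunk_name, chunk_type, start_line, end_line "
    "FROM vector_search_view LIMIT 5"
):
    print(row)

db.close()
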
mcp_code_indexer/server/mcp_server.py CHANGED
@@ -63,6 +63,7 @@ class MCPCodeIndexServer:
         retry_max_wait: float = 2.0,
         retry_jitter: float = 0.2,
         transport: Optional[Any] = None,
+        vector_mode: bool = False,
     ):
         """
         Initialize the MCP Code Index Server.
@@ -80,10 +81,12 @@ class MCPCodeIndexServer:
             retry_max_wait: Maximum wait time between retries in seconds
             retry_jitter: Maximum jitter to add to retry delays in seconds
             transport: Optional transport instance (if None, uses default stdio)
+            vector_mode: Enable vector search capabilities and tools
         """
         self.token_limit = token_limit
         self.db_path = db_path or Path.home() / ".mcp-code-index" / "tracker.db"
         self.cache_dir = cache_dir or Path.home() / ".mcp-code-index" / "cache"
+        self.vector_mode = vector_mode
 
         # Store database configuration
         self.db_config = {
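
Note: the new keyword is simply stored on the instance here; a hedged construction sketch, assuming the constructor parameters omitted below all have defaults (this diff does not show them):

from mcp_code_indexer.server.mcp_server import MCPCodeIndexServer

# Assumption: token_limit and the other parameters not passed here have defaults.
server = MCPCodeIndexServer(vector_mode=True)
print(server.vector_mode)  # True; db_path/cache_dir fall back to ~/.mcp-code-index/
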
mcp_code_indexer/vector_mode/__init__.py ADDED
@@ -0,0 +1,36 @@
+"""
+Vector Mode for MCP Code Indexer.
+
+This package provides semantic search capabilities using embeddings and vector databases.
+Includes automated file monitoring, AST-based code chunking, and secure embedding generation.
+"""
+
+from typing import Optional
+from pathlib import Path
+import os
+
+__version__ = "1.0.0"
+
+def is_vector_mode_available() -> bool:
+    """Check if vector mode dependencies are available."""
+    try:
+        import voyage
+        import turbopuffer
+        import tree_sitter
+        import watchdog
+        return True
+    except ImportError:
+        return False
+
+def get_vector_config_path() -> Path:
+    """Get path to vector mode configuration."""
+    config_dir = Path.home() / ".mcp-code-index" / "vector"
+    config_dir.mkdir(parents=True, exist_ok=True)
+    return config_dir / "config.yaml"
+
+def check_api_keys() -> dict[str, bool]:
+    """Check availability of required API keys."""
+    return {
+        "voyage": os.getenv("VOYAGE_API_KEY") is not None,
+        "turbopuffer": os.getenv("TURBOPUFFER_API_KEY") is not None,
+    }
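
Note: these helpers are importable at package level; a quick sketch of their return values (results depend on the local environment and on whether the vector extras are installed):

from mcp_code_indexer.vector_mode import check_api_keys, get_vector_config_path

# Ensures ~/.mcp-code-index/vector/ exists and returns the config.yaml path inside it.
print(get_vector_config_path())

# Maps provider name to whether its environment variable is set.
print(check_api_keys())  # e.g. {"voyage": False, "turbopuffer": False}
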
mcp_code_indexer/vector_mode/chunking/__init__.py ADDED
@@ -0,0 +1,19 @@
+"""
+AST-based code chunking for vector mode.
+
+Provides semantic code chunking using Tree-sitter parsers to extract
+meaningful code units for embedding generation.
+"""
+
+from .ast_chunker import ASTChunker, CodeChunk
+from .language_handlers import LanguageHandler, get_language_handler
+from .chunk_optimizer import ChunkOptimizer, OptimizedChunk
+
+__all__ = [
+    "ASTChunker",
+    "CodeChunk",
+    "LanguageHandler",
+    "get_language_handler",
+    "ChunkOptimizer",
+    "OptimizedChunk",
+]