mcp-code-indexer: 4.2.14 → 4.2.16 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/database.py +251 -85
  2. mcp_code_indexer/database/models.py +66 -24
  3. mcp_code_indexer/database/retry_executor.py +15 -5
  4. mcp_code_indexer/file_scanner.py +107 -12
  5. mcp_code_indexer/main.py +75 -23
  6. mcp_code_indexer/server/mcp_server.py +191 -1
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
  9. mcp_code_indexer/vector_mode/config.py +113 -45
  10. mcp_code_indexer/vector_mode/const.py +24 -0
  11. mcp_code_indexer/vector_mode/daemon.py +860 -98
  12. mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
  13. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
  14. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
  15. mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
  16. mcp_code_indexer/vector_mode/services/__init__.py +9 -0
  17. mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
  18. mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
  19. mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
  20. mcp_code_indexer/vector_mode/types.py +46 -0
  21. mcp_code_indexer/vector_mode/utils.py +50 -0
  22. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
  23. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
  24. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
  25. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
  26. {mcp_code_indexer-4.2.14.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
mcp_code_indexer/vector_mode/config.py

@@ -11,47 +11,78 @@ from pathlib import Path
 from typing import Optional
 import yaml

+from .const import MODEL_DIMENSIONS
+
+DEFAULT_EMBEDDING_MODEL = "voyage-code-2"
+
+
 @dataclass
 class VectorConfig:
     """Configuration for vector mode operations."""
-
+
     # API Configuration
     voyage_api_key: Optional[str] = None
     turbopuffer_api_key: Optional[str] = None
     turbopuffer_region: str = "gcp-europe-west3"
-
-    # Embedding Configuration
-    embedding_model: str = "voyage-code-2"
+
+    # Embedding Configuration
+    embedding_model: str = DEFAULT_EMBEDDING_MODEL
     batch_size: int = 128
     max_tokens_per_chunk: int = 1024
-
+    voyage_batch_size_limit: int = 1000
+    voyage_max_tokens_per_batch: int = 120000
+
     # Search Configuration
     similarity_threshold: float = 0.5
     max_search_results: int = 20
     enable_recency_boost: bool = True
-
+
     # Chunking Configuration
     max_chunk_size: int = 1500
     chunk_overlap: int = 100
     prefer_semantic_chunks: bool = True
-
+
     # File Monitoring Configuration
     watch_debounce_ms: int = 100
-    ignore_patterns: list[str] = field(default_factory=lambda: [
-        "*.log", "*.tmp", "*~", ".git/*", "__pycache__/*", "node_modules/*",
-        "*.pyc", "*.pyo", ".DS_Store", "Thumbs.db"
-    ])
-
+    ignore_patterns: list[str] = field(
+        default_factory=lambda: [
+            "*.log",
+            "*.tmp",
+            "*~",
+            ".git/*",
+            "__pycache__/*",
+            "node_modules/*",
+            "*.pyc",
+            "*.pyo",
+            ".DS_Store",
+            "Thumbs.db",
+            ".vscode/*",
+            ".coverage",
+            ".ruff_cache/*",
+            ".mypy_cache/*",
+            ".pytest_cache/*",
+            ".import_linter_cache/*",
+            "*/tiktoken_cache/*",
+            ".code-index/*",
+            ".hypothesis/*",
+            "poetry.lock",
+            "venv/*",
+            "htmlcov/*",
+        ]
+    )
+
     # Daemon Configuration
     daemon_enabled: bool = True
     daemon_poll_interval: int = 5
     max_queue_size: int = 1000
     worker_count: int = 3
-
+    max_concurrent_files: int = 5
+    max_concurrent_batches: int = 5
+
     # Security Configuration
     redact_secrets: bool = True
     redaction_patterns_file: Optional[str] = None
-
+
     @classmethod
     def from_env(cls) -> "VectorConfig":
         """Create config from environment variables."""
@@ -59,12 +90,19 @@ class VectorConfig:
             voyage_api_key=os.getenv("VOYAGE_API_KEY"),
             turbopuffer_api_key=os.getenv("TURBOPUFFER_API_KEY"),
             turbopuffer_region=os.getenv("TURBOPUFFER_REGION", "gcp-europe-west3"),
-            embedding_model=os.getenv("VECTOR_EMBEDDING_MODEL", "voyage-code-3"),
+            embedding_model=os.getenv(
+                "VECTOR_EMBEDDING_MODEL", DEFAULT_EMBEDDING_MODEL
+            ),
+            voyage_batch_size_limit=int(os.getenv("VOYAGE_BATCH_SIZE_LIMIT", "1000")),
+            voyage_max_tokens_per_batch=int(
+                os.getenv("VOYAGE_MAX_TOKENS_PER_BATCH", "120000")
+            ),
             batch_size=int(os.getenv("VECTOR_BATCH_SIZE", "128")),
             max_tokens_per_chunk=int(os.getenv("VECTOR_MAX_TOKENS", "2048")),
             similarity_threshold=float(os.getenv("VECTOR_SIMILARITY_THRESHOLD", "0.5")),
             max_search_results=int(os.getenv("VECTOR_MAX_RESULTS", "20")),
-            enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower() == "true",
+            enable_recency_boost=os.getenv("VECTOR_RECENCY_BOOST", "true").lower()
+            == "true",
             max_chunk_size=int(os.getenv("VECTOR_CHUNK_SIZE", "1500")),
             chunk_overlap=int(os.getenv("VECTOR_CHUNK_OVERLAP", "100")),
             watch_debounce_ms=int(os.getenv("VECTOR_DEBOUNCE_MS", "100")),
@@ -72,68 +110,81 @@ class VectorConfig:
             daemon_poll_interval=int(os.getenv("VECTOR_POLL_INTERVAL", "5")),
             max_queue_size=int(os.getenv("VECTOR_MAX_QUEUE", "1000")),
             worker_count=int(os.getenv("VECTOR_WORKERS", "3")),
+            max_concurrent_files=int(os.getenv("VECTOR_MAX_CONCURRENT_FILES", "5")),
+            max_concurrent_batches=int(os.getenv("VECTOR_MAX_CONCURRENT_BATCHES", "5")),
             redact_secrets=os.getenv("VECTOR_REDACT_SECRETS", "true").lower() == "true",
         )
-
+
     @classmethod
     def from_file(cls, config_path: Path) -> "VectorConfig":
         """Load config from YAML file."""
         if not config_path.exists():
             return cls.from_env()
-
+
         try:
             with open(config_path, "r") as f:
                 data = yaml.safe_load(f) or {}
-
+
             # Merge with environment variables (env takes precedence)
             env_config = cls.from_env()
-
+
             # Update with file values only if env variable not set
             for key, value in data.items():
                 if hasattr(env_config, key):
                     env_value = getattr(env_config, key)
                     # Use file value if env value is None or default
-                    if env_value is None or (key == "voyage_api_key" and env_value is None):
+                    if env_value is None or (
+                        key == "voyage_api_key" and env_value is None
+                    ):
                         setattr(env_config, key, value)
-
+
             return env_config
-
+
         except Exception as e:
             raise ValueError(f"Failed to load config from {config_path}: {e}")
-
+
     def to_file(self, config_path: Path) -> None:
         """Save config to YAML file."""
         config_path.parent.mkdir(parents=True, exist_ok=True)
-
+
         # Don't save API keys to file for security
-        data = {
-            k: v for k, v in self.__dict__.items()
-            if not k.endswith("_api_key")
-        }
-
+        data = {k: v for k, v in self.__dict__.items() if not k.endswith("_api_key")}
+
         with open(config_path, "w") as f:
             yaml.dump(data, f, default_flow_style=False, sort_keys=True)
-
+
     def validate(self) -> list[str]:
         """Validate configuration and return list of errors."""
         errors = []
-
+
         if self.daemon_enabled:
             if not self.voyage_api_key:
-                errors.append("VOYAGE_API_KEY environment variable required for vector mode")
+                errors.append(
+                    "VOYAGE_API_KEY environment variable required for vector mode"
+                )
             if not self.turbopuffer_api_key:
-                errors.append("TURBOPUFFER_API_KEY environment variable required for vector mode")
-
+                errors.append(
+                    "TURBOPUFFER_API_KEY environment variable required for vector mode"
+                )
+
         # Validate TurboPuffer region
         supported_regions = [
-            'aws-ap-southeast-2', 'aws-eu-central-1', 'aws-us-east-1',
-            'aws-us-east-2', 'aws-us-west-2', 'gcp-us-central1',
-            'gcp-us-west1', 'gcp-us-east4', 'gcp-europe-west3'
+            "aws-ap-southeast-2",
+            "aws-eu-central-1",
+            "aws-us-east-1",
+            "aws-us-east-2",
+            "aws-us-west-2",
+            "gcp-us-central1",
+            "gcp-us-west1",
+            "gcp-us-east4",
+            "gcp-europe-west3",
         ]
         if self.turbopuffer_region not in supported_regions:
-            errors.append(f"turbopuffer_region '{self.turbopuffer_region}' is not supported. " +
-                          f"Supported regions: {', '.join(supported_regions)}")
-
+            errors.append(
+                f"turbopuffer_region '{self.turbopuffer_region}' is not supported. "
+                + f"Supported regions: {', '.join(supported_regions)}"
+            )
+
         if self.batch_size <= 0:
             errors.append("batch_size must be positive")
         if self.max_tokens_per_chunk <= 0:
@@ -148,20 +199,37 @@ class VectorConfig:
             errors.append("chunk_overlap cannot be negative")
         if self.worker_count <= 0:
             errors.append("worker_count must be positive")
-
+        if self.max_concurrent_files <= 0 or self.max_concurrent_files > 50:
+            errors.append("max_concurrent_files must be between 1 and 50")
+        if self.voyage_batch_size_limit <= 0 or self.voyage_batch_size_limit > 1000:
+            errors.append("voyage_batch_size_limit must be between 1 and 1000")
+        if (
+            self.voyage_max_tokens_per_batch <= 0
+            or self.voyage_max_tokens_per_batch > 120000
+        ):
+            errors.append("voyage_max_tokens_per_batch must be between 1 and 120000")
+
         return errors

+    def get_embedding_dimensions(self) -> int:
+        """Get the vector dimensions for the current embedding model."""
+        return MODEL_DIMENSIONS.get(
+            self.embedding_model, 1536
+        )  # Default to 1536 if model not found
+
+
 def load_vector_config(config_path: Optional[Path] = None) -> VectorConfig:
     """Load vector configuration from file or environment."""
     if config_path is None:
         from . import get_vector_config_path
+
         config_path = get_vector_config_path()
-
+
     config = VectorConfig.from_file(config_path)
-
+
     # Validate configuration
     errors = config.validate()
     if errors:
         raise ValueError(f"Invalid vector configuration: {'; '.join(errors)}")
-
+
     return config
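For orientation, here is a minimal sketch of how the new configuration surface in 4.2.16 might be exercised. It uses only names visible in this diff (VectorConfig, DEFAULT_EMBEDDING_MODEL, the VOYAGE_*/VECTOR_* environment variables, validate(), get_embedding_dimensions()); the placeholder API keys and the specific override values are illustrative, not recommendations.

import os

from mcp_code_indexer.vector_mode.config import VectorConfig

# Placeholder credentials purely for illustration; validate() requires both
# keys whenever daemon_enabled is True (the default).
os.environ["VOYAGE_API_KEY"] = "vk-placeholder"
os.environ["TURBOPUFFER_API_KEY"] = "tp-placeholder"

# New in 4.2.16: Voyage batching limits and daemon concurrency caps
# can be tuned from the environment.
os.environ["VOYAGE_BATCH_SIZE_LIMIT"] = "500"
os.environ["VOYAGE_MAX_TOKENS_PER_BATCH"] = "60000"
os.environ["VECTOR_MAX_CONCURRENT_FILES"] = "4"
os.environ["VECTOR_MAX_CONCURRENT_BATCHES"] = "2"

config = VectorConfig.from_env()

# validate() collects error strings rather than raising, so bad regions,
# out-of-range limits, and missing keys all surface at once.
errors = config.validate()
if errors:
    raise SystemExit("; ".join(errors))

# get_embedding_dimensions() resolves the vector width for the configured
# model via MODEL_DIMENSIONS, falling back to 1536 for unknown names.
print(config.embedding_model, config.get_embedding_dimensions())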
mcp_code_indexer/vector_mode/const.py (new file)

@@ -0,0 +1,24 @@
+"""
+Constants for Voyage AI providers.
+
+Contains model names, dimensions, and other constants used across
+Voyage AI provider implementations.
+"""
+
+
+class VoyageModel:
+    """Voyage AI model names."""
+
+    VOYAGE_CODE_2 = "voyage-code-2"
+    VOYAGE_2 = "voyage-2"
+    VOYAGE_LARGE_2 = "voyage-large-2"
+    VOYAGE_3 = "voyage-3"
+
+
+# Model dimensions mapping
+MODEL_DIMENSIONS = {
+    VoyageModel.VOYAGE_CODE_2: 1536,
+    VoyageModel.VOYAGE_2: 1024,
+    VoyageModel.VOYAGE_LARGE_2: 1536,
+    VoyageModel.VOYAGE_3: 1024,
+}
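A short sketch of how this table is consumed, mirroring the fallback used by VectorConfig.get_embedding_dimensions() above; the lookup of "voyage-code-3" (the old env default that 4.2.16 drops) is only there to show the 1536 default for names outside the mapping.

from mcp_code_indexer.vector_mode.const import MODEL_DIMENSIONS, VoyageModel

# Known models resolve to their native embedding width.
assert MODEL_DIMENSIONS[VoyageModel.VOYAGE_CODE_2] == 1536
assert MODEL_DIMENSIONS[VoyageModel.VOYAGE_3] == 1024

# Names missing from the mapping fall back to 1536, the same default
# VectorConfig.get_embedding_dimensions() uses in config.py.
print(MODEL_DIMENSIONS.get("voyage-code-3", 1536))  # 1536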