mcp-code-indexer 4.2.15__py3-none-any.whl → 4.2.16__py3-none-any.whl
This diff compares the contents of two package versions as publicly released to one of the supported registries. It is provided for informational purposes only and reflects the changes exactly as they appear in the public registry.
- mcp_code_indexer/database/database.py +251 -85
- mcp_code_indexer/database/models.py +66 -24
- mcp_code_indexer/database/retry_executor.py +15 -5
- mcp_code_indexer/file_scanner.py +107 -12
- mcp_code_indexer/main.py +43 -30
- mcp_code_indexer/server/mcp_server.py +191 -1
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
- mcp_code_indexer/vector_mode/config.py +113 -45
- mcp_code_indexer/vector_mode/const.py +24 -0
- mcp_code_indexer/vector_mode/daemon.py +860 -98
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
- mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
- mcp_code_indexer/vector_mode/services/__init__.py +9 -0
- mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
- mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
- mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
- mcp_code_indexer/vector_mode/types.py +46 -0
- mcp_code_indexer/vector_mode/utils.py +50 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
--- a/mcp_code_indexer/vector_mode/chunking/ast_chunker.py
+++ b/mcp_code_indexer/vector_mode/chunking/ast_chunker.py
@@ -19,14 +19,16 @@ from ...database.models import ChunkType
 
 logger = logging.getLogger(__name__)
 
+
 @dataclass
 class CodeChunk:
     """
     Represents a code chunk ready for embedding generation.
-
+
     This is the final output of the chunking process, optimized and
     ready for vector indexing.
     """
+
     content: str
     chunk_type: ChunkType
     name: Optional[str]
@@ -39,16 +41,18 @@ class CodeChunk:
     metadata: Dict[str, Any] = None
     imports: List[str] = None
     parent_context: Optional[str] = None
-
+
     def __post_init__(self):
         if self.metadata is None:
             self.metadata = {}
         if self.imports is None:
             self.imports = []
 
+
 @dataclass
 class ChunkingStats:
     """Statistics about the chunking process."""
+
     files_processed: int = 0
     total_chunks: int = 0
     chunks_by_type: Dict[ChunkType, int] = None
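
The `None`-plus-`__post_init__` pattern in the dataclasses above exists because mutable literals like `{}` and `[]` cannot safely be used as dataclass defaults. A minimal standalone illustration of the same idea using the stdlib's `field(default_factory=...)` alternative (hypothetical example, not code from this package):

from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class ChunkDefaults:
    # Equivalent effect to the None + __post_init__ pattern above:
    # default_factory builds a fresh dict/list for every instance.
    metadata: Dict[str, object] = field(default_factory=dict)
    imports: List[str] = field(default_factory=list)


a, b = ChunkDefaults(), ChunkDefaults()
a.imports.append("os")
assert b.imports == []  # instances do not share state
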
@@ -56,21 +60,22 @@ class ChunkingStats:
     redacted_chunks: int = 0
     fallback_chunks: int = 0
     processing_time: float = 0.0
-
+
     def __post_init__(self):
         if self.chunks_by_type is None:
             self.chunks_by_type = {}
         if self.chunks_by_language is None:
             self.chunks_by_language = {}
 
+
 class ASTChunker:
     """
     Main AST-based code chunker.
-
+
     Orchestrates the entire chunking process from file content to
     optimized code chunks ready for embedding generation.
     """
-
+
     def __init__(
         self,
         max_chunk_size: int = 1500,
@@ -81,7 +86,7 @@ class ASTChunker:
     ):
         """
         Initialize AST chunker.
-
+
         Args:
             max_chunk_size: Maximum characters per chunk
             min_chunk_size: Minimum characters per chunk
@@ -93,7 +98,7 @@ class ASTChunker:
         self.min_chunk_size = min_chunk_size
         self.enable_redaction = enable_redaction
         self.enable_optimization = enable_optimization
-
+
         # Initialize components
         self.redactor: Optional[SecretRedactor] = None
         if enable_redaction:
@@ -101,111 +106,110 @@ class ASTChunker:
                 min_confidence=redaction_confidence,
                 preserve_structure=True,
             )
-
+
         self.optimizer: Optional[ChunkOptimizer] = None
         if enable_optimization:
            self.optimizer = ChunkOptimizer(
                 max_chunk_size=max_chunk_size,
                 min_chunk_size=min_chunk_size,
             )
-
+
         # Statistics
         self.stats = ChunkingStats()
-
+
         # Cache for performance
         self.handler_cache: Dict[str, Any] = {}
-
-    def chunk_file(self, file_path: str, content: Optional[str] = None) -> List[CodeChunk]:
+
+    def chunk_file(
+        self, file_path: str, content: Optional[str] = None
+    ) -> List[CodeChunk]:
         """
         Chunk a single file into semantic code chunks.
-
+
         Args:
             file_path: Path to the file to chunk
             content: Optional file content (if not provided, will read from file)
-
+
         Returns:
             List of code chunks
         """
         start_time = datetime.utcnow()
-
+
         try:
             # Read content if not provided
             if content is None:
-                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                     content = f.read()
-
+
             # Skip empty files
             if not content.strip():
                 logger.debug(f"Skipping empty file: {file_path}")
                 return []
-
+
             # Get language handler
             handler = self._get_language_handler(file_path)
             if not handler:
                 logger.warning(f"No handler available for {file_path}")
                 return []
-
+
             # Parse into semantic chunks
             logger.debug(f"Parsing {file_path} with {handler.language_name} handler")
             parsed_chunks = handler.parse_code(content, file_path)
-
+
             # Convert to code chunks
             code_chunks = []
             for parsed_chunk in parsed_chunks:
                 code_chunk = self._convert_parsed_chunk(parsed_chunk, file_path)
                 if code_chunk:
                     code_chunks.append(code_chunk)
-
+
             # Apply redaction if enabled
             if self.enable_redaction and self.redactor:
                 code_chunks = self._apply_redaction(code_chunks, file_path)
-
+
             # Apply optimization if enabled
             if self.enable_optimization and self.optimizer:
                 code_chunks = self._apply_optimization(code_chunks)
-
+
             # Update statistics
             processing_time = (datetime.utcnow() - start_time).total_seconds()
             self._update_stats(code_chunks, handler.language_name, processing_time)
-
+
             logger.info(f"Chunked {file_path}: {len(code_chunks)} chunks")
             return code_chunks
-
+
         except Exception as e:
             logger.error(f"Failed to chunk file {file_path}: {e}")
             return []
-
+
     def chunk_content(
-        self,
-        content: str,
-        file_path: str,
-        language: Optional[str] = None
+        self, content: str, file_path: str, language: Optional[str] = None
     ) -> List[CodeChunk]:
         """
         Chunk content directly without reading from file.
-
+
         Args:
             content: Source code content
             file_path: Virtual file path for language detection
             language: Optional language override
-
+
         Returns:
             List of code chunks
         """
         return self.chunk_file(file_path, content)
-
+
     def chunk_multiple_files(self, file_paths: List[str]) -> Dict[str, List[CodeChunk]]:
         """
         Chunk multiple files and return results grouped by file.
-
+
         Args:
             file_paths: List of file paths to chunk
-
+
         Returns:
             Dictionary mapping file paths to their chunks
         """
         results = {}
-
+
         for file_path in file_paths:
             try:
                 chunks = self.chunk_file(file_path)
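
As the hunk above shows, `chunk_content` delegates straight to `chunk_file`, and its `language` parameter is accepted but never forwarded, so handler selection still comes from the virtual path's extension. A minimal sketch of the two equivalent calls, assuming default constructor arguments (snippet and path are hypothetical):

from mcp_code_indexer.vector_mode.chunking.ast_chunker import ASTChunker

chunker = ASTChunker()
snippet = "def add(a, b):\n    return a + b\n"

# In-memory chunking; "snippet.py" only drives language detection.
chunks = chunker.chunk_content(snippet, file_path="snippet.py")

# Equivalent, since chunk_content returns self.chunk_file(file_path, content):
same = chunker.chunk_file("snippet.py", content=snippet)
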
@@ -213,28 +217,30 @@ class ASTChunker:
             except Exception as e:
                 logger.error(f"Failed to chunk {file_path}: {e}")
                 results[file_path] = []
-
+
         return results
-
+
     def _get_language_handler(self, file_path: str) -> Optional[Any]:
         """Get language handler for file, with caching."""
         extension = Path(file_path).suffix.lower()
-
+
         if extension in self.handler_cache:
             return self.handler_cache[extension]
-
+
         handler = get_language_handler(file_path)
         self.handler_cache[extension] = handler
         return handler
-
-    def _convert_parsed_chunk(self, parsed_chunk: ParsedChunk, file_path: str) -> Optional[CodeChunk]:
+
+    def _convert_parsed_chunk(
+        self, parsed_chunk: ParsedChunk, file_path: str
+    ) -> Optional[CodeChunk]:
         """Convert a parsed chunk to a code chunk."""
         if not parsed_chunk.content.strip():
             return None
-
+
         # Generate content hash
-        content_hash = hashlib.sha256(parsed_chunk.content.encode('utf-8')).hexdigest()
-
+        content_hash = hashlib.sha256(parsed_chunk.content.encode("utf-8")).hexdigest()
+
         # Create code chunk
         code_chunk = CodeChunk(
             content=parsed_chunk.content,
@@ -249,42 +255,48 @@ class ASTChunker:
             imports=parsed_chunk.imports.copy() if parsed_chunk.imports else [],
             parent_context=parsed_chunk.parent_context,
         )
-
+
         return code_chunk
-
-    def _apply_redaction(self, chunks: List[CodeChunk], file_path: str) -> List[CodeChunk]:
+
+    def _apply_redaction(
+        self, chunks: List[CodeChunk], file_path: str
+    ) -> List[CodeChunk]:
         """Apply secret redaction to chunks."""
         redacted_chunks = []
-
+
         for chunk in chunks:
             try:
                 redaction_result = self.redactor.redact_content(
                     content=chunk.content,
                     file_path=file_path,
                 )
-
+
                 if redaction_result.was_redacted:
                     # Update chunk with redacted content
                     chunk.content = redaction_result.redacted_content
                     chunk.redacted = True
                     chunk.metadata["redaction_count"] = redaction_result.redaction_count
-                    chunk.metadata["redacted_patterns"] = redaction_result.patterns_matched
-
+                    chunk.metadata["redacted_patterns"] = (
+                        redaction_result.patterns_matched
+                    )
+
                     # Recompute hash for redacted content
                     chunk.content_hash = hashlib.sha256(
-                        chunk.content.encode('utf-8')
+                        chunk.content.encode("utf-8")
                     ).hexdigest()
-
-                    logger.debug(
-                        f"Redacted {redaction_result.redaction_count} secrets from chunk {chunk.name}")
+
+                    logger.debug(
+                        f"Redacted {redaction_result.redaction_count} secrets from chunk {chunk.name}"
+                    )
+
                 redacted_chunks.append(chunk)
-
+
             except Exception as e:
                 logger.warning(f"Failed to redact chunk {chunk.name}: {e}")
                 redacted_chunks.append(chunk)
-
+
         return redacted_chunks
-
+
     def _apply_optimization(self, chunks: List[CodeChunk]) -> List[CodeChunk]:
         """Apply chunk optimization."""
         try:
@@ -301,12 +313,13 @@ class ASTChunker:
                     language=chunk.language,
                     imports=chunk.imports,
                     parent_context=chunk.parent_context,
+                    redacted=chunk.redacted,
                 )
                 optimized_chunks.append(opt_chunk)
-
+
             # Apply optimization
             optimized_chunks = self.optimizer.optimize_chunks(optimized_chunks)
-
+
             # Convert back to code chunks
             result_chunks = []
             for opt_chunk in optimized_chunks:
@@ -317,74 +330,80 @@ class ASTChunker:
                     file_path=chunks[0].file_path if chunks else "",
                     start_line=opt_chunk.start_line,
                     end_line=opt_chunk.end_line,
-                    content_hash=hashlib.sha256(opt_chunk.content.encode('utf-8')).hexdigest(),
+                    content_hash=hashlib.sha256(
+                        opt_chunk.content.encode("utf-8")
+                    ).hexdigest(),
                     language=opt_chunk.language,
+                    redacted=opt_chunk.redacted,
                     metadata=opt_chunk.metadata,
                     imports=opt_chunk.imports,
                     parent_context=opt_chunk.parent_context,
                 )
                 result_chunks.append(code_chunk)
-
+
             return result_chunks
-
+
         except Exception as e:
             logger.warning(f"Chunk optimization failed: {e}")
             return chunks
-
-    def _update_stats(self, chunks: List[CodeChunk], language: str, processing_time: float) -> None:
+
+    def _update_stats(
+        self, chunks: List[CodeChunk], language: str, processing_time: float
+    ) -> None:
         """Update chunking statistics."""
         self.stats.files_processed += 1
         self.stats.total_chunks += len(chunks)
         self.stats.processing_time += processing_time
-
+
         # Count by type
         for chunk in chunks:
             self.stats.chunks_by_type[chunk.chunk_type] = (
                 self.stats.chunks_by_type.get(chunk.chunk_type, 0) + 1
             )
-
+
             if chunk.redacted:
                 self.stats.redacted_chunks += 1
-
+
             if chunk.metadata.get("fallback", False):
                 self.stats.fallback_chunks += 1
-
+
         # Count by language
-        self.stats.chunks_by_language[language] = (
-            self.stats.chunks_by_language.get(language, 0) + len(chunks)
-        )
-
+        self.stats.chunks_by_language[language] = self.stats.chunks_by_language.get(
+            language, 0
+        ) + len(chunks)
+
     def get_stats(self) -> ChunkingStats:
         """Get chunking statistics."""
         return self.stats
-
+
     def reset_stats(self) -> None:
         """Reset chunking statistics."""
         self.stats = ChunkingStats()
-
+
     def get_supported_extensions(self) -> Set[str]:
         """Get list of supported file extensions."""
         from .language_handlers import LANGUAGE_HANDLERS
+
         return set(LANGUAGE_HANDLERS.keys())
-
+
     def is_supported_file(self, file_path: str) -> bool:
         """Check if a file is supported for chunking."""
         extension = Path(file_path).suffix.lower()
         return extension in self.get_supported_extensions()
-
+
     def estimate_chunks(self, file_path: str) -> Dict[str, Any]:
         """Estimate number of chunks for a file without full processing."""
         try:
-            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
+            with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                 content = f.read()
-
+
             # Simple estimation based on content length and average chunk size
             content_length = len(content)
-            lines = content.count('\n') + 1
-
+            lines = content.count("\n") + 1
+
             # Rough estimates
             estimated_chunks = max(1, content_length // self.max_chunk_size)
-
+
             return {
                 "file_path": file_path,
                 "content_length": content_length,
@@ -392,7 +411,7 @@ class ASTChunker:
                 "estimated_chunks": estimated_chunks,
                 "is_supported": self.is_supported_file(file_path),
             }
-
+
         except Exception as e:
             logger.warning(f"Failed to estimate chunks for {file_path}: {e}")
             return {
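
For orientation, a usage sketch assembled only from the signatures visible in this diff; the argument values and the constructor defaults other than max_chunk_size=1500 are assumptions, not documented API:

from mcp_code_indexer.vector_mode.chunking.ast_chunker import ASTChunker

# Sketch under assumptions from the diff above; behavior is unverified here.
chunker = ASTChunker(
    max_chunk_size=1500,
    enable_redaction=True,      # secrets are masked and hashes recomputed
    enable_optimization=True,   # chunks pass through ChunkOptimizer
)

if chunker.is_supported_file("example.py"):
    print(chunker.estimate_chunks("example.py"))  # cheap pre-flight estimate
    for chunk in chunker.chunk_file("example.py"):
        print(chunk.chunk_type, chunk.name, chunk.content_hash[:12])

stats = chunker.get_stats()
print(stats.files_processed, stats.total_chunks, stats.redacted_chunks)
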