mcp-vector-search 0.12.6__py3-none-any.whl → 1.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. mcp_vector_search/__init__.py +3 -3
  2. mcp_vector_search/analysis/__init__.py +111 -0
  3. mcp_vector_search/analysis/baseline/__init__.py +68 -0
  4. mcp_vector_search/analysis/baseline/comparator.py +462 -0
  5. mcp_vector_search/analysis/baseline/manager.py +621 -0
  6. mcp_vector_search/analysis/collectors/__init__.py +74 -0
  7. mcp_vector_search/analysis/collectors/base.py +164 -0
  8. mcp_vector_search/analysis/collectors/cohesion.py +463 -0
  9. mcp_vector_search/analysis/collectors/complexity.py +743 -0
  10. mcp_vector_search/analysis/collectors/coupling.py +1162 -0
  11. mcp_vector_search/analysis/collectors/halstead.py +514 -0
  12. mcp_vector_search/analysis/collectors/smells.py +325 -0
  13. mcp_vector_search/analysis/debt.py +516 -0
  14. mcp_vector_search/analysis/interpretation.py +685 -0
  15. mcp_vector_search/analysis/metrics.py +414 -0
  16. mcp_vector_search/analysis/reporters/__init__.py +7 -0
  17. mcp_vector_search/analysis/reporters/console.py +646 -0
  18. mcp_vector_search/analysis/reporters/markdown.py +480 -0
  19. mcp_vector_search/analysis/reporters/sarif.py +377 -0
  20. mcp_vector_search/analysis/storage/__init__.py +93 -0
  21. mcp_vector_search/analysis/storage/metrics_store.py +762 -0
  22. mcp_vector_search/analysis/storage/schema.py +245 -0
  23. mcp_vector_search/analysis/storage/trend_tracker.py +560 -0
  24. mcp_vector_search/analysis/trends.py +308 -0
  25. mcp_vector_search/analysis/visualizer/__init__.py +90 -0
  26. mcp_vector_search/analysis/visualizer/d3_data.py +534 -0
  27. mcp_vector_search/analysis/visualizer/exporter.py +484 -0
  28. mcp_vector_search/analysis/visualizer/html_report.py +2895 -0
  29. mcp_vector_search/analysis/visualizer/schemas.py +525 -0
  30. mcp_vector_search/cli/commands/analyze.py +1062 -0
  31. mcp_vector_search/cli/commands/chat.py +1455 -0
  32. mcp_vector_search/cli/commands/index.py +621 -5
  33. mcp_vector_search/cli/commands/index_background.py +467 -0
  34. mcp_vector_search/cli/commands/init.py +13 -0
  35. mcp_vector_search/cli/commands/install.py +597 -335
  36. mcp_vector_search/cli/commands/install_old.py +8 -4
  37. mcp_vector_search/cli/commands/mcp.py +78 -6
  38. mcp_vector_search/cli/commands/reset.py +68 -26
  39. mcp_vector_search/cli/commands/search.py +224 -8
  40. mcp_vector_search/cli/commands/setup.py +1184 -0
  41. mcp_vector_search/cli/commands/status.py +339 -5
  42. mcp_vector_search/cli/commands/uninstall.py +276 -357
  43. mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
  44. mcp_vector_search/cli/commands/visualize/cli.py +292 -0
  45. mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
  46. mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
  47. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +33 -0
  48. mcp_vector_search/cli/commands/visualize/graph_builder.py +647 -0
  49. mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
  50. mcp_vector_search/cli/commands/visualize/server.py +600 -0
  51. mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
  52. mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
  53. mcp_vector_search/cli/commands/visualize/templates/base.py +234 -0
  54. mcp_vector_search/cli/commands/visualize/templates/scripts.py +4542 -0
  55. mcp_vector_search/cli/commands/visualize/templates/styles.py +2522 -0
  56. mcp_vector_search/cli/didyoumean.py +27 -2
  57. mcp_vector_search/cli/main.py +127 -160
  58. mcp_vector_search/cli/output.py +158 -13
  59. mcp_vector_search/config/__init__.py +4 -0
  60. mcp_vector_search/config/default_thresholds.yaml +52 -0
  61. mcp_vector_search/config/settings.py +12 -0
  62. mcp_vector_search/config/thresholds.py +273 -0
  63. mcp_vector_search/core/__init__.py +16 -0
  64. mcp_vector_search/core/auto_indexer.py +3 -3
  65. mcp_vector_search/core/boilerplate.py +186 -0
  66. mcp_vector_search/core/config_utils.py +394 -0
  67. mcp_vector_search/core/database.py +406 -94
  68. mcp_vector_search/core/embeddings.py +24 -0
  69. mcp_vector_search/core/exceptions.py +11 -0
  70. mcp_vector_search/core/git.py +380 -0
  71. mcp_vector_search/core/git_hooks.py +4 -4
  72. mcp_vector_search/core/indexer.py +632 -54
  73. mcp_vector_search/core/llm_client.py +756 -0
  74. mcp_vector_search/core/models.py +91 -1
  75. mcp_vector_search/core/project.py +17 -0
  76. mcp_vector_search/core/relationships.py +473 -0
  77. mcp_vector_search/core/scheduler.py +11 -11
  78. mcp_vector_search/core/search.py +179 -29
  79. mcp_vector_search/mcp/server.py +819 -9
  80. mcp_vector_search/parsers/python.py +285 -5
  81. mcp_vector_search/utils/__init__.py +2 -0
  82. mcp_vector_search/utils/gitignore.py +0 -3
  83. mcp_vector_search/utils/gitignore_updater.py +212 -0
  84. mcp_vector_search/utils/monorepo.py +66 -4
  85. mcp_vector_search/utils/timing.py +10 -6
  86. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/METADATA +184 -53
  87. mcp_vector_search-1.1.22.dist-info/RECORD +120 -0
  88. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/WHEEL +1 -1
  89. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/entry_points.txt +1 -0
  90. mcp_vector_search/cli/commands/visualize.py +0 -1467
  91. mcp_vector_search-0.12.6.dist-info/RECORD +0 -68
  92. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/indexer.py
@@ -2,14 +2,20 @@
 
 import asyncio
 import json
+import multiprocessing
 import os
+from concurrent.futures import ProcessPoolExecutor
 from datetime import UTC, datetime
 from pathlib import Path
+from typing import Any
 
 from loguru import logger
 from packaging import version
 
 from .. import __version__
+from ..analysis.collectors.base import MetricCollector
+from ..analysis.metrics import ChunkMetrics
+from ..analysis.trends import TrendTracker
 from ..config.defaults import ALLOWED_DOTFILES, DEFAULT_IGNORE_PATTERNS
 from ..config.settings import ProjectConfig
 from ..parsers.registry import get_parser_registry
@@ -19,6 +25,81 @@ from .database import VectorDatabase
 from .directory_index import DirectoryIndex
 from .exceptions import ParsingError
 from .models import CodeChunk, IndexStats
+from .relationships import RelationshipStore
+
+# Extension to language mapping for metric collection
+EXTENSION_TO_LANGUAGE = {
+    ".py": "python",
+    ".js": "javascript",
+    ".ts": "typescript",
+    ".jsx": "javascript",
+    ".tsx": "typescript",
+    ".java": "java",
+    ".rs": "rust",
+    ".php": "php",
+    ".rb": "ruby",
+}
+
+
+def _parse_file_standalone(
+    args: tuple[Path, str | None],
+) -> tuple[Path, list[CodeChunk], Exception | None]:
+    """Parse a single file - standalone function for multiprocessing.
+
+    This function must be at module level (not a method) to be picklable for
+    multiprocessing. It creates its own parser registry to avoid serialization issues.
+
+    Args:
+        args: Tuple of (file_path, subproject_info_json)
+            - file_path: Path to the file to parse
+            - subproject_info_json: JSON string with subproject info or None
+
+    Returns:
+        Tuple of (file_path, chunks, error)
+            - file_path: The file path that was parsed
+            - chunks: List of parsed CodeChunk objects (empty if error)
+            - error: Exception if parsing failed, None if successful
+    """
+    file_path, subproject_info_json = args
+
+    try:
+        # Create parser registry in this process
+        parser_registry = get_parser_registry()
+
+        # Get appropriate parser
+        parser = parser_registry.get_parser_for_file(file_path)
+
+        # Parse file synchronously (tree-sitter is synchronous anyway)
+        # We need to use the synchronous version of parse_file
+        # Since parsers may have async methods, we'll read and parse directly
+        import asyncio
+
+        # Create event loop for this process if needed
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        # Run the async parse_file in this process's event loop
+        chunks = loop.run_until_complete(parser.parse_file(file_path))
+
+        # Filter out empty chunks
+        valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]
+
+        # Apply subproject information if available
+        if subproject_info_json:
+            subproject_info = json.loads(subproject_info_json)
+            for chunk in valid_chunks:
+                chunk.subproject_name = subproject_info.get("name")
+                chunk.subproject_path = subproject_info.get("relative_path")
+
+        return (file_path, valid_chunks, None)
+
+    except Exception as e:
+        # Return error instead of raising to avoid process crashes
+        logger.error(f"Failed to parse file {file_path} in worker process: {e}")
+        return (file_path, [], e)
 
 
 class SemanticIndexer:
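Editor's note: the docstring above explains that the parser entry point must live at module scope so it can be pickled for worker processes. A minimal standalone sketch of that constraint, using a toy work function rather than the package's parser:

# Why a module-level function: only top-level callables pickle cleanly, so they
# are what ProcessPoolExecutor can ship to worker processes.
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path


def shout(path: Path) -> str:  # module-level, therefore picklable
    return path.name.upper()


if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as pool:
        print(list(pool.map(shout, [Path("a.py"), Path("b.py")])))
    # prints: ['A.PY', 'B.PY']

A lambda or an instance method defined inside SemanticIndexer would fail to pickle under the default spawn/fork start methods, which is the serialization issue the docstring refers to.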
@@ -33,6 +114,8 @@ class SemanticIndexer:
         max_workers: int | None = None,
         batch_size: int = 10,
         debug: bool = False,
+        collectors: list[MetricCollector] | None = None,
+        use_multiprocessing: bool = True,
     ) -> None:
         """Initialize semantic indexer.
 
@@ -41,9 +124,11 @@ class SemanticIndexer:
             project_root: Project root directory
             file_extensions: File extensions to index (deprecated, use config)
             config: Project configuration (preferred over file_extensions)
-            max_workers: Maximum number of worker threads for parallel processing
+            max_workers: Maximum number of worker processes for parallel parsing (ignored if use_multiprocessing=False)
             batch_size: Number of files to process in each batch
             debug: Enable debug output for hierarchy building
+            collectors: Metric collectors to run during indexing (defaults to all complexity collectors)
+            use_multiprocessing: Enable multiprocess parallel parsing (default: True, disable for debugging)
         """
         self.database = database
         self.project_root = project_root
@@ -63,13 +148,23 @@ class SemanticIndexer:
         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
         self.debug = debug
 
-        # Safely get event loop for max_workers
-        try:
-            loop = asyncio.get_event_loop()
-            self.max_workers = max_workers or min(4, (loop.get_debug() and 1) or 4)
-        except RuntimeError:
-            # No event loop in current thread
-            self.max_workers = max_workers or 4
+        # Initialize metric collectors
+        self.collectors = (
+            collectors if collectors is not None else self._default_collectors()
+        )
+
+        # Configure multiprocessing for parallel parsing
+        self.use_multiprocessing = use_multiprocessing
+        if use_multiprocessing:
+            # Use 75% of CPU cores for parsing, but cap at 8 to avoid overhead
+            cpu_count = multiprocessing.cpu_count()
+            self.max_workers = max_workers or min(max(1, int(cpu_count * 0.75)), 8)
+            logger.debug(
+                f"Multiprocessing enabled with {self.max_workers} workers (CPU count: {cpu_count})"
+            )
+        else:
+            self.max_workers = 1
+            logger.debug("Multiprocessing disabled (single-threaded mode)")
 
         self.batch_size = batch_size
         self._index_metadata_file = (
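Editor's note: the worker-count default added here is a simple clamp: 75% of the CPU count, at least 1, at most 8. A quick standalone check of the arithmetic (not tied to the package):

import multiprocessing


def default_workers(cpu_count: int) -> int:
    # 75% of cores, floored, clamped to the range [1, 8]
    return min(max(1, int(cpu_count * 0.75)), 8)


for cores in (1, 2, 4, 8, 16, 32):
    print(cores, "->", default_workers(cores))
# 1 -> 1, 2 -> 1, 4 -> 3, 8 -> 6, 16 -> 8, 32 -> 8
print("this machine:", default_workers(multiprocessing.cpu_count()))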
@@ -110,16 +205,162 @@ class SemanticIndexer:
         # Load existing directory index
         self.directory_index.load()
 
+        # Initialize relationship store for pre-computing visualization relationships
+        self.relationship_store = RelationshipStore(project_root)
+
+        # Initialize trend tracker for historical metrics
+        self.trend_tracker = TrendTracker(project_root)
+
+    def _default_collectors(self) -> list[MetricCollector]:
+        """Return default set of metric collectors.
+
+        Returns:
+            List of all complexity collectors (cognitive, cyclomatic, nesting, parameters, methods)
+        """
+        from ..analysis.collectors.complexity import (
+            CognitiveComplexityCollector,
+            CyclomaticComplexityCollector,
+            MethodCountCollector,
+            NestingDepthCollector,
+            ParameterCountCollector,
+        )
+
+        return [
+            CognitiveComplexityCollector(),
+            CyclomaticComplexityCollector(),
+            NestingDepthCollector(),
+            ParameterCountCollector(),
+            MethodCountCollector(),
+        ]
+
+    def _collect_metrics(
+        self, chunk: CodeChunk, source_code: bytes, language: str
+    ) -> ChunkMetrics | None:
+        """Collect metrics for a code chunk.
+
+        This is a simplified version that estimates metrics from chunk content
+        without full TreeSitter traversal. Future implementation will use
+        TreeSitter node traversal for accurate metric collection.
+
+        Args:
+            chunk: The parsed code chunk
+            source_code: Raw source code bytes
+            language: Programming language identifier
+
+        Returns:
+            ChunkMetrics for the chunk, or None if no metrics collected
+        """
+        # For now, create basic metrics from chunk content
+        # TODO: Implement full TreeSitter traversal in Phase 2
+        lines_of_code = chunk.line_count
+
+        # Estimate complexity from simple heuristics
+        content = chunk.content
+        cognitive_complexity = self._estimate_cognitive_complexity(content)
+        cyclomatic_complexity = self._estimate_cyclomatic_complexity(content)
+        max_nesting_depth = self._estimate_nesting_depth(content)
+        parameter_count = len(chunk.parameters) if chunk.parameters else 0
+
+        metrics = ChunkMetrics(
+            cognitive_complexity=cognitive_complexity,
+            cyclomatic_complexity=cyclomatic_complexity,
+            max_nesting_depth=max_nesting_depth,
+            parameter_count=parameter_count,
+            lines_of_code=lines_of_code,
+        )
+
+        return metrics
+
+    def _estimate_cognitive_complexity(self, content: str) -> int:
+        """Estimate cognitive complexity from content (simplified heuristic).
+
+        Args:
+            content: Code content
+
+        Returns:
+            Estimated cognitive complexity score
+        """
+        # Simple heuristic: count control flow keywords
+        keywords = [
+            "if",
+            "elif",
+            "else",
+            "for",
+            "while",
+            "try",
+            "except",
+            "case",
+            "when",
+        ]
+        complexity = 0
+        for keyword in keywords:
+            complexity += content.count(f" {keyword} ")
+            complexity += content.count(f"\t{keyword} ")
+            complexity += content.count(f"\n{keyword} ")
+        return complexity
+
+    def _estimate_cyclomatic_complexity(self, content: str) -> int:
+        """Estimate cyclomatic complexity from content (simplified heuristic).
+
+        Args:
+            content: Code content
+
+        Returns:
+            Estimated cyclomatic complexity score (minimum 1)
+        """
+        # Start with baseline of 1
+        complexity = 1
+
+        # Count decision points
+        keywords = [
+            "if",
+            "elif",
+            "for",
+            "while",
+            "case",
+            "when",
+            "&&",
+            "||",
+            "and",
+            "or",
+        ]
+        for keyword in keywords:
+            complexity += content.count(keyword)
+
+        return complexity
+
+    def _estimate_nesting_depth(self, content: str) -> int:
+        """Estimate maximum nesting depth from indentation (simplified heuristic).
+
+        Args:
+            content: Code content
+
+        Returns:
+            Estimated maximum nesting depth
+        """
+        max_depth = 0
+        for line in content.split("\n"):
+            # Count leading whitespace (4 spaces or 1 tab = 1 level)
+            leading = len(line) - len(line.lstrip())
+            if "\t" in line[:leading]:
+                depth = line[:leading].count("\t")
+            else:
+                depth = leading // 4
+            max_depth = max(max_depth, depth)
+        return max_depth
+
     async def index_project(
         self,
         force_reindex: bool = False,
         show_progress: bool = True,
+        skip_relationships: bool = False,
     ) -> int:
         """Index all files in the project.
 
         Args:
             force_reindex: Whether to reindex existing files
             show_progress: Whether to show progress information
+            skip_relationships: Skip computing relationships for visualization (faster, but visualize will be slower)
 
         Returns:
             Number of files indexed
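Editor's note: the estimators above are substring-counting heuristics, not real parsers. A self-contained rerun of the same counting logic on a tiny snippet shows what they produce (this reproduces the heuristic from the diff, not the collectors in mcp_vector_search.analysis):

SAMPLE = """def f(x):
    for i in range(x):
        if i % 2 and i > 1:
            print(i)
"""


def cyclomatic(content: str) -> int:
    # Baseline 1 plus raw substring counts of decision keywords/operators.
    complexity = 1
    for kw in ["if", "elif", "for", "while", "case", "when", "&&", "||", "and", "or"]:
        complexity += content.count(kw)
    return complexity


def nesting_depth(content: str) -> int:
    # Four leading spaces (or one tab) count as one nesting level.
    max_depth = 0
    for line in content.split("\n"):
        leading = len(line) - len(line.lstrip())
        depth = line[:leading].count("\t") if "\t" in line[:leading] else leading // 4
        max_depth = max(max_depth, depth)
    return max_depth


print(cyclomatic(SAMPLE), nesting_depth(SAMPLE))  # prints: 5 3

Note the overcounting inherent in substring matching: "for" also matches the "or" decision keyword, so the sample scores 5 rather than the 4 a real cyclomatic count would give. The docstrings in the diff flag this as a stopgap until TreeSitter traversal lands.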
@@ -222,12 +463,134 @@ class SemanticIndexer:
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )
 
+        # Mark relationships for background computation (unless skipped)
+        # Default behavior: skip blocking computation, mark for background processing
+        if not skip_relationships and indexed_count > 0:
+            try:
+                logger.info("Marking relationships for background computation...")
+                # Get all chunks from database for relationship computation
+                all_chunks = await self.database.get_all_chunks()
+
+                if len(all_chunks) > 0:
+                    # Mark for background computation (non-blocking)
+                    await self.relationship_store.compute_and_store(
+                        all_chunks, self.database, background=True
+                    )
+                    logger.info("✓ Relationships marked for background computation")
+                    logger.info(
+                        " Use 'mcp-vector-search index relationships' to compute now or wait for background task"
+                    )
+                else:
+                    logger.warning("No chunks found for relationship computation")
+            except Exception as e:
+                logger.warning(f"Failed to mark relationships: {e}")
+                logger.debug("Visualization will compute relationships on demand")
+
+        # Save trend snapshot after successful indexing
+        if indexed_count > 0:
+            try:
+                logger.info("Saving metrics snapshot for trend tracking...")
+                # Get database stats
+                stats = await self.database.get_stats()
+                # Get all chunks for detailed metrics
+                all_chunks = await self.database.get_all_chunks()
+                # Compute metrics from stats and chunks
+                metrics = self.trend_tracker.compute_metrics_from_stats(
+                    stats.to_dict(), all_chunks
+                )
+                # Save snapshot (updates today's entry if exists)
+                self.trend_tracker.save_snapshot(metrics)
+                logger.info(
+                    f"✓ Saved trend snapshot: {metrics['total_files']} files, "
+                    f"{metrics['total_chunks']} chunks, health score {metrics['health_score']}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to save trend snapshot: {e}")
+
         return indexed_count
 
+    async def _parse_and_prepare_file(
+        self, file_path: Path, force_reindex: bool = False
+    ) -> tuple[list[CodeChunk], dict[str, Any] | None]:
+        """Parse file and prepare chunks with metrics (no database insertion).
+
+        This method extracts the parsing and metric collection logic from index_file()
+        to enable batch processing across multiple files.
+
+        Args:
+            file_path: Path to the file to parse
+            force_reindex: Whether to force reindexing (always deletes existing chunks)
+
+        Returns:
+            Tuple of (chunks_with_hierarchy, chunk_metrics)
+
+        Raises:
+            ParsingError: If file parsing fails
+        """
+        # Check if file should be indexed
+        if not self._should_index_file(file_path):
+            return ([], None)
+
+        # Always remove existing chunks when reindexing a file
+        # This prevents duplicate chunks and ensures consistency
+        await self.database.delete_by_file(file_path)
+
+        # Parse file into chunks
+        chunks = await self._parse_file(file_path)
+
+        if not chunks:
+            logger.debug(f"No chunks extracted from {file_path}")
+            return ([], None)
+
+        # Build hierarchical relationships between chunks
+        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+        # Debug: Check if hierarchy was built
+        methods_with_parents = sum(
+            1
+            for c in chunks_with_hierarchy
+            if c.chunk_type in ("method", "function") and c.parent_chunk_id
+        )
+        logger.debug(
+            f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
+        )
+
+        # Collect metrics for chunks (if collectors are enabled)
+        chunk_metrics: dict[str, Any] | None = None
+        if self.collectors:
+            try:
+                # Read source code
+                source_code = file_path.read_bytes()
+
+                # Detect language from file extension
+                language = EXTENSION_TO_LANGUAGE.get(
+                    file_path.suffix.lower(), "unknown"
+                )
+
+                # Collect metrics for each chunk
+                chunk_metrics = {}
+                for chunk in chunks_with_hierarchy:
+                    metrics = self._collect_metrics(chunk, source_code, language)
+                    if metrics:
+                        chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+
+                logger.debug(
+                    f"Collected metrics for {len(chunk_metrics)} chunks from {file_path}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to collect metrics for {file_path}: {e}")
+                chunk_metrics = None
+
+        return (chunks_with_hierarchy, chunk_metrics)
+
     async def _process_file_batch(
         self, file_paths: list[Path], force_reindex: bool = False
     ) -> list[bool]:
-        """Process a batch of files in parallel.
+        """Process a batch of files and accumulate chunks for batch embedding.
+
+        This method processes multiple files in parallel (using multiprocessing for
+        CPU-bound parsing) and then performs a single database insertion for all chunks,
+        enabling efficient batch embedding generation.
 
         Args:
             file_paths: List of file paths to process
@@ -236,26 +599,166 @@ class SemanticIndexer:
         Returns:
             List of success flags for each file
         """
-        # Create tasks for parallel processing
-        tasks = []
+        all_chunks: list[CodeChunk] = []
+        all_metrics: dict[str, Any] = {}
+        file_to_chunks_map: dict[str, tuple[int, int]] = {}
+        success_flags: list[bool] = []
+
+        # Filter files that should be indexed and delete old chunks
+        files_to_parse = []
         for file_path in file_paths:
-            task = asyncio.create_task(self._index_file_safe(file_path, force_reindex))
-            tasks.append(task)
+            if not self._should_index_file(file_path):
+                success_flags.append(True)  # Skipped file is not an error
+                continue
+            # Delete old chunks before parsing
+            await self.database.delete_by_file(file_path)
+            files_to_parse.append(file_path)
 
-        # Wait for all tasks to complete
-        results = await asyncio.gather(*tasks, return_exceptions=True)
+        if not files_to_parse:
+            return success_flags
 
-        # Convert results to success flags
-        success_flags = []
-        for i, result in enumerate(results):
-            if isinstance(result, Exception):
-                logger.error(f"Failed to index {file_paths[i]}: {result}")
+        # Parse files using multiprocessing if enabled
+        if self.use_multiprocessing and len(files_to_parse) > 1:
+            # Use ProcessPoolExecutor for CPU-bound parsing
+            parse_results = await self._parse_files_multiprocess(files_to_parse)
+        else:
+            # Fall back to async processing (for single file or disabled multiprocessing)
+            parse_results = await self._parse_files_async(files_to_parse)
+
+        # Accumulate chunks from all successfully parsed files
+        metadata = self._load_index_metadata()
+        for file_path, chunks, error in parse_results:
+            if error:
+                logger.error(f"Failed to parse {file_path}: {error}")
                 success_flags.append(False)
+                continue
+
+            if chunks:
+                # Build hierarchy and collect metrics for parsed chunks
+                chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+                # Collect metrics if enabled
+                chunk_metrics = None
+                if self.collectors:
+                    try:
+                        source_code = file_path.read_bytes()
+                        language = EXTENSION_TO_LANGUAGE.get(
+                            file_path.suffix.lower(), "unknown"
+                        )
+                        chunk_metrics = {}
+                        for chunk in chunks_with_hierarchy:
+                            metrics = self._collect_metrics(
+                                chunk, source_code, language
+                            )
+                            if metrics:
+                                chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+                    except Exception as e:
+                        logger.warning(
+                            f"Failed to collect metrics for {file_path}: {e}"
+                        )
+
+                # Accumulate chunks
+                start_idx = len(all_chunks)
+                all_chunks.extend(chunks_with_hierarchy)
+                end_idx = len(all_chunks)
+                file_to_chunks_map[str(file_path)] = (start_idx, end_idx)
+
+                # Merge metrics
+                if chunk_metrics:
+                    all_metrics.update(chunk_metrics)
+
+                # Update metadata for successfully parsed file
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
             else:
-                success_flags.append(result)
+                # Empty file is not an error
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
+
+        # Single database insertion for entire batch
+        if all_chunks:
+            logger.info(
+                f"Batch inserting {len(all_chunks)} chunks from {len(file_paths)} files"
+            )
+            try:
+                await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                logger.debug(
+                    f"Successfully indexed {len(all_chunks)} chunks from {sum(success_flags)} files"
+                )
+            except Exception as e:
+                logger.error(f"Failed to insert batch of chunks: {e}")
+                # Mark all files in this batch as failed
+                return [False] * len(file_paths)
+
+        # Save updated metadata after successful batch
+        self._save_index_metadata(metadata)
 
         return success_flags
 
+    async def _parse_files_multiprocess(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using multiprocessing for CPU-bound parallelism.
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        # Prepare arguments for worker processes
+        parse_args = []
+        for file_path in file_paths:
+            # Get subproject info if available
+            subproject = self.monorepo_detector.get_subproject_for_file(file_path)
+            subproject_info_json = None
+            if subproject:
+                subproject_info_json = json.dumps(
+                    {
+                        "name": subproject.name,
+                        "relative_path": subproject.relative_path,
+                    }
+                )
+            parse_args.append((file_path, subproject_info_json))
+
+        # Limit workers to avoid overhead
+        max_workers = min(self.max_workers, len(file_paths))
+
+        # Run parsing in ProcessPoolExecutor
+        loop = asyncio.get_running_loop()
+        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks and wait for results
+            results = await loop.run_in_executor(
+                None, lambda: list(executor.map(_parse_file_standalone, parse_args))
+            )
+
+        logger.debug(
+            f"Multiprocess parsing completed: {len(results)} files parsed with {max_workers} workers"
+        )
+        return results
+
+    async def _parse_files_async(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using async (fallback for single file or disabled multiprocessing).
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        results = []
+        for file_path in file_paths:
+            try:
+                chunks = await self._parse_file(file_path)
+                results.append((file_path, chunks, None))
+            except Exception as e:
+                logger.error(f"Failed to parse {file_path}: {e}")
+                results.append((file_path, [], e))
+
+        return results
+
     def _load_index_metadata(self) -> dict[str, float]:
         """Load file modification times from metadata file.
 
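Editor's note: _parse_files_multiprocess keeps the event loop responsive by handing the blocking executor.map call to the loop's default thread-pool executor. The pattern can be reproduced in isolation with a toy work function (a minimal sketch, not the package's parser):

import asyncio
from concurrent.futures import ProcessPoolExecutor


def square(n: int) -> int:  # module-level, so it is picklable for worker processes
    return n * n


async def main() -> None:
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor(max_workers=2) as executor:
        # executor.map blocks, so push it onto the default thread pool (None)
        results = await loop.run_in_executor(
            None, lambda: list(executor.map(square, range(5)))
        )
    print(results)  # [0, 1, 4, 9, 16]


if __name__ == "__main__":
    asyncio.run(main())

The thread hop costs little here because the real work happens in the child processes; the thread only waits for them.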
@@ -379,8 +882,34 @@ class SemanticIndexer:
             f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
         )
 
-        # Add chunks to database
-        await self.database.add_chunks(chunks_with_hierarchy)
+        # Collect metrics for chunks (if collectors are enabled)
+        chunk_metrics: dict[str, Any] | None = None
+        if self.collectors:
+            try:
+                # Read source code
+                source_code = file_path.read_bytes()
+
+                # Detect language from file extension
+                language = EXTENSION_TO_LANGUAGE.get(
+                    file_path.suffix.lower(), "unknown"
+                )
+
+                # Collect metrics for each chunk
+                chunk_metrics = {}
+                for chunk in chunks_with_hierarchy:
+                    metrics = self._collect_metrics(chunk, source_code, language)
+                    if metrics:
+                        chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+
+                logger.debug(
+                    f"Collected metrics for {len(chunk_metrics)} chunks from {file_path}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to collect metrics for {file_path}: {e}")
+                chunk_metrics = None
+
+        # Add chunks to database with metrics
+        await self.database.add_chunks(chunks_with_hierarchy, metrics=chunk_metrics)
 
         # Update metadata after successful indexing
         metadata = self._load_index_metadata()
@@ -572,8 +1101,10 @@ class SemanticIndexer:
         # Get relative path from project root for checking
         relative_path = file_path.relative_to(self.project_root)
 
-        # 1. Check dotfile filtering (if enabled in config)
-        if self.config and self.config.skip_dotfiles:
+        # 1. Check dotfile filtering (ENABLED BY DEFAULT)
+        # Skip dotfiles unless config explicitly disables it
+        skip_dotfiles = self.config.skip_dotfiles if self.config else True
+        if skip_dotfiles:
             for part in relative_path.parts:
                 # Skip dotfiles unless they're in the whitelist
                 if part.startswith(".") and part not in ALLOWED_DOTFILES:
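Editor's note: the behavioral change here is that dotfile filtering now defaults to on when no config object is present. The check walks every path component, so a dotted directory anywhere in the relative path excludes the file. A standalone illustration with a made-up whitelist (not the package's ALLOWED_DOTFILES):

from pathlib import Path

ALLOWED = {".github"}  # hypothetical whitelist for this sketch


def skipped(relative_path: Path) -> bool:
    return any(
        part.startswith(".") and part not in ALLOWED
        for part in relative_path.parts
    )


print(skipped(Path("src/app.py")))        # False
print(skipped(Path(".venv/lib/mod.py")))  # True  (dotted directory component)
print(skipped(Path(".github/ci.yml")))    # False (whitelisted component)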
@@ -807,6 +1338,9 @@ class SemanticIndexer:
     ):
         """Index files and yield progress updates for each file.
 
+        This method processes files in batches and accumulates chunks across files
+        before performing a single database insertion per batch for better performance.
+
         Args:
             files_to_index: List of file paths to index
             force_reindex: Whether to force reindexing
@@ -817,42 +1351,84 @@ class SemanticIndexer:
         # Write version header to error log at start of indexing run
         self._write_indexing_run_header()
 
-        metadata = self._load_index_metadata()
-
-        # Process files in batches for better memory management
+        # Process files in batches for better memory management and embedding efficiency
         for i in range(0, len(files_to_index), self.batch_size):
             batch = files_to_index[i : i + self.batch_size]
 
-            # Process each file in the batch
+            # Accumulate chunks from all files in batch
+            all_chunks: list[CodeChunk] = []
+            all_metrics: dict[str, Any] = {}
+            file_to_chunks_map: dict[str, tuple[int, int]] = {}
+            file_results: dict[Path, tuple[int, bool]] = {}
+
+            # Parse all files in parallel
+            tasks = []
             for file_path in batch:
-                chunks_added = 0
-                success = False
+                task = asyncio.create_task(
+                    self._parse_and_prepare_file(file_path, force_reindex)
+                )
+                tasks.append(task)
 
-                try:
-                    # Always remove existing chunks when reindexing
-                    await self.database.delete_by_file(file_path)
+            parse_results = await asyncio.gather(*tasks, return_exceptions=True)
 
-                    # Parse file into chunks
-                    chunks = await self._parse_file(file_path)
+            # Accumulate chunks from successfully parsed files
+            metadata = self._load_index_metadata()
+            for file_path, result in zip(batch, parse_results, strict=True):
+                if isinstance(result, Exception):
+                    error_msg = f"Failed to index file {file_path}: {type(result).__name__}: {str(result)}"
+                    logger.error(error_msg)
+                    file_results[file_path] = (0, False)
 
-                    if chunks:
-                        # Build hierarchical relationships
-                        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+                    # Save error to error log file
+                    try:
+                        error_log_path = (
+                            self.project_root
+                            / ".mcp-vector-search"
+                            / "indexing_errors.log"
+                        )
+                        with open(error_log_path, "a", encoding="utf-8") as f:
+                            timestamp = datetime.now().isoformat()
+                            f.write(f"[{timestamp}] {error_msg}\n")
+                    except Exception as log_err:
+                        logger.debug(f"Failed to write error log: {log_err}")
+                    continue
 
-                        # Add chunks to database
-                        await self.database.add_chunks(chunks_with_hierarchy)
-                        chunks_added = len(chunks)
-                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+                chunks, metrics = result
+                if chunks:
+                    start_idx = len(all_chunks)
+                    all_chunks.extend(chunks)
+                    end_idx = len(all_chunks)
+                    file_to_chunks_map[str(file_path)] = (start_idx, end_idx)
 
-                    success = True
+                    # Merge metrics
+                    if metrics:
+                        all_metrics.update(metrics)
 
-                    # Update metadata after successful indexing
+                    # Update metadata for successfully parsed file
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (len(chunks), True)
+                    logger.debug(f"Prepared {len(chunks)} chunks from {file_path}")
+                else:
+                    # Empty file is not an error
                     metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (0, True)
 
+            # Single database insertion for entire batch
+            if all_chunks:
+                logger.info(
+                    f"Batch inserting {len(all_chunks)} chunks from {len(batch)} files"
+                )
+                try:
+                    await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                    logger.debug(
+                        f"Successfully indexed {len(all_chunks)} chunks from batch"
+                    )
                 except Exception as e:
-                    error_msg = f"Failed to index file {file_path}: {type(e).__name__}: {str(e)}"
+                    error_msg = f"Failed to insert batch of chunks: {e}"
                     logger.error(error_msg)
-                    success = False
+                    # Mark all files with chunks in this batch as failed
+                    for file_path in file_to_chunks_map.keys():
+                        file_results[Path(file_path)] = (0, False)
 
                     # Save error to error log file
                     try:
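Editor's note: the per-batch flow above leans on asyncio.gather(..., return_exceptions=True), which returns either a result or the exception object for each task instead of aborting the whole batch on the first failure. A self-contained sketch of that pattern with a toy coroutine (not the indexer's _parse_and_prepare_file):

import asyncio


async def parse(name: str) -> list[str]:  # stand-in for the per-file parse step
    if name == "bad.py":
        raise ValueError("syntax error")
    return [f"{name}:chunk0"]


async def main() -> None:
    files = ["a.py", "bad.py", "b.py"]
    results = await asyncio.gather(*(parse(f) for f in files), return_exceptions=True)
    for f, result in zip(files, results, strict=True):
        if isinstance(result, Exception):
            print(f, "failed:", result)  # record the failure, keep going
        else:
            print(f, "->", result)


asyncio.run(main())

Because gather preserves input order, zipping the batch against the results (with strict=True, as the diff does) keeps each file paired with its own outcome.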
@@ -862,18 +1438,18 @@ class SemanticIndexer:
                             / ".mcp-vector-search"
                             / "indexing_errors.log"
                         )
-                            from datetime import datetime
-
                             timestamp = datetime.now().isoformat()
                             f.write(f"[{timestamp}] {error_msg}\n")
                     except Exception as log_err:
                         logger.debug(f"Failed to write error log: {log_err}")
 
-                # Yield progress update
-                yield (file_path, chunks_added, success)
+            # Save metadata after batch
+            self._save_index_metadata(metadata)
 
-        # Save metadata at the end
-        self._save_index_metadata(metadata)
+            # Yield progress updates for each file in batch
+            for file_path in batch:
+                chunks_added, success = file_results.get(file_path, (0, False))
+                yield (file_path, chunks_added, success)
 
 
     def _build_chunk_hierarchy(self, chunks: list[CodeChunk]) -> list[CodeChunk]:
@@ -895,7 +1471,9 @@ class SemanticIndexer:
             return chunks
 
         # Group chunks by type and name
-        module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
+        # Only actual module chunks (not imports) serve as parents for top-level code
+        # imports chunks should remain siblings of classes/functions, not parents
+        module_chunks = [c for c in chunks if c.chunk_type == "module"]
         class_chunks = [
             c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
         ]