mcp-vector-search 1.0.3__py3-none-any.whl → 1.1.22__py3-none-any.whl

This diff shows the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (63)
  1. mcp_vector_search/__init__.py +3 -3
  2. mcp_vector_search/analysis/__init__.py +48 -1
  3. mcp_vector_search/analysis/baseline/__init__.py +68 -0
  4. mcp_vector_search/analysis/baseline/comparator.py +462 -0
  5. mcp_vector_search/analysis/baseline/manager.py +621 -0
  6. mcp_vector_search/analysis/collectors/__init__.py +35 -0
  7. mcp_vector_search/analysis/collectors/cohesion.py +463 -0
  8. mcp_vector_search/analysis/collectors/coupling.py +1162 -0
  9. mcp_vector_search/analysis/collectors/halstead.py +514 -0
  10. mcp_vector_search/analysis/collectors/smells.py +325 -0
  11. mcp_vector_search/analysis/debt.py +516 -0
  12. mcp_vector_search/analysis/interpretation.py +685 -0
  13. mcp_vector_search/analysis/metrics.py +74 -1
  14. mcp_vector_search/analysis/reporters/__init__.py +3 -1
  15. mcp_vector_search/analysis/reporters/console.py +424 -0
  16. mcp_vector_search/analysis/reporters/markdown.py +480 -0
  17. mcp_vector_search/analysis/reporters/sarif.py +377 -0
  18. mcp_vector_search/analysis/storage/__init__.py +93 -0
  19. mcp_vector_search/analysis/storage/metrics_store.py +762 -0
  20. mcp_vector_search/analysis/storage/schema.py +245 -0
  21. mcp_vector_search/analysis/storage/trend_tracker.py +560 -0
  22. mcp_vector_search/analysis/trends.py +308 -0
  23. mcp_vector_search/analysis/visualizer/__init__.py +90 -0
  24. mcp_vector_search/analysis/visualizer/d3_data.py +534 -0
  25. mcp_vector_search/analysis/visualizer/exporter.py +484 -0
  26. mcp_vector_search/analysis/visualizer/html_report.py +2895 -0
  27. mcp_vector_search/analysis/visualizer/schemas.py +525 -0
  28. mcp_vector_search/cli/commands/analyze.py +665 -11
  29. mcp_vector_search/cli/commands/chat.py +193 -0
  30. mcp_vector_search/cli/commands/index.py +600 -2
  31. mcp_vector_search/cli/commands/index_background.py +467 -0
  32. mcp_vector_search/cli/commands/search.py +194 -1
  33. mcp_vector_search/cli/commands/setup.py +64 -13
  34. mcp_vector_search/cli/commands/status.py +302 -3
  35. mcp_vector_search/cli/commands/visualize/cli.py +26 -10
  36. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +8 -4
  37. mcp_vector_search/cli/commands/visualize/graph_builder.py +167 -234
  38. mcp_vector_search/cli/commands/visualize/server.py +304 -15
  39. mcp_vector_search/cli/commands/visualize/templates/base.py +60 -6
  40. mcp_vector_search/cli/commands/visualize/templates/scripts.py +2100 -65
  41. mcp_vector_search/cli/commands/visualize/templates/styles.py +1297 -88
  42. mcp_vector_search/cli/didyoumean.py +5 -0
  43. mcp_vector_search/cli/main.py +16 -5
  44. mcp_vector_search/cli/output.py +134 -5
  45. mcp_vector_search/config/thresholds.py +89 -1
  46. mcp_vector_search/core/__init__.py +16 -0
  47. mcp_vector_search/core/database.py +39 -2
  48. mcp_vector_search/core/embeddings.py +24 -0
  49. mcp_vector_search/core/git.py +380 -0
  50. mcp_vector_search/core/indexer.py +445 -84
  51. mcp_vector_search/core/llm_client.py +9 -4
  52. mcp_vector_search/core/models.py +88 -1
  53. mcp_vector_search/core/relationships.py +473 -0
  54. mcp_vector_search/core/search.py +1 -1
  55. mcp_vector_search/mcp/server.py +795 -4
  56. mcp_vector_search/parsers/python.py +285 -5
  57. mcp_vector_search/utils/gitignore.py +0 -3
  58. {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/METADATA +3 -2
  59. {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/RECORD +62 -39
  60. mcp_vector_search/cli/commands/visualize.py.original +0 -2536
  61. {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/WHEEL +0 -0
  62. {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/entry_points.txt +0 -0
  63. {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/licenses/LICENSE +0 -0
@@ -2,7 +2,9 @@
 
 import asyncio
 import json
+import multiprocessing
 import os
+from concurrent.futures import ProcessPoolExecutor
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
@@ -13,6 +15,7 @@ from packaging import version
 from .. import __version__
 from ..analysis.collectors.base import MetricCollector
 from ..analysis.metrics import ChunkMetrics
+from ..analysis.trends import TrendTracker
 from ..config.defaults import ALLOWED_DOTFILES, DEFAULT_IGNORE_PATTERNS
 from ..config.settings import ProjectConfig
 from ..parsers.registry import get_parser_registry
@@ -22,6 +25,7 @@ from .database import VectorDatabase
 from .directory_index import DirectoryIndex
 from .exceptions import ParsingError
 from .models import CodeChunk, IndexStats
+from .relationships import RelationshipStore

 # Extension to language mapping for metric collection
 EXTENSION_TO_LANGUAGE = {
@@ -37,6 +41,67 @@ EXTENSION_TO_LANGUAGE = {
 }


+def _parse_file_standalone(
+    args: tuple[Path, str | None],
+) -> tuple[Path, list[CodeChunk], Exception | None]:
+    """Parse a single file - standalone function for multiprocessing.
+
+    This function must be at module level (not a method) to be picklable for
+    multiprocessing. It creates its own parser registry to avoid serialization issues.
+
+    Args:
+        args: Tuple of (file_path, subproject_info_json)
+            - file_path: Path to the file to parse
+            - subproject_info_json: JSON string with subproject info or None
+
+    Returns:
+        Tuple of (file_path, chunks, error)
+            - file_path: The file path that was parsed
+            - chunks: List of parsed CodeChunk objects (empty if error)
+            - error: Exception if parsing failed, None if successful
+    """
+    file_path, subproject_info_json = args
+
+    try:
+        # Create parser registry in this process
+        parser_registry = get_parser_registry()
+
+        # Get appropriate parser
+        parser = parser_registry.get_parser_for_file(file_path)
+
+        # Parse file synchronously (tree-sitter is synchronous anyway)
+        # We need to use the synchronous version of parse_file
+        # Since parsers may have async methods, we'll read and parse directly
+        import asyncio
+
+        # Create event loop for this process if needed
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        # Run the async parse_file in this process's event loop
+        chunks = loop.run_until_complete(parser.parse_file(file_path))
+
+        # Filter out empty chunks
+        valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]
+
+        # Apply subproject information if available
+        if subproject_info_json:
+            subproject_info = json.loads(subproject_info_json)
+            for chunk in valid_chunks:
+                chunk.subproject_name = subproject_info.get("name")
+                chunk.subproject_path = subproject_info.get("relative_path")
+
+        return (file_path, valid_chunks, None)
+
+    except Exception as e:
+        # Return error instead of raising to avoid process crashes
+        logger.error(f"Failed to parse file {file_path} in worker process: {e}")
+        return (file_path, [], e)
+
+
 class SemanticIndexer:
     """Semantic indexer for parsing and indexing code files."""

@@ -50,6 +115,7 @@ class SemanticIndexer:
         batch_size: int = 10,
         debug: bool = False,
         collectors: list[MetricCollector] | None = None,
+        use_multiprocessing: bool = True,
     ) -> None:
         """Initialize semantic indexer.

@@ -58,10 +124,11 @@
             project_root: Project root directory
             file_extensions: File extensions to index (deprecated, use config)
             config: Project configuration (preferred over file_extensions)
-            max_workers: Maximum number of worker threads for parallel processing
+            max_workers: Maximum number of worker processes for parallel parsing (ignored if use_multiprocessing=False)
             batch_size: Number of files to process in each batch
             debug: Enable debug output for hierarchy building
             collectors: Metric collectors to run during indexing (defaults to all complexity collectors)
+            use_multiprocessing: Enable multiprocess parallel parsing (default: True, disable for debugging)
         """
         self.database = database
         self.project_root = project_root
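The `use_multiprocessing` path depends on `_parse_file_standalone` (added above) being a plain module-level function: `ProcessPoolExecutor` must pickle the callable and its `(file_path, subproject_info_json)` tuple, so each worker rebuilds its own parser registry and event loop instead of trying to inherit them from the parent process. A minimal sketch of that pattern, with a hypothetical `parse_one` worker standing in for the real function:

import asyncio
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path


def parse_one(path_str: str) -> tuple[str, int]:
    # Hypothetical worker: defined at module level so the process pool can pickle it.
    # Each process builds its own event loop, mirroring the per-process setup above.
    loop = asyncio.new_event_loop()
    try:
        # Stand-in for an async parser call such as parser.parse_file(path).
        text = loop.run_until_complete(asyncio.to_thread(Path(path_str).read_text))
        return (path_str, len(text.splitlines()))
    finally:
        loop.close()


if __name__ == "__main__":
    files = ["pyproject.toml", "README.md"]  # placeholder inputs
    with ProcessPoolExecutor(max_workers=2) as pool:
        for path, line_count in pool.map(parse_one, files):
            print(path, line_count)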
@@ -86,13 +153,18 @@ class SemanticIndexer:
             collectors if collectors is not None else self._default_collectors()
         )

-        # Safely get event loop for max_workers
-        try:
-            loop = asyncio.get_event_loop()
-            self.max_workers = max_workers or min(4, (loop.get_debug() and 1) or 4)
-        except RuntimeError:
-            # No event loop in current thread
-            self.max_workers = max_workers or 4
+        # Configure multiprocessing for parallel parsing
+        self.use_multiprocessing = use_multiprocessing
+        if use_multiprocessing:
+            # Use 75% of CPU cores for parsing, but cap at 8 to avoid overhead
+            cpu_count = multiprocessing.cpu_count()
+            self.max_workers = max_workers or min(max(1, int(cpu_count * 0.75)), 8)
+            logger.debug(
+                f"Multiprocessing enabled with {self.max_workers} workers (CPU count: {cpu_count})"
+            )
+        else:
+            self.max_workers = 1
+            logger.debug("Multiprocessing disabled (single-threaded mode)")

         self.batch_size = batch_size
         self._index_metadata_file = (
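The constructor now sizes the worker pool from the host rather than from the event loop: roughly 75% of the CPU cores, never fewer than 1 and never more than 8. A small sketch of that heuristic in isolation:

import multiprocessing


def default_parse_workers(requested: int | None = None, cap: int = 8) -> int:
    # Mirrors the heuristic above: 75% of cores, floor of 1, ceiling of `cap`.
    cpu_count = multiprocessing.cpu_count()
    return requested or min(max(1, int(cpu_count * 0.75)), cap)

On a 4-core machine this yields 3 workers; on a 32-core machine the cap keeps it at 8; a non-zero `max_workers` passed by the caller overrides the heuristic.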
@@ -133,6 +205,12 @@
         # Load existing directory index
         self.directory_index.load()

+        # Initialize relationship store for pre-computing visualization relationships
+        self.relationship_store = RelationshipStore(project_root)
+
+        # Initialize trend tracker for historical metrics
+        self.trend_tracker = TrendTracker(project_root)
+
     def _default_collectors(self) -> list[MetricCollector]:
         """Return default set of metric collectors.

@@ -275,12 +353,14 @@
         self,
         force_reindex: bool = False,
         show_progress: bool = True,
+        skip_relationships: bool = False,
     ) -> int:
         """Index all files in the project.

         Args:
             force_reindex: Whether to reindex existing files
             show_progress: Whether to show progress information
+            skip_relationships: Skip computing relationships for visualization (faster, but visualize will be slower)

         Returns:
             Number of files indexed
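A usage sketch of the new flag; the enclosing `async def` line sits outside this hunk, so the method name `index_all` below is an assumption based on the docstring:

from mcp_vector_search.core.indexer import SemanticIndexer


async def fast_reindex(indexer: SemanticIndexer) -> int:
    # Default call: after indexing, relationships are marked for background computation.
    # Passing skip_relationships=True (assumed method name: index_all) skips that step,
    # trading a faster index run for a slower first visualization.
    return await indexer.index_all(force_reindex=True, skip_relationships=True)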
@@ -383,12 +463,134 @@
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )

+        # Mark relationships for background computation (unless skipped)
+        # Default behavior: skip blocking computation, mark for background processing
+        if not skip_relationships and indexed_count > 0:
+            try:
+                logger.info("Marking relationships for background computation...")
+                # Get all chunks from database for relationship computation
+                all_chunks = await self.database.get_all_chunks()
+
+                if len(all_chunks) > 0:
+                    # Mark for background computation (non-blocking)
+                    await self.relationship_store.compute_and_store(
+                        all_chunks, self.database, background=True
+                    )
+                    logger.info("✓ Relationships marked for background computation")
+                    logger.info(
+                        " Use 'mcp-vector-search index relationships' to compute now or wait for background task"
+                    )
+                else:
+                    logger.warning("No chunks found for relationship computation")
+            except Exception as e:
+                logger.warning(f"Failed to mark relationships: {e}")
+                logger.debug("Visualization will compute relationships on demand")
+
+        # Save trend snapshot after successful indexing
+        if indexed_count > 0:
+            try:
+                logger.info("Saving metrics snapshot for trend tracking...")
+                # Get database stats
+                stats = await self.database.get_stats()
+                # Get all chunks for detailed metrics
+                all_chunks = await self.database.get_all_chunks()
+                # Compute metrics from stats and chunks
+                metrics = self.trend_tracker.compute_metrics_from_stats(
+                    stats.to_dict(), all_chunks
+                )
+                # Save snapshot (updates today's entry if exists)
+                self.trend_tracker.save_snapshot(metrics)
+                logger.info(
+                    f"✓ Saved trend snapshot: {metrics['total_files']} files, "
+                    f"{metrics['total_chunks']} chunks, health score {metrics['health_score']}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to save trend snapshot: {e}")
+
         return indexed_count

+    async def _parse_and_prepare_file(
+        self, file_path: Path, force_reindex: bool = False
+    ) -> tuple[list[CodeChunk], dict[str, Any] | None]:
+        """Parse file and prepare chunks with metrics (no database insertion).
+
+        This method extracts the parsing and metric collection logic from index_file()
+        to enable batch processing across multiple files.
+
+        Args:
+            file_path: Path to the file to parse
+            force_reindex: Whether to force reindexing (always deletes existing chunks)
+
+        Returns:
+            Tuple of (chunks_with_hierarchy, chunk_metrics)
+
+        Raises:
+            ParsingError: If file parsing fails
+        """
+        # Check if file should be indexed
+        if not self._should_index_file(file_path):
+            return ([], None)
+
+        # Always remove existing chunks when reindexing a file
+        # This prevents duplicate chunks and ensures consistency
+        await self.database.delete_by_file(file_path)
+
+        # Parse file into chunks
+        chunks = await self._parse_file(file_path)
+
+        if not chunks:
+            logger.debug(f"No chunks extracted from {file_path}")
+            return ([], None)
+
+        # Build hierarchical relationships between chunks
+        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+        # Debug: Check if hierarchy was built
+        methods_with_parents = sum(
+            1
+            for c in chunks_with_hierarchy
+            if c.chunk_type in ("method", "function") and c.parent_chunk_id
+        )
+        logger.debug(
+            f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
+        )
+
+        # Collect metrics for chunks (if collectors are enabled)
+        chunk_metrics: dict[str, Any] | None = None
+        if self.collectors:
+            try:
+                # Read source code
+                source_code = file_path.read_bytes()
+
+                # Detect language from file extension
+                language = EXTENSION_TO_LANGUAGE.get(
+                    file_path.suffix.lower(), "unknown"
+                )
+
+                # Collect metrics for each chunk
+                chunk_metrics = {}
+                for chunk in chunks_with_hierarchy:
+                    metrics = self._collect_metrics(chunk, source_code, language)
+                    if metrics:
+                        chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+
+                logger.debug(
+                    f"Collected metrics for {len(chunk_metrics)} chunks from {file_path}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to collect metrics for {file_path}: {e}")
+                chunk_metrics = None
+
+        return (chunks_with_hierarchy, chunk_metrics)
+
     async def _process_file_batch(
         self, file_paths: list[Path], force_reindex: bool = False
     ) -> list[bool]:
-        """Process a batch of files in parallel.
+        """Process a batch of files and accumulate chunks for batch embedding.
+
+        This method processes multiple files in parallel (using multiprocessing for
+        CPU-bound parsing) and then performs a single database insertion for all chunks,
+        enabling efficient batch embedding generation.

         Args:
             file_paths: List of file paths to process
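The rewritten docstring states the motivation for the batch path: parse many files first, pool every chunk, then call `database.add_chunks(...)` once so embeddings are generated for the whole batch rather than file by file. A schematic sketch of that accumulate-then-insert shape (the `ChunkDB` protocol below is a stand-in, not the package's real database class):

from pathlib import Path
from typing import Any, Protocol


class ChunkDB(Protocol):
    # Stand-in for the real vector database interface used above.
    async def add_chunks(
        self, chunks: list[Any], metrics: dict[str, Any] | None = None
    ) -> None: ...


async def insert_batch(db: ChunkDB, parsed: dict[Path, list[Any]]) -> int:
    # Accumulate chunks from every parsed file, then insert once so the embedding
    # backend sees one large batch instead of many tiny per-file batches.
    all_chunks: list[Any] = []
    for _path, chunks in parsed.items():
        all_chunks.extend(chunks)
    if all_chunks:
        await db.add_chunks(all_chunks, metrics=None)
    return len(all_chunks)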
@@ -397,26 +599,166 @@
         Returns:
             List of success flags for each file
         """
-        # Create tasks for parallel processing
-        tasks = []
+        all_chunks: list[CodeChunk] = []
+        all_metrics: dict[str, Any] = {}
+        file_to_chunks_map: dict[str, tuple[int, int]] = {}
+        success_flags: list[bool] = []
+
+        # Filter files that should be indexed and delete old chunks
+        files_to_parse = []
         for file_path in file_paths:
-            task = asyncio.create_task(self._index_file_safe(file_path, force_reindex))
-            tasks.append(task)
+            if not self._should_index_file(file_path):
+                success_flags.append(True)  # Skipped file is not an error
+                continue
+            # Delete old chunks before parsing
+            await self.database.delete_by_file(file_path)
+            files_to_parse.append(file_path)

-        # Wait for all tasks to complete
-        results = await asyncio.gather(*tasks, return_exceptions=True)
+        if not files_to_parse:
+            return success_flags
+
+        # Parse files using multiprocessing if enabled
+        if self.use_multiprocessing and len(files_to_parse) > 1:
+            # Use ProcessPoolExecutor for CPU-bound parsing
+            parse_results = await self._parse_files_multiprocess(files_to_parse)
+        else:
+            # Fall back to async processing (for single file or disabled multiprocessing)
+            parse_results = await self._parse_files_async(files_to_parse)

-        # Convert results to success flags
-        success_flags = []
-        for i, result in enumerate(results):
-            if isinstance(result, Exception):
-                logger.error(f"Failed to index {file_paths[i]}: {result}")
+        # Accumulate chunks from all successfully parsed files
+        metadata = self._load_index_metadata()
+        for file_path, chunks, error in parse_results:
+            if error:
+                logger.error(f"Failed to parse {file_path}: {error}")
                 success_flags.append(False)
+                continue
+
+            if chunks:
+                # Build hierarchy and collect metrics for parsed chunks
+                chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+                # Collect metrics if enabled
+                chunk_metrics = None
+                if self.collectors:
+                    try:
+                        source_code = file_path.read_bytes()
+                        language = EXTENSION_TO_LANGUAGE.get(
+                            file_path.suffix.lower(), "unknown"
+                        )
+                        chunk_metrics = {}
+                        for chunk in chunks_with_hierarchy:
+                            metrics = self._collect_metrics(
+                                chunk, source_code, language
+                            )
+                            if metrics:
+                                chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+                    except Exception as e:
+                        logger.warning(
+                            f"Failed to collect metrics for {file_path}: {e}"
+                        )
+
+                # Accumulate chunks
+                start_idx = len(all_chunks)
+                all_chunks.extend(chunks_with_hierarchy)
+                end_idx = len(all_chunks)
+                file_to_chunks_map[str(file_path)] = (start_idx, end_idx)
+
+                # Merge metrics
+                if chunk_metrics:
+                    all_metrics.update(chunk_metrics)
+
+                # Update metadata for successfully parsed file
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
             else:
-                success_flags.append(result)
+                # Empty file is not an error
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
+
+        # Single database insertion for entire batch
+        if all_chunks:
+            logger.info(
+                f"Batch inserting {len(all_chunks)} chunks from {len(file_paths)} files"
+            )
+            try:
+                await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                logger.debug(
+                    f"Successfully indexed {len(all_chunks)} chunks from {sum(success_flags)} files"
+                )
+            except Exception as e:
+                logger.error(f"Failed to insert batch of chunks: {e}")
+                # Mark all files in this batch as failed
+                return [False] * len(file_paths)
+
+        # Save updated metadata after successful batch
+        self._save_index_metadata(metadata)

         return success_flags

+    async def _parse_files_multiprocess(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using multiprocessing for CPU-bound parallelism.
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        # Prepare arguments for worker processes
+        parse_args = []
+        for file_path in file_paths:
+            # Get subproject info if available
+            subproject = self.monorepo_detector.get_subproject_for_file(file_path)
+            subproject_info_json = None
+            if subproject:
+                subproject_info_json = json.dumps(
+                    {
+                        "name": subproject.name,
+                        "relative_path": subproject.relative_path,
+                    }
+                )
+            parse_args.append((file_path, subproject_info_json))
+
+        # Limit workers to avoid overhead
+        max_workers = min(self.max_workers, len(file_paths))
+
+        # Run parsing in ProcessPoolExecutor
+        loop = asyncio.get_running_loop()
+        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks and wait for results
+            results = await loop.run_in_executor(
+                None, lambda: list(executor.map(_parse_file_standalone, parse_args))
+            )
+
+        logger.debug(
+            f"Multiprocess parsing completed: {len(results)} files parsed with {max_workers} workers"
+        )
+        return results
+
+    async def _parse_files_async(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using async (fallback for single file or disabled multiprocessing).
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        results = []
+        for file_path in file_paths:
+            try:
+                chunks = await self._parse_file(file_path)
+                results.append((file_path, chunks, None))
+            except Exception as e:
+                logger.error(f"Failed to parse {file_path}: {e}")
+                results.append((file_path, [], e))
+
+        return results
+
     def _load_index_metadata(self) -> dict[str, float]:
         """Load file modification times from metadata file.

@@ -759,8 +1101,10 @@
         # Get relative path from project root for checking
         relative_path = file_path.relative_to(self.project_root)

-        # 1. Check dotfile filtering (if enabled in config)
-        if self.config and self.config.skip_dotfiles:
+        # 1. Check dotfile filtering (ENABLED BY DEFAULT)
+        # Skip dotfiles unless config explicitly disables it
+        skip_dotfiles = self.config.skip_dotfiles if self.config else True
+        if skip_dotfiles:
             for part in relative_path.parts:
                 # Skip dotfiles unless they're in the whitelist
                 if part.startswith(".") and part not in ALLOWED_DOTFILES:
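The change above flips dotfile filtering to opt-out: with no config at all, dotted path components are skipped unless whitelisted. A condensed sketch of the rule, with a placeholder whitelist standing in for the real `ALLOWED_DOTFILES`:

from pathlib import Path

ALLOWED_DOTFILES = {".github"}  # placeholder; the real whitelist lives in config.defaults


def hidden_by_dotfile_rule(relative_path: Path, config=None) -> bool:
    # Enabled by default: only an explicit config with skip_dotfiles=False disables it.
    skip_dotfiles = config.skip_dotfiles if config else True
    if not skip_dotfiles:
        return False
    return any(
        part.startswith(".") and part not in ALLOWED_DOTFILES
        for part in relative_path.parts
    )


print(hidden_by_dotfile_rule(Path(".venv/lib/site.py")))         # True  -> skipped
print(hidden_by_dotfile_rule(Path(".github/workflows/ci.yml")))  # False -> indexed
print(hidden_by_dotfile_rule(Path("src/app.py")))                # False -> indexed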
@@ -994,6 +1338,9 @@
     ):
         """Index files and yield progress updates for each file.

+        This method processes files in batches and accumulates chunks across files
+        before performing a single database insertion per batch for better performance.
+
         Args:
             files_to_index: List of file paths to index
             force_reindex: Whether to force reindexing
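`_parse_files_multiprocess`, added above, keeps the event loop responsive by handing the blocking `executor.map(...)` call to the default thread pool via `loop.run_in_executor`, so the coroutine awaits a single future while worker processes do the CPU-bound parsing. A stripped-down sketch of the same pattern with a hypothetical `work` function:

import asyncio
from concurrent.futures import ProcessPoolExecutor


def work(n: int) -> int:
    # Hypothetical CPU-bound job; must live at module level to be picklable.
    return sum(i * i for i in range(n))


async def run_batch(jobs: list[int], max_workers: int = 4) -> list[int]:
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # executor.map(...) blocks, so run it in the default thread pool and await it;
        # the event loop stays free for other tasks while the processes work.
        return await loop.run_in_executor(
            None, lambda: list(executor.map(work, jobs))
        )


if __name__ == "__main__":
    print(asyncio.run(run_batch([10_000, 20_000, 30_000])))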
@@ -1004,72 +1351,84 @@
         # Write version header to error log at start of indexing run
         self._write_indexing_run_header()

-        metadata = self._load_index_metadata()
-
-        # Process files in batches for better memory management
+        # Process files in batches for better memory management and embedding efficiency
         for i in range(0, len(files_to_index), self.batch_size):
             batch = files_to_index[i : i + self.batch_size]

-            # Process each file in the batch
+            # Accumulate chunks from all files in batch
+            all_chunks: list[CodeChunk] = []
+            all_metrics: dict[str, Any] = {}
+            file_to_chunks_map: dict[str, tuple[int, int]] = {}
+            file_results: dict[Path, tuple[int, bool]] = {}
+
+            # Parse all files in parallel
+            tasks = []
             for file_path in batch:
-                chunks_added = 0
-                success = False
+                task = asyncio.create_task(
+                    self._parse_and_prepare_file(file_path, force_reindex)
+                )
+                tasks.append(task)

-                try:
-                    # Always remove existing chunks when reindexing
-                    await self.database.delete_by_file(file_path)
-
-                    # Parse file into chunks
-                    chunks = await self._parse_file(file_path)
-
-                    if chunks:
-                        # Build hierarchical relationships
-                        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
-
-                        # Collect metrics for chunks (if collectors are enabled)
-                        chunk_metrics: dict[str, Any] | None = None
-                        if self.collectors:
-                            try:
-                                # Read source code
-                                source_code = file_path.read_bytes()
-
-                                # Detect language from file extension
-                                language = EXTENSION_TO_LANGUAGE.get(
-                                    file_path.suffix.lower(), "unknown"
-                                )
-
-                                # Collect metrics for each chunk
-                                chunk_metrics = {}
-                                for chunk in chunks_with_hierarchy:
-                                    metrics = self._collect_metrics(
-                                        chunk, source_code, language
-                                    )
-                                    if metrics:
-                                        chunk_metrics[chunk.chunk_id] = (
-                                            metrics.to_metadata()
-                                        )
-                            except Exception as e:
-                                logger.warning(
-                                    f"Failed to collect metrics for {file_path}: {e}"
-                                )
-                                chunk_metrics = None
-
-                        # Add chunks to database with metrics
-                        await self.database.add_chunks(
-                            chunks_with_hierarchy, metrics=chunk_metrics
+            parse_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Accumulate chunks from successfully parsed files
+            metadata = self._load_index_metadata()
+            for file_path, result in zip(batch, parse_results, strict=True):
+                if isinstance(result, Exception):
+                    error_msg = f"Failed to index file {file_path}: {type(result).__name__}: {str(result)}"
+                    logger.error(error_msg)
+                    file_results[file_path] = (0, False)
+
+                    # Save error to error log file
+                    try:
+                        error_log_path = (
+                            self.project_root
+                            / ".mcp-vector-search"
+                            / "indexing_errors.log"
                         )
-                        chunks_added = len(chunks)
-                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+                        with open(error_log_path, "a", encoding="utf-8") as f:
+                            timestamp = datetime.now().isoformat()
+                            f.write(f"[{timestamp}] {error_msg}\n")
+                    except Exception as log_err:
+                        logger.debug(f"Failed to write error log: {log_err}")
+                    continue

-                    success = True
+                chunks, metrics = result
+                if chunks:
+                    start_idx = len(all_chunks)
+                    all_chunks.extend(chunks)
+                    end_idx = len(all_chunks)
+                    file_to_chunks_map[str(file_path)] = (start_idx, end_idx)

-                    # Update metadata after successful indexing
+                    # Merge metrics
+                    if metrics:
+                        all_metrics.update(metrics)
+
+                    # Update metadata for successfully parsed file
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (len(chunks), True)
+                    logger.debug(f"Prepared {len(chunks)} chunks from {file_path}")
+                else:
+                    # Empty file is not an error
                     metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (0, True)

+            # Single database insertion for entire batch
+            if all_chunks:
+                logger.info(
+                    f"Batch inserting {len(all_chunks)} chunks from {len(batch)} files"
+                )
+                try:
+                    await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                    logger.debug(
+                        f"Successfully indexed {len(all_chunks)} chunks from batch"
+                    )
                 except Exception as e:
-                    error_msg = f"Failed to index file {file_path}: {type(e).__name__}: {str(e)}"
+                    error_msg = f"Failed to insert batch of chunks: {e}"
                     logger.error(error_msg)
-                    success = False
+                    # Mark all files with chunks in this batch as failed
+                    for file_path in file_to_chunks_map.keys():
+                        file_results[Path(file_path)] = (0, False)

                     # Save error to error log file
                     try:
@@ -1079,18 +1438,18 @@
                             / "indexing_errors.log"
                        )
                        with open(error_log_path, "a", encoding="utf-8") as f:
-                            from datetime import datetime
-
                            timestamp = datetime.now().isoformat()
                            f.write(f"[{timestamp}] {error_msg}\n")
                    except Exception as log_err:
                        logger.debug(f"Failed to write error log: {log_err}")

-                # Yield progress update
-                yield (file_path, chunks_added, success)
+            # Save metadata after batch
+            self._save_index_metadata(metadata)

-        # Save metadata at the end
-        self._save_index_metadata(metadata)
+            # Yield progress updates for each file in batch
+            for file_path in batch:
+                chunks_added, success = file_results.get(file_path, (0, False))
+                yield (file_path, chunks_added, success)

     def _build_chunk_hierarchy(self, chunks: list[CodeChunk]) -> list[CodeChunk]:
         """Build parent-child relationships between chunks.
@@ -1112,7 +1471,9 @@
             return chunks

         # Group chunks by type and name
-        module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
+        # Only actual module chunks (not imports) serve as parents for top-level code
+        # imports chunks should remain siblings of classes/functions, not parents
+        module_chunks = [c for c in chunks if c.chunk_type == "module"]
         class_chunks = [
             c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
         ]