mcp-vector-search 0.12.6__py3-none-any.whl → 1.1.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. mcp_vector_search/__init__.py +3 -3
  2. mcp_vector_search/analysis/__init__.py +111 -0
  3. mcp_vector_search/analysis/baseline/__init__.py +68 -0
  4. mcp_vector_search/analysis/baseline/comparator.py +462 -0
  5. mcp_vector_search/analysis/baseline/manager.py +621 -0
  6. mcp_vector_search/analysis/collectors/__init__.py +74 -0
  7. mcp_vector_search/analysis/collectors/base.py +164 -0
  8. mcp_vector_search/analysis/collectors/cohesion.py +463 -0
  9. mcp_vector_search/analysis/collectors/complexity.py +743 -0
  10. mcp_vector_search/analysis/collectors/coupling.py +1162 -0
  11. mcp_vector_search/analysis/collectors/halstead.py +514 -0
  12. mcp_vector_search/analysis/collectors/smells.py +325 -0
  13. mcp_vector_search/analysis/debt.py +516 -0
  14. mcp_vector_search/analysis/interpretation.py +685 -0
  15. mcp_vector_search/analysis/metrics.py +414 -0
  16. mcp_vector_search/analysis/reporters/__init__.py +7 -0
  17. mcp_vector_search/analysis/reporters/console.py +646 -0
  18. mcp_vector_search/analysis/reporters/markdown.py +480 -0
  19. mcp_vector_search/analysis/reporters/sarif.py +377 -0
  20. mcp_vector_search/analysis/storage/__init__.py +93 -0
  21. mcp_vector_search/analysis/storage/metrics_store.py +762 -0
  22. mcp_vector_search/analysis/storage/schema.py +245 -0
  23. mcp_vector_search/analysis/storage/trend_tracker.py +560 -0
  24. mcp_vector_search/analysis/trends.py +308 -0
  25. mcp_vector_search/analysis/visualizer/__init__.py +90 -0
  26. mcp_vector_search/analysis/visualizer/d3_data.py +534 -0
  27. mcp_vector_search/analysis/visualizer/exporter.py +484 -0
  28. mcp_vector_search/analysis/visualizer/html_report.py +2895 -0
  29. mcp_vector_search/analysis/visualizer/schemas.py +525 -0
  30. mcp_vector_search/cli/commands/analyze.py +1062 -0
  31. mcp_vector_search/cli/commands/chat.py +1455 -0
  32. mcp_vector_search/cli/commands/index.py +621 -5
  33. mcp_vector_search/cli/commands/index_background.py +467 -0
  34. mcp_vector_search/cli/commands/init.py +13 -0
  35. mcp_vector_search/cli/commands/install.py +597 -335
  36. mcp_vector_search/cli/commands/install_old.py +8 -4
  37. mcp_vector_search/cli/commands/mcp.py +78 -6
  38. mcp_vector_search/cli/commands/reset.py +68 -26
  39. mcp_vector_search/cli/commands/search.py +224 -8
  40. mcp_vector_search/cli/commands/setup.py +1184 -0
  41. mcp_vector_search/cli/commands/status.py +339 -5
  42. mcp_vector_search/cli/commands/uninstall.py +276 -357
  43. mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
  44. mcp_vector_search/cli/commands/visualize/cli.py +292 -0
  45. mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
  46. mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
  47. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +33 -0
  48. mcp_vector_search/cli/commands/visualize/graph_builder.py +647 -0
  49. mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
  50. mcp_vector_search/cli/commands/visualize/server.py +600 -0
  51. mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
  52. mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
  53. mcp_vector_search/cli/commands/visualize/templates/base.py +234 -0
  54. mcp_vector_search/cli/commands/visualize/templates/scripts.py +4542 -0
  55. mcp_vector_search/cli/commands/visualize/templates/styles.py +2522 -0
  56. mcp_vector_search/cli/didyoumean.py +27 -2
  57. mcp_vector_search/cli/main.py +127 -160
  58. mcp_vector_search/cli/output.py +158 -13
  59. mcp_vector_search/config/__init__.py +4 -0
  60. mcp_vector_search/config/default_thresholds.yaml +52 -0
  61. mcp_vector_search/config/settings.py +12 -0
  62. mcp_vector_search/config/thresholds.py +273 -0
  63. mcp_vector_search/core/__init__.py +16 -0
  64. mcp_vector_search/core/auto_indexer.py +3 -3
  65. mcp_vector_search/core/boilerplate.py +186 -0
  66. mcp_vector_search/core/config_utils.py +394 -0
  67. mcp_vector_search/core/database.py +406 -94
  68. mcp_vector_search/core/embeddings.py +24 -0
  69. mcp_vector_search/core/exceptions.py +11 -0
  70. mcp_vector_search/core/git.py +380 -0
  71. mcp_vector_search/core/git_hooks.py +4 -4
  72. mcp_vector_search/core/indexer.py +632 -54
  73. mcp_vector_search/core/llm_client.py +756 -0
  74. mcp_vector_search/core/models.py +91 -1
  75. mcp_vector_search/core/project.py +17 -0
  76. mcp_vector_search/core/relationships.py +473 -0
  77. mcp_vector_search/core/scheduler.py +11 -11
  78. mcp_vector_search/core/search.py +179 -29
  79. mcp_vector_search/mcp/server.py +819 -9
  80. mcp_vector_search/parsers/python.py +285 -5
  81. mcp_vector_search/utils/__init__.py +2 -0
  82. mcp_vector_search/utils/gitignore.py +0 -3
  83. mcp_vector_search/utils/gitignore_updater.py +212 -0
  84. mcp_vector_search/utils/monorepo.py +66 -4
  85. mcp_vector_search/utils/timing.py +10 -6
  86. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/METADATA +184 -53
  87. mcp_vector_search-1.1.22.dist-info/RECORD +120 -0
  88. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/WHEEL +1 -1
  89. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/entry_points.txt +1 -0
  90. mcp_vector_search/cli/commands/visualize.py +0 -1467
  91. mcp_vector_search-0.12.6.dist-info/RECORD +0 -68
  92. {mcp_vector_search-0.12.6.dist-info → mcp_vector_search-1.1.22.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/indexer.py
@@ -2,14 +2,20 @@
 
 import asyncio
 import json
+import multiprocessing
 import os
+from concurrent.futures import ProcessPoolExecutor
 from datetime import UTC, datetime
 from pathlib import Path
+from typing import Any
 
 from loguru import logger
 from packaging import version
 
 from .. import __version__
+from ..analysis.collectors.base import MetricCollector
+from ..analysis.metrics import ChunkMetrics
+from ..analysis.trends import TrendTracker
 from ..config.defaults import ALLOWED_DOTFILES, DEFAULT_IGNORE_PATTERNS
 from ..config.settings import ProjectConfig
 from ..parsers.registry import get_parser_registry
@@ -19,6 +25,81 @@ from .database import VectorDatabase
 from .directory_index import DirectoryIndex
 from .exceptions import ParsingError
 from .models import CodeChunk, IndexStats
+from .relationships import RelationshipStore
+
+# Extension to language mapping for metric collection
+EXTENSION_TO_LANGUAGE = {
+    ".py": "python",
+    ".js": "javascript",
+    ".ts": "typescript",
+    ".jsx": "javascript",
+    ".tsx": "typescript",
+    ".java": "java",
+    ".rs": "rust",
+    ".php": "php",
+    ".rb": "ruby",
+}
+
+
+def _parse_file_standalone(
+    args: tuple[Path, str | None],
+) -> tuple[Path, list[CodeChunk], Exception | None]:
+    """Parse a single file - standalone function for multiprocessing.
+
+    This function must be at module level (not a method) to be picklable for
+    multiprocessing. It creates its own parser registry to avoid serialization issues.
+
+    Args:
+        args: Tuple of (file_path, subproject_info_json)
+            - file_path: Path to the file to parse
+            - subproject_info_json: JSON string with subproject info or None
+
+    Returns:
+        Tuple of (file_path, chunks, error)
+            - file_path: The file path that was parsed
+            - chunks: List of parsed CodeChunk objects (empty if error)
+            - error: Exception if parsing failed, None if successful
+    """
+    file_path, subproject_info_json = args
+
+    try:
+        # Create parser registry in this process
+        parser_registry = get_parser_registry()
+
+        # Get appropriate parser
+        parser = parser_registry.get_parser_for_file(file_path)
+
+        # Parse file synchronously (tree-sitter is synchronous anyway)
+        # We need to use the synchronous version of parse_file
+        # Since parsers may have async methods, we'll read and parse directly
+        import asyncio
+
+        # Create event loop for this process if needed
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        # Run the async parse_file in this process's event loop
+        chunks = loop.run_until_complete(parser.parse_file(file_path))
+
+        # Filter out empty chunks
+        valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]
+
+        # Apply subproject information if available
+        if subproject_info_json:
+            subproject_info = json.loads(subproject_info_json)
+            for chunk in valid_chunks:
+                chunk.subproject_name = subproject_info.get("name")
+                chunk.subproject_path = subproject_info.get("relative_path")
+
+        return (file_path, valid_chunks, None)
+
+    except Exception as e:
+        # Return error instead of raising to avoid process crashes
+        logger.error(f"Failed to parse file {file_path} in worker process: {e}")
+        return (file_path, [], e)
 
 
 class SemanticIndexer:
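Editor's note: the docstring above explains that the parser entry point must live at module scope so it can be pickled for worker processes. A minimal standalone sketch of that constraint, using a toy work function rather than the package's parser:

# Why a module-level function: only top-level callables pickle cleanly, so they
# are what ProcessPoolExecutor can ship to worker processes.
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path


def shout(path: Path) -> str:  # module-level, therefore picklable
    return path.name.upper()


if __name__ == "__main__":
    with ProcessPoolExecutor(max_workers=2) as pool:
        print(list(pool.map(shout, [Path("a.py"), Path("b.py")])))
    # prints: ['A.PY', 'B.PY']

A lambda or an instance method defined inside SemanticIndexer would fail to pickle under the default spawn/fork start methods, which is the serialization issue the docstring refers to.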
@@ -33,6 +114,8 @@ class SemanticIndexer:
         max_workers: int | None = None,
         batch_size: int = 10,
         debug: bool = False,
+        collectors: list[MetricCollector] | None = None,
+        use_multiprocessing: bool = True,
     ) -> None:
         """Initialize semantic indexer.
 
@@ -41,9 +124,11 @@ class SemanticIndexer:
             project_root: Project root directory
             file_extensions: File extensions to index (deprecated, use config)
             config: Project configuration (preferred over file_extensions)
-            max_workers: Maximum number of worker threads for parallel processing
+            max_workers: Maximum number of worker processes for parallel parsing (ignored if use_multiprocessing=False)
             batch_size: Number of files to process in each batch
             debug: Enable debug output for hierarchy building
+            collectors: Metric collectors to run during indexing (defaults to all complexity collectors)
+            use_multiprocessing: Enable multiprocess parallel parsing (default: True, disable for debugging)
         """
         self.database = database
         self.project_root = project_root
@@ -63,13 +148,23 @@ class SemanticIndexer:
         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
         self.debug = debug
 
-        # Safely get event loop for max_workers
-        try:
-            loop = asyncio.get_event_loop()
-            self.max_workers = max_workers or min(4, (loop.get_debug() and 1) or 4)
-        except RuntimeError:
-            # No event loop in current thread
-            self.max_workers = max_workers or 4
+        # Initialize metric collectors
+        self.collectors = (
+            collectors if collectors is not None else self._default_collectors()
+        )
+
+        # Configure multiprocessing for parallel parsing
+        self.use_multiprocessing = use_multiprocessing
+        if use_multiprocessing:
+            # Use 75% of CPU cores for parsing, but cap at 8 to avoid overhead
+            cpu_count = multiprocessing.cpu_count()
+            self.max_workers = max_workers or min(max(1, int(cpu_count * 0.75)), 8)
+            logger.debug(
+                f"Multiprocessing enabled with {self.max_workers} workers (CPU count: {cpu_count})"
+            )
+        else:
+            self.max_workers = 1
+            logger.debug("Multiprocessing disabled (single-threaded mode)")
 
         self.batch_size = batch_size
         self._index_metadata_file = (
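Editor's note: the worker-count default added here is a simple clamp: 75% of the CPU count, at least 1, at most 8. A quick standalone check of the arithmetic (not tied to the package):

import multiprocessing


def default_workers(cpu_count: int) -> int:
    # 75% of cores, floored, clamped to the range [1, 8]
    return min(max(1, int(cpu_count * 0.75)), 8)


for cores in (1, 2, 4, 8, 16, 32):
    print(cores, "->", default_workers(cores))
# 1 -> 1, 2 -> 1, 4 -> 3, 8 -> 6, 16 -> 8, 32 -> 8
print("this machine:", default_workers(multiprocessing.cpu_count()))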
@@ -110,16 +205,162 @@ class SemanticIndexer:
         # Load existing directory index
         self.directory_index.load()
 
+        # Initialize relationship store for pre-computing visualization relationships
+        self.relationship_store = RelationshipStore(project_root)
+
+        # Initialize trend tracker for historical metrics
+        self.trend_tracker = TrendTracker(project_root)
+
+    def _default_collectors(self) -> list[MetricCollector]:
+        """Return default set of metric collectors.
+
+        Returns:
+            List of all complexity collectors (cognitive, cyclomatic, nesting, parameters, methods)
+        """
+        from ..analysis.collectors.complexity import (
+            CognitiveComplexityCollector,
+            CyclomaticComplexityCollector,
+            MethodCountCollector,
+            NestingDepthCollector,
+            ParameterCountCollector,
+        )
+
+        return [
+            CognitiveComplexityCollector(),
+            CyclomaticComplexityCollector(),
+            NestingDepthCollector(),
+            ParameterCountCollector(),
+            MethodCountCollector(),
+        ]
+
+    def _collect_metrics(
+        self, chunk: CodeChunk, source_code: bytes, language: str
+    ) -> ChunkMetrics | None:
+        """Collect metrics for a code chunk.
+
+        This is a simplified version that estimates metrics from chunk content
+        without full TreeSitter traversal. Future implementation will use
+        TreeSitter node traversal for accurate metric collection.
+
+        Args:
+            chunk: The parsed code chunk
+            source_code: Raw source code bytes
+            language: Programming language identifier
+
+        Returns:
+            ChunkMetrics for the chunk, or None if no metrics collected
+        """
+        # For now, create basic metrics from chunk content
+        # TODO: Implement full TreeSitter traversal in Phase 2
+        lines_of_code = chunk.line_count
+
+        # Estimate complexity from simple heuristics
+        content = chunk.content
+        cognitive_complexity = self._estimate_cognitive_complexity(content)
+        cyclomatic_complexity = self._estimate_cyclomatic_complexity(content)
+        max_nesting_depth = self._estimate_nesting_depth(content)
+        parameter_count = len(chunk.parameters) if chunk.parameters else 0
+
+        metrics = ChunkMetrics(
+            cognitive_complexity=cognitive_complexity,
+            cyclomatic_complexity=cyclomatic_complexity,
+            max_nesting_depth=max_nesting_depth,
+            parameter_count=parameter_count,
+            lines_of_code=lines_of_code,
+        )
+
+        return metrics
+
+    def _estimate_cognitive_complexity(self, content: str) -> int:
+        """Estimate cognitive complexity from content (simplified heuristic).
+
+        Args:
+            content: Code content
+
+        Returns:
+            Estimated cognitive complexity score
+        """
+        # Simple heuristic: count control flow keywords
+        keywords = [
+            "if",
+            "elif",
+            "else",
+            "for",
+            "while",
+            "try",
+            "except",
+            "case",
+            "when",
+        ]
+        complexity = 0
+        for keyword in keywords:
+            complexity += content.count(f" {keyword} ")
+            complexity += content.count(f"\t{keyword} ")
+            complexity += content.count(f"\n{keyword} ")
+        return complexity
+
+    def _estimate_cyclomatic_complexity(self, content: str) -> int:
+        """Estimate cyclomatic complexity from content (simplified heuristic).
+
+        Args:
+            content: Code content
+
+        Returns:
+            Estimated cyclomatic complexity score (minimum 1)
+        """
+        # Start with baseline of 1
+        complexity = 1
+
+        # Count decision points
+        keywords = [
+            "if",
+            "elif",
+            "for",
+            "while",
+            "case",
+            "when",
+            "&&",
+            "||",
+            "and",
+            "or",
+        ]
+        for keyword in keywords:
+            complexity += content.count(keyword)
+
+        return complexity
+
+    def _estimate_nesting_depth(self, content: str) -> int:
+        """Estimate maximum nesting depth from indentation (simplified heuristic).
+
+        Args:
+            content: Code content
+
+        Returns:
+            Estimated maximum nesting depth
+        """
+        max_depth = 0
+        for line in content.split("\n"):
+            # Count leading whitespace (4 spaces or 1 tab = 1 level)
+            leading = len(line) - len(line.lstrip())
+            if "\t" in line[:leading]:
+                depth = line[:leading].count("\t")
+            else:
+                depth = leading // 4
+            max_depth = max(max_depth, depth)
+        return max_depth
+
     async def index_project(
         self,
         force_reindex: bool = False,
         show_progress: bool = True,
+        skip_relationships: bool = False,
     ) -> int:
         """Index all files in the project.
 
         Args:
             force_reindex: Whether to reindex existing files
             show_progress: Whether to show progress information
+            skip_relationships: Skip computing relationships for visualization (faster, but visualize will be slower)
 
         Returns:
             Number of files indexed
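Editor's note: the estimators above are substring-counting heuristics, not real parsers. A self-contained rerun of the same counting logic on a tiny snippet shows what they produce (this reproduces the heuristic from the diff, not the collectors in mcp_vector_search.analysis):

SAMPLE = """def f(x):
    for i in range(x):
        if i % 2 and i > 1:
            print(i)
"""


def cyclomatic(content: str) -> int:
    # Baseline 1 plus raw substring counts of decision keywords/operators.
    complexity = 1
    for kw in ["if", "elif", "for", "while", "case", "when", "&&", "||", "and", "or"]:
        complexity += content.count(kw)
    return complexity


def nesting_depth(content: str) -> int:
    # Four leading spaces (or one tab) count as one nesting level.
    max_depth = 0
    for line in content.split("\n"):
        leading = len(line) - len(line.lstrip())
        depth = line[:leading].count("\t") if "\t" in line[:leading] else leading // 4
        max_depth = max(max_depth, depth)
    return max_depth


print(cyclomatic(SAMPLE), nesting_depth(SAMPLE))  # prints: 5 3

Note the overcounting inherent in substring matching: "for" also matches the "or" decision keyword, so the sample scores 5 rather than the 4 a real cyclomatic count would give. The docstrings in the diff flag this as a stopgap until TreeSitter traversal lands.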
@@ -222,12 +463,134 @@ class SemanticIndexer:
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )
 
+        # Mark relationships for background computation (unless skipped)
+        # Default behavior: skip blocking computation, mark for background processing
+        if not skip_relationships and indexed_count > 0:
+            try:
+                logger.info("Marking relationships for background computation...")
+                # Get all chunks from database for relationship computation
+                all_chunks = await self.database.get_all_chunks()
+
+                if len(all_chunks) > 0:
+                    # Mark for background computation (non-blocking)
+                    await self.relationship_store.compute_and_store(
+                        all_chunks, self.database, background=True
+                    )
+                    logger.info("✓ Relationships marked for background computation")
+                    logger.info(
+                        " Use 'mcp-vector-search index relationships' to compute now or wait for background task"
+                    )
+                else:
+                    logger.warning("No chunks found for relationship computation")
+            except Exception as e:
+                logger.warning(f"Failed to mark relationships: {e}")
+                logger.debug("Visualization will compute relationships on demand")
+
+        # Save trend snapshot after successful indexing
+        if indexed_count > 0:
+            try:
+                logger.info("Saving metrics snapshot for trend tracking...")
+                # Get database stats
+                stats = await self.database.get_stats()
+                # Get all chunks for detailed metrics
+                all_chunks = await self.database.get_all_chunks()
+                # Compute metrics from stats and chunks
+                metrics = self.trend_tracker.compute_metrics_from_stats(
+                    stats.to_dict(), all_chunks
+                )
+                # Save snapshot (updates today's entry if exists)
+                self.trend_tracker.save_snapshot(metrics)
+                logger.info(
+                    f"✓ Saved trend snapshot: {metrics['total_files']} files, "
+                    f"{metrics['total_chunks']} chunks, health score {metrics['health_score']}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to save trend snapshot: {e}")
+
         return indexed_count
 
+    async def _parse_and_prepare_file(
+        self, file_path: Path, force_reindex: bool = False
+    ) -> tuple[list[CodeChunk], dict[str, Any] | None]:
+        """Parse file and prepare chunks with metrics (no database insertion).
+
+        This method extracts the parsing and metric collection logic from index_file()
+        to enable batch processing across multiple files.
+
+        Args:
+            file_path: Path to the file to parse
+            force_reindex: Whether to force reindexing (always deletes existing chunks)
+
+        Returns:
+            Tuple of (chunks_with_hierarchy, chunk_metrics)
+
+        Raises:
+            ParsingError: If file parsing fails
+        """
+        # Check if file should be indexed
+        if not self._should_index_file(file_path):
+            return ([], None)
+
+        # Always remove existing chunks when reindexing a file
+        # This prevents duplicate chunks and ensures consistency
+        await self.database.delete_by_file(file_path)
+
+        # Parse file into chunks
+        chunks = await self._parse_file(file_path)
+
+        if not chunks:
+            logger.debug(f"No chunks extracted from {file_path}")
+            return ([], None)
+
+        # Build hierarchical relationships between chunks
+        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+        # Debug: Check if hierarchy was built
+        methods_with_parents = sum(
+            1
+            for c in chunks_with_hierarchy
+            if c.chunk_type in ("method", "function") and c.parent_chunk_id
+        )
+        logger.debug(
+            f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
+        )
+
+        # Collect metrics for chunks (if collectors are enabled)
+        chunk_metrics: dict[str, Any] | None = None
+        if self.collectors:
+            try:
+                # Read source code
+                source_code = file_path.read_bytes()
+
+                # Detect language from file extension
+                language = EXTENSION_TO_LANGUAGE.get(
+                    file_path.suffix.lower(), "unknown"
+                )
+
+                # Collect metrics for each chunk
+                chunk_metrics = {}
+                for chunk in chunks_with_hierarchy:
+                    metrics = self._collect_metrics(chunk, source_code, language)
+                    if metrics:
+                        chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+
+                logger.debug(
+                    f"Collected metrics for {len(chunk_metrics)} chunks from {file_path}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to collect metrics for {file_path}: {e}")
+                chunk_metrics = None
+
+        return (chunks_with_hierarchy, chunk_metrics)
+
     async def _process_file_batch(
         self, file_paths: list[Path], force_reindex: bool = False
     ) -> list[bool]:
-        """Process a batch of files in parallel.
+        """Process a batch of files and accumulate chunks for batch embedding.
+
+        This method processes multiple files in parallel (using multiprocessing for
+        CPU-bound parsing) and then performs a single database insertion for all chunks,
+        enabling efficient batch embedding generation.
 
         Args:
             file_paths: List of file paths to process
@@ -236,26 +599,166 @@ class SemanticIndexer:
         Returns:
             List of success flags for each file
         """
-        # Create tasks for parallel processing
-        tasks = []
+        all_chunks: list[CodeChunk] = []
+        all_metrics: dict[str, Any] = {}
+        file_to_chunks_map: dict[str, tuple[int, int]] = {}
+        success_flags: list[bool] = []
+
+        # Filter files that should be indexed and delete old chunks
+        files_to_parse = []
         for file_path in file_paths:
-            task = asyncio.create_task(self._index_file_safe(file_path, force_reindex))
-            tasks.append(task)
+            if not self._should_index_file(file_path):
+                success_flags.append(True)  # Skipped file is not an error
+                continue
+            # Delete old chunks before parsing
+            await self.database.delete_by_file(file_path)
+            files_to_parse.append(file_path)
 
-        # Wait for all tasks to complete
-        results = await asyncio.gather(*tasks, return_exceptions=True)
+        if not files_to_parse:
+            return success_flags
 
-        # Convert results to success flags
-        success_flags = []
-        for i, result in enumerate(results):
-            if isinstance(result, Exception):
-                logger.error(f"Failed to index {file_paths[i]}: {result}")
+        # Parse files using multiprocessing if enabled
+        if self.use_multiprocessing and len(files_to_parse) > 1:
+            # Use ProcessPoolExecutor for CPU-bound parsing
+            parse_results = await self._parse_files_multiprocess(files_to_parse)
+        else:
+            # Fall back to async processing (for single file or disabled multiprocessing)
+            parse_results = await self._parse_files_async(files_to_parse)
+
+        # Accumulate chunks from all successfully parsed files
+        metadata = self._load_index_metadata()
+        for file_path, chunks, error in parse_results:
+            if error:
+                logger.error(f"Failed to parse {file_path}: {error}")
                 success_flags.append(False)
+                continue
+
+            if chunks:
+                # Build hierarchy and collect metrics for parsed chunks
+                chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+                # Collect metrics if enabled
+                chunk_metrics = None
+                if self.collectors:
+                    try:
+                        source_code = file_path.read_bytes()
+                        language = EXTENSION_TO_LANGUAGE.get(
+                            file_path.suffix.lower(), "unknown"
+                        )
+                        chunk_metrics = {}
+                        for chunk in chunks_with_hierarchy:
+                            metrics = self._collect_metrics(
+                                chunk, source_code, language
+                            )
+                            if metrics:
+                                chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+                    except Exception as e:
+                        logger.warning(
+                            f"Failed to collect metrics for {file_path}: {e}"
+                        )
+
+                # Accumulate chunks
+                start_idx = len(all_chunks)
+                all_chunks.extend(chunks_with_hierarchy)
+                end_idx = len(all_chunks)
+                file_to_chunks_map[str(file_path)] = (start_idx, end_idx)
+
+                # Merge metrics
+                if chunk_metrics:
+                    all_metrics.update(chunk_metrics)
+
+                # Update metadata for successfully parsed file
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
             else:
-                success_flags.append(result)
+                # Empty file is not an error
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
+
+        # Single database insertion for entire batch
+        if all_chunks:
+            logger.info(
+                f"Batch inserting {len(all_chunks)} chunks from {len(file_paths)} files"
+            )
+            try:
+                await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                logger.debug(
+                    f"Successfully indexed {len(all_chunks)} chunks from {sum(success_flags)} files"
+                )
+            except Exception as e:
+                logger.error(f"Failed to insert batch of chunks: {e}")
+                # Mark all files in this batch as failed
+                return [False] * len(file_paths)
+
+        # Save updated metadata after successful batch
+        self._save_index_metadata(metadata)
 
         return success_flags
 
+    async def _parse_files_multiprocess(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using multiprocessing for CPU-bound parallelism.
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        # Prepare arguments for worker processes
+        parse_args = []
+        for file_path in file_paths:
+            # Get subproject info if available
+            subproject = self.monorepo_detector.get_subproject_for_file(file_path)
+            subproject_info_json = None
+            if subproject:
+                subproject_info_json = json.dumps(
+                    {
+                        "name": subproject.name,
+                        "relative_path": subproject.relative_path,
+                    }
+                )
+            parse_args.append((file_path, subproject_info_json))
+
+        # Limit workers to avoid overhead
+        max_workers = min(self.max_workers, len(file_paths))
+
+        # Run parsing in ProcessPoolExecutor
+        loop = asyncio.get_running_loop()
+        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks and wait for results
+            results = await loop.run_in_executor(
+                None, lambda: list(executor.map(_parse_file_standalone, parse_args))
+            )
+
+        logger.debug(
+            f"Multiprocess parsing completed: {len(results)} files parsed with {max_workers} workers"
+        )
+        return results
+
+    async def _parse_files_async(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using async (fallback for single file or disabled multiprocessing).
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        results = []
+        for file_path in file_paths:
+            try:
+                chunks = await self._parse_file(file_path)
+                results.append((file_path, chunks, None))
+            except Exception as e:
+                logger.error(f"Failed to parse {file_path}: {e}")
+                results.append((file_path, [], e))
+
+        return results
+
     def _load_index_metadata(self) -> dict[str, float]:
         """Load file modification times from metadata file.
 
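Editor's note: _parse_files_multiprocess keeps the event loop responsive by handing the blocking executor.map call to the loop's default thread-pool executor. The pattern can be reproduced in isolation with a toy work function (a minimal sketch, not the package's parser):

import asyncio
from concurrent.futures import ProcessPoolExecutor


def square(n: int) -> int:  # module-level, so it is picklable for worker processes
    return n * n


async def main() -> None:
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor(max_workers=2) as executor:
        # executor.map blocks, so push it onto the default thread pool (None)
        results = await loop.run_in_executor(
            None, lambda: list(executor.map(square, range(5)))
        )
    print(results)  # [0, 1, 4, 9, 16]


if __name__ == "__main__":
    asyncio.run(main())

The thread hop costs little here because the real work happens in the child processes; the thread only waits for them.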
@@ -379,8 +882,34 @@ class SemanticIndexer:
             f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
         )
 
-        # Add chunks to database
-        await self.database.add_chunks(chunks_with_hierarchy)
+        # Collect metrics for chunks (if collectors are enabled)
+        chunk_metrics: dict[str, Any] | None = None
+        if self.collectors:
+            try:
+                # Read source code
+                source_code = file_path.read_bytes()
+
+                # Detect language from file extension
+                language = EXTENSION_TO_LANGUAGE.get(
+                    file_path.suffix.lower(), "unknown"
+                )
+
+                # Collect metrics for each chunk
+                chunk_metrics = {}
+                for chunk in chunks_with_hierarchy:
+                    metrics = self._collect_metrics(chunk, source_code, language)
+                    if metrics:
+                        chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+
+                logger.debug(
+                    f"Collected metrics for {len(chunk_metrics)} chunks from {file_path}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to collect metrics for {file_path}: {e}")
+                chunk_metrics = None
+
+        # Add chunks to database with metrics
+        await self.database.add_chunks(chunks_with_hierarchy, metrics=chunk_metrics)
 
         # Update metadata after successful indexing
         metadata = self._load_index_metadata()
@@ -572,8 +1101,10 @@ class SemanticIndexer:
         # Get relative path from project root for checking
         relative_path = file_path.relative_to(self.project_root)
 
-        # 1. Check dotfile filtering (if enabled in config)
-        if self.config and self.config.skip_dotfiles:
+        # 1. Check dotfile filtering (ENABLED BY DEFAULT)
+        # Skip dotfiles unless config explicitly disables it
+        skip_dotfiles = self.config.skip_dotfiles if self.config else True
+        if skip_dotfiles:
             for part in relative_path.parts:
                 # Skip dotfiles unless they're in the whitelist
                 if part.startswith(".") and part not in ALLOWED_DOTFILES:
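Editor's note: the behavioral change here is that dotfile filtering now defaults to on when no config object is present. The check walks every path component, so a dotted directory anywhere in the relative path excludes the file. A standalone illustration with a made-up whitelist (not the package's ALLOWED_DOTFILES):

from pathlib import Path

ALLOWED = {".github"}  # hypothetical whitelist for this sketch


def skipped(relative_path: Path) -> bool:
    return any(
        part.startswith(".") and part not in ALLOWED
        for part in relative_path.parts
    )


print(skipped(Path("src/app.py")))        # False
print(skipped(Path(".venv/lib/mod.py")))  # True  (dotted directory component)
print(skipped(Path(".github/ci.yml")))    # False (whitelisted component)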
@@ -807,6 +1338,9 @@ class SemanticIndexer:
     ):
         """Index files and yield progress updates for each file.
 
+        This method processes files in batches and accumulates chunks across files
+        before performing a single database insertion per batch for better performance.
+
         Args:
             files_to_index: List of file paths to index
             force_reindex: Whether to force reindexing
@@ -817,42 +1351,84 @@ class SemanticIndexer:
         # Write version header to error log at start of indexing run
         self._write_indexing_run_header()
 
-        metadata = self._load_index_metadata()
-
-        # Process files in batches for better memory management
+        # Process files in batches for better memory management and embedding efficiency
         for i in range(0, len(files_to_index), self.batch_size):
             batch = files_to_index[i : i + self.batch_size]
 
-            # Process each file in the batch
+            # Accumulate chunks from all files in batch
+            all_chunks: list[CodeChunk] = []
+            all_metrics: dict[str, Any] = {}
+            file_to_chunks_map: dict[str, tuple[int, int]] = {}
+            file_results: dict[Path, tuple[int, bool]] = {}
+
+            # Parse all files in parallel
+            tasks = []
             for file_path in batch:
-                chunks_added = 0
-                success = False
+                task = asyncio.create_task(
+                    self._parse_and_prepare_file(file_path, force_reindex)
+                )
+                tasks.append(task)
 
-                try:
-                    # Always remove existing chunks when reindexing
-                    await self.database.delete_by_file(file_path)
+            parse_results = await asyncio.gather(*tasks, return_exceptions=True)
 
-                    # Parse file into chunks
-                    chunks = await self._parse_file(file_path)
+            # Accumulate chunks from successfully parsed files
+            metadata = self._load_index_metadata()
+            for file_path, result in zip(batch, parse_results, strict=True):
+                if isinstance(result, Exception):
+                    error_msg = f"Failed to index file {file_path}: {type(result).__name__}: {str(result)}"
+                    logger.error(error_msg)
+                    file_results[file_path] = (0, False)
 
-                    if chunks:
-                        # Build hierarchical relationships
-                        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+                    # Save error to error log file
+                    try:
+                        error_log_path = (
+                            self.project_root
+                            / ".mcp-vector-search"
+                            / "indexing_errors.log"
+                        )
+                        with open(error_log_path, "a", encoding="utf-8") as f:
+                            timestamp = datetime.now().isoformat()
+                            f.write(f"[{timestamp}] {error_msg}\n")
+                    except Exception as log_err:
+                        logger.debug(f"Failed to write error log: {log_err}")
+                    continue
 
-                        # Add chunks to database
-                        await self.database.add_chunks(chunks_with_hierarchy)
-                        chunks_added = len(chunks)
-                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+                chunks, metrics = result
+                if chunks:
+                    start_idx = len(all_chunks)
+                    all_chunks.extend(chunks)
+                    end_idx = len(all_chunks)
+                    file_to_chunks_map[str(file_path)] = (start_idx, end_idx)
 
-                    success = True
+                    # Merge metrics
+                    if metrics:
+                        all_metrics.update(metrics)
 
-                    # Update metadata after successful indexing
+                    # Update metadata for successfully parsed file
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (len(chunks), True)
+                    logger.debug(f"Prepared {len(chunks)} chunks from {file_path}")
+                else:
+                    # Empty file is not an error
                     metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (0, True)
 
+            # Single database insertion for entire batch
+            if all_chunks:
+                logger.info(
+                    f"Batch inserting {len(all_chunks)} chunks from {len(batch)} files"
+                )
+                try:
+                    await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                    logger.debug(
+                        f"Successfully indexed {len(all_chunks)} chunks from batch"
+                    )
                 except Exception as e:
-                    error_msg = f"Failed to index file {file_path}: {type(e).__name__}: {str(e)}"
+                    error_msg = f"Failed to insert batch of chunks: {e}"
                     logger.error(error_msg)
-                    success = False
+                    # Mark all files with chunks in this batch as failed
+                    for file_path in file_to_chunks_map.keys():
+                        file_results[Path(file_path)] = (0, False)
 
                     # Save error to error log file
                     try:
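Editor's note: the per-batch flow above leans on asyncio.gather(..., return_exceptions=True), which returns either a result or the exception object for each task instead of aborting the whole batch on the first failure. A self-contained sketch of that pattern with a toy coroutine (not the indexer's _parse_and_prepare_file):

import asyncio


async def parse(name: str) -> list[str]:  # stand-in for the per-file parse step
    if name == "bad.py":
        raise ValueError("syntax error")
    return [f"{name}:chunk0"]


async def main() -> None:
    files = ["a.py", "bad.py", "b.py"]
    results = await asyncio.gather(*(parse(f) for f in files), return_exceptions=True)
    for f, result in zip(files, results, strict=True):
        if isinstance(result, Exception):
            print(f, "failed:", result)  # record the failure, keep going
        else:
            print(f, "->", result)


asyncio.run(main())

Because gather preserves input order, zipping the batch against the results (with strict=True, as the diff does) keeps each file paired with its own outcome.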
@@ -862,18 +1438,18 @@ class SemanticIndexer:
                             / ".mcp-vector-search"
                             / "indexing_errors.log"
                         )
-                            from datetime import datetime
-
                             timestamp = datetime.now().isoformat()
                             f.write(f"[{timestamp}] {error_msg}\n")
                     except Exception as log_err:
                         logger.debug(f"Failed to write error log: {log_err}")
 
-                # Yield progress update
-                yield (file_path, chunks_added, success)
+            # Save metadata after batch
+            self._save_index_metadata(metadata)
 
-        # Save metadata at the end
-        self._save_index_metadata(metadata)
+            # Yield progress updates for each file in batch
+            for file_path in batch:
+                chunks_added, success = file_results.get(file_path, (0, False))
+                yield (file_path, chunks_added, success)
 
 
     def _build_chunk_hierarchy(self, chunks: list[CodeChunk]) -> list[CodeChunk]:
@@ -895,7 +1471,9 @@ class SemanticIndexer:
             return chunks
 
         # Group chunks by type and name
-        module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
+        # Only actual module chunks (not imports) serve as parents for top-level code
+        # imports chunks should remain siblings of classes/functions, not parents
+        module_chunks = [c for c in chunks if c.chunk_type == "module"]
         class_chunks = [
             c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
         ]