mcp-vector-search 1.0.3__py3-none-any.whl → 1.1.22__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as published in their public registries.
- mcp_vector_search/__init__.py +3 -3
- mcp_vector_search/analysis/__init__.py +48 -1
- mcp_vector_search/analysis/baseline/__init__.py +68 -0
- mcp_vector_search/analysis/baseline/comparator.py +462 -0
- mcp_vector_search/analysis/baseline/manager.py +621 -0
- mcp_vector_search/analysis/collectors/__init__.py +35 -0
- mcp_vector_search/analysis/collectors/cohesion.py +463 -0
- mcp_vector_search/analysis/collectors/coupling.py +1162 -0
- mcp_vector_search/analysis/collectors/halstead.py +514 -0
- mcp_vector_search/analysis/collectors/smells.py +325 -0
- mcp_vector_search/analysis/debt.py +516 -0
- mcp_vector_search/analysis/interpretation.py +685 -0
- mcp_vector_search/analysis/metrics.py +74 -1
- mcp_vector_search/analysis/reporters/__init__.py +3 -1
- mcp_vector_search/analysis/reporters/console.py +424 -0
- mcp_vector_search/analysis/reporters/markdown.py +480 -0
- mcp_vector_search/analysis/reporters/sarif.py +377 -0
- mcp_vector_search/analysis/storage/__init__.py +93 -0
- mcp_vector_search/analysis/storage/metrics_store.py +762 -0
- mcp_vector_search/analysis/storage/schema.py +245 -0
- mcp_vector_search/analysis/storage/trend_tracker.py +560 -0
- mcp_vector_search/analysis/trends.py +308 -0
- mcp_vector_search/analysis/visualizer/__init__.py +90 -0
- mcp_vector_search/analysis/visualizer/d3_data.py +534 -0
- mcp_vector_search/analysis/visualizer/exporter.py +484 -0
- mcp_vector_search/analysis/visualizer/html_report.py +2895 -0
- mcp_vector_search/analysis/visualizer/schemas.py +525 -0
- mcp_vector_search/cli/commands/analyze.py +665 -11
- mcp_vector_search/cli/commands/chat.py +193 -0
- mcp_vector_search/cli/commands/index.py +600 -2
- mcp_vector_search/cli/commands/index_background.py +467 -0
- mcp_vector_search/cli/commands/search.py +194 -1
- mcp_vector_search/cli/commands/setup.py +64 -13
- mcp_vector_search/cli/commands/status.py +302 -3
- mcp_vector_search/cli/commands/visualize/cli.py +26 -10
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +8 -4
- mcp_vector_search/cli/commands/visualize/graph_builder.py +167 -234
- mcp_vector_search/cli/commands/visualize/server.py +304 -15
- mcp_vector_search/cli/commands/visualize/templates/base.py +60 -6
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +2100 -65
- mcp_vector_search/cli/commands/visualize/templates/styles.py +1297 -88
- mcp_vector_search/cli/didyoumean.py +5 -0
- mcp_vector_search/cli/main.py +16 -5
- mcp_vector_search/cli/output.py +134 -5
- mcp_vector_search/config/thresholds.py +89 -1
- mcp_vector_search/core/__init__.py +16 -0
- mcp_vector_search/core/database.py +39 -2
- mcp_vector_search/core/embeddings.py +24 -0
- mcp_vector_search/core/git.py +380 -0
- mcp_vector_search/core/indexer.py +445 -84
- mcp_vector_search/core/llm_client.py +9 -4
- mcp_vector_search/core/models.py +88 -1
- mcp_vector_search/core/relationships.py +473 -0
- mcp_vector_search/core/search.py +1 -1
- mcp_vector_search/mcp/server.py +795 -4
- mcp_vector_search/parsers/python.py +285 -5
- mcp_vector_search/utils/gitignore.py +0 -3
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/METADATA +3 -2
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/RECORD +62 -39
- mcp_vector_search/cli/commands/visualize.py.original +0 -2536
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/WHEEL +0 -0
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-1.0.3.dist-info → mcp_vector_search-1.1.22.dist-info}/licenses/LICENSE +0 -0

The hunks below are the changes to mcp_vector_search/core/indexer.py (+445/-84 in the listing above):

@@ -2,7 +2,9 @@
 
 import asyncio
 import json
+import multiprocessing
 import os
+from concurrent.futures import ProcessPoolExecutor
 from datetime import UTC, datetime
 from pathlib import Path
 from typing import Any
@@ -13,6 +15,7 @@ from packaging import version
 from .. import __version__
 from ..analysis.collectors.base import MetricCollector
 from ..analysis.metrics import ChunkMetrics
+from ..analysis.trends import TrendTracker
 from ..config.defaults import ALLOWED_DOTFILES, DEFAULT_IGNORE_PATTERNS
 from ..config.settings import ProjectConfig
 from ..parsers.registry import get_parser_registry
@@ -22,6 +25,7 @@ from .database import VectorDatabase
 from .directory_index import DirectoryIndex
 from .exceptions import ParsingError
 from .models import CodeChunk, IndexStats
+from .relationships import RelationshipStore
 
 # Extension to language mapping for metric collection
 EXTENSION_TO_LANGUAGE = {
@@ -37,6 +41,67 @@ EXTENSION_TO_LANGUAGE = {
 }
 
 
+def _parse_file_standalone(
+    args: tuple[Path, str | None],
+) -> tuple[Path, list[CodeChunk], Exception | None]:
+    """Parse a single file - standalone function for multiprocessing.
+
+    This function must be at module level (not a method) to be picklable for
+    multiprocessing. It creates its own parser registry to avoid serialization issues.
+
+    Args:
+        args: Tuple of (file_path, subproject_info_json)
+            - file_path: Path to the file to parse
+            - subproject_info_json: JSON string with subproject info or None
+
+    Returns:
+        Tuple of (file_path, chunks, error)
+            - file_path: The file path that was parsed
+            - chunks: List of parsed CodeChunk objects (empty if error)
+            - error: Exception if parsing failed, None if successful
+    """
+    file_path, subproject_info_json = args
+
+    try:
+        # Create parser registry in this process
+        parser_registry = get_parser_registry()
+
+        # Get appropriate parser
+        parser = parser_registry.get_parser_for_file(file_path)
+
+        # Parse file synchronously (tree-sitter is synchronous anyway)
+        # We need to use the synchronous version of parse_file
+        # Since parsers may have async methods, we'll read and parse directly
+        import asyncio
+
+        # Create event loop for this process if needed
+        try:
+            loop = asyncio.get_event_loop()
+        except RuntimeError:
+            loop = asyncio.new_event_loop()
+            asyncio.set_event_loop(loop)
+
+        # Run the async parse_file in this process's event loop
+        chunks = loop.run_until_complete(parser.parse_file(file_path))
+
+        # Filter out empty chunks
+        valid_chunks = [chunk for chunk in chunks if chunk.content.strip()]
+
+        # Apply subproject information if available
+        if subproject_info_json:
+            subproject_info = json.loads(subproject_info_json)
+            for chunk in valid_chunks:
+                chunk.subproject_name = subproject_info.get("name")
+                chunk.subproject_path = subproject_info.get("relative_path")
+
+        return (file_path, valid_chunks, None)
+
+    except Exception as e:
+        # Return error instead of raising to avoid process crashes
+        logger.error(f"Failed to parse file {file_path} in worker process: {e}")
+        return (file_path, [], e)
+
+
 class SemanticIndexer:
     """Semantic indexer for parsing and indexing code files."""
 
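
The new `_parse_file_standalone` worker illustrates the two constraints of CPU-bound parsing with `ProcessPoolExecutor`: the callable must live at module level so it can be pickled, and any async parser API has to be driven by an event loop owned by the worker process, with errors returned rather than raised. A minimal, runnable sketch of the same pattern; `parse` here is a stand-in coroutine, not the package's parser registry:

```python
import asyncio
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path


async def parse(path: Path) -> list[str]:
    """Stand-in async parser: one 'chunk' per non-empty line."""
    text = path.read_text(encoding="utf-8", errors="ignore")
    return [line for line in text.splitlines() if line.strip()]


def parse_standalone(path: Path) -> tuple[Path, list[str], Exception | None]:
    """Module-level (picklable) worker; errors are returned, not raised."""
    try:
        # asyncio.run gives this worker process its own event loop
        # (the package instead reuses or creates a loop explicitly).
        chunks = asyncio.run(parse(path))
        return (path, chunks, None)
    except Exception as exc:
        return (path, [], exc)


if __name__ == "__main__":
    # The __main__ guard matters: worker processes re-import this module.
    files = [Path(__file__)]
    with ProcessPoolExecutor(max_workers=2) as pool:
        for path, chunks, err in pool.map(parse_standalone, files):
            print(path.name, len(chunks), err)
```
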
@@ -50,6 +115,7 @@ class SemanticIndexer:
         batch_size: int = 10,
         debug: bool = False,
         collectors: list[MetricCollector] | None = None,
+        use_multiprocessing: bool = True,
     ) -> None:
         """Initialize semantic indexer.
 
@@ -58,10 +124,11 @@ class SemanticIndexer:
             project_root: Project root directory
             file_extensions: File extensions to index (deprecated, use config)
             config: Project configuration (preferred over file_extensions)
-            max_workers: Maximum number of worker
+            max_workers: Maximum number of worker processes for parallel parsing (ignored if use_multiprocessing=False)
             batch_size: Number of files to process in each batch
             debug: Enable debug output for hierarchy building
             collectors: Metric collectors to run during indexing (defaults to all complexity collectors)
+            use_multiprocessing: Enable multiprocess parallel parsing (default: True, disable for debugging)
         """
         self.database = database
         self.project_root = project_root
@@ -86,13 +153,18 @@ class SemanticIndexer:
             collectors if collectors is not None else self._default_collectors()
         )
 
-        #
-
-
-
-
-
-
+        # Configure multiprocessing for parallel parsing
+        self.use_multiprocessing = use_multiprocessing
+        if use_multiprocessing:
+            # Use 75% of CPU cores for parsing, but cap at 8 to avoid overhead
+            cpu_count = multiprocessing.cpu_count()
+            self.max_workers = max_workers or min(max(1, int(cpu_count * 0.75)), 8)
+            logger.debug(
+                f"Multiprocessing enabled with {self.max_workers} workers (CPU count: {cpu_count})"
+            )
+        else:
+            self.max_workers = 1
+            logger.debug("Multiprocessing disabled (single-threaded mode)")
 
         self.batch_size = batch_size
         self._index_metadata_file = (
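
The constructor's worker sizing is a simple heuristic: an explicit `max_workers` wins; otherwise roughly 75% of the CPU count is used, clamped to between 1 and 8 workers. The same rule in isolation:

```python
import multiprocessing


def pick_worker_count(max_workers: int | None = None, cap: int = 8) -> int:
    """Explicit value wins; otherwise ~75% of CPUs, clamped to [1, cap]."""
    cpu_count = multiprocessing.cpu_count()
    return max_workers or min(max(1, int(cpu_count * 0.75)), cap)


if __name__ == "__main__":
    print(pick_worker_count())               # e.g. 6 on an 8-core machine
    print(pick_worker_count(max_workers=2))  # explicit override -> 2
```
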
@@ -133,6 +205,12 @@ class SemanticIndexer:
         # Load existing directory index
         self.directory_index.load()
 
+        # Initialize relationship store for pre-computing visualization relationships
+        self.relationship_store = RelationshipStore(project_root)
+
+        # Initialize trend tracker for historical metrics
+        self.trend_tracker = TrendTracker(project_root)
+
     def _default_collectors(self) -> list[MetricCollector]:
         """Return default set of metric collectors.
 
@@ -275,12 +353,14 @@ class SemanticIndexer:
         self,
         force_reindex: bool = False,
         show_progress: bool = True,
+        skip_relationships: bool = False,
     ) -> int:
         """Index all files in the project.
 
         Args:
             force_reindex: Whether to reindex existing files
             show_progress: Whether to show progress information
+            skip_relationships: Skip computing relationships for visualization (faster, but visualize will be slower)
 
         Returns:
             Number of files indexed
@@ -383,12 +463,134 @@ class SemanticIndexer:
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )
 
+        # Mark relationships for background computation (unless skipped)
+        # Default behavior: skip blocking computation, mark for background processing
+        if not skip_relationships and indexed_count > 0:
+            try:
+                logger.info("Marking relationships for background computation...")
+                # Get all chunks from database for relationship computation
+                all_chunks = await self.database.get_all_chunks()
+
+                if len(all_chunks) > 0:
+                    # Mark for background computation (non-blocking)
+                    await self.relationship_store.compute_and_store(
+                        all_chunks, self.database, background=True
+                    )
+                    logger.info("✓ Relationships marked for background computation")
+                    logger.info(
+                        " Use 'mcp-vector-search index relationships' to compute now or wait for background task"
+                    )
+                else:
+                    logger.warning("No chunks found for relationship computation")
+            except Exception as e:
+                logger.warning(f"Failed to mark relationships: {e}")
+                logger.debug("Visualization will compute relationships on demand")
+
+        # Save trend snapshot after successful indexing
+        if indexed_count > 0:
+            try:
+                logger.info("Saving metrics snapshot for trend tracking...")
+                # Get database stats
+                stats = await self.database.get_stats()
+                # Get all chunks for detailed metrics
+                all_chunks = await self.database.get_all_chunks()
+                # Compute metrics from stats and chunks
+                metrics = self.trend_tracker.compute_metrics_from_stats(
+                    stats.to_dict(), all_chunks
+                )
+                # Save snapshot (updates today's entry if exists)
+                self.trend_tracker.save_snapshot(metrics)
+                logger.info(
+                    f"✓ Saved trend snapshot: {metrics['total_files']} files, "
+                    f"{metrics['total_chunks']} chunks, health score {metrics['health_score']}"
+                )
+            except Exception as e:
+                logger.warning(f"Failed to save trend snapshot: {e}")
+
         return indexed_count
 
+    async def _parse_and_prepare_file(
+        self, file_path: Path, force_reindex: bool = False
+    ) -> tuple[list[CodeChunk], dict[str, Any] | None]:
+        """Parse file and prepare chunks with metrics (no database insertion).
+
+        This method extracts the parsing and metric collection logic from index_file()
+        to enable batch processing across multiple files.
+
+        Args:
+            file_path: Path to the file to parse
+            force_reindex: Whether to force reindexing (always deletes existing chunks)
+
+        Returns:
+            Tuple of (chunks_with_hierarchy, chunk_metrics)
+
+        Raises:
+            ParsingError: If file parsing fails
+        """
+        # Check if file should be indexed
+        if not self._should_index_file(file_path):
+            return ([], None)
+
+        # Always remove existing chunks when reindexing a file
+        # This prevents duplicate chunks and ensures consistency
+        await self.database.delete_by_file(file_path)
+
+        # Parse file into chunks
+        chunks = await self._parse_file(file_path)
+
+        if not chunks:
+            logger.debug(f"No chunks extracted from {file_path}")
+            return ([], None)
+
+        # Build hierarchical relationships between chunks
+        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+        # Debug: Check if hierarchy was built
+        methods_with_parents = sum(
+            1
+            for c in chunks_with_hierarchy
+            if c.chunk_type in ("method", "function") and c.parent_chunk_id
+        )
+        logger.debug(
+            f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
+        )
+
+        # Collect metrics for chunks (if collectors are enabled)
+        chunk_metrics: dict[str, Any] | None = None
+        if self.collectors:
+            try:
+                # Read source code
+                source_code = file_path.read_bytes()
+
+                # Detect language from file extension
+                language = EXTENSION_TO_LANGUAGE.get(
+                    file_path.suffix.lower(), "unknown"
+                )
+
+                # Collect metrics for each chunk
+                chunk_metrics = {}
+                for chunk in chunks_with_hierarchy:
+                    metrics = self._collect_metrics(chunk, source_code, language)
+                    if metrics:
+                        chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+
+                logger.debug(
+                    f"Collected metrics for {len(chunk_metrics)} chunks from {file_path}"
+                )
            except Exception as e:
+                logger.warning(f"Failed to collect metrics for {file_path}: {e}")
+                chunk_metrics = None
+
+        return (chunks_with_hierarchy, chunk_metrics)
+
     async def _process_file_batch(
         self, file_paths: list[Path], force_reindex: bool = False
     ) -> list[bool]:
-        """Process a batch of files
+        """Process a batch of files and accumulate chunks for batch embedding.
+
+        This method processes multiple files in parallel (using multiprocessing for
+        CPU-bound parsing) and then performs a single database insertion for all chunks,
+        enabling efficient batch embedding generation.
 
         Args:
             file_paths: List of file paths to process
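
After indexing, relationships are only marked for background computation (so indexing does not block on them), and a metrics snapshot is saved for trend tracking with at most one entry per day. The snapshot storage is not shown in this diff; as an illustration of the "replace today's entry" behavior the comment describes, a date-keyed JSON store might look like this (a hypothetical layout, not the actual `TrendTracker` format):

```python
import json
from datetime import UTC, datetime
from pathlib import Path


def save_snapshot(history_file: Path, metrics: dict) -> None:
    """Keep one metrics entry per calendar day, replacing today's if it exists."""
    history = {}
    if history_file.exists():
        history = json.loads(history_file.read_text())
    today = datetime.now(UTC).date().isoformat()
    history[today] = metrics  # re-running on the same day overwrites, never duplicates
    history_file.write_text(json.dumps(history, indent=2))


if __name__ == "__main__":
    path = Path("trend_history.json")
    save_snapshot(path, {"total_files": 120, "total_chunks": 950, "health_score": 87})
    print(json.loads(path.read_text()))
```
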
@@ -397,26 +599,166 @@ class SemanticIndexer:
         Returns:
             List of success flags for each file
         """
-
-
+        all_chunks: list[CodeChunk] = []
+        all_metrics: dict[str, Any] = {}
+        file_to_chunks_map: dict[str, tuple[int, int]] = {}
+        success_flags: list[bool] = []
+
+        # Filter files that should be indexed and delete old chunks
+        files_to_parse = []
         for file_path in file_paths:
-
-
+            if not self._should_index_file(file_path):
+                success_flags.append(True)  # Skipped file is not an error
+                continue
+            # Delete old chunks before parsing
+            await self.database.delete_by_file(file_path)
+            files_to_parse.append(file_path)
 
-
-
+        if not files_to_parse:
+            return success_flags
+
+        # Parse files using multiprocessing if enabled
+        if self.use_multiprocessing and len(files_to_parse) > 1:
+            # Use ProcessPoolExecutor for CPU-bound parsing
+            parse_results = await self._parse_files_multiprocess(files_to_parse)
+        else:
+            # Fall back to async processing (for single file or disabled multiprocessing)
+            parse_results = await self._parse_files_async(files_to_parse)
 
-        #
-
-        for
-            if
-                logger.error(f"Failed to
+        # Accumulate chunks from all successfully parsed files
+        metadata = self._load_index_metadata()
+        for file_path, chunks, error in parse_results:
+            if error:
+                logger.error(f"Failed to parse {file_path}: {error}")
                 success_flags.append(False)
+                continue
+
+            if chunks:
+                # Build hierarchy and collect metrics for parsed chunks
+                chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+                # Collect metrics if enabled
+                chunk_metrics = None
+                if self.collectors:
+                    try:
+                        source_code = file_path.read_bytes()
+                        language = EXTENSION_TO_LANGUAGE.get(
+                            file_path.suffix.lower(), "unknown"
+                        )
+                        chunk_metrics = {}
+                        for chunk in chunks_with_hierarchy:
+                            metrics = self._collect_metrics(
+                                chunk, source_code, language
+                            )
+                            if metrics:
+                                chunk_metrics[chunk.chunk_id] = metrics.to_metadata()
+                    except Exception as e:
+                        logger.warning(
+                            f"Failed to collect metrics for {file_path}: {e}"
+                        )
+
+                # Accumulate chunks
+                start_idx = len(all_chunks)
+                all_chunks.extend(chunks_with_hierarchy)
+                end_idx = len(all_chunks)
+                file_to_chunks_map[str(file_path)] = (start_idx, end_idx)
+
+                # Merge metrics
+                if chunk_metrics:
+                    all_metrics.update(chunk_metrics)
+
+                # Update metadata for successfully parsed file
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
             else:
-
+                # Empty file is not an error
+                metadata[str(file_path)] = os.path.getmtime(file_path)
+                success_flags.append(True)
+
+        # Single database insertion for entire batch
+        if all_chunks:
+            logger.info(
+                f"Batch inserting {len(all_chunks)} chunks from {len(file_paths)} files"
+            )
+            try:
+                await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                logger.debug(
+                    f"Successfully indexed {len(all_chunks)} chunks from {sum(success_flags)} files"
+                )
+            except Exception as e:
+                logger.error(f"Failed to insert batch of chunks: {e}")
+                # Mark all files in this batch as failed
+                return [False] * len(file_paths)
+
+        # Save updated metadata after successful batch
+        self._save_index_metadata(metadata)
 
         return success_flags
 
+    async def _parse_files_multiprocess(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using multiprocessing for CPU-bound parallelism.
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        # Prepare arguments for worker processes
+        parse_args = []
+        for file_path in file_paths:
+            # Get subproject info if available
+            subproject = self.monorepo_detector.get_subproject_for_file(file_path)
+            subproject_info_json = None
+            if subproject:
+                subproject_info_json = json.dumps(
+                    {
+                        "name": subproject.name,
+                        "relative_path": subproject.relative_path,
+                    }
+                )
+            parse_args.append((file_path, subproject_info_json))
+
+        # Limit workers to avoid overhead
+        max_workers = min(self.max_workers, len(file_paths))
+
+        # Run parsing in ProcessPoolExecutor
+        loop = asyncio.get_running_loop()
+        with ProcessPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks and wait for results
+            results = await loop.run_in_executor(
+                None, lambda: list(executor.map(_parse_file_standalone, parse_args))
+            )
+
+        logger.debug(
+            f"Multiprocess parsing completed: {len(results)} files parsed with {max_workers} workers"
+        )
+        return results
+
+    async def _parse_files_async(
+        self, file_paths: list[Path]
+    ) -> list[tuple[Path, list[CodeChunk], Exception | None]]:
+        """Parse multiple files using async (fallback for single file or disabled multiprocessing).
+
+        Args:
+            file_paths: List of file paths to parse
+
+        Returns:
+            List of tuples (file_path, chunks, error) for each file
+        """
+        results = []
+        for file_path in file_paths:
+            try:
+                chunks = await self._parse_file(file_path)
+                results.append((file_path, chunks, None))
+            except Exception as e:
+                logger.error(f"Failed to parse {file_path}: {e}")
+                results.append((file_path, [], e))
+
+        return results
+
     def _load_index_metadata(self) -> dict[str, float]:
         """Load file modification times from metadata file.
 
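
`_parse_files_multiprocess` combines two standard pieces: `ProcessPoolExecutor.map` fans the CPU-bound parsing out across processes, and `loop.run_in_executor(None, ...)` pushes that blocking `map` call onto a thread so the event loop stays responsive. A self-contained sketch of the same combination with a toy worker:

```python
import asyncio
from concurrent.futures import ProcessPoolExecutor


def cpu_bound(n: int) -> int:
    # Stand-in for per-file parsing work.
    return sum(i * i for i in range(n))


async def run_batch(items: list[int], max_workers: int) -> list[int]:
    loop = asyncio.get_running_loop()
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # executor.map blocks, so hand it to the default thread pool;
        # the event loop stays free for other tasks meanwhile.
        return await loop.run_in_executor(
            None, lambda: list(executor.map(cpu_bound, items))
        )


if __name__ == "__main__":
    print(asyncio.run(run_batch([10_000, 20_000, 30_000], max_workers=2)))
```
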
@@ -759,8 +1101,10 @@ class SemanticIndexer:
         # Get relative path from project root for checking
         relative_path = file_path.relative_to(self.project_root)
 
-        # 1. Check dotfile filtering (
-
+        # 1. Check dotfile filtering (ENABLED BY DEFAULT)
+        # Skip dotfiles unless config explicitly disables it
+        skip_dotfiles = self.config.skip_dotfiles if self.config else True
+        if skip_dotfiles:
             for part in relative_path.parts:
                 # Skip dotfiles unless they're in the whitelist
                 if part.startswith(".") and part not in ALLOWED_DOTFILES:
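
The dotfile rule now defaults to on: unless the project config sets `skip_dotfiles` to false, any path component that starts with a dot and is not whitelisted causes the file to be skipped. A standalone sketch of the check (the whitelist below is hypothetical; the package's own list is `ALLOWED_DOTFILES` from `config/defaults.py`):

```python
from pathlib import Path

# Hypothetical whitelist for illustration; the package defines its own ALLOWED_DOTFILES.
ALLOWED_DOTFILES = {".github", ".vscode"}


def passes_dotfile_filter(file_path: Path, project_root: Path, skip_dotfiles: bool = True) -> bool:
    """Reject the file if any relative path component is a non-whitelisted dotfile."""
    if not skip_dotfiles:
        return True
    relative_path = file_path.relative_to(project_root)
    for part in relative_path.parts:
        if part.startswith(".") and part not in ALLOWED_DOTFILES:
            return False
    return True


if __name__ == "__main__":
    root = Path("/repo")
    print(passes_dotfile_filter(Path("/repo/src/app.py"), root))         # True
    print(passes_dotfile_filter(Path("/repo/.venv/lib/site.py"), root))  # False
    print(passes_dotfile_filter(Path("/repo/.github/ci.yml"), root))     # True (whitelisted)
```
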
@@ -994,6 +1338,9 @@ class SemanticIndexer:
     ):
         """Index files and yield progress updates for each file.
 
+        This method processes files in batches and accumulates chunks across files
+        before performing a single database insertion per batch for better performance.
+
         Args:
             files_to_index: List of file paths to index
             force_reindex: Whether to force reindexing
@@ -1004,72 +1351,84 @@ class SemanticIndexer:
         # Write version header to error log at start of indexing run
         self._write_indexing_run_header()
 
-
-
-        # Process files in batches for better memory management
+        # Process files in batches for better memory management and embedding efficiency
         for i in range(0, len(files_to_index), self.batch_size):
             batch = files_to_index[i : i + self.batch_size]
 
-            #
+            # Accumulate chunks from all files in batch
+            all_chunks: list[CodeChunk] = []
+            all_metrics: dict[str, Any] = {}
+            file_to_chunks_map: dict[str, tuple[int, int]] = {}
+            file_results: dict[Path, tuple[int, bool]] = {}
+
+            # Parse all files in parallel
+            tasks = []
             for file_path in batch:
-
-
+                task = asyncio.create_task(
+                    self._parse_and_prepare_file(file_path, force_reindex)
+                )
+                tasks.append(task)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                            source_code = file_path.read_bytes()
-
-                            # Detect language from file extension
-                            language = EXTENSION_TO_LANGUAGE.get(
-                                file_path.suffix.lower(), "unknown"
-                            )
-
-                            # Collect metrics for each chunk
-                            chunk_metrics = {}
-                            for chunk in chunks_with_hierarchy:
-                                metrics = self._collect_metrics(
-                                    chunk, source_code, language
-                                )
-                                if metrics:
-                                    chunk_metrics[chunk.chunk_id] = (
-                                        metrics.to_metadata()
-                                    )
-                        except Exception as e:
-                            logger.warning(
-                                f"Failed to collect metrics for {file_path}: {e}"
-                            )
-                            chunk_metrics = None
-
-                        # Add chunks to database with metrics
-                        await self.database.add_chunks(
-                            chunks_with_hierarchy, metrics=chunk_metrics
+            parse_results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Accumulate chunks from successfully parsed files
+            metadata = self._load_index_metadata()
+            for file_path, result in zip(batch, parse_results, strict=True):
+                if isinstance(result, Exception):
+                    error_msg = f"Failed to index file {file_path}: {type(result).__name__}: {str(result)}"
+                    logger.error(error_msg)
+                    file_results[file_path] = (0, False)
+
+                    # Save error to error log file
+                    try:
+                        error_log_path = (
+                            self.project_root
+                            / ".mcp-vector-search"
+                            / "indexing_errors.log"
                         )
-
-
+                        with open(error_log_path, "a", encoding="utf-8") as f:
+                            timestamp = datetime.now().isoformat()
+                            f.write(f"[{timestamp}] {error_msg}\n")
+                    except Exception as log_err:
+                        logger.debug(f"Failed to write error log: {log_err}")
+                    continue
 
-
+                chunks, metrics = result
+                if chunks:
+                    start_idx = len(all_chunks)
+                    all_chunks.extend(chunks)
+                    end_idx = len(all_chunks)
+                    file_to_chunks_map[str(file_path)] = (start_idx, end_idx)
 
-                #
+                    # Merge metrics
+                    if metrics:
+                        all_metrics.update(metrics)
+
+                    # Update metadata for successfully parsed file
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (len(chunks), True)
+                    logger.debug(f"Prepared {len(chunks)} chunks from {file_path}")
+                else:
+                    # Empty file is not an error
                     metadata[str(file_path)] = os.path.getmtime(file_path)
+                    file_results[file_path] = (0, True)
 
+            # Single database insertion for entire batch
+            if all_chunks:
+                logger.info(
+                    f"Batch inserting {len(all_chunks)} chunks from {len(batch)} files"
+                )
+                try:
+                    await self.database.add_chunks(all_chunks, metrics=all_metrics)
+                    logger.debug(
+                        f"Successfully indexed {len(all_chunks)} chunks from batch"
+                    )
                 except Exception as e:
-                    error_msg = f"Failed to
+                    error_msg = f"Failed to insert batch of chunks: {e}"
                     logger.error(error_msg)
-
+                    # Mark all files with chunks in this batch as failed
+                    for file_path in file_to_chunks_map.keys():
+                        file_results[Path(file_path)] = (0, False)
 
                     # Save error to error log file
                     try:
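
In the progress-yielding path, each batch is parsed concurrently and `asyncio.gather(..., return_exceptions=True)` turns per-file failures into exception objects in the result list, so one bad file no longer aborts the batch; results are then recorded per file as `(chunks_added, success)`. A compact sketch of that bookkeeping:

```python
import asyncio
from pathlib import Path


async def parse_one(path: Path) -> list[str]:
    # Stand-in parser: fail loudly on a "bad" file, otherwise return fake chunks.
    if "bad" in path.name:
        raise ValueError(f"cannot parse {path}")
    return ["chunk-a", "chunk-b"]


async def parse_batch(batch: list[Path]) -> dict[Path, tuple[int, bool]]:
    tasks = [asyncio.create_task(parse_one(p)) for p in batch]
    results = await asyncio.gather(*tasks, return_exceptions=True)

    file_results: dict[Path, tuple[int, bool]] = {}
    for path, result in zip(batch, results, strict=True):
        if isinstance(result, Exception):
            file_results[path] = (0, False)       # failed file, zero chunks
        else:
            file_results[path] = (len(result), True)
    return file_results


if __name__ == "__main__":
    batch = [Path("good.py"), Path("bad.py")]
    print(asyncio.run(parse_batch(batch)))
```
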
@@ -1079,18 +1438,18 @@ class SemanticIndexer:
                             / "indexing_errors.log"
                         )
                         with open(error_log_path, "a", encoding="utf-8") as f:
-                            from datetime import datetime
-
                             timestamp = datetime.now().isoformat()
                             f.write(f"[{timestamp}] {error_msg}\n")
                     except Exception as log_err:
                         logger.debug(f"Failed to write error log: {log_err}")
 
-
-
+            # Save metadata after batch
+            self._save_index_metadata(metadata)
 
-
-
+            # Yield progress updates for each file in batch
+            for file_path in batch:
+                chunks_added, success = file_results.get(file_path, (0, False))
+                yield (file_path, chunks_added, success)
 
     def _build_chunk_hierarchy(self, chunks: list[CodeChunk]) -> list[CodeChunk]:
         """Build parent-child relationships between chunks.
@@ -1112,7 +1471,9 @@ class SemanticIndexer:
             return chunks
 
         # Group chunks by type and name
-
+        # Only actual module chunks (not imports) serve as parents for top-level code
+        # imports chunks should remain siblings of classes/functions, not parents
+        module_chunks = [c for c in chunks if c.chunk_type == "module"]
         class_chunks = [
            c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
         ]
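
The hierarchy change restricts which chunks may act as parents: only chunks whose `chunk_type` is `"module"` adopt top-level classes and functions, while `imports` chunks stay siblings. The sketch below shows the general idea with a simplified chunk type; it is illustrative only and does not mirror the package's `CodeChunk` model or the full `_build_chunk_hierarchy` algorithm:

```python
from dataclasses import dataclass


@dataclass
class Chunk:
    chunk_id: str
    chunk_type: str  # "module", "imports", "class", "function", ...
    parent_chunk_id: str | None = None


def attach_top_level(chunks: list[Chunk]) -> list[Chunk]:
    # Only a real module chunk may parent top-level code; imports chunks never do.
    module_chunks = [c for c in chunks if c.chunk_type == "module"]
    parent_id = module_chunks[0].chunk_id if module_chunks else None
    for chunk in chunks:
        if chunk.chunk_type in ("class", "function") and chunk.parent_chunk_id is None:
            chunk.parent_chunk_id = parent_id
    return chunks


if __name__ == "__main__":
    chunks = [
        Chunk("m1", "module"),
        Chunk("i1", "imports"),
        Chunk("c1", "class"),
        Chunk("f1", "function"),
    ]
    for c in attach_top_level(chunks):
        print(c.chunk_id, c.chunk_type, "->", c.parent_chunk_id)
```
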
|