mcp-vector-search 0.7.5__py3-none-any.whl → 0.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/demo.py +2 -4
- mcp_vector_search/cli/commands/index.py +130 -30
- mcp_vector_search/cli/commands/mcp.py +83 -56
- mcp_vector_search/cli/commands/status.py +23 -9
- mcp_vector_search/cli/commands/visualize.py +523 -0
- mcp_vector_search/cli/main.py +16 -13
- mcp_vector_search/core/database.py +117 -54
- mcp_vector_search/core/indexer.py +262 -16
- mcp_vector_search/core/models.py +45 -1
- mcp_vector_search/core/project.py +6 -3
- mcp_vector_search/parsers/base.py +83 -0
- mcp_vector_search/parsers/javascript.py +350 -2
- mcp_vector_search/parsers/python.py +79 -0
- mcp_vector_search/utils/gitignore.py +31 -23
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/METADATA +1 -1
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/RECORD +20 -19
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.8.0.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py
CHANGED

@@ -1,5 +1,6 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""

+import asyncio
 import shutil
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -369,38 +370,67 @@ class ChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e

     async def get_stats(self) -> IndexStats:
-        """Get database statistics."""
+        """Get database statistics with optimized chunked queries."""
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")

         try:
-            # Get total count
+            # Get total count (fast operation)
             count = self._collection.count()

-
-
-
+            if count == 0:
+                return IndexStats(
+                    total_files=0,
+                    total_chunks=0,
+                    languages={},
+                    file_types={},
+                    index_size_mb=0.0,
+                    last_updated="N/A",
+                    embedding_model="unknown",
+                )

-            #
-
+            # Process in chunks to avoid loading everything at once
+            batch_size_limit = 1000

-
-            language_counts = {}
-            file_type_counts = {}
+            files = set()
+            language_counts: dict[str, int] = {}
+            file_type_counts: dict[str, int] = {}

-
-
-
-
+            offset = 0
+            while offset < count:
+                # Fetch batch
+                batch_size = min(batch_size_limit, count - offset)
+                logger.debug(
+                    f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                    f"{offset}-{offset + batch_size} of {count} chunks"
+                )

-
-
-
-
-
+                results = self._collection.get(
+                    include=["metadatas"],
+                    limit=batch_size,
+                    offset=offset,
+                )

-
-
+                # Process batch metadata
+                for metadata in results.get("metadatas", []):
+                    # Language stats
+                    lang = metadata.get("language", "unknown")
+                    language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                    # File stats
+                    file_path = metadata.get("file_path", "")
+                    if file_path:
+                        files.add(file_path)
+                        ext = Path(file_path).suffix or "no_extension"
+                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                offset += batch_size
+
+                # Yield to event loop periodically to prevent blocking
+                await asyncio.sleep(0)
+
+            # Estimate index size (rough approximation: ~1KB per chunk)
+            index_size_mb = count * 0.001

             return IndexStats(
                 total_files=len(files),
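The rewritten get_stats above pages through collection metadata with limit/offset and yields to the event loop between batches rather than loading every record in one call. Below is a minimal standalone sketch of the same pattern against a bare chromadb collection; the collection name and batch size are illustrative, not taken from the package.

# Sketch: paginate chromadb metadata in fixed-size batches instead of one get().
import asyncio

import chromadb


async def count_languages(collection, batch_size: int = 1000) -> dict[str, int]:
    """Tally the "language" metadata field without loading all chunks at once."""
    counts: dict[str, int] = {}
    total = collection.count()
    offset = 0
    while offset < total:
        step = min(batch_size, total - offset)
        batch = collection.get(
            include=["metadatas"],
            limit=step,
            offset=offset,
        )
        for metadata in batch.get("metadatas", []):
            lang = metadata.get("language", "unknown")
            counts[lang] = counts.get(lang, 0) + 1
        offset += step
        await asyncio.sleep(0)  # yield to the event loop between batches
    return counts


if __name__ == "__main__":
    client = chromadb.Client()  # in-memory client for the sketch
    collection = client.get_or_create_collection("code_chunks")  # illustrative name
    print(asyncio.run(count_languages(collection)))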
@@ -408,12 +438,13 @@ class ChromaVectorDatabase(VectorDatabase):
                 languages=language_counts,
                 file_types=file_type_counts,
                 index_size_mb=index_size_mb,
-                last_updated="unknown",
-                embedding_model="unknown",
+                last_updated="unknown",
+                embedding_model="unknown",
             )

         except Exception as e:
-            logger.error(f"Failed to get
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
             return IndexStats(
                 total_files=0,
                 total_chunks=0,
@@ -768,56 +799,88 @@ class PooledChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e

     async def get_stats(self) -> IndexStats:
-        """Get database statistics
+        """Get database statistics with connection pooling and chunked queries."""
         try:
             async with self._pool.get_connection() as conn:
-                # Get total count
+                # Get total count (fast operation)
                 count = conn.collection.count()

-
-
+                if count == 0:
+                    return IndexStats(
+                        total_files=0,
+                        total_chunks=0,
+                        languages={},
+                        file_types={},
+                        index_size_mb=0.0,
+                        last_updated="N/A",
+                        embedding_model="unknown",
+                    )
+
+                # Process in chunks to avoid loading everything at once
+                batch_size_limit = 1000

-                # Analyze languages and files
-                languages = set()
                 files = set()
+                language_counts: dict[str, int] = {}
+                file_type_counts: dict[str, int] = {}
+
+                offset = 0
+                while offset < count:
+                    # Fetch batch
+                    batch_size = min(batch_size_limit, count - offset)
+                    logger.debug(
+                        f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                        f"{offset}-{offset + batch_size} of {count} chunks"
+                    )

-
-
-
-
-
+                    results = conn.collection.get(
+                        include=["metadatas"],
+                        limit=batch_size,
+                        offset=offset,
+                    )

-
-
-
+                    # Process batch metadata
+                    for metadata in results.get("metadatas", []):
+                        # Language stats
+                        lang = metadata.get("language", "unknown")
+                        language_counts[lang] = language_counts.get(lang, 0) + 1

-
-
-
-
+                        # File stats
+                        file_path = metadata.get("file_path", "")
+                        if file_path:
+                            files.add(file_path)
+                            ext = Path(file_path).suffix or "no_extension"
+                            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1

-
-
-
-
-                file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+                    offset += batch_size
+
+                    # Yield to event loop periodically to prevent blocking
+                    await asyncio.sleep(0)

-                # Estimate index size (rough approximation)
-                index_size_mb = count * 0.001
+                # Estimate index size (rough approximation: ~1KB per chunk)
+                index_size_mb = count * 0.001

                 return IndexStats(
-                    total_chunks=count,
                     total_files=len(files),
+                    total_chunks=count,
                     languages=language_counts,
                     file_types=file_type_counts,
                     index_size_mb=index_size_mb,
-                    last_updated="unknown",
-                    embedding_model="unknown",
+                    last_updated="unknown",
+                    embedding_model="unknown",
                 )

         except Exception as e:
-            logger.error(f"Failed to get database
-
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
+            return IndexStats(
+                total_files=0,
+                total_chunks=0,
+                languages={},
+                file_types={},
+                index_size_mb=0.0,
+                last_updated="error",
+                embedding_model="unknown",
+            )

     async def remove_file_chunks(self, file_path: str) -> int:
         """Remove all chunks for a specific file using pooled connection."""
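Both the plain and pooled implementations now fall back to empty IndexStats on query failures instead of raising. A hypothetical consumer is sketched below, assuming `db` is any already-initialized VectorDatabase instance from this package.

# Sketch: read the stats produced by the new chunked get_stats.
import asyncio


async def print_index_summary(db) -> None:
    stats = await db.get_stats()  # returns empty stats instead of raising on query errors
    print(f"files indexed : {stats.total_files}")
    print(f"chunks stored : {stats.total_chunks}")
    print(f"approx. size  : {stats.index_size_mb:.1f} MB")
    for language, chunk_count in sorted(stats.languages.items()):
        print(f"  {language}: {chunk_count} chunks")


# asyncio.run(print_index_summary(db))  # run with a concrete db instance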
mcp_vector_search/core/indexer.py
CHANGED

@@ -57,6 +57,11 @@ class SemanticIndexer:
             project_root / ".mcp-vector-search" / "index_metadata.json"
         )

+        # Add cache for indexable files to avoid repeated filesystem scans
+        self._indexable_files_cache: list[Path] | None = None
+        self._cache_timestamp: float = 0
+        self._cache_ttl: float = 60.0  # 60 second TTL
+
         # Initialize gitignore parser
         try:
             self.gitignore_parser = create_gitignore_parser(project_root)
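The constructor now keeps the indexable-file list in a 60-second TTL cache so repeated status or index calls skip redundant filesystem scans. A generic sketch of the same TTL pattern in isolation follows; the class name and scan callable are illustrative, not part of the package.

# Sketch: TTL-cached file list, mirroring the 60-second default added above.
import time
from pathlib import Path


class FileListCache:
    def __init__(self, ttl: float = 60.0) -> None:
        self._files: list[Path] | None = None
        self._timestamp: float = 0.0
        self._ttl = ttl

    def get(self, scan) -> list[Path]:
        """Return cached files, rescanning only when the TTL has expired."""
        now = time.time()
        if self._files is not None and now - self._timestamp < self._ttl:
            return self._files
        self._files = sorted(scan())
        self._timestamp = now
        return self._files


# cache = FileListCache()
# files = cache.get(lambda: Path(".").rglob("*.py"))  # illustrative scan callable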
@@ -290,8 +295,11 @@ class SemanticIndexer:
             logger.debug(f"No chunks extracted from {file_path}")
             return True  # Not an error, just empty file

+        # Build hierarchical relationships between chunks
+        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
         # Add chunks to database
-        await self.database.add_chunks(
+        await self.database.add_chunks(chunks_with_hierarchy)

         # Update metadata after successful indexing
         metadata = self._load_index_metadata()
@@ -334,38 +342,120 @@ class SemanticIndexer:
             return 0

     def _find_indexable_files(self) -> list[Path]:
-        """Find all files that should be indexed.
+        """Find all files that should be indexed with caching.

         Returns:
             List of file paths to index
         """
+        import time
+
+        # Check cache
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Rebuild cache using efficient directory filtering
+        logger.debug("Rebuilding indexable files cache...")
+        indexable_files = self._scan_files_sync()
+
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Rebuilt indexable files cache ({len(indexable_files)} files)")
+
+        return self._indexable_files_cache
+
+    def _scan_files_sync(self) -> list[Path]:
+        """Synchronous file scanning (runs in thread pool).
+
+        Uses os.walk with directory filtering to avoid traversing ignored directories.
+
+        Returns:
+            List of indexable file paths
+        """
         indexable_files = []

-        for
-
-
+        # Use os.walk for efficient directory traversal with early filtering
+        for root, dirs, files in os.walk(self.project_root):
+            root_path = Path(root)
+
+            # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
+            # This is much more efficient than checking every file in ignored directories
+            # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
+            dirs[:] = [d for d in dirs if not self._should_ignore_path(root_path / d, is_directory=True)]
+
+            # Check each file in the current directory
+            # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
+            for filename in files:
+                file_path = root_path / filename
+                if self._should_index_file(file_path, skip_file_check=True):
+                    indexable_files.append(file_path)

-        return
+        return indexable_files
+
+    async def _find_indexable_files_async(self) -> list[Path]:
+        """Find all files asynchronously without blocking event loop.
+
+        Returns:
+            List of file paths to index
+        """
+        import time
+        from concurrent.futures import ThreadPoolExecutor
+
+        # Check cache first
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Run filesystem scan in thread pool to avoid blocking
+        logger.debug("Scanning files in background thread...")
+        loop = asyncio.get_running_loop()
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            indexable_files = await loop.run_in_executor(
+                executor, self._scan_files_sync
+            )

-
+        # Update cache
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Found {len(indexable_files)} indexable files")
+
+        return self._indexable_files_cache
+
+    def _should_index_file(self, file_path: Path, skip_file_check: bool = False) -> bool:
         """Check if a file should be indexed.

         Args:
             file_path: Path to check
+            skip_file_check: Skip is_file() check if caller knows it's a file (optimization)

         Returns:
             True if file should be indexed
         """
-        #
-
+        # PERFORMANCE: Check file extension FIRST (cheapest operation, no I/O)
+        # This eliminates most files without any filesystem calls
+        if file_path.suffix.lower() not in self.file_extensions:
             return False

-        #
-
+        # PERFORMANCE: Only check is_file() if not coming from os.walk
+        # os.walk already guarantees files, so we skip this expensive check
+        if not skip_file_check and not file_path.is_file():
             return False

         # Check if path should be ignored
-
+        # PERFORMANCE: Pass is_directory=False to skip stat() call (we know it's a file)
+        if self._should_ignore_path(file_path, is_directory=False):
             return False

         # Check file size (skip very large files)
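The new _scan_files_sync relies on os.walk pruning: assigning to dirs[:] stops the walk from ever descending into ignored directories. A standalone sketch of that pruning follows, with an illustrative ignore set and extension filter rather than the package's real rules.

# Sketch: in-place dirs[:] pruning with os.walk (top-down walk honors the mutation).
import os
from pathlib import Path

IGNORED_DIRS = {".git", "node_modules", ".mcp-vector-search"}  # illustrative only
EXTENSIONS = {".py", ".js", ".ts"}                              # illustrative only


def scan(project_root: Path) -> list[Path]:
    found: list[Path] = []
    for root, dirs, files in os.walk(project_root):
        # Mutating `dirs` in place prevents os.walk from descending into pruned dirs.
        dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]
        for name in files:
            path = Path(root) / name
            if path.suffix.lower() in EXTENSIONS:
                found.append(path)
    return found


# print(len(scan(Path("."))))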
@@ -379,18 +469,20 @@ class SemanticIndexer:

         return True

-    def _should_ignore_path(self, file_path: Path) -> bool:
+    def _should_ignore_path(self, file_path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.

         Args:
             file_path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)

         Returns:
             True if path should be ignored
         """
         try:
             # First check gitignore rules if available
-
+            # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path, is_directory=is_directory):
                 logger.debug(f"Path ignored by .gitignore: {file_path}")
                 return True

@@ -532,8 +624,8 @@ class SemanticIndexer:
         # Get database stats
         db_stats = await self.database.get_stats()

-        # Count indexable files
-        indexable_files = self.
+        # Count indexable files asynchronously without blocking
+        indexable_files = await self._find_indexable_files_async()

         return {
             "total_indexable_files": len(indexable_files),
@@ -553,3 +645,157 @@ class SemanticIndexer:
                 "indexed_files": 0,
                 "total_chunks": 0,
             }
+
+    async def get_files_to_index(
+        self, force_reindex: bool = False
+    ) -> tuple[list[Path], list[Path]]:
+        """Get all indexable files and those that need indexing.
+
+        Args:
+            force_reindex: Whether to force reindex of all files
+
+        Returns:
+            Tuple of (all_indexable_files, files_to_index)
+        """
+        # Find all indexable files
+        all_files = await self._find_indexable_files_async()
+
+        if not all_files:
+            return [], []
+
+        # Load existing metadata for incremental indexing
+        metadata = self._load_index_metadata()
+
+        # Filter files that need indexing
+        if force_reindex:
+            files_to_index = all_files
+            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
+        else:
+            files_to_index = [
+                f for f in all_files if self._needs_reindexing(f, metadata)
+            ]
+            logger.info(
+                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
+            )
+
+        return all_files, files_to_index
+
+    async def index_files_with_progress(
+        self,
+        files_to_index: list[Path],
+        force_reindex: bool = False,
+    ):
+        """Index files and yield progress updates for each file.
+
+        Args:
+            files_to_index: List of file paths to index
+            force_reindex: Whether to force reindexing
+
+        Yields:
+            Tuple of (file_path, chunks_added, success) for each processed file
+        """
+        metadata = self._load_index_metadata()
+
+        # Process files in batches for better memory management
+        for i in range(0, len(files_to_index), self.batch_size):
+            batch = files_to_index[i : i + self.batch_size]
+
+            # Process each file in the batch
+            for file_path in batch:
+                chunks_added = 0
+                success = False
+
+                try:
+                    # Always remove existing chunks when reindexing
+                    await self.database.delete_by_file(file_path)
+
+                    # Parse file into chunks
+                    chunks = await self._parse_file(file_path)
+
+                    if chunks:
+                        # Build hierarchical relationships
+                        chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
+
+                        # Add chunks to database
+                        await self.database.add_chunks(chunks_with_hierarchy)
+                        chunks_added = len(chunks)
+                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+
+                    success = True
+
+                    # Update metadata after successful indexing
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+
+                except Exception as e:
+                    logger.error(f"Failed to index file {file_path}: {e}")
+                    success = False
+
+                # Yield progress update
+                yield (file_path, chunks_added, success)
+
+        # Save metadata at the end
+        self._save_index_metadata(metadata)
+
+    def _build_chunk_hierarchy(self, chunks: list[CodeChunk]) -> list[CodeChunk]:
+        """Build parent-child relationships between chunks.
+
+        Logic:
+        - Module chunks (chunk_type="module") have depth 0
+        - Class chunks have depth 1, parent is module
+        - Method chunks have depth 2, parent is class
+        - Function chunks outside classes have depth 1, parent is module
+        - Nested classes increment depth
+
+        Args:
+            chunks: List of code chunks to process
+
+        Returns:
+            List of chunks with hierarchy relationships established
+        """
+        if not chunks:
+            return chunks
+
+        # Group chunks by type and name
+        module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
+        class_chunks = [c for c in chunks if c.chunk_type in ("class", "interface", "mixin")]
+        function_chunks = [c for c in chunks if c.chunk_type in ("function", "method", "constructor")]
+
+        # Build relationships
+        for func in function_chunks:
+            if func.class_name:
+                # Find parent class
+                parent_class = next(
+                    (c for c in class_chunks if c.class_name == func.class_name),
+                    None
+                )
+                if parent_class:
+                    func.parent_chunk_id = parent_class.chunk_id
+                    func.chunk_depth = parent_class.chunk_depth + 1
+                    if func.chunk_id not in parent_class.child_chunk_ids:
+                        parent_class.child_chunk_ids.append(func.chunk_id)
+            else:
+                # Top-level function
+                if not func.chunk_depth:
+                    func.chunk_depth = 1
+                # Link to module if exists
+                if module_chunks and not func.parent_chunk_id:
+                    func.parent_chunk_id = module_chunks[0].chunk_id
+                    if func.chunk_id not in module_chunks[0].child_chunk_ids:
+                        module_chunks[0].child_chunk_ids.append(func.chunk_id)
+
+        for cls in class_chunks:
+            # Classes without parent are top-level (depth 1)
+            if not cls.chunk_depth:
+                cls.chunk_depth = 1
+            # Link to module if exists
+            if module_chunks and not cls.parent_chunk_id:
+                cls.parent_chunk_id = module_chunks[0].chunk_id
+                if cls.chunk_id not in module_chunks[0].child_chunk_ids:
+                    module_chunks[0].child_chunk_ids.append(cls.chunk_id)
+
+        # Module chunks stay at depth 0
+        for mod in module_chunks:
+            if not mod.chunk_depth:
+                mod.chunk_depth = 0
+
+        return chunks
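The new get_files_to_index / index_files_with_progress pair splits planning from execution, with the latter acting as an async generator that reports per-file progress. A hypothetical driver is sketched below, assuming `indexer` is an already-constructed SemanticIndexer.

# Sketch: consume the incremental-indexing API added in this hunk.
import asyncio


async def reindex_with_progress(indexer, force: bool = False) -> None:
    all_files, files_to_index = await indexer.get_files_to_index(force_reindex=force)
    print(f"{len(files_to_index)} of {len(all_files)} files need indexing")

    done = failed = chunk_total = 0
    async for file_path, chunks_added, success in indexer.index_files_with_progress(
        files_to_index, force_reindex=force
    ):
        if success:
            done += 1
            chunk_total += chunks_added
        else:
            failed += 1
        print(f"[{done + failed}/{len(files_to_index)}] {file_path} ({chunks_added} chunks)")

    print(f"indexed {done} files ({chunk_total} chunks), {failed} failures")


# asyncio.run(reindex_with_progress(indexer))  # run with a concrete indexer instance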
mcp_vector_search/core/models.py
CHANGED

@@ -21,12 +21,40 @@ class CodeChunk:
     class_name: str | None = None
     docstring: str | None = None
     imports: list[str] = None
+
+    # Enhancement 1: Complexity scoring
     complexity_score: float = 0.0

+    # Enhancement 3: Hierarchical relationships
+    chunk_id: str | None = None
+    parent_chunk_id: str | None = None
+    child_chunk_ids: list[str] = None
+    chunk_depth: int = 0
+
+    # Enhancement 4: Enhanced metadata
+    decorators: list[str] = None
+    parameters: list[dict] = None
+    return_type: str | None = None
+    type_annotations: dict[str, str] = None
+
     def __post_init__(self) -> None:
-        """Initialize default values."""
+        """Initialize default values and generate chunk ID."""
         if self.imports is None:
             self.imports = []
+        if self.child_chunk_ids is None:
+            self.child_chunk_ids = []
+        if self.decorators is None:
+            self.decorators = []
+        if self.parameters is None:
+            self.parameters = []
+        if self.type_annotations is None:
+            self.type_annotations = {}
+
+        # Generate chunk ID if not provided
+        if self.chunk_id is None:
+            import hashlib
+            id_string = f"{self.file_path}:{self.chunk_type}:{self.start_line}:{self.end_line}"
+            self.chunk_id = hashlib.sha256(id_string.encode()).hexdigest()[:16]

     @property
     def id(self) -> str:
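__post_init__ now derives a stable chunk ID from the file path, chunk type, and line span: the first 16 hex characters of a SHA-256 over "file_path:chunk_type:start_line:end_line". The same derivation outside the dataclass, with illustrative input values:

# Sketch: recompute a chunk ID the way __post_init__ does.
import hashlib


def derive_chunk_id(file_path: str, chunk_type: str, start_line: int, end_line: int) -> str:
    id_string = f"{file_path}:{chunk_type}:{start_line}:{end_line}"
    return hashlib.sha256(id_string.encode()).hexdigest()[:16]


print(derive_chunk_id("src/app.py", "function", 10, 42))
# Same inputs always yield the same ID, so an unchanged chunk keeps a stable identity
# across re-indexing runs.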
@@ -52,6 +80,14 @@ class CodeChunk:
             "docstring": self.docstring,
             "imports": self.imports,
             "complexity_score": self.complexity_score,
+            "chunk_id": self.chunk_id,
+            "parent_chunk_id": self.parent_chunk_id,
+            "child_chunk_ids": self.child_chunk_ids,
+            "chunk_depth": self.chunk_depth,
+            "decorators": self.decorators,
+            "parameters": self.parameters,
+            "return_type": self.return_type,
+            "type_annotations": self.type_annotations,
         }

     @classmethod
@@ -69,6 +105,14 @@ class CodeChunk:
             docstring=data.get("docstring"),
             imports=data.get("imports", []),
             complexity_score=data.get("complexity_score", 0.0),
+            chunk_id=data.get("chunk_id"),
+            parent_chunk_id=data.get("parent_chunk_id"),
+            child_chunk_ids=data.get("child_chunk_ids", []),
+            chunk_depth=data.get("chunk_depth", 0),
+            decorators=data.get("decorators", []),
+            parameters=data.get("parameters", []),
+            return_type=data.get("return_type"),
+            type_annotations=data.get("type_annotations", {}),
         )

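from_dict reads every new field with a .get(...) default, so chunk records serialized before 0.8.0 (without hierarchy or metadata keys) still load. A generic sketch of that pattern on a toy dataclass, not the real CodeChunk:

# Sketch: .get(...) defaults keep deserialization backward compatible.
from dataclasses import dataclass, field


@dataclass
class Record:
    name: str
    chunk_depth: int = 0
    child_chunk_ids: list[str] = field(default_factory=list)

    def to_dict(self) -> dict:
        return {
            "name": self.name,
            "chunk_depth": self.chunk_depth,
            "child_chunk_ids": self.child_chunk_ids,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "Record":
        return cls(
            name=data["name"],
            chunk_depth=data.get("chunk_depth", 0),           # absent in older payloads
            child_chunk_ids=data.get("child_chunk_ids", []),  # absent in older payloads
        )


old_payload = {"name": "parse_file"}  # shape a pre-0.8 record might have
print(Record.from_dict(old_payload))  # defaults fill the missing fields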