mcp-vector-search 0.7.5__py3-none-any.whl → 0.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/demo.py +2 -4
- mcp_vector_search/cli/commands/index.py +130 -30
- mcp_vector_search/cli/commands/mcp.py +83 -56
- mcp_vector_search/cli/commands/status.py +23 -9
- mcp_vector_search/cli/main.py +0 -2
- mcp_vector_search/core/database.py +117 -54
- mcp_vector_search/core/indexer.py +191 -15
- mcp_vector_search/core/project.py +6 -3
- mcp_vector_search/utils/gitignore.py +31 -23
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/METADATA +1 -1
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/RECORD +15 -15
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py

@@ -1,5 +1,6 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""
 
+import asyncio
 import shutil
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -369,38 +370,67 @@ class ChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e
 
     async def get_stats(self) -> IndexStats:
-        """Get database statistics."""
+        """Get database statistics with optimized chunked queries."""
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")
 
         try:
-            # Get total count
+            # Get total count (fast operation)
             count = self._collection.count()
 
-
-
-
+            if count == 0:
+                return IndexStats(
+                    total_files=0,
+                    total_chunks=0,
+                    languages={},
+                    file_types={},
+                    index_size_mb=0.0,
+                    last_updated="N/A",
+                    embedding_model="unknown",
+                )
 
-            #
-
+            # Process in chunks to avoid loading everything at once
+            batch_size_limit = 1000
 
-
-            language_counts = {}
-            file_type_counts = {}
+            files = set()
+            language_counts: dict[str, int] = {}
+            file_type_counts: dict[str, int] = {}
 
-
-
-
-
+            offset = 0
+            while offset < count:
+                # Fetch batch
+                batch_size = min(batch_size_limit, count - offset)
+                logger.debug(
+                    f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                    f"{offset}-{offset + batch_size} of {count} chunks"
+                )
 
-
-
-
-
-
+                results = self._collection.get(
+                    include=["metadatas"],
+                    limit=batch_size,
+                    offset=offset,
+                )
 
-
-
+                # Process batch metadata
+                for metadata in results.get("metadatas", []):
+                    # Language stats
+                    lang = metadata.get("language", "unknown")
+                    language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                    # File stats
+                    file_path = metadata.get("file_path", "")
+                    if file_path:
+                        files.add(file_path)
+                        ext = Path(file_path).suffix or "no_extension"
+                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                offset += batch_size
+
+                # Yield to event loop periodically to prevent blocking
+                await asyncio.sleep(0)
+
+            # Estimate index size (rough approximation: ~1KB per chunk)
+            index_size_mb = count * 0.001
 
             return IndexStats(
                 total_files=len(files),
@@ -408,12 +438,13 @@ class ChromaVectorDatabase(VectorDatabase):
                 languages=language_counts,
                 file_types=file_type_counts,
                 index_size_mb=index_size_mb,
-                last_updated="unknown",
-                embedding_model="unknown",
+                last_updated="unknown",
+                embedding_model="unknown",
             )
 
         except Exception as e:
-            logger.error(f"Failed to get
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
             return IndexStats(
                 total_files=0,
                 total_chunks=0,
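The rewritten `get_stats()` replaces a single bulk metadata fetch with fixed-size batches and yields to the event loop between batches. Below is a minimal, self-contained sketch of that pattern, assuming a ChromaDB-style collection that exposes `count()` and `get(include=..., limit=..., offset=...)`; the `batched_stats` helper and its return shape are illustrative, not part of the package.

```python
import asyncio


async def batched_stats(collection, batch_size_limit: int = 1000) -> dict:
    """Aggregate per-chunk metadata in fixed-size batches (illustrative sketch)."""
    count = collection.count()                   # cheap: total number of chunks
    files: set[str] = set()
    language_counts: dict[str, int] = {}

    offset = 0
    while offset < count:
        batch_size = min(batch_size_limit, count - offset)
        results = collection.get(
            include=["metadatas"],               # metadata only, no embeddings or documents
            limit=batch_size,
            offset=offset,
        )
        for metadata in results.get("metadatas", []):
            lang = metadata.get("language", "unknown")
            language_counts[lang] = language_counts.get(lang, 0) + 1
            if file_path := metadata.get("file_path", ""):
                files.add(file_path)

        offset += batch_size
        await asyncio.sleep(0)                   # yield so other coroutines can run

    return {
        "total_chunks": count,
        "total_files": len(files),
        "languages": language_counts,
    }
```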
@@ -768,56 +799,88 @@ class PooledChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e
 
     async def get_stats(self) -> IndexStats:
-        """Get database statistics
+        """Get database statistics with connection pooling and chunked queries."""
         try:
             async with self._pool.get_connection() as conn:
-                # Get total count
+                # Get total count (fast operation)
                 count = conn.collection.count()
 
-
-
+                if count == 0:
+                    return IndexStats(
+                        total_files=0,
+                        total_chunks=0,
+                        languages={},
+                        file_types={},
+                        index_size_mb=0.0,
+                        last_updated="N/A",
+                        embedding_model="unknown",
+                    )
+
+                # Process in chunks to avoid loading everything at once
+                batch_size_limit = 1000
 
-                # Analyze languages and files
-                languages = set()
                 files = set()
+                language_counts: dict[str, int] = {}
+                file_type_counts: dict[str, int] = {}
+
+                offset = 0
+                while offset < count:
+                    # Fetch batch
+                    batch_size = min(batch_size_limit, count - offset)
+                    logger.debug(
+                        f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                        f"{offset}-{offset + batch_size} of {count} chunks"
+                    )
 
-
-
-
-
-
+                    results = conn.collection.get(
+                        include=["metadatas"],
+                        limit=batch_size,
+                        offset=offset,
+                    )
 
-
-
-
+                    # Process batch metadata
+                    for metadata in results.get("metadatas", []):
+                        # Language stats
+                        lang = metadata.get("language", "unknown")
+                        language_counts[lang] = language_counts.get(lang, 0) + 1
 
-
-
-
-
+                        # File stats
+                        file_path = metadata.get("file_path", "")
+                        if file_path:
+                            files.add(file_path)
+                            ext = Path(file_path).suffix or "no_extension"
+                            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
 
-
-
-
-
-                    file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+                    offset += batch_size
+
+                    # Yield to event loop periodically to prevent blocking
+                    await asyncio.sleep(0)
 
-                # Estimate index size (rough approximation)
-                index_size_mb = count * 0.001
+                # Estimate index size (rough approximation: ~1KB per chunk)
+                index_size_mb = count * 0.001
 
                 return IndexStats(
-                    total_chunks=count,
                     total_files=len(files),
+                    total_chunks=count,
                     languages=language_counts,
                     file_types=file_type_counts,
                     index_size_mb=index_size_mb,
-                    last_updated="unknown",
-                    embedding_model="unknown",
+                    last_updated="unknown",
+                    embedding_model="unknown",
                 )
 
         except Exception as e:
-            logger.error(f"Failed to get database
-
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
+            return IndexStats(
+                total_files=0,
+                total_chunks=0,
+                languages={},
+                file_types={},
+                index_size_mb=0.0,
+                last_updated="error",
+                embedding_model="unknown",
+            )
 
     async def remove_file_chunks(self, file_path: str) -> int:
         """Remove all chunks for a specific file using pooled connection."""
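The pooled variant acquires a connection with `async with self._pool.get_connection() as conn:` and runs the same batched loop against `conn.collection`. A hedged sketch of such a pool, built on `asyncio.Queue` and `asynccontextmanager`; the actual pool inside `PooledChromaVectorDatabase` may be implemented differently, and `make_connection` is a placeholder factory.

```python
import asyncio
from contextlib import asynccontextmanager


class SimpleConnectionPool:
    """Hand out pre-built connections via 'async with pool.get_connection()' (sketch)."""

    def __init__(self, make_connection, size: int = 4) -> None:
        self._queue: asyncio.Queue = asyncio.Queue()
        for _ in range(size):
            self._queue.put_nowait(make_connection())

    @asynccontextmanager
    async def get_connection(self):
        conn = await self._queue.get()       # block until a connection is free
        try:
            yield conn
        finally:
            self._queue.put_nowait(conn)     # always hand it back to the pool


# Usage mirroring the diffed code:
#   async with pool.get_connection() as conn:
#       count = conn.collection.count()
```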
mcp_vector_search/core/indexer.py

@@ -57,6 +57,11 @@ class SemanticIndexer:
             project_root / ".mcp-vector-search" / "index_metadata.json"
         )
 
+        # Add cache for indexable files to avoid repeated filesystem scans
+        self._indexable_files_cache: list[Path] | None = None
+        self._cache_timestamp: float = 0
+        self._cache_ttl: float = 60.0  # 60 second TTL
+
         # Initialize gitignore parser
         try:
             self.gitignore_parser = create_gitignore_parser(project_root)
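These new fields implement a 60-second TTL cache around the file scan: recompute only when the cached list is missing or stale. A generic sketch of the same pattern; the `CachedScan` name is hypothetical and not taken from the package.

```python
import time
from pathlib import Path
from typing import Callable


class CachedScan:
    """Recompute a file list only when the cached copy is missing or stale (sketch)."""

    def __init__(self, scan: Callable[[], list[Path]], ttl: float = 60.0) -> None:
        self._scan = scan
        self._ttl = ttl
        self._value: list[Path] | None = None
        self._timestamp: float = 0.0

    def get(self) -> list[Path]:
        now = time.time()
        if self._value is None or now - self._timestamp >= self._ttl:
            self._value = sorted(self._scan())   # rebuild and keep a stable order
            self._timestamp = now
        return self._value
```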
@@ -334,38 +339,120 @@ class SemanticIndexer:
         return 0
 
     def _find_indexable_files(self) -> list[Path]:
-        """Find all files that should be indexed.
+        """Find all files that should be indexed with caching.
 
         Returns:
             List of file paths to index
         """
+        import time
+
+        # Check cache
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Rebuild cache using efficient directory filtering
+        logger.debug("Rebuilding indexable files cache...")
+        indexable_files = self._scan_files_sync()
+
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Rebuilt indexable files cache ({len(indexable_files)} files)")
+
+        return self._indexable_files_cache
+
+    def _scan_files_sync(self) -> list[Path]:
+        """Synchronous file scanning (runs in thread pool).
+
+        Uses os.walk with directory filtering to avoid traversing ignored directories.
+
+        Returns:
+            List of indexable file paths
+        """
         indexable_files = []
 
-        for
-
-
+        # Use os.walk for efficient directory traversal with early filtering
+        for root, dirs, files in os.walk(self.project_root):
+            root_path = Path(root)
+
+            # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
+            # This is much more efficient than checking every file in ignored directories
+            # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
+            dirs[:] = [d for d in dirs if not self._should_ignore_path(root_path / d, is_directory=True)]
+
+            # Check each file in the current directory
+            # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
+            for filename in files:
+                file_path = root_path / filename
+                if self._should_index_file(file_path, skip_file_check=True):
+                    indexable_files.append(file_path)
 
-        return
+        return indexable_files
 
-    def
+    async def _find_indexable_files_async(self) -> list[Path]:
+        """Find all files asynchronously without blocking event loop.
+
+        Returns:
+            List of file paths to index
+        """
+        import time
+        from concurrent.futures import ThreadPoolExecutor
+
+        # Check cache first
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Run filesystem scan in thread pool to avoid blocking
+        logger.debug("Scanning files in background thread...")
+        loop = asyncio.get_running_loop()
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            indexable_files = await loop.run_in_executor(
+                executor, self._scan_files_sync
+            )
+
+        # Update cache
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Found {len(indexable_files)} indexable files")
+
+        return self._indexable_files_cache
+
+    def _should_index_file(self, file_path: Path, skip_file_check: bool = False) -> bool:
         """Check if a file should be indexed.
 
         Args:
             file_path: Path to check
+            skip_file_check: Skip is_file() check if caller knows it's a file (optimization)
 
         Returns:
             True if file should be indexed
         """
-        #
-
+        # PERFORMANCE: Check file extension FIRST (cheapest operation, no I/O)
+        # This eliminates most files without any filesystem calls
+        if file_path.suffix.lower() not in self.file_extensions:
            return False
 
-        #
-
+        # PERFORMANCE: Only check is_file() if not coming from os.walk
+        # os.walk already guarantees files, so we skip this expensive check
+        if not skip_file_check and not file_path.is_file():
            return False
 
        # Check if path should be ignored
-
+        # PERFORMANCE: Pass is_directory=False to skip stat() call (we know it's a file)
+        if self._should_ignore_path(file_path, is_directory=False):
            return False
 
        # Check file size (skip very large files)
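Two techniques carry this hunk: pruning ignored directories in place via `dirs[:]` so `os.walk` never descends into them, and offloading the blocking scan to a worker thread from async code. A minimal sketch of both, with illustrative ignore and extension sets; `asyncio.to_thread` is used here as the shorthand equivalent of the diff's `ThreadPoolExecutor` plus `run_in_executor` pairing.

```python
import asyncio
import os
from pathlib import Path

IGNORED_DIRS = {".git", "node_modules", ".venv", "__pycache__"}   # illustrative
EXTENSIONS = {".py", ".ts", ".js"}                                # illustrative


def scan_files_sync(root: Path) -> list[Path]:
    found: list[Path] = []
    for dirpath, dirs, files in os.walk(root):
        # Mutating dirs in place stops os.walk from descending into these entries.
        dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]
        for name in files:
            # Cheapest check first: the extension test needs no filesystem call.
            if Path(name).suffix.lower() in EXTENSIONS:
                found.append(Path(dirpath) / name)
    return found


async def scan_files(root: Path) -> list[Path]:
    # Offload the blocking walk so the event loop stays responsive.
    return await asyncio.to_thread(scan_files_sync, root)
```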
@@ -379,18 +466,20 @@ class SemanticIndexer:
 
         return True
 
-    def _should_ignore_path(self, file_path: Path) -> bool:
+    def _should_ignore_path(self, file_path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.
 
         Args:
             file_path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)
 
         Returns:
             True if path should be ignored
         """
         try:
             # First check gitignore rules if available
-
+            # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path, is_directory=is_directory):
                 logger.debug(f"Path ignored by .gitignore: {file_path}")
                 return True
 
@@ -532,8 +621,8 @@ class SemanticIndexer:
         # Get database stats
         db_stats = await self.database.get_stats()
 
-        # Count indexable files
-        indexable_files = self.
+        # Count indexable files asynchronously without blocking
+        indexable_files = await self._find_indexable_files_async()
 
         return {
             "total_indexable_files": len(indexable_files),
@@ -553,3 +642,90 @@ class SemanticIndexer:
             "indexed_files": 0,
             "total_chunks": 0,
         }
+
+    async def get_files_to_index(
+        self, force_reindex: bool = False
+    ) -> tuple[list[Path], list[Path]]:
+        """Get all indexable files and those that need indexing.
+
+        Args:
+            force_reindex: Whether to force reindex of all files
+
+        Returns:
+            Tuple of (all_indexable_files, files_to_index)
+        """
+        # Find all indexable files
+        all_files = await self._find_indexable_files_async()
+
+        if not all_files:
+            return [], []
+
+        # Load existing metadata for incremental indexing
+        metadata = self._load_index_metadata()
+
+        # Filter files that need indexing
+        if force_reindex:
+            files_to_index = all_files
+            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
+        else:
+            files_to_index = [
+                f for f in all_files if self._needs_reindexing(f, metadata)
+            ]
+            logger.info(
+                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
+            )
+
+        return all_files, files_to_index
+
+    async def index_files_with_progress(
+        self,
+        files_to_index: list[Path],
+        force_reindex: bool = False,
+    ):
+        """Index files and yield progress updates for each file.
+
+        Args:
+            files_to_index: List of file paths to index
+            force_reindex: Whether to force reindexing
+
+        Yields:
+            Tuple of (file_path, chunks_added, success) for each processed file
+        """
+        metadata = self._load_index_metadata()
+
+        # Process files in batches for better memory management
+        for i in range(0, len(files_to_index), self.batch_size):
+            batch = files_to_index[i : i + self.batch_size]
+
+            # Process each file in the batch
+            for file_path in batch:
+                chunks_added = 0
+                success = False
+
+                try:
+                    # Always remove existing chunks when reindexing
+                    await self.database.delete_by_file(file_path)
+
+                    # Parse file into chunks
+                    chunks = await self._parse_file(file_path)
+
+                    if chunks:
+                        # Add chunks to database
+                        await self.database.add_chunks(chunks)
+                        chunks_added = len(chunks)
+                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+
+                    success = True
+
+                    # Update metadata after successful indexing
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+
+                except Exception as e:
+                    logger.error(f"Failed to index file {file_path}: {e}")
+                    success = False
+
+                # Yield progress update
+                yield (file_path, chunks_added, success)
+
+        # Save metadata at the end
+        self._save_index_metadata(metadata)
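`index_files_with_progress()` is an async generator yielding `(file_path, chunks_added, success)` per file. A hedged sketch of how a caller might drive it; `indexer` is assumed to be a configured `SemanticIndexer`, and the progress printing stands in for whatever the CLI actually renders.

```python
import asyncio


async def run_index(indexer) -> None:
    all_files, files_to_index = await indexer.get_files_to_index(force_reindex=False)
    done = 0
    total_chunks = 0

    async for file_path, chunks_added, success in indexer.index_files_with_progress(
        files_to_index
    ):
        done += 1
        total_chunks += chunks_added
        status = "ok" if success else "FAILED"
        print(f"[{done}/{len(files_to_index)}] {status} {file_path} (+{chunks_added})")

    print(f"Indexed {total_chunks} chunks across {done} of {len(all_files)} files")


# asyncio.run(run_index(indexer))
```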
mcp_vector_search/core/project.py

@@ -281,24 +281,27 @@ class ProjectManager:
                 continue
 
             # Skip ignored patterns
-
+            # PERFORMANCE: Pass is_directory=False since we already checked is_file()
+            if self._should_ignore_path(path, is_directory=False):
                 continue
 
             files.append(path)
 
         return files
 
-    def _should_ignore_path(self, path: Path) -> bool:
+    def _should_ignore_path(self, path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.
 
         Args:
             path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)
 
         Returns:
             True if path should be ignored
         """
         # First check gitignore rules if available
-
+        # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+        if self.gitignore_parser and self.gitignore_parser.is_ignored(path, is_directory=is_directory):
             return True
 
         # Check if any parent directory is in ignore patterns
mcp_vector_search/utils/gitignore.py

@@ -102,16 +102,18 @@ class GitignoreParser:
         self._load_gitignore_files()
 
     def _load_gitignore_files(self) -> None:
-        """Load
-        # Load global .gitignore first (if exists)
-        global_gitignore = self.project_root / ".gitignore"
-        if global_gitignore.exists():
-            self._parse_gitignore_file(global_gitignore)
+        """Load .gitignore file from project root only.
 
-
-
-
-
+        Note: Only the root .gitignore is loaded to avoid performance issues
+        with rglob traversing large directory trees (e.g., node_modules with
+        250K+ files). Subdirectory .gitignore files are intentionally skipped
+        as they would add significant overhead without much benefit for
+        semantic code search indexing.
+        """
+        # Load root .gitignore only
+        root_gitignore = self.project_root / ".gitignore"
+        if root_gitignore.exists():
+            self._parse_gitignore_file(root_gitignore)
 
     def _parse_gitignore_file(self, gitignore_path: Path) -> None:
         """Parse a single .gitignore file.
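The new docstring spells out the trade-off: discovering nested `.gitignore` files (for example via `rglob`) forces a walk of the whole tree before any pattern is parsed, while the root-only check touches a single path. A small sketch of that contrast; the helper name is illustrative.

```python
from pathlib import Path


def load_root_gitignore_lines(project_root: Path) -> list[str]:
    """Read only the root .gitignore: one exists() check, no tree traversal."""
    root_gitignore = project_root / ".gitignore"
    if not root_gitignore.exists():
        return []
    return root_gitignore.read_text(encoding="utf-8", errors="ignore").splitlines()


# By contrast, collecting nested ignore files walks every directory first:
#   nested = list(project_root.rglob(".gitignore"))   # visits node_modules etc.
```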
@@ -136,32 +138,32 @@ class GitignoreParser:
             # Check for directory-only pattern
             is_directory_only = line.endswith("/")
 
-            # Create pattern
-            gitignore_dir = gitignore_path.parent
-            if gitignore_dir != self.project_root:
-                # Adjust pattern for subdirectory .gitignore files
-                relative_dir = gitignore_dir.relative_to(self.project_root)
-                if not line.startswith("/") and not is_negation:
-                    line = str(relative_dir / line)
-                elif is_negation and not line[1:].startswith("/"):
-                    line = "!" + str(relative_dir / line[1:])
-
+            # Create pattern (all patterns are from root .gitignore)
             pattern = GitignorePattern(line, is_negation, is_directory_only)
             self.patterns.append(pattern)
 
         except Exception as e:
             logger.warning(f"Failed to parse {gitignore_path}: {e}")
 
-    def is_ignored(self, path: Path) -> bool:
+    def is_ignored(self, path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored according to .gitignore rules.
 
         Args:
             path: Path to check (can be absolute or relative to project root)
+            is_directory: Optional hint if path is a directory.
+                If None, will check filesystem (slower).
+                If provided, skips filesystem check (faster).
 
         Returns:
             True if the path should be ignored
         """
         try:
+            # SHORT-CIRCUIT: If no patterns, nothing is ignored
+            # This prevents 200k+ unnecessary filesystem stat() calls on projects
+            # without .gitignore files
+            if not self.patterns:
+                return False
+
             # Convert to relative path from project root
             if path.is_absolute():
                 relative_path = path.relative_to(self.project_root)
@@ -169,7 +171,12 @@
                 relative_path = path
 
             path_str = str(relative_path).replace("\\", "/")
-
+
+            # Only check if directory when needed and not provided as hint
+            # PERFORMANCE: Passing is_directory hint from caller (e.g., os.walk)
+            # avoids hundreds of thousands of stat() calls on large repositories
+            if is_directory is None:
+                is_directory = path.is_dir() if path.exists() else False
 
             # Apply patterns in order, with later patterns overriding earlier ones
             ignored = False
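`is_ignored()` now short-circuits when no patterns were loaded, resolves the directory flag only when the caller gave no hint, and applies patterns in order with later patterns overriding earlier ones so negations can re-include paths. A hedged sketch of that evaluation order; the real `GitignorePattern` matching is not shown in this diff, so the `fnmatch`-based matching below is an assumption that only approximates gitignore semantics.

```python
from dataclasses import dataclass
from fnmatch import fnmatch
from pathlib import Path


@dataclass
class Pattern:
    glob: str
    is_negation: bool
    is_directory_only: bool


def is_ignored(path_str: str, patterns: list[Pattern], is_directory: bool | None = None) -> bool:
    if not patterns:                      # short-circuit: no .gitignore, nothing ignored
        return False
    if is_directory is None:              # only stat() when the caller gave no hint
        p = Path(path_str)
        is_directory = p.is_dir() if p.exists() else False

    ignored = False
    for pat in patterns:                  # later patterns override earlier ones
        if pat.is_directory_only and not is_directory:
            continue
        if fnmatch(path_str, pat.glob):   # assumption: stand-in for GitignorePattern
            ignored = not pat.is_negation
    return ignored
```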
@@ -216,15 +223,16 @@ def create_gitignore_parser(project_root: Path) -> GitignoreParser:
     return GitignoreParser(project_root)
 
 
-def is_path_gitignored(path: Path, project_root: Path) -> bool:
+def is_path_gitignored(path: Path, project_root: Path, is_directory: bool | None = None) -> bool:
     """Quick function to check if a path is gitignored.
 
     Args:
         path: Path to check
         project_root: Root directory of the project
+        is_directory: Optional hint if path is a directory (avoids filesystem check)
 
     Returns:
         True if the path should be ignored
     """
     parser = create_gitignore_parser(project_root)
-    return parser.is_ignored(path)
+    return parser.is_ignored(path, is_directory=is_directory)
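A short, hypothetical usage of the updated convenience function, passing the `is_directory` hint when the caller already knows the answer (the paths below are illustrative):

```python
from pathlib import Path

from mcp_vector_search.utils.gitignore import is_path_gitignored

project_root = Path("/workspace/my-project")          # hypothetical project
candidate = project_root / "dist" / "bundle.js"       # hypothetical entry from os.walk

# The caller knows this entry is a file, so pass the hint and skip the extra stat().
skip = is_path_gitignored(candidate, project_root, is_directory=False)
```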
{mcp_vector_search-0.7.5.dist-info → mcp_vector_search-0.7.6.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mcp-vector-search
-Version: 0.7.5
+Version: 0.7.6
 Summary: CLI-first semantic code search with MCP integration
 Project-URL: Homepage, https://github.com/bobmatnyc/mcp-vector-search
 Project-URL: Documentation, https://mcp-vector-search.readthedocs.io