mcp-vector-search 0.7.4__py3-none-any.whl → 0.7.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of mcp-vector-search has been flagged as potentially problematic.

@@ -102,15 +102,28 @@ def main(
         if project_root is None:
             project_root = Path.cwd()
 
-        asyncio.run(
-            show_status(
-                project_root=project_root,
-                verbose=verbose,
-                health_check=health_check,
-                mcp=mcp,
-                json_output=json_output,
-            )
-        )
+        async def run_status_with_timeout():
+            """Run status command with timeout protection."""
+            try:
+                await asyncio.wait_for(
+                    show_status(
+                        project_root=project_root,
+                        verbose=verbose,
+                        health_check=health_check,
+                        mcp=mcp,
+                        json_output=json_output,
+                    ),
+                    timeout=30.0,  # 30 second timeout
+                )
+            except TimeoutError:
+                logger.error("Status check timed out after 30 seconds")
+                print_error(
+                    "Status check timed out after 30 seconds. "
+                    "Try running with --verbose for more details."
+                )
+                raise typer.Exit(1)
+
+        asyncio.run(run_status_with_timeout())
 
     except Exception as e:
         logger.error(f"Status check failed: {e}")
@@ -162,6 +175,7 @@ async def show_status(
         file_extensions=config.file_extensions,
     )
 
+    # Get indexing stats (runs async file scanning in thread pool)
     async with database:
         index_stats = await indexer.get_indexing_stats()
         db_stats = await database.get_stats()
@@ -39,7 +39,7 @@ unfamiliar codebases, finding similar patterns, and integrating with AI tools.
     status     📊 Show project status
     search     🔍 Search code semantically
     index      📇 Index codebase
-    mcp        🤖 MCP integration
+    mcp        🤖 MCP integration for AI tools
     config     ⚙️ Configure settings
     help       ❓ Get help
     version    ℹ️ Show version
@@ -84,7 +84,7 @@ app.add_typer(search_app, name="search", help="🔍 Search code semantically")
 app.add_typer(index_app, name="index", help="📇 Index codebase for semantic search")
 
 # 7. MCP - MCP integration
-app.add_typer(mcp_app, name="mcp", help="🤖 Manage Claude Code MCP integration")
+app.add_typer(mcp_app, name="mcp", help="🤖 Manage MCP integration for AI tools")
 
 # 8. CONFIG - Configuration
 app.add_typer(config_app, name="config", help="⚙️ Manage project configuration")
@@ -122,8 +122,6 @@ def deprecated_install():
     _deprecated_command("install", "init")()
 
 
-
-
 # Deprecated: find -> search
 @app.command("find", hidden=True)
 def deprecated_find():
@@ -1,5 +1,6 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""
 
+import asyncio
 import shutil
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -369,38 +370,67 @@ class ChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e
 
     async def get_stats(self) -> IndexStats:
-        """Get database statistics."""
+        """Get database statistics with optimized chunked queries."""
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")
 
         try:
-            # Get total count
+            # Get total count (fast operation)
             count = self._collection.count()
 
-            # Get ALL metadata to analyze (not just a sample)
-            # Only fetch metadata, not embeddings, for performance
-            results = self._collection.get(include=["metadatas"])
+            if count == 0:
+                return IndexStats(
+                    total_files=0,
+                    total_chunks=0,
+                    languages={},
+                    file_types={},
+                    index_size_mb=0.0,
+                    last_updated="N/A",
+                    embedding_model="unknown",
+                )
 
-            # Count unique files from all chunks
-            files = {m.get("file_path", "") for m in results.get("metadatas", [])}
+            # Process in chunks to avoid loading everything at once
+            batch_size_limit = 1000
 
-            # Count languages and file types
-            language_counts = {}
-            file_type_counts = {}
+            files = set()
+            language_counts: dict[str, int] = {}
+            file_type_counts: dict[str, int] = {}
 
-            for metadata in results.get("metadatas", []):
-                # Count languages
-                lang = metadata.get("language", "unknown")
-                language_counts[lang] = language_counts.get(lang, 0) + 1
+            offset = 0
+            while offset < count:
+                # Fetch batch
+                batch_size = min(batch_size_limit, count - offset)
+                logger.debug(
+                    f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                    f"{offset}-{offset + batch_size} of {count} chunks"
+                )
 
-                # Count file types
-                file_path = metadata.get("file_path", "")
-                if file_path:
-                    ext = Path(file_path).suffix or "no_extension"
-                    file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+                results = self._collection.get(
+                    include=["metadatas"],
+                    limit=batch_size,
+                    offset=offset,
+                )
 
-            # Estimate index size (rough approximation)
-            index_size_mb = count * 0.001  # Rough estimate
+                # Process batch metadata
+                for metadata in results.get("metadatas", []):
+                    # Language stats
+                    lang = metadata.get("language", "unknown")
+                    language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                    # File stats
+                    file_path = metadata.get("file_path", "")
+                    if file_path:
+                        files.add(file_path)
+                        ext = Path(file_path).suffix or "no_extension"
+                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                offset += batch_size
+
+                # Yield to event loop periodically to prevent blocking
+                await asyncio.sleep(0)
+
+            # Estimate index size (rough approximation: ~1KB per chunk)
+            index_size_mb = count * 0.001
 
             return IndexStats(
                 total_files=len(files),
@@ -408,12 +438,13 @@ class ChromaVectorDatabase(VectorDatabase):
                 languages=language_counts,
                 file_types=file_type_counts,
                 index_size_mb=index_size_mb,
-                last_updated="unknown",  # TODO: Track this
-                embedding_model="unknown",  # TODO: Track this
+                last_updated="unknown",
+                embedding_model="unknown",
             )
 
         except Exception as e:
-            logger.error(f"Failed to get stats: {e}")
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
             return IndexStats(
                 total_files=0,
                 total_chunks=0,
@@ -768,56 +799,88 @@ class PooledChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e
 
     async def get_stats(self) -> IndexStats:
-        """Get database statistics using pooled connection."""
+        """Get database statistics with connection pooling and chunked queries."""
         try:
             async with self._pool.get_connection() as conn:
-                # Get total count
+                # Get total count (fast operation)
                 count = conn.collection.count()
 
-                # Get all metadata to analyze
-                results = conn.collection.get(include=["metadatas"])
+                if count == 0:
+                    return IndexStats(
+                        total_files=0,
+                        total_chunks=0,
+                        languages={},
+                        file_types={},
+                        index_size_mb=0.0,
+                        last_updated="N/A",
+                        embedding_model="unknown",
+                    )
+
+                # Process in chunks to avoid loading everything at once
+                batch_size_limit = 1000
 
-                # Analyze languages and files
-                languages = set()
                 files = set()
+                language_counts: dict[str, int] = {}
+                file_type_counts: dict[str, int] = {}
+
+                offset = 0
+                while offset < count:
+                    # Fetch batch
+                    batch_size = min(batch_size_limit, count - offset)
+                    logger.debug(
+                        f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                        f"{offset}-{offset + batch_size} of {count} chunks"
+                    )
 
-                for metadata in results["metadatas"]:
-                    if "language" in metadata:
-                        languages.add(metadata["language"])
-                    if "file_path" in metadata:
-                        files.add(metadata["file_path"])
+                    results = conn.collection.get(
+                        include=["metadatas"],
+                        limit=batch_size,
+                        offset=offset,
+                    )
 
-                # Count languages and file types
-                language_counts = {}
-                file_type_counts = {}
+                    # Process batch metadata
+                    for metadata in results.get("metadatas", []):
+                        # Language stats
+                        lang = metadata.get("language", "unknown")
+                        language_counts[lang] = language_counts.get(lang, 0) + 1
 
-                for metadata in results["metadatas"]:
-                    # Count languages
-                    lang = metadata.get("language", "unknown")
-                    language_counts[lang] = language_counts.get(lang, 0) + 1
+                        # File stats
+                        file_path = metadata.get("file_path", "")
+                        if file_path:
+                            files.add(file_path)
+                            ext = Path(file_path).suffix or "no_extension"
+                            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
 
-                    # Count file types
-                    file_path = metadata.get("file_path", "")
-                    if file_path:
-                        ext = Path(file_path).suffix or "no_extension"
-                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+                    offset += batch_size
+
+                    # Yield to event loop periodically to prevent blocking
+                    await asyncio.sleep(0)
 
-                # Estimate index size (rough approximation)
-                index_size_mb = count * 0.001  # Rough estimate
+                # Estimate index size (rough approximation: ~1KB per chunk)
+                index_size_mb = count * 0.001
 
                 return IndexStats(
-                    total_chunks=count,
                     total_files=len(files),
+                    total_chunks=count,
                     languages=language_counts,
                     file_types=file_type_counts,
                     index_size_mb=index_size_mb,
-                    last_updated="unknown",  # ChromaDB doesn't track this
-                    embedding_model="unknown",  # TODO: Track this in metadata
+                    last_updated="unknown",
+                    embedding_model="unknown",
                 )
 
         except Exception as e:
-            logger.error(f"Failed to get database stats: {e}")
-            raise DatabaseError(f"Failed to get stats: {e}") from e
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
+            return IndexStats(
+                total_files=0,
+                total_chunks=0,
+                languages={},
+                file_types={},
+                index_size_mb=0.0,
+                last_updated="error",
+                embedding_model="unknown",
+            )
 
     async def remove_file_chunks(self, file_path: str) -> int:
         """Remove all chunks for a specific file using pooled connection."""
@@ -57,6 +57,11 @@ class SemanticIndexer:
             project_root / ".mcp-vector-search" / "index_metadata.json"
         )
 
+        # Add cache for indexable files to avoid repeated filesystem scans
+        self._indexable_files_cache: list[Path] | None = None
+        self._cache_timestamp: float = 0
+        self._cache_ttl: float = 60.0  # 60 second TTL
+
         # Initialize gitignore parser
         try:
             self.gitignore_parser = create_gitignore_parser(project_root)
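
These new attributes implement a simple time-to-live cache: the scanned file list is reused for up to 60 seconds before the next filesystem walk. A generic sketch of the same idea (the CachedScanner class and its method names are illustrative, not part of this package):

    import time
    from pathlib import Path

    class CachedScanner:
        """Cache an expensive directory scan for a fixed TTL."""

        def __init__(self, root: Path, ttl: float = 60.0) -> None:
            self.root = root
            self._cache: list[Path] | None = None
            self._cache_timestamp: float = 0.0
            self._ttl = ttl

        def files(self) -> list[Path]:
            now = time.time()
            if self._cache is not None and now - self._cache_timestamp < self._ttl:
                return self._cache  # still fresh, skip the walk
            self._cache = sorted(p for p in self.root.rglob("*.py") if p.is_file())
            self._cache_timestamp = now
            return self._cache
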
@@ -334,38 +339,120 @@ class SemanticIndexer:
             return 0
 
     def _find_indexable_files(self) -> list[Path]:
-        """Find all files that should be indexed.
+        """Find all files that should be indexed with caching.
 
         Returns:
             List of file paths to index
         """
+        import time
+
+        # Check cache
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Rebuild cache using efficient directory filtering
+        logger.debug("Rebuilding indexable files cache...")
+        indexable_files = self._scan_files_sync()
+
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Rebuilt indexable files cache ({len(indexable_files)} files)")
+
+        return self._indexable_files_cache
+
+    def _scan_files_sync(self) -> list[Path]:
+        """Synchronous file scanning (runs in thread pool).
+
+        Uses os.walk with directory filtering to avoid traversing ignored directories.
+
+        Returns:
+            List of indexable file paths
+        """
         indexable_files = []
 
-        for file_path in self.project_root.rglob("*"):
-            if self._should_index_file(file_path):
-                indexable_files.append(file_path)
+        # Use os.walk for efficient directory traversal with early filtering
+        for root, dirs, files in os.walk(self.project_root):
+            root_path = Path(root)
+
+            # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
+            # This is much more efficient than checking every file in ignored directories
+            # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
+            dirs[:] = [d for d in dirs if not self._should_ignore_path(root_path / d, is_directory=True)]
+
+            # Check each file in the current directory
+            # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
+            for filename in files:
+                file_path = root_path / filename
+                if self._should_index_file(file_path, skip_file_check=True):
+                    indexable_files.append(file_path)
 
-        return sorted(indexable_files)
+        return indexable_files
 
-    def _should_index_file(self, file_path: Path) -> bool:
+    async def _find_indexable_files_async(self) -> list[Path]:
+        """Find all files asynchronously without blocking event loop.
+
+        Returns:
+            List of file paths to index
+        """
+        import time
+        from concurrent.futures import ThreadPoolExecutor
+
+        # Check cache first
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Run filesystem scan in thread pool to avoid blocking
+        logger.debug("Scanning files in background thread...")
+        loop = asyncio.get_running_loop()
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            indexable_files = await loop.run_in_executor(
+                executor, self._scan_files_sync
+            )
+
+        # Update cache
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Found {len(indexable_files)} indexable files")
+
+        return self._indexable_files_cache
+
+    def _should_index_file(self, file_path: Path, skip_file_check: bool = False) -> bool:
         """Check if a file should be indexed.
 
         Args:
             file_path: Path to check
+            skip_file_check: Skip is_file() check if caller knows it's a file (optimization)
 
         Returns:
             True if file should be indexed
         """
-        # Must be a file
-        if not file_path.is_file():
+        # PERFORMANCE: Check file extension FIRST (cheapest operation, no I/O)
+        # This eliminates most files without any filesystem calls
+        if file_path.suffix.lower() not in self.file_extensions:
             return False
 
-        # Check file extension
-        if file_path.suffix.lower() not in self.file_extensions:
+        # PERFORMANCE: Only check is_file() if not coming from os.walk
+        # os.walk already guarantees files, so we skip this expensive check
+        if not skip_file_check and not file_path.is_file():
             return False
 
         # Check if path should be ignored
-        if self._should_ignore_path(file_path):
+        # PERFORMANCE: Pass is_directory=False to skip stat() call (we know it's a file)
+        if self._should_ignore_path(file_path, is_directory=False):
            return False
 
         # Check file size (skip very large files)
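
The rewritten scanner prunes ignored directories by mutating dirs in place, which keeps os.walk from ever descending into them, and it reorders the per-file checks so the cheap suffix test runs before any stat() call. A self-contained sketch of the pruning idiom (the ignore set and extension list are illustrative):

    import os
    from pathlib import Path

    IGNORED_DIRS = {".git", "node_modules", ".venv", "__pycache__"}  # illustrative
    EXTENSIONS = {".py", ".ts", ".js"}  # illustrative

    def scan(root: Path) -> list[Path]:
        found: list[Path] = []
        for dirpath, dirs, files in os.walk(root):
            # Assigning to dirs[:] prunes the walk: os.walk will not recurse
            # into directories removed from this list.
            dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]
            for name in files:
                # Cheapest check first: a string suffix test needs no filesystem I/O.
                if Path(name).suffix.lower() in EXTENSIONS:
                    found.append(Path(dirpath) / name)
        return found
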
@@ -379,18 +466,20 @@ class SemanticIndexer:
 
         return True
 
-    def _should_ignore_path(self, file_path: Path) -> bool:
+    def _should_ignore_path(self, file_path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.
 
         Args:
             file_path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)
 
         Returns:
             True if path should be ignored
         """
         try:
             # First check gitignore rules if available
-            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path):
+            # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path, is_directory=is_directory):
                 logger.debug(f"Path ignored by .gitignore: {file_path}")
                 return True
 
@@ -532,8 +621,8 @@ class SemanticIndexer:
             # Get database stats
             db_stats = await self.database.get_stats()
 
-            # Count indexable files
-            indexable_files = self._find_indexable_files()
+            # Count indexable files asynchronously without blocking
+            indexable_files = await self._find_indexable_files_async()
 
             return {
                 "total_indexable_files": len(indexable_files),
@@ -553,3 +642,90 @@ class SemanticIndexer:
                 "indexed_files": 0,
                 "total_chunks": 0,
             }
+
+    async def get_files_to_index(
+        self, force_reindex: bool = False
+    ) -> tuple[list[Path], list[Path]]:
+        """Get all indexable files and those that need indexing.
+
+        Args:
+            force_reindex: Whether to force reindex of all files
+
+        Returns:
+            Tuple of (all_indexable_files, files_to_index)
+        """
+        # Find all indexable files
+        all_files = await self._find_indexable_files_async()
+
+        if not all_files:
+            return [], []
+
+        # Load existing metadata for incremental indexing
+        metadata = self._load_index_metadata()
+
+        # Filter files that need indexing
+        if force_reindex:
+            files_to_index = all_files
+            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
+        else:
+            files_to_index = [
+                f for f in all_files if self._needs_reindexing(f, metadata)
+            ]
+            logger.info(
+                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
+            )
+
+        return all_files, files_to_index
+
+    async def index_files_with_progress(
+        self,
+        files_to_index: list[Path],
+        force_reindex: bool = False,
+    ):
+        """Index files and yield progress updates for each file.
+
+        Args:
+            files_to_index: List of file paths to index
+            force_reindex: Whether to force reindexing
+
+        Yields:
+            Tuple of (file_path, chunks_added, success) for each processed file
+        """
+        metadata = self._load_index_metadata()
+
+        # Process files in batches for better memory management
+        for i in range(0, len(files_to_index), self.batch_size):
+            batch = files_to_index[i : i + self.batch_size]
+
+            # Process each file in the batch
+            for file_path in batch:
+                chunks_added = 0
+                success = False
+
+                try:
+                    # Always remove existing chunks when reindexing
+                    await self.database.delete_by_file(file_path)
+
+                    # Parse file into chunks
+                    chunks = await self._parse_file(file_path)
+
+                    if chunks:
+                        # Add chunks to database
+                        await self.database.add_chunks(chunks)
+                        chunks_added = len(chunks)
+                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+
+                    success = True
+
+                    # Update metadata after successful indexing
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+
+                except Exception as e:
+                    logger.error(f"Failed to index file {file_path}: {e}")
+                    success = False
+
+                # Yield progress update
+                yield (file_path, chunks_added, success)
+
+        # Save metadata at the end
+        self._save_index_metadata(metadata)
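
index_files_with_progress is an async generator, so a caller can drive per-file progress reporting instead of waiting for the whole batch to finish. A sketch of how such a generator is typically consumed (the reporting loop and the indexer variable are assumptions, not code from this package):

    async def report_progress(indexer, files) -> None:
        indexed = failed = total_chunks = 0
        async for file_path, chunks_added, success in indexer.index_files_with_progress(files):
            if success:
                indexed += 1
                total_chunks += chunks_added
            else:
                failed += 1
            print(f"[{indexed + failed}/{len(files)}] {file_path} (+{chunks_added} chunks)")
        print(f"Done: {indexed} indexed, {failed} failed, {total_chunks} chunks")
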
@@ -281,24 +281,27 @@ class ProjectManager:
                 continue
 
             # Skip ignored patterns
-            if self._should_ignore_path(path):
+            # PERFORMANCE: Pass is_directory=False since we already checked is_file()
+            if self._should_ignore_path(path, is_directory=False):
                 continue
 
             files.append(path)
 
         return files
 
-    def _should_ignore_path(self, path: Path) -> bool:
+    def _should_ignore_path(self, path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.
 
         Args:
             path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)
 
         Returns:
             True if path should be ignored
         """
         # First check gitignore rules if available
-        if self.gitignore_parser and self.gitignore_parser.is_ignored(path):
+        # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+        if self.gitignore_parser and self.gitignore_parser.is_ignored(path, is_directory=is_directory):
             return True
 
         # Check if any parent directory is in ignore patterns