mcp-vector-search 0.7.4__py3-none-any.whl → 0.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/demo.py +2 -4
- mcp_vector_search/cli/commands/index.py +130 -30
- mcp_vector_search/cli/commands/mcp.py +673 -36
- mcp_vector_search/cli/commands/status.py +23 -9
- mcp_vector_search/cli/main.py +2 -4
- mcp_vector_search/core/database.py +117 -54
- mcp_vector_search/core/indexer.py +191 -15
- mcp_vector_search/core/project.py +6 -3
- mcp_vector_search/utils/gitignore.py +31 -23
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/METADATA +1 -1
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/RECORD +15 -15
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.7.4.dist-info → mcp_vector_search-0.7.6.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/cli/commands/status.py
CHANGED

@@ -102,15 +102,28 @@ def main(
         if project_root is None:
             project_root = Path.cwd()

-        … (old lines not shown)
+        async def run_status_with_timeout():
+            """Run status command with timeout protection."""
+            try:
+                await asyncio.wait_for(
+                    show_status(
+                        project_root=project_root,
+                        verbose=verbose,
+                        health_check=health_check,
+                        mcp=mcp,
+                        json_output=json_output,
+                    ),
+                    timeout=30.0,  # 30 second timeout
+                )
+            except TimeoutError:
+                logger.error("Status check timed out after 30 seconds")
+                print_error(
+                    "Status check timed out after 30 seconds. "
+                    "Try running with --verbose for more details."
+                )
+                raise typer.Exit(1)
+
+        asyncio.run(run_status_with_timeout())

     except Exception as e:
         logger.error(f"Status check failed: {e}")
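The change above wraps the status coroutine in `asyncio.wait_for` so a hung database or filesystem call cannot stall the CLI indefinitely. A minimal, self-contained sketch of the same pattern (the `slow_status` coroutine and the 2-second limit are illustrative placeholders, not package code):

```python
import asyncio


async def slow_status() -> str:
    """Stand-in for a status check that may hang (hypothetical)."""
    await asyncio.sleep(5)
    return "ok"


async def run_with_timeout() -> None:
    try:
        result = await asyncio.wait_for(slow_status(), timeout=2.0)
        print(result)
    except asyncio.TimeoutError:  # the built-in TimeoutError on Python 3.11+
        print("status check timed out after 2 seconds")


asyncio.run(run_with_timeout())
```

On Python 3.11+ `asyncio.TimeoutError` is an alias of the built-in `TimeoutError`, which is why the diff can catch `TimeoutError` directly.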
@@ -162,6 +175,7 @@ async def show_status(
         file_extensions=config.file_extensions,
     )

+    # Get indexing stats (runs async file scanning in thread pool)
     async with database:
         index_stats = await indexer.get_indexing_stats()
         db_stats = await database.get_stats()
mcp_vector_search/cli/main.py
CHANGED

@@ -39,7 +39,7 @@ unfamiliar codebases, finding similar patterns, and integrating with AI tools.
   status     📊 Show project status
   search     🔍 Search code semantically
   index      📇 Index codebase
-  mcp        🤖 MCP integration
+  mcp        🤖 MCP integration for AI tools
   config     ⚙️ Configure settings
   help       ❓ Get help
   version    ℹ️ Show version

@@ -84,7 +84,7 @@ app.add_typer(search_app, name="search", help="🔍 Search code semantically")
 app.add_typer(index_app, name="index", help="📇 Index codebase for semantic search")

 # 7. MCP - MCP integration
-app.add_typer(mcp_app, name="mcp", help="🤖 Manage …
+app.add_typer(mcp_app, name="mcp", help="🤖 Manage MCP integration for AI tools")

 # 8. CONFIG - Configuration
 app.add_typer(config_app, name="config", help="⚙️ Manage project configuration")

@@ -122,8 +122,6 @@ def deprecated_install():
     _deprecated_command("install", "init")()


-
-
 # Deprecated: find -> search
 @app.command("find", hidden=True)
 def deprecated_find():
mcp_vector_search/core/database.py
CHANGED

@@ -1,5 +1,6 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""

+import asyncio
 import shutil
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -369,38 +370,67 @@ class ChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e

     async def get_stats(self) -> IndexStats:
-        """Get database statistics."""
+        """Get database statistics with optimized chunked queries."""
         if not self._collection:
             raise DatabaseNotInitializedError("Database not initialized")

         try:
-            # Get total count
+            # Get total count (fast operation)
             count = self._collection.count()

-            … (old lines not shown)
+            if count == 0:
+                return IndexStats(
+                    total_files=0,
+                    total_chunks=0,
+                    languages={},
+                    file_types={},
+                    index_size_mb=0.0,
+                    last_updated="N/A",
+                    embedding_model="unknown",
+                )

-            … (old lines not shown)
+            # Process in chunks to avoid loading everything at once
+            batch_size_limit = 1000

-            … (old lines not shown)
-            language_counts = {}
-            file_type_counts = {}
+            files = set()
+            language_counts: dict[str, int] = {}
+            file_type_counts: dict[str, int] = {}

-            … (old lines not shown)
+            offset = 0
+            while offset < count:
+                # Fetch batch
+                batch_size = min(batch_size_limit, count - offset)
+                logger.debug(
+                    f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                    f"{offset}-{offset + batch_size} of {count} chunks"
+                )

-            … (old lines not shown)
+                results = self._collection.get(
+                    include=["metadatas"],
+                    limit=batch_size,
+                    offset=offset,
+                )

-            … (old lines not shown)
+                # Process batch metadata
+                for metadata in results.get("metadatas", []):
+                    # Language stats
+                    lang = metadata.get("language", "unknown")
+                    language_counts[lang] = language_counts.get(lang, 0) + 1
+
+                    # File stats
+                    file_path = metadata.get("file_path", "")
+                    if file_path:
+                        files.add(file_path)
+                        ext = Path(file_path).suffix or "no_extension"
+                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+
+                offset += batch_size
+
+                # Yield to event loop periodically to prevent blocking
+                await asyncio.sleep(0)
+
+            # Estimate index size (rough approximation: ~1KB per chunk)
+            index_size_mb = count * 0.001

             return IndexStats(
                 total_files=len(files),
@@ -408,12 +438,13 @@ class ChromaVectorDatabase(VectorDatabase):
                 languages=language_counts,
                 file_types=file_type_counts,
                 index_size_mb=index_size_mb,
-                last_updated="unknown",
-                embedding_model="unknown",
+                last_updated="unknown",
+                embedding_model="unknown",
             )

         except Exception as e:
-            logger.error(f"Failed to get …
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
             return IndexStats(
                 total_files=0,
                 total_chunks=0,
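Both `get_stats` implementations now page through collection metadata with `limit`/`offset` instead of fetching every record in one call, and yield to the event loop between batches. A condensed sketch of that aggregation loop against a Chroma-style collection object (the `collection` argument and the `batch_size` default are assumptions for illustration; the pooled variant below runs the same loop through a pooled connection):

```python
import asyncio
from collections import Counter


async def aggregate_stats(collection, batch_size: int = 1000) -> dict:
    """Tally languages and distinct files by paging through chunk metadata."""
    total = collection.count()
    files: set[str] = set()
    languages: Counter = Counter()

    offset = 0
    while offset < total:
        batch = collection.get(
            include=["metadatas"],  # metadata only, no embeddings or documents
            limit=min(batch_size, total - offset),
            offset=offset,
        )
        for metadata in batch.get("metadatas", []):
            languages[metadata.get("language", "unknown")] += 1
            file_path = metadata.get("file_path", "")
            if file_path:
                files.add(file_path)
        offset += batch_size
        await asyncio.sleep(0)  # yield so other tasks are not starved

    return {
        "total_chunks": total,
        "total_files": len(files),
        "languages": dict(languages),
    }
```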
@@ -768,56 +799,88 @@ class PooledChromaVectorDatabase(VectorDatabase):
             raise DatabaseError(f"Failed to delete chunks: {e}") from e

     async def get_stats(self) -> IndexStats:
-        """Get database statistics …
+        """Get database statistics with connection pooling and chunked queries."""
         try:
             async with self._pool.get_connection() as conn:
-                # Get total count
+                # Get total count (fast operation)
                 count = conn.collection.count()

-                … (old lines not shown)
+                if count == 0:
+                    return IndexStats(
+                        total_files=0,
+                        total_chunks=0,
+                        languages={},
+                        file_types={},
+                        index_size_mb=0.0,
+                        last_updated="N/A",
+                        embedding_model="unknown",
+                    )
+
+                # Process in chunks to avoid loading everything at once
+                batch_size_limit = 1000

-                # Analyze languages and files
-                languages = set()
                 files = set()
+                language_counts: dict[str, int] = {}
+                file_type_counts: dict[str, int] = {}
+
+                offset = 0
+                while offset < count:
+                    # Fetch batch
+                    batch_size = min(batch_size_limit, count - offset)
+                    logger.debug(
+                        f"Processing database stats: batch {offset // batch_size_limit + 1}, "
+                        f"{offset}-{offset + batch_size} of {count} chunks"
+                    )

-                … (old lines not shown)
+                    results = conn.collection.get(
+                        include=["metadatas"],
+                        limit=batch_size,
+                        offset=offset,
+                    )

-                … (old lines not shown)
+                    # Process batch metadata
+                    for metadata in results.get("metadatas", []):
+                        # Language stats
+                        lang = metadata.get("language", "unknown")
+                        language_counts[lang] = language_counts.get(lang, 0) + 1

-                … (old lines not shown)
+                        # File stats
+                        file_path = metadata.get("file_path", "")
+                        if file_path:
+                            files.add(file_path)
+                            ext = Path(file_path).suffix or "no_extension"
+                            file_type_counts[ext] = file_type_counts.get(ext, 0) + 1

-                … (old lines not shown)
-                        file_type_counts[ext] = file_type_counts.get(ext, 0) + 1
+                    offset += batch_size
+
+                    # Yield to event loop periodically to prevent blocking
+                    await asyncio.sleep(0)

-                # Estimate index size (rough approximation)
-                index_size_mb = count * 0.001
+                # Estimate index size (rough approximation: ~1KB per chunk)
+                index_size_mb = count * 0.001

                 return IndexStats(
-                    total_chunks=count,
                     total_files=len(files),
+                    total_chunks=count,
                     languages=language_counts,
                     file_types=file_type_counts,
                     index_size_mb=index_size_mb,
-                    last_updated="unknown",
-                    embedding_model="unknown",
+                    last_updated="unknown",
+                    embedding_model="unknown",
                 )

         except Exception as e:
-            logger.error(f"Failed to get database …
+            logger.error(f"Failed to get database statistics: {e}")
+            # Return empty stats instead of raising
+            return IndexStats(
+                total_files=0,
+                total_chunks=0,
+                languages={},
+                file_types={},
+                index_size_mb=0.0,
+                last_updated="error",
+                embedding_model="unknown",
+            )

     async def remove_file_chunks(self, file_path: str) -> int:
         """Remove all chunks for a specific file using pooled connection."""
mcp_vector_search/core/indexer.py
CHANGED

@@ -57,6 +57,11 @@ class SemanticIndexer:
             project_root / ".mcp-vector-search" / "index_metadata.json"
         )

+        # Add cache for indexable files to avoid repeated filesystem scans
+        self._indexable_files_cache: list[Path] | None = None
+        self._cache_timestamp: float = 0
+        self._cache_ttl: float = 60.0  # 60 second TTL
+
         # Initialize gitignore parser
         try:
             self.gitignore_parser = create_gitignore_parser(project_root)
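These attributes back a simple time-based cache: the scanned file list is reused for 60 seconds before the tree is walked again. A small sketch of the idea in isolation (the `CachedScanner` class and its `rglob` stand-in scan are hypothetical, not the package's implementation):

```python
import time
from pathlib import Path


class CachedScanner:
    """Reuse the result of an expensive scan for a fixed time window."""

    def __init__(self, ttl: float = 60.0) -> None:
        self._cache: list[Path] | None = None
        self._cache_timestamp: float = 0.0
        self._ttl = ttl

    def files(self) -> list[Path]:
        now = time.time()
        if self._cache is not None and now - self._cache_timestamp < self._ttl:
            return self._cache  # still fresh: skip the filesystem walk
        self._cache = sorted(Path.cwd().rglob("*.py"))  # stand-in for the real scan
        self._cache_timestamp = now
        return self._cache
```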
@@ -334,38 +339,120 @@ class SemanticIndexer:
             return 0

     def _find_indexable_files(self) -> list[Path]:
-        """Find all files that should be indexed.
+        """Find all files that should be indexed with caching.

         Returns:
             List of file paths to index
         """
+        import time
+
+        # Check cache
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Rebuild cache using efficient directory filtering
+        logger.debug("Rebuilding indexable files cache...")
+        indexable_files = self._scan_files_sync()
+
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Rebuilt indexable files cache ({len(indexable_files)} files)")
+
+        return self._indexable_files_cache
+
+    def _scan_files_sync(self) -> list[Path]:
+        """Synchronous file scanning (runs in thread pool).
+
+        Uses os.walk with directory filtering to avoid traversing ignored directories.
+
+        Returns:
+            List of indexable file paths
+        """
         indexable_files = []

-        for … (old loop not shown)
+        # Use os.walk for efficient directory traversal with early filtering
+        for root, dirs, files in os.walk(self.project_root):
+            root_path = Path(root)
+
+            # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
+            # This is much more efficient than checking every file in ignored directories
+            # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
+            dirs[:] = [d for d in dirs if not self._should_ignore_path(root_path / d, is_directory=True)]
+
+            # Check each file in the current directory
+            # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
+            for filename in files:
+                file_path = root_path / filename
+                if self._should_index_file(file_path, skip_file_check=True):
+                    indexable_files.append(file_path)

-        return …
+        return indexable_files

-    def …
+    async def _find_indexable_files_async(self) -> list[Path]:
+        """Find all files asynchronously without blocking event loop.
+
+        Returns:
+            List of file paths to index
+        """
+        import time
+        from concurrent.futures import ThreadPoolExecutor
+
+        # Check cache first
+        current_time = time.time()
+        if (
+            self._indexable_files_cache is not None
+            and current_time - self._cache_timestamp < self._cache_ttl
+        ):
+            logger.debug(
+                f"Using cached indexable files ({len(self._indexable_files_cache)} files)"
+            )
+            return self._indexable_files_cache
+
+        # Run filesystem scan in thread pool to avoid blocking
+        logger.debug("Scanning files in background thread...")
+        loop = asyncio.get_running_loop()
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            indexable_files = await loop.run_in_executor(
+                executor, self._scan_files_sync
+            )
+
+        # Update cache
+        self._indexable_files_cache = sorted(indexable_files)
+        self._cache_timestamp = current_time
+        logger.debug(f"Found {len(indexable_files)} indexable files")
+
+        return self._indexable_files_cache
+
+    def _should_index_file(self, file_path: Path, skip_file_check: bool = False) -> bool:
         """Check if a file should be indexed.

         Args:
             file_path: Path to check
+            skip_file_check: Skip is_file() check if caller knows it's a file (optimization)

         Returns:
             True if file should be indexed
         """
-        # … (old checks not shown)
+        # PERFORMANCE: Check file extension FIRST (cheapest operation, no I/O)
+        # This eliminates most files without any filesystem calls
+        if file_path.suffix.lower() not in self.file_extensions:
             return False

-        # … (old checks not shown)
+        # PERFORMANCE: Only check is_file() if not coming from os.walk
+        # os.walk already guarantees files, so we skip this expensive check
+        if not skip_file_check and not file_path.is_file():
             return False

         # Check if path should be ignored
-        … (old check not shown)
+        # PERFORMANCE: Pass is_directory=False to skip stat() call (we know it's a file)
+        if self._should_ignore_path(file_path, is_directory=False):
             return False

         # Check file size (skip very large files)
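The rewritten scan prunes ignored directories by mutating `dirs` in place, so `os.walk` never descends into them, and it skips per-file `is_file()` checks because `os.walk` already separates files from directories. A minimal sketch of the pruning pattern (the `IGNORED_DIRS` set stands in for the gitignore-aware check used by the package):

```python
import os
from pathlib import Path

IGNORED_DIRS = {".git", "node_modules", ".mcp-vector-search"}  # illustrative only


def scan(root: Path, extensions: set[str]) -> list[Path]:
    """Walk a tree while pruning ignored directories before descending into them."""
    found: list[Path] = []
    for dirpath, dirnames, filenames in os.walk(root):
        # In-place filtering: os.walk will not recurse into the removed entries.
        dirnames[:] = [d for d in dirnames if d not in IGNORED_DIRS]
        for name in filenames:
            path = Path(dirpath) / name
            if path.suffix.lower() in extensions:  # cheap string check, no stat()
                found.append(path)
    return found


print(len(scan(Path("."), {".py"})))
```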
@@ -379,18 +466,20 @@ class SemanticIndexer:

         return True

-    def _should_ignore_path(self, file_path: Path) -> bool:
+    def _should_ignore_path(self, file_path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.

         Args:
             file_path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)

         Returns:
             True if path should be ignored
         """
         try:
             # First check gitignore rules if available
-            … (old check not shown)
+            # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path, is_directory=is_directory):
                 logger.debug(f"Path ignored by .gitignore: {file_path}")
                 return True

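The `is_directory` hint lets the ignore check skip a `stat()` call per path when the caller already knows the answer, as `os.walk` does through its `dirs`/`files` split. A generic sketch of the hint pattern (the `is_ignored` function and its single rule are illustrative, not the package's gitignore parser):

```python
from pathlib import Path


def is_ignored(path: Path, is_directory: bool | None = None) -> bool:
    """Directory-only ignore rule; stat() only when the caller gave no hint."""
    if is_directory is None:
        is_directory = path.is_dir()  # fallback costs one filesystem call
    # Illustrative rule: ignore any directory named "build".
    return is_directory and path.name == "build"


print(is_ignored(Path("build"), is_directory=True))  # decided without touching the filesystem
print(is_ignored(Path("build")))                     # falls back to path.is_dir()
```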
@@ -532,8 +621,8 @@ class SemanticIndexer:
         # Get database stats
         db_stats = await self.database.get_stats()

-        # Count indexable files
-        indexable_files = self. …
+        # Count indexable files asynchronously without blocking
+        indexable_files = await self._find_indexable_files_async()

         return {
             "total_indexable_files": len(indexable_files),
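`get_indexing_stats` now awaits `_find_indexable_files_async`, which runs the blocking `os.walk` scan on a worker thread so the event loop stays responsive. The underlying asyncio pattern, with a placeholder blocking function:

```python
import asyncio
import time
from concurrent.futures import ThreadPoolExecutor


def blocking_scan() -> int:
    """Stand-in for a slow, synchronous filesystem walk."""
    time.sleep(0.5)
    return 1234


async def main() -> None:
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=1) as executor:
        # The event loop keeps servicing other tasks while the scan runs in the worker thread.
        count = await loop.run_in_executor(executor, blocking_scan)
    print(f"found {count} files")


asyncio.run(main())
```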
@@ -553,3 +642,90 @@ class SemanticIndexer:
             "indexed_files": 0,
             "total_chunks": 0,
         }
+
+    async def get_files_to_index(
+        self, force_reindex: bool = False
+    ) -> tuple[list[Path], list[Path]]:
+        """Get all indexable files and those that need indexing.
+
+        Args:
+            force_reindex: Whether to force reindex of all files
+
+        Returns:
+            Tuple of (all_indexable_files, files_to_index)
+        """
+        # Find all indexable files
+        all_files = await self._find_indexable_files_async()
+
+        if not all_files:
+            return [], []
+
+        # Load existing metadata for incremental indexing
+        metadata = self._load_index_metadata()
+
+        # Filter files that need indexing
+        if force_reindex:
+            files_to_index = all_files
+            logger.info(f"Force reindex: processing all {len(files_to_index)} files")
+        else:
+            files_to_index = [
+                f for f in all_files if self._needs_reindexing(f, metadata)
+            ]
+            logger.info(
+                f"Incremental index: {len(files_to_index)} of {len(all_files)} files need updating"
+            )
+
+        return all_files, files_to_index
+
+    async def index_files_with_progress(
+        self,
+        files_to_index: list[Path],
+        force_reindex: bool = False,
+    ):
+        """Index files and yield progress updates for each file.
+
+        Args:
+            files_to_index: List of file paths to index
+            force_reindex: Whether to force reindexing
+
+        Yields:
+            Tuple of (file_path, chunks_added, success) for each processed file
+        """
+        metadata = self._load_index_metadata()
+
+        # Process files in batches for better memory management
+        for i in range(0, len(files_to_index), self.batch_size):
+            batch = files_to_index[i : i + self.batch_size]
+
+            # Process each file in the batch
+            for file_path in batch:
+                chunks_added = 0
+                success = False
+
+                try:
+                    # Always remove existing chunks when reindexing
+                    await self.database.delete_by_file(file_path)
+
+                    # Parse file into chunks
+                    chunks = await self._parse_file(file_path)
+
+                    if chunks:
+                        # Add chunks to database
+                        await self.database.add_chunks(chunks)
+                        chunks_added = len(chunks)
+                        logger.debug(f"Indexed {chunks_added} chunks from {file_path}")
+
+                    success = True
+
+                    # Update metadata after successful indexing
+                    metadata[str(file_path)] = os.path.getmtime(file_path)
+
+                except Exception as e:
+                    logger.error(f"Failed to index file {file_path}: {e}")
+                    success = False
+
+                # Yield progress update
+                yield (file_path, chunks_added, success)
+
+        # Save metadata at the end
+        self._save_index_metadata(metadata)
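`index_files_with_progress` is an async generator: it yields a `(file_path, chunks_added, success)` tuple per file so a caller can drive a progress display without waiting for the whole run to finish. A hedged sketch of how a caller might consume it (the `indexer` argument and the print-based progress handling are illustrative, not the package's actual CLI code):

```python
import asyncio
from pathlib import Path


async def run_indexing(indexer, files_to_index: list[Path]) -> None:
    """Drive the async generator and report per-file progress (illustrative caller)."""
    done = 0
    total_chunks = 0
    async for file_path, chunks_added, success in indexer.index_files_with_progress(files_to_index):
        done += 1
        total_chunks += chunks_added
        marker = "ok" if success else "FAILED"
        print(f"[{done}/{len(files_to_index)}] {marker} {file_path} (+{chunks_added} chunks)")
    print(f"Indexed {total_chunks} chunks from {done} files")
```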
mcp_vector_search/core/project.py
CHANGED

@@ -281,24 +281,27 @@ class ProjectManager:
                 continue

             # Skip ignored patterns
-            … (old check not shown)
+            # PERFORMANCE: Pass is_directory=False since we already checked is_file()
+            if self._should_ignore_path(path, is_directory=False):
                 continue

             files.append(path)

         return files

-    def _should_ignore_path(self, path: Path) -> bool:
+    def _should_ignore_path(self, path: Path, is_directory: bool | None = None) -> bool:
         """Check if a path should be ignored.

         Args:
             path: Path to check
+            is_directory: Optional hint if path is a directory (avoids filesystem check)

         Returns:
             True if path should be ignored
         """
         # First check gitignore rules if available
-        … (old check not shown)
+        # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
+        if self.gitignore_parser and self.gitignore_parser.is_ignored(path, is_directory=is_directory):
             return True

         # Check if any parent directory is in ignore patterns