mcp-vector-search 0.12.0__py3-none-any.whl → 0.12.1__py3-none-any.whl
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/index.py +15 -24
- mcp_vector_search/cli/commands/install.py +502 -523
- mcp_vector_search/cli/commands/install_old.py +696 -0
- mcp_vector_search/cli/commands/status.py +7 -5
- mcp_vector_search/cli/commands/uninstall.py +485 -0
- mcp_vector_search/cli/commands/visualize.py +406 -120
- mcp_vector_search/cli/didyoumean.py +10 -0
- mcp_vector_search/cli/main.py +39 -21
- mcp_vector_search/core/connection_pool.py +49 -11
- mcp_vector_search/core/database.py +7 -9
- mcp_vector_search/core/directory_index.py +26 -11
- mcp_vector_search/core/indexer.py +89 -29
- mcp_vector_search/core/models.py +4 -1
- mcp_vector_search/core/project.py +16 -5
- mcp_vector_search/parsers/base.py +54 -18
- mcp_vector_search/parsers/javascript.py +41 -20
- mcp_vector_search/parsers/python.py +19 -11
- mcp_vector_search/parsers/registry.py +3 -2
- mcp_vector_search/utils/gitignore.py +3 -1
- {mcp_vector_search-0.12.0.dist-info → mcp_vector_search-0.12.1.dist-info}/METADATA +87 -24
- {mcp_vector_search-0.12.0.dist-info → mcp_vector_search-0.12.1.dist-info}/RECORD +25 -23
- {mcp_vector_search-0.12.0.dist-info → mcp_vector_search-0.12.1.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.12.0.dist-info → mcp_vector_search-0.12.1.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.12.0.dist-info → mcp_vector_search-0.12.1.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/cli/didyoumean.py
CHANGED

@@ -57,6 +57,11 @@ class EnhancedDidYouMeanTyper(typer.Typer):
         # Get the underlying click group
         click_group = super().__call__(*args, **kwargs)
 
+        # If click_group is None (command already executed), return None
+        # This happens after command execution completes successfully
+        if click_group is None:
+            return None
+
         # Create enhanced DYM group with original group's properties
         enhanced_group = EnhancedDidYouMeanGroup(
             name=click_group.name,
@@ -161,6 +166,11 @@ def enhance_existing_typer(app: typer.Typer) -> typer.Typer:
         """Enhanced call that uses EnhancedDidYouMeanGroup."""
         click_group = original_call(*args, **kwargs)
 
+        # If click_group is None (command already executed), return None
+        # This happens after command execution completes successfully
+        if click_group is None:
+            return None
+
         # Create enhanced group
         enhanced_group = EnhancedDidYouMeanGroup(
             name=click_group.name,
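Both hunks add the same guard: the wrapped call can return None once the command has already executed, so later attribute access such as click_group.name must be skipped. A minimal standalone sketch of the pattern (the guard_group name is illustrative, not from the package):

def guard_group(get_group):
    """Wrap a group-returning call so a post-execution None is passed
    through instead of crashing on attribute access (e.g. .name)."""

    def wrapper(*args, **kwargs):
        click_group = get_group(*args, **kwargs)
        if click_group is None:
            # Command already ran to completion; nothing left to enhance.
            return None
        # Safe to read click_group.name, click_group.commands, etc. here.
        return click_group

    return wrapper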
mcp_vector_search/cli/main.py
CHANGED
@@ -33,13 +33,15 @@ unfamiliar codebases, finding similar patterns, and integrating with AI tools.
 3. Check status: [green]mcp-vector-search status[/green]
 
 [bold cyan]Main Commands:[/bold cyan]
-
+  install    📦 Install project and MCP integrations
+  uninstall  🗑️ Remove MCP integrations
+  init       🔧 Initialize project (simple)
   demo       🎬 Run interactive demo
   doctor     🩺 Check system health
   status     📊 Show project status
   search     🔍 Search code semantically
   index      📇 Index codebase
-  mcp
+  mcp        🔌 MCP server operations
   config     ⚙️ Configure settings
   visualize  📊 Visualize code relationships
   help       ❓ Get help
@@ -56,48 +58,61 @@ from .commands.config import config_app  # noqa: E402
 from .commands.demo import demo_app  # noqa: E402
 from .commands.index import index_app  # noqa: E402
 from .commands.init import init_app  # noqa: E402
+from .commands.install import install_app  # noqa: E402
 from .commands.mcp import mcp_app  # noqa: E402
 from .commands.search import search_app, search_main  # noqa: E402, F401
 from .commands.status import main as status_main  # noqa: E402
+from .commands.uninstall import uninstall_app  # noqa: E402
 from .commands.visualize import app as visualize_app  # noqa: E402
 
 # ============================================================================
 # MAIN COMMANDS - Clean hierarchy
 # ============================================================================
 
-# 1.
+# 1. INSTALL - Install project and MCP integrations (NEW!)
+app.add_typer(
+    install_app, name="install", help="📦 Install project and MCP integrations"
+)
+
+# 2. UNINSTALL - Remove MCP integrations (NEW!)
+app.add_typer(uninstall_app, name="uninstall", help="🗑️ Remove MCP integrations")
+app.add_typer(uninstall_app, name="remove", help="🗑️ Remove MCP integrations (alias)")
+
+# 3. INIT - Initialize project (simplified)
 # Use Typer group for init to support both direct call and subcommands
 app.add_typer(init_app, name="init", help="🔧 Initialize project for semantic search")
 
-#
+# 4. DEMO - Interactive demo
 app.add_typer(demo_app, name="demo", help="🎬 Run interactive demo with sample project")
 
-#
+# 5. DOCTOR - System health check
 # (defined below inline)
 
-#
+# 6. STATUS - Project status
 app.command("status", help="📊 Show project status and statistics")(status_main)
 
-#
+# 7. SEARCH - Search code
 # Register search as both a command and a typer group
 app.add_typer(search_app, name="search", help="🔍 Search code semantically")
 
-#
+# 8. INDEX - Index codebase
 app.add_typer(index_app, name="index", help="📇 Index codebase for semantic search")
 
-#
-app.add_typer(mcp_app, name="mcp", help="
+# 9. MCP - MCP server operations (RESERVED for server ops only!)
+app.add_typer(mcp_app, name="mcp", help="🔌 MCP server operations")
 
-#
+# 10. CONFIG - Configuration
 app.add_typer(config_app, name="config", help="⚙️ Manage project configuration")
 
-#
-app.add_typer(
+# 11. VISUALIZE - Code graph visualization
+app.add_typer(
+    visualize_app, name="visualize", help="📊 Visualize code chunk relationships"
+)
 
-#
+# 12. HELP - Enhanced help
 # (defined below inline)
 
-#
+# 13. VERSION - Version info
 # (defined below inline)
 
 
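Per the registrations above, "uninstall" and "remove" are bound to the same sub-app, so both spellings should behave identically. An illustrative check using Typer's test runner (importing `app` from mcp_vector_search.cli.main is an assumption based on the diff context; typer.testing.CliRunner is standard Typer API):

from typer.testing import CliRunner
from mcp_vector_search.cli.main import app  # assumed: the Typer root shown above

runner = CliRunner()
# Both names are registered against the same uninstall_app sub-app.
for name in ("uninstall", "remove"):
    result = runner.invoke(app, [name, "--help"])
    print(name, result.exit_code)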
@@ -120,11 +135,9 @@ def _deprecated_command(old_cmd: str, new_cmd: str):
     return wrapper
 
 
-#
-
-
-    """[DEPRECATED] Use 'init' instead."""
-    _deprecated_command("install", "init")()
+# NOTE: 'install' command is now the primary command for project installation
+# Old 'install' was deprecated in favor of 'init' in v0.7.0
+# Now 'install' is back as the hierarchical installation command in v0.13.0
 
 
 # Deprecated: find -> search
@@ -432,7 +445,12 @@ def cli_with_suggestions():
     except Exception as e:
         # For other exceptions, show error and exit if verbose logging is enabled
         # Suppress internal framework errors in normal operation
-
+
+        # Suppress harmless didyoumean framework AttributeError (known issue)
+        # This occurs during Click/Typer cleanup after successful command completion
+        if isinstance(e, AttributeError) and "attribute" in str(e) and "name" in str(e):
+            pass  # Ignore - this is a harmless framework cleanup error
+        elif "--verbose" in sys.argv or "-v" in sys.argv:
             click.echo(f"Unexpected error: {e}", err=True)
             sys.exit(1)
         # Otherwise, just exit silently to avoid confusing error messages
mcp_vector_search/core/connection_pool.py
CHANGED

@@ -24,6 +24,16 @@ class PooledConnection:
     in_use: bool = False
     use_count: int = 0
 
+    @property
+    def age(self) -> float:
+        """Get the age of this connection in seconds."""
+        return time.time() - self.created_at
+
+    @property
+    def idle_time(self) -> float:
+        """Get the idle time of this connection in seconds."""
+        return time.time() - self.last_used
+
 
 class ChromaConnectionPool:
     """Connection pool for ChromaDB operations."""
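Both properties compute deltas against the dataclass's existing created_at and last_used timestamps. A self-contained sketch of how an eviction policy could use them (the threshold values are illustrative assumptions, not values from the package):

import time
from dataclasses import dataclass, field

@dataclass
class PooledConnection:  # trimmed to the fields the new properties rely on
    created_at: float = field(default_factory=time.time)
    last_used: float = field(default_factory=time.time)

    @property
    def age(self) -> float:
        return time.time() - self.created_at

    @property
    def idle_time(self) -> float:
        return time.time() - self.last_used

conn = PooledConnection()
# Hypothetical eviction rule built on the new properties:
expired = conn.age > 3600 or conn.idle_time > 300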
@@ -209,18 +219,18 @@ class ChromaConnectionPool:
                 logger.debug(f"Created new connection (pool size: {len(self._pool)})")
                 return conn
 
-
-
-
-                "Connection pool exhausted, waiting for available connection"
-            )
+            # Pool is full, wait for a connection to become available (outside lock)
+            self._stats["pool_misses"] += 1
+            logger.warning("Connection pool exhausted, waiting for available connection")
 
-
-
-
+        # Wait for a connection (with timeout) - release lock during wait
+        timeout = 30.0  # 30 seconds
+        start_time = time.time()
 
-
-
+        while time.time() - start_time < timeout:
+            await asyncio.sleep(0.1)
+            # Re-acquire lock to check for available connections
+            async with self._lock:
                 for conn in self._pool:
                     if not conn.in_use and self._is_connection_valid(conn):
                         conn.in_use = True
@@ -229,7 +239,7 @@ class ChromaConnectionPool:
                         self._stats["connections_reused"] += 1
                         return conn
 
-
+        raise DatabaseError("Connection pool timeout: no connections available")
 
     async def _release_connection(self, conn: PooledConnection) -> None:
         """Release a connection back to the pool."""
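Together, the two hunks above replace the old exhaustion path with a poll-and-timeout acquisition: on a full pool the lock is released, the coroutine sleeps in 100 ms steps, re-takes the lock on each pass to look for a freed connection, and raises after 30 s. A minimal self-contained sketch of that pattern (generic names, not the package's API):

import asyncio
import time

async def acquire_with_timeout(pool: list, lock: asyncio.Lock, timeout: float = 30.0):
    """Poll a shared pool for a free slot, re-taking the lock on each pass."""
    start = time.time()
    while time.time() - start < timeout:
        await asyncio.sleep(0.1)  # yield while waiting; lock stays released
        async with lock:
            for item in pool:
                if not item.get("in_use"):
                    item["in_use"] = True
                    return item
    raise TimeoutError("no connections available")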
@@ -320,3 +330,31 @@ class ChromaConnectionPool:
         except Exception as e:
             logger.error(f"Connection pool health check failed: {e}")
             return False
+
+    # Backward compatibility aliases for old test API
+    async def cleanup(self) -> None:
+        """Alias for close() method (backward compatibility)."""
+        await self.close()
+
+    def _validate_connection(self, conn: PooledConnection) -> bool:
+        """Alias for _is_connection_valid() method (backward compatibility)."""
+        return self._is_connection_valid(conn)
+
+    async def _cleanup_idle_connections(self) -> None:
+        """Alias for _cleanup_expired_connections() method (backward compatibility)."""
+        await self._cleanup_expired_connections()
+
+    @property
+    def _connections(self) -> list[PooledConnection]:
+        """Alias for _pool attribute (backward compatibility)."""
+        return self._pool
+
+    @property
+    def _max_connections(self) -> int:
+        """Alias for max_connections attribute (backward compatibility)."""
+        return self.max_connections
+
+    @property
+    def _min_connections(self) -> int:
+        """Alias for min_connections attribute (backward compatibility)."""
+        return self.min_connections
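The aliases keep pre-0.12.1 test code working against the renamed internals. An illustrative call site (how the pool is constructed is not shown in the diff; only the alias-to-target mapping comes from it):

async def drain(pool) -> None:
    # Old-style private API still works via the alias layer:
    for conn in pool._connections:  # property aliasing pool._pool
        pool._validate_connection(conn)  # -> pool._is_connection_valid(conn)
    await pool._cleanup_idle_connections()  # -> _cleanup_expired_connections()
    await pool.cleanup()  # -> pool.close()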
mcp_vector_search/core/database.py
CHANGED

@@ -501,13 +501,11 @@ class ChromaVectorDatabase(VectorDatabase):
 
         try:
             # Get all documents from collection
-            results = self._collection.get(
-                include=["metadatas", "documents"]
-            )
+            results = self._collection.get(include=["metadatas", "documents"])
 
             chunks = []
             if results and results.get("ids"):
-                for i,
+                for i, _chunk_id in enumerate(results["ids"]):
                     metadata = results["metadatas"][i]
                     content = results["documents"][i]
 
@@ -802,7 +800,9 @@ class PooledChromaVectorDatabase(VectorDatabase):
                 "decorators": json.dumps(chunk.decorators or []),
                 "parameters": json.dumps(chunk.parameters or []),
                 "return_type": chunk.return_type or "",
-                "type_annotations": json.dumps(
+                "type_annotations": json.dumps(
+                    chunk.type_annotations or {}
+                ),
                 # Monorepo support
                 "subproject_name": chunk.subproject_name or "",
                 "subproject_path": chunk.subproject_path or "",
@@ -1031,13 +1031,11 @@ class PooledChromaVectorDatabase(VectorDatabase):
             try:
                 async with self._pool.get_connection() as conn:
                     # Get all documents from collection
-                    results = conn.collection.get(
-                        include=["metadatas", "documents"]
-                    )
+                    results = conn.collection.get(include=["metadatas", "documents"])
 
                     chunks = []
                     if results and results.get("ids"):
-                        for i,
+                        for i, _chunk_id in enumerate(results["ids"]):
                             metadata = results["metadatas"][i]
                             content = results["documents"][i]
 
mcp_vector_search/core/directory_index.py
CHANGED

@@ -22,7 +22,9 @@ class DirectoryIndex:
         self.index_path = index_path
         self.directories: dict[str, Directory] = {}  # path -> Directory
         self.file_to_directory: dict[str, str] = {}  # file_path -> directory_path
-        self.directory_files: dict[str, list[str]] = defaultdict(
+        self.directory_files: dict[str, list[str]] = defaultdict(
+            list
+        )  # dir_path -> [file_paths]
 
     def load(self) -> None:
         """Load directory index from disk."""
@@ -31,7 +33,7 @@ class DirectoryIndex:
             return
 
         try:
-            with open(self.index_path
+            with open(self.index_path) as f:
                 data = json.load(f)
 
                 # Load directories
@@ -138,7 +140,7 @@ class DirectoryIndex:
         parent_path_str = str(directory_path)
         subdirs = []
 
-        for
+        for _dir_path_str, directory in self.directories.items():
             if directory.parent_path and str(directory.parent_path) == parent_path_str:
                 subdirs.append(directory)
 
@@ -195,7 +197,12 @@ class DirectoryIndex:
             self.directory_files[dir_path]
         )
 
-    def rebuild_from_files(
+    def rebuild_from_files(
+        self,
+        file_paths: list[Path],
+        root_path: Path,
+        chunk_stats: dict[str, dict] | None = None,
+    ) -> None:
         """Rebuild directory index from list of files with statistics from chunks.
 
         Args:
@@ -210,7 +217,9 @@ class DirectoryIndex:
         # Track all unique directories and their statistics
         dir_set = set()
         dir_chunks = defaultdict(int)  # directory -> total chunks
-        dir_languages = defaultdict(
+        dir_languages = defaultdict(
+            lambda: defaultdict(int)
+        )  # directory -> {language: count}
         dir_modified = defaultdict(float)  # directory -> most recent modification time
 
         for file_path in file_paths:
@@ -227,12 +236,16 @@ class DirectoryIndex:
                 # Accumulate statistics up the directory tree
                 if chunk_stats and str(file_path) in chunk_stats:
                     stats = chunk_stats[str(file_path)]
-                    dir_chunks[current] += stats.get(
-                    if
-                    dir_languages[current][stats[
+                    dir_chunks[current] += stats.get("chunks", 0)
+                    if "language" in stats:
+                        dir_languages[current][stats["language"]] += stats.get(
+                            "chunks", 0
+                        )
                     # Track most recent modification time
-                    if
-                    dir_modified[current] = max(
+                    if "modified" in stats:
+                        dir_modified[current] = max(
+                            dir_modified.get(current, 0), stats["modified"]
+                        )
 
                 current = current.parent
 
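The aggregation loop above walks each file's ancestor directories and expects chunk_stats keyed by file path, with per-file "chunks", "language", and "modified" entries; the indexer hunk further down populates "modified" plus a placeholder "chunks" count. A hedged example of the expected shape (values are illustrative):

from pathlib import Path

# Shape consumed by rebuild_from_files (keys inferred from the diff):
chunk_stats = {
    str(Path("src/app.py")): {
        "chunks": 12,              # summed into dir_chunks up the tree
        "language": "python",      # feeds dir_languages[dir]["python"]
        "modified": 1717000000.0,  # unix mtime, max-reduced per directory
    },
}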
@@ -276,7 +289,9 @@ class DirectoryIndex:
             subdirs = self.get_subdirectories(directory.path)
             directory.subdirectory_count = len(subdirs)
 
-        logger.info(
+        logger.info(
+            f"Rebuilt directory index with {len(self.directories)} directories, {sum(dir_chunks.values())} total chunks"
+        )
 
     def get_stats(self) -> dict[str, Any]:
         """Get directory index statistics.
mcp_vector_search/core/indexer.py
CHANGED

@@ -17,7 +17,7 @@ from ..utils.monorepo import MonorepoDetector
 from .database import VectorDatabase
 from .directory_index import DirectoryIndex
 from .exceptions import ParsingError
-from .models import CodeChunk
+from .models import CodeChunk, IndexStats
 
 
 class SemanticIndexer:
@@ -179,8 +179,8 @@ class SemanticIndexer:
                     # For now, just track modification time
                     # Chunk counts will be aggregated from the database later if needed
                     chunk_stats[str(file_path)] = {
-
-
+                        "modified": mtime,
+                        "chunks": 1,  # Placeholder - real count from chunks
                     }
                 except OSError:
                     pass
@@ -197,6 +197,7 @@ class SemanticIndexer:
         except Exception as e:
             logger.error(f"Failed to update directory index: {e}")
             import traceback
+
             logger.debug(traceback.format_exc())
 
         logger.info(
@@ -351,8 +352,14 @@ class SemanticIndexer:
         chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
 
         # Debug: Check if hierarchy was built
-        methods_with_parents = sum(
-
+        methods_with_parents = sum(
+            1
+            for c in chunks_with_hierarchy
+            if c.chunk_type in ("method", "function") and c.parent_chunk_id
+        )
+        logger.debug(
+            f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
+        )
 
         # Add chunks to database
         await self.database.add_chunks(chunks_with_hierarchy)
@@ -443,7 +450,11 @@ class SemanticIndexer:
             # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
             # This is much more efficient than checking every file in ignored directories
             # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
-            dirs[:] = [
+            dirs[:] = [
+                d
+                for d in dirs
+                if not self._should_ignore_path(root_path / d, is_directory=True)
+            ]
 
             # Check each file in the current directory
             # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
@@ -489,7 +500,9 @@ class SemanticIndexer:
 
         return self._indexable_files_cache
 
-    def _should_index_file(
+    def _should_index_file(
+        self, file_path: Path, skip_file_check: bool = False
+    ) -> bool:
         """Check if a file should be indexed.
 
         Args:
@@ -525,7 +538,9 @@ class SemanticIndexer:
 
         return True
 
-    def _should_ignore_path(
+    def _should_ignore_path(
+        self, file_path: Path, is_directory: bool | None = None
+    ) -> bool:
         """Check if a path should be ignored.
 
         Args:
@@ -538,7 +553,9 @@ class SemanticIndexer:
         try:
             # First check gitignore rules if available
             # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
-            if self.gitignore_parser and self.gitignore_parser.is_ignored(
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(
+                file_path, is_directory=is_directory
+            ):
                 logger.debug(f"Path ignored by .gitignore: {file_path}")
                 return True
 
@@ -677,24 +694,34 @@ class SemanticIndexer:
             # If we can't parse versions, be safe and reindex
             return True
 
-    async def get_indexing_stats(self) -> dict:
+    async def get_indexing_stats(self, db_stats: IndexStats | None = None) -> dict:
         """Get statistics about the indexing process.
 
+        Args:
+            db_stats: Optional pre-fetched database stats to avoid duplicate queries
+
         Returns:
             Dictionary with indexing statistics
+
+        Note:
+            Uses database statistics only for performance on large projects.
+            Filesystem scanning would timeout on 100K+ file projects.
+            Pass db_stats parameter to avoid calling database.get_stats() twice.
         """
         try:
-            # Get database stats
-            db_stats
-
-            # Count indexable files asynchronously without blocking
-            indexable_files = await self._find_indexable_files_async()
+            # Get database stats if not provided (fast, no filesystem scan)
+            if db_stats is None:
+                db_stats = await self.database.get_stats()
 
+            # Use database stats for all file counts
+            # This avoids expensive filesystem scans on large projects
             return {
-                "total_indexable_files":
+                "total_indexable_files": db_stats.total_files,
                 "indexed_files": db_stats.total_files,
+                "total_files": db_stats.total_files,  # For backward compatibility
                 "total_chunks": db_stats.total_chunks,
                 "languages": db_stats.languages,
+                "file_types": db_stats.file_types,  # Include file type distribution
                 "file_extensions": list(self.file_extensions),
                 "ignore_patterns": list(self._ignore_patterns),
                 "parser_info": self.parser_registry.get_parser_info(),
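The new db_stats parameter lets a caller that already holds an IndexStats skip the second database.get_stats() round trip. An illustrative call site (the indexer instance and surrounding setup are assumed):

# Fetch stats once, reuse for both display and the indexing summary:
db_stats = await indexer.database.get_stats()
summary = await indexer.get_indexing_stats(db_stats=db_stats)
print(summary["total_files"], summary["total_chunks"])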
@@ -706,6 +733,7 @@ class SemanticIndexer:
                 "error": str(e),
                 "total_indexable_files": 0,
                 "indexed_files": 0,
+                "total_files": 0,
                 "total_chunks": 0,
             }
 
@@ -799,9 +827,14 @@ class SemanticIndexer:
 
         # Save error to error log file
         try:
-            error_log_path =
+            error_log_path = (
+                self.project_root
+                / ".mcp-vector-search"
+                / "indexing_errors.log"
+            )
             with open(error_log_path, "a", encoding="utf-8") as f:
                 from datetime import datetime
+
                 timestamp = datetime.now().isoformat()
                 f.write(f"[{timestamp}] {error_msg}\n")
         except Exception as log_err:
@@ -834,25 +867,38 @@ class SemanticIndexer:
 
         # Group chunks by type and name
         module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
-        class_chunks = [
-
+        class_chunks = [
+            c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
+        ]
+        function_chunks = [
+            c for c in chunks if c.chunk_type in ("function", "method", "constructor")
+        ]
 
         # DEBUG: Print what we have (if debug enabled)
         if self.debug:
             import sys
-
+
+            print(
+                f"\n[DEBUG] Building hierarchy: {len(module_chunks)} modules, {len(class_chunks)} classes, {len(function_chunks)} functions",
+                file=sys.stderr,
+            )
             if class_chunks:
-                print(
+                print(
+                    f"[DEBUG] Class names: {[c.class_name for c in class_chunks[:5]]}",
+                    file=sys.stderr,
+                )
             if function_chunks:
-                print(
+                print(
+                    f"[DEBUG] First 5 functions with class_name: {[(f.function_name, f.class_name) for f in function_chunks[:5]]}",
+                    file=sys.stderr,
+                )
 
         # Build relationships
         for func in function_chunks:
             if func.class_name:
                 # Find parent class
                 parent_class = next(
-                    (c for c in class_chunks if c.class_name == func.class_name),
-                    None
+                    (c for c in class_chunks if c.class_name == func.class_name), None
                 )
                 if parent_class:
                     func.parent_chunk_id = parent_class.chunk_id
@@ -861,8 +907,14 @@ class SemanticIndexer:
                     parent_class.child_chunk_ids.append(func.chunk_id)
                     if self.debug:
                         import sys
-
-
+
+                        print(
+                            f"[DEBUG] ✓ Linked '{func.function_name}' to class '{parent_class.class_name}'",
+                            file=sys.stderr,
+                        )
+                    logger.debug(
+                        f"Linked method '{func.function_name}' (ID: {func.chunk_id[:8]}) to class '{parent_class.class_name}' (ID: {parent_class.chunk_id[:8]})"
+                    )
                 else:
                     # Top-level function
                     if not func.chunk_depth:
@@ -891,23 +943,31 @@ class SemanticIndexer:
         # DEBUG: Print summary
         if self.debug:
             import sys
+
             funcs_with_parents = sum(1 for f in function_chunks if f.parent_chunk_id)
             classes_with_parents = sum(1 for c in class_chunks if c.parent_chunk_id)
-            print(
+            print(
+                f"[DEBUG] Hierarchy built: {funcs_with_parents}/{len(function_chunks)} functions linked, {classes_with_parents}/{len(class_chunks)} classes linked\n",
+                file=sys.stderr,
+            )
 
         return chunks
 
     def _write_indexing_run_header(self) -> None:
         """Write version and timestamp header to error log at start of indexing run."""
         try:
-            error_log_path =
+            error_log_path = (
+                self.project_root / ".mcp-vector-search" / "indexing_errors.log"
+            )
             error_log_path.parent.mkdir(parents=True, exist_ok=True)
 
             with open(error_log_path, "a", encoding="utf-8") as f:
                 timestamp = datetime.now(UTC).isoformat()
                 separator = "=" * 80
                 f.write(f"\n{separator}\n")
-                f.write(
+                f.write(
+                    f"[{timestamp}] Indexing run started - mcp-vector-search v{__version__}\n"
+                )
                 f.write(f"{separator}\n")
         except Exception as e:
             logger.debug(f"Failed to write indexing run header: {e}")
mcp_vector_search/core/models.py
CHANGED
@@ -219,7 +219,9 @@ class Directory:
     languages: dict[str, int] = None  # Language distribution in this directory
     depth: int = 0  # Depth from project root (0 = root)
     is_package: bool = False  # True if contains __init__.py or package.json
-    last_modified: float | None =
+    last_modified: float | None = (
+        None  # Most recent file modification time (unix timestamp)
+    )
 
     def __post_init__(self) -> None:
         """Initialize default values and generate directory ID."""
@@ -230,6 +232,7 @@ class Directory:
     def id(self) -> str:
         """Generate unique ID for this directory."""
         import hashlib
+
        return hashlib.sha256(str(self.path).encode()).hexdigest()[:16]
 
     def to_dict(self) -> dict[str, Any]: