mcp-vector-search 0.9.3__py3-none-any.whl → 0.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/index.py +44 -22
- mcp_vector_search/cli/commands/install.py +502 -523
- mcp_vector_search/cli/commands/install_old.py +696 -0
- mcp_vector_search/cli/commands/status.py +7 -5
- mcp_vector_search/cli/commands/uninstall.py +485 -0
- mcp_vector_search/cli/commands/visualize.py +677 -53
- mcp_vector_search/cli/didyoumean.py +10 -0
- mcp_vector_search/cli/main.py +39 -21
- mcp_vector_search/core/connection_pool.py +49 -11
- mcp_vector_search/core/database.py +61 -28
- mcp_vector_search/core/directory_index.py +318 -0
- mcp_vector_search/core/indexer.py +146 -19
- mcp_vector_search/core/models.py +61 -0
- mcp_vector_search/core/project.py +16 -5
- mcp_vector_search/parsers/base.py +54 -18
- mcp_vector_search/parsers/javascript.py +41 -20
- mcp_vector_search/parsers/python.py +19 -11
- mcp_vector_search/parsers/registry.py +3 -2
- mcp_vector_search/utils/gitignore.py +17 -5
- mcp_vector_search/visualization/index.html +658 -0
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.1.dist-info}/METADATA +87 -24
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.1.dist-info}/RECORD +26 -22
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.1.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.1.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.1.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/directory_index.py
ADDED

```diff
@@ -0,0 +1,318 @@
+"""Directory index for tracking project structure and file relationships."""
+
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+from loguru import logger
+
+from .models import Directory
+
+
+class DirectoryIndex:
+    """Manages directory structure and file-directory relationships."""
+
+    def __init__(self, index_path: Path) -> None:
+        """Initialize directory index.
+
+        Args:
+            index_path: Path to directory index file (JSON)
+        """
+        self.index_path = index_path
+        self.directories: dict[str, Directory] = {}  # path -> Directory
+        self.file_to_directory: dict[str, str] = {}  # file_path -> directory_path
+        self.directory_files: dict[str, list[str]] = defaultdict(
+            list
+        )  # dir_path -> [file_paths]
+
+    def load(self) -> None:
+        """Load directory index from disk."""
+        if not self.index_path.exists():
+            logger.debug("No directory index found, starting fresh")
+            return
+
+        try:
+            with open(self.index_path) as f:
+                data = json.load(f)
+
+            # Load directories
+            for dir_data in data.get("directories", []):
+                directory = Directory.from_dict(dir_data)
+                self.directories[str(directory.path)] = directory
+
+            # Load file mappings
+            self.file_to_directory = data.get("file_to_directory", {})
+
+            # Rebuild directory_files from file_to_directory
+            self.directory_files = defaultdict(list)
+            for file_path, dir_path in self.file_to_directory.items():
+                self.directory_files[dir_path].append(file_path)
+
+            logger.info(f"Loaded {len(self.directories)} directories from index")
+
+        except Exception as e:
+            logger.error(f"Failed to load directory index: {e}")
+            self.directories = {}
+            self.file_to_directory = {}
+            self.directory_files = defaultdict(list)
+
+    def save(self) -> None:
+        """Save directory index to disk."""
+        try:
+            # Ensure parent directory exists
+            self.index_path.parent.mkdir(parents=True, exist_ok=True)
+
+            data = {
+                "directories": [d.to_dict() for d in self.directories.values()],
+                "file_to_directory": self.file_to_directory,
+            }
+
+            with open(self.index_path, "w") as f:
+                json.dump(data, f, indent=2)
+
+            logger.debug(f"Saved {len(self.directories)} directories to index")
+
+        except Exception as e:
+            logger.error(f"Failed to save directory index: {e}")
+            raise
+
+    def add_directory(self, directory: Directory) -> None:
+        """Add or update a directory in the index.
+
+        Args:
+            directory: Directory to add
+        """
+        dir_path = str(directory.path)
+        self.directories[dir_path] = directory
+
+    def add_file(self, file_path: Path, directory_path: Path) -> None:
+        """Associate a file with its directory.
+
+        Args:
+            file_path: Path to the file
+            directory_path: Path to the directory containing the file
+        """
+        file_path_str = str(file_path)
+        dir_path_str = str(directory_path)
+
+        self.file_to_directory[file_path_str] = dir_path_str
+        if file_path_str not in self.directory_files[dir_path_str]:
+            self.directory_files[dir_path_str].append(file_path_str)
+
+        # Update directory file count
+        if dir_path_str in self.directories:
+            self.directories[dir_path_str].file_count = len(
+                self.directory_files[dir_path_str]
+            )
+
+    def get_directory(self, directory_path: Path) -> Directory | None:
+        """Get directory by path.
+
+        Args:
+            directory_path: Path to directory
+
+        Returns:
+            Directory object or None if not found
+        """
+        return self.directories.get(str(directory_path))
+
+    def get_files_in_directory(self, directory_path: Path) -> list[str]:
+        """Get all files in a directory.
+
+        Args:
+            directory_path: Path to directory
+
+        Returns:
+            List of file paths in the directory
+        """
+        return self.directory_files.get(str(directory_path), [])
+
+    def get_subdirectories(self, directory_path: Path) -> list[Directory]:
+        """Get all immediate subdirectories.
+
+        Args:
+            directory_path: Path to parent directory
+
+        Returns:
+            List of subdirectory objects
+        """
+        parent_path_str = str(directory_path)
+        subdirs = []
+
+        for _dir_path_str, directory in self.directories.items():
+            if directory.parent_path and str(directory.parent_path) == parent_path_str:
+                subdirs.append(directory)
+
+        return subdirs
+
+    def get_root_directories(self) -> list[Directory]:
+        """Get all root-level directories (no parent).
+
+        Returns:
+            List of root directory objects
+        """
+        return [d for d in self.directories.values() if d.parent_path is None]
+
+    def delete_directory(self, directory_path: Path) -> None:
+        """Remove directory and its file associations.
+
+        Args:
+            directory_path: Path to directory to remove
+        """
+        dir_path_str = str(directory_path)
+
+        # Remove directory
+        if dir_path_str in self.directories:
+            del self.directories[dir_path_str]
+
+        # Remove file associations
+        if dir_path_str in self.directory_files:
+            for file_path in self.directory_files[dir_path_str]:
+                if file_path in self.file_to_directory:
+                    del self.file_to_directory[file_path]
+            del self.directory_files[dir_path_str]
+
+    def delete_file(self, file_path: Path) -> None:
+        """Remove file from directory associations.
+
+        Args:
+            file_path: Path to file to remove
+        """
+        file_path_str = str(file_path)
+
+        if file_path_str in self.file_to_directory:
+            dir_path = self.file_to_directory[file_path_str]
+            del self.file_to_directory[file_path_str]
+
+            # Remove from directory_files
+            if dir_path in self.directory_files:
+                self.directory_files[dir_path] = [
+                    f for f in self.directory_files[dir_path] if f != file_path_str
+                ]
+
+            # Update directory file count
+            if dir_path in self.directories:
+                self.directories[dir_path].file_count = len(
+                    self.directory_files[dir_path]
+                )
+
+    def rebuild_from_files(
+        self,
+        file_paths: list[Path],
+        root_path: Path,
+        chunk_stats: dict[str, dict] | None = None,
+    ) -> None:
+        """Rebuild directory index from list of files with statistics from chunks.
+
+        Args:
+            file_paths: List of file paths to index
+            root_path: Project root path
+            chunk_stats: Optional dict mapping file_path -> {'chunks': count, 'language': str}
+        """
+        self.directories = {}
+        self.file_to_directory = {}
+        self.directory_files = defaultdict(list)
+
+        # Track all unique directories and their statistics
+        dir_set = set()
+        dir_chunks = defaultdict(int)  # directory -> total chunks
+        dir_languages = defaultdict(
+            lambda: defaultdict(int)
+        )  # directory -> {language: count}
+        dir_modified = defaultdict(float)  # directory -> most recent modification time
+
+        for file_path in file_paths:
+            # Get relative path from root
+            try:
+                rel_path = file_path.relative_to(root_path)
+                parent_dir = rel_path.parent
+
+                # Add all parent directories up to root
+                current = parent_dir
+                while current != Path("."):
+                    dir_set.add(current)
+
+                    # Accumulate statistics up the directory tree
+                    if chunk_stats and str(file_path) in chunk_stats:
+                        stats = chunk_stats[str(file_path)]
+                        dir_chunks[current] += stats.get("chunks", 0)
+                        if "language" in stats:
+                            dir_languages[current][stats["language"]] += stats.get(
+                                "chunks", 0
+                            )
+                        # Track most recent modification time
+                        if "modified" in stats:
+                            dir_modified[current] = max(
+                                dir_modified.get(current, 0), stats["modified"]
+                            )
+
+                    current = current.parent
+
+                # Associate file with its direct parent
+                if parent_dir != Path("."):
+                    self.add_file(rel_path, parent_dir)
+
+            except ValueError:
+                # File not relative to root, skip
+                logger.warning(f"File {file_path} not under root {root_path}")
+                continue
+
+        # Create Directory objects for all directories
+        for dir_path in sorted(dir_set):
+            # Determine parent
+            parent_path = dir_path.parent if dir_path.parent != Path(".") else None
+
+            # Check if it's a package
+            is_package = False
+            full_dir_path = root_path / dir_path
+            if (full_dir_path / "__init__.py").exists():
+                is_package = True
+            elif (full_dir_path / "package.json").exists():
+                is_package = True
+
+            directory = Directory(
+                path=dir_path,
+                name=dir_path.name,
+                parent_path=parent_path,
+                depth=len(dir_path.parts),
+                is_package=is_package,
+                total_chunks=dir_chunks.get(dir_path, 0),
+                languages=dict(dir_languages.get(dir_path, {})),
+                last_modified=dir_modified.get(dir_path),
+            )
+
+            self.add_directory(directory)
+
+        # Update subdirectory counts
+        for directory in self.directories.values():
+            subdirs = self.get_subdirectories(directory.path)
+            directory.subdirectory_count = len(subdirs)
+
+        logger.info(
+            f"Rebuilt directory index with {len(self.directories)} directories, {sum(dir_chunks.values())} total chunks"
+        )
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get directory index statistics.
+
+        Returns:
+            Dictionary with statistics
+        """
+        return {
+            "total_directories": len(self.directories),
+            "total_files": len(self.file_to_directory),
+            "root_directories": len(self.get_root_directories()),
+            "packages": sum(1 for d in self.directories.values() if d.is_package),
+        }
+
+    def reset(self) -> None:
+        """Clear all directory data."""
+        self.directories = {}
+        self.file_to_directory = {}
+        self.directory_files = defaultdict(list)
+
+        if self.index_path.exists():
+            self.index_path.unlink()
+
+        logger.info("Directory index reset")
```
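The new `DirectoryIndex` keeps three mutually consistent structures (a `path -> Directory` map, a `file -> directory` map, and a `directory -> files` reverse map), persists the first two to JSON, and rebuilds the third on load. A minimal usage sketch, assuming the hypothetical project layout and chunk counts below (the calls themselves, `load`, `rebuild_from_files`, `save`, and `get_stats`, are the ones added in this file):

```python
from pathlib import Path

from mcp_vector_search.core.directory_index import DirectoryIndex

# Hypothetical project layout; any list of indexed files works here.
root = Path("/tmp/demo-project")
index = DirectoryIndex(root / ".mcp-vector-search" / "directory_index.json")
index.load()  # no-op on a first run: the JSON file does not exist yet

files = [root / "src" / "app.py", root / "src" / "utils" / "helpers.py"]
# Per-file stats are optional; the keys match what rebuild_from_files() reads.
chunk_stats = {
    str(files[0]): {"chunks": 4, "language": "python", "modified": 1_700_000_000.0},
    str(files[1]): {"chunks": 2, "language": "python", "modified": 1_700_000_100.0},
}
index.rebuild_from_files(files, root, chunk_stats=chunk_stats)
index.save()

print(index.get_stats())
# e.g. {'total_directories': 2, 'total_files': 2, 'root_directories': 1, 'packages': 0}
```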
mcp_vector_search/core/indexer.py
CHANGED

```diff
@@ -15,8 +15,9 @@ from ..parsers.registry import get_parser_registry
 from ..utils.gitignore import create_gitignore_parser
 from ..utils.monorepo import MonorepoDetector
 from .database import VectorDatabase
+from .directory_index import DirectoryIndex
 from .exceptions import ParsingError
-from .models import CodeChunk
+from .models import CodeChunk, IndexStats
 
 
 class SemanticIndexer:
@@ -29,6 +30,7 @@ class SemanticIndexer:
         file_extensions: list[str],
         max_workers: int | None = None,
         batch_size: int = 10,
+        debug: bool = False,
     ) -> None:
         """Initialize semantic indexer.
 
@@ -38,12 +40,14 @@ class SemanticIndexer:
             file_extensions: File extensions to index
             max_workers: Maximum number of worker threads for parallel processing
             batch_size: Number of files to process in each batch
+            debug: Enable debug output for hierarchy building
         """
         self.database = database
         self.project_root = project_root
         self.file_extensions = {ext.lower() for ext in file_extensions}
         self.parser_registry = get_parser_registry()
         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
+        self.debug = debug
 
         # Safely get event loop for max_workers
         try:
@@ -81,6 +85,13 @@ class SemanticIndexer:
         for sp in subprojects:
             logger.debug(f" - {sp.name} ({sp.relative_path})")
 
+        # Initialize directory index
+        self.directory_index = DirectoryIndex(
+            project_root / ".mcp-vector-search" / "directory_index.json"
+        )
+        # Load existing directory index
+        self.directory_index.load()
+
     async def index_project(
         self,
         force_reindex: bool = False,
@@ -156,6 +167,39 @@ class SemanticIndexer:
 
         self._save_index_metadata(metadata)
 
+        # Rebuild directory index from successfully indexed files
+        try:
+            logger.debug("Rebuilding directory index...")
+            # We don't have chunk counts here, but we have file modification times
+            # Build a simple stats dict with file mod times for recency tracking
+            chunk_stats = {}
+            for file_path in files_to_index:
+                try:
+                    mtime = os.path.getmtime(file_path)
+                    # For now, just track modification time
+                    # Chunk counts will be aggregated from the database later if needed
+                    chunk_stats[str(file_path)] = {
+                        "modified": mtime,
+                        "chunks": 1,  # Placeholder - real count from chunks
+                    }
+                except OSError:
+                    pass
+
+            self.directory_index.rebuild_from_files(
+                files_to_index, self.project_root, chunk_stats=chunk_stats
+            )
+            self.directory_index.save()
+            dir_stats = self.directory_index.get_stats()
+            logger.info(
+                f"Directory index updated: {dir_stats['total_directories']} directories, "
+                f"{dir_stats['total_files']} files"
+            )
+        except Exception as e:
+            logger.error(f"Failed to update directory index: {e}")
+            import traceback
+
+            logger.debug(traceback.format_exc())
+
         logger.info(
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )
@@ -307,6 +351,16 @@ class SemanticIndexer:
         # Build hierarchical relationships between chunks
         chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
 
+        # Debug: Check if hierarchy was built
+        methods_with_parents = sum(
+            1
+            for c in chunks_with_hierarchy
+            if c.chunk_type in ("method", "function") and c.parent_chunk_id
+        )
+        logger.debug(
+            f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents"
+        )
+
         # Add chunks to database
         await self.database.add_chunks(chunks_with_hierarchy)
 
@@ -396,7 +450,11 @@ class SemanticIndexer:
         # Filter out ignored directories IN-PLACE to prevent os.walk from traversing them
         # This is much more efficient than checking every file in ignored directories
         # PERFORMANCE: Pass is_directory=True hint to skip filesystem stat() calls
-        dirs[:] = [d for d in dirs if not self._should_ignore_path(root_path / d, is_directory=True)]
+        dirs[:] = [
+            d
+            for d in dirs
+            if not self._should_ignore_path(root_path / d, is_directory=True)
+        ]
 
         # Check each file in the current directory
         # PERFORMANCE: skip_file_check=True because os.walk guarantees these are files
```
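The in-place `dirs[:]` slice assignment is what makes the pruning work: `os.walk` keeps a reference to that exact list and only descends into the names left in it, so rebinding with `dirs = [...]` would filter nothing. A small self-contained illustration (the `SKIP` set is a stand-in for the real `_should_ignore_path` logic):

```python
import os

SKIP = {".git", "node_modules", "__pycache__"}  # stand-in for the real ignore rules

for root, dirs, files in os.walk("."):
    # Mutating the list in place prunes os.walk's traversal; a plain
    # rebinding (dirs = [...]) would leave the walk unchanged.
    dirs[:] = [d for d in dirs if d not in SKIP]
    for name in files:
        ...  # only files outside the skipped trees reach this point
```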
```diff
@@ -442,7 +500,9 @@ class SemanticIndexer:
 
         return self._indexable_files_cache
 
-    def _should_index_file(self, file_path: Path, skip_file_check: bool = False) -> bool:
+    def _should_index_file(
+        self, file_path: Path, skip_file_check: bool = False
+    ) -> bool:
         """Check if a file should be indexed.
 
         Args:
@@ -478,7 +538,9 @@ class SemanticIndexer:
 
         return True
 
-    def _should_ignore_path(self, file_path: Path, is_directory: bool | None = None) -> bool:
+    def _should_ignore_path(
+        self, file_path: Path, is_directory: bool | None = None
+    ) -> bool:
         """Check if a path should be ignored.
 
         Args:
@@ -491,7 +553,9 @@ class SemanticIndexer:
         try:
             # First check gitignore rules if available
             # PERFORMANCE: Pass is_directory hint to avoid redundant stat() calls
-            if self.gitignore_parser and self.gitignore_parser.is_ignored(file_path, is_directory=is_directory):
+            if self.gitignore_parser and self.gitignore_parser.is_ignored(
+                file_path, is_directory=is_directory
+            ):
                 logger.debug(f"Path ignored by .gitignore: {file_path}")
                 return True
 
@@ -630,24 +694,34 @@ class SemanticIndexer:
             # If we can't parse versions, be safe and reindex
             return True
 
-    async def get_indexing_stats(self) -> dict:
+    async def get_indexing_stats(self, db_stats: IndexStats | None = None) -> dict:
         """Get statistics about the indexing process.
 
+        Args:
+            db_stats: Optional pre-fetched database stats to avoid duplicate queries
+
         Returns:
             Dictionary with indexing statistics
+
+        Note:
+            Uses database statistics only for performance on large projects.
+            Filesystem scanning would timeout on 100K+ file projects.
+            Pass db_stats parameter to avoid calling database.get_stats() twice.
         """
         try:
-            # Get database stats
-            db_stats = await self.database.get_stats()
-
-            # Count indexable files asynchronously without blocking
-            indexable_files = await self._find_indexable_files_async()
+            # Get database stats if not provided (fast, no filesystem scan)
+            if db_stats is None:
+                db_stats = await self.database.get_stats()
 
+            # Use database stats for all file counts
+            # This avoids expensive filesystem scans on large projects
             return {
-                "total_indexable_files": len(indexable_files),
+                "total_indexable_files": db_stats.total_files,
                 "indexed_files": db_stats.total_files,
+                "total_files": db_stats.total_files,  # For backward compatibility
                 "total_chunks": db_stats.total_chunks,
                 "languages": db_stats.languages,
+                "file_types": db_stats.file_types,  # Include file type distribution
                 "file_extensions": list(self.file_extensions),
                 "ignore_patterns": list(self._ignore_patterns),
                 "parser_info": self.parser_registry.get_parser_info(),
```
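The new `db_stats` parameter exists so a caller that already fetched `IndexStats` (a status command, for example) can hand it in instead of triggering a second `database.get_stats()` query. A hedged sketch of such a call site, assuming an already-constructed indexer and database:

```python
# Sketch only: `indexer` and `database` are assumed to be an initialized
# SemanticIndexer and its VectorDatabase from the surrounding application.
async def show_status(indexer, database) -> None:
    db_stats = await database.get_stats()  # fetched once here...
    stats = await indexer.get_indexing_stats(db_stats=db_stats)  # ...reused here
    print(f"{stats['total_files']} files, {stats['total_chunks']} chunks")
```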
```diff
@@ -659,6 +733,7 @@ class SemanticIndexer:
                 "error": str(e),
                 "total_indexable_files": 0,
                 "indexed_files": 0,
+                "total_files": 0,
                 "total_chunks": 0,
             }
 
@@ -752,9 +827,14 @@ class SemanticIndexer:
 
         # Save error to error log file
         try:
-            error_log_path = self.project_root / ".mcp-vector-search" / "indexing_errors.log"
+            error_log_path = (
+                self.project_root
+                / ".mcp-vector-search"
+                / "indexing_errors.log"
+            )
             with open(error_log_path, "a", encoding="utf-8") as f:
                 from datetime import datetime
+
                 timestamp = datetime.now().isoformat()
                 f.write(f"[{timestamp}] {error_msg}\n")
         except Exception as log_err:
@@ -787,22 +867,54 @@ class SemanticIndexer:
 
         # Group chunks by type and name
         module_chunks = [c for c in chunks if c.chunk_type in ("module", "imports")]
-        class_chunks = [c for c in chunks if c.chunk_type in ("class", "interface", "mixin")]
-        function_chunks = [c for c in chunks if c.chunk_type in ("function", "method", "constructor")]
+        class_chunks = [
+            c for c in chunks if c.chunk_type in ("class", "interface", "mixin")
+        ]
+        function_chunks = [
+            c for c in chunks if c.chunk_type in ("function", "method", "constructor")
+        ]
+
+        # DEBUG: Print what we have (if debug enabled)
+        if self.debug:
+            import sys
+
+            print(
+                f"\n[DEBUG] Building hierarchy: {len(module_chunks)} modules, {len(class_chunks)} classes, {len(function_chunks)} functions",
+                file=sys.stderr,
+            )
+            if class_chunks:
+                print(
+                    f"[DEBUG] Class names: {[c.class_name for c in class_chunks[:5]]}",
+                    file=sys.stderr,
+                )
+            if function_chunks:
+                print(
+                    f"[DEBUG] First 5 functions with class_name: {[(f.function_name, f.class_name) for f in function_chunks[:5]]}",
+                    file=sys.stderr,
+                )
 
         # Build relationships
         for func in function_chunks:
             if func.class_name:
                 # Find parent class
                 parent_class = next(
-                    (c for c in class_chunks if c.class_name == func.class_name),
-                    None
+                    (c for c in class_chunks if c.class_name == func.class_name), None
                 )
                 if parent_class:
                     func.parent_chunk_id = parent_class.chunk_id
                     func.chunk_depth = parent_class.chunk_depth + 1
                     if func.chunk_id not in parent_class.child_chunk_ids:
                         parent_class.child_chunk_ids.append(func.chunk_id)
+                    if self.debug:
+                        import sys
+
+                        print(
+                            f"[DEBUG] ✓ Linked '{func.function_name}' to class '{parent_class.class_name}'",
+                            file=sys.stderr,
+                        )
+                    logger.debug(
+                        f"Linked method '{func.function_name}' (ID: {func.chunk_id[:8]}) to class '{parent_class.class_name}' (ID: {parent_class.chunk_id[:8]})"
+                    )
             else:
                 # Top-level function
                 if not func.chunk_depth:
```
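The linking pass matches each function chunk's `class_name` against the class chunks and wires up `parent_chunk_id`, `chunk_depth`, and `child_chunk_ids`. The standalone sketch below mirrors that logic with a simplified stand-in for `CodeChunk` (only the fields this pass touches):

```python
from dataclasses import dataclass, field

@dataclass
class Chunk:  # simplified stand-in for CodeChunk
    chunk_id: str
    chunk_type: str
    class_name: str | None = None
    function_name: str | None = None
    parent_chunk_id: str | None = None
    chunk_depth: int = 0
    child_chunk_ids: list[str] = field(default_factory=list)

chunks = [
    Chunk("c1", "class", class_name="Greeter"),
    Chunk("f1", "method", class_name="Greeter", function_name="greet"),
    Chunk("f2", "function", function_name="main"),
]

class_chunks = [c for c in chunks if c.chunk_type == "class"]
for func in (c for c in chunks if c.chunk_type in ("function", "method")):
    if func.class_name:
        parent = next((c for c in class_chunks if c.class_name == func.class_name), None)
        if parent:
            func.parent_chunk_id = parent.chunk_id
            func.chunk_depth = parent.chunk_depth + 1
            parent.child_chunk_ids.append(func.chunk_id)

assert chunks[1].parent_chunk_id == "c1"   # greet is nested under Greeter
assert chunks[2].parent_chunk_id is None   # main stays top-level
```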
```diff
@@ -828,19 +940,34 @@ class SemanticIndexer:
             if not mod.chunk_depth:
                 mod.chunk_depth = 0
 
+        # DEBUG: Print summary
+        if self.debug:
+            import sys
+
+            funcs_with_parents = sum(1 for f in function_chunks if f.parent_chunk_id)
+            classes_with_parents = sum(1 for c in class_chunks if c.parent_chunk_id)
+            print(
+                f"[DEBUG] Hierarchy built: {funcs_with_parents}/{len(function_chunks)} functions linked, {classes_with_parents}/{len(class_chunks)} classes linked\n",
+                file=sys.stderr,
+            )
+
         return chunks
 
     def _write_indexing_run_header(self) -> None:
         """Write version and timestamp header to error log at start of indexing run."""
         try:
-            error_log_path = self.project_root / ".mcp-vector-search" / "indexing_errors.log"
+            error_log_path = (
+                self.project_root / ".mcp-vector-search" / "indexing_errors.log"
+            )
             error_log_path.parent.mkdir(parents=True, exist_ok=True)
 
             with open(error_log_path, "a", encoding="utf-8") as f:
                 timestamp = datetime.now(UTC).isoformat()
                 separator = "=" * 80
                 f.write(f"\n{separator}\n")
-                f.write(f"[{timestamp}] Indexing run started - mcp-vector-search v{__version__}\n")
+                f.write(
+                    f"[{timestamp}] Indexing run started - mcp-vector-search v{__version__}\n"
+                )
                 f.write(f"{separator}\n")
         except Exception as e:
             logger.debug(f"Failed to write indexing run header: {e}")
```
mcp_vector_search/core/models.py
CHANGED

```diff
@@ -206,6 +206,67 @@ class IndexStats(BaseModel):
         }
 
 
+@dataclass
+class Directory:
+    """Represents a directory in the project structure."""
+
+    path: Path  # Relative path from project root
+    name: str  # Directory name
+    parent_path: Path | None = None  # Parent directory path (None for root)
+    file_count: int = 0  # Number of files directly in this directory
+    subdirectory_count: int = 0  # Number of subdirectories
+    total_chunks: int = 0  # Total code chunks in this directory (recursive)
+    languages: dict[str, int] = None  # Language distribution in this directory
+    depth: int = 0  # Depth from project root (0 = root)
+    is_package: bool = False  # True if contains __init__.py or package.json
+    last_modified: float | None = (
+        None  # Most recent file modification time (unix timestamp)
+    )
+
+    def __post_init__(self) -> None:
+        """Initialize default values and generate directory ID."""
+        if self.languages is None:
+            self.languages = {}
+
+    @property
+    def id(self) -> str:
+        """Generate unique ID for this directory."""
+        import hashlib
+
+        return hashlib.sha256(str(self.path).encode()).hexdigest()[:16]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for storage."""
+        return {
+            "path": str(self.path),
+            "name": self.name,
+            "parent_path": str(self.parent_path) if self.parent_path else None,
+            "file_count": self.file_count,
+            "subdirectory_count": self.subdirectory_count,
+            "total_chunks": self.total_chunks,
+            "languages": self.languages,
+            "depth": self.depth,
+            "is_package": self.is_package,
+            "last_modified": self.last_modified,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "Directory":
+        """Create from dictionary."""
+        return cls(
+            path=Path(data["path"]),
+            name=data["name"],
+            parent_path=Path(data["parent_path"]) if data.get("parent_path") else None,
+            file_count=data.get("file_count", 0),
+            subdirectory_count=data.get("subdirectory_count", 0),
+            total_chunks=data.get("total_chunks", 0),
+            languages=data.get("languages", {}),
+            depth=data.get("depth", 0),
+            is_package=data.get("is_package", False),
+            last_modified=data.get("last_modified"),
+        )
+
+
 class ProjectInfo(BaseModel):
     """Information about a project."""
 
```
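`to_dict()` and `from_dict()` are what `DirectoryIndex.save()` and `load()` round-trip through, and the `id` property derives a stable identifier from the relative path alone. A short round-trip sketch (values are illustrative):

```python
from pathlib import Path

from mcp_vector_search.core.models import Directory

d = Directory(
    path=Path("src/utils"),
    name="utils",
    parent_path=Path("src"),
    depth=2,
    is_package=True,
    languages={"python": 7},
)

restored = Directory.from_dict(d.to_dict())
assert restored.path == d.path and restored.languages == {"python": 7}

# 16 hex chars of SHA-256 over the relative path: identical across runs.
print(d.id)
```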