mcp-vector-search 0.9.3__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/index.py +31 -0
- mcp_vector_search/cli/commands/visualize.py +358 -20
- mcp_vector_search/core/database.py +55 -20
- mcp_vector_search/core/directory_index.py +303 -0
- mcp_vector_search/core/indexer.py +67 -0
- mcp_vector_search/core/models.py +58 -0
- mcp_vector_search/utils/gitignore.py +14 -4
- mcp_vector_search/visualization/index.html +658 -0
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.0.dist-info}/METADATA +1 -1
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.0.dist-info}/RECORD +14 -12
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.0.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.0.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.9.3.dist-info → mcp_vector_search-0.12.0.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py CHANGED

@@ -1,6 +1,7 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""
 
 import asyncio
+import json
 import shutil
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -273,16 +274,16 @@ class ChromaVectorDatabase(VectorDatabase):
             "class_name": chunk.class_name or "",
             "docstring": chunk.docstring or "",
             "complexity_score": chunk.complexity_score,
-            # Hierarchy fields
+            # Hierarchy fields (convert lists to JSON strings for ChromaDB)
             "chunk_id": chunk.chunk_id or "",
             "parent_chunk_id": chunk.parent_chunk_id or "",
-            "child_chunk_ids": chunk.child_chunk_ids or [],
+            "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
             "chunk_depth": chunk.chunk_depth,
-            # Additional metadata
-            "decorators": chunk.decorators or [],
-            "parameters": chunk.parameters or [],
+            # Additional metadata (convert lists/dicts to JSON strings)
+            "decorators": json.dumps(chunk.decorators or []),
+            "parameters": json.dumps(chunk.parameters or []),
             "return_type": chunk.return_type or "",
-            "type_annotations": chunk.type_annotations or {},
+            "type_annotations": json.dumps(chunk.type_annotations or {}),
             # Monorepo support
             "subproject_name": chunk.subproject_name or "",
             "subproject_path": chunk.subproject_path or "",
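Context for the hunk above: ChromaDB accepts only scalar metadata values (str, int, float, bool), so list- and dict-valued chunk fields have to be JSON-encoded before insert. A minimal sketch of the pattern, with a hypothetical helper name that is not part of the package:

    import json

    def encode_metadata(raw: dict) -> dict:
        """JSON-encode list/dict values so ChromaDB will accept the metadata."""
        return {
            key: json.dumps(value) if isinstance(value, (list, dict)) else value
            for key, value in raw.items()
        }

    meta = encode_metadata({"decorators": ["@staticmethod"], "chunk_depth": 1})
    assert meta["decorators"] == '["@staticmethod"]'  # stored as a JSON string
    assert meta["chunk_depth"] == 1                   # scalars pass through unchanged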
@@ -510,6 +511,23 @@ class ChromaVectorDatabase(VectorDatabase):
             metadata = results["metadatas"][i]
             content = results["documents"][i]
 
+            # Parse JSON strings back to lists/dicts
+            child_chunk_ids = metadata.get("child_chunk_ids", "[]")
+            if isinstance(child_chunk_ids, str):
+                child_chunk_ids = json.loads(child_chunk_ids)
+
+            decorators = metadata.get("decorators", "[]")
+            if isinstance(decorators, str):
+                decorators = json.loads(decorators)
+
+            parameters = metadata.get("parameters", "[]")
+            if isinstance(parameters, str):
+                parameters = json.loads(parameters)
+
+            type_annotations = metadata.get("type_annotations", "{}")
+            if isinstance(type_annotations, str):
+                type_annotations = json.loads(type_annotations)
+
             chunk = CodeChunk(
                 content=content,
                 file_path=Path(metadata["file_path"]),
@@ -524,12 +542,12 @@ class ChromaVectorDatabase(VectorDatabase):
                 complexity_score=metadata.get("complexity_score", 0.0),
                 chunk_id=metadata.get("chunk_id"),
                 parent_chunk_id=metadata.get("parent_chunk_id"),
-                child_chunk_ids=
+                child_chunk_ids=child_chunk_ids,
                 chunk_depth=metadata.get("chunk_depth", 0),
-                decorators=
-                parameters=
+                decorators=decorators,
+                parameters=parameters,
                 return_type=metadata.get("return_type"),
-                type_annotations=
+                type_annotations=type_annotations,
                 subproject_name=metadata.get("subproject_name"),
                 subproject_path=metadata.get("subproject_path"),
             )
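The read path above decodes those JSON strings before reconstructing each CodeChunk, and the isinstance guards keep it tolerant of mixed indexes: values written by 0.12.0 come back as JSON strings, while anything an older index stored in another form passes through untouched. The same pattern as a standalone sketch (the helper name is illustrative, not from the package):

    import json

    def loads_if_str(value, fallback):
        """Decode JSON-string metadata; pass legacy non-string values through."""
        if isinstance(value, str):
            return json.loads(value)
        return value if value is not None else fallback

    assert loads_if_str('["@cached"]', []) == ["@cached"]  # new format, decoded
    assert loads_if_str(["@cached"], []) == ["@cached"]    # legacy value untouched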
@@ -775,16 +793,16 @@ class PooledChromaVectorDatabase(VectorDatabase):
             "class_name": chunk.class_name or "",
             "docstring": chunk.docstring or "",
             "complexity_score": chunk.complexity_score,
-            # Hierarchy fields
+            # Hierarchy fields (convert lists to JSON strings for ChromaDB)
             "chunk_id": chunk.chunk_id or "",
             "parent_chunk_id": chunk.parent_chunk_id or "",
-            "child_chunk_ids": chunk.child_chunk_ids or [],
+            "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
             "chunk_depth": chunk.chunk_depth,
-            # Additional metadata
-            "decorators": chunk.decorators or [],
-            "parameters": chunk.parameters or [],
+            # Additional metadata (convert lists/dicts to JSON strings)
+            "decorators": json.dumps(chunk.decorators or []),
+            "parameters": json.dumps(chunk.parameters or []),
             "return_type": chunk.return_type or "",
-            "type_annotations": chunk.type_annotations or {},
+            "type_annotations": json.dumps(chunk.type_annotations or {}),
             # Monorepo support
             "subproject_name": chunk.subproject_name or "",
             "subproject_path": chunk.subproject_path or "",
@@ -1023,6 +1041,23 @@ class PooledChromaVectorDatabase(VectorDatabase):
             metadata = results["metadatas"][i]
             content = results["documents"][i]
 
+            # Parse JSON strings back to lists/dicts
+            child_chunk_ids = metadata.get("child_chunk_ids", "[]")
+            if isinstance(child_chunk_ids, str):
+                child_chunk_ids = json.loads(child_chunk_ids)
+
+            decorators = metadata.get("decorators", "[]")
+            if isinstance(decorators, str):
+                decorators = json.loads(decorators)
+
+            parameters = metadata.get("parameters", "[]")
+            if isinstance(parameters, str):
+                parameters = json.loads(parameters)
+
+            type_annotations = metadata.get("type_annotations", "{}")
+            if isinstance(type_annotations, str):
+                type_annotations = json.loads(type_annotations)
+
             chunk = CodeChunk(
                 content=content,
                 file_path=Path(metadata["file_path"]),
@@ -1037,12 +1072,12 @@ class PooledChromaVectorDatabase(VectorDatabase):
                 complexity_score=metadata.get("complexity_score", 0.0),
                 chunk_id=metadata.get("chunk_id"),
                 parent_chunk_id=metadata.get("parent_chunk_id"),
-                child_chunk_ids=
+                child_chunk_ids=child_chunk_ids,
                 chunk_depth=metadata.get("chunk_depth", 0),
-                decorators=
-                parameters=
+                decorators=decorators,
+                parameters=parameters,
                 return_type=metadata.get("return_type"),
-                type_annotations=
+                type_annotations=type_annotations,
                 subproject_name=metadata.get("subproject_name"),
                 subproject_path=metadata.get("subproject_path"),
             )
mcp_vector_search/core/directory_index.py ADDED

@@ -0,0 +1,303 @@
+"""Directory index for tracking project structure and file relationships."""
+
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+from loguru import logger
+
+from .models import Directory
+
+
+class DirectoryIndex:
+    """Manages directory structure and file-directory relationships."""
+
+    def __init__(self, index_path: Path) -> None:
+        """Initialize directory index.
+
+        Args:
+            index_path: Path to directory index file (JSON)
+        """
+        self.index_path = index_path
+        self.directories: dict[str, Directory] = {}  # path -> Directory
+        self.file_to_directory: dict[str, str] = {}  # file_path -> directory_path
+        self.directory_files: dict[str, list[str]] = defaultdict(list)  # dir_path -> [file_paths]
+
+    def load(self) -> None:
+        """Load directory index from disk."""
+        if not self.index_path.exists():
+            logger.debug("No directory index found, starting fresh")
+            return
+
+        try:
+            with open(self.index_path, "r") as f:
+                data = json.load(f)
+
+            # Load directories
+            for dir_data in data.get("directories", []):
+                directory = Directory.from_dict(dir_data)
+                self.directories[str(directory.path)] = directory
+
+            # Load file mappings
+            self.file_to_directory = data.get("file_to_directory", {})
+
+            # Rebuild directory_files from file_to_directory
+            self.directory_files = defaultdict(list)
+            for file_path, dir_path in self.file_to_directory.items():
+                self.directory_files[dir_path].append(file_path)
+
+            logger.info(f"Loaded {len(self.directories)} directories from index")
+
+        except Exception as e:
+            logger.error(f"Failed to load directory index: {e}")
+            self.directories = {}
+            self.file_to_directory = {}
+            self.directory_files = defaultdict(list)
+
+    def save(self) -> None:
+        """Save directory index to disk."""
+        try:
+            # Ensure parent directory exists
+            self.index_path.parent.mkdir(parents=True, exist_ok=True)
+
+            data = {
+                "directories": [d.to_dict() for d in self.directories.values()],
+                "file_to_directory": self.file_to_directory,
+            }
+
+            with open(self.index_path, "w") as f:
+                json.dump(data, f, indent=2)
+
+            logger.debug(f"Saved {len(self.directories)} directories to index")
+
+        except Exception as e:
+            logger.error(f"Failed to save directory index: {e}")
+            raise
+
+    def add_directory(self, directory: Directory) -> None:
+        """Add or update a directory in the index.
+
+        Args:
+            directory: Directory to add
+        """
+        dir_path = str(directory.path)
+        self.directories[dir_path] = directory
+
+    def add_file(self, file_path: Path, directory_path: Path) -> None:
+        """Associate a file with its directory.
+
+        Args:
+            file_path: Path to the file
+            directory_path: Path to the directory containing the file
+        """
+        file_path_str = str(file_path)
+        dir_path_str = str(directory_path)
+
+        self.file_to_directory[file_path_str] = dir_path_str
+        if file_path_str not in self.directory_files[dir_path_str]:
+            self.directory_files[dir_path_str].append(file_path_str)
+
+        # Update directory file count
+        if dir_path_str in self.directories:
+            self.directories[dir_path_str].file_count = len(
+                self.directory_files[dir_path_str]
+            )
+
+    def get_directory(self, directory_path: Path) -> Directory | None:
+        """Get directory by path.
+
+        Args:
+            directory_path: Path to directory
+
+        Returns:
+            Directory object or None if not found
+        """
+        return self.directories.get(str(directory_path))
+
+    def get_files_in_directory(self, directory_path: Path) -> list[str]:
+        """Get all files in a directory.
+
+        Args:
+            directory_path: Path to directory
+
+        Returns:
+            List of file paths in the directory
+        """
+        return self.directory_files.get(str(directory_path), [])
+
+    def get_subdirectories(self, directory_path: Path) -> list[Directory]:
+        """Get all immediate subdirectories.
+
+        Args:
+            directory_path: Path to parent directory
+
+        Returns:
+            List of subdirectory objects
+        """
+        parent_path_str = str(directory_path)
+        subdirs = []
+
+        for dir_path_str, directory in self.directories.items():
+            if directory.parent_path and str(directory.parent_path) == parent_path_str:
+                subdirs.append(directory)
+
+        return subdirs
+
+    def get_root_directories(self) -> list[Directory]:
+        """Get all root-level directories (no parent).
+
+        Returns:
+            List of root directory objects
+        """
+        return [d for d in self.directories.values() if d.parent_path is None]
+
+    def delete_directory(self, directory_path: Path) -> None:
+        """Remove directory and its file associations.
+
+        Args:
+            directory_path: Path to directory to remove
+        """
+        dir_path_str = str(directory_path)
+
+        # Remove directory
+        if dir_path_str in self.directories:
+            del self.directories[dir_path_str]
+
+        # Remove file associations
+        if dir_path_str in self.directory_files:
+            for file_path in self.directory_files[dir_path_str]:
+                if file_path in self.file_to_directory:
+                    del self.file_to_directory[file_path]
+            del self.directory_files[dir_path_str]
+
+    def delete_file(self, file_path: Path) -> None:
+        """Remove file from directory associations.
+
+        Args:
+            file_path: Path to file to remove
+        """
+        file_path_str = str(file_path)
+
+        if file_path_str in self.file_to_directory:
+            dir_path = self.file_to_directory[file_path_str]
+            del self.file_to_directory[file_path_str]
+
+            # Remove from directory_files
+            if dir_path in self.directory_files:
+                self.directory_files[dir_path] = [
+                    f for f in self.directory_files[dir_path] if f != file_path_str
+                ]
+
+            # Update directory file count
+            if dir_path in self.directories:
+                self.directories[dir_path].file_count = len(
+                    self.directory_files[dir_path]
+                )
+
+    def rebuild_from_files(self, file_paths: list[Path], root_path: Path, chunk_stats: dict[str, dict] | None = None) -> None:
+        """Rebuild directory index from list of files with statistics from chunks.
+
+        Args:
+            file_paths: List of file paths to index
+            root_path: Project root path
+            chunk_stats: Optional dict mapping file_path -> {'chunks': count, 'language': str}
+        """
+        self.directories = {}
+        self.file_to_directory = {}
+        self.directory_files = defaultdict(list)
+
+        # Track all unique directories and their statistics
+        dir_set = set()
+        dir_chunks = defaultdict(int)  # directory -> total chunks
+        dir_languages = defaultdict(lambda: defaultdict(int))  # directory -> {language: count}
+        dir_modified = defaultdict(float)  # directory -> most recent modification time
+
+        for file_path in file_paths:
+            # Get relative path from root
+            try:
+                rel_path = file_path.relative_to(root_path)
+                parent_dir = rel_path.parent
+
+                # Add all parent directories up to root
+                current = parent_dir
+                while current != Path("."):
+                    dir_set.add(current)
+
+                    # Accumulate statistics up the directory tree
+                    if chunk_stats and str(file_path) in chunk_stats:
+                        stats = chunk_stats[str(file_path)]
+                        dir_chunks[current] += stats.get('chunks', 0)
+                        if 'language' in stats:
+                            dir_languages[current][stats['language']] += stats.get('chunks', 0)
+                        # Track most recent modification time
+                        if 'modified' in stats:
+                            dir_modified[current] = max(dir_modified.get(current, 0), stats['modified'])
+
+                    current = current.parent
+
+                # Associate file with its direct parent
+                if parent_dir != Path("."):
+                    self.add_file(rel_path, parent_dir)
+
+            except ValueError:
+                # File not relative to root, skip
+                logger.warning(f"File {file_path} not under root {root_path}")
+                continue
+
+        # Create Directory objects for all directories
+        for dir_path in sorted(dir_set):
+            # Determine parent
+            parent_path = dir_path.parent if dir_path.parent != Path(".") else None
+
+            # Check if it's a package
+            is_package = False
+            full_dir_path = root_path / dir_path
+            if (full_dir_path / "__init__.py").exists():
+                is_package = True
+            elif (full_dir_path / "package.json").exists():
+                is_package = True
+
+            directory = Directory(
+                path=dir_path,
+                name=dir_path.name,
+                parent_path=parent_path,
+                depth=len(dir_path.parts),
+                is_package=is_package,
+                total_chunks=dir_chunks.get(dir_path, 0),
+                languages=dict(dir_languages.get(dir_path, {})),
+                last_modified=dir_modified.get(dir_path),
+            )
+
+            self.add_directory(directory)
+
+        # Update subdirectory counts
+        for directory in self.directories.values():
+            subdirs = self.get_subdirectories(directory.path)
+            directory.subdirectory_count = len(subdirs)
+
+        logger.info(f"Rebuilt directory index with {len(self.directories)} directories, {sum(dir_chunks.values())} total chunks")
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get directory index statistics.
+
+        Returns:
+            Dictionary with statistics
+        """
+        return {
+            "total_directories": len(self.directories),
+            "total_files": len(self.file_to_directory),
+            "root_directories": len(self.get_root_directories()),
+            "packages": sum(1 for d in self.directories.values() if d.is_package),
+        }
+
+    def reset(self) -> None:
+        """Clear all directory data."""
+        self.directories = {}
+        self.file_to_directory = {}
+        self.directory_files = defaultdict(list)
+
+        if self.index_path.exists():
+            self.index_path.unlink()
+
+        logger.info("Directory index reset")
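Taken together, the new module supports a flow like the following (a hypothetical usage sketch: the paths are invented, and only the constructor and methods defined above are used):

    from pathlib import Path

    from mcp_vector_search.core.directory_index import DirectoryIndex

    index = DirectoryIndex(Path(".mcp-vector-search/directory_index.json"))
    index.load()  # starts fresh if the JSON file does not exist yet

    files = [Path("/repo/src/app/main.py"), Path("/repo/src/app/utils/io.py")]
    stats = {str(f): {"chunks": 1, "language": "python", "modified": 0.0} for f in files}
    index.rebuild_from_files(files, Path("/repo"), chunk_stats=stats)
    index.save()

    print(index.get_stats())
    # e.g. {'total_directories': 3, 'total_files': 2, 'root_directories': 1, 'packages': 0}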
mcp_vector_search/core/indexer.py CHANGED

@@ -15,6 +15,7 @@ from ..parsers.registry import get_parser_registry
 from ..utils.gitignore import create_gitignore_parser
 from ..utils.monorepo import MonorepoDetector
 from .database import VectorDatabase
+from .directory_index import DirectoryIndex
 from .exceptions import ParsingError
 from .models import CodeChunk
 
@@ -29,6 +30,7 @@ class SemanticIndexer:
         file_extensions: list[str],
         max_workers: int | None = None,
         batch_size: int = 10,
+        debug: bool = False,
     ) -> None:
         """Initialize semantic indexer.
 
@@ -38,12 +40,14 @@ class SemanticIndexer:
             file_extensions: File extensions to index
             max_workers: Maximum number of worker threads for parallel processing
             batch_size: Number of files to process in each batch
+            debug: Enable debug output for hierarchy building
         """
         self.database = database
         self.project_root = project_root
         self.file_extensions = {ext.lower() for ext in file_extensions}
         self.parser_registry = get_parser_registry()
         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
+        self.debug = debug
 
         # Safely get event loop for max_workers
         try:
@@ -81,6 +85,13 @@ class SemanticIndexer:
             for sp in subprojects:
                 logger.debug(f"  - {sp.name} ({sp.relative_path})")
 
+        # Initialize directory index
+        self.directory_index = DirectoryIndex(
+            project_root / ".mcp-vector-search" / "directory_index.json"
+        )
+        # Load existing directory index
+        self.directory_index.load()
+
     async def index_project(
         self,
         force_reindex: bool = False,
@@ -156,6 +167,38 @@ class SemanticIndexer:
 
         self._save_index_metadata(metadata)
 
+        # Rebuild directory index from successfully indexed files
+        try:
+            logger.debug("Rebuilding directory index...")
+            # We don't have chunk counts here, but we have file modification times
+            # Build a simple stats dict with file mod times for recency tracking
+            chunk_stats = {}
+            for file_path in files_to_index:
+                try:
+                    mtime = os.path.getmtime(file_path)
+                    # For now, just track modification time
+                    # Chunk counts will be aggregated from the database later if needed
+                    chunk_stats[str(file_path)] = {
+                        'modified': mtime,
+                        'chunks': 1,  # Placeholder - real count from chunks
+                    }
+                except OSError:
+                    pass
+
+            self.directory_index.rebuild_from_files(
+                files_to_index, self.project_root, chunk_stats=chunk_stats
+            )
+            self.directory_index.save()
+            dir_stats = self.directory_index.get_stats()
+            logger.info(
+                f"Directory index updated: {dir_stats['total_directories']} directories, "
+                f"{dir_stats['total_files']} files"
+            )
+        except Exception as e:
+            logger.error(f"Failed to update directory index: {e}")
+            import traceback
+            logger.debug(traceback.format_exc())
+
         logger.info(
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )
@@ -307,6 +350,10 @@ class SemanticIndexer:
         # Build hierarchical relationships between chunks
         chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
 
+        # Debug: Check if hierarchy was built
+        methods_with_parents = sum(1 for c in chunks_with_hierarchy if c.chunk_type in ("method", "function") and c.parent_chunk_id)
+        logger.debug(f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents")
+
         # Add chunks to database
         await self.database.add_chunks(chunks_with_hierarchy)
 
@@ -790,6 +837,15 @@ class SemanticIndexer:
         class_chunks = [c for c in chunks if c.chunk_type in ("class", "interface", "mixin")]
         function_chunks = [c for c in chunks if c.chunk_type in ("function", "method", "constructor")]
 
+        # DEBUG: Print what we have (if debug enabled)
+        if self.debug:
+            import sys
+            print(f"\n[DEBUG] Building hierarchy: {len(module_chunks)} modules, {len(class_chunks)} classes, {len(function_chunks)} functions", file=sys.stderr)
+            if class_chunks:
+                print(f"[DEBUG] Class names: {[c.class_name for c in class_chunks[:5]]}", file=sys.stderr)
+            if function_chunks:
+                print(f"[DEBUG] First 5 functions with class_name: {[(f.function_name, f.class_name) for f in function_chunks[:5]]}", file=sys.stderr)
+
         # Build relationships
         for func in function_chunks:
             if func.class_name:
@@ -803,6 +859,10 @@ class SemanticIndexer:
                     func.chunk_depth = parent_class.chunk_depth + 1
                     if func.chunk_id not in parent_class.child_chunk_ids:
                         parent_class.child_chunk_ids.append(func.chunk_id)
+                    if self.debug:
+                        import sys
+                        print(f"[DEBUG] ✓ Linked '{func.function_name}' to class '{parent_class.class_name}'", file=sys.stderr)
+                    logger.debug(f"Linked method '{func.function_name}' (ID: {func.chunk_id[:8]}) to class '{parent_class.class_name}' (ID: {parent_class.chunk_id[:8]})")
                 else:
                     # Top-level function
                     if not func.chunk_depth:
@@ -828,6 +888,13 @@ class SemanticIndexer:
         if not mod.chunk_depth:
             mod.chunk_depth = 0
 
+        # DEBUG: Print summary
+        if self.debug:
+            import sys
+            funcs_with_parents = sum(1 for f in function_chunks if f.parent_chunk_id)
+            classes_with_parents = sum(1 for c in class_chunks if c.parent_chunk_id)
+            print(f"[DEBUG] Hierarchy built: {funcs_with_parents}/{len(function_chunks)} functions linked, {classes_with_parents}/{len(class_chunks)} classes linked\n", file=sys.stderr)
+
         return chunks
 
     def _write_indexing_run_header(self) -> None:
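A note on the @@ -156,6 +167,38 @@ hunk above: the chunk_stats mapping passed to rebuild_from_files records real modification times but a placeholder 'chunks': 1 per file, so a directory's total_chunks is effectively its recursive file count until per-file chunk counts are aggregated from the database. The expected shape, for illustration (path and timestamp invented):

    chunk_stats = {
        "/repo/src/app/main.py": {"modified": 1700000000.0, "chunks": 1},
    }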
mcp_vector_search/core/models.py CHANGED

@@ -206,6 +206,64 @@ class IndexStats(BaseModel):
         }
 
 
+@dataclass
+class Directory:
+    """Represents a directory in the project structure."""
+
+    path: Path  # Relative path from project root
+    name: str  # Directory name
+    parent_path: Path | None = None  # Parent directory path (None for root)
+    file_count: int = 0  # Number of files directly in this directory
+    subdirectory_count: int = 0  # Number of subdirectories
+    total_chunks: int = 0  # Total code chunks in this directory (recursive)
+    languages: dict[str, int] = None  # Language distribution in this directory
+    depth: int = 0  # Depth from project root (0 = root)
+    is_package: bool = False  # True if contains __init__.py or package.json
+    last_modified: float | None = None  # Most recent file modification time (unix timestamp)
+
+    def __post_init__(self) -> None:
+        """Initialize default values and generate directory ID."""
+        if self.languages is None:
+            self.languages = {}
+
+    @property
+    def id(self) -> str:
+        """Generate unique ID for this directory."""
+        import hashlib
+        return hashlib.sha256(str(self.path).encode()).hexdigest()[:16]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for storage."""
+        return {
+            "path": str(self.path),
+            "name": self.name,
+            "parent_path": str(self.parent_path) if self.parent_path else None,
+            "file_count": self.file_count,
+            "subdirectory_count": self.subdirectory_count,
+            "total_chunks": self.total_chunks,
+            "languages": self.languages,
+            "depth": self.depth,
+            "is_package": self.is_package,
+            "last_modified": self.last_modified,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "Directory":
+        """Create from dictionary."""
+        return cls(
+            path=Path(data["path"]),
+            name=data["name"],
+            parent_path=Path(data["parent_path"]) if data.get("parent_path") else None,
+            file_count=data.get("file_count", 0),
+            subdirectory_count=data.get("subdirectory_count", 0),
+            total_chunks=data.get("total_chunks", 0),
+            languages=data.get("languages", {}),
+            depth=data.get("depth", 0),
+            is_package=data.get("is_package", False),
+            last_modified=data.get("last_modified"),
+        )
+
+
 class ProjectInfo(BaseModel):
     """Information about a project."""
 
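A quick round-trip sketch for the new model (field values invented; only fields and methods declared above are used):

    from pathlib import Path

    from mcp_vector_search.core.models import Directory

    d = Directory(
        path=Path("src/app"),
        name="app",
        parent_path=Path("src"),
        depth=2,
        is_package=True,
        languages={"python": 12},
    )
    restored = Directory.from_dict(d.to_dict())
    assert restored.path == Path("src/app") and restored.is_package
    print(d.id)  # first 16 hex chars of sha256("src/app")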
mcp_vector_search/utils/gitignore.py CHANGED

@@ -51,14 +51,24 @@ class GitignorePattern:
         Returns:
             True if the pattern matches
         """
-        # Directory-only patterns only match directories
-        if self.is_directory_only and not is_directory:
-            return False
-
         # Convert path separators for consistent matching
         path = path.replace("\\", "/")
         pattern = self.pattern.replace("\\", "/")
 
+        # For directory-only patterns, check if any parent directory matches
+        # This implements Git's behavior where "dir/" excludes both the directory
+        # AND all files within it recursively
+        if self.is_directory_only:
+            path_parts = path.split("/")
+            # Check each parent directory component
+            for i in range(1, len(path_parts) + 1):
+                parent = "/".join(path_parts[:i])
+                if fnmatch.fnmatch(parent, pattern):
+                    return True
+            # If no parent matches and this is not a directory, don't exclude
+            if not is_directory:
+                return False
+
         # Try exact match first
         if fnmatch.fnmatch(path, pattern):
            return True
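The practical effect of the hunk above, as a standalone sketch. It assumes, as the surrounding class presumably arranges, that a gitignore entry like "build/" is stored with the trailing slash stripped and is_directory_only=True:

    import fnmatch

    def dir_pattern_matches(path: str, pattern: str) -> bool:
        """Test every parent prefix of path against a directory-only pattern."""
        parts = path.split("/")
        return any(
            fnmatch.fnmatch("/".join(parts[:i]), pattern)
            for i in range(1, len(parts) + 1)
        )

    # Previously "build/" never matched a *file* path, so build/lib/foo.py was
    # indexed; the parent walk now excludes everything under the directory.
    assert dir_pattern_matches("build/lib/foo.py", "build")
    assert not dir_pattern_matches("src/build.py", "build")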