mcp-vector-search 0.9.2__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release: this version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +2 -2
- mcp_vector_search/cli/commands/index.py +31 -0
- mcp_vector_search/cli/commands/visualize.py +358 -20
- mcp_vector_search/core/database.py +71 -8
- mcp_vector_search/core/directory_index.py +303 -0
- mcp_vector_search/core/indexer.py +67 -0
- mcp_vector_search/core/models.py +58 -0
- mcp_vector_search/utils/gitignore.py +14 -4
- mcp_vector_search/visualization/index.html +658 -0
- {mcp_vector_search-0.9.2.dist-info → mcp_vector_search-0.12.0.dist-info}/METADATA +1 -1
- {mcp_vector_search-0.9.2.dist-info → mcp_vector_search-0.12.0.dist-info}/RECORD +14 -12
- {mcp_vector_search-0.9.2.dist-info → mcp_vector_search-0.12.0.dist-info}/WHEEL +0 -0
- {mcp_vector_search-0.9.2.dist-info → mcp_vector_search-0.12.0.dist-info}/entry_points.txt +0 -0
- {mcp_vector_search-0.9.2.dist-info → mcp_vector_search-0.12.0.dist-info}/licenses/LICENSE +0 -0
mcp_vector_search/core/database.py
CHANGED

```diff
@@ -1,6 +1,7 @@
 """Database abstraction and ChromaDB implementation for MCP Vector Search."""
 
 import asyncio
+import json
 import shutil
 from abc import ABC, abstractmethod
 from pathlib import Path
@@ -273,6 +274,19 @@ class ChromaVectorDatabase(VectorDatabase):
                 "class_name": chunk.class_name or "",
                 "docstring": chunk.docstring or "",
                 "complexity_score": chunk.complexity_score,
+                # Hierarchy fields (convert lists to JSON strings for ChromaDB)
+                "chunk_id": chunk.chunk_id or "",
+                "parent_chunk_id": chunk.parent_chunk_id or "",
+                "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
+                "chunk_depth": chunk.chunk_depth,
+                # Additional metadata (convert lists/dicts to JSON strings)
+                "decorators": json.dumps(chunk.decorators or []),
+                "parameters": json.dumps(chunk.parameters or []),
+                "return_type": chunk.return_type or "",
+                "type_annotations": json.dumps(chunk.type_annotations or {}),
+                # Monorepo support
+                "subproject_name": chunk.subproject_name or "",
+                "subproject_path": chunk.subproject_path or "",
             }
             metadatas.append(metadata)
 
@@ -497,6 +511,23 @@ class ChromaVectorDatabase(VectorDatabase):
             metadata = results["metadatas"][i]
             content = results["documents"][i]
 
+            # Parse JSON strings back to lists/dicts
+            child_chunk_ids = metadata.get("child_chunk_ids", "[]")
+            if isinstance(child_chunk_ids, str):
+                child_chunk_ids = json.loads(child_chunk_ids)
+
+            decorators = metadata.get("decorators", "[]")
+            if isinstance(decorators, str):
+                decorators = json.loads(decorators)
+
+            parameters = metadata.get("parameters", "[]")
+            if isinstance(parameters, str):
+                parameters = json.loads(parameters)
+
+            type_annotations = metadata.get("type_annotations", "{}")
+            if isinstance(type_annotations, str):
+                type_annotations = json.loads(type_annotations)
+
             chunk = CodeChunk(
                 content=content,
                 file_path=Path(metadata["file_path"]),
@@ -511,12 +542,12 @@ class ChromaVectorDatabase(VectorDatabase):
                 complexity_score=metadata.get("complexity_score", 0.0),
                 chunk_id=metadata.get("chunk_id"),
                 parent_chunk_id=metadata.get("parent_chunk_id"),
-                child_chunk_ids=
+                child_chunk_ids=child_chunk_ids,
                 chunk_depth=metadata.get("chunk_depth", 0),
-                decorators=
-                parameters=
+                decorators=decorators,
+                parameters=parameters,
                 return_type=metadata.get("return_type"),
-                type_annotations=
+                type_annotations=type_annotations,
                 subproject_name=metadata.get("subproject_name"),
                 subproject_path=metadata.get("subproject_path"),
             )
@@ -760,6 +791,21 @@ class PooledChromaVectorDatabase(VectorDatabase):
                     "chunk_type": chunk.chunk_type,
                     "function_name": chunk.function_name or "",
                     "class_name": chunk.class_name or "",
+                    "docstring": chunk.docstring or "",
+                    "complexity_score": chunk.complexity_score,
+                    # Hierarchy fields (convert lists to JSON strings for ChromaDB)
+                    "chunk_id": chunk.chunk_id or "",
+                    "parent_chunk_id": chunk.parent_chunk_id or "",
+                    "child_chunk_ids": json.dumps(chunk.child_chunk_ids or []),
+                    "chunk_depth": chunk.chunk_depth,
+                    # Additional metadata (convert lists/dicts to JSON strings)
+                    "decorators": json.dumps(chunk.decorators or []),
+                    "parameters": json.dumps(chunk.parameters or []),
+                    "return_type": chunk.return_type or "",
+                    "type_annotations": json.dumps(chunk.type_annotations or {}),
+                    # Monorepo support
+                    "subproject_name": chunk.subproject_name or "",
+                    "subproject_path": chunk.subproject_path or "",
                 }
             )
             ids.append(chunk.id)
@@ -995,6 +1041,23 @@ class PooledChromaVectorDatabase(VectorDatabase):
             metadata = results["metadatas"][i]
             content = results["documents"][i]
 
+            # Parse JSON strings back to lists/dicts
+            child_chunk_ids = metadata.get("child_chunk_ids", "[]")
+            if isinstance(child_chunk_ids, str):
+                child_chunk_ids = json.loads(child_chunk_ids)
+
+            decorators = metadata.get("decorators", "[]")
+            if isinstance(decorators, str):
+                decorators = json.loads(decorators)
+
+            parameters = metadata.get("parameters", "[]")
+            if isinstance(parameters, str):
+                parameters = json.loads(parameters)
+
+            type_annotations = metadata.get("type_annotations", "{}")
+            if isinstance(type_annotations, str):
+                type_annotations = json.loads(type_annotations)
+
             chunk = CodeChunk(
                 content=content,
                 file_path=Path(metadata["file_path"]),
@@ -1009,12 +1072,12 @@ class PooledChromaVectorDatabase(VectorDatabase):
                 complexity_score=metadata.get("complexity_score", 0.0),
                 chunk_id=metadata.get("chunk_id"),
                 parent_chunk_id=metadata.get("parent_chunk_id"),
-                child_chunk_ids=
+                child_chunk_ids=child_chunk_ids,
                 chunk_depth=metadata.get("chunk_depth", 0),
-                decorators=
-                parameters=
+                decorators=decorators,
+                parameters=parameters,
                 return_type=metadata.get("return_type"),
-                type_annotations=
+                type_annotations=type_annotations,
                 subproject_name=metadata.get("subproject_name"),
                 subproject_path=metadata.get("subproject_path"),
            )
```
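ChromaDB metadata values must be flat scalars (strings, numbers, booleans), which is why the writer above serializes list- and dict-valued chunk fields with `json.dumps` and the reader restores them with `json.loads`. A minimal sketch of that round-trip, using stand-in values rather than the package's real `CodeChunk`:

```python
import json

# Stand-in chunk fields (hypothetical values, not from the package).
child_chunk_ids = ["abc123", "def456"]
type_annotations = {"x": "int", "return": "str"}

# Write side: flatten lists/dicts to JSON strings, as the diff above does.
metadata = {
    "child_chunk_ids": json.dumps(child_chunk_ids),
    "type_annotations": json.dumps(type_annotations),
}

# Read side: parse back; the isinstance guard mirrors the diff and tolerates
# records written before this change, where the value may not be a string.
raw = metadata.get("child_chunk_ids", "[]")
restored = json.loads(raw) if isinstance(raw, str) else raw
assert restored == child_chunk_ids
```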
mcp_vector_search/core/directory_index.py
ADDED (+303 lines)

```python
"""Directory index for tracking project structure and file relationships."""

import json
from collections import defaultdict
from pathlib import Path
from typing import Any

from loguru import logger

from .models import Directory


class DirectoryIndex:
    """Manages directory structure and file-directory relationships."""

    def __init__(self, index_path: Path) -> None:
        """Initialize directory index.

        Args:
            index_path: Path to directory index file (JSON)
        """
        self.index_path = index_path
        self.directories: dict[str, Directory] = {}  # path -> Directory
        self.file_to_directory: dict[str, str] = {}  # file_path -> directory_path
        self.directory_files: dict[str, list[str]] = defaultdict(list)  # dir_path -> [file_paths]

    def load(self) -> None:
        """Load directory index from disk."""
        if not self.index_path.exists():
            logger.debug("No directory index found, starting fresh")
            return

        try:
            with open(self.index_path, "r") as f:
                data = json.load(f)

            # Load directories
            for dir_data in data.get("directories", []):
                directory = Directory.from_dict(dir_data)
                self.directories[str(directory.path)] = directory

            # Load file mappings
            self.file_to_directory = data.get("file_to_directory", {})

            # Rebuild directory_files from file_to_directory
            self.directory_files = defaultdict(list)
            for file_path, dir_path in self.file_to_directory.items():
                self.directory_files[dir_path].append(file_path)

            logger.info(f"Loaded {len(self.directories)} directories from index")

        except Exception as e:
            logger.error(f"Failed to load directory index: {e}")
            self.directories = {}
            self.file_to_directory = {}
            self.directory_files = defaultdict(list)

    def save(self) -> None:
        """Save directory index to disk."""
        try:
            # Ensure parent directory exists
            self.index_path.parent.mkdir(parents=True, exist_ok=True)

            data = {
                "directories": [d.to_dict() for d in self.directories.values()],
                "file_to_directory": self.file_to_directory,
            }

            with open(self.index_path, "w") as f:
                json.dump(data, f, indent=2)

            logger.debug(f"Saved {len(self.directories)} directories to index")

        except Exception as e:
            logger.error(f"Failed to save directory index: {e}")
            raise

    def add_directory(self, directory: Directory) -> None:
        """Add or update a directory in the index.

        Args:
            directory: Directory to add
        """
        dir_path = str(directory.path)
        self.directories[dir_path] = directory

    def add_file(self, file_path: Path, directory_path: Path) -> None:
        """Associate a file with its directory.

        Args:
            file_path: Path to the file
            directory_path: Path to the directory containing the file
        """
        file_path_str = str(file_path)
        dir_path_str = str(directory_path)

        self.file_to_directory[file_path_str] = dir_path_str
        if file_path_str not in self.directory_files[dir_path_str]:
            self.directory_files[dir_path_str].append(file_path_str)

        # Update directory file count
        if dir_path_str in self.directories:
            self.directories[dir_path_str].file_count = len(
                self.directory_files[dir_path_str]
            )

    def get_directory(self, directory_path: Path) -> Directory | None:
        """Get directory by path.

        Args:
            directory_path: Path to directory

        Returns:
            Directory object or None if not found
        """
        return self.directories.get(str(directory_path))

    def get_files_in_directory(self, directory_path: Path) -> list[str]:
        """Get all files in a directory.

        Args:
            directory_path: Path to directory

        Returns:
            List of file paths in the directory
        """
        return self.directory_files.get(str(directory_path), [])

    def get_subdirectories(self, directory_path: Path) -> list[Directory]:
        """Get all immediate subdirectories.

        Args:
            directory_path: Path to parent directory

        Returns:
            List of subdirectory objects
        """
        parent_path_str = str(directory_path)
        subdirs = []

        for dir_path_str, directory in self.directories.items():
            if directory.parent_path and str(directory.parent_path) == parent_path_str:
                subdirs.append(directory)

        return subdirs

    def get_root_directories(self) -> list[Directory]:
        """Get all root-level directories (no parent).

        Returns:
            List of root directory objects
        """
        return [d for d in self.directories.values() if d.parent_path is None]

    def delete_directory(self, directory_path: Path) -> None:
        """Remove directory and its file associations.

        Args:
            directory_path: Path to directory to remove
        """
        dir_path_str = str(directory_path)

        # Remove directory
        if dir_path_str in self.directories:
            del self.directories[dir_path_str]

        # Remove file associations
        if dir_path_str in self.directory_files:
            for file_path in self.directory_files[dir_path_str]:
                if file_path in self.file_to_directory:
                    del self.file_to_directory[file_path]
            del self.directory_files[dir_path_str]

    def delete_file(self, file_path: Path) -> None:
        """Remove file from directory associations.

        Args:
            file_path: Path to file to remove
        """
        file_path_str = str(file_path)

        if file_path_str in self.file_to_directory:
            dir_path = self.file_to_directory[file_path_str]
            del self.file_to_directory[file_path_str]

            # Remove from directory_files
            if dir_path in self.directory_files:
                self.directory_files[dir_path] = [
                    f for f in self.directory_files[dir_path] if f != file_path_str
                ]

            # Update directory file count
            if dir_path in self.directories:
                self.directories[dir_path].file_count = len(
                    self.directory_files[dir_path]
                )

    def rebuild_from_files(self, file_paths: list[Path], root_path: Path, chunk_stats: dict[str, dict] | None = None) -> None:
        """Rebuild directory index from list of files with statistics from chunks.

        Args:
            file_paths: List of file paths to index
            root_path: Project root path
            chunk_stats: Optional dict mapping file_path -> {'chunks': count, 'language': str}
        """
        self.directories = {}
        self.file_to_directory = {}
        self.directory_files = defaultdict(list)

        # Track all unique directories and their statistics
        dir_set = set()
        dir_chunks = defaultdict(int)  # directory -> total chunks
        dir_languages = defaultdict(lambda: defaultdict(int))  # directory -> {language: count}
        dir_modified = defaultdict(float)  # directory -> most recent modification time

        for file_path in file_paths:
            # Get relative path from root
            try:
                rel_path = file_path.relative_to(root_path)
                parent_dir = rel_path.parent

                # Add all parent directories up to root
                current = parent_dir
                while current != Path("."):
                    dir_set.add(current)

                    # Accumulate statistics up the directory tree
                    if chunk_stats and str(file_path) in chunk_stats:
                        stats = chunk_stats[str(file_path)]
                        dir_chunks[current] += stats.get('chunks', 0)
                        if 'language' in stats:
                            dir_languages[current][stats['language']] += stats.get('chunks', 0)
                        # Track most recent modification time
                        if 'modified' in stats:
                            dir_modified[current] = max(dir_modified.get(current, 0), stats['modified'])

                    current = current.parent

                # Associate file with its direct parent
                if parent_dir != Path("."):
                    self.add_file(rel_path, parent_dir)

            except ValueError:
                # File not relative to root, skip
                logger.warning(f"File {file_path} not under root {root_path}")
                continue

        # Create Directory objects for all directories
        for dir_path in sorted(dir_set):
            # Determine parent
            parent_path = dir_path.parent if dir_path.parent != Path(".") else None

            # Check if it's a package
            is_package = False
            full_dir_path = root_path / dir_path
            if (full_dir_path / "__init__.py").exists():
                is_package = True
            elif (full_dir_path / "package.json").exists():
                is_package = True

            directory = Directory(
                path=dir_path,
                name=dir_path.name,
                parent_path=parent_path,
                depth=len(dir_path.parts),
                is_package=is_package,
                total_chunks=dir_chunks.get(dir_path, 0),
                languages=dict(dir_languages.get(dir_path, {})),
                last_modified=dir_modified.get(dir_path),
            )

            self.add_directory(directory)

        # Update subdirectory counts
        for directory in self.directories.values():
            subdirs = self.get_subdirectories(directory.path)
            directory.subdirectory_count = len(subdirs)

        logger.info(f"Rebuilt directory index with {len(self.directories)} directories, {sum(dir_chunks.values())} total chunks")

    def get_stats(self) -> dict[str, Any]:
        """Get directory index statistics.

        Returns:
            Dictionary with statistics
        """
        return {
            "total_directories": len(self.directories),
            "total_files": len(self.file_to_directory),
            "root_directories": len(self.get_root_directories()),
            "packages": sum(1 for d in self.directories.values() if d.is_package),
        }

    def reset(self) -> None:
        """Clear all directory data."""
        self.directories = {}
        self.file_to_directory = {}
        self.directory_files = defaultdict(list)

        if self.index_path.exists():
            self.index_path.unlink()

        logger.info("Directory index reset")
```
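A brief usage sketch of the new class, with hypothetical paths and illustrative output; `chunk_stats` is optional and omitted here:

```python
from pathlib import Path

from mcp_vector_search.core.directory_index import DirectoryIndex

# Index file lives under the project's .mcp-vector-search directory,
# the same location the indexer uses (see the indexer.py diff below).
index = DirectoryIndex(Path(".mcp-vector-search/directory_index.json"))
index.load()  # starts fresh if the file does not exist yet

# Rebuild from a hypothetical file list.
files = [Path("/repo/src/app/main.py"), Path("/repo/src/app/utils.py")]
index.rebuild_from_files(files, root_path=Path("/repo"))
index.save()

print(index.get_stats())
# e.g. {'total_directories': 2, 'total_files': 2,
#       'root_directories': 1, 'packages': 0}
```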
mcp_vector_search/core/indexer.py
CHANGED

```diff
@@ -15,6 +15,7 @@ from ..parsers.registry import get_parser_registry
 from ..utils.gitignore import create_gitignore_parser
 from ..utils.monorepo import MonorepoDetector
 from .database import VectorDatabase
+from .directory_index import DirectoryIndex
 from .exceptions import ParsingError
 from .models import CodeChunk
 
@@ -29,6 +30,7 @@ class SemanticIndexer:
         file_extensions: list[str],
         max_workers: int | None = None,
         batch_size: int = 10,
+        debug: bool = False,
     ) -> None:
         """Initialize semantic indexer.
 
@@ -38,12 +40,14 @@ class SemanticIndexer:
             file_extensions: File extensions to index
             max_workers: Maximum number of worker threads for parallel processing
             batch_size: Number of files to process in each batch
+            debug: Enable debug output for hierarchy building
         """
         self.database = database
         self.project_root = project_root
         self.file_extensions = {ext.lower() for ext in file_extensions}
         self.parser_registry = get_parser_registry()
         self._ignore_patterns = set(DEFAULT_IGNORE_PATTERNS)
+        self.debug = debug
 
         # Safely get event loop for max_workers
         try:
@@ -81,6 +85,13 @@ class SemanticIndexer:
         for sp in subprojects:
             logger.debug(f" - {sp.name} ({sp.relative_path})")
 
+        # Initialize directory index
+        self.directory_index = DirectoryIndex(
+            project_root / ".mcp-vector-search" / "directory_index.json"
+        )
+        # Load existing directory index
+        self.directory_index.load()
+
     async def index_project(
         self,
         force_reindex: bool = False,
@@ -156,6 +167,38 @@ class SemanticIndexer:
 
         self._save_index_metadata(metadata)
 
+        # Rebuild directory index from successfully indexed files
+        try:
+            logger.debug("Rebuilding directory index...")
+            # We don't have chunk counts here, but we have file modification times
+            # Build a simple stats dict with file mod times for recency tracking
+            chunk_stats = {}
+            for file_path in files_to_index:
+                try:
+                    mtime = os.path.getmtime(file_path)
+                    # For now, just track modification time
+                    # Chunk counts will be aggregated from the database later if needed
+                    chunk_stats[str(file_path)] = {
+                        'modified': mtime,
+                        'chunks': 1,  # Placeholder - real count from chunks
+                    }
+                except OSError:
+                    pass
+
+            self.directory_index.rebuild_from_files(
+                files_to_index, self.project_root, chunk_stats=chunk_stats
+            )
+            self.directory_index.save()
+            dir_stats = self.directory_index.get_stats()
+            logger.info(
+                f"Directory index updated: {dir_stats['total_directories']} directories, "
+                f"{dir_stats['total_files']} files"
+            )
+        except Exception as e:
+            logger.error(f"Failed to update directory index: {e}")
+            import traceback
+            logger.debug(traceback.format_exc())
+
         logger.info(
             f"Indexing complete: {indexed_count} files indexed, {failed_count} failed"
         )
@@ -307,6 +350,10 @@ class SemanticIndexer:
         # Build hierarchical relationships between chunks
         chunks_with_hierarchy = self._build_chunk_hierarchy(chunks)
 
+        # Debug: Check if hierarchy was built
+        methods_with_parents = sum(1 for c in chunks_with_hierarchy if c.chunk_type in ("method", "function") and c.parent_chunk_id)
+        logger.debug(f"After hierarchy build: {methods_with_parents}/{len([c for c in chunks_with_hierarchy if c.chunk_type in ('method', 'function')])} methods have parents")
+
         # Add chunks to database
         await self.database.add_chunks(chunks_with_hierarchy)
 
@@ -790,6 +837,15 @@ class SemanticIndexer:
         class_chunks = [c for c in chunks if c.chunk_type in ("class", "interface", "mixin")]
         function_chunks = [c for c in chunks if c.chunk_type in ("function", "method", "constructor")]
 
+        # DEBUG: Print what we have (if debug enabled)
+        if self.debug:
+            import sys
+            print(f"\n[DEBUG] Building hierarchy: {len(module_chunks)} modules, {len(class_chunks)} classes, {len(function_chunks)} functions", file=sys.stderr)
+            if class_chunks:
+                print(f"[DEBUG] Class names: {[c.class_name for c in class_chunks[:5]]}", file=sys.stderr)
+            if function_chunks:
+                print(f"[DEBUG] First 5 functions with class_name: {[(f.function_name, f.class_name) for f in function_chunks[:5]]}", file=sys.stderr)
+
         # Build relationships
         for func in function_chunks:
             if func.class_name:
@@ -803,6 +859,10 @@ class SemanticIndexer:
                     func.chunk_depth = parent_class.chunk_depth + 1
                 if func.chunk_id not in parent_class.child_chunk_ids:
                     parent_class.child_chunk_ids.append(func.chunk_id)
+                    if self.debug:
+                        import sys
+                        print(f"[DEBUG] ✓ Linked '{func.function_name}' to class '{parent_class.class_name}'", file=sys.stderr)
+                    logger.debug(f"Linked method '{func.function_name}' (ID: {func.chunk_id[:8]}) to class '{parent_class.class_name}' (ID: {parent_class.chunk_id[:8]})")
             else:
                 # Top-level function
                 if not func.chunk_depth:
@@ -828,6 +888,13 @@ class SemanticIndexer:
             if not mod.chunk_depth:
                 mod.chunk_depth = 0
 
+        # DEBUG: Print summary
+        if self.debug:
+            import sys
+            funcs_with_parents = sum(1 for f in function_chunks if f.parent_chunk_id)
+            classes_with_parents = sum(1 for c in class_chunks if c.parent_chunk_id)
+            print(f"[DEBUG] Hierarchy built: {funcs_with_parents}/{len(function_chunks)} functions linked, {classes_with_parents}/{len(class_chunks)} classes linked\n", file=sys.stderr)
+
         return chunks
 
     def _write_indexing_run_header(self) -> None:
```
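The `_build_chunk_hierarchy` changes above only add diagnostics; the linking itself pairs each function chunk with its enclosing class by `class_name`, records parent/child IDs, and bumps the child's depth. A standalone sketch of that idea, using plain dicts as hypothetical stand-ins for `CodeChunk`:

```python
# Minimal stand-ins for CodeChunk; only the fields the linking pass touches.
classes = [
    {"chunk_id": "c1", "class_name": "Foo", "chunk_depth": 1, "child_chunk_ids": []},
]
functions = [
    {"chunk_id": "f1", "function_name": "bar", "class_name": "Foo",
     "parent_chunk_id": None, "chunk_depth": 0},
    {"chunk_id": "f2", "function_name": "main", "class_name": None,
     "parent_chunk_id": None, "chunk_depth": 0},
]

by_class_name = {c["class_name"]: c for c in classes}

for func in functions:
    parent = by_class_name.get(func["class_name"])
    if parent is not None:
        # Method: hang it off its class and push it one level deeper.
        func["parent_chunk_id"] = parent["chunk_id"]
        func["chunk_depth"] = parent["chunk_depth"] + 1
        if func["chunk_id"] not in parent["child_chunk_ids"]:
            parent["child_chunk_ids"].append(func["chunk_id"])

linked = sum(1 for f in functions if f["parent_chunk_id"])
print(f"{linked}/{len(functions)} functions linked")  # -> 1/2 functions linked
```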
mcp_vector_search/core/models.py
CHANGED

```diff
@@ -206,6 +206,64 @@ class IndexStats(BaseModel):
         }
 
 
+@dataclass
+class Directory:
+    """Represents a directory in the project structure."""
+
+    path: Path  # Relative path from project root
+    name: str  # Directory name
+    parent_path: Path | None = None  # Parent directory path (None for root)
+    file_count: int = 0  # Number of files directly in this directory
+    subdirectory_count: int = 0  # Number of subdirectories
+    total_chunks: int = 0  # Total code chunks in this directory (recursive)
+    languages: dict[str, int] = None  # Language distribution in this directory
+    depth: int = 0  # Depth from project root (0 = root)
+    is_package: bool = False  # True if contains __init__.py or package.json
+    last_modified: float | None = None  # Most recent file modification time (unix timestamp)
+
+    def __post_init__(self) -> None:
+        """Initialize default values and generate directory ID."""
+        if self.languages is None:
+            self.languages = {}
+
+    @property
+    def id(self) -> str:
+        """Generate unique ID for this directory."""
+        import hashlib
+        return hashlib.sha256(str(self.path).encode()).hexdigest()[:16]
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for storage."""
+        return {
+            "path": str(self.path),
+            "name": self.name,
+            "parent_path": str(self.parent_path) if self.parent_path else None,
+            "file_count": self.file_count,
+            "subdirectory_count": self.subdirectory_count,
+            "total_chunks": self.total_chunks,
+            "languages": self.languages,
+            "depth": self.depth,
+            "is_package": self.is_package,
+            "last_modified": self.last_modified,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "Directory":
+        """Create from dictionary."""
+        return cls(
+            path=Path(data["path"]),
+            name=data["name"],
+            parent_path=Path(data["parent_path"]) if data.get("parent_path") else None,
+            file_count=data.get("file_count", 0),
+            subdirectory_count=data.get("subdirectory_count", 0),
+            total_chunks=data.get("total_chunks", 0),
+            languages=data.get("languages", {}),
+            depth=data.get("depth", 0),
+            is_package=data.get("is_package", False),
+            last_modified=data.get("last_modified"),
+        )
+
+
 class ProjectInfo(BaseModel):
     """Information about a project."""
 
```
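`to_dict` and `from_dict` are the persistence boundary for the directory index JSON file, so a quick round-trip check is a useful sanity test (values here are illustrative):

```python
from pathlib import Path

from mcp_vector_search.core.models import Directory

d = Directory(
    path=Path("src/app"),
    name="app",
    parent_path=Path("src"),
    depth=2,
    is_package=True,
    total_chunks=42,
)
restored = Directory.from_dict(d.to_dict())

assert restored.path == d.path and restored.is_package
print(restored.id)  # first 16 hex chars of sha256("src/app")
```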
mcp_vector_search/utils/gitignore.py
CHANGED

```diff
@@ -51,14 +51,24 @@ class GitignorePattern:
         Returns:
             True if the pattern matches
         """
-        # Directory-only patterns only match directories
-        if self.is_directory_only and not is_directory:
-            return False
-
         # Convert path separators for consistent matching
         path = path.replace("\\", "/")
         pattern = self.pattern.replace("\\", "/")
 
+        # For directory-only patterns, check if any parent directory matches
+        # This implements Git's behavior where "dir/" excludes both the directory
+        # AND all files within it recursively
+        if self.is_directory_only:
+            path_parts = path.split("/")
+            # Check each parent directory component
+            for i in range(1, len(path_parts) + 1):
+                parent = "/".join(path_parts[:i])
+                if fnmatch.fnmatch(parent, pattern):
+                    return True
+            # If no parent matches and this is not a directory, don't exclude
+            if not is_directory:
+                return False
+
         # Try exact match first
         if fnmatch.fnmatch(path, pattern):
             return True
```