mcp-vector-search 0.15.7 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mcp-vector-search might be problematic.
- mcp_vector_search/__init__.py +10 -0
- mcp_vector_search/cli/__init__.py +1 -0
- mcp_vector_search/cli/commands/__init__.py +1 -0
- mcp_vector_search/cli/commands/auto_index.py +397 -0
- mcp_vector_search/cli/commands/chat.py +534 -0
- mcp_vector_search/cli/commands/config.py +393 -0
- mcp_vector_search/cli/commands/demo.py +358 -0
- mcp_vector_search/cli/commands/index.py +762 -0
- mcp_vector_search/cli/commands/init.py +658 -0
- mcp_vector_search/cli/commands/install.py +869 -0
- mcp_vector_search/cli/commands/install_old.py +700 -0
- mcp_vector_search/cli/commands/mcp.py +1254 -0
- mcp_vector_search/cli/commands/reset.py +393 -0
- mcp_vector_search/cli/commands/search.py +796 -0
- mcp_vector_search/cli/commands/setup.py +1133 -0
- mcp_vector_search/cli/commands/status.py +584 -0
- mcp_vector_search/cli/commands/uninstall.py +404 -0
- mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
- mcp_vector_search/cli/commands/visualize/cli.py +265 -0
- mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
- mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
- mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
- mcp_vector_search/cli/commands/visualize/graph_builder.py +709 -0
- mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
- mcp_vector_search/cli/commands/visualize/server.py +201 -0
- mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
- mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
- mcp_vector_search/cli/commands/visualize/templates/base.py +218 -0
- mcp_vector_search/cli/commands/visualize/templates/scripts.py +3670 -0
- mcp_vector_search/cli/commands/visualize/templates/styles.py +779 -0
- mcp_vector_search/cli/commands/visualize.py.original +2536 -0
- mcp_vector_search/cli/commands/watch.py +287 -0
- mcp_vector_search/cli/didyoumean.py +520 -0
- mcp_vector_search/cli/export.py +320 -0
- mcp_vector_search/cli/history.py +295 -0
- mcp_vector_search/cli/interactive.py +342 -0
- mcp_vector_search/cli/main.py +484 -0
- mcp_vector_search/cli/output.py +414 -0
- mcp_vector_search/cli/suggestions.py +375 -0
- mcp_vector_search/config/__init__.py +1 -0
- mcp_vector_search/config/constants.py +24 -0
- mcp_vector_search/config/defaults.py +200 -0
- mcp_vector_search/config/settings.py +146 -0
- mcp_vector_search/core/__init__.py +1 -0
- mcp_vector_search/core/auto_indexer.py +298 -0
- mcp_vector_search/core/config_utils.py +394 -0
- mcp_vector_search/core/connection_pool.py +360 -0
- mcp_vector_search/core/database.py +1237 -0
- mcp_vector_search/core/directory_index.py +318 -0
- mcp_vector_search/core/embeddings.py +294 -0
- mcp_vector_search/core/exceptions.py +89 -0
- mcp_vector_search/core/factory.py +318 -0
- mcp_vector_search/core/git_hooks.py +345 -0
- mcp_vector_search/core/indexer.py +1002 -0
- mcp_vector_search/core/llm_client.py +453 -0
- mcp_vector_search/core/models.py +294 -0
- mcp_vector_search/core/project.py +350 -0
- mcp_vector_search/core/scheduler.py +330 -0
- mcp_vector_search/core/search.py +952 -0
- mcp_vector_search/core/watcher.py +322 -0
- mcp_vector_search/mcp/__init__.py +5 -0
- mcp_vector_search/mcp/__main__.py +25 -0
- mcp_vector_search/mcp/server.py +752 -0
- mcp_vector_search/parsers/__init__.py +8 -0
- mcp_vector_search/parsers/base.py +296 -0
- mcp_vector_search/parsers/dart.py +605 -0
- mcp_vector_search/parsers/html.py +413 -0
- mcp_vector_search/parsers/javascript.py +643 -0
- mcp_vector_search/parsers/php.py +694 -0
- mcp_vector_search/parsers/python.py +502 -0
- mcp_vector_search/parsers/registry.py +223 -0
- mcp_vector_search/parsers/ruby.py +678 -0
- mcp_vector_search/parsers/text.py +186 -0
- mcp_vector_search/parsers/utils.py +265 -0
- mcp_vector_search/py.typed +1 -0
- mcp_vector_search/utils/__init__.py +42 -0
- mcp_vector_search/utils/gitignore.py +250 -0
- mcp_vector_search/utils/gitignore_updater.py +212 -0
- mcp_vector_search/utils/monorepo.py +339 -0
- mcp_vector_search/utils/timing.py +338 -0
- mcp_vector_search/utils/version.py +47 -0
- mcp_vector_search-0.15.7.dist-info/METADATA +884 -0
- mcp_vector_search-0.15.7.dist-info/RECORD +86 -0
- mcp_vector_search-0.15.7.dist-info/WHEEL +4 -0
- mcp_vector_search-0.15.7.dist-info/entry_points.txt +3 -0
- mcp_vector_search-0.15.7.dist-info/licenses/LICENSE +21 -0
mcp_vector_search/core/directory_index.py (new file, +318 lines)
@@ -0,0 +1,318 @@
"""Directory index for tracking project structure and file relationships."""

import json
from collections import defaultdict
from pathlib import Path
from typing import Any

from loguru import logger

from .models import Directory


class DirectoryIndex:
    """Manages directory structure and file-directory relationships."""

    def __init__(self, index_path: Path) -> None:
        """Initialize directory index.

        Args:
            index_path: Path to directory index file (JSON)
        """
        self.index_path = index_path
        self.directories: dict[str, Directory] = {}  # path -> Directory
        self.file_to_directory: dict[str, str] = {}  # file_path -> directory_path
        self.directory_files: dict[str, list[str]] = defaultdict(
            list
        )  # dir_path -> [file_paths]

    def load(self) -> None:
        """Load directory index from disk."""
        if not self.index_path.exists():
            logger.debug("No directory index found, starting fresh")
            return

        try:
            with open(self.index_path) as f:
                data = json.load(f)

            # Load directories
            for dir_data in data.get("directories", []):
                directory = Directory.from_dict(dir_data)
                self.directories[str(directory.path)] = directory

            # Load file mappings
            self.file_to_directory = data.get("file_to_directory", {})

            # Rebuild directory_files from file_to_directory
            self.directory_files = defaultdict(list)
            for file_path, dir_path in self.file_to_directory.items():
                self.directory_files[dir_path].append(file_path)

            logger.info(f"Loaded {len(self.directories)} directories from index")

        except Exception as e:
            logger.error(f"Failed to load directory index: {e}")
            self.directories = {}
            self.file_to_directory = {}
            self.directory_files = defaultdict(list)

    def save(self) -> None:
        """Save directory index to disk."""
        try:
            # Ensure parent directory exists
            self.index_path.parent.mkdir(parents=True, exist_ok=True)

            data = {
                "directories": [d.to_dict() for d in self.directories.values()],
                "file_to_directory": self.file_to_directory,
            }

            with open(self.index_path, "w") as f:
                json.dump(data, f, indent=2)

            logger.debug(f"Saved {len(self.directories)} directories to index")

        except Exception as e:
            logger.error(f"Failed to save directory index: {e}")
            raise

    def add_directory(self, directory: Directory) -> None:
        """Add or update a directory in the index.

        Args:
            directory: Directory to add
        """
        dir_path = str(directory.path)
        self.directories[dir_path] = directory

    def add_file(self, file_path: Path, directory_path: Path) -> None:
        """Associate a file with its directory.

        Args:
            file_path: Path to the file
            directory_path: Path to the directory containing the file
        """
        file_path_str = str(file_path)
        dir_path_str = str(directory_path)

        self.file_to_directory[file_path_str] = dir_path_str
        if file_path_str not in self.directory_files[dir_path_str]:
            self.directory_files[dir_path_str].append(file_path_str)

        # Update directory file count
        if dir_path_str in self.directories:
            self.directories[dir_path_str].file_count = len(
                self.directory_files[dir_path_str]
            )

    def get_directory(self, directory_path: Path) -> Directory | None:
        """Get directory by path.

        Args:
            directory_path: Path to directory

        Returns:
            Directory object or None if not found
        """
        return self.directories.get(str(directory_path))

    def get_files_in_directory(self, directory_path: Path) -> list[str]:
        """Get all files in a directory.

        Args:
            directory_path: Path to directory

        Returns:
            List of file paths in the directory
        """
        return self.directory_files.get(str(directory_path), [])

    def get_subdirectories(self, directory_path: Path) -> list[Directory]:
        """Get all immediate subdirectories.

        Args:
            directory_path: Path to parent directory

        Returns:
            List of subdirectory objects
        """
        parent_path_str = str(directory_path)
        subdirs = []

        for _dir_path_str, directory in self.directories.items():
            if directory.parent_path and str(directory.parent_path) == parent_path_str:
                subdirs.append(directory)

        return subdirs

    def get_root_directories(self) -> list[Directory]:
        """Get all root-level directories (no parent).

        Returns:
            List of root directory objects
        """
        return [d for d in self.directories.values() if d.parent_path is None]

    def delete_directory(self, directory_path: Path) -> None:
        """Remove directory and its file associations.

        Args:
            directory_path: Path to directory to remove
        """
        dir_path_str = str(directory_path)

        # Remove directory
        if dir_path_str in self.directories:
            del self.directories[dir_path_str]

        # Remove file associations
        if dir_path_str in self.directory_files:
            for file_path in self.directory_files[dir_path_str]:
                if file_path in self.file_to_directory:
                    del self.file_to_directory[file_path]
            del self.directory_files[dir_path_str]

    def delete_file(self, file_path: Path) -> None:
        """Remove file from directory associations.

        Args:
            file_path: Path to file to remove
        """
        file_path_str = str(file_path)

        if file_path_str in self.file_to_directory:
            dir_path = self.file_to_directory[file_path_str]
            del self.file_to_directory[file_path_str]

            # Remove from directory_files
            if dir_path in self.directory_files:
                self.directory_files[dir_path] = [
                    f for f in self.directory_files[dir_path] if f != file_path_str
                ]

            # Update directory file count
            if dir_path in self.directories:
                self.directories[dir_path].file_count = len(
                    self.directory_files[dir_path]
                )

    def rebuild_from_files(
        self,
        file_paths: list[Path],
        root_path: Path,
        chunk_stats: dict[str, dict] | None = None,
    ) -> None:
        """Rebuild directory index from list of files with statistics from chunks.

        Args:
            file_paths: List of file paths to index
            root_path: Project root path
            chunk_stats: Optional dict mapping file_path -> {'chunks': count, 'language': str}
        """
        self.directories = {}
        self.file_to_directory = {}
        self.directory_files = defaultdict(list)

        # Track all unique directories and their statistics
        dir_set = set()
        dir_chunks = defaultdict(int)  # directory -> total chunks
        dir_languages = defaultdict(
            lambda: defaultdict(int)
        )  # directory -> {language: count}
        dir_modified = defaultdict(float)  # directory -> most recent modification time

        for file_path in file_paths:
            # Get relative path from root
            try:
                rel_path = file_path.relative_to(root_path)
                parent_dir = rel_path.parent

                # Add all parent directories up to root
                current = parent_dir
                while current != Path("."):
                    dir_set.add(current)

                    # Accumulate statistics up the directory tree
                    if chunk_stats and str(file_path) in chunk_stats:
                        stats = chunk_stats[str(file_path)]
                        dir_chunks[current] += stats.get("chunks", 0)
                        if "language" in stats:
                            dir_languages[current][stats["language"]] += stats.get(
                                "chunks", 0
                            )
                        # Track most recent modification time
                        if "modified" in stats:
                            dir_modified[current] = max(
                                dir_modified.get(current, 0), stats["modified"]
                            )

                    current = current.parent

                # Associate file with its direct parent
                if parent_dir != Path("."):
                    self.add_file(rel_path, parent_dir)

            except ValueError:
                # File not relative to root, skip
                logger.warning(f"File {file_path} not under root {root_path}")
                continue

        # Create Directory objects for all directories
        for dir_path in sorted(dir_set):
            # Determine parent
            parent_path = dir_path.parent if dir_path.parent != Path(".") else None

            # Check if it's a package
            is_package = False
            full_dir_path = root_path / dir_path
            if (full_dir_path / "__init__.py").exists():
                is_package = True
            elif (full_dir_path / "package.json").exists():
                is_package = True

            directory = Directory(
                path=dir_path,
                name=dir_path.name,
                parent_path=parent_path,
                depth=len(dir_path.parts),
                is_package=is_package,
                total_chunks=dir_chunks.get(dir_path, 0),
                languages=dict(dir_languages.get(dir_path, {})),
                last_modified=dir_modified.get(dir_path),
            )

            self.add_directory(directory)

        # Update subdirectory counts
        for directory in self.directories.values():
            subdirs = self.get_subdirectories(directory.path)
            directory.subdirectory_count = len(subdirs)

        logger.info(
            f"Rebuilt directory index with {len(self.directories)} directories, {sum(dir_chunks.values())} total chunks"
        )

    def get_stats(self) -> dict[str, Any]:
        """Get directory index statistics.

        Returns:
            Dictionary with statistics
        """
        return {
            "total_directories": len(self.directories),
            "total_files": len(self.file_to_directory),
            "root_directories": len(self.get_root_directories()),
            "packages": sum(1 for d in self.directories.values() if d.is_package),
        }

    def reset(self) -> None:
        """Clear all directory data."""
        self.directories = {}
        self.file_to_directory = {}
        self.directory_files = defaultdict(list)

        if self.index_path.exists():
            self.index_path.unlink()

        logger.info("Directory index reset")
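For orientation, a minimal usage sketch of the DirectoryIndex class above. It assumes the Directory model from mcp_vector_search.core.models (not part of this diff) accepts the fields referenced in rebuild_from_files; the paths and index location are placeholders, not package defaults.

from pathlib import Path

from mcp_vector_search.core.directory_index import DirectoryIndex

# Hypothetical index location; the real path is chosen by the package's config.
index = DirectoryIndex(Path(".mcp-vector-search/directory_index.json"))
index.load()  # starts fresh if the JSON file does not exist yet

# Rebuild from a scanned file list; chunk_stats is optional per-file metadata.
files = [Path("/repo/src/app/main.py"), Path("/repo/src/app/utils/io.py")]
index.rebuild_from_files(files, root_path=Path("/repo"))

print(index.get_stats())  # directory/file/package counts
print(index.get_files_in_directory(Path("src/app")))  # relative paths under src/app
index.save()  # writes the JSON index to disk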
mcp_vector_search/core/embeddings.py (new file, +294 lines)
@@ -0,0 +1,294 @@
"""Embedding generation for MCP Vector Search."""

import hashlib
import json
from pathlib import Path

import aiofiles
from loguru import logger
from sentence_transformers import SentenceTransformer

from .exceptions import EmbeddingError


class EmbeddingCache:
    """LRU cache for embeddings with disk persistence."""

    def __init__(self, cache_dir: Path, max_size: int = 1000) -> None:
        """Initialize embedding cache.

        Args:
            cache_dir: Directory to store cached embeddings
            max_size: Maximum number of embeddings to keep in memory
        """
        self.cache_dir = cache_dir
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.max_size = max_size
        self._memory_cache: dict[str, list[float]] = {}
        self._access_order: list[str] = []  # For LRU eviction
        self._cache_hits = 0
        self._cache_misses = 0

    def _hash_content(self, content: str) -> str:
        """Generate cache key from content."""
        return hashlib.sha256(content.encode()).hexdigest()[:16]

    async def get_embedding(self, content: str) -> list[float] | None:
        """Get cached embedding for content."""
        cache_key = self._hash_content(content)

        # Check memory cache first
        if cache_key in self._memory_cache:
            self._cache_hits += 1
            # Move to end for LRU
            self._access_order.remove(cache_key)
            self._access_order.append(cache_key)
            return self._memory_cache[cache_key]

        # Check disk cache
        cache_file = self.cache_dir / f"{cache_key}.json"
        if cache_file.exists():
            try:
                async with aiofiles.open(cache_file) as f:
                    content_str = await f.read()
                    embedding = json.loads(content_str)

                # Add to memory cache with LRU management
                self._add_to_memory_cache(cache_key, embedding)
                self._cache_hits += 1
                return embedding
            except Exception as e:
                logger.warning(f"Failed to load cached embedding: {e}")

        self._cache_misses += 1
        return None

    async def store_embedding(self, content: str, embedding: list[float]) -> None:
        """Store embedding in cache."""
        cache_key = self._hash_content(content)

        # Store in memory cache with LRU management
        self._add_to_memory_cache(cache_key, embedding)

        # Store in disk cache
        cache_file = self.cache_dir / f"{cache_key}.json"
        try:
            async with aiofiles.open(cache_file, "w") as f:
                await f.write(json.dumps(embedding))
        except Exception as e:
            logger.warning(f"Failed to cache embedding: {e}")

    def _add_to_memory_cache(self, cache_key: str, embedding: list[float]) -> None:
        """Add embedding to memory cache with LRU eviction.

        Args:
            cache_key: Cache key for the embedding
            embedding: Embedding vector to cache
        """
        # If already in cache, update and move to end
        if cache_key in self._memory_cache:
            self._access_order.remove(cache_key)
            self._access_order.append(cache_key)
            self._memory_cache[cache_key] = embedding
            return

        # If cache is full, evict least recently used
        if len(self._memory_cache) >= self.max_size:
            lru_key = self._access_order.pop(0)
            del self._memory_cache[lru_key]

        # Add new embedding
        self._memory_cache[cache_key] = embedding
        self._access_order.append(cache_key)

    def clear_memory_cache(self) -> None:
        """Clear the in-memory cache."""
        self._memory_cache.clear()
        self._access_order.clear()

    def get_cache_stats(self) -> dict[str, any]:
        """Get cache performance statistics.

        Returns:
            Dictionary with cache statistics
        """
        total_requests = self._cache_hits + self._cache_misses
        hit_rate = self._cache_hits / total_requests if total_requests > 0 else 0.0
        disk_files = (
            len(list(self.cache_dir.glob("*.json"))) if self.cache_dir.exists() else 0
        )

        return {
            "memory_cache_size": len(self._memory_cache),
            "memory_cached": len(self._memory_cache),  # Alias for compatibility
            "max_cache_size": self.max_size,
            "memory_limit": self.max_size,  # Alias for compatibility
            "cache_hits": self._cache_hits,
            "cache_misses": self._cache_misses,
            "hit_rate": round(hit_rate, 3),
            "disk_cache_files": disk_files,
            "disk_cached": disk_files,  # Alias for compatibility
        }


class CodeBERTEmbeddingFunction:
    """ChromaDB-compatible embedding function using CodeBERT."""

    def __init__(self, model_name: str = "microsoft/codebert-base") -> None:
        """Initialize CodeBERT embedding function.

        Args:
            model_name: Name of the sentence transformer model
        """
        try:
            self.model = SentenceTransformer(model_name)
            self.model_name = model_name
            logger.info(f"Loaded embedding model: {model_name}")
        except Exception as e:
            logger.error(f"Failed to load embedding model {model_name}: {e}")
            raise EmbeddingError(f"Failed to load embedding model: {e}") from e

    def __call__(self, input: list[str]) -> list[list[float]]:
        """Generate embeddings for input texts (ChromaDB interface)."""
        try:
            embeddings = self.model.encode(input, convert_to_numpy=True)
            return embeddings.tolist()
        except Exception as e:
            logger.error(f"Failed to generate embeddings: {e}")
            raise EmbeddingError(f"Failed to generate embeddings: {e}") from e


class BatchEmbeddingProcessor:
    """Batch processing for efficient embedding generation with caching."""

    def __init__(
        self,
        embedding_function: CodeBERTEmbeddingFunction,
        cache: EmbeddingCache | None = None,
        batch_size: int = 32,
    ) -> None:
        """Initialize batch embedding processor.

        Args:
            embedding_function: Function to generate embeddings
            cache: Optional embedding cache
            batch_size: Size of batches for processing
        """
        self.embedding_function = embedding_function
        self.cache = cache
        self.batch_size = batch_size

    async def process_batch(self, contents: list[str]) -> list[list[float]]:
        """Process a batch of content for embeddings.

        Args:
            contents: List of text content to embed

        Returns:
            List of embeddings
        """
        if not contents:
            return []

        embeddings = []
        uncached_contents = []
        uncached_indices = []

        # Check cache for each content if cache is available
        if self.cache:
            for i, content in enumerate(contents):
                cached_embedding = await self.cache.get_embedding(content)
                if cached_embedding:
                    embeddings.append(cached_embedding)
                else:
                    embeddings.append(None)  # Placeholder
                    uncached_contents.append(content)
                    uncached_indices.append(i)
        else:
            # No cache, process all content
            uncached_contents = contents
            uncached_indices = list(range(len(contents)))
            embeddings = [None] * len(contents)

        # Generate embeddings for uncached content
        if uncached_contents:
            logger.debug(f"Generating {len(uncached_contents)} new embeddings")

            try:
                new_embeddings = []
                for i in range(0, len(uncached_contents), self.batch_size):
                    batch = uncached_contents[i : i + self.batch_size]
                    batch_embeddings = self.embedding_function(batch)
                    new_embeddings.extend(batch_embeddings)

                # Cache new embeddings and fill placeholders
                for i, (content, embedding) in enumerate(
                    zip(uncached_contents, new_embeddings, strict=False)
                ):
                    if self.cache:
                        await self.cache.store_embedding(content, embedding)
                    embeddings[uncached_indices[i]] = embedding

            except Exception as e:
                logger.error(f"Failed to generate embeddings: {e}")
                raise EmbeddingError(f"Failed to generate embeddings: {e}") from e

        return embeddings

    def get_stats(self) -> dict[str, any]:
        """Get processor statistics."""
        stats = {
            "model_name": self.embedding_function.model_name,
            "batch_size": self.batch_size,
            "cache_enabled": self.cache is not None,
        }

        if self.cache:
            stats.update(self.cache.get_cache_stats())

        return stats


def create_embedding_function(
    model_name: str = "microsoft/codebert-base",
    cache_dir: Path | None = None,
    cache_size: int = 1000,
):
    """Create embedding function and cache.

    Args:
        model_name: Name of the embedding model
        cache_dir: Directory for caching embeddings
        cache_size: Maximum cache size

    Returns:
        Tuple of (embedding_function, cache)
    """
    try:
        # Use ChromaDB's built-in sentence transformer function
        from chromadb.utils import embedding_functions

        # Map our model names to sentence-transformers compatible names
        model_mapping = {
            "microsoft/codebert-base": "sentence-transformers/all-MiniLM-L6-v2",  # Fallback to working model
            "microsoft/unixcoder-base": "sentence-transformers/all-MiniLM-L6-v2",  # Fallback to working model
        }

        actual_model = model_mapping.get(model_name, model_name)

        embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
            model_name=actual_model
        )

        logger.debug(f"Created ChromaDB embedding function with model: {actual_model}")

    except Exception as e:
        logger.warning(f"Failed to create ChromaDB embedding function: {e}")
        # Fallback to our custom implementation
        embedding_function = CodeBERTEmbeddingFunction(model_name)

    cache = None
    if cache_dir:
        cache = EmbeddingCache(cache_dir, cache_size)

    return embedding_function, cache
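A hedged wiring sketch for the embedding pipeline above: create_embedding_function builds the (possibly ChromaDB-backed) embedding callable plus an optional disk cache, and BatchEmbeddingProcessor batches the encoding and consults the cache. The cache directory, batch size, and sample snippet are illustrative values, not package defaults.

import asyncio
from pathlib import Path

from mcp_vector_search.core.embeddings import (
    BatchEmbeddingProcessor,
    create_embedding_function,
)


async def main() -> None:
    # Hypothetical cache location; any writable directory works for this sketch.
    embedding_function, cache = create_embedding_function(
        cache_dir=Path(".mcp-vector-search/embeddings"),
        cache_size=500,
    )
    processor = BatchEmbeddingProcessor(embedding_function, cache=cache, batch_size=16)

    # First call encodes and caches; repeating the same content becomes a cache hit.
    vectors = await processor.process_batch(["def add(a, b):\n    return a + b"])
    print(len(vectors), len(vectors[0]))  # one embedding, model-dependent dimension
    print(cache.get_cache_stats())  # hit/miss counters and disk cache size


asyncio.run(main())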
mcp_vector_search/core/exceptions.py (new file, +89 lines)
@@ -0,0 +1,89 @@
"""Custom exception hierarchy for MCP Vector Search."""

from typing import Any


class MCPVectorSearchError(Exception):
    """Base exception for MCP Vector Search."""

    def __init__(self, message: str, context: dict[str, Any] | None = None) -> None:
        super().__init__(message)
        self.context = context or {}


class DatabaseError(MCPVectorSearchError):
    """Database-related errors."""

    pass


class DatabaseInitializationError(DatabaseError):
    """Database initialization failed."""

    pass


class DatabaseNotInitializedError(DatabaseError):
    """Operation attempted on uninitialized database."""

    pass


class ConnectionPoolError(DatabaseError):
    """Connection pool operation failed."""

    pass


class DocumentAdditionError(DatabaseError):
    """Failed to add documents to database."""

    pass


class SearchError(DatabaseError):
    """Search operation failed."""

    pass


class IndexCorruptionError(DatabaseError):
    """Index corruption detected."""

    pass


class ParsingError(MCPVectorSearchError):
    """Code parsing errors."""

    pass


class EmbeddingError(MCPVectorSearchError):
    """Embedding generation errors."""

    pass


class ConfigurationError(MCPVectorSearchError):
    """Configuration validation errors."""

    pass


class ProjectError(MCPVectorSearchError):
    """Project management errors."""

    pass


class ProjectNotFoundError(ProjectError):
    """Project directory or configuration not found."""

    pass


class ProjectInitializationError(ProjectError):
    """Failed to initialize project."""

    pass