mcp_vector_search-0.15.7-py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects changes between package versions as they appear in their public registries.

Potentially problematic release: this version of mcp-vector-search might be problematic.

Files changed (86)
  1. mcp_vector_search/__init__.py +10 -0
  2. mcp_vector_search/cli/__init__.py +1 -0
  3. mcp_vector_search/cli/commands/__init__.py +1 -0
  4. mcp_vector_search/cli/commands/auto_index.py +397 -0
  5. mcp_vector_search/cli/commands/chat.py +534 -0
  6. mcp_vector_search/cli/commands/config.py +393 -0
  7. mcp_vector_search/cli/commands/demo.py +358 -0
  8. mcp_vector_search/cli/commands/index.py +762 -0
  9. mcp_vector_search/cli/commands/init.py +658 -0
  10. mcp_vector_search/cli/commands/install.py +869 -0
  11. mcp_vector_search/cli/commands/install_old.py +700 -0
  12. mcp_vector_search/cli/commands/mcp.py +1254 -0
  13. mcp_vector_search/cli/commands/reset.py +393 -0
  14. mcp_vector_search/cli/commands/search.py +796 -0
  15. mcp_vector_search/cli/commands/setup.py +1133 -0
  16. mcp_vector_search/cli/commands/status.py +584 -0
  17. mcp_vector_search/cli/commands/uninstall.py +404 -0
  18. mcp_vector_search/cli/commands/visualize/__init__.py +39 -0
  19. mcp_vector_search/cli/commands/visualize/cli.py +265 -0
  20. mcp_vector_search/cli/commands/visualize/exporters/__init__.py +12 -0
  21. mcp_vector_search/cli/commands/visualize/exporters/html_exporter.py +33 -0
  22. mcp_vector_search/cli/commands/visualize/exporters/json_exporter.py +29 -0
  23. mcp_vector_search/cli/commands/visualize/graph_builder.py +709 -0
  24. mcp_vector_search/cli/commands/visualize/layout_engine.py +469 -0
  25. mcp_vector_search/cli/commands/visualize/server.py +201 -0
  26. mcp_vector_search/cli/commands/visualize/state_manager.py +428 -0
  27. mcp_vector_search/cli/commands/visualize/templates/__init__.py +16 -0
  28. mcp_vector_search/cli/commands/visualize/templates/base.py +218 -0
  29. mcp_vector_search/cli/commands/visualize/templates/scripts.py +3670 -0
  30. mcp_vector_search/cli/commands/visualize/templates/styles.py +779 -0
  31. mcp_vector_search/cli/commands/visualize.py.original +2536 -0
  32. mcp_vector_search/cli/commands/watch.py +287 -0
  33. mcp_vector_search/cli/didyoumean.py +520 -0
  34. mcp_vector_search/cli/export.py +320 -0
  35. mcp_vector_search/cli/history.py +295 -0
  36. mcp_vector_search/cli/interactive.py +342 -0
  37. mcp_vector_search/cli/main.py +484 -0
  38. mcp_vector_search/cli/output.py +414 -0
  39. mcp_vector_search/cli/suggestions.py +375 -0
  40. mcp_vector_search/config/__init__.py +1 -0
  41. mcp_vector_search/config/constants.py +24 -0
  42. mcp_vector_search/config/defaults.py +200 -0
  43. mcp_vector_search/config/settings.py +146 -0
  44. mcp_vector_search/core/__init__.py +1 -0
  45. mcp_vector_search/core/auto_indexer.py +298 -0
  46. mcp_vector_search/core/config_utils.py +394 -0
  47. mcp_vector_search/core/connection_pool.py +360 -0
  48. mcp_vector_search/core/database.py +1237 -0
  49. mcp_vector_search/core/directory_index.py +318 -0
  50. mcp_vector_search/core/embeddings.py +294 -0
  51. mcp_vector_search/core/exceptions.py +89 -0
  52. mcp_vector_search/core/factory.py +318 -0
  53. mcp_vector_search/core/git_hooks.py +345 -0
  54. mcp_vector_search/core/indexer.py +1002 -0
  55. mcp_vector_search/core/llm_client.py +453 -0
  56. mcp_vector_search/core/models.py +294 -0
  57. mcp_vector_search/core/project.py +350 -0
  58. mcp_vector_search/core/scheduler.py +330 -0
  59. mcp_vector_search/core/search.py +952 -0
  60. mcp_vector_search/core/watcher.py +322 -0
  61. mcp_vector_search/mcp/__init__.py +5 -0
  62. mcp_vector_search/mcp/__main__.py +25 -0
  63. mcp_vector_search/mcp/server.py +752 -0
  64. mcp_vector_search/parsers/__init__.py +8 -0
  65. mcp_vector_search/parsers/base.py +296 -0
  66. mcp_vector_search/parsers/dart.py +605 -0
  67. mcp_vector_search/parsers/html.py +413 -0
  68. mcp_vector_search/parsers/javascript.py +643 -0
  69. mcp_vector_search/parsers/php.py +694 -0
  70. mcp_vector_search/parsers/python.py +502 -0
  71. mcp_vector_search/parsers/registry.py +223 -0
  72. mcp_vector_search/parsers/ruby.py +678 -0
  73. mcp_vector_search/parsers/text.py +186 -0
  74. mcp_vector_search/parsers/utils.py +265 -0
  75. mcp_vector_search/py.typed +1 -0
  76. mcp_vector_search/utils/__init__.py +42 -0
  77. mcp_vector_search/utils/gitignore.py +250 -0
  78. mcp_vector_search/utils/gitignore_updater.py +212 -0
  79. mcp_vector_search/utils/monorepo.py +339 -0
  80. mcp_vector_search/utils/timing.py +338 -0
  81. mcp_vector_search/utils/version.py +47 -0
  82. mcp_vector_search-0.15.7.dist-info/METADATA +884 -0
  83. mcp_vector_search-0.15.7.dist-info/RECORD +86 -0
  84. mcp_vector_search-0.15.7.dist-info/WHEEL +4 -0
  85. mcp_vector_search-0.15.7.dist-info/entry_points.txt +3 -0
  86. mcp_vector_search-0.15.7.dist-info/licenses/LICENSE +21 -0
mcp_vector_search/core/directory_index.py
@@ -0,0 +1,318 @@
+"""Directory index for tracking project structure and file relationships."""
+
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+from loguru import logger
+
+from .models import Directory
+
+
+class DirectoryIndex:
+    """Manages directory structure and file-directory relationships."""
+
+    def __init__(self, index_path: Path) -> None:
+        """Initialize directory index.
+
+        Args:
+            index_path: Path to directory index file (JSON)
+        """
+        self.index_path = index_path
+        self.directories: dict[str, Directory] = {}  # path -> Directory
+        self.file_to_directory: dict[str, str] = {}  # file_path -> directory_path
+        self.directory_files: dict[str, list[str]] = defaultdict(
+            list
+        )  # dir_path -> [file_paths]
+
+    def load(self) -> None:
+        """Load directory index from disk."""
+        if not self.index_path.exists():
+            logger.debug("No directory index found, starting fresh")
+            return
+
+        try:
+            with open(self.index_path) as f:
+                data = json.load(f)
+
+            # Load directories
+            for dir_data in data.get("directories", []):
+                directory = Directory.from_dict(dir_data)
+                self.directories[str(directory.path)] = directory
+
+            # Load file mappings
+            self.file_to_directory = data.get("file_to_directory", {})
+
+            # Rebuild directory_files from file_to_directory
+            self.directory_files = defaultdict(list)
+            for file_path, dir_path in self.file_to_directory.items():
+                self.directory_files[dir_path].append(file_path)
+
+            logger.info(f"Loaded {len(self.directories)} directories from index")
+
+        except Exception as e:
+            logger.error(f"Failed to load directory index: {e}")
+            self.directories = {}
+            self.file_to_directory = {}
+            self.directory_files = defaultdict(list)
+
+    def save(self) -> None:
+        """Save directory index to disk."""
+        try:
+            # Ensure parent directory exists
+            self.index_path.parent.mkdir(parents=True, exist_ok=True)
+
+            data = {
+                "directories": [d.to_dict() for d in self.directories.values()],
+                "file_to_directory": self.file_to_directory,
+            }
+
+            with open(self.index_path, "w") as f:
+                json.dump(data, f, indent=2)
+
+            logger.debug(f"Saved {len(self.directories)} directories to index")
+
+        except Exception as e:
+            logger.error(f"Failed to save directory index: {e}")
+            raise
+
+    def add_directory(self, directory: Directory) -> None:
+        """Add or update a directory in the index.
+
+        Args:
+            directory: Directory to add
+        """
+        dir_path = str(directory.path)
+        self.directories[dir_path] = directory
+
+    def add_file(self, file_path: Path, directory_path: Path) -> None:
+        """Associate a file with its directory.
+
+        Args:
+            file_path: Path to the file
+            directory_path: Path to the directory containing the file
+        """
+        file_path_str = str(file_path)
+        dir_path_str = str(directory_path)
+
+        self.file_to_directory[file_path_str] = dir_path_str
+        if file_path_str not in self.directory_files[dir_path_str]:
+            self.directory_files[dir_path_str].append(file_path_str)
+
+        # Update directory file count
+        if dir_path_str in self.directories:
+            self.directories[dir_path_str].file_count = len(
+                self.directory_files[dir_path_str]
+            )
+
+    def get_directory(self, directory_path: Path) -> Directory | None:
+        """Get directory by path.
+
+        Args:
+            directory_path: Path to directory
+
+        Returns:
+            Directory object or None if not found
+        """
+        return self.directories.get(str(directory_path))
+
+    def get_files_in_directory(self, directory_path: Path) -> list[str]:
+        """Get all files in a directory.
+
+        Args:
+            directory_path: Path to directory
+
+        Returns:
+            List of file paths in the directory
+        """
+        return self.directory_files.get(str(directory_path), [])
+
+    def get_subdirectories(self, directory_path: Path) -> list[Directory]:
+        """Get all immediate subdirectories.
+
+        Args:
+            directory_path: Path to parent directory
+
+        Returns:
+            List of subdirectory objects
+        """
+        parent_path_str = str(directory_path)
+        subdirs = []
+
+        for _dir_path_str, directory in self.directories.items():
+            if directory.parent_path and str(directory.parent_path) == parent_path_str:
+                subdirs.append(directory)
+
+        return subdirs
+
+    def get_root_directories(self) -> list[Directory]:
+        """Get all root-level directories (no parent).
+
+        Returns:
+            List of root directory objects
+        """
+        return [d for d in self.directories.values() if d.parent_path is None]
+
+    def delete_directory(self, directory_path: Path) -> None:
+        """Remove directory and its file associations.
+
+        Args:
+            directory_path: Path to directory to remove
+        """
+        dir_path_str = str(directory_path)
+
+        # Remove directory
+        if dir_path_str in self.directories:
+            del self.directories[dir_path_str]
+
+        # Remove file associations
+        if dir_path_str in self.directory_files:
+            for file_path in self.directory_files[dir_path_str]:
+                if file_path in self.file_to_directory:
+                    del self.file_to_directory[file_path]
+            del self.directory_files[dir_path_str]
+
+    def delete_file(self, file_path: Path) -> None:
+        """Remove file from directory associations.
+
+        Args:
+            file_path: Path to file to remove
+        """
+        file_path_str = str(file_path)
+
+        if file_path_str in self.file_to_directory:
+            dir_path = self.file_to_directory[file_path_str]
+            del self.file_to_directory[file_path_str]
+
+            # Remove from directory_files
+            if dir_path in self.directory_files:
+                self.directory_files[dir_path] = [
+                    f for f in self.directory_files[dir_path] if f != file_path_str
+                ]
+
+            # Update directory file count
+            if dir_path in self.directories:
+                self.directories[dir_path].file_count = len(
+                    self.directory_files[dir_path]
+                )
+
+    def rebuild_from_files(
+        self,
+        file_paths: list[Path],
+        root_path: Path,
+        chunk_stats: dict[str, dict] | None = None,
+    ) -> None:
+        """Rebuild directory index from list of files with statistics from chunks.
+
+        Args:
+            file_paths: List of file paths to index
+            root_path: Project root path
+            chunk_stats: Optional dict mapping file_path -> {'chunks': count, 'language': str}
+        """
+        self.directories = {}
+        self.file_to_directory = {}
+        self.directory_files = defaultdict(list)
+
+        # Track all unique directories and their statistics
+        dir_set = set()
+        dir_chunks = defaultdict(int)  # directory -> total chunks
+        dir_languages = defaultdict(
+            lambda: defaultdict(int)
+        )  # directory -> {language: count}
+        dir_modified = defaultdict(float)  # directory -> most recent modification time
+
+        for file_path in file_paths:
+            # Get relative path from root
+            try:
+                rel_path = file_path.relative_to(root_path)
+                parent_dir = rel_path.parent
+
+                # Add all parent directories up to root
+                current = parent_dir
+                while current != Path("."):
+                    dir_set.add(current)
+
+                    # Accumulate statistics up the directory tree
+                    if chunk_stats and str(file_path) in chunk_stats:
+                        stats = chunk_stats[str(file_path)]
+                        dir_chunks[current] += stats.get("chunks", 0)
+                        if "language" in stats:
+                            dir_languages[current][stats["language"]] += stats.get(
+                                "chunks", 0
+                            )
+                        # Track most recent modification time
+                        if "modified" in stats:
+                            dir_modified[current] = max(
+                                dir_modified.get(current, 0), stats["modified"]
+                            )
+
+                    current = current.parent
+
+                # Associate file with its direct parent
+                if parent_dir != Path("."):
+                    self.add_file(rel_path, parent_dir)
+
+            except ValueError:
+                # File not relative to root, skip
+                logger.warning(f"File {file_path} not under root {root_path}")
+                continue
+
+        # Create Directory objects for all directories
+        for dir_path in sorted(dir_set):
+            # Determine parent
+            parent_path = dir_path.parent if dir_path.parent != Path(".") else None
+
+            # Check if it's a package
+            is_package = False
+            full_dir_path = root_path / dir_path
+            if (full_dir_path / "__init__.py").exists():
+                is_package = True
+            elif (full_dir_path / "package.json").exists():
+                is_package = True
+
+            directory = Directory(
+                path=dir_path,
+                name=dir_path.name,
+                parent_path=parent_path,
+                depth=len(dir_path.parts),
+                is_package=is_package,
+                total_chunks=dir_chunks.get(dir_path, 0),
+                languages=dict(dir_languages.get(dir_path, {})),
+                last_modified=dir_modified.get(dir_path),
+            )
+
+            self.add_directory(directory)
+
+        # Update subdirectory counts
+        for directory in self.directories.values():
+            subdirs = self.get_subdirectories(directory.path)
+            directory.subdirectory_count = len(subdirs)
+
+        logger.info(
+            f"Rebuilt directory index with {len(self.directories)} directories, {sum(dir_chunks.values())} total chunks"
+        )
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get directory index statistics.
+
+        Returns:
+            Dictionary with statistics
+        """
+        return {
+            "total_directories": len(self.directories),
+            "total_files": len(self.file_to_directory),
+            "root_directories": len(self.get_root_directories()),
+            "packages": sum(1 for d in self.directories.values() if d.is_package),
+        }
+
+    def reset(self) -> None:
+        """Clear all directory data."""
+        self.directories = {}
+        self.file_to_directory = {}
+        self.directory_files = defaultdict(list)
+
+        if self.index_path.exists():
+            self.index_path.unlink()
+
+        logger.info("Directory index reset")
mcp_vector_search/core/embeddings.py
@@ -0,0 +1,294 @@
+"""Embedding generation for MCP Vector Search."""
+
+import hashlib
+import json
+from pathlib import Path
+from typing import Any
+
+import aiofiles
+from loguru import logger
+from sentence_transformers import SentenceTransformer
+
+from .exceptions import EmbeddingError
+
+
+class EmbeddingCache:
+    """LRU cache for embeddings with disk persistence."""
+
+    def __init__(self, cache_dir: Path, max_size: int = 1000) -> None:
+        """Initialize embedding cache.
+
+        Args:
+            cache_dir: Directory to store cached embeddings
+            max_size: Maximum number of embeddings to keep in memory
+        """
+        self.cache_dir = cache_dir
+        self.cache_dir.mkdir(parents=True, exist_ok=True)
+        self.max_size = max_size
+        self._memory_cache: dict[str, list[float]] = {}
+        self._access_order: list[str] = []  # For LRU eviction
+        self._cache_hits = 0
+        self._cache_misses = 0
+
+    def _hash_content(self, content: str) -> str:
+        """Generate cache key from content."""
+        return hashlib.sha256(content.encode()).hexdigest()[:16]
+
+    async def get_embedding(self, content: str) -> list[float] | None:
+        """Get cached embedding for content."""
+        cache_key = self._hash_content(content)
+
+        # Check memory cache first
+        if cache_key in self._memory_cache:
+            self._cache_hits += 1
+            # Move to end for LRU
+            self._access_order.remove(cache_key)
+            self._access_order.append(cache_key)
+            return self._memory_cache[cache_key]
+
+        # Check disk cache
+        cache_file = self.cache_dir / f"{cache_key}.json"
+        if cache_file.exists():
+            try:
+                async with aiofiles.open(cache_file) as f:
+                    content_str = await f.read()
+                embedding = json.loads(content_str)
+
+                # Add to memory cache with LRU management
+                self._add_to_memory_cache(cache_key, embedding)
+                self._cache_hits += 1
+                return embedding
+            except Exception as e:
+                logger.warning(f"Failed to load cached embedding: {e}")
+
+        self._cache_misses += 1
+        return None
+
+    async def store_embedding(self, content: str, embedding: list[float]) -> None:
+        """Store embedding in cache."""
+        cache_key = self._hash_content(content)
+
+        # Store in memory cache with LRU management
+        self._add_to_memory_cache(cache_key, embedding)
+
+        # Store in disk cache
+        cache_file = self.cache_dir / f"{cache_key}.json"
+        try:
+            async with aiofiles.open(cache_file, "w") as f:
+                await f.write(json.dumps(embedding))
+        except Exception as e:
+            logger.warning(f"Failed to cache embedding: {e}")
+
+    def _add_to_memory_cache(self, cache_key: str, embedding: list[float]) -> None:
+        """Add embedding to memory cache with LRU eviction.
+
+        Args:
+            cache_key: Cache key for the embedding
+            embedding: Embedding vector to cache
+        """
+        # If already in cache, update and move to end
+        if cache_key in self._memory_cache:
+            self._access_order.remove(cache_key)
+            self._access_order.append(cache_key)
+            self._memory_cache[cache_key] = embedding
+            return
+
+        # If cache is full, evict least recently used
+        if len(self._memory_cache) >= self.max_size:
+            lru_key = self._access_order.pop(0)
+            del self._memory_cache[lru_key]
+
+        # Add new embedding
+        self._memory_cache[cache_key] = embedding
+        self._access_order.append(cache_key)
+
+    def clear_memory_cache(self) -> None:
+        """Clear the in-memory cache."""
+        self._memory_cache.clear()
+        self._access_order.clear()
+
+    def get_cache_stats(self) -> dict[str, Any]:
+        """Get cache performance statistics.
+
+        Returns:
+            Dictionary with cache statistics
+        """
+        total_requests = self._cache_hits + self._cache_misses
+        hit_rate = self._cache_hits / total_requests if total_requests > 0 else 0.0
+        disk_files = (
+            len(list(self.cache_dir.glob("*.json"))) if self.cache_dir.exists() else 0
+        )
+
+        return {
+            "memory_cache_size": len(self._memory_cache),
+            "memory_cached": len(self._memory_cache),  # Alias for compatibility
+            "max_cache_size": self.max_size,
+            "memory_limit": self.max_size,  # Alias for compatibility
+            "cache_hits": self._cache_hits,
+            "cache_misses": self._cache_misses,
+            "hit_rate": round(hit_rate, 3),
+            "disk_cache_files": disk_files,
+            "disk_cached": disk_files,  # Alias for compatibility
+        }
+
+
+class CodeBERTEmbeddingFunction:
+    """ChromaDB-compatible embedding function using CodeBERT."""
+
+    def __init__(self, model_name: str = "microsoft/codebert-base") -> None:
+        """Initialize CodeBERT embedding function.
+
+        Args:
+            model_name: Name of the sentence transformer model
+        """
+        try:
+            self.model = SentenceTransformer(model_name)
+            self.model_name = model_name
+            logger.info(f"Loaded embedding model: {model_name}")
+        except Exception as e:
+            logger.error(f"Failed to load embedding model {model_name}: {e}")
+            raise EmbeddingError(f"Failed to load embedding model: {e}") from e
+
+    def __call__(self, input: list[str]) -> list[list[float]]:
+        """Generate embeddings for input texts (ChromaDB interface)."""
+        try:
+            embeddings = self.model.encode(input, convert_to_numpy=True)
+            return embeddings.tolist()
+        except Exception as e:
+            logger.error(f"Failed to generate embeddings: {e}")
+            raise EmbeddingError(f"Failed to generate embeddings: {e}") from e
+
+
+class BatchEmbeddingProcessor:
+    """Batch processing for efficient embedding generation with caching."""
+
+    def __init__(
+        self,
+        embedding_function: CodeBERTEmbeddingFunction,
+        cache: EmbeddingCache | None = None,
+        batch_size: int = 32,
+    ) -> None:
+        """Initialize batch embedding processor.
+
+        Args:
+            embedding_function: Function to generate embeddings
+            cache: Optional embedding cache
+            batch_size: Size of batches for processing
+        """
+        self.embedding_function = embedding_function
+        self.cache = cache
+        self.batch_size = batch_size
+
+    async def process_batch(self, contents: list[str]) -> list[list[float]]:
+        """Process a batch of content for embeddings.
+
+        Args:
+            contents: List of text content to embed
+
+        Returns:
+            List of embeddings
+        """
+        if not contents:
+            return []
+
+        embeddings = []
+        uncached_contents = []
+        uncached_indices = []
+
+        # Check cache for each content if cache is available
+        if self.cache:
+            for i, content in enumerate(contents):
+                cached_embedding = await self.cache.get_embedding(content)
+                if cached_embedding:
+                    embeddings.append(cached_embedding)
+                else:
+                    embeddings.append(None)  # Placeholder
+                    uncached_contents.append(content)
+                    uncached_indices.append(i)
+        else:
+            # No cache, process all content
+            uncached_contents = contents
+            uncached_indices = list(range(len(contents)))
+            embeddings = [None] * len(contents)
+
+        # Generate embeddings for uncached content
+        if uncached_contents:
+            logger.debug(f"Generating {len(uncached_contents)} new embeddings")
+
+            try:
+                new_embeddings = []
+                for i in range(0, len(uncached_contents), self.batch_size):
+                    batch = uncached_contents[i : i + self.batch_size]
+                    batch_embeddings = self.embedding_function(batch)
+                    new_embeddings.extend(batch_embeddings)
+
+                # Cache new embeddings and fill placeholders
+                for i, (content, embedding) in enumerate(
+                    zip(uncached_contents, new_embeddings, strict=False)
+                ):
+                    if self.cache:
+                        await self.cache.store_embedding(content, embedding)
+                    embeddings[uncached_indices[i]] = embedding
+
+            except Exception as e:
+                logger.error(f"Failed to generate embeddings: {e}")
+                raise EmbeddingError(f"Failed to generate embeddings: {e}") from e
+
+        return embeddings
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get processor statistics."""
+        stats = {
+            "model_name": self.embedding_function.model_name,
+            "batch_size": self.batch_size,
+            "cache_enabled": self.cache is not None,
+        }
+
+        if self.cache:
+            stats.update(self.cache.get_cache_stats())
+
+        return stats
+
+
+def create_embedding_function(
+    model_name: str = "microsoft/codebert-base",
+    cache_dir: Path | None = None,
+    cache_size: int = 1000,
+):
+    """Create embedding function and cache.
+
+    Args:
+        model_name: Name of the embedding model
+        cache_dir: Directory for caching embeddings
+        cache_size: Maximum cache size
+
+    Returns:
+        Tuple of (embedding_function, cache)
+    """
+    try:
+        # Use ChromaDB's built-in sentence transformer function
+        from chromadb.utils import embedding_functions
+
+        # Map our model names to sentence-transformers compatible names
+        model_mapping = {
+            "microsoft/codebert-base": "sentence-transformers/all-MiniLM-L6-v2",  # Fallback to working model
+            "microsoft/unixcoder-base": "sentence-transformers/all-MiniLM-L6-v2",  # Fallback to working model
+        }
+
+        actual_model = model_mapping.get(model_name, model_name)
+
+        embedding_function = embedding_functions.SentenceTransformerEmbeddingFunction(
+            model_name=actual_model
+        )
+
+        logger.debug(f"Created ChromaDB embedding function with model: {actual_model}")
+
+    except Exception as e:
+        logger.warning(f"Failed to create ChromaDB embedding function: {e}")
+        # Fallback to our custom implementation
+        embedding_function = CodeBERTEmbeddingFunction(model_name)
+
+    cache = None
+    if cache_dir:
+        cache = EmbeddingCache(cache_dir, cache_size)
+
+    return embedding_function, cache
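
A short sketch of how these pieces compose. The cache directory and code snippets are hypothetical, and note that create_embedding_function may return either ChromaDB's sentence-transformer function or the CodeBERT fallback defined above:

    import asyncio
    from pathlib import Path

    from mcp_vector_search.core.embeddings import (
        BatchEmbeddingProcessor,
        create_embedding_function,
    )

    async def main() -> None:
        # Hypothetical cache directory for persisted embeddings
        fn, cache = create_embedding_function(cache_dir=Path(".mcp-vector-search/embeddings"))
        processor = BatchEmbeddingProcessor(fn, cache=cache, batch_size=32)

        vectors = await processor.process_batch(["def add(a, b): ...", "class Foo: ..."])
        # A repeated snippet should now be served from the cache
        await processor.process_batch(["def add(a, b): ..."])
        print(len(vectors), cache.get_cache_stats()["hit_rate"])

    asyncio.run(main())

Because process_batch records uncached_indices, cached and freshly generated embeddings can be mixed within one batch while the results stay in input order.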
mcp_vector_search/core/exceptions.py
@@ -0,0 +1,89 @@
+"""Custom exception hierarchy for MCP Vector Search."""
+
+from typing import Any
+
+
+class MCPVectorSearchError(Exception):
+    """Base exception for MCP Vector Search."""
+
+    def __init__(self, message: str, context: dict[str, Any] | None = None) -> None:
+        super().__init__(message)
+        self.context = context or {}
+
+
+class DatabaseError(MCPVectorSearchError):
+    """Database-related errors."""
+
+    pass
+
+
+class DatabaseInitializationError(DatabaseError):
+    """Database initialization failed."""
+
+    pass
+
+
+class DatabaseNotInitializedError(DatabaseError):
+    """Operation attempted on uninitialized database."""
+
+    pass
+
+
+class ConnectionPoolError(DatabaseError):
+    """Connection pool operation failed."""
+
+    pass
+
+
+class DocumentAdditionError(DatabaseError):
+    """Failed to add documents to database."""
+
+    pass
+
+
+class SearchError(DatabaseError):
+    """Search operation failed."""
+
+    pass
+
+
+class IndexCorruptionError(DatabaseError):
+    """Index corruption detected."""
+
+    pass
+
+
+class ParsingError(MCPVectorSearchError):
+    """Code parsing errors."""
+
+    pass
+
+
+class EmbeddingError(MCPVectorSearchError):
+    """Embedding generation errors."""
+
+    pass
+
+
+class ConfigurationError(MCPVectorSearchError):
+    """Configuration validation errors."""
+
+    pass
+
+
+class ProjectError(MCPVectorSearchError):
+    """Project management errors."""
+
+    pass
+
+
+class ProjectNotFoundError(ProjectError):
+    """Project directory or configuration not found."""
+
+    pass
+
+
+class ProjectInitializationError(ProjectError):
+    """Failed to initialize project."""
+
+    pass
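
Because every error derives from MCPVectorSearchError, callers can catch narrowly or broadly and still read the optional context dict. A hedged sketch (search_codebase is a hypothetical caller-side function):

    from mcp_vector_search.core.exceptions import MCPVectorSearchError, SearchError

    def search_codebase(query: str) -> None:
        try:
            ...  # some operation that may raise SearchError
        except SearchError as e:
            # SearchError -> DatabaseError -> MCPVectorSearchError
            print(f"search failed: {e} (context: {e.context})")
        except MCPVectorSearchError as e:
            # Any other package error, with its diagnostic context
            print(f"vector-search error: {e} (context: {e.context})")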