mcp-code-indexer 4.0.2-py3-none-any.whl → 4.2.0-py3-none-any.whl
This diff shows the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- mcp_code_indexer/database/models.py +125 -1
- mcp_code_indexer/main.py +60 -0
- mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
- mcp_code_indexer/server/mcp_server.py +3 -0
- mcp_code_indexer/vector_mode/__init__.py +36 -0
- mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
- mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
- mcp_code_indexer/vector_mode/config.py +167 -0
- mcp_code_indexer/vector_mode/daemon.py +335 -0
- mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
- mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
- mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
- mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
- mcp_code_indexer/vector_mode/security/__init__.py +11 -0
- mcp_code_indexer/vector_mode/security/patterns.py +297 -0
- mcp_code_indexer/vector_mode/security/redactor.py +368 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
mcp_code_indexer/vector_mode/monitoring/merkle_tree.py (new file)
@@ -0,0 +1,418 @@
+"""
+Merkle tree implementation for efficient change detection.
+
+Provides a hierarchical hash tree for detecting file system changes
+without scanning entire directory structures.
+"""
+
+import hashlib
+import logging
+from pathlib import Path
+from typing import Dict, List, Optional, Set, Tuple
+from dataclasses import dataclass
+from datetime import datetime
+import json
+import os
+
+from ...database.models import MerkleNode as MerkleNodeModel, NodeType
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class TreeStats:
+    """Statistics about the Merkle tree."""
+    total_nodes: int = 0
+    file_nodes: int = 0
+    directory_nodes: int = 0
+    max_depth: int = 0
+    total_size: int = 0
+    last_updated: Optional[datetime] = None
+
+class MerkleNode:
+    """
+    Node in the Merkle tree representing a file or directory.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        node_type: NodeType,
+        hash_value: Optional[str] = None,
+        parent: Optional["MerkleNode"] = None,
+    ):
+        """
+        Initialize a Merkle tree node.
+
+        Args:
+            path: Relative path from project root
+            node_type: Type of node (file or directory)
+            hash_value: Hash value for the node
+            parent: Parent node
+        """
+        self.path = path
+        self.node_type = node_type
+        self.hash_value = hash_value
+        self.parent = parent
+        self.children: Dict[str, "MerkleNode"] = {}
+        self.last_modified = datetime.utcnow()
+        self.size: Optional[int] = None
+        self.metadata: Dict[str, any] = {}
+
+    def add_child(self, name: str, child: "MerkleNode") -> None:
+        """Add a child node."""
+        child.parent = self
+        self.children[name] = child
+
+    def remove_child(self, name: str) -> Optional["MerkleNode"]:
+        """Remove and return a child node."""
+        return self.children.pop(name, None)
+
+    def get_child(self, name: str) -> Optional["MerkleNode"]:
+        """Get a child node by name."""
+        return self.children.get(name)
+
+    def compute_hash(self, project_root: Path) -> str:
+        """Compute hash for this node."""
+        if self.node_type == NodeType.FILE:
+            return self._compute_file_hash(project_root)
+        else:
+            return self._compute_directory_hash()
+
+    def _compute_file_hash(self, project_root: Path) -> str:
+        """Compute hash for a file node."""
+        file_path = project_root / self.path
+
+        try:
+            if not file_path.exists():
+                return "deleted"
+
+            # Use file modification time and size for quick comparison
+            stat = file_path.stat()
+            self.size = stat.st_size
+
+            # For small files, use content hash
+            if stat.st_size < 1024 * 1024:  # 1MB
+                with open(file_path, 'rb') as f:
+                    content = f.read()
+                return hashlib.sha256(content).hexdigest()
+            else:
+                # For large files, use metadata hash
+                metadata = f"{stat.st_size}:{stat.st_mtime}"
+                return hashlib.sha256(metadata.encode()).hexdigest()
+
+        except (OSError, PermissionError) as e:
+            logger.warning(f"Could not hash file {file_path}: {e}")
+            return "error"
+
+    def _compute_directory_hash(self) -> str:
+        """Compute hash for a directory node based on children."""
+        if not self.children:
+            return hashlib.sha256(b"empty").hexdigest()
+
+        # Sort children by name for consistent hashing
+        child_hashes = []
+        for name in sorted(self.children.keys()):
+            child = self.children[name]
+            child_hash = child.hash_value or ""
+            combined = f"{name}:{child_hash}"
+            child_hashes.append(combined)
+
+        combined_hash = "|".join(child_hashes)
+        return hashlib.sha256(combined_hash.encode()).hexdigest()
+
+    def update_hash(self, project_root: Path) -> bool:
+        """Update hash and return True if it changed."""
+        old_hash = self.hash_value
+        new_hash = self.compute_hash(project_root)
+
+        if old_hash != new_hash:
+            self.hash_value = new_hash
+            self.last_modified = datetime.utcnow()
+            return True
+
+        return False
+
+    def get_depth(self) -> int:
+        """Get depth of this node in the tree."""
+        if self.parent is None:
+            return 0
+        return self.parent.get_depth() + 1
+
+    def is_leaf(self) -> bool:
+        """Check if this is a leaf node."""
+        return len(self.children) == 0
+
+    def to_dict(self) -> Dict:
+        """Convert node to dictionary representation."""
+        return {
+            "path": self.path,
+            "node_type": self.node_type.value,
+            "hash_value": self.hash_value,
+            "last_modified": self.last_modified.isoformat(),
+            "size": self.size,
+            "children": list(self.children.keys()),
+            "metadata": self.metadata,
+        }
+
+class MerkleTree:
+    """
+    Merkle tree for efficient file system change detection.
+
+    Maintains a hierarchical hash tree of the project structure to quickly
+    identify changes without scanning all files.
+    """
+
+    def __init__(self, project_root: Path, project_id: str):
+        """
+        Initialize Merkle tree.
+
+        Args:
+            project_root: Root directory path
+            project_id: Project identifier
+        """
+        self.project_root = Path(project_root).resolve()
+        self.project_id = project_id
+        self.root: Optional[MerkleNode] = None
+        self.node_map: Dict[str, MerkleNode] = {}  # path -> node mapping
+
+        # Statistics
+        self.stats = TreeStats()
+
+        # Change tracking
+        self.changed_nodes: Set[str] = set()
+        self.last_scan_time: Optional[datetime] = None
+
+    def build_tree(self, ignore_patterns: Optional[List[str]] = None) -> None:
+        """Build the complete Merkle tree by scanning the file system."""
+        logger.info(f"Building Merkle tree for {self.project_root}")
+
+        ignore_patterns = ignore_patterns or [
+            "*.log", "*.tmp", "*~", ".git", "__pycache__",
+            "node_modules", "*.pyc", "*.pyo", ".DS_Store", "Thumbs.db"
+        ]
+
+        # Create root node
+        self.root = MerkleNode("", NodeType.DIRECTORY)
+        self.node_map[""] = self.root
+
+        # Recursively build tree
+        self._build_tree_recursive(self.project_root, self.root, ignore_patterns)
+
+        # Compute hashes bottom-up
+        self._compute_hashes_recursive(self.root)
+
+        # Update statistics
+        self._update_stats()
+        self.last_scan_time = datetime.utcnow()
+
+        logger.info(
+            f"Built Merkle tree: {self.stats.total_nodes} nodes "
+            f"({self.stats.file_nodes} files, {self.stats.directory_nodes} directories)"
+        )
+
+    def _build_tree_recursive(
+        self,
+        current_path: Path,
+        current_node: MerkleNode,
+        ignore_patterns: List[str]
+    ) -> None:
+        """Recursively build tree structure."""
+        try:
+            if not current_path.is_dir():
+                return
+
+            for item in current_path.iterdir():
+                # Check if should ignore
+                if self._should_ignore(item, ignore_patterns):
+                    continue
+
+                # Get relative path
+                try:
+                    relative_path = str(item.relative_to(self.project_root))
+                except ValueError:
+                    continue
+
+                # Create node
+                if item.is_file():
+                    node = MerkleNode(relative_path, NodeType.FILE)
+                    current_node.add_child(item.name, node)
+                    self.node_map[relative_path] = node
+
+                elif item.is_dir():
+                    node = MerkleNode(relative_path, NodeType.DIRECTORY)
+                    current_node.add_child(item.name, node)
+                    self.node_map[relative_path] = node
+
+                    # Recurse into directory
+                    self._build_tree_recursive(item, node, ignore_patterns)
+
+        except (OSError, PermissionError) as e:
+            logger.warning(f"Could not scan directory {current_path}: {e}")
+
+    def _should_ignore(self, path: Path, ignore_patterns: List[str]) -> bool:
+        """Check if path should be ignored."""
+        import fnmatch
+
+        path_str = path.name
+        for pattern in ignore_patterns:
+            if fnmatch.fnmatch(path_str, pattern):
+                return True
+
+        return False
+
+    def _compute_hashes_recursive(self, node: MerkleNode) -> None:
+        """Compute hashes recursively (bottom-up)."""
+        # First compute hashes for all children
+        for child in node.children.values():
+            self._compute_hashes_recursive(child)
+
+        # Then compute hash for this node
+        node.update_hash(self.project_root)
+
+    def _update_stats(self) -> None:
+        """Update tree statistics."""
+        self.stats = TreeStats()
+        self.stats.last_updated = datetime.utcnow()
+
+        if self.root:
+            self._update_stats_recursive(self.root, 0)
+
+    def _update_stats_recursive(self, node: MerkleNode, depth: int) -> None:
+        """Recursively update statistics."""
+        self.stats.total_nodes += 1
+        self.stats.max_depth = max(self.stats.max_depth, depth)
+
+        if node.node_type == NodeType.FILE:
+            self.stats.file_nodes += 1
+            if node.size:
+                self.stats.total_size += node.size
+        else:
+            self.stats.directory_nodes += 1
+
+        for child in node.children.values():
+            self._update_stats_recursive(child, depth + 1)
+
+    def update_file(self, relative_path: str) -> bool:
+        """
+        Update a file in the tree and return True if hash changed.
+
+        Args:
+            relative_path: Path relative to project root
+
+        Returns:
+            True if the file's hash changed
+        """
+        node = self.node_map.get(relative_path)
+        if not node:
+            # File might be new, rebuild tree
+            logger.info(f"File {relative_path} not in tree, triggering rebuild")
+            self.build_tree()
+            return True
+
+        # Update file hash
+        changed = node.update_hash(self.project_root)
+
+        if changed:
+            self.changed_nodes.add(relative_path)
+
+            # Propagate hash changes up the tree
+            self._propagate_hash_changes(node.parent)
+
+        return changed
+
+    def _propagate_hash_changes(self, node: Optional[MerkleNode]) -> None:
+        """Propagate hash changes up the tree."""
+        if not node:
+            return
+
+        old_hash = node.hash_value
+        node.update_hash(self.project_root)
+
+        if old_hash != node.hash_value:
+            self.changed_nodes.add(node.path)
+            self._propagate_hash_changes(node.parent)
+
+    def get_changed_files(self, since: Optional[datetime] = None) -> List[str]:
+        """Get list of files that changed since timestamp."""
+        if since is None:
+            return list(self.changed_nodes)
+
+        changed_files = []
+        for path in self.changed_nodes:
+            node = self.node_map.get(path)
+            if node and node.last_modified >= since:
+                changed_files.append(path)
+
+        return changed_files
+
+    def verify_tree(self) -> Tuple[bool, List[str]]:
+        """
+        Verify tree integrity by recomputing hashes.
+
+        Returns:
+            Tuple of (is_valid, list_of_errors)
+        """
+        errors = []
+
+        if not self.root:
+            return False, ["No root node"]
+
+        # Recompute all hashes and compare
+        for path, node in self.node_map.items():
+            expected_hash = node.compute_hash(self.project_root)
+            if node.hash_value != expected_hash:
+                errors.append(f"Hash mismatch for {path}: {node.hash_value} != {expected_hash}")
+
+        return len(errors) == 0, errors
+
+    def get_subtree_hash(self, relative_path: str) -> Optional[str]:
+        """Get hash for a subtree rooted at the given path."""
+        node = self.node_map.get(relative_path)
+        return node.hash_value if node else None
+
+    def export_to_database_models(self) -> List[MerkleNodeModel]:
+        """Export tree to database models for persistence."""
+        models = []
+
+        for path, node in self.node_map.items():
+            # Determine parent path
+            parent_path = None
+            if node.parent and node.parent.path:
+                parent_path = node.parent.path
+
+            model = MerkleNodeModel(
+                project_id=self.project_id,
+                path=path,
+                hash=node.hash_value or "",
+                node_type=node.node_type,
+                parent_path=parent_path,
+                children_hash=node._compute_directory_hash() if node.node_type == NodeType.DIRECTORY else None,
+                last_modified=node.last_modified,
+            )
+            models.append(model)
+
+        return models
+
+    def clear_changed_nodes(self) -> int:
+        """Clear the changed nodes set and return count."""
+        count = len(self.changed_nodes)
+        self.changed_nodes.clear()
+        return count
+
+    def get_tree_summary(self) -> Dict:
+        """Get a summary of the tree structure."""
+        return {
+            "project_id": self.project_id,
+            "project_root": str(self.project_root),
+            "stats": {
+                "total_nodes": self.stats.total_nodes,
+                "file_nodes": self.stats.file_nodes,
+                "directory_nodes": self.stats.directory_nodes,
+                "max_depth": self.stats.max_depth,
+                "total_size": self.stats.total_size,
+                "last_updated": self.stats.last_updated.isoformat() if self.stats.last_updated else None,
+            },
+            "root_hash": self.root.hash_value if self.root else None,
+            "last_scan": self.last_scan_time.isoformat() if self.last_scan_time else None,
+            "changed_nodes": len(self.changed_nodes),
+        }
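For orientation, here is a minimal usage sketch of the MerkleTree class added above. It relies only on methods shown in this hunk; the project path and project ID are placeholder values, not anything defined by the package.

from pathlib import Path

from mcp_code_indexer.vector_mode.monitoring.merkle_tree import MerkleTree

# Placeholder project location and ID (illustrative only).
tree = MerkleTree(Path("/path/to/project"), project_id="example-project")
tree.build_tree()  # full scan; hashes are computed bottom-up

# After editing a file, re-hash just that node and let the change
# propagate up through its parent directories.
if tree.update_file("src/app.py"):
    changed = tree.get_changed_files()  # changed file plus ancestor directory paths
    print(changed)
    tree.clear_changed_nodes()

print(tree.get_tree_summary()["root_hash"])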
mcp_code_indexer/vector_mode/providers/__init__.py (new file)
@@ -0,0 +1,17 @@
+"""
+External service providers for vector mode.
+
+This package provides clean integrations with external services using official SDKs:
+- Voyage AI for embedding generation (voyageai SDK)
+- Turbopuffer for vector storage and search (turbopuffer SDK)
+"""
+
+from .voyage_client import VoyageClient, create_voyage_client
+from .turbopuffer_client import TurbopufferClient, create_turbopuffer_client
+
+__all__ = [
+    'VoyageClient',
+    'create_voyage_client',
+    'TurbopufferClient',
+    'create_turbopuffer_client',
+]
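A short, hedged sketch of how these exports might be consumed. The VectorConfig import path follows the config.py module in the file list, and constructing it with no arguments is an assumption; only create_turbopuffer_client's signature is visible in this diff (next hunk), so the Voyage factory is merely imported here.

from mcp_code_indexer.vector_mode.config import VectorConfig  # import path assumed from the file list
from mcp_code_indexer.vector_mode.providers import (
    create_turbopuffer_client,
    create_voyage_client,  # signature not shown in this diff
)

config = VectorConfig()  # assumed: a no-argument constructor with sensible defaults
tpuf = create_turbopuffer_client(config)  # raises ValueError if turbopuffer_api_key is unset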
mcp_code_indexer/vector_mode/providers/turbopuffer_client.py (new file)
@@ -0,0 +1,217 @@
+"""
+Turbopuffer client for vector storage and search using official SDK.
+
+Provides clean integration with Turbopuffer's vector database for storing
+embeddings and performing similarity searches. Supports configurable
+regions for optimal latency and data residency compliance.
+
+Default region: gcp-europe-west3 (Frankfurt)
+Configure via TURBOPUFFER_REGION environment variable.
+"""
+
+import logging
+import uuid
+from typing import List, Dict, Any, Optional
+import turbopuffer
+
+from ..config import VectorConfig
+
+logger = logging.getLogger(__name__)
+
+class TurbopufferClient:
+    """Clean Turbopuffer client using official SDK."""
+
+    def __init__(self, api_key: str, region: str = "gcp-europe-west3"):
+        self.api_key = api_key
+        self.region = region
+
+        # Initialize official TurboPuffer client
+        self.client = turbopuffer.Turbopuffer(
+            api_key=api_key,
+            region=region
+        )
+        logger.info(f"Initialized TurboPuffer client with region {region}")
+
+    def health_check(self) -> bool:
+        """Check if Turbopuffer service is healthy."""
+        try:
+            namespaces = self.client.namespaces()
+            return True
+        except Exception as e:
+            logger.warning(f"Turbopuffer health check failed: {e}")
+            return False
+
+    def generate_vector_id(self, project_id: str, chunk_id: int) -> str:
+        """Generate a unique vector ID."""
+        return f"{project_id}_{chunk_id}_{uuid.uuid4().hex[:8]}"
+
+    def upsert_vectors(
+        self,
+        vectors: List[Dict[str, Any]],
+        namespace: str,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Store or update vectors in the database."""
+        if not vectors:
+            return {"upserted": 0}
+
+        logger.info(f"Upserting {len(vectors)} vectors to namespace '{namespace}'")
+
+        # Format vectors for Turbopuffer SDK
+        formatted_vectors = []
+        for vector in vectors:
+            if "id" not in vector or "values" not in vector:
+                raise ValueError("Each vector must have 'id' and 'values' fields")
+
+            formatted_vector = {
+                "id": str(vector["id"]),
+                "vector": vector["values"],
+                "attributes": vector.get("metadata", {}),
+            }
+            formatted_vectors.append(formatted_vector)
+
+        try:
+            ns = self.client.namespace(namespace)
+            ns.upsert(vectors=formatted_vectors)
+
+            logger.info(f"Successfully upserted {len(vectors)} vectors")
+            return {"upserted": len(vectors)}
+
+        except Exception as e:
+            logger.error(f"Failed to upsert vectors: {e}")
+            raise RuntimeError(f"Vector upsert failed: {e}")
+
+    def search_vectors(
+        self,
+        query_vector: List[float],
+        top_k: int = 10,
+        namespace: str = "default",
+        filters: Optional[Dict[str, Any]] = None,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        """Search for similar vectors."""
+        logger.debug(f"Searching {top_k} vectors in namespace '{namespace}'")
+
+        try:
+            ns = self.client.namespace(namespace)
+
+            results = ns.query(
+                rank_by=[("vector", "ANN", query_vector)],
+                top_k=top_k,
+                filters=filters,
+                include_attributes=True
+            )
+
+            logger.debug(f"Found {len(results)} similar vectors")
+            return results
+
+        except Exception as e:
+            logger.error(f"Vector search failed: {e}")
+            raise RuntimeError(f"Vector search failed: {e}")
+
+    def delete_vectors(
+        self,
+        vector_ids: List[str],
+        namespace: str,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Delete vectors by ID."""
+        if not vector_ids:
+            return {"deleted": 0}
+
+        logger.info(f"Deleting {len(vector_ids)} vectors from namespace '{namespace}'")
+
+        try:
+            ns = self.client.namespace(namespace)
+            ns.delete(ids=vector_ids)
+
+            logger.info(f"Successfully deleted vectors")
+            return {"deleted": len(vector_ids)}
+
+        except Exception as e:
+            logger.error(f"Failed to delete vectors: {e}")
+            raise RuntimeError(f"Vector deletion failed: {e}")
+
+    def list_namespaces(self) -> List[str]:
+        """List all available namespaces."""
+        try:
+            namespaces = self.client.namespaces()
+            return [ns.name for ns in namespaces]
+
+        except Exception as e:
+            logger.error(f"Failed to list namespaces: {e}")
+            raise RuntimeError(f"Namespace listing failed: {e}")
+
+    def create_namespace(self, namespace: str, dimension: int, **kwargs) -> Dict[str, Any]:
+        """Create a new namespace."""
+        logger.info(f"Creating namespace '{namespace}' with dimension {dimension}")
+
+        try:
+            self.client.create_namespace(
+                name=namespace,
+                dimension=dimension
+            )
+
+            logger.info(f"Successfully created namespace '{namespace}'")
+            return {"name": namespace, "dimension": dimension}
+
+        except Exception as e:
+            logger.error(f"Failed to create namespace: {e}")
+            raise RuntimeError(f"Namespace creation failed: {e}")
+
+    def delete_namespace(self, namespace: str) -> Dict[str, Any]:
+        """Delete a namespace and all its vectors."""
+        logger.warning(f"Deleting namespace '{namespace}' and all its vectors")
+
+        try:
+            self.client.delete_namespace(namespace)
+
+            logger.info(f"Successfully deleted namespace '{namespace}'")
+            return {"deleted": namespace}
+
+        except Exception as e:
+            logger.error(f"Failed to delete namespace: {e}")
+            raise RuntimeError(f"Namespace deletion failed: {e}")
+
+    def get_namespace_for_project(self, project_id: str) -> str:
+        """Get the namespace name for a project."""
+        # Use project ID as namespace, with prefix for safety
+        safe_project_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in project_id)
+        return f"mcp_code_{safe_project_id}".lower()
+
+    def search_with_metadata_filter(
+        self,
+        query_vector: List[float],
+        project_id: str,
+        chunk_type: Optional[str] = None,
+        file_path: Optional[str] = None,
+        top_k: int = 10,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        """Search vectors with metadata filtering."""
+        namespace = self.get_namespace_for_project(project_id)
+
+        # Build metadata filters
+        filters = {"project_id": project_id}
+        if chunk_type:
+            filters["chunk_type"] = chunk_type
+        if file_path:
+            filters["file_path"] = file_path
+
+        return self.search_vectors(
+            query_vector=query_vector,
+            top_k=top_k,
+            namespace=namespace,
+            filters=filters,
+            **kwargs
+        )
+
+def create_turbopuffer_client(config: VectorConfig) -> TurbopufferClient:
+    """Create a Turbopuffer client from configuration."""
+    if not config.turbopuffer_api_key:
+        raise ValueError("TURBOPUFFER_API_KEY is required for vector storage")
+
+    return TurbopufferClient(
+        api_key=config.turbopuffer_api_key,
+        region=config.turbopuffer_region,
+    )
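Finally, a minimal, hedged sketch of driving the client above directly. The API key, project ID, and the 4-dimensional embedding are placeholders; the methods and keyword arguments are the ones defined in this hunk, and real embeddings would come from the Voyage AI client.

from mcp_code_indexer.vector_mode.providers.turbopuffer_client import TurbopufferClient

client = TurbopufferClient(api_key="tpuf_...", region="gcp-europe-west3")  # placeholder key
namespace = client.get_namespace_for_project("example-project")

# Each vector needs 'id' and 'values'; 'metadata' is stored as Turbopuffer attributes.
client.upsert_vectors(
    vectors=[{
        "id": client.generate_vector_id("example-project", chunk_id=1),
        "values": [0.1, 0.2, 0.3, 0.4],  # placeholder embedding
        "metadata": {"project_id": "example-project", "chunk_type": "function"},
    }],
    namespace=namespace,
)

# Filtered similarity search scoped to the project's namespace.
hits = client.search_with_metadata_filter(
    query_vector=[0.1, 0.2, 0.3, 0.4],  # placeholder query embedding
    project_id="example-project",
    chunk_type="function",
    top_k=5,
)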