mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/models.py +125 -1
  2. mcp_code_indexer/main.py +60 -0
  3. mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
  4. mcp_code_indexer/server/mcp_server.py +3 -0
  5. mcp_code_indexer/vector_mode/__init__.py +36 -0
  6. mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
  9. mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
  10. mcp_code_indexer/vector_mode/config.py +167 -0
  11. mcp_code_indexer/vector_mode/daemon.py +335 -0
  12. mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
  13. mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
  14. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
  15. mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
  16. mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
  17. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
  18. mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
  19. mcp_code_indexer/vector_mode/security/__init__.py +11 -0
  20. mcp_code_indexer/vector_mode/security/patterns.py +297 -0
  21. mcp_code_indexer/vector_mode/security/redactor.py +368 -0
  22. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
  23. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
  24. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
  25. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
  26. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,418 @@
+"""
+Merkle tree implementation for efficient change detection.
+
+Provides a hierarchical hash tree for detecting file system changes
+without scanning entire directory structures.
+"""
+
+import hashlib
+import logging
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Tuple
+from dataclasses import dataclass
+from datetime import datetime
+import json
+import os
+
+from ...database.models import MerkleNode as MerkleNodeModel, NodeType
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class TreeStats:
+    """Statistics about the Merkle tree."""
+    total_nodes: int = 0
+    file_nodes: int = 0
+    directory_nodes: int = 0
+    max_depth: int = 0
+    total_size: int = 0
+    last_updated: Optional[datetime] = None
+
+class MerkleNode:
+    """
+    Node in the Merkle tree representing a file or directory.
+    """
+
+    def __init__(
+        self,
+        path: str,
+        node_type: NodeType,
+        hash_value: Optional[str] = None,
+        parent: Optional["MerkleNode"] = None,
+    ):
+        """
+        Initialize a Merkle tree node.
+
+        Args:
+            path: Relative path from project root
+            node_type: Type of node (file or directory)
+            hash_value: Hash value for the node
+            parent: Parent node
+        """
+        self.path = path
+        self.node_type = node_type
+        self.hash_value = hash_value
+        self.parent = parent
+        self.children: Dict[str, "MerkleNode"] = {}
+        self.last_modified = datetime.utcnow()
+        self.size: Optional[int] = None
+        self.metadata: Dict[str, Any] = {}
+
+    def add_child(self, name: str, child: "MerkleNode") -> None:
+        """Add a child node."""
+        child.parent = self
+        self.children[name] = child
+
+    def remove_child(self, name: str) -> Optional["MerkleNode"]:
+        """Remove and return a child node."""
+        return self.children.pop(name, None)
+
+    def get_child(self, name: str) -> Optional["MerkleNode"]:
+        """Get a child node by name."""
+        return self.children.get(name)
+
+    def compute_hash(self, project_root: Path) -> str:
+        """Compute hash for this node."""
+        if self.node_type == NodeType.FILE:
+            return self._compute_file_hash(project_root)
+        else:
+            return self._compute_directory_hash()
+
+    def _compute_file_hash(self, project_root: Path) -> str:
+        """Compute hash for a file node."""
+        file_path = project_root / self.path
+
+        try:
+            if not file_path.exists():
+                return "deleted"
+
+            # Use file modification time and size for quick comparison
+            stat = file_path.stat()
+            self.size = stat.st_size
+
+            # For small files, use content hash
+            if stat.st_size < 1024 * 1024:  # 1MB
+                with open(file_path, 'rb') as f:
+                    content = f.read()
+                return hashlib.sha256(content).hexdigest()
+            else:
+                # For large files, use metadata hash
+                metadata = f"{stat.st_size}:{stat.st_mtime}"
+                return hashlib.sha256(metadata.encode()).hexdigest()
+
+        except (OSError, PermissionError) as e:
+            logger.warning(f"Could not hash file {file_path}: {e}")
+            return "error"
+
+    def _compute_directory_hash(self) -> str:
+        """Compute hash for a directory node based on children."""
+        if not self.children:
+            return hashlib.sha256(b"empty").hexdigest()
+
+        # Sort children by name for consistent hashing
+        child_hashes = []
+        for name in sorted(self.children.keys()):
+            child = self.children[name]
+            child_hash = child.hash_value or ""
+            combined = f"{name}:{child_hash}"
+            child_hashes.append(combined)
+
+        combined_hash = "|".join(child_hashes)
+        return hashlib.sha256(combined_hash.encode()).hexdigest()
+
+    def update_hash(self, project_root: Path) -> bool:
+        """Update hash and return True if it changed."""
+        old_hash = self.hash_value
+        new_hash = self.compute_hash(project_root)
+
+        if old_hash != new_hash:
+            self.hash_value = new_hash
+            self.last_modified = datetime.utcnow()
+            return True
+
+        return False
+
+    def get_depth(self) -> int:
+        """Get depth of this node in the tree."""
+        if self.parent is None:
+            return 0
+        return self.parent.get_depth() + 1
+
+    def is_leaf(self) -> bool:
+        """Check if this is a leaf node."""
+        return len(self.children) == 0
+
+    def to_dict(self) -> Dict:
+        """Convert node to dictionary representation."""
+        return {
+            "path": self.path,
+            "node_type": self.node_type.value,
+            "hash_value": self.hash_value,
+            "last_modified": self.last_modified.isoformat(),
+            "size": self.size,
+            "children": list(self.children.keys()),
+            "metadata": self.metadata,
+        }
+
+class MerkleTree:
+    """
+    Merkle tree for efficient file system change detection.
+
+    Maintains a hierarchical hash tree of the project structure to quickly
+    identify changes without scanning all files.
+    """
+
+    def __init__(self, project_root: Path, project_id: str):
+        """
+        Initialize Merkle tree.
+
+        Args:
+            project_root: Root directory path
+            project_id: Project identifier
+        """
+        self.project_root = Path(project_root).resolve()
+        self.project_id = project_id
+        self.root: Optional[MerkleNode] = None
+        self.node_map: Dict[str, MerkleNode] = {}  # path -> node mapping
+
+        # Statistics
+        self.stats = TreeStats()
+
+        # Change tracking
+        self.changed_nodes: Set[str] = set()
+        self.last_scan_time: Optional[datetime] = None
+
+    def build_tree(self, ignore_patterns: Optional[List[str]] = None) -> None:
+        """Build the complete Merkle tree by scanning the file system."""
+        logger.info(f"Building Merkle tree for {self.project_root}")
+
+        ignore_patterns = ignore_patterns or [
+            "*.log", "*.tmp", "*~", ".git", "__pycache__",
+            "node_modules", "*.pyc", "*.pyo", ".DS_Store", "Thumbs.db"
+        ]
+
+        # Create root node
+        self.root = MerkleNode("", NodeType.DIRECTORY)
+        self.node_map[""] = self.root
+
+        # Recursively build tree
+        self._build_tree_recursive(self.project_root, self.root, ignore_patterns)
+
+        # Compute hashes bottom-up
+        self._compute_hashes_recursive(self.root)
+
+        # Update statistics
+        self._update_stats()
+        self.last_scan_time = datetime.utcnow()
+
+        logger.info(
+            f"Built Merkle tree: {self.stats.total_nodes} nodes "
+            f"({self.stats.file_nodes} files, {self.stats.directory_nodes} directories)"
+        )
+
+    def _build_tree_recursive(
+        self,
+        current_path: Path,
+        current_node: MerkleNode,
+        ignore_patterns: List[str]
+    ) -> None:
+        """Recursively build tree structure."""
+        try:
+            if not current_path.is_dir():
+                return
+
+            for item in current_path.iterdir():
+                # Check if should ignore
+                if self._should_ignore(item, ignore_patterns):
+                    continue
+
+                # Get relative path
+                try:
+                    relative_path = str(item.relative_to(self.project_root))
+                except ValueError:
+                    continue
+
+                # Create node
+                if item.is_file():
+                    node = MerkleNode(relative_path, NodeType.FILE)
+                    current_node.add_child(item.name, node)
+                    self.node_map[relative_path] = node
+
+                elif item.is_dir():
+                    node = MerkleNode(relative_path, NodeType.DIRECTORY)
+                    current_node.add_child(item.name, node)
+                    self.node_map[relative_path] = node
+
+                    # Recurse into directory
+                    self._build_tree_recursive(item, node, ignore_patterns)
+
+        except (OSError, PermissionError) as e:
+            logger.warning(f"Could not scan directory {current_path}: {e}")
+
+    def _should_ignore(self, path: Path, ignore_patterns: List[str]) -> bool:
+        """Check if path should be ignored."""
+        import fnmatch
+
+        path_str = path.name
+        for pattern in ignore_patterns:
+            if fnmatch.fnmatch(path_str, pattern):
+                return True
+
+        return False
+
+    def _compute_hashes_recursive(self, node: MerkleNode) -> None:
+        """Compute hashes recursively (bottom-up)."""
+        # First compute hashes for all children
+        for child in node.children.values():
+            self._compute_hashes_recursive(child)
+
+        # Then compute hash for this node
+        node.update_hash(self.project_root)
+
+    def _update_stats(self) -> None:
+        """Update tree statistics."""
+        self.stats = TreeStats()
+        self.stats.last_updated = datetime.utcnow()
+
+        if self.root:
+            self._update_stats_recursive(self.root, 0)
+
+    def _update_stats_recursive(self, node: MerkleNode, depth: int) -> None:
+        """Recursively update statistics."""
+        self.stats.total_nodes += 1
+        self.stats.max_depth = max(self.stats.max_depth, depth)
+
+        if node.node_type == NodeType.FILE:
+            self.stats.file_nodes += 1
+            if node.size:
+                self.stats.total_size += node.size
+        else:
+            self.stats.directory_nodes += 1
+
+        for child in node.children.values():
+            self._update_stats_recursive(child, depth + 1)
+
+    def update_file(self, relative_path: str) -> bool:
+        """
+        Update a file in the tree and return True if hash changed.
+
+        Args:
+            relative_path: Path relative to project root
+
+        Returns:
+            True if the file's hash changed
+        """
+        node = self.node_map.get(relative_path)
+        if not node:
+            # File might be new, rebuild tree
+            logger.info(f"File {relative_path} not in tree, triggering rebuild")
+            self.build_tree()
+            return True
+
+        # Update file hash
+        changed = node.update_hash(self.project_root)
+
+        if changed:
+            self.changed_nodes.add(relative_path)
+
+            # Propagate hash changes up the tree
+            self._propagate_hash_changes(node.parent)
+
+        return changed
+
+    def _propagate_hash_changes(self, node: Optional[MerkleNode]) -> None:
+        """Propagate hash changes up the tree."""
+        if not node:
+            return
+
+        old_hash = node.hash_value
+        node.update_hash(self.project_root)
+
+        if old_hash != node.hash_value:
+            self.changed_nodes.add(node.path)
+            self._propagate_hash_changes(node.parent)
+
+    def get_changed_files(self, since: Optional[datetime] = None) -> List[str]:
+        """Get list of files that changed since timestamp."""
+        if since is None:
+            return list(self.changed_nodes)
+
+        changed_files = []
+        for path in self.changed_nodes:
+            node = self.node_map.get(path)
+            if node and node.last_modified >= since:
+                changed_files.append(path)
+
+        return changed_files
+
+    def verify_tree(self) -> Tuple[bool, List[str]]:
+        """
+        Verify tree integrity by recomputing hashes.
+
+        Returns:
+            Tuple of (is_valid, list_of_errors)
+        """
+        errors = []
+
+        if not self.root:
+            return False, ["No root node"]
+
+        # Recompute all hashes and compare
+        for path, node in self.node_map.items():
+            expected_hash = node.compute_hash(self.project_root)
+            if node.hash_value != expected_hash:
+                errors.append(f"Hash mismatch for {path}: {node.hash_value} != {expected_hash}")
+
+        return len(errors) == 0, errors
+
+    def get_subtree_hash(self, relative_path: str) -> Optional[str]:
+        """Get hash for a subtree rooted at the given path."""
+        node = self.node_map.get(relative_path)
+        return node.hash_value if node else None
+
+    def export_to_database_models(self) -> List[MerkleNodeModel]:
+        """Export tree to database models for persistence."""
+        models = []
+
+        for path, node in self.node_map.items():
+            # Determine parent path
+            parent_path = None
+            if node.parent and node.parent.path:
+                parent_path = node.parent.path
+
+            model = MerkleNodeModel(
+                project_id=self.project_id,
+                path=path,
+                hash=node.hash_value or "",
+                node_type=node.node_type,
+                parent_path=parent_path,
+                children_hash=node._compute_directory_hash() if node.node_type == NodeType.DIRECTORY else None,
+                last_modified=node.last_modified,
+            )
+            models.append(model)
+
+        return models
+
+    def clear_changed_nodes(self) -> int:
+        """Clear the changed nodes set and return count."""
+        count = len(self.changed_nodes)
+        self.changed_nodes.clear()
+        return count
+
+    def get_tree_summary(self) -> Dict:
+        """Get a summary of the tree structure."""
+        return {
+            "project_id": self.project_id,
+            "project_root": str(self.project_root),
+            "stats": {
+                "total_nodes": self.stats.total_nodes,
+                "file_nodes": self.stats.file_nodes,
+                "directory_nodes": self.stats.directory_nodes,
+                "max_depth": self.stats.max_depth,
+                "total_size": self.stats.total_size,
+                "last_updated": self.stats.last_updated.isoformat() if self.stats.last_updated else None,
+            },
+            "root_hash": self.root.hash_value if self.root else None,
+            "last_scan": self.last_scan_time.isoformat() if self.last_scan_time else None,
+            "changed_nodes": len(self.changed_nodes),
+        }
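
For orientation, the sketch below shows how the MerkleTree class added above could be exercised on its own. It is a minimal example, not taken from the package; the import path follows the wheel's file layout, and the project directory, project ID, and file name are placeholders.

from pathlib import Path

from mcp_code_indexer.vector_mode.monitoring.merkle_tree import MerkleTree

# Placeholder project root and ID (illustrative only).
tree = MerkleTree(project_root=Path("/tmp/example-project"), project_id="example-project")
tree.build_tree()  # full scan; applies the default ignore patterns shown above

summary = tree.get_tree_summary()
print(summary["root_hash"], summary["stats"]["file_nodes"])

# After editing a tracked file, re-hash just that path; ancestor directory
# hashes are recomputed on the way up, so the root hash reflects the change.
if tree.update_file("README.md"):
    print("changed paths:", tree.get_changed_files())
    tree.clear_changed_nodes()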
@@ -0,0 +1,17 @@
+"""
+External service providers for vector mode.
+
+This package provides clean integrations with external services using official SDKs:
+- Voyage AI for embedding generation (voyageai SDK)
+- Turbopuffer for vector storage and search (turbopuffer SDK)
+"""
+
+from .voyage_client import VoyageClient, create_voyage_client
+from .turbopuffer_client import TurbopufferClient, create_turbopuffer_client
+
+__all__ = [
+    'VoyageClient',
+    'create_voyage_client',
+    'TurbopufferClient',
+    'create_turbopuffer_client',
+]
@@ -0,0 +1,217 @@
+"""
+Turbopuffer client for vector storage and search using official SDK.
+
+Provides clean integration with Turbopuffer's vector database for storing
+embeddings and performing similarity searches. Supports configurable
+regions for optimal latency and data residency compliance.
+
+Default region: gcp-europe-west3 (Frankfurt)
+Configure via TURBOPUFFER_REGION environment variable.
+"""
+
+import logging
+import uuid
+from typing import List, Dict, Any, Optional
+import turbopuffer
+
+from ..config import VectorConfig
+
+logger = logging.getLogger(__name__)
+
+class TurbopufferClient:
+    """Clean Turbopuffer client using official SDK."""
+
+    def __init__(self, api_key: str, region: str = "gcp-europe-west3"):
+        self.api_key = api_key
+        self.region = region
+
+        # Initialize official TurboPuffer client
+        self.client = turbopuffer.Turbopuffer(
+            api_key=api_key,
+            region=region
+        )
+        logger.info(f"Initialized TurboPuffer client with region {region}")
+
+    def health_check(self) -> bool:
+        """Check if Turbopuffer service is healthy."""
+        try:
+            namespaces = self.client.namespaces()
+            return True
+        except Exception as e:
+            logger.warning(f"Turbopuffer health check failed: {e}")
+            return False
+
+    def generate_vector_id(self, project_id: str, chunk_id: int) -> str:
+        """Generate a unique vector ID."""
+        return f"{project_id}_{chunk_id}_{uuid.uuid4().hex[:8]}"
+
+    def upsert_vectors(
+        self,
+        vectors: List[Dict[str, Any]],
+        namespace: str,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Store or update vectors in the database."""
+        if not vectors:
+            return {"upserted": 0}
+
+        logger.info(f"Upserting {len(vectors)} vectors to namespace '{namespace}'")
+
+        # Format vectors for Turbopuffer SDK
+        formatted_vectors = []
+        for vector in vectors:
+            if "id" not in vector or "values" not in vector:
+                raise ValueError("Each vector must have 'id' and 'values' fields")
+
+            formatted_vector = {
+                "id": str(vector["id"]),
+                "vector": vector["values"],
+                "attributes": vector.get("metadata", {}),
+            }
+            formatted_vectors.append(formatted_vector)
+
+        try:
+            ns = self.client.namespace(namespace)
+            ns.upsert(vectors=formatted_vectors)
+
+            logger.info(f"Successfully upserted {len(vectors)} vectors")
+            return {"upserted": len(vectors)}
+
+        except Exception as e:
+            logger.error(f"Failed to upsert vectors: {e}")
+            raise RuntimeError(f"Vector upsert failed: {e}")
+
+    def search_vectors(
+        self,
+        query_vector: List[float],
+        top_k: int = 10,
+        namespace: str = "default",
+        filters: Optional[Dict[str, Any]] = None,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        """Search for similar vectors."""
+        logger.debug(f"Searching {top_k} vectors in namespace '{namespace}'")
+
+        try:
+            ns = self.client.namespace(namespace)
+
+            results = ns.query(
+                rank_by=[("vector", "ANN", query_vector)],
+                top_k=top_k,
+                filters=filters,
+                include_attributes=True
+            )
+
+            logger.debug(f"Found {len(results)} similar vectors")
+            return results
+
+        except Exception as e:
+            logger.error(f"Vector search failed: {e}")
+            raise RuntimeError(f"Vector search failed: {e}")
+
+    def delete_vectors(
+        self,
+        vector_ids: List[str],
+        namespace: str,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """Delete vectors by ID."""
+        if not vector_ids:
+            return {"deleted": 0}
+
+        logger.info(f"Deleting {len(vector_ids)} vectors from namespace '{namespace}'")
+
+        try:
+            ns = self.client.namespace(namespace)
+            ns.delete(ids=vector_ids)
+
+            logger.info(f"Successfully deleted vectors")
+            return {"deleted": len(vector_ids)}
+
+        except Exception as e:
+            logger.error(f"Failed to delete vectors: {e}")
+            raise RuntimeError(f"Vector deletion failed: {e}")
+
+    def list_namespaces(self) -> List[str]:
+        """List all available namespaces."""
+        try:
+            namespaces = self.client.namespaces()
+            return [ns.name for ns in namespaces]
+
+        except Exception as e:
+            logger.error(f"Failed to list namespaces: {e}")
+            raise RuntimeError(f"Namespace listing failed: {e}")
+
+    def create_namespace(self, namespace: str, dimension: int, **kwargs) -> Dict[str, Any]:
+        """Create a new namespace."""
+        logger.info(f"Creating namespace '{namespace}' with dimension {dimension}")
+
+        try:
+            self.client.create_namespace(
+                name=namespace,
+                dimension=dimension
+            )
+
+            logger.info(f"Successfully created namespace '{namespace}'")
+            return {"name": namespace, "dimension": dimension}
+
+        except Exception as e:
+            logger.error(f"Failed to create namespace: {e}")
+            raise RuntimeError(f"Namespace creation failed: {e}")
+
+    def delete_namespace(self, namespace: str) -> Dict[str, Any]:
+        """Delete a namespace and all its vectors."""
+        logger.warning(f"Deleting namespace '{namespace}' and all its vectors")
+
+        try:
+            self.client.delete_namespace(namespace)
+
+            logger.info(f"Successfully deleted namespace '{namespace}'")
+            return {"deleted": namespace}
+
+        except Exception as e:
+            logger.error(f"Failed to delete namespace: {e}")
+            raise RuntimeError(f"Namespace deletion failed: {e}")
+
+    def get_namespace_for_project(self, project_id: str) -> str:
+        """Get the namespace name for a project."""
+        # Use project ID as namespace, with prefix for safety
+        safe_project_id = "".join(c if c.isalnum() or c in "-_" else "_" for c in project_id)
+        return f"mcp_code_{safe_project_id}".lower()
+
+    def search_with_metadata_filter(
+        self,
+        query_vector: List[float],
+        project_id: str,
+        chunk_type: Optional[str] = None,
+        file_path: Optional[str] = None,
+        top_k: int = 10,
+        **kwargs
+    ) -> List[Dict[str, Any]]:
+        """Search vectors with metadata filtering."""
+        namespace = self.get_namespace_for_project(project_id)
+
+        # Build metadata filters
+        filters = {"project_id": project_id}
+        if chunk_type:
+            filters["chunk_type"] = chunk_type
+        if file_path:
+            filters["file_path"] = file_path
+
+        return self.search_vectors(
+            query_vector=query_vector,
+            top_k=top_k,
+            namespace=namespace,
+            filters=filters,
+            **kwargs
+        )
+
+def create_turbopuffer_client(config: VectorConfig) -> TurbopufferClient:
+    """Create a Turbopuffer client from configuration."""
+    if not config.turbopuffer_api_key:
+        raise ValueError("TURBOPUFFER_API_KEY is required for vector storage")
+
+    return TurbopufferClient(
+        api_key=config.turbopuffer_api_key,
+        region=config.turbopuffer_region,
+    )
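
As a rough usage sketch (not from the package documentation): the wrapper above can be driven directly from environment variables instead of going through VectorConfig and create_turbopuffer_client. The project ID, vector ID, metadata values, and three-element embedding below are placeholders; a real embedding would have the dimension of the Voyage model in use.

import os

from mcp_code_indexer.vector_mode.providers.turbopuffer_client import TurbopufferClient

client = TurbopufferClient(
    api_key=os.environ["TURBOPUFFER_API_KEY"],
    region=os.environ.get("TURBOPUFFER_REGION", "gcp-europe-west3"),
)

# Per-project namespace derived from the project ID (placeholder ID).
namespace = client.get_namespace_for_project("example-project")

# Each vector dict needs "id" and "values"; "metadata" is stored as Turbopuffer attributes.
client.upsert_vectors(
    vectors=[
        {
            "id": "example-project_1_deadbeef",
            "values": [0.1, 0.2, 0.3],  # placeholder; real dimension matches the embedding model
            "metadata": {"project_id": "example-project", "chunk_type": "function"},
        }
    ],
    namespace=namespace,
)

# Similarity search scoped to the project's namespace and filtered by chunk type.
hits = client.search_with_metadata_filter(
    query_vector=[0.1, 0.2, 0.3],
    project_id="example-project",
    chunk_type="function",
    top_k=5,
)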