mcp-code-indexer 4.0.1__py3-none-any.whl → 4.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. mcp_code_indexer/__init__.py +7 -5
  2. mcp_code_indexer/ask_handler.py +2 -2
  3. mcp_code_indexer/claude_api_handler.py +10 -5
  4. mcp_code_indexer/cleanup_manager.py +20 -12
  5. mcp_code_indexer/commands/makelocal.py +85 -63
  6. mcp_code_indexer/data/stop_words_english.txt +1 -1
  7. mcp_code_indexer/database/connection_health.py +29 -20
  8. mcp_code_indexer/database/database.py +44 -31
  9. mcp_code_indexer/database/database_factory.py +19 -20
  10. mcp_code_indexer/database/exceptions.py +10 -10
  11. mcp_code_indexer/database/models.py +126 -1
  12. mcp_code_indexer/database/path_resolver.py +22 -21
  13. mcp_code_indexer/database/retry_executor.py +37 -19
  14. mcp_code_indexer/deepask_handler.py +3 -3
  15. mcp_code_indexer/error_handler.py +46 -20
  16. mcp_code_indexer/file_scanner.py +15 -12
  17. mcp_code_indexer/git_hook_handler.py +71 -76
  18. mcp_code_indexer/logging_config.py +13 -5
  19. mcp_code_indexer/main.py +85 -22
  20. mcp_code_indexer/middleware/__init__.py +1 -1
  21. mcp_code_indexer/middleware/auth.py +47 -43
  22. mcp_code_indexer/middleware/error_middleware.py +15 -15
  23. mcp_code_indexer/middleware/logging.py +44 -42
  24. mcp_code_indexer/middleware/security.py +84 -76
  25. mcp_code_indexer/migrations/002_performance_indexes.sql +1 -1
  26. mcp_code_indexer/migrations/004_remove_branch_dependency.sql +14 -14
  27. mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
  28. mcp_code_indexer/query_preprocessor.py +2 -2
  29. mcp_code_indexer/server/mcp_server.py +158 -94
  30. mcp_code_indexer/transport/__init__.py +1 -1
  31. mcp_code_indexer/transport/base.py +19 -17
  32. mcp_code_indexer/transport/http_transport.py +89 -76
  33. mcp_code_indexer/transport/stdio_transport.py +12 -8
  34. mcp_code_indexer/vector_mode/__init__.py +36 -0
  35. mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
  36. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
  37. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
  38. mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
  39. mcp_code_indexer/vector_mode/config.py +155 -0
  40. mcp_code_indexer/vector_mode/daemon.py +335 -0
  41. mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
  42. mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
  43. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
  44. mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
  45. mcp_code_indexer/vector_mode/providers/__init__.py +72 -0
  46. mcp_code_indexer/vector_mode/providers/base_provider.py +230 -0
  47. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +338 -0
  48. mcp_code_indexer/vector_mode/providers/voyage_client.py +212 -0
  49. mcp_code_indexer/vector_mode/security/__init__.py +11 -0
  50. mcp_code_indexer/vector_mode/security/patterns.py +297 -0
  51. mcp_code_indexer/vector_mode/security/redactor.py +368 -0
  52. {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/METADATA +82 -24
  53. mcp_code_indexer-4.1.0.dist-info/RECORD +66 -0
  54. mcp_code_indexer-4.0.1.dist-info/RECORD +0 -47
  55. {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/LICENSE +0 -0
  56. {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/WHEEL +0 -0
  57. {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,418 @@
1
+ """
2
+ Merkle tree implementation for efficient change detection.
3
+
4
+ Provides a hierarchical hash tree for detecting file system changes
5
+ without scanning entire directory structures.
6
+ """
7
+
8
import fnmatch
import hashlib
import json
import logging
import os
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

from ...database.models import MerkleNode as MerkleNodeModel, NodeType
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ @dataclass
22
+ class TreeStats:
23
+ """Statistics about the Merkle tree."""
24
+ total_nodes: int = 0
25
+ file_nodes: int = 0
26
+ directory_nodes: int = 0
27
+ max_depth: int = 0
28
+ total_size: int = 0
29
+ last_updated: Optional[datetime] = None
30
+
31
+ class MerkleNode:
32
+ """
33
+ Node in the Merkle tree representing a file or directory.
34
+ """
35
+
36
+ def __init__(
37
+ self,
38
+ path: str,
39
+ node_type: NodeType,
40
+ hash_value: Optional[str] = None,
41
+ parent: Optional["MerkleNode"] = None,
42
+ ):
43
+ """
44
+ Initialize a Merkle tree node.
45
+
46
+ Args:
47
+ path: Relative path from project root
48
+ node_type: Type of node (file or directory)
49
+ hash_value: Hash value for the node
50
+ parent: Parent node
51
+ """
52
+ self.path = path
53
+ self.node_type = node_type
54
+ self.hash_value = hash_value
55
+ self.parent = parent
56
+ self.children: Dict[str, "MerkleNode"] = {}
57
+ self.last_modified = datetime.utcnow()
58
+ self.size: Optional[int] = None
59
+ self.metadata: Dict[str, any] = {}
60
+
61
+ def add_child(self, name: str, child: "MerkleNode") -> None:
62
+ """Add a child node."""
63
+ child.parent = self
64
+ self.children[name] = child
65
+
66
+ def remove_child(self, name: str) -> Optional["MerkleNode"]:
67
+ """Remove and return a child node."""
68
+ return self.children.pop(name, None)
69
+
70
+ def get_child(self, name: str) -> Optional["MerkleNode"]:
71
+ """Get a child node by name."""
72
+ return self.children.get(name)
73
+
74
+ def compute_hash(self, project_root: Path) -> str:
75
+ """Compute hash for this node."""
76
+ if self.node_type == NodeType.FILE:
77
+ return self._compute_file_hash(project_root)
78
+ else:
79
+ return self._compute_directory_hash()
80
+
81
+ def _compute_file_hash(self, project_root: Path) -> str:
82
+ """Compute hash for a file node."""
83
+ file_path = project_root / self.path
84
+
85
+ try:
86
+ if not file_path.exists():
87
+ return "deleted"
88
+
89
+ # Use file modification time and size for quick comparison
90
+ stat = file_path.stat()
91
+ self.size = stat.st_size
92
+
93
+ # For small files, use content hash
94
+ if stat.st_size < 1024 * 1024: # 1MB
95
+ with open(file_path, 'rb') as f:
96
+ content = f.read()
97
+ return hashlib.sha256(content).hexdigest()
98
+ else:
99
+ # For large files, use metadata hash
100
+ metadata = f"{stat.st_size}:{stat.st_mtime}"
101
+ return hashlib.sha256(metadata.encode()).hexdigest()
102
+
103
+ except (OSError, PermissionError) as e:
104
+ logger.warning(f"Could not hash file {file_path}: {e}")
105
+ return "error"
106
+
107
+ def _compute_directory_hash(self) -> str:
108
+ """Compute hash for a directory node based on children."""
109
+ if not self.children:
110
+ return hashlib.sha256(b"empty").hexdigest()
111
+
112
+ # Sort children by name for consistent hashing
113
+ child_hashes = []
114
+ for name in sorted(self.children.keys()):
115
+ child = self.children[name]
116
+ child_hash = child.hash_value or ""
117
+ combined = f"{name}:{child_hash}"
118
+ child_hashes.append(combined)
119
+
120
+ combined_hash = "|".join(child_hashes)
121
+ return hashlib.sha256(combined_hash.encode()).hexdigest()
122
+
123
+ def update_hash(self, project_root: Path) -> bool:
124
+ """Update hash and return True if it changed."""
125
+ old_hash = self.hash_value
126
+ new_hash = self.compute_hash(project_root)
127
+
128
+ if old_hash != new_hash:
129
+ self.hash_value = new_hash
130
+ self.last_modified = datetime.utcnow()
131
+ return True
132
+
133
+ return False
134
+
135
+ def get_depth(self) -> int:
136
+ """Get depth of this node in the tree."""
137
+ if self.parent is None:
138
+ return 0
139
+ return self.parent.get_depth() + 1
140
+
141
+ def is_leaf(self) -> bool:
142
+ """Check if this is a leaf node."""
143
+ return len(self.children) == 0
144
+
145
+ def to_dict(self) -> Dict:
146
+ """Convert node to dictionary representation."""
147
+ return {
148
+ "path": self.path,
149
+ "node_type": self.node_type.value,
150
+ "hash_value": self.hash_value,
151
+ "last_modified": self.last_modified.isoformat(),
152
+ "size": self.size,
153
+ "children": list(self.children.keys()),
154
+ "metadata": self.metadata,
155
+ }
156
+
157
+ class MerkleTree:
158
+ """
159
+ Merkle tree for efficient file system change detection.
160
+
161
+ Maintains a hierarchical hash tree of the project structure to quickly
162
+ identify changes without scanning all files.
163
+ """
164
+
165
+ def __init__(self, project_root: Path, project_id: str):
166
+ """
167
+ Initialize Merkle tree.
168
+
169
+ Args:
170
+ project_root: Root directory path
171
+ project_id: Project identifier
172
+ """
173
+ self.project_root = Path(project_root).resolve()
174
+ self.project_id = project_id
175
+ self.root: Optional[MerkleNode] = None
176
+ self.node_map: Dict[str, MerkleNode] = {} # path -> node mapping
177
+
178
+ # Statistics
179
+ self.stats = TreeStats()
180
+
181
+ # Change tracking
182
+ self.changed_nodes: Set[str] = set()
183
+ self.last_scan_time: Optional[datetime] = None
184
+
185
+ def build_tree(self, ignore_patterns: Optional[List[str]] = None) -> None:
186
+ """Build the complete Merkle tree by scanning the file system."""
187
+ logger.info(f"Building Merkle tree for {self.project_root}")
188
+
189
+ ignore_patterns = ignore_patterns or [
190
+ "*.log", "*.tmp", "*~", ".git", "__pycache__",
191
+ "node_modules", "*.pyc", "*.pyo", ".DS_Store", "Thumbs.db"
192
+ ]
193
+
194
+ # Create root node
195
+ self.root = MerkleNode("", NodeType.DIRECTORY)
196
+ self.node_map[""] = self.root
197
+
198
+ # Recursively build tree
199
+ self._build_tree_recursive(self.project_root, self.root, ignore_patterns)
200
+
201
+ # Compute hashes bottom-up
202
+ self._compute_hashes_recursive(self.root)
203
+
204
+ # Update statistics
205
+ self._update_stats()
206
+ self.last_scan_time = datetime.utcnow()
207
+
208
+ logger.info(
209
+ f"Built Merkle tree: {self.stats.total_nodes} nodes "
210
+ f"({self.stats.file_nodes} files, {self.stats.directory_nodes} directories)"
211
+ )
212
+
213
+ def _build_tree_recursive(
214
+ self,
215
+ current_path: Path,
216
+ current_node: MerkleNode,
217
+ ignore_patterns: List[str]
218
+ ) -> None:
219
+ """Recursively build tree structure."""
220
+ try:
221
+ if not current_path.is_dir():
222
+ return
223
+
224
+ for item in current_path.iterdir():
225
+ # Check if should ignore
226
+ if self._should_ignore(item, ignore_patterns):
227
+ continue
228
+
229
+ # Get relative path
230
+ try:
231
+ relative_path = str(item.relative_to(self.project_root))
232
+ except ValueError:
233
+ continue
234
+
235
+ # Create node
236
+ if item.is_file():
237
+ node = MerkleNode(relative_path, NodeType.FILE)
238
+ current_node.add_child(item.name, node)
239
+ self.node_map[relative_path] = node
240
+
241
+ elif item.is_dir():
242
+ node = MerkleNode(relative_path, NodeType.DIRECTORY)
243
+ current_node.add_child(item.name, node)
244
+ self.node_map[relative_path] = node
245
+
246
+ # Recurse into directory
247
+ self._build_tree_recursive(item, node, ignore_patterns)
248
+
249
+ except (OSError, PermissionError) as e:
250
+ logger.warning(f"Could not scan directory {current_path}: {e}")
251
+
252
+ def _should_ignore(self, path: Path, ignore_patterns: List[str]) -> bool:
253
+ """Check if path should be ignored."""
254
+ import fnmatch
255
+
256
+ path_str = path.name
257
+ for pattern in ignore_patterns:
258
+ if fnmatch.fnmatch(path_str, pattern):
259
+ return True
260
+
261
+ return False
262
+
263
+ def _compute_hashes_recursive(self, node: MerkleNode) -> None:
264
+ """Compute hashes recursively (bottom-up)."""
265
+ # First compute hashes for all children
266
+ for child in node.children.values():
267
+ self._compute_hashes_recursive(child)
268
+
269
+ # Then compute hash for this node
270
+ node.update_hash(self.project_root)
271
+
272
+ def _update_stats(self) -> None:
273
+ """Update tree statistics."""
274
+ self.stats = TreeStats()
275
+ self.stats.last_updated = datetime.utcnow()
276
+
277
+ if self.root:
278
+ self._update_stats_recursive(self.root, 0)
279
+
280
+ def _update_stats_recursive(self, node: MerkleNode, depth: int) -> None:
281
+ """Recursively update statistics."""
282
+ self.stats.total_nodes += 1
283
+ self.stats.max_depth = max(self.stats.max_depth, depth)
284
+
285
+ if node.node_type == NodeType.FILE:
286
+ self.stats.file_nodes += 1
287
+ if node.size:
288
+ self.stats.total_size += node.size
289
+ else:
290
+ self.stats.directory_nodes += 1
291
+
292
+ for child in node.children.values():
293
+ self._update_stats_recursive(child, depth + 1)
294
+
295
+ def update_file(self, relative_path: str) -> bool:
296
+ """
297
+ Update a file in the tree and return True if hash changed.
298
+
299
+ Args:
300
+ relative_path: Path relative to project root
301
+
302
+ Returns:
303
+ True if the file's hash changed
304
+ """
305
+ node = self.node_map.get(relative_path)
306
+ if not node:
307
+ # File might be new, rebuild tree
308
+ logger.info(f"File {relative_path} not in tree, triggering rebuild")
309
+ self.build_tree()
310
+ return True
311
+
312
+ # Update file hash
313
+ changed = node.update_hash(self.project_root)
314
+
315
+ if changed:
316
+ self.changed_nodes.add(relative_path)
317
+
318
+ # Propagate hash changes up the tree
319
+ self._propagate_hash_changes(node.parent)
320
+
321
+ return changed
322
+
323
+ def _propagate_hash_changes(self, node: Optional[MerkleNode]) -> None:
324
+ """Propagate hash changes up the tree."""
325
+ if not node:
326
+ return
327
+
328
+ old_hash = node.hash_value
329
+ node.update_hash(self.project_root)
330
+
331
+ if old_hash != node.hash_value:
332
+ self.changed_nodes.add(node.path)
333
+ self._propagate_hash_changes(node.parent)
334
+
335
+ def get_changed_files(self, since: Optional[datetime] = None) -> List[str]:
336
+ """Get list of files that changed since timestamp."""
337
+ if since is None:
338
+ return list(self.changed_nodes)
339
+
340
+ changed_files = []
341
+ for path in self.changed_nodes:
342
+ node = self.node_map.get(path)
343
+ if node and node.last_modified >= since:
344
+ changed_files.append(path)
345
+
346
+ return changed_files
347
+
348
+ def verify_tree(self) -> Tuple[bool, List[str]]:
349
+ """
350
+ Verify tree integrity by recomputing hashes.
351
+
352
+ Returns:
353
+ Tuple of (is_valid, list_of_errors)
354
+ """
355
+ errors = []
356
+
357
+ if not self.root:
358
+ return False, ["No root node"]
359
+
360
+ # Recompute all hashes and compare
361
+ for path, node in self.node_map.items():
362
+ expected_hash = node.compute_hash(self.project_root)
363
+ if node.hash_value != expected_hash:
364
+ errors.append(f"Hash mismatch for {path}: {node.hash_value} != {expected_hash}")
365
+
366
+ return len(errors) == 0, errors
367
+
368
+ def get_subtree_hash(self, relative_path: str) -> Optional[str]:
369
+ """Get hash for a subtree rooted at the given path."""
370
+ node = self.node_map.get(relative_path)
371
+ return node.hash_value if node else None
372
+
373
+ def export_to_database_models(self) -> List[MerkleNodeModel]:
374
+ """Export tree to database models for persistence."""
375
+ models = []
376
+
377
+ for path, node in self.node_map.items():
378
+ # Determine parent path
379
+ parent_path = None
380
+ if node.parent and node.parent.path:
381
+ parent_path = node.parent.path
382
+
383
+ model = MerkleNodeModel(
384
+ project_id=self.project_id,
385
+ path=path,
386
+ hash=node.hash_value or "",
387
+ node_type=node.node_type,
388
+ parent_path=parent_path,
389
+ children_hash=node._compute_directory_hash() if node.node_type == NodeType.DIRECTORY else None,
390
+ last_modified=node.last_modified,
391
+ )
392
+ models.append(model)
393
+
394
+ return models
395
+
396
+ def clear_changed_nodes(self) -> int:
397
+ """Clear the changed nodes set and return count."""
398
+ count = len(self.changed_nodes)
399
+ self.changed_nodes.clear()
400
+ return count
401
+
402
+ def get_tree_summary(self) -> Dict:
403
+ """Get a summary of the tree structure."""
404
+ return {
405
+ "project_id": self.project_id,
406
+ "project_root": str(self.project_root),
407
+ "stats": {
408
+ "total_nodes": self.stats.total_nodes,
409
+ "file_nodes": self.stats.file_nodes,
410
+ "directory_nodes": self.stats.directory_nodes,
411
+ "max_depth": self.stats.max_depth,
412
+ "total_size": self.stats.total_size,
413
+ "last_updated": self.stats.last_updated.isoformat() if self.stats.last_updated else None,
414
+ },
415
+ "root_hash": self.root.hash_value if self.root else None,
416
+ "last_scan": self.last_scan_time.isoformat() if self.last_scan_time else None,
417
+ "changed_nodes": len(self.changed_nodes),
418
+ }
@@ -0,0 +1,72 @@
1
+ """
2
+ External service providers for vector mode.
3
+
4
+ This package provides integrations with external services including:
5
+ - Voyage AI for embedding generation
6
+ - Turbopuffer for vector storage and search
7
+ """
8
+
9
+ from typing import Protocol, List, Dict, Any, Optional
10
+ from abc import abstractmethod
11
+
12
+ class EmbeddingProvider(Protocol):
13
+ """Protocol for embedding generation providers."""
14
+
15
+ @abstractmethod
16
+ async def generate_embeddings(
17
+ self,
18
+ texts: List[str],
19
+ input_type: str = "document",
20
+ **kwargs
21
+ ) -> List[List[float]]:
22
+ """Generate embeddings for a list of texts."""
23
+ ...
24
+
25
+ @abstractmethod
26
+ async def get_embedding_dimension(self) -> int:
27
+ """Get the dimension of embeddings produced by this provider."""
28
+ ...
29
+
30
+ class VectorStoreProvider(Protocol):
31
+ """Protocol for vector storage providers."""
32
+
33
+ @abstractmethod
34
+ async def upsert_vectors(
35
+ self,
36
+ vectors: List[Dict[str, Any]],
37
+ namespace: Optional[str] = None,
38
+ **kwargs
39
+ ) -> Dict[str, Any]:
40
+ """Store or update vectors in the database."""
41
+ ...
42
+
43
+ @abstractmethod
44
+ async def search_vectors(
45
+ self,
46
+ query_vector: List[float],
47
+ top_k: int = 10,
48
+ namespace: Optional[str] = None,
49
+ filters: Optional[Dict[str, Any]] = None,
50
+ **kwargs
51
+ ) -> List[Dict[str, Any]]:
52
+ """Search for similar vectors."""
53
+ ...
54
+
55
+ @abstractmethod
56
+ async def delete_vectors(
57
+ self,
58
+ vector_ids: List[str],
59
+ namespace: Optional[str] = None,
60
+ **kwargs
61
+ ) -> Dict[str, Any]:
62
+ """Delete vectors by ID."""
63
+ ...
64
+
65
+ @abstractmethod
66
+ async def get_namespace_stats(
67
+ self,
68
+ namespace: Optional[str] = None,
69
+ **kwargs
70
+ ) -> Dict[str, Any]:
71
+ """Get statistics about a namespace."""
72
+ ...