mcp-code-indexer 4.0.1__py3-none-any.whl → 4.1.0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in a supported public registry. It is provided for informational purposes only.
- mcp_code_indexer/__init__.py +7 -5
- mcp_code_indexer/ask_handler.py +2 -2
- mcp_code_indexer/claude_api_handler.py +10 -5
- mcp_code_indexer/cleanup_manager.py +20 -12
- mcp_code_indexer/commands/makelocal.py +85 -63
- mcp_code_indexer/data/stop_words_english.txt +1 -1
- mcp_code_indexer/database/connection_health.py +29 -20
- mcp_code_indexer/database/database.py +44 -31
- mcp_code_indexer/database/database_factory.py +19 -20
- mcp_code_indexer/database/exceptions.py +10 -10
- mcp_code_indexer/database/models.py +126 -1
- mcp_code_indexer/database/path_resolver.py +22 -21
- mcp_code_indexer/database/retry_executor.py +37 -19
- mcp_code_indexer/deepask_handler.py +3 -3
- mcp_code_indexer/error_handler.py +46 -20
- mcp_code_indexer/file_scanner.py +15 -12
- mcp_code_indexer/git_hook_handler.py +71 -76
- mcp_code_indexer/logging_config.py +13 -5
- mcp_code_indexer/main.py +85 -22
- mcp_code_indexer/middleware/__init__.py +1 -1
- mcp_code_indexer/middleware/auth.py +47 -43
- mcp_code_indexer/middleware/error_middleware.py +15 -15
- mcp_code_indexer/middleware/logging.py +44 -42
- mcp_code_indexer/middleware/security.py +84 -76
- mcp_code_indexer/migrations/002_performance_indexes.sql +1 -1
- mcp_code_indexer/migrations/004_remove_branch_dependency.sql +14 -14
- mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
- mcp_code_indexer/query_preprocessor.py +2 -2
- mcp_code_indexer/server/mcp_server.py +158 -94
- mcp_code_indexer/transport/__init__.py +1 -1
- mcp_code_indexer/transport/base.py +19 -17
- mcp_code_indexer/transport/http_transport.py +89 -76
- mcp_code_indexer/transport/stdio_transport.py +12 -8
- mcp_code_indexer/vector_mode/__init__.py +36 -0
- mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
- mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
- mcp_code_indexer/vector_mode/config.py +155 -0
- mcp_code_indexer/vector_mode/daemon.py +335 -0
- mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
- mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
- mcp_code_indexer/vector_mode/providers/__init__.py +72 -0
- mcp_code_indexer/vector_mode/providers/base_provider.py +230 -0
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +338 -0
- mcp_code_indexer/vector_mode/providers/voyage_client.py +212 -0
- mcp_code_indexer/vector_mode/security/__init__.py +11 -0
- mcp_code_indexer/vector_mode/security/patterns.py +297 -0
- mcp_code_indexer/vector_mode/security/redactor.py +368 -0
- {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/METADATA +82 -24
- mcp_code_indexer-4.1.0.dist-info/RECORD +66 -0
- mcp_code_indexer-4.0.1.dist-info/RECORD +0 -47
- {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/LICENSE +0 -0
- {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/WHEEL +0 -0
- {mcp_code_indexer-4.0.1.dist-info → mcp_code_indexer-4.1.0.dist-info}/entry_points.txt +0 -0
mcp_code_indexer/vector_mode/monitoring/merkle_tree.py (new file)
@@ -0,0 +1,418 @@
"""
Merkle tree implementation for efficient change detection.

Provides a hierarchical hash tree for detecting file system changes
without scanning entire directory structures.
"""

import hashlib
import logging
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
from dataclasses import dataclass
from datetime import datetime
import json
import os

from ...database.models import MerkleNode as MerkleNodeModel, NodeType

logger = logging.getLogger(__name__)

@dataclass
class TreeStats:
    """Statistics about the Merkle tree."""
    total_nodes: int = 0
    file_nodes: int = 0
    directory_nodes: int = 0
    max_depth: int = 0
    total_size: int = 0
    last_updated: Optional[datetime] = None

class MerkleNode:
    """
    Node in the Merkle tree representing a file or directory.
    """

    def __init__(
        self,
        path: str,
        node_type: NodeType,
        hash_value: Optional[str] = None,
        parent: Optional["MerkleNode"] = None,
    ):
        """
        Initialize a Merkle tree node.

        Args:
            path: Relative path from project root
            node_type: Type of node (file or directory)
            hash_value: Hash value for the node
            parent: Parent node
        """
        self.path = path
        self.node_type = node_type
        self.hash_value = hash_value
        self.parent = parent
        self.children: Dict[str, "MerkleNode"] = {}
        self.last_modified = datetime.utcnow()
        self.size: Optional[int] = None
        self.metadata: Dict[str, Any] = {}

    def add_child(self, name: str, child: "MerkleNode") -> None:
        """Add a child node."""
        child.parent = self
        self.children[name] = child

    def remove_child(self, name: str) -> Optional["MerkleNode"]:
        """Remove and return a child node."""
        return self.children.pop(name, None)

    def get_child(self, name: str) -> Optional["MerkleNode"]:
        """Get a child node by name."""
        return self.children.get(name)

    def compute_hash(self, project_root: Path) -> str:
        """Compute hash for this node."""
        if self.node_type == NodeType.FILE:
            return self._compute_file_hash(project_root)
        else:
            return self._compute_directory_hash()

    def _compute_file_hash(self, project_root: Path) -> str:
        """Compute hash for a file node."""
        file_path = project_root / self.path

        try:
            if not file_path.exists():
                return "deleted"

            # Use file modification time and size for quick comparison
            stat = file_path.stat()
            self.size = stat.st_size

            # For small files, use content hash
            if stat.st_size < 1024 * 1024:  # 1MB
                with open(file_path, 'rb') as f:
                    content = f.read()
                return hashlib.sha256(content).hexdigest()
            else:
                # For large files, use metadata hash
                metadata = f"{stat.st_size}:{stat.st_mtime}"
                return hashlib.sha256(metadata.encode()).hexdigest()

        except (OSError, PermissionError) as e:
            logger.warning(f"Could not hash file {file_path}: {e}")
            return "error"

    def _compute_directory_hash(self) -> str:
        """Compute hash for a directory node based on children."""
        if not self.children:
            return hashlib.sha256(b"empty").hexdigest()

        # Sort children by name for consistent hashing
        child_hashes = []
        for name in sorted(self.children.keys()):
            child = self.children[name]
            child_hash = child.hash_value or ""
            combined = f"{name}:{child_hash}"
            child_hashes.append(combined)

        combined_hash = "|".join(child_hashes)
        return hashlib.sha256(combined_hash.encode()).hexdigest()

    def update_hash(self, project_root: Path) -> bool:
        """Update hash and return True if it changed."""
        old_hash = self.hash_value
        new_hash = self.compute_hash(project_root)

        if old_hash != new_hash:
            self.hash_value = new_hash
            self.last_modified = datetime.utcnow()
            return True

        return False

    def get_depth(self) -> int:
        """Get depth of this node in the tree."""
        if self.parent is None:
            return 0
        return self.parent.get_depth() + 1

    def is_leaf(self) -> bool:
        """Check if this is a leaf node."""
        return len(self.children) == 0

    def to_dict(self) -> Dict:
        """Convert node to dictionary representation."""
        return {
            "path": self.path,
            "node_type": self.node_type.value,
            "hash_value": self.hash_value,
            "last_modified": self.last_modified.isoformat(),
            "size": self.size,
            "children": list(self.children.keys()),
            "metadata": self.metadata,
        }

class MerkleTree:
    """
    Merkle tree for efficient file system change detection.

    Maintains a hierarchical hash tree of the project structure to quickly
    identify changes without scanning all files.
    """

    def __init__(self, project_root: Path, project_id: str):
        """
        Initialize Merkle tree.

        Args:
            project_root: Root directory path
            project_id: Project identifier
        """
        self.project_root = Path(project_root).resolve()
        self.project_id = project_id
        self.root: Optional[MerkleNode] = None
        self.node_map: Dict[str, MerkleNode] = {}  # path -> node mapping

        # Statistics
        self.stats = TreeStats()

        # Change tracking
        self.changed_nodes: Set[str] = set()
        self.last_scan_time: Optional[datetime] = None

    def build_tree(self, ignore_patterns: Optional[List[str]] = None) -> None:
        """Build the complete Merkle tree by scanning the file system."""
        logger.info(f"Building Merkle tree for {self.project_root}")

        ignore_patterns = ignore_patterns or [
            "*.log", "*.tmp", "*~", ".git", "__pycache__",
            "node_modules", "*.pyc", "*.pyo", ".DS_Store", "Thumbs.db"
        ]

        # Create root node
        self.root = MerkleNode("", NodeType.DIRECTORY)
        self.node_map[""] = self.root

        # Recursively build tree
        self._build_tree_recursive(self.project_root, self.root, ignore_patterns)

        # Compute hashes bottom-up
        self._compute_hashes_recursive(self.root)

        # Update statistics
        self._update_stats()
        self.last_scan_time = datetime.utcnow()

        logger.info(
            f"Built Merkle tree: {self.stats.total_nodes} nodes "
            f"({self.stats.file_nodes} files, {self.stats.directory_nodes} directories)"
        )

    def _build_tree_recursive(
        self,
        current_path: Path,
        current_node: MerkleNode,
        ignore_patterns: List[str]
    ) -> None:
        """Recursively build tree structure."""
        try:
            if not current_path.is_dir():
                return

            for item in current_path.iterdir():
                # Check if should ignore
                if self._should_ignore(item, ignore_patterns):
                    continue

                # Get relative path
                try:
                    relative_path = str(item.relative_to(self.project_root))
                except ValueError:
                    continue

                # Create node
                if item.is_file():
                    node = MerkleNode(relative_path, NodeType.FILE)
                    current_node.add_child(item.name, node)
                    self.node_map[relative_path] = node

                elif item.is_dir():
                    node = MerkleNode(relative_path, NodeType.DIRECTORY)
                    current_node.add_child(item.name, node)
                    self.node_map[relative_path] = node

                    # Recurse into directory
                    self._build_tree_recursive(item, node, ignore_patterns)

        except (OSError, PermissionError) as e:
            logger.warning(f"Could not scan directory {current_path}: {e}")

    def _should_ignore(self, path: Path, ignore_patterns: List[str]) -> bool:
        """Check if path should be ignored."""
        import fnmatch

        path_str = path.name
        for pattern in ignore_patterns:
            if fnmatch.fnmatch(path_str, pattern):
                return True

        return False

    def _compute_hashes_recursive(self, node: MerkleNode) -> None:
        """Compute hashes recursively (bottom-up)."""
        # First compute hashes for all children
        for child in node.children.values():
            self._compute_hashes_recursive(child)

        # Then compute hash for this node
        node.update_hash(self.project_root)

    def _update_stats(self) -> None:
        """Update tree statistics."""
        self.stats = TreeStats()
        self.stats.last_updated = datetime.utcnow()

        if self.root:
            self._update_stats_recursive(self.root, 0)

    def _update_stats_recursive(self, node: MerkleNode, depth: int) -> None:
        """Recursively update statistics."""
        self.stats.total_nodes += 1
        self.stats.max_depth = max(self.stats.max_depth, depth)

        if node.node_type == NodeType.FILE:
            self.stats.file_nodes += 1
            if node.size:
                self.stats.total_size += node.size
        else:
            self.stats.directory_nodes += 1

        for child in node.children.values():
            self._update_stats_recursive(child, depth + 1)

    def update_file(self, relative_path: str) -> bool:
        """
        Update a file in the tree and return True if hash changed.

        Args:
            relative_path: Path relative to project root

        Returns:
            True if the file's hash changed
        """
        node = self.node_map.get(relative_path)
        if not node:
            # File might be new, rebuild tree
            logger.info(f"File {relative_path} not in tree, triggering rebuild")
            self.build_tree()
            return True

        # Update file hash
        changed = node.update_hash(self.project_root)

        if changed:
            self.changed_nodes.add(relative_path)

            # Propagate hash changes up the tree
            self._propagate_hash_changes(node.parent)

        return changed

    def _propagate_hash_changes(self, node: Optional[MerkleNode]) -> None:
        """Propagate hash changes up the tree."""
        if not node:
            return

        old_hash = node.hash_value
        node.update_hash(self.project_root)

        if old_hash != node.hash_value:
            self.changed_nodes.add(node.path)
            self._propagate_hash_changes(node.parent)

    def get_changed_files(self, since: Optional[datetime] = None) -> List[str]:
        """Get list of files that changed since timestamp."""
        if since is None:
            return list(self.changed_nodes)

        changed_files = []
        for path in self.changed_nodes:
            node = self.node_map.get(path)
            if node and node.last_modified >= since:
                changed_files.append(path)

        return changed_files

    def verify_tree(self) -> Tuple[bool, List[str]]:
        """
        Verify tree integrity by recomputing hashes.

        Returns:
            Tuple of (is_valid, list_of_errors)
        """
        errors = []

        if not self.root:
            return False, ["No root node"]

        # Recompute all hashes and compare
        for path, node in self.node_map.items():
            expected_hash = node.compute_hash(self.project_root)
            if node.hash_value != expected_hash:
                errors.append(f"Hash mismatch for {path}: {node.hash_value} != {expected_hash}")

        return len(errors) == 0, errors

    def get_subtree_hash(self, relative_path: str) -> Optional[str]:
        """Get hash for a subtree rooted at the given path."""
        node = self.node_map.get(relative_path)
        return node.hash_value if node else None

    def export_to_database_models(self) -> List[MerkleNodeModel]:
        """Export tree to database models for persistence."""
        models = []

        for path, node in self.node_map.items():
            # Determine parent path
            parent_path = None
            if node.parent and node.parent.path:
                parent_path = node.parent.path

            model = MerkleNodeModel(
                project_id=self.project_id,
                path=path,
                hash=node.hash_value or "",
                node_type=node.node_type,
                parent_path=parent_path,
                children_hash=node._compute_directory_hash() if node.node_type == NodeType.DIRECTORY else None,
                last_modified=node.last_modified,
            )
            models.append(model)

        return models

    def clear_changed_nodes(self) -> int:
        """Clear the changed nodes set and return count."""
        count = len(self.changed_nodes)
        self.changed_nodes.clear()
        return count

    def get_tree_summary(self) -> Dict:
        """Get a summary of the tree structure."""
        return {
            "project_id": self.project_id,
            "project_root": str(self.project_root),
            "stats": {
                "total_nodes": self.stats.total_nodes,
                "file_nodes": self.stats.file_nodes,
                "directory_nodes": self.stats.directory_nodes,
                "max_depth": self.stats.max_depth,
                "total_size": self.stats.total_size,
                "last_updated": self.stats.last_updated.isoformat() if self.stats.last_updated else None,
            },
            "root_hash": self.root.hash_value if self.root else None,
            "last_scan": self.last_scan_time.isoformat() if self.last_scan_time else None,
            "changed_nodes": len(self.changed_nodes),
        }
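Taken together, the tree is built once with a full scan and then kept current incrementally: update_file() rehashes a single file and propagates any change up through its ancestor directories, so comparing root hashes is enough to tell whether anything in the project changed. Note the 1 MB cutoff in _compute_file_hash(): small files are content-hashed, while large ones fall back to a size-and-mtime fingerprint, trading exactness for speed. A minimal usage sketch follows (illustrative only, not shipped in the package; the project path and file name are hypothetical placeholders):

# Illustrative sketch of driving the MerkleTree added above; not part of the
# package. "/path/to/project" and "src/app.py" are hypothetical placeholders.
from pathlib import Path

from mcp_code_indexer.vector_mode.monitoring.merkle_tree import MerkleTree

tree = MerkleTree(Path("/path/to/project"), project_id="demo-project")
tree.build_tree()  # full scan; hashes are computed bottom-up

# Later, when a watcher reports that a file may have changed:
if tree.update_file("src/app.py"):
    # The file's hash changed, so it and its ancestor directories are dirty.
    for path in tree.get_changed_files():
        print("changed:", path)
    tree.clear_changed_nodes()

# The root hash ("" maps to the root node) is a cheap whole-project fingerprint.
print(tree.get_subtree_hash(""))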
mcp_code_indexer/vector_mode/providers/__init__.py (new file)
@@ -0,0 +1,72 @@
"""
External service providers for vector mode.

This package provides integrations with external services including:
- Voyage AI for embedding generation
- Turbopuffer for vector storage and search
"""

from typing import Protocol, List, Dict, Any, Optional
from abc import abstractmethod

class EmbeddingProvider(Protocol):
    """Protocol for embedding generation providers."""

    @abstractmethod
    async def generate_embeddings(
        self,
        texts: List[str],
        input_type: str = "document",
        **kwargs
    ) -> List[List[float]]:
        """Generate embeddings for a list of texts."""
        ...

    @abstractmethod
    async def get_embedding_dimension(self) -> int:
        """Get the dimension of embeddings produced by this provider."""
        ...

class VectorStoreProvider(Protocol):
    """Protocol for vector storage providers."""

    @abstractmethod
    async def upsert_vectors(
        self,
        vectors: List[Dict[str, Any]],
        namespace: Optional[str] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """Store or update vectors in the database."""
        ...

    @abstractmethod
    async def search_vectors(
        self,
        query_vector: List[float],
        top_k: int = 10,
        namespace: Optional[str] = None,
        filters: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> List[Dict[str, Any]]:
        """Search for similar vectors."""
        ...

    @abstractmethod
    async def delete_vectors(
        self,
        vector_ids: List[str],
        namespace: Optional[str] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """Delete vectors by ID."""
        ...

    @abstractmethod
    async def get_namespace_stats(
        self,
        namespace: Optional[str] = None,
        **kwargs
    ) -> Dict[str, Any]:
        """Get statistics about a namespace."""
        ...