mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_code_indexer/database/models.py +125 -1
- mcp_code_indexer/main.py +60 -0
- mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
- mcp_code_indexer/server/mcp_server.py +3 -0
- mcp_code_indexer/vector_mode/__init__.py +36 -0
- mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
- mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
- mcp_code_indexer/vector_mode/config.py +167 -0
- mcp_code_indexer/vector_mode/daemon.py +335 -0
- mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
- mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
- mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
- mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
- mcp_code_indexer/vector_mode/security/__init__.py +11 -0
- mcp_code_indexer/vector_mode/security/patterns.py +297 -0
- mcp_code_indexer/vector_mode/security/redactor.py +368 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
- {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,445 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Real-time file system monitoring using watchdog.
|
|
3
|
+
|
|
4
|
+
Provides efficient file change detection with debouncing and pattern filtering
|
|
5
|
+
for the vector mode indexing pipeline.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import logging
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Callable, Optional, List, Dict, Any
|
|
12
|
+
import time
|
|
13
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
14
|
+
|
|
15
|
+
# watchdog is an optional dependency: when it is missing the module must still
# import so that the polling fallback defined further down can be used.
try:
    from watchdog.observers import Observer
    from watchdog.events import FileSystemEventHandler, FileSystemEvent
    WATCHDOG_AVAILABLE = True
except ImportError:
    WATCHDOG_AVAILABLE = False
    Observer = None
    # Must be a real class, not None: it is used as the base class of
    # VectorModeEventHandler, and `class X(None)` raises TypeError at import
    # time, which would make the whole module unusable without watchdog.
    FileSystemEventHandler = object
    FileSystemEvent = None
|
|
24
|
+
|
|
25
|
+
from .change_detector import ChangeDetector, FileChange, ChangeType
|
|
26
|
+
from .merkle_tree import MerkleTree
|
|
27
|
+
|
|
28
|
+
logger = logging.getLogger(__name__)
|
|
29
|
+
|
|
30
|
+
class VectorModeEventHandler(FileSystemEventHandler):
    """
    Event handler for file system changes in vector mode.

    Non-directory events are forwarded to the change detector. When an asyncio
    event loop is running in the calling thread, events are debounced per
    source path (only the latest event for a path is processed); otherwise
    they are processed immediately in the calling thread.
    """

    def __init__(
        self,
        change_detector: ChangeDetector,
        merkle_tree: Optional[MerkleTree] = None,
        callback: Optional[Callable[[FileChange], None]] = None,
        debounce_interval: float = 0.1,
    ):
        """
        Initialize event handler.

        Args:
            change_detector: Change detection processor
            merkle_tree: Optional Merkle tree for efficient change tracking
            callback: Optional callback for change notifications
            debounce_interval: Seconds to wait for further events on the same
                path before processing one (default 0.1, i.e. 100ms)
        """
        super().__init__()
        self.change_detector = change_detector
        self.merkle_tree = merkle_tree
        self.callback = callback
        self.debounce_interval = debounce_interval

        # Debouncing state, both keyed by the event's src_path
        self.pending_events: Dict[str, FileSystemEvent] = {}
        self.debounce_tasks: Dict[str, asyncio.Task] = {}

    def on_any_event(self, event: FileSystemEvent) -> None:
        """Handle any file system event."""
        if event.is_directory:
            return  # Skip directory events for now

        # Look for a running loop *before* building the coroutine: creating
        # the coroutine first and letting create_task() fail would leave an
        # un-awaited coroutine behind on every event.
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No event loop running in this thread, handle synchronously
            self._handle_event_sync(event)
            return

        loop.create_task(self._handle_event_async(event))

    def _handle_event_sync(self, event: FileSystemEvent) -> None:
        """
        Process one event: run change detection, then update the Merkle tree
        and invoke the callback if a change was reported.

        Failures in the Merkle tree update or in the callback are logged and
        do not propagate; errors raised by the change detector do.
        """
        path = Path(event.src_path)

        # Some watchdog versions define dest_path on every event and leave it
        # empty for non-move events, so test the value rather than its mere
        # presence (Path("") would otherwise become Path(".")).
        dest_path = getattr(event, "dest_path", None)
        # NOTE(review): watchdog documents src_path as the pre-move location
        # and dest_path as the new one; here dest_path is passed as old_path.
        # Confirm against ChangeDetector.process_fs_event before changing.
        old_path = Path(dest_path) if dest_path else None

        # Process the change
        change = self.change_detector.process_fs_event(
            event_type=event.event_type,
            path=path,
            old_path=old_path,
        )

        if not change:
            return

        # Update Merkle tree if available
        if self.merkle_tree:
            try:
                self.merkle_tree.update_file(change.path)
            except Exception as e:
                logger.warning(f"Failed to update Merkle tree for {change.path}: {e}")

        # Call callback if provided
        if self.callback:
            try:
                self.callback(change)
            except Exception as e:
                logger.error(f"Callback failed for change {change.path}: {e}")

    async def _handle_event_async(self, event: FileSystemEvent) -> None:
        """Handle event asynchronously with debouncing."""
        file_path = event.src_path

        # Cancel existing debounce task for this file; the newest event wins
        existing = self.debounce_tasks.get(file_path)
        if existing is not None:
            existing.cancel()

        # Store pending event
        self.pending_events[file_path] = event

        # Create new debounce task
        self.debounce_tasks[file_path] = asyncio.create_task(
            self._process_after_debounce(file_path)
        )

    async def _process_after_debounce(self, file_path: str) -> None:
        """Process the pending event for file_path after the debounce delay."""
        # Wait for debounce interval. A cancellation lands here, before the
        # try block, so a superseded task never touches the bookkeeping that
        # now belongs to its replacement.
        await asyncio.sleep(self.debounce_interval)

        try:
            # Get pending event
            event = self.pending_events.pop(file_path, None)
            if event:
                self._handle_event_sync(event)
        finally:
            # Clean up task reference even if processing raised
            self.debounce_tasks.pop(file_path, None)
|
|
121
|
+
|
|
122
|
+
class FileWatcher:
|
|
123
|
+
"""
|
|
124
|
+
Real-time file system watcher for vector mode.
|
|
125
|
+
|
|
126
|
+
Monitors file changes and integrates with change detection and Merkle tree
|
|
127
|
+
systems for efficient vector index updates.
|
|
128
|
+
"""
|
|
129
|
+
|
|
130
|
+
def __init__(
|
|
131
|
+
self,
|
|
132
|
+
project_root: Path,
|
|
133
|
+
project_id: str,
|
|
134
|
+
ignore_patterns: Optional[List[str]] = None,
|
|
135
|
+
debounce_interval: float = 0.1,
|
|
136
|
+
enable_merkle_tree: bool = True,
|
|
137
|
+
):
|
|
138
|
+
"""
|
|
139
|
+
Initialize file watcher.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
project_root: Root directory to watch
|
|
143
|
+
project_id: Project identifier
|
|
144
|
+
ignore_patterns: Patterns to ignore
|
|
145
|
+
debounce_interval: Debounce interval in seconds
|
|
146
|
+
enable_merkle_tree: Whether to use Merkle tree for change tracking
|
|
147
|
+
"""
|
|
148
|
+
if not WATCHDOG_AVAILABLE:
|
|
149
|
+
raise ImportError("watchdog library is required for file monitoring")
|
|
150
|
+
|
|
151
|
+
self.project_root = Path(project_root).resolve()
|
|
152
|
+
self.project_id = project_id
|
|
153
|
+
self.ignore_patterns = ignore_patterns
|
|
154
|
+
self.debounce_interval = debounce_interval
|
|
155
|
+
|
|
156
|
+
# Initialize components
|
|
157
|
+
self.change_detector = ChangeDetector(
|
|
158
|
+
project_root=self.project_root,
|
|
159
|
+
ignore_patterns=ignore_patterns,
|
|
160
|
+
debounce_interval=debounce_interval,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
self.merkle_tree: Optional[MerkleTree] = None
|
|
164
|
+
if enable_merkle_tree:
|
|
165
|
+
self.merkle_tree = MerkleTree(self.project_root, project_id)
|
|
166
|
+
|
|
167
|
+
# Watchdog components
|
|
168
|
+
self.observer: Optional[Observer] = None
|
|
169
|
+
self.event_handler: Optional[VectorModeEventHandler] = None
|
|
170
|
+
|
|
171
|
+
# State
|
|
172
|
+
self.is_watching = False
|
|
173
|
+
self.change_callbacks: List[Callable[[FileChange], None]] = []
|
|
174
|
+
|
|
175
|
+
# Thread pool for intensive operations
|
|
176
|
+
self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="file_watcher")
|
|
177
|
+
|
|
178
|
+
def add_change_callback(self, callback: Callable[[FileChange], None]) -> None:
|
|
179
|
+
"""Add a callback to be called when files change."""
|
|
180
|
+
self.change_callbacks.append(callback)
|
|
181
|
+
|
|
182
|
+
def remove_change_callback(self, callback: Callable[[FileChange], None]) -> None:
|
|
183
|
+
"""Remove a change callback."""
|
|
184
|
+
if callback in self.change_callbacks:
|
|
185
|
+
self.change_callbacks.remove(callback)
|
|
186
|
+
|
|
187
|
+
def _on_change(self, change: FileChange) -> None:
|
|
188
|
+
"""Handle a file change by notifying all callbacks."""
|
|
189
|
+
for callback in self.change_callbacks:
|
|
190
|
+
try:
|
|
191
|
+
callback(change)
|
|
192
|
+
except Exception as e:
|
|
193
|
+
logger.error(f"Change callback failed: {e}")
|
|
194
|
+
|
|
195
|
+
async def initialize(self) -> None:
|
|
196
|
+
"""Initialize the file watcher (build Merkle tree, etc.)."""
|
|
197
|
+
logger.info(f"Initializing file watcher for {self.project_root}")
|
|
198
|
+
|
|
199
|
+
# Build Merkle tree in thread pool to avoid blocking
|
|
200
|
+
if self.merkle_tree:
|
|
201
|
+
loop = asyncio.get_event_loop()
|
|
202
|
+
await loop.run_in_executor(
|
|
203
|
+
self.executor,
|
|
204
|
+
self.merkle_tree.build_tree,
|
|
205
|
+
self.ignore_patterns
|
|
206
|
+
)
|
|
207
|
+
|
|
208
|
+
logger.info("Merkle tree built successfully")
|
|
209
|
+
|
|
210
|
+
def start_watching(self) -> None:
|
|
211
|
+
"""Start watching for file changes."""
|
|
212
|
+
if self.is_watching:
|
|
213
|
+
logger.warning("File watcher is already running")
|
|
214
|
+
return
|
|
215
|
+
|
|
216
|
+
if not WATCHDOG_AVAILABLE:
|
|
217
|
+
logger.error("Cannot start file watching: watchdog not available")
|
|
218
|
+
return
|
|
219
|
+
|
|
220
|
+
logger.info(f"Starting file watcher for {self.project_root}")
|
|
221
|
+
|
|
222
|
+
# Create event handler
|
|
223
|
+
self.event_handler = VectorModeEventHandler(
|
|
224
|
+
change_detector=self.change_detector,
|
|
225
|
+
merkle_tree=self.merkle_tree,
|
|
226
|
+
callback=self._on_change,
|
|
227
|
+
)
|
|
228
|
+
|
|
229
|
+
# Create and start observer
|
|
230
|
+
self.observer = Observer()
|
|
231
|
+
self.observer.schedule(
|
|
232
|
+
self.event_handler,
|
|
233
|
+
str(self.project_root),
|
|
234
|
+
recursive=True
|
|
235
|
+
)
|
|
236
|
+
self.observer.start()
|
|
237
|
+
|
|
238
|
+
self.is_watching = True
|
|
239
|
+
logger.info("File watcher started successfully")
|
|
240
|
+
|
|
241
|
+
def stop_watching(self) -> None:
|
|
242
|
+
"""Stop watching for file changes."""
|
|
243
|
+
if not self.is_watching:
|
|
244
|
+
return
|
|
245
|
+
|
|
246
|
+
logger.info("Stopping file watcher")
|
|
247
|
+
|
|
248
|
+
if self.observer:
|
|
249
|
+
self.observer.stop()
|
|
250
|
+
self.observer.join()
|
|
251
|
+
self.observer = None
|
|
252
|
+
|
|
253
|
+
self.event_handler = None
|
|
254
|
+
self.is_watching = False
|
|
255
|
+
|
|
256
|
+
logger.info("File watcher stopped")
|
|
257
|
+
|
|
258
|
+
def get_recent_changes(
|
|
259
|
+
self,
|
|
260
|
+
limit: Optional[int] = None,
|
|
261
|
+
change_types: Optional[List[ChangeType]] = None
|
|
262
|
+
) -> List[FileChange]:
|
|
263
|
+
"""Get recent file changes."""
|
|
264
|
+
return self.change_detector.get_recent_changes(limit, change_types)
|
|
265
|
+
|
|
266
|
+
def get_changed_files(self, since: Optional[str] = None) -> List[str]:
|
|
267
|
+
"""Get list of files that have changed."""
|
|
268
|
+
from datetime import datetime
|
|
269
|
+
|
|
270
|
+
since_dt = None
|
|
271
|
+
if since:
|
|
272
|
+
try:
|
|
273
|
+
since_dt = datetime.fromisoformat(since)
|
|
274
|
+
except ValueError:
|
|
275
|
+
logger.warning(f"Invalid timestamp format: {since}")
|
|
276
|
+
|
|
277
|
+
# Get changes from detector
|
|
278
|
+
changed_files = list(self.change_detector.get_changed_files(since_dt))
|
|
279
|
+
|
|
280
|
+
# Add changes from Merkle tree if available
|
|
281
|
+
if self.merkle_tree:
|
|
282
|
+
merkle_changes = self.merkle_tree.get_changed_files(since_dt)
|
|
283
|
+
changed_files.extend(merkle_changes)
|
|
284
|
+
|
|
285
|
+
return list(set(changed_files)) # Remove duplicates
|
|
286
|
+
|
|
287
|
+
def force_scan(self) -> int:
|
|
288
|
+
"""Force a full scan and return number of changes detected."""
|
|
289
|
+
logger.info("Forcing full file system scan")
|
|
290
|
+
|
|
291
|
+
if self.merkle_tree:
|
|
292
|
+
# Rebuild Merkle tree
|
|
293
|
+
self.merkle_tree.build_tree(self.ignore_patterns)
|
|
294
|
+
|
|
295
|
+
# Get changed files
|
|
296
|
+
changed_files = self.merkle_tree.get_changed_files()
|
|
297
|
+
|
|
298
|
+
# Process changes through detector
|
|
299
|
+
for file_path in changed_files:
|
|
300
|
+
full_path = self.project_root / file_path
|
|
301
|
+
change = self.change_detector.process_fs_event(
|
|
302
|
+
event_type="modified",
|
|
303
|
+
path=full_path
|
|
304
|
+
)
|
|
305
|
+
|
|
306
|
+
if change and self.change_callbacks:
|
|
307
|
+
self._on_change(change)
|
|
308
|
+
|
|
309
|
+
return len(changed_files)
|
|
310
|
+
|
|
311
|
+
return 0
|
|
312
|
+
|
|
313
|
+
def get_stats(self) -> Dict[str, Any]:
|
|
314
|
+
"""Get watcher statistics."""
|
|
315
|
+
stats = {
|
|
316
|
+
"is_watching": self.is_watching,
|
|
317
|
+
"project_root": str(self.project_root),
|
|
318
|
+
"project_id": self.project_id,
|
|
319
|
+
"change_detector_stats": self.change_detector.get_stats().__dict__,
|
|
320
|
+
"callbacks_registered": len(self.change_callbacks),
|
|
321
|
+
}
|
|
322
|
+
|
|
323
|
+
if self.merkle_tree:
|
|
324
|
+
stats["merkle_tree"] = self.merkle_tree.get_tree_summary()
|
|
325
|
+
|
|
326
|
+
return stats
|
|
327
|
+
|
|
328
|
+
def cleanup(self) -> None:
|
|
329
|
+
"""Clean up resources."""
|
|
330
|
+
self.stop_watching()
|
|
331
|
+
|
|
332
|
+
if self.executor:
|
|
333
|
+
self.executor.shutdown(wait=True)
|
|
334
|
+
|
|
335
|
+
async def __aenter__(self):
|
|
336
|
+
await self.initialize()
|
|
337
|
+
return self
|
|
338
|
+
|
|
339
|
+
async def __aexit__(self, exc_type, exc_val, exc_tb):
|
|
340
|
+
self.cleanup()
|
|
341
|
+
|
|
342
|
+
# Fallback implementation for when watchdog is not available
|
|
343
|
+
class PollingFileWatcher:
    """
    Fallback file watcher using polling instead of OS events.

    Used when watchdog is not available or on systems that don't support
    efficient file system monitoring.
    """

    def __init__(
        self,
        project_root: Path,
        project_id: str,
        poll_interval: float = 5.0,
        **kwargs
    ):
        """
        Initialize polling file watcher.

        Args:
            project_root: Root directory to watch
            project_id: Project identifier
            poll_interval: Seconds between polls
            **kwargs: Forwarded to ChangeDetector. ``enable_merkle_tree`` is
                accepted for parity with FileWatcher and ignored, because
                polling always relies on the Merkle tree.
        """
        self.project_root = Path(project_root).resolve()
        self.project_id = project_id
        self.poll_interval = poll_interval

        # FileWatcher-only option; FileWatcher never passes it on to
        # ChangeDetector, so presumably ChangeDetector does not accept it.
        kwargs.pop("enable_merkle_tree", None)
        # Remembered so the Merkle tree can honour the same ignore rules
        self.ignore_patterns: Optional[List[str]] = kwargs.get("ignore_patterns")

        self.change_detector = ChangeDetector(project_root=self.project_root, **kwargs)
        self.merkle_tree = MerkleTree(self.project_root, project_id)

        self.is_watching = False
        self.poll_task: Optional[asyncio.Task] = None
        self.change_callbacks: List[Callable[[FileChange], None]] = []

    def add_change_callback(self, callback: Callable[[FileChange], None]) -> None:
        """Add a callback to be called when files change."""
        self.change_callbacks.append(callback)

    def remove_change_callback(self, callback: Callable[[FileChange], None]) -> None:
        """Remove a change callback; unknown callbacks are ignored."""
        if callback in self.change_callbacks:
            self.change_callbacks.remove(callback)

    def _notify(self, change: FileChange) -> None:
        """Deliver a change to every callback, isolating callback failures."""
        for callback in tuple(self.change_callbacks):
            try:
                callback(change)
            except Exception as e:
                # One failing subscriber must not abort the rest of the batch
                logger.error(f"Change callback failed: {e}")

    async def initialize(self) -> None:
        """Initialize the polling watcher by building the Merkle tree."""
        loop = asyncio.get_running_loop()
        # Build in a worker thread so a large tree does not stall the loop
        if self.ignore_patterns is None:
            await loop.run_in_executor(None, self.merkle_tree.build_tree)
        else:
            await loop.run_in_executor(
                None, self.merkle_tree.build_tree, self.ignore_patterns
            )

    def start_watching(self) -> None:
        """
        Start polling for changes (no-op if already watching).

        Raises:
            RuntimeError: If called without a running asyncio event loop.
        """
        if self.is_watching:
            return

        # Resolve the loop first: without one nothing is created and
        # is_watching is left untouched, so a later call can still succeed.
        loop = asyncio.get_running_loop()
        self.poll_task = loop.create_task(self._poll_loop())
        self.is_watching = True

    def stop_watching(self) -> None:
        """Stop polling for changes."""
        self.is_watching = False
        task, self.poll_task = self.poll_task, None
        if task:
            task.cancel()

    async def _poll_loop(self) -> None:
        """Main polling loop; runs until stopped or cancelled."""
        while self.is_watching:
            try:
                # NOTE(review): unlike FileWatcher.force_scan, the tree is not
                # rebuilt before asking for changes - confirm that
                # MerkleTree.get_changed_files() rescans on its own.
                changed_files = self.merkle_tree.get_changed_files()

                for file_path in changed_files:
                    change = self.change_detector.process_fs_event(
                        event_type="modified",
                        path=self.project_root / file_path,
                    )

                    if change:
                        self._notify(change)

                await asyncio.sleep(self.poll_interval)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in polling loop: {e}")
                await asyncio.sleep(self.poll_interval)

    def cleanup(self) -> None:
        """Clean up resources."""
        self.stop_watching()

    async def __aenter__(self):
        await self.initialize()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()
|
|
421
|
+
|
|
422
|
+
def create_file_watcher(
    project_root: Path,
    project_id: str,
    use_polling: bool = False,
    **kwargs
) -> Any:
    """
    Create appropriate file watcher based on availability.

    Args:
        project_root: Root directory to watch
        project_id: Project identifier
        use_polling: Force use of polling watcher
        **kwargs: Additional arguments for watcher. Options that only apply
            to the watcher that was *not* selected (``poll_interval`` for the
            real-time watcher, ``enable_merkle_tree`` for the polling one)
            are dropped, so the same call works whichever one is chosen.

    Returns:
        FileWatcher or PollingFileWatcher instance
    """
    if use_polling or not WATCHDOG_AVAILABLE:
        logger.info("Using polling file watcher")
        # enable_merkle_tree is a FileWatcher-only option; the polling
        # watcher forwards unknown kwargs to ChangeDetector.
        polling_kwargs = {
            key: value for key, value in kwargs.items()
            if key != "enable_merkle_tree"
        }
        return PollingFileWatcher(project_root, project_id, **polling_kwargs)

    logger.info("Using real-time file watcher")
    # FileWatcher.__init__ has no poll_interval parameter and would raise
    # TypeError if it were forwarded.
    realtime_kwargs = {
        key: value for key, value in kwargs.items()
        if key != "poll_interval"
    }
    return FileWatcher(project_root, project_id, **realtime_kwargs)
|