mcp-code-indexer 4.0.2__py3-none-any.whl → 4.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26) hide show
  1. mcp_code_indexer/database/models.py +125 -1
  2. mcp_code_indexer/main.py +60 -0
  3. mcp_code_indexer/migrations/006_vector_mode.sql +189 -0
  4. mcp_code_indexer/server/mcp_server.py +3 -0
  5. mcp_code_indexer/vector_mode/__init__.py +36 -0
  6. mcp_code_indexer/vector_mode/chunking/__init__.py +19 -0
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +403 -0
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +500 -0
  9. mcp_code_indexer/vector_mode/chunking/language_handlers.py +478 -0
  10. mcp_code_indexer/vector_mode/config.py +167 -0
  11. mcp_code_indexer/vector_mode/daemon.py +335 -0
  12. mcp_code_indexer/vector_mode/monitoring/__init__.py +19 -0
  13. mcp_code_indexer/vector_mode/monitoring/change_detector.py +312 -0
  14. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +445 -0
  15. mcp_code_indexer/vector_mode/monitoring/merkle_tree.py +418 -0
  16. mcp_code_indexer/vector_mode/providers/__init__.py +17 -0
  17. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +217 -0
  18. mcp_code_indexer/vector_mode/providers/voyage_client.py +119 -0
  19. mcp_code_indexer/vector_mode/security/__init__.py +11 -0
  20. mcp_code_indexer/vector_mode/security/patterns.py +297 -0
  21. mcp_code_indexer/vector_mode/security/redactor.py +368 -0
  22. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/METADATA +66 -5
  23. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/RECORD +26 -8
  24. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/LICENSE +0 -0
  25. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/WHEEL +0 -0
  26. {mcp_code_indexer-4.0.2.dist-info → mcp_code_indexer-4.2.0.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,445 @@
1
+ """
2
+ Real-time file system monitoring using watchdog.
3
+
4
+ Provides efficient file change detection with debouncing and pattern filtering
5
+ for the vector mode indexing pipeline.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ from pathlib import Path
11
+ from typing import Callable, Optional, List, Dict, Any
12
+ import time
13
+ from concurrent.futures import ThreadPoolExecutor
14
+
15
+ try:
16
+ from watchdog.observers import Observer
17
+ from watchdog.events import FileSystemEventHandler, FileSystemEvent
18
+ WATCHDOG_AVAILABLE = True
19
+ except ImportError:
20
+ WATCHDOG_AVAILABLE = False
21
+ Observer = None
22
+ FileSystemEventHandler = None
23
+ FileSystemEvent = None
24
+
25
+ from .change_detector import ChangeDetector, FileChange, ChangeType
26
+ from .merkle_tree import MerkleTree
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
class VectorModeEventHandler(
    FileSystemEventHandler if WATCHDOG_AVAILABLE else object
):
    """Event handler for file system changes in vector mode.

    Translates watchdog events into ``FileChange`` objects via the change
    detector, keeps the optional Merkle tree in sync, and forwards each
    change to an optional callback.

    The base class falls back to ``object`` when watchdog is not installed
    so that importing this module (and therefore the polling fallback)
    does not fail at class-definition time.
    """

    def __init__(
        self,
        change_detector: ChangeDetector,
        merkle_tree: Optional[MerkleTree] = None,
        callback: Optional[Callable[[FileChange], None]] = None,
        debounce_interval: float = 0.1,
    ):
        """
        Initialize event handler.

        Args:
            change_detector: Change detection processor
            merkle_tree: Optional Merkle tree for efficient change tracking
            callback: Optional callback for change notifications
            debounce_interval: Seconds to wait before processing an event
                on the asynchronous path (defaults to 100ms)
        """
        super().__init__()
        self.change_detector = change_detector
        self.merkle_tree = merkle_tree
        self.callback = callback
        self.debounce_interval = debounce_interval

        # Debouncing state, keyed by the event's source path
        self.pending_events: Dict[str, FileSystemEvent] = {}
        self.debounce_tasks: Dict[str, asyncio.Task] = {}

    def on_any_event(self, event: FileSystemEvent) -> None:
        """Handle any file system event.

        Directory events are skipped. When called from a thread with a
        running event loop the event is debounced asynchronously;
        otherwise it is processed immediately on the calling thread.
        """
        if event.is_directory:
            return  # Skip directory events for now

        # Look up the loop *before* building the coroutine: creating the
        # coroutine first and then failing to schedule it would leave a
        # "coroutine was never awaited" warning behind for every event.
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            # No event loop running in this thread, handle synchronously
            self._handle_event_sync(event)
            return

        loop.create_task(self._handle_event_async(event))

    def _handle_event_sync(self, event: FileSystemEvent) -> None:
        """Process one event: detect the change, update the Merkle tree
        and notify the callback.

        Failures in the Merkle tree update or in the callback are logged
        and do not propagate.
        """
        path = Path(event.src_path)

        # Recent watchdog releases define ``dest_path`` on every event and
        # leave it empty unless the event is a move, so an attribute check
        # alone is not enough: an empty value must be treated as absent.
        dest_path = getattr(event, "dest_path", None)
        old_path = Path(dest_path) if dest_path else None

        # NOTE(review): for a move, src_path is the previous location and
        # dest_path the new one, yet dest_path is passed as ``old_path``.
        # Confirm against ChangeDetector.process_fs_event which ordering
        # it expects.
        change = self.change_detector.process_fs_event(
            event_type=event.event_type,
            path=path,
            old_path=old_path,
        )

        if not change:
            return

        # Update Merkle tree if available
        if self.merkle_tree:
            try:
                self.merkle_tree.update_file(change.path)
            except Exception as e:
                logger.warning(f"Failed to update Merkle tree for {change.path}: {e}")

        # Call callback if provided
        if self.callback:
            try:
                self.callback(change)
            except Exception as e:
                logger.error(f"Callback failed for change {change.path}: {e}")

    async def _handle_event_async(self, event: FileSystemEvent) -> None:
        """Handle event asynchronously with debouncing.

        Only the most recent event per source path survives: a new event
        cancels the pending debounce task for that path and replaces the
        stored event.
        """
        file_path = event.src_path

        # Cancel existing debounce task for this file
        if file_path in self.debounce_tasks:
            self.debounce_tasks[file_path].cancel()

        # Store pending event
        self.pending_events[file_path] = event

        # Create new debounce task
        self.debounce_tasks[file_path] = asyncio.create_task(
            self._process_after_debounce(file_path)
        )

    async def _process_after_debounce(self, file_path: str) -> None:
        """Process the pending event for ``file_path`` after the debounce delay."""
        # A cancellation lands here, before the ``try``, so a superseded
        # task never removes the task entry of its replacement.
        await asyncio.sleep(self.debounce_interval)

        try:
            event = self.pending_events.pop(file_path, None)
            if event is not None:
                self._handle_event_sync(event)
        finally:
            # Clean up task reference even if processing raised
            self.debounce_tasks.pop(file_path, None)
121
+
122
class FileWatcher:
    """
    Real-time file system watcher for vector mode.

    Monitors file changes and integrates with change detection and Merkle tree
    systems for efficient vector index updates.

    Can be used as an async context manager: entering calls ``initialize()``
    and exiting calls ``cleanup()``. Watching itself must still be started
    explicitly with ``start_watching()``.
    """

    def __init__(
        self,
        project_root: Path,
        project_id: str,
        ignore_patterns: Optional[List[str]] = None,
        debounce_interval: float = 0.1,
        enable_merkle_tree: bool = True,
    ):
        """
        Initialize file watcher.

        Args:
            project_root: Root directory to watch
            project_id: Project identifier
            ignore_patterns: Patterns to ignore
            debounce_interval: Debounce interval in seconds
            enable_merkle_tree: Whether to use Merkle tree for change tracking

        Raises:
            ImportError: If the watchdog library is not installed
        """
        if not WATCHDOG_AVAILABLE:
            raise ImportError("watchdog library is required for file monitoring")

        self.project_root = Path(project_root).resolve()
        self.project_id = project_id
        self.ignore_patterns = ignore_patterns
        self.debounce_interval = debounce_interval

        # Initialize components
        self.change_detector = ChangeDetector(
            project_root=self.project_root,
            ignore_patterns=ignore_patterns,
            debounce_interval=debounce_interval,
        )

        self.merkle_tree: Optional[MerkleTree] = None
        if enable_merkle_tree:
            self.merkle_tree = MerkleTree(self.project_root, project_id)

        # Watchdog components (populated by start_watching)
        self.observer: Optional[Observer] = None
        self.event_handler: Optional[VectorModeEventHandler] = None

        # State
        self.is_watching = False
        self.change_callbacks: List[Callable[[FileChange], None]] = []

        # Thread pool for intensive operations
        self.executor = ThreadPoolExecutor(max_workers=2, thread_name_prefix="file_watcher")

    def add_change_callback(self, callback: Callable[[FileChange], None]) -> None:
        """Add a callback to be called when files change."""
        self.change_callbacks.append(callback)

    def remove_change_callback(self, callback: Callable[[FileChange], None]) -> None:
        """Remove a change callback; unknown callbacks are ignored."""
        if callback in self.change_callbacks:
            self.change_callbacks.remove(callback)

    def _on_change(self, change: FileChange) -> None:
        """Notify every registered callback of ``change``.

        A failing callback is logged and does not prevent the remaining
        callbacks from running.
        """
        for callback in self.change_callbacks:
            try:
                callback(change)
            except Exception as e:
                logger.error(f"Change callback failed: {e}")

    async def initialize(self) -> None:
        """Initialize the file watcher (build Merkle tree, etc.)."""
        logger.info(f"Initializing file watcher for {self.project_root}")

        # Build Merkle tree in thread pool to avoid blocking
        if self.merkle_tree:
            # Inside a coroutine the running loop is the one to use;
            # get_event_loop() is deprecated for this purpose.
            loop = asyncio.get_running_loop()
            await loop.run_in_executor(
                self.executor,
                self.merkle_tree.build_tree,
                self.ignore_patterns,
            )

            logger.info("Merkle tree built successfully")

    def start_watching(self) -> None:
        """Start watching for file changes.

        Does nothing (apart from logging) if already watching or if
        watchdog is unavailable. Errors raised while scheduling or
        starting the observer propagate to the caller and leave the
        watcher in its stopped state.
        """
        if self.is_watching:
            logger.warning("File watcher is already running")
            return

        if not WATCHDOG_AVAILABLE:
            logger.error("Cannot start file watching: watchdog not available")
            return

        logger.info(f"Starting file watcher for {self.project_root}")

        # Create event handler
        handler = VectorModeEventHandler(
            change_detector=self.change_detector,
            merkle_tree=self.merkle_tree,
            callback=self._on_change,
        )

        # Create and start observer
        observer = Observer()
        observer.schedule(
            handler,
            str(self.project_root),
            recursive=True,
        )
        observer.start()

        # Publish state only after the observer started, so a failed start
        # does not leave a dead observer behind that stop_watching() would
        # never clean up (it returns early while is_watching is False).
        self.event_handler = handler
        self.observer = observer
        self.is_watching = True
        logger.info("File watcher started successfully")

    def stop_watching(self) -> None:
        """Stop watching for file changes; a no-op when not watching."""
        if not self.is_watching:
            return

        logger.info("Stopping file watcher")

        if self.observer:
            self.observer.stop()
            self.observer.join()
            self.observer = None

        self.event_handler = None
        self.is_watching = False

        logger.info("File watcher stopped")

    def get_recent_changes(
        self,
        limit: Optional[int] = None,
        change_types: Optional[List[ChangeType]] = None,
    ) -> List[FileChange]:
        """Get recent file changes as reported by the change detector."""
        return self.change_detector.get_recent_changes(limit, change_types)

    def get_changed_files(self, since: Optional[str] = None) -> List[str]:
        """Get list of files that have changed.

        Args:
            since: Optional timestamp string parsed with
                ``datetime.fromisoformat``. An unparsable value is logged
                and treated as if no timestamp was given.

        Returns:
            De-duplicated list combining the change detector's results
            with the Merkle tree's (when enabled); order is unspecified.
        """
        from datetime import datetime

        since_dt = None
        if since:
            try:
                since_dt = datetime.fromisoformat(since)
            except ValueError:
                logger.warning(f"Invalid timestamp format: {since}")

        # Get changes from detector
        changed_files = list(self.change_detector.get_changed_files(since_dt))

        # Add changes from Merkle tree if available
        if self.merkle_tree:
            merkle_changes = self.merkle_tree.get_changed_files(since_dt)
            changed_files.extend(merkle_changes)

        return list(set(changed_files))  # Remove duplicates

    def force_scan(self) -> int:
        """Force a full scan and return number of changes detected.

        Rebuilds the Merkle tree, feeds every file it reports as changed
        through the change detector as a ``"modified"`` event, and
        notifies callbacks. Returns 0 when the Merkle tree is disabled.
        """
        logger.info("Forcing full file system scan")

        if not self.merkle_tree:
            return 0

        # Rebuild Merkle tree
        self.merkle_tree.build_tree(self.ignore_patterns)

        # Get changed files
        changed_files = self.merkle_tree.get_changed_files()

        # Process changes through detector
        for file_path in changed_files:
            full_path = self.project_root / file_path
            change = self.change_detector.process_fs_event(
                event_type="modified",
                path=full_path,
            )

            if change and self.change_callbacks:
                self._on_change(change)

        return len(changed_files)

    def get_stats(self) -> Dict[str, Any]:
        """Get watcher statistics.

        Returns:
            Dict with the watching flag, project root and id, the change
            detector's stats, the number of registered callbacks and,
            when the Merkle tree is enabled, its summary.
        """
        stats = {
            "is_watching": self.is_watching,
            "project_root": str(self.project_root),
            "project_id": self.project_id,
            "change_detector_stats": self.change_detector.get_stats().__dict__,
            "callbacks_registered": len(self.change_callbacks),
        }

        if self.merkle_tree:
            stats["merkle_tree"] = self.merkle_tree.get_tree_summary()

        return stats

    def cleanup(self) -> None:
        """Clean up resources: stop watching and shut down the thread pool."""
        self.stop_watching()

        if self.executor:
            self.executor.shutdown(wait=True)

    async def __aenter__(self):
        await self.initialize()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()
341
+
342
+ # Fallback implementation for when watchdog is not available
343
class PollingFileWatcher:
    """
    Fallback file watcher using polling instead of OS events.

    Used when watchdog is not available or on systems that don't support
    efficient file system monitoring.

    Can be used as an async context manager: entering calls ``initialize()``
    and exiting calls ``cleanup()``.
    """

    def __init__(
        self,
        project_root: Path,
        project_id: str,
        poll_interval: float = 5.0,
        **kwargs
    ):
        """Initialize polling file watcher.

        Args:
            project_root: Root directory to watch
            project_id: Project identifier
            poll_interval: Seconds to sleep between polling passes
            **kwargs: Forwarded to ``ChangeDetector``. ``ignore_patterns``
                is additionally used when building the Merkle tree.
        """
        self.project_root = Path(project_root).resolve()
        self.project_id = project_id
        self.poll_interval = poll_interval

        # Remember the patterns so the Merkle tree build can honor them too.
        self.ignore_patterns: Optional[List[str]] = kwargs.get("ignore_patterns")

        # ``enable_merkle_tree`` is an option of the real-time watcher that
        # the factory forwards unchanged. Polling always needs the tree, so
        # the flag is dropped rather than passed on.
        # NOTE(review): assumes ChangeDetector does not accept this keyword.
        kwargs.pop("enable_merkle_tree", None)

        self.change_detector = ChangeDetector(project_root=self.project_root, **kwargs)
        self.merkle_tree = MerkleTree(self.project_root, project_id)

        self.is_watching = False
        self.poll_task: Optional[asyncio.Task] = None
        self.change_callbacks: List[Callable[[FileChange], None]] = []

    def add_change_callback(self, callback: Callable[[FileChange], None]) -> None:
        """Add a callback to be called when files change."""
        self.change_callbacks.append(callback)

    def remove_change_callback(self, callback: Callable[[FileChange], None]) -> None:
        """Remove a change callback; unknown callbacks are ignored."""
        if callback in self.change_callbacks:
            self.change_callbacks.remove(callback)

    async def initialize(self) -> None:
        """Initialize the polling watcher by building the Merkle tree.

        The build runs in the loop's default executor so it does not
        block the event loop.
        """
        loop = asyncio.get_running_loop()
        await loop.run_in_executor(
            None,
            self.merkle_tree.build_tree,
            self.ignore_patterns,
        )

    def start_watching(self) -> None:
        """Start polling for changes.

        Raises:
            RuntimeError: If no event loop is running in the current
                thread. The watcher stays stopped in that case.
        """
        if self.is_watching:
            return

        # Resolve the loop first: failing here must neither flip the state
        # flag nor leave an un-awaited coroutine object behind.
        loop = asyncio.get_running_loop()

        self.is_watching = True
        self.poll_task = loop.create_task(self._poll_loop())

    def stop_watching(self) -> None:
        """Stop polling for changes."""
        self.is_watching = False
        if self.poll_task:
            self.poll_task.cancel()
            self.poll_task = None

    def _notify(self, change: FileChange) -> None:
        """Call every registered callback, logging (not raising) failures."""
        for callback in self.change_callbacks:
            try:
                callback(change)
            except Exception as e:
                logger.error(f"Change callback failed: {e}")

    async def _poll_loop(self) -> None:
        """Main polling loop.

        Runs until ``is_watching`` is cleared or the task is cancelled.
        Any other error in a pass is logged and the loop continues after
        the poll interval.
        """
        while self.is_watching:
            try:
                # NOTE(review): the tree is not rebuilt before querying it,
                # unlike FileWatcher.force_scan. Confirm that
                # MerkleTree.get_changed_files rescans the disk on its own.
                changed_files = self.merkle_tree.get_changed_files()

                for file_path in changed_files:
                    full_path = self.project_root / file_path
                    change = self.change_detector.process_fs_event(
                        event_type="modified",
                        path=full_path,
                    )

                    if change:
                        # Isolated per callback so one failure does not
                        # discard the rest of this polling pass.
                        self._notify(change)

                await asyncio.sleep(self.poll_interval)

            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Error in polling loop: {e}")
                await asyncio.sleep(self.poll_interval)

    def cleanup(self) -> None:
        """Clean up resources."""
        self.stop_watching()

    async def __aenter__(self):
        await self.initialize()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        self.cleanup()
421
+
422
def create_file_watcher(
    project_root: Path,
    project_id: str,
    use_polling: bool = False,
    **kwargs
) -> Any:
    """
    Build the watcher implementation suited to this environment.

    The real-time watcher is chosen only when watchdog is importable and
    polling was not explicitly requested; in every other case the polling
    watcher is returned.

    Args:
        project_root: Root directory to watch
        project_id: Project identifier
        use_polling: Force use of polling watcher
        **kwargs: Additional arguments passed through to the chosen watcher

    Returns:
        FileWatcher or PollingFileWatcher instance
    """
    realtime = WATCHDOG_AVAILABLE and not use_polling

    if realtime:
        logger.info("Using real-time file watcher")
        return FileWatcher(project_root, project_id, **kwargs)

    logger.info("Using polling file watcher")
    return PollingFileWatcher(project_root, project_id, **kwargs)