mcp-code-indexer 4.2.15__py3-none-any.whl → 4.2.16__py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. mcp_code_indexer/database/database.py +251 -85
  2. mcp_code_indexer/database/models.py +66 -24
  3. mcp_code_indexer/database/retry_executor.py +15 -5
  4. mcp_code_indexer/file_scanner.py +107 -12
  5. mcp_code_indexer/main.py +43 -30
  6. mcp_code_indexer/server/mcp_server.py +191 -1
  7. mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
  8. mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
  9. mcp_code_indexer/vector_mode/config.py +113 -45
  10. mcp_code_indexer/vector_mode/const.py +24 -0
  11. mcp_code_indexer/vector_mode/daemon.py +860 -98
  12. mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
  13. mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
  14. mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
  15. mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
  16. mcp_code_indexer/vector_mode/services/__init__.py +9 -0
  17. mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
  18. mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
  19. mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
  20. mcp_code_indexer/vector_mode/types.py +46 -0
  21. mcp_code_indexer/vector_mode/utils.py +50 -0
  22. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
  23. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
  24. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
  25. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
  26. {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
@@ -7,26 +7,44 @@ Handles embedding generation, change detection, and vector database synchronizat
 
 import asyncio
 import logging
-import signal
 import sys
-from pathlib import Path
-from typing import Optional, Set
-import json
 import time
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Set
+
+
 
 from ..database.database import DatabaseManager
+from ..database.models import Project, SyncStatus
 from .config import VectorConfig, load_vector_config
+from .monitoring.file_watcher import create_file_watcher, FileWatcher
+from .providers.voyage_client import VoyageClient, create_voyage_client
+from .providers.turbopuffer_client import create_turbopuffer_client
+from .services.embedding_service import EmbeddingService
+from .services.vector_storage_service import VectorStorageService
+
+from .monitoring.change_detector import FileChange, ChangeType
+from .chunking.ast_chunker import ASTChunker, CodeChunk
+from .utils import should_ignore_path
+from .types import (
+    ScanProjectTask,
+    VectorDaemonTaskType,
+    ProcessFileChangeTask,
+    InitialProjectEmbeddingTask,
+)
 
 logger = logging.getLogger(__name__)
 
+
 class VectorDaemon:
     """
     Background daemon for vector mode operations.
-
+
     Monitors file changes, generates embeddings, and maintains vector indexes
     for all projects with vector mode enabled.
     """
-
+
     def __init__(
         self,
         config: VectorConfig,
@@ -38,13 +56,24 @@ class VectorDaemon:
         self.db_manager = db_manager
         self.cache_dir = cache_dir
         self.is_running = False
-
+
         # Process tracking
         self.monitored_projects: Set[str] = set()
-        self.processing_queue: asyncio.Queue = asyncio.Queue(maxsize=config.max_queue_size)
+        self.processing_queue: asyncio.Queue = asyncio.Queue(
+            maxsize=config.max_queue_size
+        )
         self.workers: list[asyncio.Task] = []
         self.monitor_tasks: list[asyncio.Task] = []
-
+
+        # File watcher management
+        self.file_watchers: Dict[str, FileWatcher] = {}
+        self.watcher_locks: Dict[str, asyncio.Lock] = {}
+
+        # Concurrency control for batch file processing
+        self.file_processing_semaphore = asyncio.Semaphore(
+            config.max_concurrent_batches
+        )
+
         # Statistics
         self.stats = {
            "start_time": time.time(),
@@ -53,17 +82,64 @@ class VectorDaemon:
            "errors_count": 0,
            "last_activity": time.time(),
         }
-
+
+        # Initialize VoyageClient and EmbeddingService for embedding generation
+        self._voyage_client = create_voyage_client(self.config)
+        self._embedding_service = EmbeddingService(self._voyage_client, self.config)
+
+        # Get embedding dimension from VoyageClient
+        embedding_dimension = self._voyage_client.get_embedding_dimension()
+
+        # Initialize TurbopufferClient and VectorStorageService for vector storage
+        self._turbopuffer_client = create_turbopuffer_client(self.config)
+        self._vector_storage_service = VectorStorageService(
+            self._turbopuffer_client, embedding_dimension, self.config
+        )
+
+        # Initialize ASTChunker for code chunking
+        self._ast_chunker = ASTChunker(
+            max_chunk_size=1500,
+            min_chunk_size=50,
+            enable_redaction=True,
+            enable_optimization=True,
+        )
+
        # Signal handling is delegated to the parent process
-
+
+    def _on_file_change(self, project_name: str) -> callable:
+        """Create a non-blocking change callback for a specific project."""
+
+        def callback(change: FileChange) -> None:
+            """Non-blocking callback that queues file change processing."""
+            try:
+                # Create file change processing task
+                task_item: ProcessFileChangeTask = {
+                    "type": VectorDaemonTaskType.PROCESS_FILE_CHANGE,
+                    "project_name": project_name,
+                    "change": change,
+                    "timestamp": time.time(),
+                }
+
+                # Put task in processing queue (non-blocking)
+                try:
+                    self.processing_queue.put_nowait(task_item)
+                except asyncio.QueueFull:
+                    logger.warning(
+                        f"Processing queue full, dropping file change event for {change.path}"
+                    )
+            except Exception as e:
+                logger.error(f"Error queueing file change task: {e}")
+
+        return callback
+
     async def start(self) -> None:
         """Start the vector daemon."""
         if self.is_running:
             logger.warning("Daemon is already running")
             return
-
+
         self.is_running = True
-
+
         logger.info(
             "Starting vector daemon",
             extra={
@@ -74,29 +150,29 @@ class VectorDaemon:
                        "poll_interval": self.config.daemon_poll_interval,
                    }
                }
-            }
+            },
         )
-
+
         try:
             # Start worker tasks
             for i in range(self.config.worker_count):
                 worker = asyncio.create_task(self._worker(f"worker-{i}"))
                 self.workers.append(worker)
-
+
             # Start monitoring tasks
             monitor_task = asyncio.create_task(self._monitor_projects())
             stats_task = asyncio.create_task(self._stats_reporter())
             self.monitor_tasks.extend([monitor_task, stats_task])
-
+
             # Wait for shutdown signal
             await self._run_until_shutdown()
-
+
         except Exception as e:
             logger.error(f"Daemon error: {e}", exc_info=True)
             self.stats["errors_count"] += 1
         finally:
             await self._cleanup()
-
+
     async def _run_until_shutdown(self) -> None:
         """Run daemon until shutdown is requested."""
         # Wait indefinitely until task is cancelled by parent process
@@ -106,31 +182,78 @@ class VectorDaemon:
         except asyncio.CancelledError:
             logger.info("Vector daemon shutdown requested")
             raise
-
+
+    async def _get_project_monitoring_status(self) -> Dict[str, List[Project]]:
+        """
+        Get projects categorized by monitoring status.
+
+        Returns:
+            Dict with 'monitored' and 'unmonitored' keys containing project lists
+        """
+        # Get all projects with vector mode enabled
+        vector_enabled_projects = await self.db_manager.get_vector_enabled_projects()
+
+        # Filter projects that should be monitored (have valid aliases)
+        monitorable_projects = [
+            project
+            for project in vector_enabled_projects
+            if project.aliases and project.vector_mode
+        ]
+
+        # Determine which projects to monitor (not currently monitored)
+        projects_to_monitor = [
+            project
+            for project in monitorable_projects
+            if project.name not in self.monitored_projects
+        ]
+
+        # Determine which projects to unmonitor (currently monitored but no longer should be)
+        monitorable_names = {project.name for project in monitorable_projects}
+        projects_to_unmonitor = []
+
+        # Get full project data for unmonitoring
+        all_projects = await self.db_manager.get_all_projects()
+        for project in all_projects:
+            if (
+                project.name in self.monitored_projects
+                and project.name not in monitorable_names
+            ):
+                projects_to_unmonitor.append(project)
+
+        return {
+            "monitored": projects_to_monitor,
+            "unmonitored": projects_to_unmonitor,
+        }
+
     async def _monitor_projects(self) -> None:
         """Monitor projects for vector indexing requirements."""
         logger.info("Starting project monitoring")
-
+
         while self.is_running:
             try:
-                # Get projects that have vector mode enabled
-                projects = await self.db_manager.get_vector_enabled_projects()
-
-                for project in projects:
-                    if (project.name not in self.monitored_projects and
-                        project.aliases and
-                        project.vector_mode):
-                        logger.info(f"Adding project to monitoring: {project.name}")
-                        self.monitored_projects.add(project.name)
-
-                        # Use first alias as folder path
-                        folder_path = project.aliases[0]
-
-                        # Queue initial indexing task
-                        await self._queue_project_scan(project.name, folder_path)
-
+                # Get project monitoring status
+                monitoring_status = await self._get_project_monitoring_status()
+                # Add new projects to monitoring
+                for project in monitoring_status["monitored"]:
+                    logger.info(f"Adding project to monitoring: {project.name}")
+                    self.monitored_projects.add(project.name)
+
+                    # Use first alias as folder path
+                    folder_path = project.aliases[0]
+
+                    # Queue initial indexing task based on IndexMeta status
+                    await self._queue_full_project_indexing(project.name, folder_path)
+
+                    # Queue project scan for file watching
+                    await self._queue_project_scan(project.name, folder_path)
+
+                # Remove projects from monitoring
+                for project in monitoring_status["unmonitored"]:
+                    logger.info(f"Removing project from monitoring: {project.name}")
+                    self.monitored_projects.discard(project.name)
+
                 await asyncio.sleep(self.config.daemon_poll_interval)
-
+
             except asyncio.CancelledError:
                 logger.info("Project monitoring cancelled")
                 break
@@ -138,41 +261,86 @@ class VectorDaemon:
                 logger.error(f"Error in project monitoring: {e}")
                 self.stats["errors_count"] += 1
                 await asyncio.sleep(5.0)  # Back off on error
-
+
+    async def _queue_full_project_indexing(
+        self, project_name: str, folder_path: str
+    ) -> None:
+        """
+        Queue full project indexing based on IndexMeta status.
+
+        Retrieves IndexMeta for the project and queues initial embedding task
+        only if sync_status is 'pending'. Updates status from 'failed' or 'paused' to 'pending'.
+        """
+        try:
+            # Get or create IndexMeta for the project
+            index_meta = await self.db_manager.get_or_create_index_meta(project_name)
+
+            # If status is 'failed' or 'paused', change it to 'pending'
+            if index_meta.sync_status in [SyncStatus.FAILED, SyncStatus.PAUSED]:
+                logger.info(
+                    f"Changing sync status from {index_meta.sync_status.value} to pending for {project_name}"
+                )
+                index_meta.sync_status = SyncStatus.PENDING
+                await self.db_manager.update_index_meta(index_meta)
+
+            # Only queue initial embedding if status is 'pending'
+            if index_meta.sync_status == SyncStatus.PENDING:
+                task: InitialProjectEmbeddingTask = {
+                    "type": VectorDaemonTaskType.INITIAL_PROJECT_EMBEDDING,
+                    "project_name": project_name,
+                    "folder_path": folder_path,
+                    "timestamp": time.time(),
+                }
+
+                try:
+                    await self.processing_queue.put(task)
+                    logger.debug(f"Queued initial project embedding: {project_name}")
+                except asyncio.QueueFull:
+                    logger.warning(
+                        f"Processing queue full, dropping initial embedding task for {project_name}"
+                    )
+            else:
+                logger.debug(
+                    f"Skipping initial embedding for {project_name}, status: {index_meta.sync_status.value}"
+                )
+        except Exception as e:
+            logger.error(f"Error queuing full project indexing for {project_name}: {e}")
+
     async def _queue_project_scan(self, project_name: str, folder_path: str) -> None:
         """Queue a project for scanning and indexing."""
-        task = {
-            "type": "scan_project",
+        task: ScanProjectTask = {
+            "type": VectorDaemonTaskType.SCAN_PROJECT,
             "project_name": project_name,
             "folder_path": folder_path,
             "timestamp": time.time(),
         }
-
+
         try:
             await self.processing_queue.put(task)
             logger.debug(f"Queued project scan: {project_name}")
         except asyncio.QueueFull:
-            logger.warning(f"Processing queue full, dropping scan task for {project_name}")
-
+            logger.warning(
+                f"Processing queue full, dropping scan task for {project_name}"
+            )
+
     async def _worker(self, worker_id: str) -> None:
         """Worker task to process queued items."""
         logger.info(f"Starting worker: {worker_id}")
-
+
         while self.is_running:
             try:
                 # Get task from queue with timeout
                 try:
                     task = await asyncio.wait_for(
-                        self.processing_queue.get(),
-                        timeout=5.0
+                        self.processing_queue.get(), timeout=5.0
                     )
                 except asyncio.TimeoutError:
                     continue
-
+
                # Process the task
                await self._process_task(task, worker_id)
                self.stats["last_activity"] = time.time()
-
+
             except asyncio.CancelledError:
                 logger.info(f"Worker {worker_id} cancelled")
                 break
@@ -180,57 +348,231 @@ class VectorDaemon:
                 logger.error(f"Worker {worker_id} error: {e}")
                 self.stats["errors_count"] += 1
                 await asyncio.sleep(1.0)  # Brief pause on error
-
+
     async def _process_task(self, task: dict, worker_id: str) -> None:
         """Process a queued task."""
+        logger.debug(f"Worker {worker_id} processing task: {task}")
         task_type = task.get("type")
-
-        if task_type == "scan_project":
+
+        if task_type == VectorDaemonTaskType.SCAN_PROJECT:
             await self._process_project_scan(task, worker_id)
+        elif task_type == VectorDaemonTaskType.PROCESS_FILE_CHANGE:
+            await self._process_file_change_task(task, worker_id)
+        elif task_type == VectorDaemonTaskType.INITIAL_PROJECT_EMBEDDING:
+            await self._process_initial_project_embedding_task(task, worker_id)
         else:
             logger.warning(f"Unknown task type: {task_type}")
-
+
+    async def _process_file_change_task(
+        self, task: ProcessFileChangeTask, worker_id: str
+    ) -> None:
+        """Process a file change task."""
+        project_name: str = task["project_name"]
+        change: FileChange = task["change"]
+        logger.info(
+            f"Worker {worker_id}: File change detected for project {project_name}: {change.path} ({change.change_type.value})"
+        )
+
+        try:
+            # Handle deleted files by removing their vectors from the database
+            if change.change_type == ChangeType.DELETED:
+                logger.info(
+                    f"Worker {worker_id}: Deleting vectors for deleted file {change.path}"
+                )
+                try:
+                    await self._vector_storage_service.delete_vectors_for_file(
+                        project_name, str(change.path)
+                    )
+                    logger.info(
+                        f"Worker {worker_id}: Successfully deleted vectors for {change.path}"
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"Worker {worker_id}: Failed to delete vectors for {change.path}: {e}"
+                    )
+                return
+
+            # Read and chunk the file using the shared ASTChunker instance
+            # Run in executor to avoid blocking the event loop (CPU-bound work)
+            try:
+                loop = asyncio.get_running_loop()
+                chunks = await loop.run_in_executor(
+                    None, self._ast_chunker.chunk_file, str(change.path)
+                )
+                chunk_count = len(chunks)
+
+                # Only process files that actually produced chunks
+                if chunk_count == 0:
+                    logger.debug(
+                        f"Worker {worker_id}: No chunks produced for {change.path}"
+                    )
+                    return
+
+                # Generate and store embeddings for chunks
+                embeddings = await self._generate_embeddings(
+                    chunks, project_name, change.path
+                )
+                await self._store_embeddings(
+                    embeddings, chunks, project_name, change.path
+                )
+
+                # Only increment stats for successfully chunked files
+                self.stats["files_processed"] += 1
+                self.stats["last_activity"] = time.time()
+
+            except Exception as read_error:
+                logger.error(
+                    f"Worker {worker_id}: Failed to read/chunk file {change.path}: {read_error}"
+                )
+                self.stats["errors_count"] += 1
+                return
+
+        except Exception as e:
+            logger.error(
+                f"Worker {worker_id}: Error processing file change {change.path}: {e}"
+            )
+            self.stats["errors_count"] += 1
+
+    async def _process_initial_project_embedding_task(
+        self, task: InitialProjectEmbeddingTask, worker_id: str
+    ) -> None:
+        """Process an initial project embedding task."""
+        project_name: str = task["project_name"]
+        folder_path: str = task["folder_path"]
+
+        logger.info(
+            f"Worker {worker_id}: Starting initial project embedding for {project_name}"
+        )
+        try:
+            # Update IndexMeta status to in_progress
+            index_meta = await self.db_manager.get_or_create_index_meta(project_name)
+            index_meta.sync_status = SyncStatus.IN_PROGRESS
+            await self.db_manager.update_index_meta(index_meta)
+            # Perform the actual embedding
+            stats = await self._perform_initial_project_embedding(
+                project_name, folder_path
+            )
+
+            # Update IndexMeta status to completed on success
+            index_meta = await self.db_manager.get_or_create_index_meta(project_name)
+            if stats["failed"] > 0:
+                index_meta.sync_status = SyncStatus.FAILED
+                index_meta.error_message = (
+                    f"{stats['failed']} files failed during initial embedding"
+                )
+            else:
+                index_meta.sync_status = SyncStatus.COMPLETED
+                index_meta.error_message = None
+
+            index_meta.last_sync = datetime.utcnow()
+            index_meta.total_files = stats.get("scanned", 0)
+            index_meta.indexed_files = stats.get("processed", 0)
+            await self.db_manager.update_index_meta(index_meta)
+
+            logger.info(
+                f"Worker {worker_id}: Successfully completed initial embedding for {project_name}"
+            )
+
+        except Exception as e:
+            logger.error(
+                f"Worker {worker_id}: Error processing initial embedding for {project_name}: {e}"
+            )
+
+            # Update IndexMeta status to failed on error
+            try:
+                index_meta = await self.db_manager.get_or_create_index_meta(
+                    project_name
+                )
+                index_meta.sync_status = SyncStatus.FAILED
+                index_meta.error_message = str(e)
+                await self.db_manager.update_index_meta(index_meta)
+            except Exception as meta_error:
+                logger.error(
+                    f"Failed to update IndexMeta after embedding error: {meta_error}"
+                )
+
+            self.stats["errors_count"] += 1
+
     async def _process_project_scan(self, task: dict, worker_id: str) -> None:
         """Process a project scan task."""
         project_name = task["project_name"]
         folder_path = task["folder_path"]
-
+
         logger.debug(f"Worker {worker_id} processing project: {project_name}")
-
+
         try:
-            # Check if vector mode components are available
-            # For now, just log that we would process this project
-            logger.info(
-                f"Vector processing for project {project_name}",
-                extra={
-                    "structured_data": {
-                        "project_name": project_name,
-                        "folder_path": folder_path,
-                        "worker_id": worker_id,
-                    }
-                }
-            )
-
+            # Ensure we have a lock for this project
+            if project_name not in self.watcher_locks:
+                self.watcher_locks[project_name] = asyncio.Lock()
+
+            # Use project-specific lock to prevent race conditions
+            async with self.watcher_locks[project_name]:
+                # Check if file watcher already exists for this project
+                if project_name not in self.file_watchers:
+                    logger.info(
+                        f"Initializing file watcher for project {project_name}",
+                        extra={
+                            "structured_data": {
+                                "project_name": project_name,
+                                "folder_path": folder_path,
+                                "worker_id": worker_id,
+                            }
+                        },
+                    )
+
+                    # Validate folder path exists
+                    project_path = Path(folder_path)
+                    if not project_path.exists():
+                        logger.warning(f"Project folder does not exist: {folder_path}")
+                        return
+
+                    # Create file watcher with appropriate configuration
+                    watcher = create_file_watcher(
+                        project_root=project_path,
+                        project_id=project_name,
+                        ignore_patterns=self.config.ignore_patterns,
+                        debounce_interval=self.config.watch_debounce_ms / 1000.0,
+                    )
+                    logger.debug(f"VectorDaemon: Created watcher for {project_name}")
+                    # Initialize the watcher
+                    await watcher.initialize()
+
+                    # Add change callback
+                    watcher.add_change_callback(self._on_file_change(project_name))
+
+                    # Start watching
+                    watcher.start_watching()
+
+                    # Store watcher for later cleanup
+                    self.file_watchers[project_name] = watcher
+
+                    logger.info(
+                        f"File watcher started for project {project_name}",
+                        extra={
+                            "structured_data": {
+                                "project_name": project_name,
+                                "folder_path": folder_path,
+                                "watcher_stats": watcher.get_stats(),
+                            }
+                        },
+                    )
+                else:
+                    logger.debug(
+                        f"File watcher already exists for project {project_name}"
+                    )
+
            self.stats["files_processed"] += 1
-
-            # TODO: Implement actual vector processing:
-            # 1. Scan for file changes using Merkle tree
-            # 2. Chunk modified files using AST
-            # 3. Apply secret redaction
-            # 4. Generate embeddings via Voyage
-            # 5. Store in Turbopuffer
-            # 6. Update database metadata
-
+
         except Exception as e:
-            logger.error(f"Error processing project {project_name}: {e}")
+            logger.error(f"Error processing project {project_name}: {e}", exc_info=True)
             self.stats["errors_count"] += 1
-
+
     async def _stats_reporter(self) -> None:
         """Periodically report daemon statistics."""
         while self.is_running:
             try:
                 uptime = time.time() - self.stats["start_time"]
-
+
                 logger.info(
                     "Daemon statistics",
                     extra={
@@ -242,94 +584,513 @@ class VectorDaemon:
                            "embeddings_generated": self.stats["embeddings_generated"],
                            "errors_count": self.stats["errors_count"],
                        }
-                    }
+                    },
                 )
-
+
                 await asyncio.sleep(60.0)  # Report every minute
-
+
             except asyncio.CancelledError:
                 logger.info("Stats reporting cancelled")
                 break
             except Exception as e:
                 logger.error(f"Error in stats reporting: {e}")
                 await asyncio.sleep(10.0)
-
+
     async def _cleanup(self) -> None:
         """Clean up resources and shut down workers."""
         logger.info("Starting daemon cleanup")
         self.is_running = False
-
+
+        # Stop and cleanup all file watchers first
+        if self.file_watchers:
+            logger.info(f"Cleaning up {len(self.file_watchers)} file watchers")
+            for project_name, watcher in self.file_watchers.items():
+                try:
+                    logger.debug(f"Stopping file watcher for project: {project_name}")
+                    watcher.cleanup()
+                except Exception as e:
+                    logger.error(f"Error cleaning up watcher for {project_name}: {e}")
+        self.file_watchers.clear()
+        self.watcher_locks.clear()
+
         # Cancel all workers
         for worker in self.workers:
             worker.cancel()
-
+
         # Cancel monitor tasks
         for task in self.monitor_tasks:
             task.cancel()
-
+
         # Wait for all tasks to finish
         all_tasks = self.workers + self.monitor_tasks
         if all_tasks:
             await asyncio.gather(*all_tasks, return_exceptions=True)
-
+
         logger.info("Vector daemon shutdown complete")
-
+
     def get_status(self) -> dict:
         """Get current daemon status."""
+        watcher_stats = {}
+        for project_name, watcher in self.file_watchers.items():
+            try:
+                watcher_stats[project_name] = watcher.get_stats()
+            except Exception as e:
+                watcher_stats[project_name] = {"error": str(e)}
+
         return {
             "is_running": self.is_running,
             "uptime": time.time() - self.stats["start_time"] if self.is_running else 0,
             "monitored_projects": len(self.monitored_projects),
+            "active_file_watchers": len(self.file_watchers),
             "queue_size": self.processing_queue.qsize(),
             "stats": self.stats.copy(),
+            "file_watcher_stats": watcher_stats,
+        }
+
+    async def _generate_embeddings(
+        self, chunks: list[CodeChunk], project_name: str, file_path: Path
+    ) -> list[list[float]]:
+        """Generate embeddings for file chunks using EmbeddingService."""
+        try:
+            generating_embedding_time = time.time()
+            embeddings = await self._embedding_service.generate_embeddings_for_chunks(
+                chunks, project_name, file_path
+            )
+
+            # Update daemon statistics
+            self.stats["embeddings_generated"] += len(embeddings)
+            self.stats["last_activity"] = time.time()
+            logger.debug(
+                f"Generated {len(embeddings)} embeddings for {file_path} in {time.time() - generating_embedding_time:.2f} seconds"
+            )
+            return embeddings
+
+        except Exception as e:
+            # Update error statistics
+            self.stats["errors_count"] += 1
+            raise
+
+    async def _store_embeddings(
+        self,
+        embeddings: list[list[float]],
+        chunks: list[CodeChunk],
+        project_name: str,
+        file_path: str,
+    ) -> None:
+        """Store embeddings in vector database."""
+        try:
+            store_embeddings_time = time.time()
+            await self._vector_storage_service.store_embeddings(
+                embeddings, chunks, project_name, file_path
+            )
+            logger.debug(
+                f"Stored embeddings for {file_path} in {time.time() - store_embeddings_time:.2f} seconds"
+            )
+        except Exception as e:
+            # Update error statistics
+            self.stats["errors_count"] += 1
+            raise
+
+    def _gather_project_files(self, project_root: Path) -> list[Path]:
+        """
+        Gather all relevant files in the project by applying ignore patterns.
+
+        Args:
+            project_root: Root path of the project
+
+        Returns:
+            List of file paths that should be processed
+        """
+        project_files = []
+
+        for file_path in project_root.rglob("*"):
+            if file_path.is_file() and not should_ignore_path(
+                file_path, project_root, self.config.ignore_patterns
+            ):
+                project_files.append(file_path)
+
+        return project_files
+
+    async def _perform_initial_project_embedding(
+        self, project_name: str, folder_path: str
+    ) -> dict[str, int]:
+        """
+        Perform initial project embedding for all files, processing only changed files.
+
+        Args:
+            project_name: Name of the project
+            folder_path: Root folder path of the project
+
+        Returns:
+            Dictionary with processing statistics
+        """
+        stats = {
+            "scanned": 0,
+            "processed": 0,
+            "skipped": 0,
+            "failed": 0,
+            "deleted": 0,
        }
 
+        logger.info(f"Starting initial project embedding for {project_name}")
+
+        try:
+            project_root = Path(folder_path)
+            if not project_root.exists():
+                logger.error(f"Project folder does not exist: {folder_path}")
+                return stats
+
+            # Discover all relevant files in the project
+            project_files = self._gather_project_files(project_root)
+
+            stats["scanned"] = len(project_files)
+            logger.info(f"Found {len(project_files)} files to scan in {project_name}")
+
+            # Process batches concurrently with controlled concurrency
+            batch_size = 50
+            processed_count = 0
+
+            # Create batch processing tasks with semaphore control
+            async def process_batch_with_semaphore(
+                batch_files: list[Path], batch_index: int
+            ):
+                """Process a single batch with semaphore control."""
+                async with self.file_processing_semaphore:
+                    # Get stored file metadata from vector database
+                    stored_metadata = (
+                        await self._vector_storage_service.get_file_metadata(
+                            project_name,
+                            [str(f) for f in batch_files],
+                        )
+                    )
+                    return (
+                        await self._process_file_batch_for_initial_embedding(
+                            batch_files, project_name, stored_metadata
+                        ),
+                        batch_index,
+                    )
+
+            # Create tasks for all batches
+            batch_tasks = []
+            for i in range(0, len(project_files), batch_size):
+                batch = project_files[i : i + batch_size]
+                batch_index = i // batch_size
+                task = process_batch_with_semaphore(batch, batch_index)
+                batch_tasks.append(task)
+
+            logger.info(
+                f"Processing {len(batch_tasks)} batches concurrently "
+                f"(max concurrent: {self.file_processing_semaphore._value})"
+            )
+
+            # Process all batches concurrently
+            batch_start_time = time.time()
+            batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True)
+            total_batch_time = time.time() - batch_start_time
+
+            error_messages = []
+            # Aggregate results from all batches
+            successful_batches = 0
+            for result in batch_results:
+                if isinstance(result, Exception):
+                    logger.error(f"Batch processing failed with exception: {result}")
+                    # Estimate failed files (assuming average batch size)
+                    estimated_failed = min(
+                        batch_size, len(project_files) - processed_count
+                    )
+                    error_messages.append(f"Batch {batch_index} failed: {result}")
+                    stats["failed"] += estimated_failed
+                    processed_count += estimated_failed
+                elif isinstance(result, tuple):
+                    batch_stats, batch_index = result
+                    # Update aggregate stats
+                    stats["processed"] += batch_stats["processed"]
+                    stats["skipped"] += batch_stats["skipped"]
+                    stats["failed"] += batch_stats["failed"]
+                    processed_count += (
+                        batch_stats["processed"]
+                        + batch_stats["skipped"]
+                        + batch_stats["failed"]
+                    )
+                    successful_batches += 1
+                else:
+                    logger.warning(f"Unexpected batch result type: {type(result)}")
+
+            logger.info(
+                f"Concurrent batch processing completed in {total_batch_time:.2f}s: "
+                f"{successful_batches}/{len(batch_tasks)} batches successful"
+            )
+
+            # Handle deleted files - files that exist in vector DB but not locally
+            # Get all stored metadata for cleanup (not limited to specific batches)
+            all_stored_metadata = await self._vector_storage_service.get_file_metadata(
+                project_name
+            )
+            await self._cleanup_deleted_files(
+                project_name, project_files, all_stored_metadata, stats
+            )
+
+            logger.info(
+                f"Initial project embedding complete for {project_name}: "
+                f"scanned={stats['scanned']}, processed={stats['processed']}, "
+                f"skipped={stats['skipped']}, failed={stats['failed']}, deleted={stats.get('deleted', 0)}"
+            )
+
+        except Exception as e:
+            logger.error(
+                f"Error during initial project embedding for {project_name}: {e}"
+            )
+            stats["failed"] += 1
+
+        return stats
+
+    async def _process_file_batch_for_initial_embedding(
+        self,
+        file_batch: list[Path],
+        project_name: str,
+        stored_metadata: dict[str, float],
+    ) -> dict[str, int]:
+        """
+        Process a batch of files for initial embedding using true batch processing.
+
+        Args:
+            file_batch: List of file paths to process
+            project_name: Name of the project
+            stored_metadata: Dictionary of file_path -> mtime from vector database
+
+        Returns:
+            Dictionary with batch processing statistics
+        """
+        batch_stats = {"processed": 0, "skipped": 0, "failed": 0}
+
+        # Filter files that need processing based on mtime comparison
+        # TODO: remove comparing mtimes
+        files_to_process: list[Path] = []
+        for file_path in file_batch:
+            try:
+                current_mtime = file_path.stat().st_mtime
+                stored_mtime = stored_metadata.get(str(file_path), 0.0)
+                # Use epsilon comparison for floating point mtime
+                if abs(current_mtime - stored_mtime) > 0.001:
+                    files_to_process.append(file_path)
+                else:
+                    batch_stats["skipped"] += 1
+
+            except (OSError, FileNotFoundError) as e:
+                logger.warning(f"Failed to get mtime for {file_path}: {e}")
+                batch_stats["failed"] += 1
+
+        # Process files using true batch processing: chunk → embed → store
+        if files_to_process:
+            logger.info(
+                f"Batch processing {len(files_to_process)}/{len(file_batch)} files "
+                f"using true batch processing (chunk → embed → store)"
+            )
+
+            try:
+                batch_start_time = time.time()
+
+                # Step 1: Batch chunking for all files
+                # Run in executor to avoid blocking the event loop (CPU-bound work)
+                logger.debug(f"Step 1: Chunking {len(files_to_process)} files")
+                chunking_start_time = time.time()
+
+                loop = asyncio.get_running_loop()
+                file_chunks = await loop.run_in_executor(
+                    None,
+                    self._ast_chunker.chunk_multiple_files,
+                    [str(f) for f in files_to_process],
+                )
+
+                # Filter out files that failed to chunk
+                successful_files = {
+                    file_path: chunks
+                    for file_path, chunks in file_chunks.items()
+                    if chunks  # Only keep files with successful chunks
+                }
+                failed_chunking_count = len(files_to_process) - len(successful_files)
+
+                logger.debug(
+                    f"Chunking completed in {time.time() - chunking_start_time:.2f}s: "
+                    f"{len(successful_files)} files successful, {failed_chunking_count} failed"
+                )
+
+                if successful_files:
+                    # Step 2: Batch embedding for all chunks
+                    logger.debug(
+                        f"Step 2: Generating embeddings for {len(successful_files)} files"
+                    )
+                    embedding_start_time = time.time()
+
+                    file_embeddings = await self._embedding_service.generate_embeddings_for_multiple_files(
+                        successful_files, project_name
+                    )
+
+                    logger.debug(
+                        f"Embedding completed in {time.time() - embedding_start_time:.2f}s: "
+                        f"{len(file_embeddings)} files embedded"
+                    )
+
+                    # Step 3: Batch storage for all vectors
+                    if file_embeddings:
+                        logger.debug(
+                            f"Step 3: Storing vectors for {len(file_embeddings)} files"
+                        )
+                        storage_start_time = time.time()
+
+                        await self._vector_storage_service.store_embeddings_batch(
+                            file_embeddings, successful_files, project_name
+                        )
+
+                        logger.debug(
+                            f"Storage completed in {time.time() - storage_start_time:.2f}s"
+                        )
+
+                        # Update success count
+                        batch_stats["processed"] = len(file_embeddings)
+                    else:
+                        logger.warning(
+                            "No embeddings generated despite successful chunking"
+                        )
+
+                # Update failure count for chunking failures
+                batch_stats["failed"] += failed_chunking_count
+
+                total_batch_time = time.time() - batch_start_time
+                logger.debug(
+                    f"Batch processing completed in {total_batch_time:.2f}s: "
+                    f"{batch_stats['processed']} processed, {batch_stats['failed']} failed, "
+                    f"{batch_stats['skipped']} skipped"
+                )
+
+            except Exception as e:
+                logger.error(f"Batch processing failed: {e}", exc_info=True)
+                # Mark all files as failed if batch processing fails
+                batch_stats["failed"] += len(files_to_process)
+
+        return batch_stats
+
+    async def _cleanup_deleted_files(
+        self,
+        project_name: str,
+        existing_files: list[Path],
+        stored_metadata: dict[str, float],
+        stats: dict[str, int],
+    ) -> None:
+        """
+        Clean up files that exist in vector database but not locally (deleted files).
+
+        Args:
+            project_name: Name of the project
+            existing_files: List of files that exist locally
+            stored_metadata: Dictionary of file_path -> mtime from vector database
+            stats: Statistics dictionary to update
+        """
+        if not stored_metadata:
+            return
+
+        # Create set of existing file paths for efficient lookup
+        existing_file_paths = {str(file_path) for file_path in existing_files}
+
+        # Find files that exist in vector DB but not locally
+        deleted_files = []
+        for stored_file_path in stored_metadata.keys():
+            if stored_file_path not in existing_file_paths:
+                # Convert string path back to Path object for processing
+                deleted_file_path = Path(stored_file_path)
+                deleted_files.append(deleted_file_path)
+
+        if deleted_files:
+            logger.info(
+                f"Found {len(deleted_files)} deleted files to clean up from vector database"
+            )
+
+            # Initialize deleted count in stats
+            if "deleted" not in stats:
+                stats["deleted"] = 0
+
+            # Process each deleted file
+            for deleted_file_path in deleted_files:
+                try:
+                    # Create FileChange object for deleted file
+                    file_change = FileChange(
+                        path=deleted_file_path,
+                        change_type=ChangeType.DELETED,
+                        timestamp=time.time(),
+                    )
+
+                    # Create ProcessFileChangeTask for deletion
+                    task_item: ProcessFileChangeTask = {
+                        "type": VectorDaemonTaskType.PROCESS_FILE_CHANGE,
+                        "project_name": project_name,
+                        "change": file_change,
+                        "timestamp": time.time(),
+                    }
+
+                    # Process deletion using existing file change task logic
+                    await self._process_file_change_task(
+                        task_item, "initial-processing"
+                    )
+                    stats["deleted"] += 1
+
+                    logger.debug(f"Cleaned up deleted file: {deleted_file_path}")
+
+                except Exception as e:
+                    logger.error(
+                        f"Failed to clean up deleted file {deleted_file_path}: {e}"
+                    )
+                    stats["failed"] += 1
+        else:
+            logger.debug("No deleted files found during initial processing")
+
+
 async def start_vector_daemon(
     config_path: Optional[Path] = None,
     db_path: Optional[Path] = None,
     cache_dir: Optional[Path] = None,
 ) -> None:
     """Start the vector daemon process."""
-
+
     # Load configuration
     config = load_vector_config(config_path)
-
+
     # Setup database
     if db_path is None:
         db_path = Path.home() / ".mcp-code-index" / "tracker.db"
     if cache_dir is None:
         cache_dir = Path.home() / ".mcp-code-index" / "cache"
-
+
     db_manager = DatabaseManager(db_path)
     await db_manager.initialize()
-
+
     # Create and start daemon
     daemon = VectorDaemon(config, db_manager, cache_dir)
-
+
     try:
         await daemon.start()
     finally:
         # Clean up database connections
         await db_manager.close_pool()
 
+
 def main() -> None:
     """CLI entry point for vector daemon."""
     import argparse
-
+
     parser = argparse.ArgumentParser(description="MCP Code Indexer Vector Daemon")
     parser.add_argument("--config", type=Path, help="Path to config file")
     parser.add_argument("--db-path", type=Path, help="Path to database")
     parser.add_argument("--cache-dir", type=Path, help="Cache directory")
     parser.add_argument("--log-level", default="INFO", help="Logging level")
-
+
     args = parser.parse_args()
-
+
     # Setup logging
     logging.basicConfig(
         level=getattr(logging, args.log_level.upper()),
         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
     )
-
+
     try:
         asyncio.run(start_vector_daemon(args.config, args.db_path, args.cache_dir))
     except KeyboardInterrupt:
@@ -339,5 +1100,6 @@ def main() -> None:
         logger.error(f"Daemon failed: {e}", exc_info=True)
         sys.exit(1)
 
+
 if __name__ == "__main__":
     main()