mcp-code-indexer 4.2.15__py3-none-any.whl → 4.2.16__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- mcp_code_indexer/database/database.py +251 -85
- mcp_code_indexer/database/models.py +66 -24
- mcp_code_indexer/database/retry_executor.py +15 -5
- mcp_code_indexer/file_scanner.py +107 -12
- mcp_code_indexer/main.py +43 -30
- mcp_code_indexer/server/mcp_server.py +191 -1
- mcp_code_indexer/vector_mode/chunking/ast_chunker.py +103 -84
- mcp_code_indexer/vector_mode/chunking/chunk_optimizer.py +1 -0
- mcp_code_indexer/vector_mode/config.py +113 -45
- mcp_code_indexer/vector_mode/const.py +24 -0
- mcp_code_indexer/vector_mode/daemon.py +860 -98
- mcp_code_indexer/vector_mode/monitoring/change_detector.py +113 -97
- mcp_code_indexer/vector_mode/monitoring/file_watcher.py +175 -121
- mcp_code_indexer/vector_mode/providers/turbopuffer_client.py +291 -98
- mcp_code_indexer/vector_mode/providers/voyage_client.py +140 -38
- mcp_code_indexer/vector_mode/services/__init__.py +9 -0
- mcp_code_indexer/vector_mode/services/embedding_service.py +389 -0
- mcp_code_indexer/vector_mode/services/vector_mode_tools_service.py +459 -0
- mcp_code_indexer/vector_mode/services/vector_storage_service.py +580 -0
- mcp_code_indexer/vector_mode/types.py +46 -0
- mcp_code_indexer/vector_mode/utils.py +50 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/METADATA +13 -10
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/RECORD +26 -19
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/WHEEL +1 -1
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info}/entry_points.txt +0 -0
- {mcp_code_indexer-4.2.15.dist-info → mcp_code_indexer-4.2.16.dist-info/licenses}/LICENSE +0 -0
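The bulk of this release is the rewrite of mcp_code_indexer/vector_mode/daemon.py shown below. That diff repeatedly builds task dictionaries typed as ScanProjectTask, ProcessFileChangeTask, and InitialProjectEmbeddingTask, tagged with VectorDaemonTaskType, all imported from the new vector_mode/types.py (+46 lines, not included in this view). A rough sketch, inferred only from how the daemon code below uses these names (the real definitions are not part of this diff, so the enum values and base classes here are assumptions), might look like:

# Hypothetical sketch of mcp_code_indexer/vector_mode/types.py; only the field names
# are taken from the dict literals in the daemon diff below, everything else is assumed.
from enum import Enum
from typing import Any, TypedDict


class VectorDaemonTaskType(str, Enum):
    SCAN_PROJECT = "scan_project"
    PROCESS_FILE_CHANGE = "process_file_change"
    INITIAL_PROJECT_EMBEDDING = "initial_project_embedding"


class ScanProjectTask(TypedDict):
    type: VectorDaemonTaskType
    project_name: str
    folder_path: str
    timestamp: float


class ProcessFileChangeTask(TypedDict):
    type: VectorDaemonTaskType
    project_name: str
    change: Any  # FileChange from vector_mode.monitoring.change_detector in the real code
    timestamp: float


class InitialProjectEmbeddingTask(TypedDict):
    type: VectorDaemonTaskType
    project_name: str
    folder_path: str
    timestamp: float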
mcp_code_indexer/vector_mode/daemon.py

@@ -7,26 +7,44 @@ Handles embedding generation, change detection, and vector database synchronizat
 
 import asyncio
 import logging
-import signal
 import sys
-from pathlib import Path
-from typing import Optional, Set
-import json
 import time
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Optional, Set
+
+
 
 from ..database.database import DatabaseManager
+from ..database.models import Project, SyncStatus
 from .config import VectorConfig, load_vector_config
+from .monitoring.file_watcher import create_file_watcher, FileWatcher
+from .providers.voyage_client import VoyageClient, create_voyage_client
+from .providers.turbopuffer_client import create_turbopuffer_client
+from .services.embedding_service import EmbeddingService
+from .services.vector_storage_service import VectorStorageService
+
+from .monitoring.change_detector import FileChange, ChangeType
+from .chunking.ast_chunker import ASTChunker, CodeChunk
+from .utils import should_ignore_path
+from .types import (
+    ScanProjectTask,
+    VectorDaemonTaskType,
+    ProcessFileChangeTask,
+    InitialProjectEmbeddingTask,
+)
 
 logger = logging.getLogger(__name__)
 
+
 class VectorDaemon:
     """
     Background daemon for vector mode operations.
-
+
     Monitors file changes, generates embeddings, and maintains vector indexes
     for all projects with vector mode enabled.
     """
-
+
     def __init__(
         self,
         config: VectorConfig,
@@ -38,13 +56,24 @@ class VectorDaemon:
         self.db_manager = db_manager
         self.cache_dir = cache_dir
         self.is_running = False
-
+
         # Process tracking
         self.monitored_projects: Set[str] = set()
-        self.processing_queue: asyncio.Queue = asyncio.Queue(
+        self.processing_queue: asyncio.Queue = asyncio.Queue(
+            maxsize=config.max_queue_size
+        )
         self.workers: list[asyncio.Task] = []
         self.monitor_tasks: list[asyncio.Task] = []
-
+
+        # File watcher management
+        self.file_watchers: Dict[str, FileWatcher] = {}
+        self.watcher_locks: Dict[str, asyncio.Lock] = {}
+
+        # Concurrency control for batch file processing
+        self.file_processing_semaphore = asyncio.Semaphore(
+            config.max_concurrent_batches
+        )
+
         # Statistics
         self.stats = {
             "start_time": time.time(),
@@ -53,17 +82,64 @@ class VectorDaemon:
             "errors_count": 0,
             "last_activity": time.time(),
         }
-
+
+        # Initialize VoyageClient and EmbeddingService for embedding generation
+        self._voyage_client = create_voyage_client(self.config)
+        self._embedding_service = EmbeddingService(self._voyage_client, self.config)
+
+        # Get embedding dimension from VoyageClient
+        embedding_dimension = self._voyage_client.get_embedding_dimension()
+
+        # Initialize TurbopufferClient and VectorStorageService for vector storage
+        self._turbopuffer_client = create_turbopuffer_client(self.config)
+        self._vector_storage_service = VectorStorageService(
+            self._turbopuffer_client, embedding_dimension, self.config
+        )
+
+        # Initialize ASTChunker for code chunking
+        self._ast_chunker = ASTChunker(
+            max_chunk_size=1500,
+            min_chunk_size=50,
+            enable_redaction=True,
+            enable_optimization=True,
+        )
+
         # Signal handling is delegated to the parent process
-
+
+    def _on_file_change(self, project_name: str) -> callable:
+        """Create a non-blocking change callback for a specific project."""
+
+        def callback(change: FileChange) -> None:
+            """Non-blocking callback that queues file change processing."""
+            try:
+                # Create file change processing task
+                task_item: ProcessFileChangeTask = {
+                    "type": VectorDaemonTaskType.PROCESS_FILE_CHANGE,
+                    "project_name": project_name,
+                    "change": change,
+                    "timestamp": time.time(),
+                }
+
+                # Put task in processing queue (non-blocking)
+                try:
+                    self.processing_queue.put_nowait(task_item)
+                except asyncio.QueueFull:
+                    logger.warning(
+                        f"Processing queue full, dropping file change event for {change.path}"
+                    )
+            except Exception as e:
+                logger.error(f"Error queueing file change task: {e}")
+
+        return callback
+
     async def start(self) -> None:
         """Start the vector daemon."""
         if self.is_running:
             logger.warning("Daemon is already running")
             return
-
+
         self.is_running = True
-
+
         logger.info(
             "Starting vector daemon",
             extra={
@@ -74,29 +150,29 @@ class VectorDaemon:
                         "poll_interval": self.config.daemon_poll_interval,
                     }
                 }
-            }
+            },
         )
-
+
         try:
             # Start worker tasks
             for i in range(self.config.worker_count):
                 worker = asyncio.create_task(self._worker(f"worker-{i}"))
                 self.workers.append(worker)
-
+
             # Start monitoring tasks
             monitor_task = asyncio.create_task(self._monitor_projects())
             stats_task = asyncio.create_task(self._stats_reporter())
             self.monitor_tasks.extend([monitor_task, stats_task])
-
+
             # Wait for shutdown signal
            await self._run_until_shutdown()
-
+
         except Exception as e:
             logger.error(f"Daemon error: {e}", exc_info=True)
             self.stats["errors_count"] += 1
         finally:
             await self._cleanup()
-
+
     async def _run_until_shutdown(self) -> None:
         """Run daemon until shutdown is requested."""
         # Wait indefinitely until task is cancelled by parent process
@@ -106,31 +182,78 @@ class VectorDaemon:
         except asyncio.CancelledError:
             logger.info("Vector daemon shutdown requested")
             raise
-
+
+    async def _get_project_monitoring_status(self) -> Dict[str, List[Project]]:
+        """
+        Get projects categorized by monitoring status.
+
+        Returns:
+            Dict with 'monitored' and 'unmonitored' keys containing project lists
+        """
+        # Get all projects with vector mode enabled
+        vector_enabled_projects = await self.db_manager.get_vector_enabled_projects()
+
+        # Filter projects that should be monitored (have valid aliases)
+        monitorable_projects = [
+            project
+            for project in vector_enabled_projects
+            if project.aliases and project.vector_mode
+        ]
+
+        # Determine which projects to monitor (not currently monitored)
+        projects_to_monitor = [
+            project
+            for project in monitorable_projects
+            if project.name not in self.monitored_projects
+        ]
+
+        # Determine which projects to unmonitor (currently monitored but no longer should be)
+        monitorable_names = {project.name for project in monitorable_projects}
+        projects_to_unmonitor = []
+
+        # Get full project data for unmonitoring
+        all_projects = await self.db_manager.get_all_projects()
+        for project in all_projects:
+            if (
+                project.name in self.monitored_projects
+                and project.name not in monitorable_names
+            ):
+                projects_to_unmonitor.append(project)
+
+        return {
+            "monitored": projects_to_monitor,
+            "unmonitored": projects_to_unmonitor,
+        }
+
     async def _monitor_projects(self) -> None:
         """Monitor projects for vector indexing requirements."""
         logger.info("Starting project monitoring")
-
+
         while self.is_running:
             try:
-                # Get
-
-
-                for project in
-
-
-
-
-
-
-
-
-
-
-
-
+                # Get project monitoring status
+                monitoring_status = await self._get_project_monitoring_status()
+                # Add new projects to monitoring
+                for project in monitoring_status["monitored"]:
+                    logger.info(f"Adding project to monitoring: {project.name}")
+                    self.monitored_projects.add(project.name)
+
+                    # Use first alias as folder path
+                    folder_path = project.aliases[0]
+
+                    # Queue initial indexing task based on IndexMeta status
+                    await self._queue_full_project_indexing(project.name, folder_path)
+
+                    # Queue project scan for file watching
+                    await self._queue_project_scan(project.name, folder_path)
+
+                # Remove projects from monitoring
+                for project in monitoring_status["unmonitored"]:
+                    logger.info(f"Removing project from monitoring: {project.name}")
+                    self.monitored_projects.discard(project.name)
+
                 await asyncio.sleep(self.config.daemon_poll_interval)
-
+
             except asyncio.CancelledError:
                 logger.info("Project monitoring cancelled")
                 break
@@ -138,41 +261,86 @@ class VectorDaemon:
             logger.error(f"Error in project monitoring: {e}")
             self.stats["errors_count"] += 1
             await asyncio.sleep(5.0)  # Back off on error
-
+
+    async def _queue_full_project_indexing(
+        self, project_name: str, folder_path: str
+    ) -> None:
+        """
+        Queue full project indexing based on IndexMeta status.
+
+        Retrieves IndexMeta for the project and queues initial embedding task
+        only if sync_status is 'pending'. Updates status from 'failed' or 'paused' to 'pending'.
+        """
+        try:
+            # Get or create IndexMeta for the project
+            index_meta = await self.db_manager.get_or_create_index_meta(project_name)
+
+            # If status is 'failed' or 'paused', change it to 'pending'
+            if index_meta.sync_status in [SyncStatus.FAILED, SyncStatus.PAUSED]:
+                logger.info(
+                    f"Changing sync status from {index_meta.sync_status.value} to pending for {project_name}"
+                )
+                index_meta.sync_status = SyncStatus.PENDING
+                await self.db_manager.update_index_meta(index_meta)
+
+            # Only queue initial embedding if status is 'pending'
+            if index_meta.sync_status == SyncStatus.PENDING:
+                task: InitialProjectEmbeddingTask = {
+                    "type": VectorDaemonTaskType.INITIAL_PROJECT_EMBEDDING,
+                    "project_name": project_name,
+                    "folder_path": folder_path,
+                    "timestamp": time.time(),
+                }
+
+                try:
+                    await self.processing_queue.put(task)
+                    logger.debug(f"Queued initial project embedding: {project_name}")
+                except asyncio.QueueFull:
+                    logger.warning(
+                        f"Processing queue full, dropping initial embedding task for {project_name}"
+                    )
+            else:
+                logger.debug(
+                    f"Skipping initial embedding for {project_name}, status: {index_meta.sync_status.value}"
+                )
+        except Exception as e:
+            logger.error(f"Error queuing full project indexing for {project_name}: {e}")
+
     async def _queue_project_scan(self, project_name: str, folder_path: str) -> None:
         """Queue a project for scanning and indexing."""
-        task = {
-            "type":
+        task: ScanProjectTask = {
+            "type": VectorDaemonTaskType.SCAN_PROJECT,
             "project_name": project_name,
             "folder_path": folder_path,
             "timestamp": time.time(),
         }
-
+
         try:
             await self.processing_queue.put(task)
             logger.debug(f"Queued project scan: {project_name}")
         except asyncio.QueueFull:
-            logger.warning(
-
+            logger.warning(
+                f"Processing queue full, dropping scan task for {project_name}"
+            )
+
     async def _worker(self, worker_id: str) -> None:
         """Worker task to process queued items."""
         logger.info(f"Starting worker: {worker_id}")
-
+
         while self.is_running:
             try:
                 # Get task from queue with timeout
                 try:
                     task = await asyncio.wait_for(
-                        self.processing_queue.get(),
-                        timeout=5.0
+                        self.processing_queue.get(), timeout=5.0
                     )
                 except asyncio.TimeoutError:
                     continue
-
+
                 # Process the task
                 await self._process_task(task, worker_id)
                 self.stats["last_activity"] = time.time()
-
+
             except asyncio.CancelledError:
                 logger.info(f"Worker {worker_id} cancelled")
                 break
@@ -180,57 +348,231 @@ class VectorDaemon:
             logger.error(f"Worker {worker_id} error: {e}")
             self.stats["errors_count"] += 1
             await asyncio.sleep(1.0)  # Brief pause on error
-
+
     async def _process_task(self, task: dict, worker_id: str) -> None:
         """Process a queued task."""
+        logger.debug(f"Worker {worker_id} processing task: {task}")
         task_type = task.get("type")
-
-        if task_type ==
+
+        if task_type == VectorDaemonTaskType.SCAN_PROJECT:
             await self._process_project_scan(task, worker_id)
+        elif task_type == VectorDaemonTaskType.PROCESS_FILE_CHANGE:
+            await self._process_file_change_task(task, worker_id)
+        elif task_type == VectorDaemonTaskType.INITIAL_PROJECT_EMBEDDING:
+            await self._process_initial_project_embedding_task(task, worker_id)
         else:
             logger.warning(f"Unknown task type: {task_type}")
-
+
+    async def _process_file_change_task(
+        self, task: ProcessFileChangeTask, worker_id: str
+    ) -> None:
+        """Process a file change task."""
+        project_name: str = task["project_name"]
+        change: FileChange = task["change"]
+        logger.info(
+            f"Worker {worker_id}: File change detected for project {project_name}: {change.path} ({change.change_type.value})"
+        )
+
+        try:
+            # Handle deleted files by removing their vectors from the database
+            if change.change_type == ChangeType.DELETED:
+                logger.info(
+                    f"Worker {worker_id}: Deleting vectors for deleted file {change.path}"
+                )
+                try:
+                    await self._vector_storage_service.delete_vectors_for_file(
+                        project_name, str(change.path)
+                    )
+                    logger.info(
+                        f"Worker {worker_id}: Successfully deleted vectors for {change.path}"
+                    )
+                except Exception as e:
+                    logger.error(
+                        f"Worker {worker_id}: Failed to delete vectors for {change.path}: {e}"
+                    )
+                return
+
+            # Read and chunk the file using the shared ASTChunker instance
+            # Run in executor to avoid blocking the event loop (CPU-bound work)
+            try:
+                loop = asyncio.get_running_loop()
+                chunks = await loop.run_in_executor(
+                    None, self._ast_chunker.chunk_file, str(change.path)
+                )
+                chunk_count = len(chunks)
+
+                # Only process files that actually produced chunks
+                if chunk_count == 0:
+                    logger.debug(
+                        f"Worker {worker_id}: No chunks produced for {change.path}"
+                    )
+                    return
+
+                # Generate and store embeddings for chunks
+                embeddings = await self._generate_embeddings(
+                    chunks, project_name, change.path
+                )
+                await self._store_embeddings(
+                    embeddings, chunks, project_name, change.path
+                )
+
+                # Only increment stats for successfully chunked files
+                self.stats["files_processed"] += 1
+                self.stats["last_activity"] = time.time()
+
+            except Exception as read_error:
+                logger.error(
+                    f"Worker {worker_id}: Failed to read/chunk file {change.path}: {read_error}"
+                )
+                self.stats["errors_count"] += 1
+                return
+
+        except Exception as e:
+            logger.error(
+                f"Worker {worker_id}: Error processing file change {change.path}: {e}"
+            )
+            self.stats["errors_count"] += 1
+
+    async def _process_initial_project_embedding_task(
+        self, task: InitialProjectEmbeddingTask, worker_id: str
+    ) -> None:
+        """Process an initial project embedding task."""
+        project_name: str = task["project_name"]
+        folder_path: str = task["folder_path"]
+
+        logger.info(
+            f"Worker {worker_id}: Starting initial project embedding for {project_name}"
+        )
+        try:
+            # Update IndexMeta status to in_progress
+            index_meta = await self.db_manager.get_or_create_index_meta(project_name)
+            index_meta.sync_status = SyncStatus.IN_PROGRESS
+            await self.db_manager.update_index_meta(index_meta)
+            # Perform the actual embedding
+            stats = await self._perform_initial_project_embedding(
+                project_name, folder_path
+            )
+
+            # Update IndexMeta status to completed on success
+            index_meta = await self.db_manager.get_or_create_index_meta(project_name)
+            if stats["failed"] > 0:
+                index_meta.sync_status = SyncStatus.FAILED
+                index_meta.error_message = (
+                    f"{stats['failed']} files failed during initial embedding"
+                )
+            else:
+                index_meta.sync_status = SyncStatus.COMPLETED
+                index_meta.error_message = None
+
+            index_meta.last_sync = datetime.utcnow()
+            index_meta.total_files = stats.get("scanned", 0)
+            index_meta.indexed_files = stats.get("processed", 0)
+            await self.db_manager.update_index_meta(index_meta)
+
+            logger.info(
+                f"Worker {worker_id}: Successfully completed initial embedding for {project_name}"
+            )
+
+        except Exception as e:
+            logger.error(
+                f"Worker {worker_id}: Error processing initial embedding for {project_name}: {e}"
+            )
+
+            # Update IndexMeta status to failed on error
+            try:
+                index_meta = await self.db_manager.get_or_create_index_meta(
+                    project_name
+                )
+                index_meta.sync_status = SyncStatus.FAILED
+                index_meta.error_message = str(e)
+                await self.db_manager.update_index_meta(index_meta)
+            except Exception as meta_error:
+                logger.error(
+                    f"Failed to update IndexMeta after embedding error: {meta_error}"
+                )
+
+            self.stats["errors_count"] += 1
+
     async def _process_project_scan(self, task: dict, worker_id: str) -> None:
         """Process a project scan task."""
         project_name = task["project_name"]
         folder_path = task["folder_path"]
-
+
         logger.debug(f"Worker {worker_id} processing project: {project_name}")
-
+
         try:
-            #
-
-
-
-
-
-
-
-
-
-
-
-
+            # Ensure we have a lock for this project
+            if project_name not in self.watcher_locks:
+                self.watcher_locks[project_name] = asyncio.Lock()
+
+            # Use project-specific lock to prevent race conditions
+            async with self.watcher_locks[project_name]:
+                # Check if file watcher already exists for this project
+                if project_name not in self.file_watchers:
+                    logger.info(
+                        f"Initializing file watcher for project {project_name}",
+                        extra={
+                            "structured_data": {
+                                "project_name": project_name,
+                                "folder_path": folder_path,
+                                "worker_id": worker_id,
+                            }
+                        },
+                    )
+
+                    # Validate folder path exists
+                    project_path = Path(folder_path)
+                    if not project_path.exists():
+                        logger.warning(f"Project folder does not exist: {folder_path}")
+                        return
+
+                    # Create file watcher with appropriate configuration
+                    watcher = create_file_watcher(
+                        project_root=project_path,
+                        project_id=project_name,
+                        ignore_patterns=self.config.ignore_patterns,
+                        debounce_interval=self.config.watch_debounce_ms / 1000.0,
+                    )
+                    logger.debug(f"VectorDaemon: Created watcher for {project_name}")
+                    # Initialize the watcher
+                    await watcher.initialize()
+
+                    # Add change callback
+                    watcher.add_change_callback(self._on_file_change(project_name))
+
+                    # Start watching
+                    watcher.start_watching()
+
+                    # Store watcher for later cleanup
+                    self.file_watchers[project_name] = watcher
+
+                    logger.info(
+                        f"File watcher started for project {project_name}",
+                        extra={
+                            "structured_data": {
+                                "project_name": project_name,
+                                "folder_path": folder_path,
+                                "watcher_stats": watcher.get_stats(),
+                            }
+                        },
+                    )
+                else:
+                    logger.debug(
+                        f"File watcher already exists for project {project_name}"
+                    )
+
             self.stats["files_processed"] += 1
-
-            # TODO: Implement actual vector processing:
-            # 1. Scan for file changes using Merkle tree
-            # 2. Chunk modified files using AST
-            # 3. Apply secret redaction
-            # 4. Generate embeddings via Voyage
-            # 5. Store in Turbopuffer
-            # 6. Update database metadata
-
+
         except Exception as e:
-            logger.error(f"Error processing project {project_name}: {e}")
+            logger.error(f"Error processing project {project_name}: {e}", exc_info=True)
             self.stats["errors_count"] += 1
-
+
     async def _stats_reporter(self) -> None:
         """Periodically report daemon statistics."""
         while self.is_running:
             try:
                 uptime = time.time() - self.stats["start_time"]
-
+
                 logger.info(
                     "Daemon statistics",
                     extra={
@@ -242,94 +584,513 @@ class VectorDaemon:
                         "embeddings_generated": self.stats["embeddings_generated"],
                         "errors_count": self.stats["errors_count"],
                     }
-                }
+                },
             )
-
+
             await asyncio.sleep(60.0)  # Report every minute
-
+
         except asyncio.CancelledError:
             logger.info("Stats reporting cancelled")
             break
         except Exception as e:
             logger.error(f"Error in stats reporting: {e}")
             await asyncio.sleep(10.0)
-
+
     async def _cleanup(self) -> None:
         """Clean up resources and shut down workers."""
         logger.info("Starting daemon cleanup")
         self.is_running = False
-
+
+        # Stop and cleanup all file watchers first
+        if self.file_watchers:
+            logger.info(f"Cleaning up {len(self.file_watchers)} file watchers")
+            for project_name, watcher in self.file_watchers.items():
+                try:
+                    logger.debug(f"Stopping file watcher for project: {project_name}")
+                    watcher.cleanup()
+                except Exception as e:
+                    logger.error(f"Error cleaning up watcher for {project_name}: {e}")
+            self.file_watchers.clear()
+            self.watcher_locks.clear()
+
         # Cancel all workers
         for worker in self.workers:
             worker.cancel()
-
+
         # Cancel monitor tasks
         for task in self.monitor_tasks:
             task.cancel()
-
+
         # Wait for all tasks to finish
         all_tasks = self.workers + self.monitor_tasks
         if all_tasks:
             await asyncio.gather(*all_tasks, return_exceptions=True)
-
+
         logger.info("Vector daemon shutdown complete")
-
+
     def get_status(self) -> dict:
         """Get current daemon status."""
+        watcher_stats = {}
+        for project_name, watcher in self.file_watchers.items():
+            try:
+                watcher_stats[project_name] = watcher.get_stats()
+            except Exception as e:
+                watcher_stats[project_name] = {"error": str(e)}
+
         return {
             "is_running": self.is_running,
             "uptime": time.time() - self.stats["start_time"] if self.is_running else 0,
             "monitored_projects": len(self.monitored_projects),
+            "active_file_watchers": len(self.file_watchers),
             "queue_size": self.processing_queue.qsize(),
             "stats": self.stats.copy(),
+            "file_watcher_stats": watcher_stats,
+        }
+
+    async def _generate_embeddings(
+        self, chunks: list[CodeChunk], project_name: str, file_path: Path
+    ) -> list[list[float]]:
+        """Generate embeddings for file chunks using EmbeddingService."""
+        try:
+            generating_embedding_time = time.time()
+            embeddings = await self._embedding_service.generate_embeddings_for_chunks(
+                chunks, project_name, file_path
+            )
+
+            # Update daemon statistics
+            self.stats["embeddings_generated"] += len(embeddings)
+            self.stats["last_activity"] = time.time()
+            logger.debug(
+                f"Generated {len(embeddings)} embeddings for {file_path} in {time.time() - generating_embedding_time:.2f} seconds"
+            )
+            return embeddings
+
+        except Exception as e:
+            # Update error statistics
+            self.stats["errors_count"] += 1
+            raise
+
+    async def _store_embeddings(
+        self,
+        embeddings: list[list[float]],
+        chunks: list[CodeChunk],
+        project_name: str,
+        file_path: str,
+    ) -> None:
+        """Store embeddings in vector database."""
+        try:
+            store_embeddings_time = time.time()
+            await self._vector_storage_service.store_embeddings(
+                embeddings, chunks, project_name, file_path
+            )
+            logger.debug(
+                f"Stored embeddings for {file_path} in {time.time() - store_embeddings_time:.2f} seconds"
+            )
+        except Exception as e:
+            # Update error statistics
+            self.stats["errors_count"] += 1
+            raise
+
+    def _gather_project_files(self, project_root: Path) -> list[Path]:
+        """
+        Gather all relevant files in the project by applying ignore patterns.
+
+        Args:
+            project_root: Root path of the project
+
+        Returns:
+            List of file paths that should be processed
+        """
+        project_files = []
+
+        for file_path in project_root.rglob("*"):
+            if file_path.is_file() and not should_ignore_path(
+                file_path, project_root, self.config.ignore_patterns
+            ):
+                project_files.append(file_path)
+
+        return project_files
+
+    async def _perform_initial_project_embedding(
+        self, project_name: str, folder_path: str
+    ) -> dict[str, int]:
+        """
+        Perform initial project embedding for all files, processing only changed files.
+
+        Args:
+            project_name: Name of the project
+            folder_path: Root folder path of the project
+
+        Returns:
+            Dictionary with processing statistics
+        """
+        stats = {
+            "scanned": 0,
+            "processed": 0,
+            "skipped": 0,
+            "failed": 0,
+            "deleted": 0,
         }
 
+        logger.info(f"Starting initial project embedding for {project_name}")
+
+        try:
+            project_root = Path(folder_path)
+            if not project_root.exists():
+                logger.error(f"Project folder does not exist: {folder_path}")
+                return stats
+
+            # Discover all relevant files in the project
+            project_files = self._gather_project_files(project_root)
+
+            stats["scanned"] = len(project_files)
+            logger.info(f"Found {len(project_files)} files to scan in {project_name}")
+
+            # Process batches concurrently with controlled concurrency
+            batch_size = 50
+            processed_count = 0
+
+            # Create batch processing tasks with semaphore control
+            async def process_batch_with_semaphore(
+                batch_files: list[Path], batch_index: int
+            ):
+                """Process a single batch with semaphore control."""
+                async with self.file_processing_semaphore:
+                    # Get stored file metadata from vector database
+                    stored_metadata = (
+                        await self._vector_storage_service.get_file_metadata(
+                            project_name,
+                            [str(f) for f in batch_files],
+                        )
+                    )
+                    return (
+                        await self._process_file_batch_for_initial_embedding(
+                            batch_files, project_name, stored_metadata
+                        ),
+                        batch_index,
+                    )
+
+            # Create tasks for all batches
+            batch_tasks = []
+            for i in range(0, len(project_files), batch_size):
+                batch = project_files[i : i + batch_size]
+                batch_index = i // batch_size
+                task = process_batch_with_semaphore(batch, batch_index)
+                batch_tasks.append(task)
+
+            logger.info(
+                f"Processing {len(batch_tasks)} batches concurrently "
+                f"(max concurrent: {self.file_processing_semaphore._value})"
+            )
+
+            # Process all batches concurrently
+            batch_start_time = time.time()
+            batch_results = await asyncio.gather(*batch_tasks, return_exceptions=True)
+            total_batch_time = time.time() - batch_start_time
+
+            error_messages = []
+            # Aggregate results from all batches
+            successful_batches = 0
+            for result in batch_results:
+                if isinstance(result, Exception):
+                    logger.error(f"Batch processing failed with exception: {result}")
+                    # Estimate failed files (assuming average batch size)
+                    estimated_failed = min(
+                        batch_size, len(project_files) - processed_count
+                    )
+                    error_messages.append(f"Batch {batch_index} failed: {result}")
+                    stats["failed"] += estimated_failed
+                    processed_count += estimated_failed
+                elif isinstance(result, tuple):
+                    batch_stats, batch_index = result
+                    # Update aggregate stats
+                    stats["processed"] += batch_stats["processed"]
+                    stats["skipped"] += batch_stats["skipped"]
+                    stats["failed"] += batch_stats["failed"]
+                    processed_count += (
+                        batch_stats["processed"]
+                        + batch_stats["skipped"]
+                        + batch_stats["failed"]
+                    )
+                    successful_batches += 1
+                else:
+                    logger.warning(f"Unexpected batch result type: {type(result)}")
+
+            logger.info(
+                f"Concurrent batch processing completed in {total_batch_time:.2f}s: "
+                f"{successful_batches}/{len(batch_tasks)} batches successful"
+            )
+
+            # Handle deleted files - files that exist in vector DB but not locally
+            # Get all stored metadata for cleanup (not limited to specific batches)
+            all_stored_metadata = await self._vector_storage_service.get_file_metadata(
+                project_name
+            )
+            await self._cleanup_deleted_files(
+                project_name, project_files, all_stored_metadata, stats
+            )
+
+            logger.info(
+                f"Initial project embedding complete for {project_name}: "
+                f"scanned={stats['scanned']}, processed={stats['processed']}, "
+                f"skipped={stats['skipped']}, failed={stats['failed']}, deleted={stats.get('deleted', 0)}"
+            )
+
+        except Exception as e:
+            logger.error(
+                f"Error during initial project embedding for {project_name}: {e}"
+            )
+            stats["failed"] += 1
+
+        return stats
+
+    async def _process_file_batch_for_initial_embedding(
+        self,
+        file_batch: list[Path],
+        project_name: str,
+        stored_metadata: dict[str, float],
+    ) -> dict[str, int]:
+        """
+        Process a batch of files for initial embedding using true batch processing.
+
+        Args:
+            file_batch: List of file paths to process
+            project_name: Name of the project
+            stored_metadata: Dictionary of file_path -> mtime from vector database
+
+        Returns:
+            Dictionary with batch processing statistics
+        """
+        batch_stats = {"processed": 0, "skipped": 0, "failed": 0}
+
+        # Filter files that need processing based on mtime comparison
+        # TODO: remove comparing mtimes
+        files_to_process: list[Path] = []
+        for file_path in file_batch:
+            try:
+                current_mtime = file_path.stat().st_mtime
+                stored_mtime = stored_metadata.get(str(file_path), 0.0)
+                # Use epsilon comparison for floating point mtime
+                if abs(current_mtime - stored_mtime) > 0.001:
+                    files_to_process.append(file_path)
+                else:
+                    batch_stats["skipped"] += 1
+
+            except (OSError, FileNotFoundError) as e:
+                logger.warning(f"Failed to get mtime for {file_path}: {e}")
+                batch_stats["failed"] += 1
+
+        # Process files using true batch processing: chunk → embed → store
+        if files_to_process:
+            logger.info(
+                f"Batch processing {len(files_to_process)}/{len(file_batch)} files "
+                f"using true batch processing (chunk → embed → store)"
+            )
+
+            try:
+                batch_start_time = time.time()
+
+                # Step 1: Batch chunking for all files
+                # Run in executor to avoid blocking the event loop (CPU-bound work)
+                logger.debug(f"Step 1: Chunking {len(files_to_process)} files")
+                chunking_start_time = time.time()
+
+                loop = asyncio.get_running_loop()
+                file_chunks = await loop.run_in_executor(
+                    None,
+                    self._ast_chunker.chunk_multiple_files,
+                    [str(f) for f in files_to_process],
+                )
+
+                # Filter out files that failed to chunk
+                successful_files = {
+                    file_path: chunks
+                    for file_path, chunks in file_chunks.items()
+                    if chunks  # Only keep files with successful chunks
+                }
+                failed_chunking_count = len(files_to_process) - len(successful_files)
+
+                logger.debug(
+                    f"Chunking completed in {time.time() - chunking_start_time:.2f}s: "
+                    f"{len(successful_files)} files successful, {failed_chunking_count} failed"
+                )
+
+                if successful_files:
+                    # Step 2: Batch embedding for all chunks
+                    logger.debug(
+                        f"Step 2: Generating embeddings for {len(successful_files)} files"
+                    )
+                    embedding_start_time = time.time()
+
+                    file_embeddings = await self._embedding_service.generate_embeddings_for_multiple_files(
+                        successful_files, project_name
+                    )
+
+                    logger.debug(
+                        f"Embedding completed in {time.time() - embedding_start_time:.2f}s: "
+                        f"{len(file_embeddings)} files embedded"
+                    )
+
+                    # Step 3: Batch storage for all vectors
+                    if file_embeddings:
+                        logger.debug(
+                            f"Step 3: Storing vectors for {len(file_embeddings)} files"
+                        )
+                        storage_start_time = time.time()
+
+                        await self._vector_storage_service.store_embeddings_batch(
+                            file_embeddings, successful_files, project_name
+                        )
+
+                        logger.debug(
+                            f"Storage completed in {time.time() - storage_start_time:.2f}s"
+                        )
+
+                        # Update success count
+                        batch_stats["processed"] = len(file_embeddings)
+                    else:
+                        logger.warning(
+                            "No embeddings generated despite successful chunking"
+                        )
+
+                # Update failure count for chunking failures
+                batch_stats["failed"] += failed_chunking_count
+
+                total_batch_time = time.time() - batch_start_time
+                logger.debug(
+                    f"Batch processing completed in {total_batch_time:.2f}s: "
+                    f"{batch_stats['processed']} processed, {batch_stats['failed']} failed, "
+                    f"{batch_stats['skipped']} skipped"
+                )
+
+            except Exception as e:
+                logger.error(f"Batch processing failed: {e}", exc_info=True)
+                # Mark all files as failed if batch processing fails
+                batch_stats["failed"] += len(files_to_process)
+
+        return batch_stats
+
+    async def _cleanup_deleted_files(
+        self,
+        project_name: str,
+        existing_files: list[Path],
+        stored_metadata: dict[str, float],
+        stats: dict[str, int],
+    ) -> None:
+        """
+        Clean up files that exist in vector database but not locally (deleted files).
+
+        Args:
+            project_name: Name of the project
+            existing_files: List of files that exist locally
+            stored_metadata: Dictionary of file_path -> mtime from vector database
+            stats: Statistics dictionary to update
+        """
+        if not stored_metadata:
+            return
+
+        # Create set of existing file paths for efficient lookup
+        existing_file_paths = {str(file_path) for file_path in existing_files}
+
+        # Find files that exist in vector DB but not locally
+        deleted_files = []
+        for stored_file_path in stored_metadata.keys():
+            if stored_file_path not in existing_file_paths:
+                # Convert string path back to Path object for processing
+                deleted_file_path = Path(stored_file_path)
+                deleted_files.append(deleted_file_path)
+
+        if deleted_files:
+            logger.info(
+                f"Found {len(deleted_files)} deleted files to clean up from vector database"
+            )
+
+            # Initialize deleted count in stats
+            if "deleted" not in stats:
+                stats["deleted"] = 0
+
+            # Process each deleted file
+            for deleted_file_path in deleted_files:
+                try:
+                    # Create FileChange object for deleted file
+                    file_change = FileChange(
+                        path=deleted_file_path,
+                        change_type=ChangeType.DELETED,
+                        timestamp=time.time(),
+                    )
+
+                    # Create ProcessFileChangeTask for deletion
+                    task_item: ProcessFileChangeTask = {
+                        "type": VectorDaemonTaskType.PROCESS_FILE_CHANGE,
+                        "project_name": project_name,
+                        "change": file_change,
+                        "timestamp": time.time(),
+                    }
+
+                    # Process deletion using existing file change task logic
+                    await self._process_file_change_task(
+                        task_item, "initial-processing"
+                    )
+                    stats["deleted"] += 1
+
+                    logger.debug(f"Cleaned up deleted file: {deleted_file_path}")
+
+                except Exception as e:
+                    logger.error(
+                        f"Failed to clean up deleted file {deleted_file_path}: {e}"
+                    )
+                    stats["failed"] += 1
+        else:
+            logger.debug("No deleted files found during initial processing")
+
+
 async def start_vector_daemon(
     config_path: Optional[Path] = None,
     db_path: Optional[Path] = None,
     cache_dir: Optional[Path] = None,
 ) -> None:
     """Start the vector daemon process."""
-
+
     # Load configuration
     config = load_vector_config(config_path)
-
+
     # Setup database
     if db_path is None:
         db_path = Path.home() / ".mcp-code-index" / "tracker.db"
     if cache_dir is None:
         cache_dir = Path.home() / ".mcp-code-index" / "cache"
-
+
     db_manager = DatabaseManager(db_path)
     await db_manager.initialize()
-
+
     # Create and start daemon
     daemon = VectorDaemon(config, db_manager, cache_dir)
-
+
     try:
         await daemon.start()
     finally:
         # Clean up database connections
         await db_manager.close_pool()
 
+
 def main() -> None:
     """CLI entry point for vector daemon."""
     import argparse
-
+
     parser = argparse.ArgumentParser(description="MCP Code Indexer Vector Daemon")
     parser.add_argument("--config", type=Path, help="Path to config file")
     parser.add_argument("--db-path", type=Path, help="Path to database")
     parser.add_argument("--cache-dir", type=Path, help="Cache directory")
     parser.add_argument("--log-level", default="INFO", help="Logging level")
-
+
     args = parser.parse_args()
-
+
     # Setup logging
     logging.basicConfig(
         level=getattr(logging, args.log_level.upper()),
         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
     )
-
+
     try:
         asyncio.run(start_vector_daemon(args.config, args.db_path, args.cache_dir))
     except KeyboardInterrupt:
@@ -339,5 +1100,6 @@ def main() -> None:
         logger.error(f"Daemon failed: {e}", exc_info=True)
         sys.exit(1)
 
+
 if __name__ == "__main__":
     main()