agent-brain-rag 2.0.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,434 @@
+ """Background job worker that processes indexing jobs from the queue."""
+
+ import asyncio
+ import logging
+ from datetime import datetime, timezone
+ from typing import Optional
+
+ from agent_brain_server.job_queue.job_store import JobQueueStore
+ from agent_brain_server.models import IndexingState, IndexingStatusEnum, IndexRequest
+ from agent_brain_server.models.job import JobProgress, JobRecord, JobStatus
+ from agent_brain_server.services.indexing_service import IndexingService
+
+ logger = logging.getLogger(__name__)
+
+
+ class CancellationRequestedError(Exception):
+     """Raised when a job cancellation is requested."""
+
+     pass
+
+
+ class JobWorker:
+     """Background asyncio task that polls for and processes indexing jobs.
+
+     Features:
+     - Polls for pending jobs from JobQueueStore
+     - Processes one job at a time (concurrency=1)
+     - Timeout support with configurable MAX_RUNTIME_SECONDS
+     - Cancellation via cancel_requested flag on JobRecord
+     - Progress updates at configurable intervals
+     - Verifies ChromaDB has chunks after indexing before marking DONE
+
+     Example:
+         worker = JobWorker(job_store, indexing_service)
+         await worker.start()
+         # ... later ...
+         await worker.stop()
+     """
+
+     # Default configuration
+     MAX_RUNTIME_SECONDS: int = 7200  # 2 hours
+     PROGRESS_CHECKPOINT_INTERVAL: int = 50  # Update progress every N files
+     POLL_INTERVAL_SECONDS: float = 1.0  # Poll for new jobs every N seconds
+
+     def __init__(
+         self,
+         job_store: JobQueueStore,
+         indexing_service: IndexingService,
+         max_runtime_seconds: Optional[int] = None,
+         progress_checkpoint_interval: Optional[int] = None,
+         poll_interval_seconds: Optional[float] = None,
+     ):
+         """Initialize the job worker.
+
+         Args:
+             job_store: Job queue store for persistence.
+             indexing_service: Indexing service for processing jobs.
+             max_runtime_seconds: Maximum job runtime before timeout.
+             progress_checkpoint_interval: Update progress every N files.
+             poll_interval_seconds: Poll interval for checking new jobs.
+         """
+         self._job_store = job_store
+         self._indexing_service = indexing_service
+
+         # Configuration (use instance values or defaults)
+         self._max_runtime_seconds = max_runtime_seconds or self.MAX_RUNTIME_SECONDS
+         self._progress_interval = (
+             progress_checkpoint_interval or self.PROGRESS_CHECKPOINT_INTERVAL
+         )
+         self._poll_interval = poll_interval_seconds or self.POLL_INTERVAL_SECONDS
+
+         # Internal state
+         self._running = False
+         self._task: Optional[asyncio.Task[None]] = None
+         self._current_job: Optional[JobRecord] = None
+         self._stop_event = asyncio.Event()
+
+     @property
+     def is_running(self) -> bool:
+         """Check if the worker is currently running."""
+         return self._running and self._task is not None and not self._task.done()
+
+     @property
+     def current_job(self) -> Optional[JobRecord]:
+         """Get the currently processing job, if any."""
+         return self._current_job
+
+     async def start(self) -> None:
+         """Start the background worker task.
+
+         Creates an asyncio task that polls for pending jobs and processes them.
+         Safe to call multiple times (subsequent calls are no-ops if already running).
+         """
+         if self._running:
+             logger.warning("JobWorker already running")
+             return
+
+         self._running = True
+         self._stop_event.clear()
+         self._task = asyncio.create_task(self._run_loop())
+         logger.info("JobWorker started")
+
+     async def stop(self, timeout: float = 30.0) -> None:
+         """Gracefully stop the worker.
+
+         Signals the worker to stop and waits for the current job to complete
+         or for the timeout to expire.
+
+         Args:
+             timeout: Maximum seconds to wait for graceful shutdown.
+         """
+         if not self._running:
+             return
+
+         logger.info("JobWorker stopping...")
+         self._running = False
+         self._stop_event.set()
+
+         if self._task is not None:
+             try:
+                 await asyncio.wait_for(self._task, timeout=timeout)
+             except asyncio.TimeoutError:
+                 logger.warning(
+                     f"JobWorker did not stop within {timeout}s, cancelling task"
+                 )
+                 self._task.cancel()
+                 try:
+                     await self._task
+                 except asyncio.CancelledError:
+                     pass
+
+         self._task = None
+         self._current_job = None
+         logger.info("JobWorker stopped")
+
+     async def _run_loop(self) -> None:
+         """Main worker loop that polls for and processes jobs.
+
+         Continuously polls for pending jobs and processes them one at a time.
+         Exits when stop() is called.
+         """
+         logger.info("JobWorker run loop started")
+
+         while self._running:
+             try:
+                 # Check for pending jobs
+                 pending_jobs = await self._job_store.get_pending_jobs()
+
+                 if pending_jobs:
+                     # Process the first pending job (FIFO)
+                     job = pending_jobs[0]
+                     await self._process_job(job)
+                 else:
+                     # No jobs, wait before polling again
+                     try:
+                         await asyncio.wait_for(
+                             self._stop_event.wait(),
+                             timeout=self._poll_interval,
+                         )
+                         # If we get here, stop was requested
+                         break
+                     except asyncio.TimeoutError:
+                         # Normal timeout, continue polling
+                         pass
+
+             except Exception as e:
+                 logger.error(f"Error in job worker loop: {e}", exc_info=True)
+                 # Brief pause before retrying to avoid tight error loop
+                 await asyncio.sleep(1.0)
+
+         logger.info("JobWorker run loop exited")
+
+     async def _process_job(self, job: JobRecord) -> None:
+         """Process a single indexing job.
+
+         Marks the job as RUNNING, executes the indexing pipeline with timeout,
+         and marks the job as DONE, FAILED, or CANCELLED based on outcome.
+
+         Args:
+             job: The job record to process.
+         """
+         logger.info(f"Processing job {job.id} for {job.folder_path}")
+         self._current_job = job
+
+         try:
+             # Mark job as RUNNING
+             job.status = JobStatus.RUNNING
+             job.started_at = datetime.now(timezone.utc)
+             job.progress = JobProgress(
+                 files_processed=0,
+                 files_total=0,
+                 chunks_created=0,
+                 current_file="",
+                 updated_at=datetime.now(timezone.utc),
+             )
+             await self._job_store.update_job(job)
+
+             # Set IndexingService state to indicate indexing is in progress
+             # This ensures status endpoints reflect the correct state
+             async with self._indexing_service._lock:
+                 self._indexing_service._state = IndexingState(
+                     current_job_id=job.id,
+                     status=IndexingStatusEnum.INDEXING,
+                     is_indexing=True,
+                     folder_path=job.folder_path,
+                     started_at=job.started_at,
+                     completed_at=None,
+                     error=None,
+                 )
+
+             # Create IndexRequest from JobRecord
+             index_request = IndexRequest(
+                 folder_path=job.folder_path,
+                 include_code=job.include_code,
+                 chunk_size=job.chunk_size,
+                 chunk_overlap=job.chunk_overlap,
+                 recursive=job.recursive,
+                 generate_summaries=job.generate_summaries,
+                 supported_languages=job.supported_languages,
+                 include_patterns=job.include_patterns,
+                 exclude_patterns=job.exclude_patterns,
+             )
+
+             # Create progress callback that checks for cancellation
+             async def progress_callback(current: int, total: int, message: str) -> None:
+                 """Progress callback that updates job and checks for cancellation."""
+                 # Re-fetch job to check for cancellation request
+                 refreshed_job = await self._job_store.get_job(job.id)
+                 if refreshed_job and refreshed_job.cancel_requested:
+                     logger.info(f"Cancellation requested for job {job.id}")
+                     raise CancellationRequestedError(
+                         f"Job {job.id} cancellation requested"
+                     )
+
+                 # Update progress at intervals
+                 if (
+                     job.progress is None
+                     or current - job.progress.files_processed >= self._progress_interval
+                     or current == total
+                 ):
+                     job.progress = JobProgress(
+                         files_processed=current,
+                         files_total=total,
+                         chunks_created=0,  # Will be updated at end
+                         current_file=message,
+                         updated_at=datetime.now(timezone.utc),
+                     )
+                     await self._job_store.update_job(job)
+
+             # Get chunk count before indexing for delta verification
+             count_before = 0
+             try:
+                 vector_store = self._indexing_service.vector_store
+                 if vector_store.is_initialized:
+                     count_before = await vector_store.get_count()
+             except Exception as e:
+                 logger.warning(f"Could not get count before indexing: {e}")
+
+             # Execute indexing with timeout
+             try:
+                 await asyncio.wait_for(
+                     self._indexing_service._run_indexing_pipeline(
+                         index_request, job.id, progress_callback
+                     ),
+                     timeout=self._max_runtime_seconds,
+                 )
+             except asyncio.TimeoutError:
+                 logger.error(
+                     f"Job {job.id} timed out after {self._max_runtime_seconds}s"
+                 )
+                 job.status = JobStatus.FAILED
+                 job.error = f"Job timed out after {self._max_runtime_seconds} seconds"
+                 job.finished_at = datetime.now(timezone.utc)
+                 await self._job_store.update_job(job)
+
+                 # Clear IndexingService state on timeout
+                 async with self._indexing_service._lock:
+                     self._indexing_service._state = IndexingState(
+                         current_job_id=job.id,
+                         status=IndexingStatusEnum.FAILED,
+                         is_indexing=False,
+                         folder_path=job.folder_path,
+                         started_at=job.started_at,
+                         completed_at=job.finished_at,
+                         error=job.error,
+                     )
+                 return
+
+             # Verify collection has new chunks (delta verification)
+             verification_passed = await self._verify_collection_delta(job, count_before)
+
+             if verification_passed:
+                 # Get final chunk count from indexing service status
+                 status = await self._indexing_service.get_status()
+                 job.total_chunks = status.get("total_chunks", 0)
+                 job.total_documents = status.get("total_documents", 0)
+
+                 # Update final progress
+                 if job.progress:
+                     job.progress = JobProgress(
+                         files_processed=job.progress.files_total,
+                         files_total=job.progress.files_total,
+                         chunks_created=job.total_chunks,
+                         current_file="Complete",
+                         updated_at=datetime.now(timezone.utc),
+                     )
+
+                 job.status = JobStatus.DONE
+                 job.finished_at = datetime.now(timezone.utc)
+                 logger.info(
+                     f"Job {job.id} completed: {job.total_documents} docs, "
+                     f"{job.total_chunks} chunks"
+                 )
+
+                 # Clear IndexingService state on success
+                 async with self._indexing_service._lock:
+                     self._indexing_service._state = IndexingState(
+                         current_job_id=job.id,
+                         status=IndexingStatusEnum.COMPLETED,
+                         is_indexing=False,
+                         folder_path=job.folder_path,
+                         started_at=job.started_at,
+                         completed_at=job.finished_at,
+                         error=None,
+                     )
+             else:
+                 job.status = JobStatus.FAILED
+                 job.error = "Verification failed: No chunks found in vector store"
+                 job.finished_at = datetime.now(timezone.utc)
+                 logger.error(f"Job {job.id} verification failed: no chunks in store")
+
+                 # Clear IndexingService state on verification failure
+                 async with self._indexing_service._lock:
+                     self._indexing_service._state = IndexingState(
+                         current_job_id=job.id,
+                         status=IndexingStatusEnum.FAILED,
+                         is_indexing=False,
+                         folder_path=job.folder_path,
+                         started_at=job.started_at,
+                         completed_at=job.finished_at,
+                         error=job.error,
+                     )
+
+             await self._job_store.update_job(job)
+
+         except CancellationRequestedError:
+             job.status = JobStatus.CANCELLED
+             job.error = "Job was cancelled by user request"
+             job.finished_at = datetime.now(timezone.utc)
+             await self._job_store.update_job(job)
+
+             # Clear IndexingService state on cancellation
+             async with self._indexing_service._lock:
+                 self._indexing_service._state = IndexingState(
+                     current_job_id=job.id,
+                     status=IndexingStatusEnum.IDLE,
+                     is_indexing=False,
+                     folder_path=job.folder_path,
+                     started_at=job.started_at,
+                     completed_at=job.finished_at,
+                     error=job.error,
+                 )
+             logger.info(f"Job {job.id} cancelled")
+
+         except Exception as e:
+             logger.error(f"Job {job.id} failed with error: {e}", exc_info=True)
+             job.status = JobStatus.FAILED
+             job.error = str(e)
+             job.finished_at = datetime.now(timezone.utc)
+             await self._job_store.update_job(job)
+
+             # Clear IndexingService state on error
+             async with self._indexing_service._lock:
+                 self._indexing_service._state = IndexingState(
+                     current_job_id=job.id,
+                     status=IndexingStatusEnum.FAILED,
+                     is_indexing=False,
+                     folder_path=job.folder_path,
+                     started_at=job.started_at,
+                     completed_at=job.finished_at,
+                     error=job.error,
+                 )
+
+         finally:
+             self._current_job = None
+
+     async def _verify_collection_delta(self, job: JobRecord, count_before: int) -> bool:
+         """Verify that the vector store has new chunks after indexing.
+
+         Uses delta verification (count_after - count_before) to avoid false
+         positives when prior chunks exist but the job added nothing.
+
+         Args:
+             job: The job record to verify.
+             count_before: Chunk count before indexing started.
+
+         Returns:
+             True if verification passed (new chunks added), False otherwise.
+         """
+         try:
+             vector_store = self._indexing_service.vector_store
+             count_after = await vector_store.get_count()
+             delta = count_after - count_before
+
+             if delta > 0:
+                 logger.info(
+                     f"Verification passed for job {job.id}: "
+                     f"{delta} new chunks (before={count_before}, after={count_after})"
+                 )
+                 return True
+             elif count_after > 0 and delta == 0:
+                 # Special case: job might have processed files that were already indexed
+                 # Check if any documents were processed
+                 if job.progress and job.progress.files_processed > 0:
+                     logger.warning(
+                         f"Job {job.id} processed {job.progress.files_processed} files "
+                         f"but added no new chunks (may have been already indexed)"
+                     )
+                     # Consider this a success if files were processed
+                     return True
+                 logger.warning(
+                     f"Verification failed for job {job.id}: no new chunks added "
+                     f"(before={count_before}, after={count_after})"
+                 )
+                 return False
+             else:
+                 logger.warning(
+                     f"Verification failed for job {job.id}: no chunks in vector store"
+                 )
+                 return False
+
+         except Exception as e:
+             logger.error(f"Verification error for job {job.id}: {e}")
+             return False
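The new JobWorker is meant to be owned by the server process: construct it with a JobQueueStore and an IndexingService, start it once, and stop it on shutdown. The sketch below illustrates that lifecycle using only the constructor arguments and methods visible in this file; the JobWorker import path and the construction of the store and indexing service are not shown in this diff and are assumed here.

    import asyncio

    from agent_brain_server.job_queue.job_store import JobQueueStore
    from agent_brain_server.services.indexing_service import IndexingService

    # Module path for JobWorker is assumed; only the class name appears in this diff.
    from agent_brain_server.job_queue.worker import JobWorker

    async def run_worker(job_store: JobQueueStore, indexing_service: IndexingService) -> None:
        # One worker per process: it polls the store (FIFO) and runs at most one job at a time.
        worker = JobWorker(
            job_store,
            indexing_service,
            max_runtime_seconds=3600,         # override the 2-hour default
            progress_checkpoint_interval=25,  # persist progress every 25 files
            poll_interval_seconds=0.5,
        )
        await worker.start()
        try:
            while worker.is_running:
                await asyncio.sleep(5)        # the application serves requests meanwhile
        finally:
            # Graceful shutdown: wait up to 30s for the in-flight job, then cancel the task.
            await worker.stop(timeout=30.0)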
@@ -1,15 +1,92 @@
- """File-based locking for doc-serve instances."""
+ """File-based locking for Agent Brain instances."""
 
- import fcntl
  import logging
  import os
+ import sys
  from pathlib import Path
- from typing import Optional
+ from typing import Callable, Optional
 
  logger = logging.getLogger(__name__)
 
- LOCK_FILE = "doc-serve.lock"
- PID_FILE = "doc-serve.pid"
+
+ # Platform-safe file locking functions
+ def _lock_exclusive_noop(fd: int) -> None:
+     """No-op exclusive lock for platforms without native support."""
+     pass
+
+
+ def _lock_nonblocking_noop(fd: int) -> bool:
+     """No-op non-blocking lock. Returns True (always succeeds)."""
+     return True
+
+
+ def _unlock_noop(fd: int) -> None:
+     """No-op unlock for platforms without native support."""
+     pass
+
+
+ # Initialize lock/unlock functions based on platform
+ _lock_exclusive: Callable[[int], None] = _lock_exclusive_noop
+ _try_lock_exclusive: Callable[[int], bool] = _lock_nonblocking_noop
+ _unlock: Callable[[int], None] = _unlock_noop
+ _lock_warning_shown = False
+
+ if sys.platform != "win32":
+     try:
+         import fcntl
+
+         def _lock_exclusive_fcntl(fd: int) -> None:
+             """Blocking exclusive lock using fcntl (POSIX)."""
+             fcntl.flock(fd, fcntl.LOCK_EX)
+
+         def _try_lock_exclusive_fcntl(fd: int) -> bool:
+             """Non-blocking exclusive lock using fcntl (POSIX)."""
+             try:
+                 fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                 return True
+             except OSError:
+                 return False
+
+         def _unlock_fcntl(fd: int) -> None:
+             """Unlock using fcntl (POSIX)."""
+             fcntl.flock(fd, fcntl.LOCK_UN)
+
+         _lock_exclusive = _lock_exclusive_fcntl
+         _try_lock_exclusive = _try_lock_exclusive_fcntl
+         _unlock = _unlock_fcntl
+     except ImportError:
+         pass
+ else:
+     try:
+         import msvcrt
+
+         def _lock_exclusive_msvcrt(fd: int) -> None:
+             """Blocking exclusive lock using msvcrt (Windows)."""
+             msvcrt.locking(fd, msvcrt.LK_LOCK, 1)
+
+         def _try_lock_exclusive_msvcrt(fd: int) -> bool:
+             """Non-blocking exclusive lock using msvcrt (Windows)."""
+             try:
+                 msvcrt.locking(fd, msvcrt.LK_NBLCK, 1)
+                 return True
+             except OSError:
+                 return False
+
+         def _unlock_msvcrt(fd: int) -> None:
+             """Unlock using msvcrt (Windows)."""
+             try:
+                 msvcrt.locking(fd, msvcrt.LK_UNLCK, 1)
+             except OSError:
+                 pass
+
+         _lock_exclusive = _lock_exclusive_msvcrt
+         _try_lock_exclusive = _try_lock_exclusive_msvcrt
+         _unlock = _unlock_msvcrt
+     except ImportError:
+         pass
+
+ LOCK_FILE = "agent-brain.lock"
+ PID_FILE = "agent-brain.pid"
 
  # Module-level storage for lock file descriptors
  _lock_fds: dict[str, int] = {}
@@ -26,12 +103,26 @@ def acquire_lock(state_dir: Path) -> bool:
      Returns:
          True if lock acquired, False if already held.
      """
+     global _lock_warning_shown
+
      state_dir.mkdir(parents=True, exist_ok=True)
      lock_path = state_dir / LOCK_FILE
 
+     # Warn once if no locking available
+     if _try_lock_exclusive is _lock_nonblocking_noop and not _lock_warning_shown:
+         logger.warning(
+             "File locking not available on this platform. "
+             "Multiple instances may conflict."
+         )
+         _lock_warning_shown = True
+
      try:
          fd = os.open(str(lock_path), os.O_CREAT | os.O_WRONLY)
-         fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+
+         if not _try_lock_exclusive(fd):
+             os.close(fd)
+             logger.warning(f"Lock already held: {lock_path}")
+             return False
 
          # Write PID
          pid_path = state_dir / PID_FILE
@@ -58,7 +149,7 @@ def release_lock(state_dir: Path) -> None:
      fd = _lock_fds.pop(str(state_dir), None)
      if fd is not None:
          try:
-             fcntl.flock(fd, fcntl.LOCK_UN)
+             _unlock(fd)
              os.close(fd)
          except OSError:
              pass
@@ -118,12 +209,14 @@ def cleanup_stale(state_dir: Path) -> None:
      """Clean up stale lock and PID files.
 
      Only cleans up if the lock is determined to be stale.
+     Note: Does NOT clean runtime.json - that's managed by the CLI
+     to avoid race conditions during server startup.
 
      Args:
          state_dir: Path to the state directory.
      """
      if is_stale(state_dir):
-         for fname in [LOCK_FILE, PID_FILE, "runtime.json"]:
+         for fname in [LOCK_FILE, PID_FILE]:
              fpath = state_dir / fname
              if fpath.exists():
                  try:
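The locking module now chooses an implementation at import time: fcntl.flock on POSIX, msvcrt.locking on Windows, and a warn-once no-op when neither is importable, while callers keep using the same acquire_lock / release_lock / cleanup_stale functions. A minimal sketch of the single-instance guard follows; the module's import path and the state directory are assumptions, not shown in this diff.

    from pathlib import Path

    # Import path assumed; the diff does not show where this module lives in the package.
    from agent_brain_server.locking import acquire_lock, cleanup_stale, release_lock

    state_dir = Path.home() / ".agent-brain"  # illustrative state directory only

    cleanup_stale(state_dir)                  # remove lock/PID files left by a dead process
    if not acquire_lock(state_dir):           # non-blocking; False means another instance holds the lock
        raise SystemExit("Another Agent Brain instance is already running")
    try:
        ...  # run the server
    finally:
        release_lock(state_dir)               # unlock and close the descriptor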
@@ -3,6 +3,16 @@
  from .graph import GraphEntity, GraphIndexStatus, GraphQueryContext, GraphTriple
  from .health import HealthStatus, IndexingStatus
  from .index import IndexingState, IndexingStatusEnum, IndexRequest, IndexResponse
+ from .job import (
+     JobDetailResponse,
+     JobEnqueueResponse,
+     JobListResponse,
+     JobProgress,
+     JobRecord,
+     JobStatus,
+     JobSummary,
+     QueueStats,
+ )
  from .query import QueryMode, QueryRequest, QueryResponse, QueryResult
 
  __all__ = [
@@ -24,4 +34,13 @@ __all__ = [
      "GraphEntity",
      "GraphIndexStatus",
      "GraphQueryContext",
+     # Job queue models (Feature 115)
+     "JobStatus",
+     "JobProgress",
+     "JobRecord",
+     "JobEnqueueResponse",
+     "JobListResponse",
+     "JobSummary",
+     "JobDetailResponse",
+     "QueueStats",
  ]
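With these re-exports in place, callers can import the job models from the package root, the same style the worker file above uses for IndexingState. A small illustration; the field values are made up, but the field names and enum members come from the worker code in this diff.

    from datetime import datetime, timezone

    from agent_brain_server.models import JobProgress, JobStatus

    status = JobStatus.RUNNING  # members used by the worker: RUNNING, DONE, FAILED, CANCELLED
    progress = JobProgress(
        files_processed=10,
        files_total=40,
        chunks_created=0,
        current_file="docs/intro.md",  # illustrative value only
        updated_at=datetime.now(timezone.utc),
    )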
@@ -110,6 +110,21 @@ class IndexingStatus(BaseModel):
          default=None,
          description="Graph index status with entity_count, relationship_count, etc.",
      )
+     # Queue status (Feature 115)
+     queue_pending: int = Field(
+         default=0,
+         ge=0,
+         description="Number of pending jobs in the queue",
+     )
+     queue_running: int = Field(
+         default=0,
+         ge=0,
+         description="Number of running jobs (0 or 1)",
+     )
+     current_job_running_time_ms: Optional[int] = Field(
+         None,
+         description="Running time of current job in milliseconds",
+     )
 
      model_config = {
          "json_schema_extra": {