agent-brain-rag 2.0.0__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {agent_brain_rag-2.0.0.dist-info → agent_brain_rag-3.0.0.dist-info}/METADATA +3 -4
- {agent_brain_rag-2.0.0.dist-info → agent_brain_rag-3.0.0.dist-info}/RECORD +26 -20
- {agent_brain_rag-2.0.0.dist-info → agent_brain_rag-3.0.0.dist-info}/WHEEL +1 -1
- {agent_brain_rag-2.0.0.dist-info → agent_brain_rag-3.0.0.dist-info}/entry_points.txt +0 -1
- agent_brain_server/__init__.py +1 -1
- agent_brain_server/api/main.py +118 -45
- agent_brain_server/api/routers/__init__.py +2 -0
- agent_brain_server/api/routers/health.py +85 -22
- agent_brain_server/api/routers/index.py +108 -36
- agent_brain_server/api/routers/jobs.py +111 -0
- agent_brain_server/config/provider_config.py +63 -19
- agent_brain_server/config/settings.py +10 -4
- agent_brain_server/indexing/bm25_index.py +15 -2
- agent_brain_server/indexing/document_loader.py +45 -4
- agent_brain_server/job_queue/__init__.py +11 -0
- agent_brain_server/job_queue/job_service.py +317 -0
- agent_brain_server/job_queue/job_store.py +427 -0
- agent_brain_server/job_queue/job_worker.py +434 -0
- agent_brain_server/locking.py +101 -8
- agent_brain_server/models/__init__.py +19 -0
- agent_brain_server/models/health.py +15 -0
- agent_brain_server/models/job.py +289 -0
- agent_brain_server/models/query.py +2 -2
- agent_brain_server/project_root.py +1 -1
- agent_brain_server/runtime.py +2 -2
- agent_brain_server/storage_paths.py +3 -3
agent_brain_server/job_queue/job_worker.py
ADDED
@@ -0,0 +1,434 @@
+"""Background job worker that processes indexing jobs from the queue."""
+
+import asyncio
+import logging
+from datetime import datetime, timezone
+from typing import Optional
+
+from agent_brain_server.job_queue.job_store import JobQueueStore
+from agent_brain_server.models import IndexingState, IndexingStatusEnum, IndexRequest
+from agent_brain_server.models.job import JobProgress, JobRecord, JobStatus
+from agent_brain_server.services.indexing_service import IndexingService
+
+logger = logging.getLogger(__name__)
+
+
+class CancellationRequestedError(Exception):
+    """Raised when a job cancellation is requested."""
+
+    pass
+
+
+class JobWorker:
+    """Background asyncio task that polls for and processes indexing jobs.
+
+    Features:
+    - Polls for pending jobs from JobQueueStore
+    - Processes one job at a time (concurrency=1)
+    - Timeout support with configurable MAX_RUNTIME_SECONDS
+    - Cancellation via cancel_requested flag on JobRecord
+    - Progress updates at configurable intervals
+    - Verifies ChromaDB has chunks after indexing before marking DONE
+
+    Example:
+        worker = JobWorker(job_store, indexing_service)
+        await worker.start()
+        # ... later ...
+        await worker.stop()
+    """
+
+    # Default configuration
+    MAX_RUNTIME_SECONDS: int = 7200  # 2 hours
+    PROGRESS_CHECKPOINT_INTERVAL: int = 50  # Update progress every N files
+    POLL_INTERVAL_SECONDS: float = 1.0  # Poll for new jobs every N seconds
+
+    def __init__(
+        self,
+        job_store: JobQueueStore,
+        indexing_service: IndexingService,
+        max_runtime_seconds: Optional[int] = None,
+        progress_checkpoint_interval: Optional[int] = None,
+        poll_interval_seconds: Optional[float] = None,
+    ):
+        """Initialize the job worker.
+
+        Args:
+            job_store: Job queue store for persistence.
+            indexing_service: Indexing service for processing jobs.
+            max_runtime_seconds: Maximum job runtime before timeout.
+            progress_checkpoint_interval: Update progress every N files.
+            poll_interval_seconds: Poll interval for checking new jobs.
+        """
+        self._job_store = job_store
+        self._indexing_service = indexing_service
+
+        # Configuration (use instance values or defaults)
+        self._max_runtime_seconds = max_runtime_seconds or self.MAX_RUNTIME_SECONDS
+        self._progress_interval = (
+            progress_checkpoint_interval or self.PROGRESS_CHECKPOINT_INTERVAL
+        )
+        self._poll_interval = poll_interval_seconds or self.POLL_INTERVAL_SECONDS
+
+        # Internal state
+        self._running = False
+        self._task: Optional[asyncio.Task[None]] = None
+        self._current_job: Optional[JobRecord] = None
+        self._stop_event = asyncio.Event()
+
+    @property
+    def is_running(self) -> bool:
+        """Check if the worker is currently running."""
+        return self._running and self._task is not None and not self._task.done()
+
+    @property
+    def current_job(self) -> Optional[JobRecord]:
+        """Get the currently processing job, if any."""
+        return self._current_job
+
+    async def start(self) -> None:
+        """Start the background worker task.
+
+        Creates an asyncio task that polls for pending jobs and processes them.
+        Safe to call multiple times (subsequent calls are no-ops if already running).
+        """
+        if self._running:
+            logger.warning("JobWorker already running")
+            return
+
+        self._running = True
+        self._stop_event.clear()
+        self._task = asyncio.create_task(self._run_loop())
+        logger.info("JobWorker started")
+
+    async def stop(self, timeout: float = 30.0) -> None:
+        """Gracefully stop the worker.
+
+        Signals the worker to stop and waits for the current job to complete
+        or for the timeout to expire.
+
+        Args:
+            timeout: Maximum seconds to wait for graceful shutdown.
+        """
+        if not self._running:
+            return
+
+        logger.info("JobWorker stopping...")
+        self._running = False
+        self._stop_event.set()
+
+        if self._task is not None:
+            try:
+                await asyncio.wait_for(self._task, timeout=timeout)
+            except asyncio.TimeoutError:
+                logger.warning(
+                    f"JobWorker did not stop within {timeout}s, cancelling task"
+                )
+                self._task.cancel()
+                try:
+                    await self._task
+                except asyncio.CancelledError:
+                    pass
+
+        self._task = None
+        self._current_job = None
+        logger.info("JobWorker stopped")
+
+    async def _run_loop(self) -> None:
+        """Main worker loop that polls for and processes jobs.
+
+        Continuously polls for pending jobs and processes them one at a time.
+        Exits when stop() is called.
+        """
+        logger.info("JobWorker run loop started")
+
+        while self._running:
+            try:
+                # Check for pending jobs
+                pending_jobs = await self._job_store.get_pending_jobs()
+
+                if pending_jobs:
+                    # Process the first pending job (FIFO)
+                    job = pending_jobs[0]
+                    await self._process_job(job)
+                else:
+                    # No jobs, wait before polling again
+                    try:
+                        await asyncio.wait_for(
+                            self._stop_event.wait(),
+                            timeout=self._poll_interval,
+                        )
+                        # If we get here, stop was requested
+                        break
+                    except asyncio.TimeoutError:
+                        # Normal timeout, continue polling
+                        pass
+
+            except Exception as e:
+                logger.error(f"Error in job worker loop: {e}", exc_info=True)
+                # Brief pause before retrying to avoid tight error loop
+                await asyncio.sleep(1.0)
+
+        logger.info("JobWorker run loop exited")
+
+    async def _process_job(self, job: JobRecord) -> None:
+        """Process a single indexing job.
+
+        Marks the job as RUNNING, executes the indexing pipeline with timeout,
+        and marks the job as DONE, FAILED, or CANCELLED based on outcome.
+
+        Args:
+            job: The job record to process.
+        """
+        logger.info(f"Processing job {job.id} for {job.folder_path}")
+        self._current_job = job
+
+        try:
+            # Mark job as RUNNING
+            job.status = JobStatus.RUNNING
+            job.started_at = datetime.now(timezone.utc)
+            job.progress = JobProgress(
+                files_processed=0,
+                files_total=0,
+                chunks_created=0,
+                current_file="",
+                updated_at=datetime.now(timezone.utc),
+            )
+            await self._job_store.update_job(job)
+
+            # Set IndexingService state to indicate indexing is in progress
+            # This ensures status endpoints reflect the correct state
+            async with self._indexing_service._lock:
+                self._indexing_service._state = IndexingState(
+                    current_job_id=job.id,
+                    status=IndexingStatusEnum.INDEXING,
+                    is_indexing=True,
+                    folder_path=job.folder_path,
+                    started_at=job.started_at,
+                    completed_at=None,
+                    error=None,
+                )
+
+            # Create IndexRequest from JobRecord
+            index_request = IndexRequest(
+                folder_path=job.folder_path,
+                include_code=job.include_code,
+                chunk_size=job.chunk_size,
+                chunk_overlap=job.chunk_overlap,
+                recursive=job.recursive,
+                generate_summaries=job.generate_summaries,
+                supported_languages=job.supported_languages,
+                include_patterns=job.include_patterns,
+                exclude_patterns=job.exclude_patterns,
+            )
+
+            # Create progress callback that checks for cancellation
+            async def progress_callback(current: int, total: int, message: str) -> None:
+                """Progress callback that updates job and checks for cancellation."""
+                # Re-fetch job to check for cancellation request
+                refreshed_job = await self._job_store.get_job(job.id)
+                if refreshed_job and refreshed_job.cancel_requested:
+                    logger.info(f"Cancellation requested for job {job.id}")
+                    raise CancellationRequestedError(
+                        f"Job {job.id} cancellation requested"
+                    )
+
+                # Update progress at intervals
+                if (
+                    job.progress is None
+                    or current - job.progress.files_processed >= self._progress_interval
+                    or current == total
+                ):
+                    job.progress = JobProgress(
+                        files_processed=current,
+                        files_total=total,
+                        chunks_created=0,  # Will be updated at end
+                        current_file=message,
+                        updated_at=datetime.now(timezone.utc),
+                    )
+                    await self._job_store.update_job(job)
+
+            # Get chunk count before indexing for delta verification
+            count_before = 0
+            try:
+                vector_store = self._indexing_service.vector_store
+                if vector_store.is_initialized:
+                    count_before = await vector_store.get_count()
+            except Exception as e:
+                logger.warning(f"Could not get count before indexing: {e}")
+
+            # Execute indexing with timeout
+            try:
+                await asyncio.wait_for(
+                    self._indexing_service._run_indexing_pipeline(
+                        index_request, job.id, progress_callback
+                    ),
+                    timeout=self._max_runtime_seconds,
+                )
+            except asyncio.TimeoutError:
+                logger.error(
+                    f"Job {job.id} timed out after {self._max_runtime_seconds}s"
+                )
+                job.status = JobStatus.FAILED
+                job.error = f"Job timed out after {self._max_runtime_seconds} seconds"
+                job.finished_at = datetime.now(timezone.utc)
+                await self._job_store.update_job(job)
+
+                # Clear IndexingService state on timeout
+                async with self._indexing_service._lock:
+                    self._indexing_service._state = IndexingState(
+                        current_job_id=job.id,
+                        status=IndexingStatusEnum.FAILED,
+                        is_indexing=False,
+                        folder_path=job.folder_path,
+                        started_at=job.started_at,
+                        completed_at=job.finished_at,
+                        error=job.error,
+                    )
+                return
+
+            # Verify collection has new chunks (delta verification)
+            verification_passed = await self._verify_collection_delta(job, count_before)
+
+            if verification_passed:
+                # Get final chunk count from indexing service status
+                status = await self._indexing_service.get_status()
+                job.total_chunks = status.get("total_chunks", 0)
+                job.total_documents = status.get("total_documents", 0)
+
+                # Update final progress
+                if job.progress:
+                    job.progress = JobProgress(
+                        files_processed=job.progress.files_total,
+                        files_total=job.progress.files_total,
+                        chunks_created=job.total_chunks,
+                        current_file="Complete",
+                        updated_at=datetime.now(timezone.utc),
+                    )
+
+                job.status = JobStatus.DONE
+                job.finished_at = datetime.now(timezone.utc)
+                logger.info(
+                    f"Job {job.id} completed: {job.total_documents} docs, "
+                    f"{job.total_chunks} chunks"
+                )
+
+                # Clear IndexingService state on success
+                async with self._indexing_service._lock:
+                    self._indexing_service._state = IndexingState(
+                        current_job_id=job.id,
+                        status=IndexingStatusEnum.COMPLETED,
+                        is_indexing=False,
+                        folder_path=job.folder_path,
+                        started_at=job.started_at,
+                        completed_at=job.finished_at,
+                        error=None,
+                    )
+            else:
+                job.status = JobStatus.FAILED
+                job.error = "Verification failed: No chunks found in vector store"
+                job.finished_at = datetime.now(timezone.utc)
+                logger.error(f"Job {job.id} verification failed: no chunks in store")
+
+                # Clear IndexingService state on verification failure
+                async with self._indexing_service._lock:
+                    self._indexing_service._state = IndexingState(
+                        current_job_id=job.id,
+                        status=IndexingStatusEnum.FAILED,
+                        is_indexing=False,
+                        folder_path=job.folder_path,
+                        started_at=job.started_at,
+                        completed_at=job.finished_at,
+                        error=job.error,
+                    )
+
+            await self._job_store.update_job(job)
+
+        except CancellationRequestedError:
+            job.status = JobStatus.CANCELLED
+            job.error = "Job was cancelled by user request"
+            job.finished_at = datetime.now(timezone.utc)
+            await self._job_store.update_job(job)
+
+            # Clear IndexingService state on cancellation
+            async with self._indexing_service._lock:
+                self._indexing_service._state = IndexingState(
+                    current_job_id=job.id,
+                    status=IndexingStatusEnum.IDLE,
+                    is_indexing=False,
+                    folder_path=job.folder_path,
+                    started_at=job.started_at,
+                    completed_at=job.finished_at,
+                    error=job.error,
+                )
+            logger.info(f"Job {job.id} cancelled")
+
+        except Exception as e:
+            logger.error(f"Job {job.id} failed with error: {e}", exc_info=True)
+            job.status = JobStatus.FAILED
+            job.error = str(e)
+            job.finished_at = datetime.now(timezone.utc)
+            await self._job_store.update_job(job)
+
+            # Clear IndexingService state on error
+            async with self._indexing_service._lock:
+                self._indexing_service._state = IndexingState(
+                    current_job_id=job.id,
+                    status=IndexingStatusEnum.FAILED,
+                    is_indexing=False,
+                    folder_path=job.folder_path,
+                    started_at=job.started_at,
+                    completed_at=job.finished_at,
+                    error=job.error,
+                )
+
+        finally:
+            self._current_job = None
+
+    async def _verify_collection_delta(self, job: JobRecord, count_before: int) -> bool:
+        """Verify that the vector store has new chunks after indexing.
+
+        Uses delta verification (count_after - count_before) to avoid false
+        positives when prior chunks exist but the job added nothing.
+
+        Args:
+            job: The job record to verify.
+            count_before: Chunk count before indexing started.
+
+        Returns:
+            True if verification passed (new chunks added), False otherwise.
+        """
+        try:
+            vector_store = self._indexing_service.vector_store
+            count_after = await vector_store.get_count()
+            delta = count_after - count_before
+
+            if delta > 0:
+                logger.info(
+                    f"Verification passed for job {job.id}: "
+                    f"{delta} new chunks (before={count_before}, after={count_after})"
+                )
+                return True
+            elif count_after > 0 and delta == 0:
+                # Special case: job might have processed files that were already indexed
+                # Check if any documents were processed
+                if job.progress and job.progress.files_processed > 0:
+                    logger.warning(
+                        f"Job {job.id} processed {job.progress.files_processed} files "
+                        f"but added no new chunks (may have been already indexed)"
+                    )
+                    # Consider this a success if files were processed
+                    return True
+                logger.warning(
+                    f"Verification failed for job {job.id}: no new chunks added "
+                    f"(before={count_before}, after={count_after})"
+                )
+                return False
+            else:
+                logger.warning(
+                    f"Verification failed for job {job.id}: no chunks in vector store"
+                )
+                return False
+
+        except Exception as e:
+            logger.error(f"Verification error for job {job.id}: {e}")
+            return False
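The new module above only defines the worker; the wiring into the HTTP server lives in agent_brain_server/api/main.py, which this excerpt does not reproduce. As a rough orientation, here is a minimal sketch of how the start/stop API shown above could be driven around a server's lifetime. The run_with_worker helper, the serve coroutine, and the way job_store and indexing_service are obtained are assumptions for illustration, not code from the package.

import asyncio

from agent_brain_server.job_queue.job_store import JobQueueStore
from agent_brain_server.job_queue.job_worker import JobWorker
from agent_brain_server.services.indexing_service import IndexingService


async def run_with_worker(
    job_store: JobQueueStore,
    indexing_service: IndexingService,
    serve,  # hypothetical coroutine that runs the API server
) -> None:
    """Sketch only: keep one background worker alive for the server's lifetime."""
    worker = JobWorker(job_store, indexing_service)
    await worker.start()  # spawns the polling task (one job at a time)
    try:
        await serve()
    finally:
        # Waits up to 30s for the current job, then cancels the task.
        await worker.stop(timeout=30.0)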
agent_brain_server/locking.py
CHANGED
@@ -1,15 +1,92 @@
-"""File-based locking for
+"""File-based locking for Agent Brain instances."""

-import fcntl
 import logging
 import os
+import sys
 from pathlib import Path
-from typing import Optional
+from typing import Callable, Optional

 logger = logging.getLogger(__name__)

-
-
+
+# Platform-safe file locking functions
+def _lock_exclusive_noop(fd: int) -> None:
+    """No-op exclusive lock for platforms without native support."""
+    pass
+
+
+def _lock_nonblocking_noop(fd: int) -> bool:
+    """No-op non-blocking lock. Returns True (always succeeds)."""
+    return True
+
+
+def _unlock_noop(fd: int) -> None:
+    """No-op unlock for platforms without native support."""
+    pass
+
+
+# Initialize lock/unlock functions based on platform
+_lock_exclusive: Callable[[int], None] = _lock_exclusive_noop
+_try_lock_exclusive: Callable[[int], bool] = _lock_nonblocking_noop
+_unlock: Callable[[int], None] = _unlock_noop
+_lock_warning_shown = False
+
+if sys.platform != "win32":
+    try:
+        import fcntl
+
+        def _lock_exclusive_fcntl(fd: int) -> None:
+            """Blocking exclusive lock using fcntl (POSIX)."""
+            fcntl.flock(fd, fcntl.LOCK_EX)
+
+        def _try_lock_exclusive_fcntl(fd: int) -> bool:
+            """Non-blocking exclusive lock using fcntl (POSIX)."""
+            try:
+                fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
+                return True
+            except OSError:
+                return False
+
+        def _unlock_fcntl(fd: int) -> None:
+            """Unlock using fcntl (POSIX)."""
+            fcntl.flock(fd, fcntl.LOCK_UN)
+
+        _lock_exclusive = _lock_exclusive_fcntl
+        _try_lock_exclusive = _try_lock_exclusive_fcntl
+        _unlock = _unlock_fcntl
+    except ImportError:
+        pass
+else:
+    try:
+        import msvcrt
+
+        def _lock_exclusive_msvcrt(fd: int) -> None:
+            """Blocking exclusive lock using msvcrt (Windows)."""
+            msvcrt.locking(fd, msvcrt.LK_LOCK, 1)
+
+        def _try_lock_exclusive_msvcrt(fd: int) -> bool:
+            """Non-blocking exclusive lock using msvcrt (Windows)."""
+            try:
+                msvcrt.locking(fd, msvcrt.LK_NBLCK, 1)
+                return True
+            except OSError:
+                return False
+
+        def _unlock_msvcrt(fd: int) -> None:
+            """Unlock using msvcrt (Windows)."""
+            try:
+                msvcrt.locking(fd, msvcrt.LK_UNLCK, 1)
+            except OSError:
+                pass
+
+        _lock_exclusive = _lock_exclusive_msvcrt
+        _try_lock_exclusive = _try_lock_exclusive_msvcrt
+        _unlock = _unlock_msvcrt
+    except ImportError:
+        pass
+
+LOCK_FILE = "agent-brain.lock"
+PID_FILE = "agent-brain.pid"

 # Module-level storage for lock file descriptors
 _lock_fds: dict[str, int] = {}
@@ -26,12 +103,26 @@ def acquire_lock(state_dir: Path) -> bool:
     Returns:
         True if lock acquired, False if already held.
     """
+    global _lock_warning_shown
+
     state_dir.mkdir(parents=True, exist_ok=True)
     lock_path = state_dir / LOCK_FILE

+    # Warn once if no locking available
+    if _try_lock_exclusive is _lock_nonblocking_noop and not _lock_warning_shown:
+        logger.warning(
+            "File locking not available on this platform. "
+            "Multiple instances may conflict."
+        )
+        _lock_warning_shown = True
+
     try:
         fd = os.open(str(lock_path), os.O_CREAT | os.O_WRONLY)
-
+
+        if not _try_lock_exclusive(fd):
+            os.close(fd)
+            logger.warning(f"Lock already held: {lock_path}")
+            return False

         # Write PID
         pid_path = state_dir / PID_FILE
@@ -58,7 +149,7 @@ def release_lock(state_dir: Path) -> None:
     fd = _lock_fds.pop(str(state_dir), None)
     if fd is not None:
         try:
-
+            _unlock(fd)
             os.close(fd)
         except OSError:
             pass
@@ -118,12 +209,14 @@ def cleanup_stale(state_dir: Path) -> None:
     """Clean up stale lock and PID files.

     Only cleans up if the lock is determined to be stale.
+    Note: Does NOT clean runtime.json - that's managed by the CLI
+    to avoid race conditions during server startup.

     Args:
         state_dir: Path to the state directory.
     """
     if is_stale(state_dir):
-        for fname in [LOCK_FILE, PID_FILE
+        for fname in [LOCK_FILE, PID_FILE]:
             fpath = state_dir / fname
             if fpath.exists():
                 try:
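The hunks above show only the changed parts of locking.py, but the signatures in the hunk headers (acquire_lock, release_lock, cleanup_stale, each taking a state_dir Path, with acquire_lock returning False when the lock is already held) are enough to illustrate the call pattern the refactor preserves. The snippet below is a usage sketch under that assumption; the state directory value is illustrative, not taken from the diff.

from pathlib import Path

from agent_brain_server import locking

state_dir = Path.home() / ".agent-brain"  # illustrative location

locking.cleanup_stale(state_dir)      # drop lock/PID files left by a dead process
if not locking.acquire_lock(state_dir):
    # Non-blocking: another live instance already holds the lock.
    raise SystemExit("agent-brain server already running for this state directory")
try:
    ...  # run the server
finally:
    locking.release_lock(state_dir)   # unlock and close the lock file descriptor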
agent_brain_server/models/__init__.py
CHANGED
@@ -3,6 +3,16 @@
 from .graph import GraphEntity, GraphIndexStatus, GraphQueryContext, GraphTriple
 from .health import HealthStatus, IndexingStatus
 from .index import IndexingState, IndexingStatusEnum, IndexRequest, IndexResponse
+from .job import (
+    JobDetailResponse,
+    JobEnqueueResponse,
+    JobListResponse,
+    JobProgress,
+    JobRecord,
+    JobStatus,
+    JobSummary,
+    QueueStats,
+)
 from .query import QueryMode, QueryRequest, QueryResponse, QueryResult

 __all__ = [
@@ -24,4 +34,13 @@ __all__ = [
     "GraphEntity",
     "GraphIndexStatus",
     "GraphQueryContext",
+    # Job queue models (Feature 115)
+    "JobStatus",
+    "JobProgress",
+    "JobRecord",
+    "JobEnqueueResponse",
+    "JobListResponse",
+    "JobSummary",
+    "JobDetailResponse",
+    "QueueStats",
 ]
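Because these models are now re-exported from the package's models namespace, downstream code can import the job types directly from agent_brain_server.models rather than from models.job; a trivial sketch:

from agent_brain_server.models import JobProgress, JobRecord, JobStatus, QueueStats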
agent_brain_server/models/health.py
CHANGED
@@ -110,6 +110,21 @@ class IndexingStatus(BaseModel):
         default=None,
         description="Graph index status with entity_count, relationship_count, etc.",
     )
+    # Queue status (Feature 115)
+    queue_pending: int = Field(
+        default=0,
+        ge=0,
+        description="Number of pending jobs in the queue",
+    )
+    queue_running: int = Field(
+        default=0,
+        ge=0,
+        description="Number of running jobs (0 or 1)",
+    )
+    current_job_running_time_ms: Optional[int] = Field(
+        None,
+        description="Running time of current job in milliseconds",
+    )

     model_config = {
         "json_schema_extra": {