agent-brain-rag 2.0.0__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,427 @@
1
+ """JSONL-based persistent job queue store with atomic writes and file locking."""
2
+
3
+ import asyncio
4
+ import logging
5
+ import os
6
+ import sys
7
+ from datetime import datetime, timezone
8
+ from pathlib import Path
9
+ from types import TracebackType
10
+ from typing import IO, Any, Callable, Literal, Optional
11
+
12
+ from agent_brain_server.models.job import JobRecord, JobStatus, QueueStats
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ # Platform-safe file locking functions
18
+ # These are defined based on platform to provide consistent API
19
def _lock_file_noop(fd: int) -> None:
    """Fallback lock that does nothing.

    Installed on platforms where neither fcntl nor msvcrt can be imported,
    so callers always have a lock function to invoke.
    """
    return None
22
+
23
+
24
def _unlock_file_noop(fd: int) -> None:
    """Fallback unlock that does nothing.

    Installed on platforms where neither fcntl nor msvcrt can be imported,
    so callers always have an unlock function to invoke.
    """
    return None
27
+
28
+
29
# Bind the module-level lock/unlock callables. They start as the no-op
# fallbacks and are swapped for native implementations below when the
# platform's locking module imports successfully.
_lock_file: Callable[[int], None] = _lock_file_noop
_unlock_file: Callable[[int], None] = _unlock_file_noop
_lock_warning_shown = False

if sys.platform == "win32":
    try:
        import msvcrt

        def _msvcrt_lock(fd: int) -> None:
            """Lock file using msvcrt (Windows)."""
            msvcrt.locking(fd, msvcrt.LK_LOCK, 1)

        def _msvcrt_unlock(fd: int) -> None:
            """Unlock file using msvcrt (Windows)."""
            try:
                msvcrt.locking(fd, msvcrt.LK_UNLCK, 1)
            except OSError:
                # Best-effort unlock: errors here are deliberately ignored.
                pass

        _lock_file = _msvcrt_lock
        _unlock_file = _msvcrt_unlock
    except ImportError:
        # Keep the no-op fallbacks bound above.
        pass
else:
    try:
        import fcntl

        def _fcntl_lock(fd: int) -> None:
            """Lock file using fcntl (POSIX)."""
            fcntl.flock(fd, fcntl.LOCK_EX)

        def _fcntl_unlock(fd: int) -> None:
            """Unlock file using fcntl (POSIX)."""
            fcntl.flock(fd, fcntl.LOCK_UN)

        _lock_file = _fcntl_lock
        _unlock_file = _fcntl_unlock
    except ImportError:
        # Keep the no-op fallbacks bound above.
        pass
70
+
71
+
72
class JobQueueStore:
    """JSONL-based persistent job queue with atomic writes and crash recovery.

    Features:
    - Append-only JSONL file for durability
    - Periodic snapshot compaction
    - File locking for multi-process safety
    - Restart recovery with stale job handling

    File structure (all under ``<state_dir>/jobs``):
    - index_queue.jsonl: Append-only job state changes
    - index_queue.snapshot: Full state snapshot for fast loading
    - .queue.lock: Lock file for file operations

    Concurrency model: an :class:`asyncio.Lock` serializes writers within
    this process, while the lock file guards the on-disk files against
    other processes.
    """

    # File names, relative to the "<state_dir>/jobs" directory.
    QUEUE_FILE = "index_queue.jsonl"
    SNAPSHOT_FILE = "index_queue.snapshot"
    LOCK_FILE = ".queue.lock"

    # Jobs interrupted by a restart are retried at most this many times
    # before being marked FAILED (see _handle_stale_jobs).
    MAX_RETRIES = 3
    COMPACT_THRESHOLD = 100  # Compact after N updates

    def __init__(self, state_dir: Path) -> None:
        """Initialize the job queue store.

        Args:
            state_dir: Directory for storing queue files. A "jobs"
                subdirectory is created beneath it if missing.
        """
        self._state_dir = state_dir
        self._jobs_dir = state_dir / "jobs"
        self._jobs_dir.mkdir(parents=True, exist_ok=True)

        self._queue_path = self._jobs_dir / self.QUEUE_FILE
        self._snapshot_path = self._jobs_dir / self.SNAPSHOT_FILE
        self._lock_path = self._jobs_dir / self.LOCK_FILE

        # In-memory state: authoritative job map keyed by job id, plus a
        # counter of appends since the last compaction.
        self._jobs: dict[str, JobRecord] = {}
        self._update_count = 0

        # Async lock for in-process synchronization of mutating operations.
        self._asyncio_lock = asyncio.Lock()

        logger.info(f"JobQueueStore initialized at {self._jobs_dir}")

    async def initialize(self) -> None:
        """Load jobs from persistent storage and handle stale RUNNING jobs.

        On startup:
        1. Load from snapshot if available
        2. Replay JSONL updates
        3. Reset stale RUNNING jobs to PENDING with retry tracking
        """
        async with self._asyncio_lock:
            await self._load_jobs()
            await self._handle_stale_jobs()

    async def _load_jobs(self) -> None:
        """Load jobs from snapshot and JSONL file.

        The snapshot is read first, then the JSONL log is replayed on top
        of it; because both write into the same dict keyed by job id, the
        later JSONL entries win, giving each job its most recent state.
        """
        self._jobs.clear()

        # Load from snapshot first (if exists)
        if self._snapshot_path.exists():
            try:
                with self._with_file_lock():
                    with open(self._snapshot_path) as f:
                        for line in f:
                            line = line.strip()
                            if line:
                                job = JobRecord.model_validate_json(line)
                                self._jobs[job.id] = job
                logger.info(f"Loaded {len(self._jobs)} jobs from snapshot")
            except Exception as e:
                # A corrupt snapshot is discarded entirely; the JSONL replay
                # below may still recover recent state.
                logger.error(f"Failed to load snapshot: {e}")
                self._jobs.clear()

        # Replay JSONL updates
        if self._queue_path.exists():
            try:
                with self._with_file_lock():
                    with open(self._queue_path) as f:
                        for line in f:
                            line = line.strip()
                            if line:
                                job = JobRecord.model_validate_json(line)
                                self._jobs[job.id] = job
                logger.info(f"Replayed JSONL updates, total jobs: {len(self._jobs)}")
            except Exception as e:
                logger.error(f"Failed to replay JSONL: {e}")

    async def _handle_stale_jobs(self) -> None:
        """Handle jobs that were RUNNING when server stopped.

        - Reset to PENDING if retry_count < MAX_RETRIES
        - Mark as FAILED if retry_count >= MAX_RETRIES

        Each affected job's retry_count is incremented first, then compared
        against MAX_RETRIES, and the new state is persisted.
        """
        stale_jobs = [
            job for job in self._jobs.values() if job.status == JobStatus.RUNNING
        ]

        for job in stale_jobs:
            job.retry_count += 1

            if job.retry_count > self.MAX_RETRIES:
                job.status = JobStatus.FAILED
                job.error = f"Max retries ({self.MAX_RETRIES}) exceeded after restart"
                job.finished_at = datetime.now(timezone.utc)
                logger.warning(
                    f"Job {job.id} permanently failed after {job.retry_count} retries"
                )
            else:
                # Clear run-specific state so the job re-enters the queue cleanly.
                job.status = JobStatus.PENDING
                job.started_at = None
                job.progress = None
                logger.info(f"Job {job.id} reset to PENDING (retry {job.retry_count})")

            await self._persist_job(job)

    def _with_file_lock(self) -> "_FileLock":
        """Return a context manager holding the inter-process file lock."""
        return _FileLock(self._lock_path)

    async def _persist_job(self, job: JobRecord) -> None:
        """Persist a job to JSONL with atomic write.

        NOTE: all call sites in this class hold self._asyncio_lock when
        calling this method; new callers should do the same.

        Args:
            job: Job record to persist.
        """
        with self._with_file_lock():
            with open(self._queue_path, "a") as f:
                f.write(job.model_dump_json() + "\n")
                # flush + fsync so the record is durable before we return.
                f.flush()
                os.fsync(f.fileno())

        self._update_count += 1

        # Compact if threshold exceeded
        if self._update_count >= self.COMPACT_THRESHOLD:
            await self._compact()

    async def _compact(self) -> None:
        """Compact queue by writing snapshot and truncating JSONL.

        The snapshot is written to a temp file and renamed into place so
        readers never observe a partially written snapshot.
        """
        logger.info("Compacting job queue...")

        with self._with_file_lock():
            # Write snapshot to temp file
            tmp_path = self._snapshot_path.with_suffix(".tmp")
            with open(tmp_path, "w") as f:
                for job in self._jobs.values():
                    f.write(job.model_dump_json() + "\n")
                f.flush()
                os.fsync(f.fileno())

            # Atomic rename
            tmp_path.rename(self._snapshot_path)

            # Truncate JSONL: its contents are now captured in the snapshot.
            with open(self._queue_path, "w") as f:
                f.truncate(0)
                f.flush()
                os.fsync(f.fileno())

        self._update_count = 0
        logger.info(f"Compaction complete: {len(self._jobs)} jobs in snapshot")

    async def append_job(self, job: JobRecord) -> int:
        """Append a new job to the queue.

        Args:
            job: Job record to append.

        Returns:
            Queue position (0-indexed): the number of other PENDING jobs
            already waiting ahead of this one.
        """
        async with self._asyncio_lock:
            self._jobs[job.id] = job
            await self._persist_job(job)

            # Calculate queue position
            pending_jobs = [
                j
                for j in self._jobs.values()
                if j.status == JobStatus.PENDING and j.id != job.id
            ]
            position = len(pending_jobs)

            logger.info(f"Job {job.id} appended at position {position}")
            return position

    async def update_job(self, job: JobRecord) -> None:
        """Update an existing job.

        Args:
            job: Job record with updated fields.

        Raises:
            KeyError: If no job with ``job.id`` exists in the store.
        """
        async with self._asyncio_lock:
            if job.id not in self._jobs:
                raise KeyError(f"Job {job.id} not found")

            self._jobs[job.id] = job
            await self._persist_job(job)

    async def get_job(self, job_id: str) -> Optional[JobRecord]:
        """Get a job by ID.

        Args:
            job_id: Job identifier.

        Returns:
            Job record or None if not found.
        """
        return self._jobs.get(job_id)

    async def find_by_dedupe_key(self, dedupe_key: str) -> Optional[JobRecord]:
        """Find an active job by deduplication key.

        Args:
            dedupe_key: SHA256 dedupe key.

        Returns:
            Matching job in PENDING or RUNNING status, or None.
        """
        for job in self._jobs.values():
            if job.dedupe_key == dedupe_key and job.status in (
                JobStatus.PENDING,
                JobStatus.RUNNING,
            ):
                return job
        return None

    async def get_pending_jobs(self) -> list[JobRecord]:
        """Get all pending jobs in FIFO order.

        Returns:
            List of pending jobs ordered by enqueue time (oldest first).
        """
        pending = [j for j in self._jobs.values() if j.status == JobStatus.PENDING]
        return sorted(pending, key=lambda j: j.enqueued_at)

    async def get_running_job(self) -> Optional[JobRecord]:
        """Get the currently running job, if any.

        Returns:
            The first job found in RUNNING status, or None.
        """
        for job in self._jobs.values():
            if job.status == JobStatus.RUNNING:
                return job
        return None

    async def get_all_jobs(self, limit: int = 50, offset: int = 0) -> list[JobRecord]:
        """Get all jobs with pagination.

        Args:
            limit: Maximum jobs to return.
            offset: Number of jobs to skip.

        Returns:
            List of jobs sorted by enqueue time (newest first).
        """
        all_jobs = sorted(
            self._jobs.values(), key=lambda j: j.enqueued_at, reverse=True
        )
        return all_jobs[offset : offset + limit]

    async def get_queue_stats(self) -> QueueStats:
        """Get statistics about the queue.

        Returns:
            QueueStats with per-status counts, the total, and — when a job
            is RUNNING — its id and elapsed running time in milliseconds.
        """
        pending = 0
        running = 0
        completed = 0
        failed = 0
        cancelled = 0
        current_job_id = None
        current_job_running_time_ms = None

        for job in self._jobs.values():
            if job.status == JobStatus.PENDING:
                pending += 1
            elif job.status == JobStatus.RUNNING:
                running += 1
                current_job_id = job.id
                if job.started_at:
                    delta = datetime.now(timezone.utc) - job.started_at
                    current_job_running_time_ms = int(delta.total_seconds() * 1000)
            elif job.status == JobStatus.DONE:
                completed += 1
            elif job.status == JobStatus.FAILED:
                failed += 1
            elif job.status == JobStatus.CANCELLED:
                cancelled += 1

        return QueueStats(
            pending=pending,
            running=running,
            completed=completed,
            failed=failed,
            cancelled=cancelled,
            total=len(self._jobs),
            current_job_id=current_job_id,
            current_job_running_time_ms=current_job_running_time_ms,
        )

    async def get_queue_length(self) -> int:
        """Get number of pending + running jobs.

        Returns:
            Count of jobs not yet completed.
        """
        return sum(
            1
            for j in self._jobs.values()
            if j.status in (JobStatus.PENDING, JobStatus.RUNNING)
        )
389
+
390
+
391
class _FileLock:
    """Platform-safe file locking context manager.

    Uses fcntl on POSIX (Linux, macOS) and msvcrt on Windows.
    Falls back to no-op locking if neither is available.

    The lock file is opened in ``__enter__`` and is always closed again —
    on a failed acquire as well as in ``__exit__`` — so the descriptor
    never leaks.
    """

    def __init__(self, lock_path: Path) -> None:
        # Path of the lock file; (re)created on each acquire via open("w").
        self._lock_path = lock_path
        # Open handle while the lock is held, None otherwise.
        self._lock_file: Optional[IO[Any]] = None

    def __enter__(self) -> "_FileLock":
        global _lock_warning_shown

        self._lock_file = open(self._lock_path, "w")

        # Warn once per process if we fell back to the no-op lock.
        if _lock_file is _lock_file_noop and not _lock_warning_shown:
            logger.warning(
                "File locking not available on this platform. "
                "Concurrent access may cause issues."
            )
            _lock_warning_shown = True

        try:
            _lock_file(self._lock_file.fileno())
        except BaseException:
            # Acquiring the lock failed: close the handle so the
            # descriptor does not leak, then re-raise.
            self._lock_file.close()
            self._lock_file = None
            raise
        return self

    def __exit__(
        self,
        exc_type: Optional[type[BaseException]],
        exc_val: Optional[BaseException],
        exc_tb: Optional[TracebackType],
    ) -> Literal[False]:
        if self._lock_file:
            try:
                _unlock_file(self._lock_file.fileno())
            finally:
                # Close even if unlock raises, and reset the handle so a
                # reused instance cannot act on a closed descriptor.
                self._lock_file.close()
                self._lock_file = None
        # Returning False never suppresses exceptions from the with-body.
        return False
+ return False