rrq 0.4.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff shows the content changes between publicly released versions of this package. The information is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
rrq/store.py CHANGED
@@ -4,12 +4,14 @@ with the Redis backend for storing and managing RRQ job data and queues.
4
4
 
5
5
  import json
6
6
  import logging
7
- from datetime import UTC, datetime
7
+ from datetime import timezone, datetime, timedelta
8
8
  from typing import Any, Optional
9
9
 
10
10
  from redis.asyncio import Redis as AsyncRedis
11
+ from redis.exceptions import RedisError
11
12
 
12
13
  from .constants import (
14
+ CONNECTION_POOL_MAX_CONNECTIONS,
13
15
  DEFAULT_DLQ_RESULT_TTL_SECONDS,
14
16
  JOB_KEY_PREFIX,
15
17
  LOCK_KEY_PREFIX,
@@ -21,11 +23,17 @@ from .settings import RRQSettings
21
23
 
22
24
  logger = logging.getLogger(__name__)
23
25
 
26
+
24
27
  class JobStore:
25
28
  """Provides an abstraction layer for interacting with Redis for RRQ operations.
26
29
 
27
30
  Handles serialization/deserialization, key management, and atomic operations
28
31
  related to jobs, queues, locks, and worker health.
32
+
33
+ Transaction Usage Guidelines:
34
+ - Use transaction=True for write operations that must be atomic (job updates, DLQ moves)
35
+ - Use transaction=False for read-only batch operations (health checks, queue size queries)
36
+ - All async context managers (async with) properly handle cleanup even on exceptions
29
37
  """
30
38
 
31
39
  def __init__(self, settings: RRQSettings):
@@ -36,9 +44,40 @@ class JobStore:
36
44
  """
37
45
  self.settings = settings
38
46
  self.redis = AsyncRedis.from_url(
39
- settings.redis_dsn, decode_responses=False
40
- ) # Work with bytes initially
41
-
47
+ settings.redis_dsn,
48
+ decode_responses=False,
49
+ max_connections=CONNECTION_POOL_MAX_CONNECTIONS,
50
+ retry_on_timeout=True,
51
+ socket_keepalive=True,
52
+ socket_keepalive_options={},
53
+ )
54
+
55
+ # LUA scripts for atomic operations
56
+ self._atomic_lock_and_remove_script = """
57
+ -- KEYS: [1] = lock_key, [2] = queue_key
58
+ -- ARGV: [1] = worker_id, [2] = lock_timeout_ms, [3] = job_id
59
+ local lock_result = redis.call('SET', KEYS[1], ARGV[1], 'NX', 'PX', ARGV[2])
60
+ if lock_result then
61
+ local removed_count = redis.call('ZREM', KEYS[2], ARGV[3])
62
+ if removed_count == 0 then
63
+ redis.call('DEL', KEYS[1]) -- Release lock if job wasn't in queue
64
+ return {0, 0} -- {lock_acquired, removed_count}
65
+ end
66
+ return {1, removed_count}
67
+ else
68
+ return {0, 0}
69
+ end
70
+ """
71
+
72
+ self._atomic_retry_script = """
73
+ -- KEYS: [1] = job_key, [2] = queue_key
74
+ -- ARGV: [1] = job_id, [2] = retry_at_score, [3] = error_message, [4] = status
75
+ local new_retry_count = redis.call('HINCRBY', KEYS[1], 'current_retries', 1)
76
+ redis.call('HMSET', KEYS[1], 'status', ARGV[4], 'last_error', ARGV[3])
77
+ redis.call('ZADD', KEYS[2], ARGV[2], ARGV[1])
78
+ return new_retry_count
79
+ """
80
+
42
81
  def _format_queue_key(self, queue_name: str) -> str:
43
82
  """Normalize a queue name or key into a Redis key for ZSET queues."""
44
83
 
@@ -60,37 +99,6 @@ class JobStore:
60
99
  """Closes the Redis connection pool associated with this store."""
61
100
  await self.redis.aclose()
62
101
 
63
- async def _serialize_job_field(self, value: Any) -> bytes:
64
- """Serializes a single field value for storing in a Redis hash."""
65
- # Pydantic models are dumped to dict, then JSON string, then bytes.
66
- # Basic types are JSON dumped directly.
67
- if hasattr(value, "model_dump_json"): # For Pydantic sub-models if any
68
- return value.model_dump_json().encode("utf-8")
69
- if isinstance(value, dict | list) or (
70
- hasattr(value, "__dict__") and not callable(value)
71
- ):
72
- # Fallback for other dict-like or list-like objects, and simple custom objects
73
- try:
74
- # Use Pydantic-aware JSON dumping if possible
75
- if hasattr(value, "model_dump"):
76
- value = value.model_dump(mode="json")
77
- return json.dumps(value, default=str).encode(
78
- "utf-8"
79
- ) # default=str for datetimes etc.
80
- except TypeError:
81
- return str(value).encode("utf-8") # Last resort
82
- return str(value).encode("utf-8") # For simple types like int, str, bool
83
-
84
- async def _deserialize_job_field(self, value_bytes: bytes) -> Any:
85
- """Deserializes a single field value from Redis bytes."""
86
- try:
87
- # Attempt to parse as JSON first, as most complex types will be stored this way.
88
- return json.loads(value_bytes.decode("utf-8"))
89
- except (json.JSONDecodeError, UnicodeDecodeError):
90
- # If it fails, it might be a simple string that wasn't JSON encoded (e.g. status enums)
91
- # or a raw byte representation that needs specific handling (not covered here yet)
92
- return value_bytes.decode("utf-8") # Fallback to string
93
-
94
102
  async def save_job_definition(self, job: Job) -> None:
95
103
  """Saves the complete job definition as a Redis hash.
96
104
 
@@ -214,6 +222,29 @@ class JobStore:
214
222
  )
215
223
  return None
216
224
 
225
+ async def get_job_data_dict(self, job_id: str) -> Optional[dict[str, str]]:
226
+ """Retrieves raw job data from Redis as a decoded dictionary.
227
+
228
+ This method provides a lightweight way to get job data for CLI commands
229
+ without the overhead of full Job object reconstruction and validation.
230
+
231
+ Args:
232
+ job_id: The unique ID of the job to retrieve.
233
+
234
+ Returns:
235
+ Dict with decoded string keys and values, or None if job not found.
236
+ """
237
+ job_key = f"{JOB_KEY_PREFIX}{job_id}"
238
+ job_data_raw_bytes = await self.redis.hgetall(job_key)
239
+
240
+ if not job_data_raw_bytes:
241
+ return None
242
+
243
+ # Decode all keys and values from bytes to str
244
+ return {
245
+ k.decode("utf-8"): v.decode("utf-8") for k, v in job_data_raw_bytes.items()
246
+ }
247
+
217
248
  async def add_job_to_queue(
218
249
  self, queue_name: str, job_id: str, score: float
219
250
  ) -> None:
@@ -263,7 +294,7 @@ class JobStore:
263
294
  if count <= 0:
264
295
  return []
265
296
  queue_key = self._format_queue_key(queue_name)
266
- now_ms = int(datetime.now(UTC).timestamp() * 1000)
297
+ now_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
267
298
  # Fetch jobs with score from -inf up to current time, limit by count
268
299
  job_ids_bytes = await self.redis.zrangebyscore(
269
300
  queue_key, min=float("-inf"), max=float(now_ms), start=0, num=count
@@ -308,6 +339,99 @@ class JobStore:
308
339
  logger.debug(f"Released lock for job {job_id} ({lock_key}).")
309
340
  # No need to log if lock didn't exist
310
341
 
342
+ async def atomic_lock_and_remove_job(
343
+ self, job_id: str, queue_name: str, worker_id: str, lock_timeout_ms: int
344
+ ) -> tuple[bool, int]:
345
+ """Atomically acquires a job lock and removes the job from the queue.
346
+
347
+ This is a critical operation that prevents race conditions between multiple
348
+ workers trying to process the same job.
349
+
350
+ Args:
351
+ job_id: The ID of the job to lock and remove.
352
+ queue_name: The name of the queue to remove the job from.
353
+ worker_id: The ID of the worker attempting to acquire the lock.
354
+ lock_timeout_ms: The lock timeout/TTL in milliseconds.
355
+
356
+ Returns:
357
+ A tuple of (lock_acquired: bool, removed_count: int).
358
+ - lock_acquired: True if the lock was successfully acquired
359
+ - removed_count: Number of jobs removed from the queue (0 or 1)
360
+ """
361
+ lock_key = f"{LOCK_KEY_PREFIX}{job_id}"
362
+ queue_key = self._format_queue_key(queue_name)
363
+
364
+ result = await self.redis.eval(
365
+ self._atomic_lock_and_remove_script,
366
+ 2, # Number of keys
367
+ lock_key,
368
+ queue_key,
369
+ worker_id.encode("utf-8"),
370
+ str(lock_timeout_ms),
371
+ job_id.encode("utf-8"),
372
+ )
373
+
374
+ lock_acquired = bool(result[0])
375
+ removed_count = int(result[1])
376
+
377
+ if lock_acquired and removed_count > 0:
378
+ logger.debug(
379
+ f"Worker {worker_id} atomically acquired lock and removed job {job_id} from queue '{queue_name}'."
380
+ )
381
+ elif not lock_acquired:
382
+ logger.debug(
383
+ f"Worker {worker_id} failed to acquire lock for job {job_id} (already locked by another worker)."
384
+ )
385
+ else:
386
+ logger.warning(
387
+ f"Worker {worker_id} acquired lock for job {job_id} but job was already removed from queue '{queue_name}'."
388
+ )
389
+
390
+ return lock_acquired, removed_count
391
+
392
+ async def atomic_retry_job(
393
+ self,
394
+ job_id: str,
395
+ queue_name: str,
396
+ retry_at_score: float,
397
+ error_message: str,
398
+ status: JobStatus,
399
+ ) -> int:
400
+ """Atomically increments job retry count, updates status/error, and re-queues the job.
401
+
402
+ This prevents race conditions in the retry logic where multiple operations
403
+ need to be performed atomically.
404
+
405
+ Args:
406
+ job_id: The ID of the job to retry.
407
+ queue_name: The name of the queue to add the job back to.
408
+ retry_at_score: The score (timestamp) when the job should be retried.
409
+ error_message: The error message to store.
410
+ status: The job status to set (usually RETRYING).
411
+
412
+ Returns:
413
+ The new retry count after incrementing.
414
+ """
415
+ job_key = f"{JOB_KEY_PREFIX}{job_id}"
416
+ queue_key = self._format_queue_key(queue_name)
417
+
418
+ new_retry_count = await self.redis.eval(
419
+ self._atomic_retry_script,
420
+ 2, # Number of keys
421
+ job_key,
422
+ queue_key,
423
+ job_id.encode("utf-8"),
424
+ str(retry_at_score),
425
+ error_message.encode("utf-8"),
426
+ status.value.encode("utf-8"),
427
+ )
428
+
429
+ new_count = int(new_retry_count)
430
+ logger.debug(
431
+ f"Atomically incremented retries for job {job_id} to {new_count} and re-queued for retry."
432
+ )
433
+ return new_count
434
+
311
435
  async def update_job_status(self, job_id: str, status: JobStatus) -> None:
312
436
  """Updates only the status field of a job in its Redis hash.
313
437
 
@@ -361,14 +485,23 @@ class JobStore:
361
485
  "completion_time": completion_time.isoformat().encode("utf-8"),
362
486
  }
363
487
 
364
- # Use pipeline for atomicity
488
+ # Use pipeline with transaction=True for atomic write operations
489
+ # This ensures all commands succeed or none do (ACID properties)
365
490
  async with self.redis.pipeline(transaction=True) as pipe:
366
- pipe.hset(job_key, mapping=update_data)
367
- pipe.lpush(dlq_redis_key, job_id.encode("utf-8"))
368
- pipe.expire(job_key, DEFAULT_DLQ_RESULT_TTL_SECONDS)
369
- results = await pipe.execute()
370
- logger.info(f"Moved job {job_id} to DLQ '{dlq_redis_key}'. Results: {results}")
371
-
491
+ try:
492
+ pipe.hset(job_key, mapping=update_data)
493
+ pipe.lpush(dlq_redis_key, job_id.encode("utf-8"))
494
+ pipe.expire(job_key, DEFAULT_DLQ_RESULT_TTL_SECONDS)
495
+ results = await pipe.execute()
496
+ logger.info(
497
+ f"Moved job {job_id} to DLQ '{dlq_redis_key}'. Results: {results}"
498
+ )
499
+ except RedisError as e:
500
+ logger.error(
501
+ f"Failed to move job {job_id} to DLQ '{dlq_redis_key}': {e}"
502
+ )
503
+ raise
504
+
372
505
  async def requeue_dlq(
373
506
  self,
374
507
  dlq_name: str,
@@ -396,7 +529,7 @@ class JobStore:
396
529
  break
397
530
  job_id = job_id_bytes.decode("utf-8")
398
531
  # Use current time for re-enqueue score
399
- now_ms = int(datetime.now(UTC).timestamp() * 1000)
532
+ now_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
400
533
  await self.add_job_to_queue(
401
534
  self._format_queue_key(target_queue),
402
535
  job_id,
@@ -504,7 +637,7 @@ class JobStore:
504
637
  0 means persist indefinitely. < 0 means leave existing TTL.
505
638
  """
506
639
  job_key = f"{JOB_KEY_PREFIX}{job_id}"
507
- completion_time = datetime.now(UTC)
640
+ completion_time = datetime.now(timezone.utc)
508
641
 
509
642
  # Serialize result to JSON string
510
643
  try:
@@ -526,17 +659,22 @@ class JobStore:
526
659
  "status": JobStatus.COMPLETED.value.encode("utf-8"),
527
660
  }
528
661
 
529
- # Use pipeline for atomicity of update + expire
662
+ # Use pipeline with transaction=True to atomically update and set TTL
663
+ # This prevents partial updates where result is saved but TTL isn't set
530
664
  async with self.redis.pipeline(transaction=True) as pipe:
531
- pipe.hset(job_key, mapping=update_data)
532
- if ttl_seconds > 0:
533
- pipe.expire(job_key, ttl_seconds)
534
- elif ttl_seconds == 0:
535
- pipe.persist(job_key)
536
- results = await pipe.execute()
537
- logger.debug(
538
- f"Saved result for job {job_id}. Status set to COMPLETED. TTL={ttl_seconds}. Results: {results}"
539
- )
665
+ try:
666
+ pipe.hset(job_key, mapping=update_data)
667
+ if ttl_seconds > 0:
668
+ pipe.expire(job_key, ttl_seconds)
669
+ elif ttl_seconds == 0:
670
+ pipe.persist(job_key)
671
+ results = await pipe.execute()
672
+ logger.debug(
673
+ f"Saved result for job {job_id}. Status set to COMPLETED. TTL={ttl_seconds}. Results: {results}"
674
+ )
675
+ except RedisError as e:
676
+ logger.error(f"Failed to save result for job {job_id}: {e}")
677
+ raise
540
678
 
541
679
  async def set_worker_health(
542
680
  self, worker_id: str, data: dict[str, Any], ttl_seconds: int
@@ -572,6 +710,8 @@ class JobStore:
572
710
  """
573
711
  health_key = f"rrq:health:worker:{worker_id}"
574
712
 
713
+ # Use pipeline with transaction=False for read-only batch operations
714
+ # No atomicity needed as we're only reading, this improves performance
575
715
  async with self.redis.pipeline(transaction=False) as pipe:
576
716
  pipe.get(health_key)
577
717
  pipe.ttl(health_key)
@@ -601,3 +741,141 @@ class JobStore:
601
741
  f"Retrieved health data for worker {worker_id}: TTL={final_ttl}, Data keys={list(health_data.keys()) if health_data else None}"
602
742
  )
603
743
  return health_data, final_ttl
744
+
745
+ async def get_job(self, job_id: str) -> Optional[dict[str, Any]]:
746
+ """Get simplified job data for monitoring/CLI purposes.
747
+
748
+ Returns a dictionary with basic job information, or None if job not found.
749
+ This is more lightweight than get_job_definition which returns full Job objects.
750
+ """
751
+ job_key = f"{JOB_KEY_PREFIX}{job_id}"
752
+ job_data = await self.redis.hgetall(job_key)
753
+
754
+ if not job_data:
755
+ return None
756
+
757
+ # Convert bytes to strings and return simplified dict
758
+ return {k.decode("utf-8"): v.decode("utf-8") for k, v in job_data.items()}
759
+
760
+ # Hybrid monitoring optimization methods
761
+ async def register_active_queue(self, queue_name: str) -> None:
762
+ """Register a queue as active in the monitoring registry"""
763
+ from .constants import ACTIVE_QUEUES_SET
764
+
765
+ timestamp = datetime.now(timezone.utc).timestamp()
766
+ await self.redis.zadd(ACTIVE_QUEUES_SET, {queue_name: timestamp})
767
+
768
+ async def register_active_worker(self, worker_id: str) -> None:
769
+ """Register a worker as active in the monitoring registry"""
770
+ from .constants import ACTIVE_WORKERS_SET
771
+
772
+ timestamp = datetime.now(timezone.utc).timestamp()
773
+ await self.redis.zadd(ACTIVE_WORKERS_SET, {worker_id: timestamp})
774
+
775
+ async def get_active_queues(self, max_age_seconds: int = 300) -> list[str]:
776
+ """Get list of recently active queues"""
777
+ from .constants import ACTIVE_QUEUES_SET
778
+
779
+ cutoff_time = datetime.now(timezone.utc).timestamp() - max_age_seconds
780
+
781
+ # Remove stale entries and get active ones
782
+ await self.redis.zremrangebyscore(ACTIVE_QUEUES_SET, 0, cutoff_time)
783
+ active_queues = await self.redis.zrange(ACTIVE_QUEUES_SET, 0, -1)
784
+
785
+ return [q.decode("utf-8") if isinstance(q, bytes) else q for q in active_queues]
786
+
787
+ async def get_active_workers(self, max_age_seconds: int = 60) -> list[str]:
788
+ """Get list of recently active workers"""
789
+ from .constants import ACTIVE_WORKERS_SET
790
+
791
+ cutoff_time = datetime.now(timezone.utc).timestamp() - max_age_seconds
792
+
793
+ # Remove stale entries and get active ones
794
+ await self.redis.zremrangebyscore(ACTIVE_WORKERS_SET, 0, cutoff_time)
795
+ active_workers = await self.redis.zrange(ACTIVE_WORKERS_SET, 0, -1)
796
+
797
+ return [
798
+ w.decode("utf-8") if isinstance(w, bytes) else w for w in active_workers
799
+ ]
800
+
801
+ async def publish_monitor_event(self, event_type: str, data: dict) -> None:
802
+ """Publish a monitoring event to the Redis stream"""
803
+ from .constants import MONITOR_EVENTS_STREAM
804
+
805
+ event_data = {
806
+ "event_type": event_type,
807
+ "timestamp": datetime.now(timezone.utc).timestamp(),
808
+ **data,
809
+ }
810
+
811
+ # Add to stream with max length to prevent unbounded growth
812
+ await self.redis.xadd(
813
+ MONITOR_EVENTS_STREAM, event_data, maxlen=1000, approximate=True
814
+ )
815
+
816
+ async def consume_monitor_events(
817
+ self, last_id: str = "0", count: int = 100, block: int = 50
818
+ ) -> list:
819
+ """Consume monitoring events from Redis stream"""
820
+ from .constants import MONITOR_EVENTS_STREAM
821
+
822
+ try:
823
+ events = await self.redis.xread(
824
+ {MONITOR_EVENTS_STREAM: last_id}, count=count, block=block
825
+ )
826
+ return events
827
+ except Exception:
828
+ # Handle timeout or other Redis errors gracefully
829
+ return []
830
+
831
+ async def get_lock_ttl(self, unique_key: str) -> int:
832
+ lock_key = f"{UNIQUE_JOB_LOCK_PREFIX}{unique_key}"
833
+ ttl = await self.redis.ttl(lock_key)
834
+ try:
835
+ ttl_int = int(ttl)
836
+ except (TypeError, ValueError):
837
+ ttl_int = 0
838
+ return ttl_int if ttl_int and ttl_int > 0 else 0
839
+
840
+ async def get_last_process_time(self, unique_key: str) -> Optional[datetime]:
841
+ key = f"last_process:{unique_key}"
842
+ timestamp = await self.redis.get(key)
843
+ return datetime.fromtimestamp(float(timestamp), timezone.utc) if timestamp else None
844
+
845
+ async def set_last_process_time(self, unique_key: str, timestamp: datetime) -> None:
846
+ key = f"last_process:{unique_key}"
847
+ # Add TTL to auto-expire the marker; independent of app specifics
848
+ ttl_seconds = max(60, int(self.settings.expected_job_ttl) * 2)
849
+ await self.redis.set(key, timestamp.timestamp(), ex=ttl_seconds)
850
+
851
+ async def get_unique_lock_holder(self, unique_key: str) -> Optional[str]:
852
+ """Return the job_id currently holding the unique lock, if any."""
853
+ lock_key = f"{UNIQUE_JOB_LOCK_PREFIX}{unique_key}"
854
+ value = await self.redis.get(lock_key)
855
+ return value.decode("utf-8") if value else None
856
+
857
+ async def defer_job(self, job: Job, defer_by: timedelta) -> None:
858
+ target_queue = job.queue_name or self.settings.default_queue_name
859
+ queue_key = self._format_queue_key(target_queue)
860
+ # Use milliseconds since epoch to be consistent with queue scores
861
+ score_ms = int((datetime.now(timezone.utc) + defer_by).timestamp() * 1000)
862
+ await self.redis.zadd(queue_key, {job.id.encode("utf-8"): float(score_ms)})
863
+ # Note: job was already removed from queue during acquisition.
864
+
865
+ async def batch_get_queue_sizes(self, queue_names: list[str]) -> dict[str, int]:
866
+ """Efficiently get sizes for multiple queues using pipeline"""
867
+ from .constants import QUEUE_KEY_PREFIX
868
+
869
+ if not queue_names:
870
+ return {}
871
+
872
+ # Use pipeline with transaction=False for read-only batch operations
873
+ # No atomicity needed as we're only reading, this improves performance
874
+ async with self.redis.pipeline(transaction=False) as pipe:
875
+ for queue_name in queue_names:
876
+ queue_key = f"{QUEUE_KEY_PREFIX}{queue_name}"
877
+ pipe.zcard(queue_key)
878
+
879
+ sizes = await pipe.execute()
880
+
881
+ return dict(zip(queue_names, sizes))