rrq 0.4.0__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rrq/worker.py CHANGED
@@ -7,11 +7,12 @@ import asyncio
  # Use standard logging instead of custom one if appropriate
  import logging
  import os
+ import random
  import signal
  import time
  import uuid
  from contextlib import suppress
- from datetime import UTC, datetime
+ from datetime import timezone, datetime
  from typing import (
  Any,
  Optional,
@@ -21,7 +22,6 @@ from rrq.client import RRQClient

  from .constants import (
  DEFAULT_WORKER_ID_PREFIX,
- JOB_KEY_PREFIX,
  )
  from .exc import RetryJob
  from .job import Job, JobStatus
@@ -91,6 +91,30 @@ class RRQWorker:
  f"Initializing RRQWorker {self.worker_id} for queues: {self.queues}"
  )

+ def _calculate_jittered_delay(
+ self, base_delay: float, jitter_factor: float = 0.5
+ ) -> float:
+ """Calculate a jittered delay to prevent thundering herd effects.
+
+ Args:
+ base_delay: The base delay in seconds.
+ jitter_factor: Factor for jitter (0.0 to 1.0). Default 0.5 means ±50% jitter.
+
+ Returns:
+ The jittered delay in seconds.
+ """
+ # Clamp jitter_factor to safe range to prevent negative delays
+ jitter_factor = max(0.0, min(jitter_factor, 0.99))
+
+ # Calculate jitter range: base_delay * (1 ± jitter_factor)
+ min_delay = base_delay * (1 - jitter_factor)
+ max_delay = base_delay * (1 + jitter_factor)
+
+ # Ensure min_delay is always positive
+ min_delay = max(0.001, min_delay)
+
+ return random.uniform(min_delay, max_delay)
+
  async def _call_startup_hook(self) -> None:
  if self.settings.on_startup:
  logger.info(f"Worker {self.worker_id} calling on_startup hook...")
@@ -171,14 +195,19 @@ class RRQWorker:
  self.status = "idle (concurrency limit)"
  # At concurrency limit, wait for tasks to finish or poll delay

- await asyncio.sleep(self.settings.default_poll_delay_seconds)
+ # Use jittered delay to prevent thundering herd effects
+ jittered_delay = self._calculate_jittered_delay(
+ self.settings.default_poll_delay_seconds
+ )
+ await asyncio.sleep(jittered_delay)
  except Exception as e:
  logger.error(
  f"Worker {self.worker_id} encountered error in main run loop: {e}",
  exc_info=True,
  )
- # Avoid tight loop on persistent errors
- await asyncio.sleep(1)
+ # Avoid tight loop on persistent errors with jittered delay
+ jittered_delay = self._calculate_jittered_delay(1.0)
+ await asyncio.sleep(jittered_delay)

  logger.info(
  f"Worker {self.worker_id} shutdown signal received. Draining tasks..."
@@ -222,53 +251,65 @@ class RRQWorker:
  if fetched_count >= count or self._shutdown_event.is_set():
  break

- # Attempt to acquire semaphore *before* trying to process
- await self._semaphore.acquire()
  try:
- # _try_process_job handles lock acquisition, fetching, task creation
- job_started = await self._try_process_job(job_id, queue_name)
- if job_started:
- fetched_count += 1
- else:
- # If job wasn't started (e.g., lock conflict), release semaphore immediately
- self._semaphore.release()
+ # Try to acquire lock and remove from queue first (without semaphore)
+ job_acquired = await self._try_acquire_job(job_id, queue_name)
+ if job_acquired:
+ # Only acquire semaphore after successfully getting the job
+ await self._semaphore.acquire()
+ try:
+ # Process the job (we already have the lock and removed from queue)
+ # The semaphore will be released when the job task completes
+ await self._process_acquired_job(
+ job_acquired, queue_name
+ )
+ fetched_count += 1
+ except Exception as e_process:
+ logger.error(
+ f"Worker {self.worker_id} exception processing acquired job {job_id}: {e_process}",
+ exc_info=True,
+ )
+ # Release lock and semaphore since processing failed
+ await self.job_store.release_job_lock(job_id)
+ self._semaphore.release()
+ # If job_acquired is None, another worker got it - continue to next job
  except Exception as e_try:
- # Catch errors during the _try_process_job itself
+ # Catch errors during the job acquisition itself
  logger.error(
- f"Worker {self.worker_id} exception trying to process job {job_id}: {e_try}",
+ f"Worker {self.worker_id} exception trying to acquire job {job_id}: {e_try}",
  exc_info=True,
  )
- self._semaphore.release() # Ensure semaphore is released on error

  except Exception as e_poll:
  logger.error(
  f"Worker {self.worker_id} error polling queue '{queue_name}': {e_poll}",
  exc_info=True,
  )
- await asyncio.sleep(1) # Avoid tight loop on polling error
+ # Avoid tight loop on polling error with jittered delay
+ jittered_delay = self._calculate_jittered_delay(1.0)
+ await asyncio.sleep(jittered_delay)
  # For burst mode, return number of jobs fetched in this poll
  return fetched_count

- async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
- """Attempts to lock, fetch definition, and start the execution task for a specific job.
+ async def _try_acquire_job(self, job_id: str, queue_name: str) -> Optional[Job]:
+ """Attempts to atomically lock and remove a job from the queue.

  Args:
- job_id: The ID of the job to attempt processing.
+ job_id: The ID of the job to attempt acquiring.
  queue_name: The name of the queue the job ID was retrieved from.

  Returns:
- True if the job processing task was successfully started, False otherwise
- (e.g., lock conflict, job definition not found, already removed).
+ The Job object if successfully acquired, None otherwise.
  """
  logger.debug(
- f"Worker {self.worker_id} attempting to process job {job_id} from queue '{queue_name}'"
+ f"Worker {self.worker_id} attempting to acquire job {job_id} from queue '{queue_name}'"
  )
  job = await self.job_store.get_job_definition(job_id)
  if not job:
  logger.warning(
- f"Worker {self.worker_id} job definition {job_id} not found during _try_process_job from queue {queue_name}."
+ f"Worker {self.worker_id} job definition {job_id} not found during _try_acquire_job from queue {queue_name}."
  )
- return False # Job vanished between poll and fetch?
+ return None # Job vanished between poll and fetch?

  # Determine job-specific timeout and calculate lock timeout
  job_timeout = (
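The reordering in this hunk (win the job first, take a concurrency slot second) avoids holding the semaphore while losing lock races to other workers. A condensed sketch of the new branch, with hypothetical callables (`try_acquire`, `process`, `release_lock`) standing in for the worker and job-store methods:

    import asyncio

    async def poll_once(job_id, queue_name, try_acquire, process,
                        release_lock, semaphore: asyncio.Semaphore) -> bool:
        job = await try_acquire(job_id, queue_name)   # lock + queue removal first
        if job is None:
            return False                              # another worker won the race
        await semaphore.acquire()                     # only now consume a slot
        try:
            await process(job, queue_name)            # slot released when the task ends
            return True
        except Exception:
            await release_lock(job.id)                # undo on failure
            semaphore.release()
            return False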
@@ -280,32 +321,28 @@ class RRQWorker:
  job_timeout + self.settings.default_lock_timeout_extension_seconds
  ) * 1000

- # Attempt to acquire the processing lock
- lock_acquired = await self.job_store.acquire_job_lock(
- job.id, self.worker_id, int(lock_timeout_ms)
+ # Atomically acquire the processing lock and remove from queue
+ lock_acquired, removed_count = await self.job_store.atomic_lock_and_remove_job(
+ job.id, queue_name, self.worker_id, int(lock_timeout_ms)
  )
- if not lock_acquired:
- logger.debug(
- f"Worker {self.worker_id} failed to acquire lock for job {job.id} (already locked by another worker)."
- )
- return False # Another worker got there first

- logger.debug(f"Worker {self.worker_id} acquired lock for job {job.id}")
+ if not lock_acquired or removed_count == 0:
+ return None # Another worker got there first

- # Atomically remove the job from the queue (verify it was actually removed)
- # Note: Ideally, lock acquisition and queue removal would be a single atomic operation (e.g., Lua script).
- removed_count = await self.job_store.remove_job_from_queue(queue_name, job.id)
- logger.debug(
- f"Worker {self.worker_id} removed job {job.id} from queue '{queue_name}' (count: {removed_count})."
- )
- if removed_count == 0:
- logger.warning(
- f"Worker {self.worker_id} job {job.id} was already removed from queue '{queue_name}' after lock acquisition. Releasing lock."
- )
- await self.job_store.release_job_lock(job.id) # Release the acquired lock
- return False # Job processed by another worker between our poll and lock
+ # Successfully acquired the job
+ logger.debug(f"Worker {self.worker_id} successfully acquired job {job.id}")
+ return job
+
+ async def _process_acquired_job(self, job: Job, queue_name: str) -> None:
+ """Processes a job that has already been acquired (locked and removed from queue).
+
+ Note: This method assumes the worker has already acquired the concurrency semaphore.
+ The semaphore will be released when the job task completes via _task_cleanup.

- # We have the lock and have removed the job from the queue - proceed to execute
+ Args:
+ job: The Job object that was successfully acquired.
+ queue_name: The name of the queue the job was retrieved from.
+ """
  try:
  await self.job_store.update_job_status(job.id, JobStatus.ACTIVE)
  logger.debug(
@@ -313,21 +350,58 @@ class RRQWorker:
  )

  # Create and track the execution task
+ # The semaphore will be released when this task completes
  task = self._loop.create_task(self._execute_job(job, queue_name))
  self._running_tasks.add(task)
  task.add_done_callback(lambda t: self._task_cleanup(t, self._semaphore))
  logger.info(
  f"Worker {self.worker_id} started job {job.id} ('{job.function_name}') from queue '{queue_name}'"
  )
- return True
  except Exception as e_start:
  # Catch errors during status update or task creation
  logger.error(
- f"Worker {self.worker_id} failed to start task for job {job.id} after lock/removal: {e_start}",
+ f"Worker {self.worker_id} failed to start task for job {job.id} after acquisition: {e_start}",
  exc_info=True,
  )
- # Attempt to release the lock since task wasn't started
+ # Release the lock since task wasn't started
  await self.job_store.release_job_lock(job.id)
+ raise # Re-raise to be handled by caller
+
+ async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
+ """Attempts to lock, fetch definition, and start the execution task for a specific job.
+
+ This method is kept for backward compatibility and uses the optimized approach internally.
+ For new code, prefer using _try_acquire_job and _process_acquired_job separately.
+
+ Note: This method handles semaphore acquisition internally for backward compatibility.
+
+ Args:
+ job_id: The ID of the job to attempt processing.
+ queue_name: The name of the queue the job ID was retrieved from.
+
+ Returns:
+ True if the job processing task was successfully started, False otherwise
+ (e.g., lock conflict, job definition not found, already removed).
+ """
+ # Use the optimized approach: acquire job first, then process
+ job_acquired = await self._try_acquire_job(job_id, queue_name)
+ if not job_acquired:
+ return False
+
+ # For backward compatibility, acquire semaphore here since old callers expect it
+ await self._semaphore.acquire()
+ try:
+ # Process the acquired job
+ await self._process_acquired_job(job_acquired, queue_name)
+ return True
+ except Exception as e_process:
+ logger.error(
+ f"Worker {self.worker_id} failed to process acquired job {job_id}: {e_process}",
+ exc_info=True,
+ )
+ # Release semaphore on error since _process_acquired_job doesn't handle it
+ self._semaphore.release()
+ # Lock is already released in _process_acquired_job on error
  return False

  async def _execute_job(self, job: Job, queue_name: str) -> None:
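`atomic_lock_and_remove_job` itself is not part of this file's diff, but the comment removed above ("Ideally, lock acquisition and queue removal would be a single atomic operation (e.g., Lua script)") points at the intent. A hedged sketch of how such an operation could look with redis-py and a Lua script; the key names (`rrq:lock:`, `rrq:queue:`) and the exact signature are assumptions, not the package's actual implementation:

    import redis.asyncio as redis

    # SET NX PX the per-job lock and ZREM the job from the queue in one atomic step.
    _LOCK_AND_REMOVE = """
    if redis.call('SET', KEYS[1], ARGV[1], 'NX', 'PX', ARGV[2]) then
        local removed = redis.call('ZREM', KEYS[2], ARGV[3])
        if removed == 0 then
            redis.call('DEL', KEYS[1])  -- job already gone; drop the lock again
        end
        return {1, removed}
    end
    return {0, 0}
    """

    async def atomic_lock_and_remove_job(r: redis.Redis, job_id: str, queue_name: str,
                                         worker_id: str, lock_timeout_ms: int):
        script = r.register_script(_LOCK_AND_REMOVE)
        acquired, removed = await script(
            keys=[f"rrq:lock:{job_id}", f"rrq:queue:{queue_name}"],  # assumed key layout
            args=[worker_id, lock_timeout_ms, job_id],
        )
        return bool(acquired), int(removed)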
@@ -475,63 +549,54 @@ class RRQWorker:
  appropriate delay (custom or exponential backoff) or moves to DLQ.
  """
  log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
- job_key = f"{JOB_KEY_PREFIX}{job.id}"
- try:
- # Atomically increment retries in the store.
- new_retry_count = await self.job_store.increment_job_retries(job.id)
- max_retries = (
- job.max_retries
- ) # Use max_retries from the job object passed in
-
- if new_retry_count < max_retries:
- # Update status and error atomically
- await self.job_store.redis.hset(
- job_key,
- mapping={
- "status": JobStatus.RETRYING.value,
- "last_error": str(exc),
- },
- )
- logger.debug(f"{log_prefix} status set to RETRYING, error saved.")
-
- # Determine deferral time
- defer_seconds = exc.defer_seconds
- if defer_seconds is None:
- # Create a temporary job representation for backoff calculation
- # using the *new* retry count.
- temp_job_for_backoff = Job(
- id=job.id,
- function_name=job.function_name,
- current_retries=new_retry_count, # Use updated count
- max_retries=max_retries, # Ensure this is passed
- )
- defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
- defer_seconds = defer_ms / 1000.0
- else:
- logger.debug(
- f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
- )
+ max_retries = job.max_retries

- retry_at_score = (time.time() + defer_seconds) * 1000
- target_queue = job.queue_name or self.settings.default_queue_name
- await self.job_store.add_job_to_queue(
- target_queue, job.id, retry_at_score
- )
- logger.info(
- f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
- f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
- )
- else:
- # Max retries exceeded even though RetryJob was raised
+ try:
+ # Check if we would exceed max retries
+ anticipated_retry_count = job.current_retries + 1
+ if anticipated_retry_count >= max_retries:
+ # Max retries exceeded, increment retry count and move directly to DLQ
  logger.warning(
  f"{log_prefix} max retries ({max_retries}) exceeded "
- f"despite RetryJob exception. Moving to DLQ."
+ f"with RetryJob exception. Moving to DLQ."
  )
- # _move_to_dlq handles setting FAILED status etc.
+ # Increment retry count before moving to DLQ
+ await self.job_store.increment_job_retries(job.id)
  error_msg = (
  str(exc) or f"Max retries ({max_retries}) exceeded after RetryJob"
  )
  await self._move_to_dlq(job, queue_name, error_msg)
+ return
+
+ # Determine deferral time
+ defer_seconds = exc.defer_seconds
+ if defer_seconds is None:
+ # Create a temporary job representation for backoff calculation
+ temp_job_for_backoff = Job(
+ id=job.id,
+ function_name=job.function_name,
+ current_retries=anticipated_retry_count, # Use anticipated count
+ max_retries=max_retries,
+ )
+ defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
+ defer_seconds = defer_ms / 1000.0
+ else:
+ logger.debug(
+ f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
+ )
+
+ retry_at_score = (time.time() + defer_seconds) * 1000
+ target_queue = job.queue_name or self.settings.default_queue_name
+
+ # Atomically increment retries, update status/error, and re-queue
+ new_retry_count = await self.job_store.atomic_retry_job(
+ job.id, target_queue, retry_at_score, str(exc), JobStatus.RETRYING
+ )
+
+ logger.info(
+ f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
+ f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
+ )
  except Exception as e_handle:
  logger.exception(
  f"{log_prefix} CRITICAL error during RetryJob processing: {e_handle}"
@@ -549,48 +614,43 @@ class RRQWorker:
  logger.debug(f"{log_prefix} processing general failure: {type(exc).__name__}")

  try:
- new_retry_count = await self.job_store.increment_job_retries(job.id)
- # Re-fetch job state after incrementing retries might be safer if fields changed?
- # For now, assume the job object passed in is mostly accurate except for retry count.
- # Use max_retries from the job object passed in.
  max_retries = job.max_retries
  last_error_str = str(exc)

- if new_retry_count < max_retries:
- # Re-queue for standard retry with backoff
- defer_ms = self._calculate_backoff_ms(
- Job(
- id=job.id,
- function_name=job.function_name,
- current_retries=new_retry_count,
- max_retries=max_retries,
- )
- )
- retry_at_score = (time.time() * 1000) + defer_ms
- target_queue = job.queue_name or self.settings.default_queue_name
-
- # Atomically update status/error and re-add to queue (if possible, else separate)
- # For now, separate HSET and ZADD
- await self.job_store.redis.hset(
- f"{JOB_KEY_PREFIX}{job.id}",
- mapping={
- "status": JobStatus.RETRYING.value,
- "last_error": last_error_str,
- },
- )
- await self.job_store.add_job_to_queue(
- target_queue, job.id, retry_at_score
- )
- logger.info(
- f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
- f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
- )
- else: # Max retries reached
+ # Check if we would exceed max retries
+ anticipated_retry_count = job.current_retries + 1
+ if anticipated_retry_count >= max_retries:
+ # Max retries exceeded, increment retry count and move directly to DLQ
  logger.warning(
  f"{log_prefix} failed after max retries ({max_retries}). Moving to DLQ. Error: {str(exc)[:100]}..."
  )
+ # Increment retry count before moving to DLQ
+ await self.job_store.increment_job_retries(job.id)
  # _move_to_dlq handles setting FAILED status, completion time, and last error.
  await self._move_to_dlq(job, queue_name, last_error_str)
+ return
+
+ # Calculate backoff delay using anticipated retry count
+ defer_ms = self._calculate_backoff_ms(
+ Job(
+ id=job.id,
+ function_name=job.function_name,
+ current_retries=anticipated_retry_count, # Use anticipated count
+ max_retries=max_retries,
+ )
+ )
+ retry_at_score = (time.time() * 1000) + defer_ms
+ target_queue = job.queue_name or self.settings.default_queue_name
+
+ # Atomically increment retries, update status/error, and re-queue
+ new_retry_count = await self.job_store.atomic_retry_job(
+ job.id, target_queue, retry_at_score, last_error_str, JobStatus.RETRYING
+ )
+
+ logger.info(
+ f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
+ f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
+ )

  except Exception as e_handle:
  logger.exception(
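`_calculate_backoff_ms` is not touched in the hunks shown here; the only difference is that it is now fed the anticipated retry count instead of a post-increment value read back from Redis. For orientation, a typical capped exponential backoff of the kind such a helper usually implements (base, cap, and formula are illustrative guesses, not rrq's actual values):

    def calculate_backoff_ms(current_retries: int,
                             base_delay_ms: int = 1000,
                             max_delay_ms: int = 60_000) -> int:
        # 1s, 2s, 4s, ... capped at 60s for attempts 1, 2, 3, ...
        return min(max_delay_ms, base_delay_ms * (2 ** max(0, current_retries - 1)))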
@@ -601,7 +661,7 @@ class RRQWorker:
  """Moves a job to the Dead Letter Queue (DLQ) and releases its unique lock if present."""

  dlq_name = self.settings.default_dlq_name # Or derive from original queue_name
- completion_time = datetime.now(UTC)
+ completion_time = datetime.now(timezone.utc)
  try:
  await self.job_store.move_job_to_dlq(
  job_id=job.id,
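The `datetime.now(UTC)` to `datetime.now(timezone.utc)` substitutions in this and the remaining hunks are behavior-preserving: `datetime.UTC` is just an alias for `datetime.timezone.utc` and only exists on Python 3.11+, so the older spelling keeps the module importable on earlier interpreters. For example:

    from datetime import datetime, timezone

    # Works on Python 3.2+, whereas `from datetime import UTC` requires Python 3.11+.
    now = datetime.now(timezone.utc)
    print(now.isoformat())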
@@ -749,7 +809,7 @@ class RRQWorker:
  try:
  health_data = {
  "worker_id": self.worker_id,
- "timestamp": datetime.now(UTC).isoformat(),
+ "timestamp": datetime.now(timezone.utc).isoformat(),
  "status": self.status,
  "active_jobs": len(self._running_tasks),
  "concurrency_limit": self.settings.worker_concurrency,
@@ -795,7 +855,7 @@ class RRQWorker:

  async def _maybe_enqueue_cron_jobs(self) -> None:
  """Enqueue cron jobs that are due to run."""
- now = datetime.now(UTC)
+ now = datetime.now(timezone.utc)
  for cj in self.cron_jobs:
  if cj.due(now):
  unique_key = f"cron:{cj.function_name}" if cj.unique else None
@@ -914,7 +974,7 @@ class RRQWorker:
  )
  try:
  job.status = JobStatus.PENDING
- job.next_scheduled_run_time = datetime.now(UTC) # Re-queue immediately
+ job.next_scheduled_run_time = datetime.now(timezone.utc) # Re-queue immediately
  job.last_error = "Job execution interrupted by worker shutdown. Re-queued."
  # Do not increment retries for shutdown interruption

@@ -935,7 +995,7 @@ class RRQWorker:
  job.id,
  self.settings.default_dlq_name,
  f"Failed to re-queue during cancellation: {e_requeue}",
- datetime.now(UTC),
+ datetime.now(timezone.utc),
  )
  logger.info(
  f"Successfully moved job {job.id} to DLQ due to re-queueing failure."