rrq 0.3.7-py3-none-any.whl → 0.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
rrq/worker.py CHANGED
@@ -7,6 +7,7 @@ import asyncio
  # Use standard logging instead of custom one if appropriate
  import logging
  import os
+ import random
  import signal
  import time
  import uuid
@@ -21,13 +22,13 @@ from rrq.client import RRQClient

  from .constants import (
  DEFAULT_WORKER_ID_PREFIX,
- JOB_KEY_PREFIX,
  )
  from .exc import RetryJob
  from .job import Job, JobStatus
  from .registry import JobRegistry
  from .settings import RRQSettings
  from .store import JobStore
+ from .cron import CronJob

  logger = logging.getLogger(__name__)

@@ -77,16 +78,43 @@ class RRQWorker:
  # Burst mode: process existing jobs then exit
  self.burst = burst

+ self.cron_jobs: list[CronJob] = list(self.settings.cron_jobs)
+
  self._semaphore = asyncio.Semaphore(self.settings.worker_concurrency)
  self._running_tasks: set[asyncio.Task] = set()
  self._shutdown_event = asyncio.Event()
  self._loop = None # Will be set in run()
  self._health_check_task: Optional[asyncio.Task] = None
+ self._cron_task: Optional[asyncio.Task] = None
  self.status: str = "initializing" # Worker status (e.g., initializing, running, polling, idle, stopped)
  logger.info(
  f"Initializing RRQWorker {self.worker_id} for queues: {self.queues}"
  )

+ def _calculate_jittered_delay(
+ self, base_delay: float, jitter_factor: float = 0.5
+ ) -> float:
+ """Calculate a jittered delay to prevent thundering herd effects.
+
+ Args:
+ base_delay: The base delay in seconds.
+ jitter_factor: Factor for jitter (0.0 to 1.0). Default 0.5 means ±50% jitter.
+
+ Returns:
+ The jittered delay in seconds.
+ """
+ # Clamp jitter_factor to safe range to prevent negative delays
+ jitter_factor = max(0.0, min(jitter_factor, 0.99))
+
+ # Calculate jitter range: base_delay * (1 ± jitter_factor)
+ min_delay = base_delay * (1 - jitter_factor)
+ max_delay = base_delay * (1 + jitter_factor)
+
+ # Ensure min_delay is always positive
+ min_delay = max(0.001, min_delay)
+
+ return random.uniform(min_delay, max_delay)
+
  async def _call_startup_hook(self) -> None:
  if self.settings.on_startup:
  logger.info(f"Worker {self.worker_id} calling on_startup hook...")
@@ -135,6 +163,10 @@ class RRQWorker:
  """
  logger.info(f"Worker {self.worker_id} starting run loop.")
  self._health_check_task = self._loop.create_task(self._heartbeat_loop())
+ if self.cron_jobs:
+ for cj in self.cron_jobs:
+ cj.schedule_next()
+ self._cron_task = self._loop.create_task(self._cron_loop())

  while not self._shutdown_event.is_set():
  try:
@@ -163,14 +195,19 @@ class RRQWorker:
  self.status = "idle (concurrency limit)"
  # At concurrency limit, wait for tasks to finish or poll delay

- await asyncio.sleep(self.settings.default_poll_delay_seconds)
+ # Use jittered delay to prevent thundering herd effects
+ jittered_delay = self._calculate_jittered_delay(
+ self.settings.default_poll_delay_seconds
+ )
+ await asyncio.sleep(jittered_delay)
  except Exception as e:
  logger.error(
  f"Worker {self.worker_id} encountered error in main run loop: {e}",
  exc_info=True,
  )
- # Avoid tight loop on persistent errors
- await asyncio.sleep(1)
+ # Avoid tight loop on persistent errors with jittered delay
+ jittered_delay = self._calculate_jittered_delay(1.0)
+ await asyncio.sleep(jittered_delay)

  logger.info(
  f"Worker {self.worker_id} shutdown signal received. Draining tasks..."
@@ -181,6 +218,10 @@ class RRQWorker:
  self._health_check_task.cancel()
  with suppress(asyncio.CancelledError):
  await self._health_check_task
+ if self._cron_task:
+ self._cron_task.cancel()
+ with suppress(asyncio.CancelledError):
+ await self._cron_task

  async def _poll_for_jobs(self, count: int) -> None:
  """Polls configured queues round-robin and attempts to start processing jobs.
@@ -210,53 +251,65 @@ class RRQWorker:
  if fetched_count >= count or self._shutdown_event.is_set():
  break

- # Attempt to acquire semaphore *before* trying to process
- await self._semaphore.acquire()
  try:
- # _try_process_job handles lock acquisition, fetching, task creation
- job_started = await self._try_process_job(job_id, queue_name)
- if job_started:
- fetched_count += 1
- else:
- # If job wasn't started (e.g., lock conflict), release semaphore immediately
- self._semaphore.release()
+ # Try to acquire lock and remove from queue first (without semaphore)
+ job_acquired = await self._try_acquire_job(job_id, queue_name)
+ if job_acquired:
+ # Only acquire semaphore after successfully getting the job
+ await self._semaphore.acquire()
+ try:
+ # Process the job (we already have the lock and removed from queue)
+ # The semaphore will be released when the job task completes
+ await self._process_acquired_job(
+ job_acquired, queue_name
+ )
+ fetched_count += 1
+ except Exception as e_process:
+ logger.error(
+ f"Worker {self.worker_id} exception processing acquired job {job_id}: {e_process}",
+ exc_info=True,
+ )
+ # Release lock and semaphore since processing failed
+ await self.job_store.release_job_lock(job_id)
+ self._semaphore.release()
+ # If job_acquired is None, another worker got it - continue to next job
  except Exception as e_try:
- # Catch errors during the _try_process_job itself
+ # Catch errors during the job acquisition itself
  logger.error(
- f"Worker {self.worker_id} exception trying to process job {job_id}: {e_try}",
+ f"Worker {self.worker_id} exception trying to acquire job {job_id}: {e_try}",
  exc_info=True,
  )
- self._semaphore.release() # Ensure semaphore is released on error

  except Exception as e_poll:
  logger.error(
  f"Worker {self.worker_id} error polling queue '{queue_name}': {e_poll}",
  exc_info=True,
  )
- await asyncio.sleep(1) # Avoid tight loop on polling error
+ # Avoid tight loop on polling error with jittered delay
+ jittered_delay = self._calculate_jittered_delay(1.0)
+ await asyncio.sleep(jittered_delay)
  # For burst mode, return number of jobs fetched in this poll
  return fetched_count

- async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
- """Attempts to lock, fetch definition, and start the execution task for a specific job.
+ async def _try_acquire_job(self, job_id: str, queue_name: str) -> Optional[Job]:
+ """Attempts to atomically lock and remove a job from the queue.

  Args:
- job_id: The ID of the job to attempt processing.
+ job_id: The ID of the job to attempt acquiring.
  queue_name: The name of the queue the job ID was retrieved from.

  Returns:
- True if the job processing task was successfully started, False otherwise
- (e.g., lock conflict, job definition not found, already removed).
+ The Job object if successfully acquired, None otherwise.
  """
  logger.debug(
- f"Worker {self.worker_id} attempting to process job {job_id} from queue '{queue_name}'"
+ f"Worker {self.worker_id} attempting to acquire job {job_id} from queue '{queue_name}'"
  )
  job = await self.job_store.get_job_definition(job_id)
  if not job:
  logger.warning(
- f"Worker {self.worker_id} job definition {job_id} not found during _try_process_job from queue {queue_name}."
+ f"Worker {self.worker_id} job definition {job_id} not found during _try_acquire_job from queue {queue_name}."
  )
- return False # Job vanished between poll and fetch?
+ return None # Job vanished between poll and fetch?

  # Determine job-specific timeout and calculate lock timeout
  job_timeout = (
@@ -268,32 +321,28 @@ class RRQWorker:
  job_timeout + self.settings.default_lock_timeout_extension_seconds
  ) * 1000

- # Attempt to acquire the processing lock
- lock_acquired = await self.job_store.acquire_job_lock(
- job.id, self.worker_id, int(lock_timeout_ms)
+ # Atomically acquire the processing lock and remove from queue
+ lock_acquired, removed_count = await self.job_store.atomic_lock_and_remove_job(
+ job.id, queue_name, self.worker_id, int(lock_timeout_ms)
  )
- if not lock_acquired:
- logger.debug(
- f"Worker {self.worker_id} failed to acquire lock for job {job.id} (already locked by another worker)."
- )
- return False # Another worker got there first

- logger.debug(f"Worker {self.worker_id} acquired lock for job {job.id}")
+ if not lock_acquired or removed_count == 0:
+ return None # Another worker got there first

- # Atomically remove the job from the queue (verify it was actually removed)
- # Note: Ideally, lock acquisition and queue removal would be a single atomic operation (e.g., Lua script).
- removed_count = await self.job_store.remove_job_from_queue(queue_name, job.id)
- logger.debug(
- f"Worker {self.worker_id} removed job {job.id} from queue '{queue_name}' (count: {removed_count})."
- )
- if removed_count == 0:
- logger.warning(
- f"Worker {self.worker_id} job {job.id} was already removed from queue '{queue_name}' after lock acquisition. Releasing lock."
- )
- await self.job_store.release_job_lock(job.id) # Release the acquired lock
- return False # Job processed by another worker between our poll and lock
+ # Successfully acquired the job
+ logger.debug(f"Worker {self.worker_id} successfully acquired job {job.id}")
+ return job
+
+ async def _process_acquired_job(self, job: Job, queue_name: str) -> None:
+ """Processes a job that has already been acquired (locked and removed from queue).

- # We have the lock and have removed the job from the queue - proceed to execute
+ Note: This method assumes the worker has already acquired the concurrency semaphore.
+ The semaphore will be released when the job task completes via _task_cleanup.
+
+ Args:
+ job: The Job object that was successfully acquired.
+ queue_name: The name of the queue the job was retrieved from.
+ """
  try:
  await self.job_store.update_job_status(job.id, JobStatus.ACTIVE)
  logger.debug(
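
The hunk above collapses the old acquire_job_lock / remove_job_from_queue pair into a single JobStore.atomic_lock_and_remove_job call, closing the race the removed comment warned about. The store method itself is not shown in this diff; purely as an illustration, such an operation can be expressed as one Redis Lua script (a sketch under assumed key names, not rrq's actual schema):

# Hypothetical sketch -- rrq's real JobStore.atomic_lock_and_remove_job is not in
# this file's diff; this only illustrates how a lock plus queue removal could be
# made atomic with a single Lua script (redis-py asyncio client assumed).
import redis.asyncio as redis

ATOMIC_LOCK_AND_REMOVE = """
if redis.call('SET', KEYS[1], ARGV[1], 'NX', 'PX', ARGV[2]) then
    local removed = redis.call('ZREM', KEYS[2], ARGV[3])
    if removed == 0 then
        -- Another worker already took the job off the queue: give the lock back.
        redis.call('DEL', KEYS[1])
    end
    return {1, removed}
end
return {0, 0}
"""

async def atomic_lock_and_remove(client: redis.Redis, lock_key: str, queue_key: str,
                                 worker_id: str, lock_ttl_ms: int, job_id: str):
    # Both the SET NX PX (lock) and the ZREM (dequeue) run inside one script,
    # so no other worker can observe the lock without the queue removal.
    script = client.register_script(ATOMIC_LOCK_AND_REMOVE)
    locked, removed = await script(keys=[lock_key, queue_key],
                                   args=[worker_id, lock_ttl_ms, job_id])
    return bool(locked), int(removed)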
@@ -301,21 +350,58 @@ class RRQWorker:
  )

  # Create and track the execution task
+ # The semaphore will be released when this task completes
  task = self._loop.create_task(self._execute_job(job, queue_name))
  self._running_tasks.add(task)
  task.add_done_callback(lambda t: self._task_cleanup(t, self._semaphore))
  logger.info(
  f"Worker {self.worker_id} started job {job.id} ('{job.function_name}') from queue '{queue_name}'"
  )
- return True
  except Exception as e_start:
  # Catch errors during status update or task creation
  logger.error(
- f"Worker {self.worker_id} failed to start task for job {job.id} after lock/removal: {e_start}",
+ f"Worker {self.worker_id} failed to start task for job {job.id} after acquisition: {e_start}",
  exc_info=True,
  )
- # Attempt to release the lock since task wasn't started
+ # Release the lock since task wasn't started
  await self.job_store.release_job_lock(job.id)
+ raise # Re-raise to be handled by caller
+
+ async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
+ """Attempts to lock, fetch definition, and start the execution task for a specific job.
+
+ This method is kept for backward compatibility and uses the optimized approach internally.
+ For new code, prefer using _try_acquire_job and _process_acquired_job separately.
+
+ Note: This method handles semaphore acquisition internally for backward compatibility.
+
+ Args:
+ job_id: The ID of the job to attempt processing.
+ queue_name: The name of the queue the job ID was retrieved from.
+
+ Returns:
+ True if the job processing task was successfully started, False otherwise
+ (e.g., lock conflict, job definition not found, already removed).
+ """
+ # Use the optimized approach: acquire job first, then process
+ job_acquired = await self._try_acquire_job(job_id, queue_name)
+ if not job_acquired:
+ return False
+
+ # For backward compatibility, acquire semaphore here since old callers expect it
+ await self._semaphore.acquire()
+ try:
+ # Process the acquired job
+ await self._process_acquired_job(job_acquired, queue_name)
+ return True
+ except Exception as e_process:
+ logger.error(
+ f"Worker {self.worker_id} failed to process acquired job {job_id}: {e_process}",
+ exc_info=True,
+ )
+ # Release semaphore on error since _process_acquired_job doesn't handle it
+ self._semaphore.release()
+ # Lock is already released in _process_acquired_job on error
  return False

  async def _execute_job(self, job: Job, queue_name: str) -> None:
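
Several comments in the hunks above note that the concurrency semaphore is released only when the job task completes, via _task_cleanup attached as a done callback. _task_cleanup itself lies outside this diff; a sketch of that callback pattern, hypothetical and for illustration only:

import asyncio
import logging

logger = logging.getLogger(__name__)

def _task_cleanup(task: asyncio.Task, semaphore: asyncio.Semaphore) -> None:
    # Free the concurrency slot that was acquired before the job task was created.
    semaphore.release()
    # Surface unexpected task failures; a cancelled task has no exception to read.
    if not task.cancelled() and task.exception() is not None:
        logger.error("job task failed: %r", task.exception())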
@@ -463,63 +549,54 @@ class RRQWorker:
  appropriate delay (custom or exponential backoff) or moves to DLQ.
  """
  log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
- job_key = f"{JOB_KEY_PREFIX}{job.id}"
- try:
- # Atomically increment retries in the store.
- new_retry_count = await self.job_store.increment_job_retries(job.id)
- max_retries = (
- job.max_retries
- ) # Use max_retries from the job object passed in
-
- if new_retry_count < max_retries:
- # Update status and error atomically
- await self.job_store.redis.hset(
- job_key,
- mapping={
- "status": JobStatus.RETRYING.value,
- "last_error": str(exc),
- },
- )
- logger.debug(f"{log_prefix} status set to RETRYING, error saved.")
-
- # Determine deferral time
- defer_seconds = exc.defer_seconds
- if defer_seconds is None:
- # Create a temporary job representation for backoff calculation
- # using the *new* retry count.
- temp_job_for_backoff = Job(
- id=job.id,
- function_name=job.function_name,
- current_retries=new_retry_count, # Use updated count
- max_retries=max_retries, # Ensure this is passed
- )
- defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
- defer_seconds = defer_ms / 1000.0
- else:
- logger.debug(
- f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
- )
+ max_retries = job.max_retries

- retry_at_score = (time.time() + defer_seconds) * 1000
- target_queue = job.queue_name or self.settings.default_queue_name
- await self.job_store.add_job_to_queue(
- target_queue, job.id, retry_at_score
- )
- logger.info(
- f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
- f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
- )
- else:
- # Max retries exceeded even though RetryJob was raised
+ try:
+ # Check if we would exceed max retries
+ anticipated_retry_count = job.current_retries + 1
+ if anticipated_retry_count >= max_retries:
+ # Max retries exceeded, increment retry count and move directly to DLQ
  logger.warning(
  f"{log_prefix} max retries ({max_retries}) exceeded "
- f"despite RetryJob exception. Moving to DLQ."
+ f"with RetryJob exception. Moving to DLQ."
  )
- # _move_to_dlq handles setting FAILED status etc.
+ # Increment retry count before moving to DLQ
+ await self.job_store.increment_job_retries(job.id)
  error_msg = (
  str(exc) or f"Max retries ({max_retries}) exceeded after RetryJob"
  )
  await self._move_to_dlq(job, queue_name, error_msg)
+ return
+
+ # Determine deferral time
+ defer_seconds = exc.defer_seconds
+ if defer_seconds is None:
+ # Create a temporary job representation for backoff calculation
+ temp_job_for_backoff = Job(
+ id=job.id,
+ function_name=job.function_name,
+ current_retries=anticipated_retry_count, # Use anticipated count
+ max_retries=max_retries,
+ )
+ defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
+ defer_seconds = defer_ms / 1000.0
+ else:
+ logger.debug(
+ f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
+ )
+
+ retry_at_score = (time.time() + defer_seconds) * 1000
+ target_queue = job.queue_name or self.settings.default_queue_name
+
+ # Atomically increment retries, update status/error, and re-queue
+ new_retry_count = await self.job_store.atomic_retry_job(
+ job.id, target_queue, retry_at_score, str(exc), JobStatus.RETRYING
+ )
+
+ logger.info(
+ f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
+ f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
+ )
  except Exception as e_handle:
  logger.exception(
  f"{log_prefix} CRITICAL error during RetryJob processing: {e_handle}"
@@ -537,48 +614,43 @@ class RRQWorker:
  logger.debug(f"{log_prefix} processing general failure: {type(exc).__name__}")

  try:
- new_retry_count = await self.job_store.increment_job_retries(job.id)
- # Re-fetch job state after incrementing retries might be safer if fields changed?
- # For now, assume the job object passed in is mostly accurate except for retry count.
- # Use max_retries from the job object passed in.
  max_retries = job.max_retries
  last_error_str = str(exc)

- if new_retry_count < max_retries:
- # Re-queue for standard retry with backoff
- defer_ms = self._calculate_backoff_ms(
- Job(
- id=job.id,
- function_name=job.function_name,
- current_retries=new_retry_count,
- max_retries=max_retries,
- )
- )
- retry_at_score = (time.time() * 1000) + defer_ms
- target_queue = job.queue_name or self.settings.default_queue_name
-
- # Atomically update status/error and re-add to queue (if possible, else separate)
- # For now, separate HSET and ZADD
- await self.job_store.redis.hset(
- f"{JOB_KEY_PREFIX}{job.id}",
- mapping={
- "status": JobStatus.RETRYING.value,
- "last_error": last_error_str,
- },
- )
- await self.job_store.add_job_to_queue(
- target_queue, job.id, retry_at_score
- )
- logger.info(
- f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
- f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
- )
- else: # Max retries reached
+ # Check if we would exceed max retries
+ anticipated_retry_count = job.current_retries + 1
+ if anticipated_retry_count >= max_retries:
+ # Max retries exceeded, increment retry count and move directly to DLQ
  logger.warning(
  f"{log_prefix} failed after max retries ({max_retries}). Moving to DLQ. Error: {str(exc)[:100]}..."
  )
+ # Increment retry count before moving to DLQ
+ await self.job_store.increment_job_retries(job.id)
  # _move_to_dlq handles setting FAILED status, completion time, and last error.
  await self._move_to_dlq(job, queue_name, last_error_str)
+ return
+
+ # Calculate backoff delay using anticipated retry count
+ defer_ms = self._calculate_backoff_ms(
+ Job(
+ id=job.id,
+ function_name=job.function_name,
+ current_retries=anticipated_retry_count, # Use anticipated count
+ max_retries=max_retries,
+ )
+ )
+ retry_at_score = (time.time() * 1000) + defer_ms
+ target_queue = job.queue_name or self.settings.default_queue_name
+
+ # Atomically increment retries, update status/error, and re-queue
+ new_retry_count = await self.job_store.atomic_retry_job(
+ job.id, target_queue, retry_at_score, last_error_str, JobStatus.RETRYING
+ )
+
+ logger.info(
+ f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
+ f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
+ )

  except Exception as e_handle:
  logger.exception(
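
Both retry paths above now delegate to JobStore.atomic_retry_job, which is referenced but not defined in this diff. As an illustration only, the "increment retries, record error and status, re-queue with a score" step can be made atomic with a MULTI/EXEC pipeline (the key names below are assumptions, not rrq's actual schema):

# Hypothetical sketch of an atomic retry step using redis-py's asyncio pipeline.
import redis.asyncio as redis

async def atomic_retry_job(client: redis.Redis, job_key: str, queue_key: str,
                           job_id: str, retry_at_score: float, error: str,
                           status: str = "RETRYING") -> int:
    async with client.pipeline(transaction=True) as pipe:
        pipe.hincrby(job_key, "current_retries", 1)            # bump the retry count
        pipe.hset(job_key, mapping={"status": status, "last_error": error})
        pipe.zadd(queue_key, {job_id: retry_at_score})         # schedule the retry
        new_retry_count, _, _ = await pipe.execute()           # all-or-nothing
    return int(new_retry_count)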
@@ -781,6 +853,39 @@ class RRQWorker:

  logger.debug(f"Worker {self.worker_id} heartbeat loop finished.")

+ async def _maybe_enqueue_cron_jobs(self) -> None:
+ """Enqueue cron jobs that are due to run."""
+ now = datetime.now(UTC)
+ for cj in self.cron_jobs:
+ if cj.due(now):
+ unique_key = f"cron:{cj.function_name}" if cj.unique else None
+ try:
+ await self.client.enqueue(
+ cj.function_name,
+ *cj.args,
+ _queue_name=cj.queue_name,
+ _unique_key=unique_key,
+ **cj.kwargs,
+ )
+ finally:
+ cj.schedule_next(now)
+
+ async def _cron_loop(self) -> None:
+ logger.debug(f"Worker {self.worker_id} starting cron loop.")
+ while not self._shutdown_event.is_set():
+ try:
+ await self._maybe_enqueue_cron_jobs()
+ except Exception as e:
+ logger.error(
+ f"Worker {self.worker_id} error running cron jobs: {e}",
+ exc_info=True,
+ )
+ try:
+ await asyncio.wait_for(self._shutdown_event.wait(), timeout=30)
+ except TimeoutError:
+ pass
+ logger.debug(f"Worker {self.worker_id} cron loop finished.")
+
  async def _close_resources(self) -> None:
  """Closes the worker's resources, primarily the JobStore connection."""
  logger.info(f"Worker {self.worker_id} closing resources...")