rrq 0.3.7__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rrq/__init__.py +14 -0
- rrq/cli.py +303 -29
- rrq/cron.py +154 -0
- rrq/settings.py +5 -0
- rrq/store.py +122 -2
- rrq/worker.py +238 -133
- {rrq-0.3.7.dist-info → rrq-0.5.0.dist-info}/METADATA +125 -7
- rrq-0.5.0.dist-info/RECORD +16 -0
- rrq-0.3.7.dist-info/RECORD +0 -15
- {rrq-0.3.7.dist-info → rrq-0.5.0.dist-info}/WHEEL +0 -0
- {rrq-0.3.7.dist-info → rrq-0.5.0.dist-info}/entry_points.txt +0 -0
- {rrq-0.3.7.dist-info → rrq-0.5.0.dist-info}/licenses/LICENSE +0 -0
rrq/worker.py
CHANGED
@@ -7,6 +7,7 @@ import asyncio
 # Use standard logging instead of custom one if appropriate
 import logging
 import os
+import random
 import signal
 import time
 import uuid
@@ -21,13 +22,13 @@ from rrq.client import RRQClient

 from .constants import (
     DEFAULT_WORKER_ID_PREFIX,
-    JOB_KEY_PREFIX,
 )
 from .exc import RetryJob
 from .job import Job, JobStatus
 from .registry import JobRegistry
 from .settings import RRQSettings
 from .store import JobStore
+from .cron import CronJob

 logger = logging.getLogger(__name__)

@@ -77,16 +78,43 @@ class RRQWorker:
         # Burst mode: process existing jobs then exit
         self.burst = burst

+        self.cron_jobs: list[CronJob] = list(self.settings.cron_jobs)
+
         self._semaphore = asyncio.Semaphore(self.settings.worker_concurrency)
         self._running_tasks: set[asyncio.Task] = set()
         self._shutdown_event = asyncio.Event()
         self._loop = None  # Will be set in run()
         self._health_check_task: Optional[asyncio.Task] = None
+        self._cron_task: Optional[asyncio.Task] = None
         self.status: str = "initializing"  # Worker status (e.g., initializing, running, polling, idle, stopped)
         logger.info(
             f"Initializing RRQWorker {self.worker_id} for queues: {self.queues}"
         )

+    def _calculate_jittered_delay(
+        self, base_delay: float, jitter_factor: float = 0.5
+    ) -> float:
+        """Calculate a jittered delay to prevent thundering herd effects.
+
+        Args:
+            base_delay: The base delay in seconds.
+            jitter_factor: Factor for jitter (0.0 to 1.0). Default 0.5 means ±50% jitter.
+
+        Returns:
+            The jittered delay in seconds.
+        """
+        # Clamp jitter_factor to safe range to prevent negative delays
+        jitter_factor = max(0.0, min(jitter_factor, 0.99))
+
+        # Calculate jitter range: base_delay * (1 ± jitter_factor)
+        min_delay = base_delay * (1 - jitter_factor)
+        max_delay = base_delay * (1 + jitter_factor)
+
+        # Ensure min_delay is always positive
+        min_delay = max(0.001, min_delay)
+
+        return random.uniform(min_delay, max_delay)
+
     async def _call_startup_hook(self) -> None:
         if self.settings.on_startup:
             logger.info(f"Worker {self.worker_id} calling on_startup hook...")
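The jitter formula introduced above is easy to check in isolation. Below is a standalone sketch that duplicates it outside the worker class; the function and variable names here are illustrative, not part of the package:

import random

def jittered_delay(base_delay: float, jitter_factor: float = 0.5) -> float:
    # Mirrors _calculate_jittered_delay: clamp the factor, then sample
    # uniformly from [base * (1 - factor), base * (1 + factor)].
    jitter_factor = max(0.0, min(jitter_factor, 0.99))
    min_delay = max(0.001, base_delay * (1 - jitter_factor))
    return random.uniform(min_delay, base_delay * (1 + jitter_factor))

# With the default ±50% factor and a 2.0s base poll delay, each idle worker
# sleeps somewhere in [1.0s, 3.0s], so a fleet of workers does not wake and
# hit Redis in lockstep.
print(round(jittered_delay(2.0), 3))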
@@ -135,6 +163,10 @@ class RRQWorker:
         """
         logger.info(f"Worker {self.worker_id} starting run loop.")
         self._health_check_task = self._loop.create_task(self._heartbeat_loop())
+        if self.cron_jobs:
+            for cj in self.cron_jobs:
+                cj.schedule_next()
+            self._cron_task = self._loop.create_task(self._cron_loop())

         while not self._shutdown_event.is_set():
             try:
@@ -163,14 +195,19 @@ class RRQWorker:
                     self.status = "idle (concurrency limit)"
                     # At concurrency limit, wait for tasks to finish or poll delay

-
+                    # Use jittered delay to prevent thundering herd effects
+                    jittered_delay = self._calculate_jittered_delay(
+                        self.settings.default_poll_delay_seconds
+                    )
+                    await asyncio.sleep(jittered_delay)
             except Exception as e:
                 logger.error(
                     f"Worker {self.worker_id} encountered error in main run loop: {e}",
                     exc_info=True,
                 )
-                # Avoid tight loop on persistent errors
-
+                # Avoid tight loop on persistent errors with jittered delay
+                jittered_delay = self._calculate_jittered_delay(1.0)
+                await asyncio.sleep(jittered_delay)

         logger.info(
             f"Worker {self.worker_id} shutdown signal received. Draining tasks..."
@@ -181,6 +218,10 @@ class RRQWorker:
             self._health_check_task.cancel()
             with suppress(asyncio.CancelledError):
                 await self._health_check_task
+        if self._cron_task:
+            self._cron_task.cancel()
+            with suppress(asyncio.CancelledError):
+                await self._cron_task

     async def _poll_for_jobs(self, count: int) -> None:
         """Polls configured queues round-robin and attempts to start processing jobs.
@@ -210,53 +251,65 @@ class RRQWorker:
                     if fetched_count >= count or self._shutdown_event.is_set():
                         break

-                    # Attempt to acquire semaphore *before* trying to process
-                    await self._semaphore.acquire()
                     try:
-                        #
-
-                        if
-
-
-
-
+                        # Try to acquire lock and remove from queue first (without semaphore)
+                        job_acquired = await self._try_acquire_job(job_id, queue_name)
+                        if job_acquired:
+                            # Only acquire semaphore after successfully getting the job
+                            await self._semaphore.acquire()
+                            try:
+                                # Process the job (we already have the lock and removed from queue)
+                                # The semaphore will be released when the job task completes
+                                await self._process_acquired_job(
+                                    job_acquired, queue_name
+                                )
+                                fetched_count += 1
+                            except Exception as e_process:
+                                logger.error(
+                                    f"Worker {self.worker_id} exception processing acquired job {job_id}: {e_process}",
+                                    exc_info=True,
+                                )
+                                # Release lock and semaphore since processing failed
+                                await self.job_store.release_job_lock(job_id)
+                                self._semaphore.release()
+                        # If job_acquired is None, another worker got it - continue to next job
                     except Exception as e_try:
-                        # Catch errors during the
+                        # Catch errors during the job acquisition itself
                         logger.error(
-                            f"Worker {self.worker_id} exception trying to
+                            f"Worker {self.worker_id} exception trying to acquire job {job_id}: {e_try}",
                             exc_info=True,
                         )
-                        self._semaphore.release()  # Ensure semaphore is released on error

             except Exception as e_poll:
                 logger.error(
                     f"Worker {self.worker_id} error polling queue '{queue_name}': {e_poll}",
                     exc_info=True,
                 )
-
+                # Avoid tight loop on polling error with jittered delay
+                jittered_delay = self._calculate_jittered_delay(1.0)
+                await asyncio.sleep(jittered_delay)
         # For burst mode, return number of jobs fetched in this poll
         return fetched_count

-    async def
-    """Attempts to lock
+    async def _try_acquire_job(self, job_id: str, queue_name: str) -> Optional[Job]:
+        """Attempts to atomically lock and remove a job from the queue.

         Args:
-            job_id: The ID of the job to attempt
+            job_id: The ID of the job to attempt acquiring.
             queue_name: The name of the queue the job ID was retrieved from.

         Returns:
-
-            (e.g., lock conflict, job definition not found, already removed).
+            The Job object if successfully acquired, None otherwise.
         """
         logger.debug(
-            f"Worker {self.worker_id} attempting to
+            f"Worker {self.worker_id} attempting to acquire job {job_id} from queue '{queue_name}'"
         )
         job = await self.job_store.get_job_definition(job_id)
         if not job:
             logger.warning(
-                f"Worker {self.worker_id} job definition {job_id} not found during
+                f"Worker {self.worker_id} job definition {job_id} not found during _try_acquire_job from queue {queue_name}."
             )
-            return
+            return None  # Job vanished between poll and fetch?

         # Determine job-specific timeout and calculate lock timeout
         job_timeout = (
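The reordering in this hunk is the point of the change: in 0.3.7 the worker reserved a concurrency slot (the semaphore) before it knew whether it would win the job, while in 0.5.0 it claims the job first and only then takes a slot. A minimal sketch of that claim-then-reserve pattern, with hypothetical names standing in for the worker's internals:

import asyncio
from typing import Awaitable, Callable, Optional

async def claim_then_reserve(
    job_id: str,
    try_acquire: Callable[[str], Awaitable[Optional[object]]],  # stand-in for _try_acquire_job
    semaphore: asyncio.Semaphore,
) -> Optional[object]:
    # Race other workers for the job first; losing costs nothing.
    job = await try_acquire(job_id)
    if job is None:
        return None  # another worker won, and no concurrency slot was held meanwhile
    # Only a confirmed winner consumes one of the limited slots.
    await semaphore.acquire()
    return job  # caller releases the semaphore when the job task completes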
@@ -268,32 +321,28 @@ class RRQWorker:
             job_timeout + self.settings.default_lock_timeout_extension_seconds
         ) * 1000

-        #
-        lock_acquired = await self.job_store.
-            job.id, self.worker_id, int(lock_timeout_ms)
+        # Atomically acquire the processing lock and remove from queue
+        lock_acquired, removed_count = await self.job_store.atomic_lock_and_remove_job(
+            job.id, queue_name, self.worker_id, int(lock_timeout_ms)
         )
-        if not lock_acquired:
-            logger.debug(
-                f"Worker {self.worker_id} failed to acquire lock for job {job.id} (already locked by another worker)."
-            )
-            return False  # Another worker got there first

-
+        if not lock_acquired or removed_count == 0:
+            return None  # Another worker got there first

-        #
-
-
-
-
-        )
-        if removed_count == 0:
-            logger.warning(
-                f"Worker {self.worker_id} job {job.id} was already removed from queue '{queue_name}' after lock acquisition. Releasing lock."
-            )
-            await self.job_store.release_job_lock(job.id)  # Release the acquired lock
-            return False  # Job processed by another worker between our poll and lock
+        # Successfully acquired the job
+        logger.debug(f"Worker {self.worker_id} successfully acquired job {job.id}")
+        return job
+
+    async def _process_acquired_job(self, job: Job, queue_name: str) -> None:
+        """Processes a job that has already been acquired (locked and removed from queue).

-
+        Note: This method assumes the worker has already acquired the concurrency semaphore.
+        The semaphore will be released when the job task completes via _task_cleanup.
+
+        Args:
+            job: The Job object that was successfully acquired.
+            queue_name: The name of the queue the job was retrieved from.
+        """
         try:
             await self.job_store.update_job_status(job.id, JobStatus.ACTIVE)
             logger.debug(
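The store side of atomic_lock_and_remove_job lives in the rrq/store.py changes (+122 lines) and is not expanded in this diff, so its actual implementation is not visible here. One plausible shape for such an operation, shown purely as an illustrative guess with assumed key names rather than rrq's real code, is a single Lua script that takes the lock with SET NX PX and removes the queue member in the same round trip:

# Illustrative sketch only; rrq's JobStore may use different key names,
# arguments, and return values for atomic_lock_and_remove_job.
import redis.asyncio as redis

_LOCK_AND_REMOVE_LUA = """
local locked = redis.call('SET', KEYS[1], ARGV[1], 'NX', 'PX', ARGV[2])
if not locked then
    return {0, 0}
end
local removed = redis.call('ZREM', KEYS[2], ARGV[3])
return {1, removed}
"""

async def lock_and_remove(
    r: redis.Redis, lock_key: str, queue_key: str,
    worker_id: str, lock_ttl_ms: int, job_id: str,
) -> tuple[bool, int]:
    # Both the lock write and the queue removal happen inside one script,
    # so no other worker can observe the job locked but still queued.
    locked, removed = await r.eval(
        _LOCK_AND_REMOVE_LUA, 2, lock_key, queue_key, worker_id, lock_ttl_ms, job_id
    )
    return bool(locked), int(removed)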
@@ -301,21 +350,58 @@
             )

             # Create and track the execution task
+            # The semaphore will be released when this task completes
             task = self._loop.create_task(self._execute_job(job, queue_name))
             self._running_tasks.add(task)
             task.add_done_callback(lambda t: self._task_cleanup(t, self._semaphore))
             logger.info(
                 f"Worker {self.worker_id} started job {job.id} ('{job.function_name}') from queue '{queue_name}'"
             )
-            return True
         except Exception as e_start:
             # Catch errors during status update or task creation
             logger.error(
-                f"Worker {self.worker_id} failed to start task for job {job.id} after
+                f"Worker {self.worker_id} failed to start task for job {job.id} after acquisition: {e_start}",
                 exc_info=True,
             )
-            #
+            # Release the lock since task wasn't started
             await self.job_store.release_job_lock(job.id)
+            raise  # Re-raise to be handled by caller
+
+    async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
+        """Attempts to lock, fetch definition, and start the execution task for a specific job.
+
+        This method is kept for backward compatibility and uses the optimized approach internally.
+        For new code, prefer using _try_acquire_job and _process_acquired_job separately.
+
+        Note: This method handles semaphore acquisition internally for backward compatibility.
+
+        Args:
+            job_id: The ID of the job to attempt processing.
+            queue_name: The name of the queue the job ID was retrieved from.
+
+        Returns:
+            True if the job processing task was successfully started, False otherwise
+            (e.g., lock conflict, job definition not found, already removed).
+        """
+        # Use the optimized approach: acquire job first, then process
+        job_acquired = await self._try_acquire_job(job_id, queue_name)
+        if not job_acquired:
+            return False
+
+        # For backward compatibility, acquire semaphore here since old callers expect it
+        await self._semaphore.acquire()
+        try:
+            # Process the acquired job
+            await self._process_acquired_job(job_acquired, queue_name)
+            return True
+        except Exception as e_process:
+            logger.error(
+                f"Worker {self.worker_id} failed to process acquired job {job_id}: {e_process}",
+                exc_info=True,
+            )
+            # Release semaphore on error since _process_acquired_job doesn't handle it
+            self._semaphore.release()
+            # Lock is already released in _process_acquired_job on error
             return False

     async def _execute_job(self, job: Job, queue_name: str) -> None:
@@ -463,63 +549,54 @@ class RRQWorker:
         appropriate delay (custom or exponential backoff) or moves to DLQ.
         """
         log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
-
-        try:
-            # Atomically increment retries in the store.
-            new_retry_count = await self.job_store.increment_job_retries(job.id)
-            max_retries = (
-                job.max_retries
-            )  # Use max_retries from the job object passed in
-
-            if new_retry_count < max_retries:
-                # Update status and error atomically
-                await self.job_store.redis.hset(
-                    job_key,
-                    mapping={
-                        "status": JobStatus.RETRYING.value,
-                        "last_error": str(exc),
-                    },
-                )
-                logger.debug(f"{log_prefix} status set to RETRYING, error saved.")
-
-                # Determine deferral time
-                defer_seconds = exc.defer_seconds
-                if defer_seconds is None:
-                    # Create a temporary job representation for backoff calculation
-                    # using the *new* retry count.
-                    temp_job_for_backoff = Job(
-                        id=job.id,
-                        function_name=job.function_name,
-                        current_retries=new_retry_count,  # Use updated count
-                        max_retries=max_retries,  # Ensure this is passed
-                    )
-                    defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
-                    defer_seconds = defer_ms / 1000.0
-                else:
-                    logger.debug(
-                        f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
-                    )
+        max_retries = job.max_retries

-
-
-
-
-
-                logger.info(
-                    f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
-                    f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
-                )
-            else:
-                # Max retries exceeded even though RetryJob was raised
+        try:
+            # Check if we would exceed max retries
+            anticipated_retry_count = job.current_retries + 1
+            if anticipated_retry_count >= max_retries:
+                # Max retries exceeded, increment retry count and move directly to DLQ
                 logger.warning(
                     f"{log_prefix} max retries ({max_retries}) exceeded "
-                    f"
+                    f"with RetryJob exception. Moving to DLQ."
                 )
-                #
+                # Increment retry count before moving to DLQ
+                await self.job_store.increment_job_retries(job.id)
                 error_msg = (
                     str(exc) or f"Max retries ({max_retries}) exceeded after RetryJob"
                 )
                 await self._move_to_dlq(job, queue_name, error_msg)
+                return
+
+            # Determine deferral time
+            defer_seconds = exc.defer_seconds
+            if defer_seconds is None:
+                # Create a temporary job representation for backoff calculation
+                temp_job_for_backoff = Job(
+                    id=job.id,
+                    function_name=job.function_name,
+                    current_retries=anticipated_retry_count,  # Use anticipated count
+                    max_retries=max_retries,
+                )
+                defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
+                defer_seconds = defer_ms / 1000.0
+            else:
+                logger.debug(
+                    f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
+                )
+
+            retry_at_score = (time.time() + defer_seconds) * 1000
+            target_queue = job.queue_name or self.settings.default_queue_name
+
+            # Atomically increment retries, update status/error, and re-queue
+            new_retry_count = await self.job_store.atomic_retry_job(
+                job.id, target_queue, retry_at_score, str(exc), JobStatus.RETRYING
+            )
+
+            logger.info(
+                f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
+                f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
+            )
         except Exception as e_handle:
             logger.exception(
                 f"{log_prefix} CRITICAL error during RetryJob processing: {e_handle}"
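atomic_retry_job is likewise a new JobStore method whose body is not shown in this diff; conceptually it folds the old three-step sequence (increment retries, HSET status/last_error, ZADD back onto the queue) into one atomic update that also returns the new retry count. A rough sketch of that idea with redis-py, using assumed key and field names rather than rrq's real ones:

# Sketch only: the hash field names ("current_retries", "status", "last_error")
# are assumptions; the actual JobStore.atomic_retry_job may differ.
import redis.asyncio as redis

async def atomic_retry_job(
    r: redis.Redis, job_key: str, queue_key: str, job_id: str,
    retry_at_score: float, last_error: str, status: str,
) -> int:
    async with r.pipeline(transaction=True) as pipe:
        # All three writes are queued and executed as one MULTI/EXEC block.
        pipe.hincrby(job_key, "current_retries", 1)
        pipe.hset(job_key, mapping={"status": status, "last_error": last_error})
        pipe.zadd(queue_key, {job_id: retry_at_score})
        new_retry_count, _, _ = await pipe.execute()
    return int(new_retry_count)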
@@ -537,48 +614,43 @@ class RRQWorker:
         logger.debug(f"{log_prefix} processing general failure: {type(exc).__name__}")

         try:
-            new_retry_count = await self.job_store.increment_job_retries(job.id)
-            # Re-fetch job state after incrementing retries might be safer if fields changed?
-            # For now, assume the job object passed in is mostly accurate except for retry count.
-            # Use max_retries from the job object passed in.
             max_retries = job.max_retries
             last_error_str = str(exc)

-            if
-
-
-
-                        id=job.id,
-                        function_name=job.function_name,
-                        current_retries=new_retry_count,
-                        max_retries=max_retries,
-                    )
-                )
-                retry_at_score = (time.time() * 1000) + defer_ms
-                target_queue = job.queue_name or self.settings.default_queue_name
-
-                # Atomically update status/error and re-add to queue (if possible, else separate)
-                # For now, separate HSET and ZADD
-                await self.job_store.redis.hset(
-                    f"{JOB_KEY_PREFIX}{job.id}",
-                    mapping={
-                        "status": JobStatus.RETRYING.value,
-                        "last_error": last_error_str,
-                    },
-                )
-                await self.job_store.add_job_to_queue(
-                    target_queue, job.id, retry_at_score
-                )
-                logger.info(
-                    f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
-                    f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
-                )
-            else:  # Max retries reached
+            # Check if we would exceed max retries
+            anticipated_retry_count = job.current_retries + 1
+            if anticipated_retry_count >= max_retries:
+                # Max retries exceeded, increment retry count and move directly to DLQ
                 logger.warning(
                     f"{log_prefix} failed after max retries ({max_retries}). Moving to DLQ. Error: {str(exc)[:100]}..."
                 )
+                # Increment retry count before moving to DLQ
+                await self.job_store.increment_job_retries(job.id)
                 # _move_to_dlq handles setting FAILED status, completion time, and last error.
                 await self._move_to_dlq(job, queue_name, last_error_str)
+                return
+
+            # Calculate backoff delay using anticipated retry count
+            defer_ms = self._calculate_backoff_ms(
+                Job(
+                    id=job.id,
+                    function_name=job.function_name,
+                    current_retries=anticipated_retry_count,  # Use anticipated count
+                    max_retries=max_retries,
+                )
+            )
+            retry_at_score = (time.time() * 1000) + defer_ms
+            target_queue = job.queue_name or self.settings.default_queue_name
+
+            # Atomically increment retries, update status/error, and re-queue
+            new_retry_count = await self.job_store.atomic_retry_job(
+                job.id, target_queue, retry_at_score, last_error_str, JobStatus.RETRYING
+            )
+
+            logger.info(
+                f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
+                f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
+            )

         except Exception as e_handle:
             logger.exception(
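Both failure paths now decide the DLQ cutoff from the anticipated count (current_retries + 1) computed locally, before any Redis write; the store is only asked to increment once the outcome is known. With max_retries = 3 the check plays out like this:

# Worked example of the cutoff check used above (max_retries = 3).
MAX_RETRIES = 3
for current_retries in (0, 1, 2):
    anticipated = current_retries + 1
    if anticipated >= MAX_RETRIES:
        print(f"current={current_retries}: move to DLQ")
    else:
        print(f"current={current_retries}: re-queue as attempt {anticipated}/{MAX_RETRIES}")
# current=0: re-queue as attempt 1/3
# current=1: re-queue as attempt 2/3
# current=2: move to DLQ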
@@ -781,6 +853,39 @@ class RRQWorker:

         logger.debug(f"Worker {self.worker_id} heartbeat loop finished.")

+    async def _maybe_enqueue_cron_jobs(self) -> None:
+        """Enqueue cron jobs that are due to run."""
+        now = datetime.now(UTC)
+        for cj in self.cron_jobs:
+            if cj.due(now):
+                unique_key = f"cron:{cj.function_name}" if cj.unique else None
+                try:
+                    await self.client.enqueue(
+                        cj.function_name,
+                        *cj.args,
+                        _queue_name=cj.queue_name,
+                        _unique_key=unique_key,
+                        **cj.kwargs,
+                    )
+                finally:
+                    cj.schedule_next(now)
+
+    async def _cron_loop(self) -> None:
+        logger.debug(f"Worker {self.worker_id} starting cron loop.")
+        while not self._shutdown_event.is_set():
+            try:
+                await self._maybe_enqueue_cron_jobs()
+            except Exception as e:
+                logger.error(
+                    f"Worker {self.worker_id} error running cron jobs: {e}",
+                    exc_info=True,
+                )
+            try:
+                await asyncio.wait_for(self._shutdown_event.wait(), timeout=30)
+            except TimeoutError:
+                pass
+        logger.debug(f"Worker {self.worker_id} cron loop finished.")
+
     async def _close_resources(self) -> None:
         """Closes the worker's resources, primarily the JobStore connection."""
         logger.info(f"Worker {self.worker_id} closing resources...")