rrq 0.4.0__py3-none-any.whl → 0.7.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rrq/cli.py +340 -91
- rrq/cli_commands/__init__.py +1 -0
- rrq/cli_commands/base.py +102 -0
- rrq/cli_commands/commands/__init__.py +1 -0
- rrq/cli_commands/commands/debug.py +551 -0
- rrq/cli_commands/commands/dlq.py +853 -0
- rrq/cli_commands/commands/jobs.py +516 -0
- rrq/cli_commands/commands/monitor.py +776 -0
- rrq/cli_commands/commands/queues.py +539 -0
- rrq/cli_commands/utils.py +161 -0
- rrq/client.py +39 -35
- rrq/constants.py +10 -0
- rrq/cron.py +75 -15
- rrq/hooks.py +217 -0
- rrq/job.py +5 -5
- rrq/registry.py +0 -3
- rrq/settings.py +13 -1
- rrq/store.py +333 -55
- rrq/worker.py +199 -139
- {rrq-0.4.0.dist-info → rrq-0.7.0.dist-info}/METADATA +208 -24
- rrq-0.7.0.dist-info/RECORD +26 -0
- rrq-0.4.0.dist-info/RECORD +0 -16
- {rrq-0.4.0.dist-info → rrq-0.7.0.dist-info}/WHEEL +0 -0
- {rrq-0.4.0.dist-info → rrq-0.7.0.dist-info}/entry_points.txt +0 -0
- {rrq-0.4.0.dist-info → rrq-0.7.0.dist-info}/licenses/LICENSE +0 -0
rrq/worker.py
CHANGED
@@ -7,11 +7,12 @@ import asyncio
 # Use standard logging instead of custom one if appropriate
 import logging
 import os
+import random
 import signal
 import time
 import uuid
 from contextlib import suppress
-from datetime import
+from datetime import timezone, datetime
 from typing import (
     Any,
     Optional,
@@ -21,7 +22,6 @@ from rrq.client import RRQClient
 
 from .constants import (
     DEFAULT_WORKER_ID_PREFIX,
-    JOB_KEY_PREFIX,
 )
 from .exc import RetryJob
 from .job import Job, JobStatus
@@ -91,6 +91,30 @@ class RRQWorker:
             f"Initializing RRQWorker {self.worker_id} for queues: {self.queues}"
         )
 
+    def _calculate_jittered_delay(
+        self, base_delay: float, jitter_factor: float = 0.5
+    ) -> float:
+        """Calculate a jittered delay to prevent thundering herd effects.
+
+        Args:
+            base_delay: The base delay in seconds.
+            jitter_factor: Factor for jitter (0.0 to 1.0). Default 0.5 means ±50% jitter.
+
+        Returns:
+            The jittered delay in seconds.
+        """
+        # Clamp jitter_factor to safe range to prevent negative delays
+        jitter_factor = max(0.0, min(jitter_factor, 0.99))
+
+        # Calculate jitter range: base_delay * (1 ± jitter_factor)
+        min_delay = base_delay * (1 - jitter_factor)
+        max_delay = base_delay * (1 + jitter_factor)
+
+        # Ensure min_delay is always positive
+        min_delay = max(0.001, min_delay)
+
+        return random.uniform(min_delay, max_delay)
+
     async def _call_startup_hook(self) -> None:
         if self.settings.on_startup:
             logger.info(f"Worker {self.worker_id} calling on_startup hook...")
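Note: the jitter math above is self-contained and easy to sanity-check outside the worker. A minimal standalone sketch (the function name below is illustrative, not part of rrq's API) reproduces the bounds for the default ±50% factor:

```python
import random


def jittered_delay(base_delay: float, jitter_factor: float = 0.5) -> float:
    # Same clamping idea as _calculate_jittered_delay: keep the factor in
    # [0.0, 0.99] so the lower bound can never go negative.
    jitter_factor = max(0.0, min(jitter_factor, 0.99))
    low = max(0.001, base_delay * (1 - jitter_factor))
    high = base_delay * (1 + jitter_factor)
    return random.uniform(low, high)


# With a 2.0s base poll delay every sleep lands somewhere in [1.0s, 3.0s],
# so a fleet of idle workers stops waking up in lockstep.
samples = [jittered_delay(2.0) for _ in range(1_000)]
assert all(1.0 <= s <= 3.0 for s in samples)
```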
@@ -171,14 +195,19 @@ class RRQWorker:
                 self.status = "idle (concurrency limit)"
                 # At concurrency limit, wait for tasks to finish or poll delay
 
-
+                # Use jittered delay to prevent thundering herd effects
+                jittered_delay = self._calculate_jittered_delay(
+                    self.settings.default_poll_delay_seconds
+                )
+                await asyncio.sleep(jittered_delay)
             except Exception as e:
                 logger.error(
                     f"Worker {self.worker_id} encountered error in main run loop: {e}",
                     exc_info=True,
                 )
-                # Avoid tight loop on persistent errors
-
+                # Avoid tight loop on persistent errors with jittered delay
+                jittered_delay = self._calculate_jittered_delay(1.0)
+                await asyncio.sleep(jittered_delay)
 
         logger.info(
             f"Worker {self.worker_id} shutdown signal received. Draining tasks..."
@@ -222,53 +251,65 @@ class RRQWorker:
                 if fetched_count >= count or self._shutdown_event.is_set():
                     break
 
-                # Attempt to acquire semaphore *before* trying to process
-                await self._semaphore.acquire()
                 try:
-                    #
-
-                    if
-
-
-
-
+                    # Try to acquire lock and remove from queue first (without semaphore)
+                    job_acquired = await self._try_acquire_job(job_id, queue_name)
+                    if job_acquired:
+                        # Only acquire semaphore after successfully getting the job
+                        await self._semaphore.acquire()
+                        try:
+                            # Process the job (we already have the lock and removed from queue)
+                            # The semaphore will be released when the job task completes
+                            await self._process_acquired_job(
+                                job_acquired, queue_name
+                            )
+                            fetched_count += 1
+                        except Exception as e_process:
+                            logger.error(
+                                f"Worker {self.worker_id} exception processing acquired job {job_id}: {e_process}",
+                                exc_info=True,
+                            )
+                            # Release lock and semaphore since processing failed
+                            await self.job_store.release_job_lock(job_id)
+                            self._semaphore.release()
+                    # If job_acquired is None, another worker got it - continue to next job
                 except Exception as e_try:
-                    # Catch errors during the
+                    # Catch errors during the job acquisition itself
                     logger.error(
-                        f"Worker {self.worker_id} exception trying to
+                        f"Worker {self.worker_id} exception trying to acquire job {job_id}: {e_try}",
                         exc_info=True,
                     )
-                    self._semaphore.release()  # Ensure semaphore is released on error
 
         except Exception as e_poll:
             logger.error(
                 f"Worker {self.worker_id} error polling queue '{queue_name}': {e_poll}",
                 exc_info=True,
             )
-
+            # Avoid tight loop on polling error with jittered delay
+            jittered_delay = self._calculate_jittered_delay(1.0)
+            await asyncio.sleep(jittered_delay)
         # For burst mode, return number of jobs fetched in this poll
         return fetched_count
 
-    async def
-        """Attempts to lock
+    async def _try_acquire_job(self, job_id: str, queue_name: str) -> Optional[Job]:
+        """Attempts to atomically lock and remove a job from the queue.
 
         Args:
-            job_id: The ID of the job to attempt
+            job_id: The ID of the job to attempt acquiring.
             queue_name: The name of the queue the job ID was retrieved from.
 
         Returns:
-
-            (e.g., lock conflict, job definition not found, already removed).
+            The Job object if successfully acquired, None otherwise.
         """
         logger.debug(
-            f"Worker {self.worker_id} attempting to
+            f"Worker {self.worker_id} attempting to acquire job {job_id} from queue '{queue_name}'"
         )
         job = await self.job_store.get_job_definition(job_id)
         if not job:
             logger.warning(
-                f"Worker {self.worker_id} job definition {job_id} not found during
+                f"Worker {self.worker_id} job definition {job_id} not found during _try_acquire_job from queue {queue_name}."
             )
-            return
+            return None  # Job vanished between poll and fetch?
 
         # Determine job-specific timeout and calculate lock timeout
         job_timeout = (
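Note: the key change in this hunk is ordering. The worker now claims the job (lock plus queue removal) before it consumes a concurrency slot, so a lost race no longer ties up the semaphore. A minimal sketch of that ordering, with hypothetical `try_acquire`/`process` callables standing in for the worker's methods:

```python
import asyncio
from typing import Awaitable, Callable, Optional


async def poll_once(
    job_ids: list[str],
    semaphore: asyncio.Semaphore,
    try_acquire: Callable[[str], Awaitable[Optional[object]]],  # stand-in for _try_acquire_job
    process: Callable[[object], Awaitable[None]],               # stand-in for _process_acquired_job
) -> int:
    """Claim each job first; only a successful claim consumes a concurrency slot."""
    started = 0
    for job_id in job_ids:
        job = await try_acquire(job_id)
        if job is None:
            continue  # another worker won the race; no semaphore slot was wasted
        await semaphore.acquire()
        try:
            await process(job)  # the real worker releases the slot in the task's done-callback
            started += 1
        except Exception:
            semaphore.release()  # give the slot back if the task never started
            raise
    return started
```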
@@ -280,32 +321,28 @@ class RRQWorker:
            job_timeout + self.settings.default_lock_timeout_extension_seconds
        ) * 1000
 
-        #
-        lock_acquired = await self.job_store.
-            job.id, self.worker_id, int(lock_timeout_ms)
+        # Atomically acquire the processing lock and remove from queue
+        lock_acquired, removed_count = await self.job_store.atomic_lock_and_remove_job(
+            job.id, queue_name, self.worker_id, int(lock_timeout_ms)
        )
-        if not lock_acquired:
-            logger.debug(
-                f"Worker {self.worker_id} failed to acquire lock for job {job.id} (already locked by another worker)."
-            )
-            return False  # Another worker got there first
 
-
+        if not lock_acquired or removed_count == 0:
+            return None  # Another worker got there first
 
-        #
-
-
-
-
-        )
-
-
-
-        )
-        await self.job_store.release_job_lock(job.id)  # Release the acquired lock
-        return False  # Job processed by another worker between our poll and lock
+        # Successfully acquired the job
+        logger.debug(f"Worker {self.worker_id} successfully acquired job {job.id}")
+        return job
+
+    async def _process_acquired_job(self, job: Job, queue_name: str) -> None:
+        """Processes a job that has already been acquired (locked and removed from queue).
+
+        Note: This method assumes the worker has already acquired the concurrency semaphore.
+        The semaphore will be released when the job task completes via _task_cleanup.
 
-
+        Args:
+            job: The Job object that was successfully acquired.
+            queue_name: The name of the queue the job was retrieved from.
+        """
        try:
            await self.job_store.update_job_status(job.id, JobStatus.ACTIVE)
            logger.debug(
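Note: `atomic_lock_and_remove_job` (added to the store in this release) folds the old lock-then-remove sequence into one round trip, closing the window where a job could be locked but left on the queue. Its implementation lives in rrq/store.py and is not shown here; one plausible shape, sketched as a Redis Lua script with assumed key names and return convention, is:

```python
# Illustrative sketch only: the key layout, argument order, and the
# (lock_acquired, removed_count) return shape are assumptions, not rrq's real code.
ATOMIC_LOCK_AND_REMOVE = """
local locked = redis.call('SET', KEYS[1], ARGV[1], 'NX', 'PX', ARGV[2])
if not locked then
    return {0, 0}                      -- another worker already holds the lock
end
local removed = redis.call('ZREM', KEYS[2], ARGV[3])
if removed == 0 then
    redis.call('DEL', KEYS[1])         -- job already left the queue; release the lock
    return {1, 0}
end
return {1, 1}
"""

# Usage sketch with redis.asyncio (register_script is part of redis-py):
# script = redis_client.register_script(ATOMIC_LOCK_AND_REMOVE)
# lock_acquired, removed_count = await script(
#     keys=[f"rrq:lock:{job_id}", f"rrq:queue:{queue_name}"],
#     args=[worker_id, lock_timeout_ms, job_id],
# )
```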
@@ -313,21 +350,58 @@ class RRQWorker:
             )
 
             # Create and track the execution task
+            # The semaphore will be released when this task completes
             task = self._loop.create_task(self._execute_job(job, queue_name))
             self._running_tasks.add(task)
             task.add_done_callback(lambda t: self._task_cleanup(t, self._semaphore))
             logger.info(
                 f"Worker {self.worker_id} started job {job.id} ('{job.function_name}') from queue '{queue_name}'"
             )
-            return True
         except Exception as e_start:
             # Catch errors during status update or task creation
             logger.error(
-                f"Worker {self.worker_id} failed to start task for job {job.id} after
+                f"Worker {self.worker_id} failed to start task for job {job.id} after acquisition: {e_start}",
                 exc_info=True,
             )
-            #
+            # Release the lock since task wasn't started
             await self.job_store.release_job_lock(job.id)
+            raise  # Re-raise to be handled by caller
+
+    async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
+        """Attempts to lock, fetch definition, and start the execution task for a specific job.
+
+        This method is kept for backward compatibility and uses the optimized approach internally.
+        For new code, prefer using _try_acquire_job and _process_acquired_job separately.
+
+        Note: This method handles semaphore acquisition internally for backward compatibility.
+
+        Args:
+            job_id: The ID of the job to attempt processing.
+            queue_name: The name of the queue the job ID was retrieved from.
+
+        Returns:
+            True if the job processing task was successfully started, False otherwise
+            (e.g., lock conflict, job definition not found, already removed).
+        """
+        # Use the optimized approach: acquire job first, then process
+        job_acquired = await self._try_acquire_job(job_id, queue_name)
+        if not job_acquired:
+            return False
+
+        # For backward compatibility, acquire semaphore here since old callers expect it
+        await self._semaphore.acquire()
+        try:
+            # Process the acquired job
+            await self._process_acquired_job(job_acquired, queue_name)
+            return True
+        except Exception as e_process:
+            logger.error(
+                f"Worker {self.worker_id} failed to process acquired job {job_id}: {e_process}",
+                exc_info=True,
+            )
+            # Release semaphore on error since _process_acquired_job doesn't handle it
+            self._semaphore.release()
+            # Lock is already released in _process_acquired_job on error
             return False
 
     async def _execute_job(self, job: Job, queue_name: str) -> None:
@@ -475,63 +549,54 @@ class RRQWorker:
         appropriate delay (custom or exponential backoff) or moves to DLQ.
         """
         log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
-
-        try:
-            # Atomically increment retries in the store.
-            new_retry_count = await self.job_store.increment_job_retries(job.id)
-            max_retries = (
-                job.max_retries
-            )  # Use max_retries from the job object passed in
-
-            if new_retry_count < max_retries:
-                # Update status and error atomically
-                await self.job_store.redis.hset(
-                    job_key,
-                    mapping={
-                        "status": JobStatus.RETRYING.value,
-                        "last_error": str(exc),
-                    },
-                )
-                logger.debug(f"{log_prefix} status set to RETRYING, error saved.")
-
-                # Determine deferral time
-                defer_seconds = exc.defer_seconds
-                if defer_seconds is None:
-                    # Create a temporary job representation for backoff calculation
-                    # using the *new* retry count.
-                    temp_job_for_backoff = Job(
-                        id=job.id,
-                        function_name=job.function_name,
-                        current_retries=new_retry_count,  # Use updated count
-                        max_retries=max_retries,  # Ensure this is passed
-                    )
-                    defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
-                    defer_seconds = defer_ms / 1000.0
-                else:
-                    logger.debug(
-                        f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
-                    )
+        max_retries = job.max_retries
 
-
-
-
-
-
-                logger.info(
-                    f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
-                    f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
-                )
-            else:
-                # Max retries exceeded even though RetryJob was raised
+        try:
+            # Check if we would exceed max retries
+            anticipated_retry_count = job.current_retries + 1
+            if anticipated_retry_count >= max_retries:
+                # Max retries exceeded, increment retry count and move directly to DLQ
                 logger.warning(
                     f"{log_prefix} max retries ({max_retries}) exceeded "
-                    f"
+                    f"with RetryJob exception. Moving to DLQ."
                 )
-                #
+                # Increment retry count before moving to DLQ
+                await self.job_store.increment_job_retries(job.id)
                 error_msg = (
                     str(exc) or f"Max retries ({max_retries}) exceeded after RetryJob"
                 )
                 await self._move_to_dlq(job, queue_name, error_msg)
+                return
+
+            # Determine deferral time
+            defer_seconds = exc.defer_seconds
+            if defer_seconds is None:
+                # Create a temporary job representation for backoff calculation
+                temp_job_for_backoff = Job(
+                    id=job.id,
+                    function_name=job.function_name,
+                    current_retries=anticipated_retry_count,  # Use anticipated count
+                    max_retries=max_retries,
+                )
+                defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
+                defer_seconds = defer_ms / 1000.0
+            else:
+                logger.debug(
+                    f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
+                )
+
+            retry_at_score = (time.time() + defer_seconds) * 1000
+            target_queue = job.queue_name or self.settings.default_queue_name
+
+            # Atomically increment retries, update status/error, and re-queue
+            new_retry_count = await self.job_store.atomic_retry_job(
+                job.id, target_queue, retry_at_score, str(exc), JobStatus.RETRYING
+            )
+
+            logger.info(
+                f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
+                f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
+            )
         except Exception as e_handle:
             logger.exception(
                 f"{log_prefix} CRITICAL error during RetryJob processing: {e_handle}"
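Note: both failure paths now decide a job's fate from an anticipated retry count (`current_retries + 1`) before touching Redis, which is what lets the increment and the re-queue collapse into one atomic call further down. A small sketch of that decision paired with a conventional exponential backoff (`_calculate_backoff_ms` itself is not part of this diff, so the base and cap below are made-up values):

```python
def backoff_ms(retry_count: int, base_ms: int = 1_000, cap_ms: int = 60_000) -> int:
    # Conventional exponential backoff: 1s, 2s, 4s, ... capped at 60s.
    return min(cap_ms, base_ms * (2 ** max(0, retry_count - 1)))


def decide(current_retries: int, max_retries: int) -> tuple[str, int]:
    """Return ('dlq', 0) or ('retry', defer_ms) using the anticipated count."""
    anticipated = current_retries + 1
    if anticipated >= max_retries:
        return ("dlq", 0)
    return ("retry", backoff_ms(anticipated))


assert decide(0, 3) == ("retry", 1_000)  # first failure: retry in ~1s
assert decide(1, 3) == ("retry", 2_000)  # second failure: ~2s
assert decide(2, 3) == ("dlq", 0)        # third failure: max reached, dead-letter it
```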
@@ -549,48 +614,43 @@ class RRQWorker:
         logger.debug(f"{log_prefix} processing general failure: {type(exc).__name__}")
 
         try:
-            new_retry_count = await self.job_store.increment_job_retries(job.id)
-            # Re-fetch job state after incrementing retries might be safer if fields changed?
-            # For now, assume the job object passed in is mostly accurate except for retry count.
-            # Use max_retries from the job object passed in.
             max_retries = job.max_retries
             last_error_str = str(exc)
 
-            if
-
-
-
-                    id=job.id,
-                    function_name=job.function_name,
-                    current_retries=new_retry_count,
-                    max_retries=max_retries,
-                )
-                )
-                retry_at_score = (time.time() * 1000) + defer_ms
-                target_queue = job.queue_name or self.settings.default_queue_name
-
-                # Atomically update status/error and re-add to queue (if possible, else separate)
-                # For now, separate HSET and ZADD
-                await self.job_store.redis.hset(
-                    f"{JOB_KEY_PREFIX}{job.id}",
-                    mapping={
-                        "status": JobStatus.RETRYING.value,
-                        "last_error": last_error_str,
-                    },
-                )
-                await self.job_store.add_job_to_queue(
-                    target_queue, job.id, retry_at_score
-                )
-                logger.info(
-                    f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
-                    f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
-                )
-            else:  # Max retries reached
+            # Check if we would exceed max retries
+            anticipated_retry_count = job.current_retries + 1
+            if anticipated_retry_count >= max_retries:
+                # Max retries exceeded, increment retry count and move directly to DLQ
                 logger.warning(
                     f"{log_prefix} failed after max retries ({max_retries}). Moving to DLQ. Error: {str(exc)[:100]}..."
                 )
+                # Increment retry count before moving to DLQ
+                await self.job_store.increment_job_retries(job.id)
                 # _move_to_dlq handles setting FAILED status, completion time, and last error.
                 await self._move_to_dlq(job, queue_name, last_error_str)
+                return
+
+            # Calculate backoff delay using anticipated retry count
+            defer_ms = self._calculate_backoff_ms(
+                Job(
+                    id=job.id,
+                    function_name=job.function_name,
+                    current_retries=anticipated_retry_count,  # Use anticipated count
+                    max_retries=max_retries,
+                )
+            )
+            retry_at_score = (time.time() * 1000) + defer_ms
+            target_queue = job.queue_name or self.settings.default_queue_name
+
+            # Atomically increment retries, update status/error, and re-queue
+            new_retry_count = await self.job_store.atomic_retry_job(
+                job.id, target_queue, retry_at_score, last_error_str, JobStatus.RETRYING
+            )
+
+            logger.info(
+                f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
+                f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
+            )
 
         except Exception as e_handle:
             logger.exception(
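Note: `atomic_retry_job` replaces the old separate HSET and ZADD (and earlier increment) calls, so a worker crash between them can no longer leave a job marked RETRYING but absent from its queue. As with the lock script above, the real implementation is in rrq/store.py; a plausible single-script version with assumed key and field names could look like:

```python
# Illustrative sketch only: field names, key layout, and the returned retry count
# are assumptions about what the store might do, not its actual code.
ATOMIC_RETRY = """
local new_retries = redis.call('HINCRBY', KEYS[1], 'current_retries', 1)
redis.call('HSET', KEYS[1], 'status', ARGV[1], 'last_error', ARGV[2])
redis.call('ZADD', KEYS[2], ARGV[3], ARGV[4])
return new_retries
"""

# script = redis_client.register_script(ATOMIC_RETRY)
# new_retry_count = await script(
#     keys=[f"rrq:job:{job_id}", f"rrq:queue:{target_queue}"],
#     args=[JobStatus.RETRYING.value, last_error_str, retry_at_score, job_id],
# )
```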
@@ -601,7 +661,7 @@ class RRQWorker:
         """Moves a job to the Dead Letter Queue (DLQ) and releases its unique lock if present."""
 
         dlq_name = self.settings.default_dlq_name  # Or derive from original queue_name
-        completion_time = datetime.now(
+        completion_time = datetime.now(timezone.utc)
         try:
             await self.job_store.move_job_to_dlq(
                 job_id=job.id,
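Note: this hunk and the remaining ones all make the same fix: `datetime.now()` calls gain an explicit `timezone.utc`, so every timestamp the worker writes is offset-aware. A short illustration of why that matters:

```python
from datetime import datetime, timezone

aware = datetime.now(timezone.utc)   # what the worker now stores
naive = datetime.now()               # no tzinfo attached

print(aware.isoformat())             # ends with '+00:00', unambiguous across hosts
# aware - naive                      # would raise TypeError: can't mix aware and naive
print(aware - datetime.fromtimestamp(0, tz=timezone.utc))  # aware arithmetic stays well-defined
```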
@@ -749,7 +809,7 @@ class RRQWorker:
         try:
             health_data = {
                 "worker_id": self.worker_id,
-                "timestamp": datetime.now(
+                "timestamp": datetime.now(timezone.utc).isoformat(),
                 "status": self.status,
                 "active_jobs": len(self._running_tasks),
                 "concurrency_limit": self.settings.worker_concurrency,
@@ -795,7 +855,7 @@ class RRQWorker:
 
     async def _maybe_enqueue_cron_jobs(self) -> None:
         """Enqueue cron jobs that are due to run."""
-        now = datetime.now(
+        now = datetime.now(timezone.utc)
         for cj in self.cron_jobs:
             if cj.due(now):
                 unique_key = f"cron:{cj.function_name}" if cj.unique else None
@@ -914,7 +974,7 @@ class RRQWorker:
             )
             try:
                 job.status = JobStatus.PENDING
-                job.next_scheduled_run_time = datetime.now(
+                job.next_scheduled_run_time = datetime.now(timezone.utc)  # Re-queue immediately
                 job.last_error = "Job execution interrupted by worker shutdown. Re-queued."
                 # Do not increment retries for shutdown interruption
 
@@ -935,7 +995,7 @@ class RRQWorker:
                     job.id,
                     self.settings.default_dlq_name,
                     f"Failed to re-queue during cancellation: {e_requeue}",
-                    datetime.now(
+                    datetime.now(timezone.utc),
                 )
                 logger.info(
                     f"Successfully moved job {job.id} to DLQ due to re-queueing failure."