rrq 0.2.5__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
rrq/worker.py ADDED
"""This module defines the RRQWorker class, the core component responsible for
processing jobs from the Reliable Redis Queue (RRQ) system.
"""

import asyncio

# Use standard logging instead of custom one if appropriate
import logging
import os
import signal
import time
import uuid
from contextlib import suppress
from datetime import UTC, datetime
from typing import (
    Any,
    Optional,
)

from rrq.client import RRQClient

from .constants import (
    DEFAULT_WORKER_ID_PREFIX,
    JOB_KEY_PREFIX,
)
from .exc import RetryJob
from .job import Job, JobStatus
from .registry import JobRegistry
from .settings import RRQSettings
from .store import JobStore

logger = logging.getLogger(__name__)


class RRQWorker:
    """An asynchronous worker process for the RRQ system.

    Polls specified queues for ready jobs, acquires locks, executes job handlers,
    manages job lifecycle states (success, failure, retry, timeout, DLQ),
    handles graceful shutdown, and reports health status.
    """

    SIGNALS = (signal.SIGINT, signal.SIGTERM)

    def __init__(
        self,
        settings: RRQSettings,
        job_registry: JobRegistry,
        queues: Optional[list[str]] = None,
        worker_id: Optional[str] = None,
    ):
        """Initializes the RRQWorker.

        Args:
            settings: The RRQSettings instance for configuration.
            job_registry: The JobRegistry containing the handler functions.
            queues: A list of queue names (without prefix) to poll.
                If None, defaults to `settings.default_queue_name`.
            worker_id: A unique identifier for this worker instance.
                If None, one is generated automatically.
        """
        self.settings = settings
        self.job_registry = job_registry
        self.queues = (
            queues if queues is not None else [self.settings.default_queue_name]
        )
        if not self.queues:
            raise ValueError("Worker must be configured with at least one queue.")

        self.job_store = JobStore(settings=settings)
        self.client = RRQClient(settings=settings, job_store=self.job_store)
        self.worker_id = (
            worker_id
            or f"{DEFAULT_WORKER_ID_PREFIX}{os.getpid()}_{uuid.uuid4().hex[:6]}"
        )

        self._semaphore = asyncio.Semaphore(self.settings.worker_concurrency)
        self._running_tasks: set[asyncio.Task] = set()
        self._shutdown_event = asyncio.Event()
        self._loop = None  # Will be set in run()
        self._health_check_task: Optional[asyncio.Task] = None
        self.status: str = "initializing"  # Worker status (e.g., initializing, running, polling, idle, stopped)
        logger.info(
            f"Initializing RRQWorker {self.worker_id} for queues: {self.queues}"
        )

    async def _call_startup_hook(self) -> None:
        if self.settings.on_startup:
            logger.info(f"Worker {self.worker_id} calling on_startup hook...")
            try:
                await self.settings.on_startup()
                logger.info(f"Worker {self.worker_id} on_startup hook completed.")
            except Exception as e:
                logger.error(
                    f"Worker {self.worker_id} error during on_startup hook: {e}",
                    exc_info=True,
                )
                raise

    async def _call_shutdown_hook(self) -> None:
        if self.settings.on_shutdown:
            logger.info(f"Worker {self.worker_id} calling on_shutdown hook...")
            try:
                await self.settings.on_shutdown()
                logger.info(f"Worker {self.worker_id} on_shutdown hook completed.")
            except Exception as e:
                logger.error(
                    f"Worker {self.worker_id} error during on_shutdown hook: {e}",
                    exc_info=True,
                )

    async def run(self) -> None:
        logger.info(f"RRQWorker {self.worker_id} starting.")
        self.status = "running"
        self._loop = asyncio.get_running_loop()
        self._setup_signal_handlers()
        try:
            await self._call_startup_hook()
            await self._run_loop()
        except asyncio.CancelledError:
            logger.info(f"Worker {self.worker_id} run cancelled.")
        finally:
            logger.info(f"Worker {self.worker_id} shutting down cleanly.")
            await self._call_shutdown_hook()
            self.status = "stopped"
            logger.info(f"Worker {self.worker_id} stopped.")

    async def _run_loop(self) -> None:
        """The main asynchronous execution loop for the worker.

        Continuously polls queues for jobs, manages concurrency, and handles shutdown.
        """
        logger.info(f"Worker {self.worker_id} starting run loop.")
        self._health_check_task = self._loop.create_task(self._heartbeat_loop())

        while not self._shutdown_event.is_set():
            try:
                jobs_to_fetch = self.settings.worker_concurrency - len(
                    self._running_tasks
                )
                if jobs_to_fetch > 0:
                    if self.status != "polling":
                        logger.debug(
                            f"Worker {self.worker_id} polling for up to {jobs_to_fetch} jobs..."
                        )
                    self.status = "polling"
                    await self._poll_for_jobs(jobs_to_fetch)
                else:
                    if self.status != "idle (concurrency limit)":
                        logger.debug(
                            f"Worker {self.worker_id} at concurrency limit ({self.settings.worker_concurrency}). Waiting..."
                        )
                    self.status = "idle (concurrency limit)"
                    # At concurrency limit, wait for tasks to finish or poll delay

                await asyncio.sleep(self.settings.default_poll_delay_seconds)
            except Exception as e:
                logger.error(
                    f"Worker {self.worker_id} encountered error in main run loop: {e}",
                    exc_info=True,
                )
                # Avoid tight loop on persistent errors
                await asyncio.sleep(1)

        logger.info(
            f"Worker {self.worker_id} shutdown signal received. Draining tasks..."
        )
        await self._drain_tasks()
        logger.info(f"Worker {self.worker_id} task drain complete.")
        if self._health_check_task:
            self._health_check_task.cancel()
            with suppress(asyncio.CancelledError):
                await self._health_check_task

    async def _poll_for_jobs(self, count: int) -> None:
        """Polls configured queues round-robin and attempts to start processing jobs.

        Args:
            count: The maximum number of jobs to attempt to start in this poll cycle.
        """
        fetched_count = 0
        # Simple round-robin polling for now
        # TODO: Add queue prioritization logic if needed.
        for queue_name in self.queues:
            if fetched_count >= count or self._shutdown_event.is_set():
                break

            try:
                ready_job_ids = await self.job_store.get_ready_job_ids(
                    queue_name, count - fetched_count
                )
                if not ready_job_ids:
                    continue  # No jobs ready in this queue

                logger.debug(
                    f"Worker {self.worker_id} found {len(ready_job_ids)} ready jobs in queue '{queue_name}'."
                )

                for job_id in ready_job_ids:
                    if fetched_count >= count or self._shutdown_event.is_set():
                        break

                    # Attempt to acquire semaphore *before* trying to process
                    await self._semaphore.acquire()
                    try:
                        # _try_process_job handles lock acquisition, fetching, task creation
                        job_started = await self._try_process_job(job_id, queue_name)
                        if job_started:
                            fetched_count += 1
                        else:
                            # If job wasn't started (e.g., lock conflict), release semaphore immediately
                            self._semaphore.release()
                    except Exception as e_try:
                        # Catch errors during the _try_process_job itself
                        logger.error(
                            f"Worker {self.worker_id} exception trying to process job {job_id}: {e_try}",
                            exc_info=True,
                        )
                        self._semaphore.release()  # Ensure semaphore is released on error

            except Exception as e_poll:
                logger.error(
                    f"Worker {self.worker_id} error polling queue '{queue_name}': {e_poll}",
                    exc_info=True,
                )
                await asyncio.sleep(1)  # Avoid tight loop on polling error

    async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
        """Attempts to lock, fetch definition, and start the execution task for a specific job.

        Args:
            job_id: The ID of the job to attempt processing.
            queue_name: The name of the queue the job ID was retrieved from.

        Returns:
            True if the job processing task was successfully started, False otherwise
            (e.g., lock conflict, job definition not found, already removed).
        """
        logger.debug(
            f"Worker {self.worker_id} attempting to process job {job_id} from queue '{queue_name}'"
        )
        job = await self.job_store.get_job_definition(job_id)
        if not job:
            logger.warning(
                f"Worker {self.worker_id} job definition {job_id} not found during _try_process_job from queue {queue_name}."
            )
            return False  # Job vanished between poll and fetch?

        # Determine job-specific timeout and calculate lock timeout
        job_timeout = (
            job.job_timeout_seconds
            if job.job_timeout_seconds is not None
            else self.settings.default_job_timeout_seconds
        )
        lock_timeout_ms = (
            job_timeout + self.settings.default_lock_timeout_extension_seconds
        ) * 1000
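        # Illustration (assumed values, not library defaults): a 30 s job timeout
        # plus a 60 s lock extension yields a 90_000 ms lock TTL.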

        # Attempt to acquire the processing lock
        lock_acquired = await self.job_store.acquire_job_lock(
            job.id, self.worker_id, int(lock_timeout_ms)
        )
        if not lock_acquired:
            logger.debug(
                f"Worker {self.worker_id} failed to acquire lock for job {job.id} (already locked by another worker)."
            )
            return False  # Another worker got there first

        logger.debug(f"Worker {self.worker_id} acquired lock for job {job.id}")

        # Atomically remove the job from the queue (verify it was actually removed)
        # Note: Ideally, lock acquisition and queue removal would be a single atomic operation (e.g., Lua script).
        removed_count = await self.job_store.remove_job_from_queue(queue_name, job.id)
        logger.debug(
            f"Worker {self.worker_id} removed job {job.id} from queue '{queue_name}' (count: {removed_count})."
        )
        if removed_count == 0:
            logger.warning(
                f"Worker {self.worker_id} job {job.id} was already removed from queue '{queue_name}' after lock acquisition. Releasing lock."
            )
            await self.job_store.release_job_lock(job.id)  # Release the acquired lock
            return False  # Job processed by another worker between our poll and lock

        # We have the lock and have removed the job from the queue - proceed to execute
        try:
            await self.job_store.update_job_status(job.id, JobStatus.ACTIVE)
            logger.debug(
                f"Worker {self.worker_id} updated status to ACTIVE for job {job.id}"
            )

            # Create and track the execution task
            task = self._loop.create_task(self._execute_job(job, queue_name))
            self._running_tasks.add(task)
            task.add_done_callback(lambda t: self._task_cleanup(t, self._semaphore))
            logger.info(
                f"Worker {self.worker_id} started job {job.id} ('{job.function_name}') from queue '{queue_name}'"
            )
            return True
        except Exception as e_start:
            # Catch errors during status update or task creation
            logger.error(
                f"Worker {self.worker_id} failed to start task for job {job.id} after lock/removal: {e_start}",
                exc_info=True,
            )
            # Attempt to release the lock since task wasn't started
            await self.job_store.release_job_lock(job.id)
            return False

    async def _execute_job(self, job: Job, queue_name: str) -> None:
        """Executes a single job handler, managing timeouts, errors, retries, and results.

        This method is run within an asyncio Task for each job.
        It ensures the processing lock is released in a finally block.

        Args:
            job: The Job object to execute.
            queue_name: The name of the queue the job was pulled from.
        """
        logger.debug(
            f"Worker {self.worker_id} executing job {job.id} ('{job.function_name}') from queue '{queue_name}'"
        )
        start_time = time.monotonic()
        actual_job_timeout = (
            job.job_timeout_seconds
            if job.job_timeout_seconds is not None
            else self.settings.default_job_timeout_seconds
        )

        try:
            # --- Find Handler ---
            handler = self.job_registry.get_handler(job.function_name)
            if not handler:
                raise ValueError(
                    f"No handler registered for function '{job.function_name}'"
                )

            # --- Prepare Context ---
            context = {
                "job_id": job.id,
                "job_try": job.current_retries + 1,  # Attempt number (1-based)
                "enqueue_time": job.enqueue_time,
                "settings": self.settings,
                "worker_id": self.worker_id,
                "queue_name": queue_name,
                "rrq_client": self.client,
            }
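            # For illustration only, a handler compatible with this call convention
            # might look like the following (hypothetical names, not part of RRQ):
            #     async def send_welcome_email(ctx, user_id, *, resend=False):
            #         ctx["job_id"], ctx["job_try"]  # metadata supplied above
            #         ...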

            # --- Execute Handler ---
            result = None
            exc: Optional[BaseException] = None  # Stores caught exception

            try:  # Inner try for handler execution and its specific exceptions
                logger.debug(f"Calling handler '{job.function_name}' for job {job.id}")
                result = await asyncio.wait_for(
                    handler(context, *job.job_args, **job.job_kwargs),
                    timeout=float(actual_job_timeout),
                )
                logger.debug(f"Handler for job {job.id} returned successfully.")
            except TimeoutError as e_timeout:  # Specifically from wait_for
                exc = e_timeout
                logger.warning(
                    f"Job {job.id} execution timed out after {actual_job_timeout}s."
                )
            except RetryJob as e_retry:  # Handler explicitly requests retry
                exc = e_retry
                logger.info(f"Job {job.id} requested retry: {e_retry}")
            except Exception as e_other:  # Any other exception from the handler itself
                exc = e_other
                logger.error(
                    f"Job {job.id} handler '{job.function_name}' raised unhandled exception:",
                    exc_info=e_other,
                )

            # --- Process Outcome ---
            duration = time.monotonic() - start_time
            if exc is None:  # Success
                await self._handle_job_success(job, result)
                logger.info(f"Job {job.id} completed successfully in {duration:.2f}s.")
            elif isinstance(exc, RetryJob):
                await self._process_retry_job(job, exc, queue_name)
                # Logging done within _process_retry_job
            elif isinstance(exc, asyncio.TimeoutError):
                error_msg = (
                    str(exc)
                    if str(exc)
                    else f"Job timed out after {actual_job_timeout}s."
                )
                await self._handle_job_timeout(job, queue_name, error_msg)
                # Logging done within _handle_job_timeout
            else:  # Other unhandled exception from handler
                await self._process_other_failure(job, exc, queue_name)
                # Logging done within _process_other_failure

        except ValueError as ve:  # Catches "handler not found"
            logger.error(f"Job {job.id} fatal error: {ve}. Moving to DLQ.")
            await self._handle_fatal_job_error(job, queue_name, str(ve))
        except asyncio.CancelledError:
            # Catches cancellation of this _execute_job task (e.g., worker shutdown)
            logger.warning(
                f"Job {job.id} execution was cancelled (likely worker shutdown). Handling cancellation."
            )
            await self._handle_job_cancellation_on_shutdown(job, queue_name)
            # Do not re-raise; cancellation is handled.
        except (
            Exception
        ) as critical_exc:  # Safety net for unexpected errors in this method
            logger.critical(
                f"Job {job.id} encountered an unexpected critical error during execution logic: {critical_exc}",
                exc_info=critical_exc,
            )
            # Fallback: Try to move to DLQ to avoid losing the job entirely
            await self._handle_fatal_job_error(
                job, queue_name, f"Critical worker error: {critical_exc}"
            )
        finally:
            # CRITICAL: Ensure the lock is released regardless of outcome
            await self.job_store.release_job_lock(job.id)
            # Logger call moved inside release_job_lock for context

    async def _handle_job_success(self, job: Job, result: Any) -> None:
        """Handles successful job completion: saves result, sets TTL, updates status, and releases unique lock."""
        try:
            ttl = (
                job.result_ttl_seconds
                if job.result_ttl_seconds is not None
                else self.settings.default_result_ttl_seconds
            )
            await self.job_store.save_job_result(job.id, result, ttl_seconds=int(ttl))
            # Status is set to COMPLETED within save_job_result

            if job.job_unique_key:
                logger.debug(
                    f"Job {job.id} completed successfully, releasing unique key: {job.job_unique_key}"
                )
                await self.job_store.release_unique_job_lock(job.job_unique_key)

        except Exception as e_success:
            logger.error(
                f"Error during post-success handling for job {job.id}: {e_success}",
                exc_info=True,
            )
            # Job finished, but result/unique lock release failed.
            # Lock is released in _execute_job's finally. Unique lock might persist.

    async def _process_retry_job(
        self, job: Job, exc: RetryJob, queue_name: str
    ) -> None:
        """Handles job failures where the handler explicitly raised RetryJob.

        Increments retry count, checks against max_retries, and re-queues with
        appropriate delay (custom or exponential backoff) or moves to DLQ.
        """
        log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
        job_key = f"{JOB_KEY_PREFIX}{job.id}"
        try:
            # Atomically increment retries in the store.
            new_retry_count = await self.job_store.increment_job_retries(job.id)
            max_retries = (
                job.max_retries
            )  # Use max_retries from the job object passed in

            if new_retry_count < max_retries:
                # Update status and error atomically
                await self.job_store.redis.hset(
                    job_key,
                    mapping={
                        "status": JobStatus.RETRYING.value,
                        "last_error": str(exc),
                    },
                )
                logger.debug(f"{log_prefix} status set to RETRYING, error saved.")

                # Determine deferral time
                defer_seconds = exc.defer_seconds
                if defer_seconds is None:
                    # Create a temporary job representation for backoff calculation
                    # using the *new* retry count.
                    temp_job_for_backoff = Job(
                        id=job.id,
                        function_name=job.function_name,
                        current_retries=new_retry_count,  # Use updated count
                        max_retries=max_retries,  # Ensure this is passed
                    )
                    defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
                    defer_seconds = defer_ms / 1000.0
                else:
                    logger.debug(
                        f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
                    )

                retry_at_score = (time.time() + defer_seconds) * 1000
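                # The score is epoch milliseconds, so a 5 s deferral, for example,
                # lands roughly 5000 ms in the future on the queue's sorted set.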
                target_queue = job.queue_name or self.settings.default_queue_name
                await self.job_store.add_job_to_queue(
                    target_queue, job.id, retry_at_score
                )
                logger.info(
                    f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
                    f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
                )
            else:
                # Max retries exceeded even though RetryJob was raised
                logger.warning(
                    f"{log_prefix} max retries ({max_retries}) exceeded "
                    f"despite RetryJob exception. Moving to DLQ."
                )
                # _move_to_dlq handles setting FAILED status etc.
                error_msg = (
                    str(exc) or f"Max retries ({max_retries}) exceeded after RetryJob"
                )
                await self._move_to_dlq(job, queue_name, error_msg)
        except Exception as e_handle:
            logger.exception(
                f"{log_prefix} CRITICAL error during RetryJob processing: {e_handle}"
            )

    async def _process_other_failure(
        self, job: Job, exc: Exception, queue_name: str
    ) -> None:
        """Handles general job failures (any exception other than RetryJob or timeout/cancellation).

        Increments retry count, checks against max_retries, and re-queues with
        exponential backoff or moves to DLQ.
        """
        log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
        logger.debug(f"{log_prefix} processing general failure: {type(exc).__name__}")

        try:
            new_retry_count = await self.job_store.increment_job_retries(job.id)
            # Re-fetching the job after incrementing retries might be safer if other fields changed;
            # for now, assume the job object passed in is accurate except for the retry count.
            # Use max_retries from the job object passed in.
            max_retries = job.max_retries
            last_error_str = str(exc)

            if new_retry_count < max_retries:
                # Re-queue for standard retry with backoff
                defer_ms = self._calculate_backoff_ms(
                    Job(
                        id=job.id,
                        function_name=job.function_name,
                        current_retries=new_retry_count,
                        max_retries=max_retries,
                    )
                )
                retry_at_score = (time.time() * 1000) + defer_ms
                target_queue = job.queue_name or self.settings.default_queue_name

                # Atomically update status/error and re-add to queue (if possible, else separate)
                # For now, separate HSET and ZADD
                await self.job_store.redis.hset(
                    f"{JOB_KEY_PREFIX}{job.id}",
                    mapping={
                        "status": JobStatus.RETRYING.value,
                        "last_error": last_error_str,
                    },
                )
                await self.job_store.add_job_to_queue(
                    target_queue, job.id, retry_at_score
                )
                logger.info(
                    f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
                    f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
                )
            else:  # Max retries reached
                logger.warning(
                    f"{log_prefix} failed after max retries ({max_retries}). Moving to DLQ. Error: {str(exc)[:100]}..."
                )
                # _move_to_dlq handles setting FAILED status, completion time, and last error.
                await self._move_to_dlq(job, queue_name, last_error_str)

        except Exception as e_handle:
            logger.exception(
                f"{log_prefix} CRITICAL error during general failure processing (original exc: {type(exc).__name__}): {e_handle}"
            )

    async def _move_to_dlq(self, job: Job, queue_name: str, error_message: str) -> None:
        """Moves a job to the Dead Letter Queue (DLQ) and releases its unique lock if present."""

        dlq_name = self.settings.default_dlq_name  # Or derive from original queue_name
        completion_time = datetime.now(UTC)
        try:
            await self.job_store.move_job_to_dlq(
                job_id=job.id,
                dlq_name=dlq_name,
                error_message=error_message,
                completion_time=completion_time,
            )
            logger.warning(
                f"Worker {self.worker_id} moved job {job.id} from queue '{queue_name}' to DLQ '{dlq_name}'. Reason: {error_message}"
            )

            if job.job_unique_key:
                logger.debug(
                    f"Job {job.id} moved to DLQ, releasing unique key: {job.job_unique_key}"
                )
                await self.job_store.release_unique_job_lock(job.job_unique_key)

        except Exception as e_dlq:
            logger.error(
                f"Worker {self.worker_id} critical error trying to move job {job.id} to DLQ '{dlq_name}': {e_dlq}",
                exc_info=True,
            )
            # If moving to DLQ fails, the job might be stuck.
            # The processing lock is released in _execute_job's finally. Unique lock might persist.

    def _task_cleanup(self, task: asyncio.Task, semaphore: asyncio.Semaphore) -> None:
        """Callback executed when a job task finishes or is cancelled.

        Removes the task from the running set and releases the concurrency semaphore.
        Also logs any unexpected exceptions raised by the task itself.

        Args:
            task: The completed or cancelled asyncio Task.
            semaphore: The worker's concurrency semaphore.
        """
        task_name = "N/A"
        try:
            if hasattr(task, "get_name"):  # Ensure get_name exists
                task_name = task.get_name()
            elif hasattr(task, "_coro") and hasattr(task._coro, "__name__"):  # Fallback
                task_name = task._coro.__name__
        except Exception:
            pass  # Ignore errors getting name

        logger.debug(
            f"Worker {self.worker_id} cleaning up task '{task_name}'. Releasing semaphore."
        )
        if task in self._running_tasks:
            self._running_tasks.remove(task)
        else:
            logger.warning(
                f"Worker {self.worker_id} task '{task_name}' already removed during cleanup callback? This might indicate an issue."
            )

        semaphore.release()

        try:
            task.result()  # Check for unexpected exceptions from the task future itself
        except asyncio.CancelledError:
            logger.debug(
                f"Task '{task_name}' in worker {self.worker_id} was cancelled."
            )
        except Exception as e:
            logger.error(
                f"Task '{task_name}' in worker {self.worker_id} raised an unhandled exception during cleanup check: {e}",
                exc_info=True,
            )

    def _setup_signal_handlers(self) -> None:
        """Sets up POSIX signal handlers for graceful shutdown."""
        for sig in self.SIGNALS:
            try:
                self._loop.add_signal_handler(sig, self._request_shutdown)
                logger.debug(
                    f"Worker {self.worker_id} registered signal handler for {sig.name}."
                )
            except (NotImplementedError, AttributeError):
                logger.warning(
                    f"Worker {self.worker_id} could not set signal handler for {sig.name} (likely Windows or unsupported environment). Graceful shutdown via signals may not work."
                )

    def _request_shutdown(self) -> None:
        """Callback triggered by a signal to initiate graceful shutdown."""
        if not self._shutdown_event.is_set():
            logger.info(
                f"Worker {self.worker_id} received shutdown signal. Initiating graceful shutdown..."
            )
            self._shutdown_event.set()
        else:
            logger.info(
                f"Worker {self.worker_id} received another shutdown signal, already shutting down."
            )

    async def _drain_tasks(self) -> None:
        """Waits for currently running job tasks to complete, up to a grace period.

        Tasks that do not complete within the grace period are cancelled.
        """
        if not self._running_tasks:
            logger.debug(f"Worker {self.worker_id}: No active tasks to drain.")
            return

        logger.info(
            f"Worker {self.worker_id}: Waiting for {len(self._running_tasks)} active tasks to complete (grace period: {self.settings.worker_shutdown_grace_period_seconds}s)..."
        )
        grace_period = self.settings.worker_shutdown_grace_period_seconds

        # Use asyncio.shield if we want to prevent cancellation of _drain_tasks itself?
        # For now, assume it runs to completion or the main loop handles its cancellation.
        tasks_to_wait_on = list(self._running_tasks)

        # Wait for tasks with timeout
        done, pending = await asyncio.wait(tasks_to_wait_on, timeout=grace_period)

        if done:
            logger.info(
                f"Worker {self.worker_id}: {len(done)} tasks completed within grace period."
            )
        if pending:
            logger.warning(
                f"Worker {self.worker_id}: {len(pending)} tasks did not complete within grace period. Cancelling remaining tasks..."
            )
            for task in pending:
                task_name = "N/A"
                try:
                    if hasattr(task, "get_name"):
                        task_name = task.get_name()
                except Exception:
                    pass
                logger.warning(
                    f"Worker {self.worker_id}: Cancelling task '{task_name}'."
                )
                task.cancel()

            # Wait for the cancelled tasks to finish propagating the cancellation
            await asyncio.gather(*pending, return_exceptions=True)
            logger.info(
                f"Worker {self.worker_id}: Finished waiting for cancelled tasks."
            )

    async def _heartbeat_loop(self) -> None:
        """Periodically updates the worker's health status key in Redis with a TTL."""
        logger.debug(f"Worker {self.worker_id} starting heartbeat loop.")
        while not self._shutdown_event.is_set():
            try:
                health_data = {
                    "worker_id": self.worker_id,
                    "timestamp": datetime.now(UTC).isoformat(),
                    "status": self.status,
                    "active_jobs": len(self._running_tasks),
                    "concurrency_limit": self.settings.worker_concurrency,
                    "queues": self.queues,
                }
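                # Shape of the stored record (values here are illustrative only):
                #   {"worker_id": "<id>", "timestamp": "...", "status": "polling",
                #    "active_jobs": 2, "concurrency_limit": 4, "queues": ["default"]}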
                ttl = (
                    self.settings.worker_health_check_interval_seconds + 10
                )  # Add buffer
                await self.job_store.set_worker_health(
                    self.worker_id, health_data, int(ttl)
                )
                # Logger call moved into set_worker_health
            except Exception as e:
                # Log error but continue the loop
                logger.error(
                    f"Error updating health check for worker {self.worker_id}: {e}",
                    exc_info=True,
                )

            try:
                # Sleep until the next interval, but wake up if shutdown is requested
                await asyncio.wait_for(
                    self._shutdown_event.wait(),
                    timeout=min(60, self.settings.worker_health_check_interval_seconds),
                )
                # If wait_for doesn't time out, shutdown was requested
                logger.debug(
                    f"Worker {self.worker_id} heartbeat loop exiting due to shutdown event."
                )
                break  # Exit loop if shutdown event is set
            except TimeoutError:
                # This is the normal case, continue loop
                pass
            except Exception as sleep_err:
                # Handle potential errors from wait_for itself
                logger.error(
                    f"Worker {self.worker_id} error during heartbeat sleep: {sleep_err}",
                    exc_info=True,
                )
                await asyncio.sleep(1)  # Avoid tight loop

        logger.debug(f"Worker {self.worker_id} heartbeat loop finished.")

    async def _close_resources(self) -> None:
        """Closes the worker's resources, primarily the JobStore connection."""
        logger.info(f"Worker {self.worker_id} closing resources...")
        try:
            await self.job_store.aclose()
            logger.info(f"Worker {self.worker_id} JobStore Redis connection closed.")
        except Exception as e_close:
            logger.error(
                f"Worker {self.worker_id} error closing JobStore: {e_close}",
                exc_info=True,
            )

    def _calculate_backoff_ms(self, job: Job) -> int:
        """Calculates exponential backoff delay in milliseconds based on retry count.

        Uses `base_retry_delay_seconds` and `max_retry_delay_seconds` from settings.

        Args:
            job: The Job object (specifically needs `current_retries`).

        Returns:
            The calculated delay in milliseconds.
        """
        # Simple exponential backoff: base * (2^(retries-1))
        # current_retries is 1-based for calculation after increment.
        retry_attempt = job.current_retries
        if retry_attempt <= 0:
            # Should not happen if called after increment, but safeguard
            retry_attempt = 1

        base_delay = self.settings.base_retry_delay_seconds
        max_delay = self.settings.max_retry_delay_seconds

        delay_seconds = min(max_delay, base_delay * (2 ** (retry_attempt - 1)))
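        # Worked example (assuming base_retry_delay_seconds=2 and max_retry_delay_seconds=60):
        # attempts 1..6 yield 2s, 4s, 8s, 16s, 32s, then 60s (capped).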
        delay_ms = int(delay_seconds * 1000)
        logger.debug(
            f"Calculated backoff for job {job.id} (attempt {retry_attempt}): {delay_ms}ms"
        )
        return delay_ms

    async def _handle_job_timeout(
        self, job: Job, queue_name: str, error_message: str
    ) -> None:
        """Handles job timeouts by moving them directly to the DLQ."""
        log_message_prefix = f"Worker {self.worker_id} job {job.id} {queue_name}"
        logger.warning(f"{log_message_prefix} processing timeout: {error_message}")

        try:
            # Increment retries as an attempt was made.
            # Even though it's a timeout, it did consume a slot and attempt execution.
            # This also ensures that if _move_to_dlq relies on current_retries for anything, it's accurate.
            await self.job_store.increment_job_retries(job.id)

            # Update the job object with the error message before moving to DLQ
            # _move_to_dlq will set FAILED status and completion_time
            await self._move_to_dlq(job, queue_name, error_message)
            logger.info(f"{log_message_prefix} moved to DLQ due to timeout.")
        except Exception as e_timeout_handle:
            logger.exception(
                f"{log_message_prefix} CRITICAL error in _handle_job_timeout: {e_timeout_handle}"
            )

    async def _handle_fatal_job_error(
        self, job: Job, queue_name: str, error_message: str
    ) -> None:
        """Handles fatal job errors (e.g., handler not found) by moving to DLQ without retries."""
        log_message_prefix = f"Worker {self.worker_id} job {job.id} {queue_name}"
        logger.error(
            f"{log_message_prefix} fatal error: {error_message}. Moving to DLQ."
        )
        try:
            # Increment retries as an attempt was made to process/find handler.
            await self.job_store.increment_job_retries(job.id)
            # Note: _move_to_dlq handles setting FAILED status, completion_time, and last_error.
            await self._move_to_dlq(job, queue_name, error_message)
            logger.info(f"{log_message_prefix} moved to DLQ due to fatal error.")
        except Exception as e_fatal_handle:
            logger.exception(
                f"{log_message_prefix} CRITICAL error in _handle_fatal_job_error: {e_fatal_handle}"
            )

    async def _handle_job_cancellation_on_shutdown(self, job: Job, queue_name: str) -> None:
        logger.warning(
            f"Job {job.id} ({job.function_name}) was cancelled. Assuming worker shutdown. Re-queueing."
        )
        try:
            job.status = JobStatus.PENDING
            job.next_scheduled_run_time = datetime.now(UTC)  # Re-queue immediately
            job.last_error = "Job execution interrupted by worker shutdown. Re-queued."
            # Do not increment retries for shutdown interruption

            await self.job_store.save_job_definition(job)
            await self.job_store.add_job_to_queue(
                queue_name, job.id, job.next_scheduled_run_time.timestamp() * 1000
            )
            await self.job_store.release_job_lock(job.id)  # Ensure lock is released

            logger.info(f"Successfully re-queued job {job.id} to {queue_name}.")
        except Exception as e_requeue:
            logger.exception(
                f"Failed to re-queue job {job.id} on cancellation/shutdown: {e_requeue}"
            )
            # Fallback: try to move to DLQ if re-queueing fails catastrophically
            try:
                await self.job_store.move_job_to_dlq(
                    job.id,
                    self.settings.default_dlq_name,
                    f"Failed to re-queue during cancellation: {e_requeue}",
                    datetime.now(UTC),
                )
                logger.info(
                    f"Successfully moved job {job.id} to DLQ due to re-queueing failure."
                )
            except Exception as e_move_to_dlq:
                logger.exception(
                    f"Failed to move job {job.id} to DLQ after re-queueing failure: {e_move_to_dlq}"
                )

    async def close(self) -> None:
        """Gracefully close worker resources."""
        logger.info(f"[{self.worker_id}] Closing RRQ worker...")
        if self.client:  # Check if client exists before closing
            await self.client.close()
        if self.job_store:
            await self.job_store.close()
        logger.info(f"[{self.worker_id}] RRQ worker closed.")