rrq-0.2.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rrq/__init__.py +0 -0
- rrq/client.py +159 -0
- rrq/constants.py +42 -0
- rrq/exc.py +46 -0
- rrq/job.py +133 -0
- rrq/registry.py +77 -0
- rrq/rrq.py +328 -0
- rrq/settings.py +107 -0
- rrq/store.py +568 -0
- rrq/worker.py +897 -0
- rrq-0.2.5.dist-info/METADATA +201 -0
- rrq-0.2.5.dist-info/RECORD +15 -0
- rrq-0.2.5.dist-info/WHEEL +4 -0
- rrq-0.2.5.dist-info/entry_points.txt +2 -0
- rrq-0.2.5.dist-info/licenses/LICENSE +13 -0
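The module list above suggests the intended split: `client.py` enqueues jobs, `registry.py` maps function names to handlers, `settings.py` carries configuration, `store.py` wraps the Redis-backed job store, and `worker.py` (shown below) executes jobs. As a rough, non-authoritative sketch of how these pieces might be wired together: the handler body, the `JobRegistry.register` method name, and default construction of `RRQSettings` are assumptions not confirmed by this diff; only the `RRQWorker` constructor, `run()`, and the `handler(context, *args, **kwargs)` calling convention appear in `worker.py` itself.

```python
# Hypothetical wiring sketch; names marked as assumptions below are not confirmed by this diff.
import asyncio

from rrq.registry import JobRegistry
from rrq.settings import RRQSettings
from rrq.worker import RRQWorker


async def send_email(context: dict, recipient: str) -> str:
    # The worker awaits handler(context, *job.job_args, **job.job_kwargs),
    # so the first positional parameter is the worker-supplied context dict.
    print(f"job {context['job_id']} (attempt {context['job_try']}) -> {recipient}")
    return "sent"


settings = RRQSettings()  # assumption: default construction is valid
registry = JobRegistry()
registry.register("send_email", send_email)  # assumption: registration method name

# queues defaults to settings.default_queue_name when omitted
worker = RRQWorker(settings=settings, job_registry=registry)

if __name__ == "__main__":
    asyncio.run(worker.run())  # run() polls queues until SIGINT/SIGTERM
```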
rrq/worker.py
ADDED
@@ -0,0 +1,897 @@
"""This module defines the RRQWorker class, the core component responsible for
processing jobs from the Reliable Redis Queue (RRQ) system.
"""

import asyncio

# Use standard logging instead of custom one if appropriate
import logging
import os
import signal
import time
import uuid
from contextlib import suppress
from datetime import UTC, datetime
from typing import (
    Any,
    Optional,
)

from rrq.client import RRQClient

from .constants import (
    DEFAULT_WORKER_ID_PREFIX,
    JOB_KEY_PREFIX,
)
from .exc import RetryJob
from .job import Job, JobStatus
from .registry import JobRegistry
from .settings import RRQSettings
from .store import JobStore

logger = logging.getLogger(__name__)


class RRQWorker:
    """An asynchronous worker process for the RRQ system.

    Polls specified queues for ready jobs, acquires locks, executes job handlers,
    manages job lifecycle states (success, failure, retry, timeout, DLQ),
    handles graceful shutdown, and reports health status.
    """

    SIGNALS = (signal.SIGINT, signal.SIGTERM)

    def __init__(
        self,
        settings: RRQSettings,
        job_registry: JobRegistry,
        queues: Optional[list[str]] = None,
        worker_id: Optional[str] = None,
    ):
        """Initializes the RRQWorker.

        Args:
            settings: The RRQSettings instance for configuration.
            job_registry: The JobRegistry containing the handler functions.
            queues: A list of queue names (without prefix) to poll.
                If None, defaults to `settings.default_queue_name`.
            worker_id: A unique identifier for this worker instance.
                If None, one is generated automatically.
        """
        self.settings = settings
        self.job_registry = job_registry
        self.queues = (
            queues if queues is not None else [self.settings.default_queue_name]
        )
        if not self.queues:
            raise ValueError("Worker must be configured with at least one queue.")

        self.job_store = JobStore(settings=settings)
        self.client = RRQClient(settings=settings, job_store=self.job_store)
        self.worker_id = (
            worker_id
            or f"{DEFAULT_WORKER_ID_PREFIX}{os.getpid()}_{uuid.uuid4().hex[:6]}"
        )

        self._semaphore = asyncio.Semaphore(self.settings.worker_concurrency)
        self._running_tasks: set[asyncio.Task] = set()
        self._shutdown_event = asyncio.Event()
        self._loop = None  # Will be set in run()
        self._health_check_task: Optional[asyncio.Task] = None
        self.status: str = "initializing"  # Worker status (e.g., initializing, running, polling, idle, stopped)
        logger.info(
            f"Initializing RRQWorker {self.worker_id} for queues: {self.queues}"
        )

    async def _call_startup_hook(self) -> None:
        if self.settings.on_startup:
            logger.info(f"Worker {self.worker_id} calling on_startup hook...")
            try:
                await self.settings.on_startup()
                logger.info(f"Worker {self.worker_id} on_startup hook completed.")
            except Exception as e:
                logger.error(
                    f"Worker {self.worker_id} error during on_startup hook: {e}",
                    exc_info=True,
                )
                raise

    async def _call_shutdown_hook(self) -> None:
        if self.settings.on_shutdown:
            logger.info(f"Worker {self.worker_id} calling on_shutdown hook...")
            try:
                await self.settings.on_shutdown()
                logger.info(f"Worker {self.worker_id} on_shutdown hook completed.")
            except Exception as e:
                logger.error(
                    f"Worker {self.worker_id} error during on_shutdown hook: {e}",
                    exc_info=True,
                )

    async def run(self) -> None:
        logger.info(f"RRQWorker {self.worker_id} starting.")
        self.status = "running"
        self._loop = asyncio.get_running_loop()
        self._setup_signal_handlers()
        try:
            await self._call_startup_hook()
            await self._run_loop()
        except asyncio.CancelledError:
            logger.info(f"Worker {self.worker_id} run cancelled.")
        finally:
            logger.info(f"Worker {self.worker_id} shutting down cleanly.")
            await self._call_shutdown_hook()
            self.status = "stopped"
            logger.info(f"Worker {self.worker_id} stopped.")

    async def _run_loop(self) -> None:
        """The main asynchronous execution loop for the worker.

        Continuously polls queues for jobs, manages concurrency, and handles shutdown.
        """
        logger.info(f"Worker {self.worker_id} starting run loop.")
        self._health_check_task = self._loop.create_task(self._heartbeat_loop())

        while not self._shutdown_event.is_set():
            try:
                jobs_to_fetch = self.settings.worker_concurrency - len(
                    self._running_tasks
                )
                if jobs_to_fetch > 0:
                    if self.status != "polling":
                        logger.debug(
                            f"Worker {self.worker_id} polling for up to {jobs_to_fetch} jobs..."
                        )
                    self.status = "polling"
                    await self._poll_for_jobs(jobs_to_fetch)
                else:
                    if self.status != "idle (concurrency limit)":
                        logger.debug(
                            f"Worker {self.worker_id} at concurrency limit ({self.settings.worker_concurrency}). Waiting..."
                        )
                    self.status = "idle (concurrency limit)"
                    # At concurrency limit, wait for tasks to finish or poll delay

                await asyncio.sleep(self.settings.default_poll_delay_seconds)
            except Exception as e:
                logger.error(
                    f"Worker {self.worker_id} encountered error in main run loop: {e}",
                    exc_info=True,
                )
                # Avoid tight loop on persistent errors
                await asyncio.sleep(1)

        logger.info(
            f"Worker {self.worker_id} shutdown signal received. Draining tasks..."
        )
        await self._drain_tasks()
        logger.info(f"Worker {self.worker_id} task drain complete.")
        if self._health_check_task:
            self._health_check_task.cancel()
            with suppress(asyncio.CancelledError):
                await self._health_check_task

    async def _poll_for_jobs(self, count: int) -> None:
        """Polls configured queues round-robin and attempts to start processing jobs.

        Args:
            count: The maximum number of jobs to attempt to start in this poll cycle.
        """
        fetched_count = 0
        # Simple round-robin polling for now
        # TODO: Add queue prioritization logic if needed.
        for queue_name in self.queues:
            if fetched_count >= count or self._shutdown_event.is_set():
                break

            try:
                ready_job_ids = await self.job_store.get_ready_job_ids(
                    queue_name, count - fetched_count
                )
                if not ready_job_ids:
                    continue  # No jobs ready in this queue

                logger.debug(
                    f"Worker {self.worker_id} found {len(ready_job_ids)} ready jobs in queue '{queue_name}'."
                )

                for job_id in ready_job_ids:
                    if fetched_count >= count or self._shutdown_event.is_set():
                        break

                    # Attempt to acquire semaphore *before* trying to process
                    await self._semaphore.acquire()
                    try:
                        # _try_process_job handles lock acquisition, fetching, task creation
                        job_started = await self._try_process_job(job_id, queue_name)
                        if job_started:
                            fetched_count += 1
                        else:
                            # If job wasn't started (e.g., lock conflict), release semaphore immediately
                            self._semaphore.release()
                    except Exception as e_try:
                        # Catch errors during the _try_process_job itself
                        logger.error(
                            f"Worker {self.worker_id} exception trying to process job {job_id}: {e_try}",
                            exc_info=True,
                        )
                        self._semaphore.release()  # Ensure semaphore is released on error

            except Exception as e_poll:
                logger.error(
                    f"Worker {self.worker_id} error polling queue '{queue_name}': {e_poll}",
                    exc_info=True,
                )
                await asyncio.sleep(1)  # Avoid tight loop on polling error

    async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
        """Attempts to lock, fetch definition, and start the execution task for a specific job.

        Args:
            job_id: The ID of the job to attempt processing.
            queue_name: The name of the queue the job ID was retrieved from.

        Returns:
            True if the job processing task was successfully started, False otherwise
            (e.g., lock conflict, job definition not found, already removed).
        """
        logger.debug(
            f"Worker {self.worker_id} attempting to process job {job_id} from queue '{queue_name}'"
        )
        job = await self.job_store.get_job_definition(job_id)
        if not job:
            logger.warning(
                f"Worker {self.worker_id} job definition {job_id} not found during _try_process_job from queue {queue_name}."
            )
            return False  # Job vanished between poll and fetch?

        # Determine job-specific timeout and calculate lock timeout
        job_timeout = (
            job.job_timeout_seconds
            if job.job_timeout_seconds is not None
            else self.settings.default_job_timeout_seconds
        )
        lock_timeout_ms = (
            job_timeout + self.settings.default_lock_timeout_extension_seconds
        ) * 1000

        # Attempt to acquire the processing lock
        lock_acquired = await self.job_store.acquire_job_lock(
            job.id, self.worker_id, int(lock_timeout_ms)
        )
        if not lock_acquired:
            logger.debug(
                f"Worker {self.worker_id} failed to acquire lock for job {job.id} (already locked by another worker)."
            )
            return False  # Another worker got there first

        logger.debug(f"Worker {self.worker_id} acquired lock for job {job.id}")

        # Atomically remove the job from the queue (verify it was actually removed)
        # Note: Ideally, lock acquisition and queue removal would be a single atomic operation (e.g., Lua script).
        removed_count = await self.job_store.remove_job_from_queue(queue_name, job.id)
        logger.debug(
            f"Worker {self.worker_id} removed job {job.id} from queue '{queue_name}' (count: {removed_count})."
        )
        if removed_count == 0:
            logger.warning(
                f"Worker {self.worker_id} job {job.id} was already removed from queue '{queue_name}' after lock acquisition. Releasing lock."
            )
            await self.job_store.release_job_lock(job.id)  # Release the acquired lock
            return False  # Job processed by another worker between our poll and lock

        # We have the lock and have removed the job from the queue - proceed to execute
        try:
            await self.job_store.update_job_status(job.id, JobStatus.ACTIVE)
            logger.debug(
                f"Worker {self.worker_id} updated status to ACTIVE for job {job.id}"
            )

            # Create and track the execution task
            task = self._loop.create_task(self._execute_job(job, queue_name))
            self._running_tasks.add(task)
            task.add_done_callback(lambda t: self._task_cleanup(t, self._semaphore))
            logger.info(
                f"Worker {self.worker_id} started job {job.id} ('{job.function_name}') from queue '{queue_name}'"
            )
            return True
        except Exception as e_start:
            # Catch errors during status update or task creation
            logger.error(
                f"Worker {self.worker_id} failed to start task for job {job.id} after lock/removal: {e_start}",
                exc_info=True,
            )
            # Attempt to release the lock since task wasn't started
            await self.job_store.release_job_lock(job.id)
            return False

    async def _execute_job(self, job: Job, queue_name: str) -> None:
        """Executes a single job handler, managing timeouts, errors, retries, and results.

        This method is run within an asyncio Task for each job.
        It ensures the processing lock is released in a finally block.

        Args:
            job: The Job object to execute.
            queue_name: The name of the queue the job was pulled from.
        """
        logger.debug(
            f"Worker {self.worker_id} executing job {job.id} ('{job.function_name}') from queue '{queue_name}'"
        )
        start_time = time.monotonic()
        actual_job_timeout = (
            job.job_timeout_seconds
            if job.job_timeout_seconds is not None
            else self.settings.default_job_timeout_seconds
        )

        try:
            # --- Find Handler ---
            handler = self.job_registry.get_handler(job.function_name)
            if not handler:
                raise ValueError(
                    f"No handler registered for function '{job.function_name}'"
                )

            # --- Prepare Context ---
            context = {
                "job_id": job.id,
                "job_try": job.current_retries + 1,  # Attempt number (1-based)
                "enqueue_time": job.enqueue_time,
                "settings": self.settings,
                "worker_id": self.worker_id,
                "queue_name": queue_name,
                "rrq_client": self.client,
            }

            # --- Execute Handler ---
            result = None
            exc: Optional[BaseException] = None  # Stores caught exception

            try:  # Inner try for handler execution and its specific exceptions
                logger.debug(f"Calling handler '{job.function_name}' for job {job.id}")
                result = await asyncio.wait_for(
                    handler(context, *job.job_args, **job.job_kwargs),
                    timeout=float(actual_job_timeout),
                )
                logger.debug(f"Handler for job {job.id} returned successfully.")
            except TimeoutError as e_timeout:  # Specifically from wait_for
                exc = e_timeout
                logger.warning(
                    f"Job {job.id} execution timed out after {actual_job_timeout}s."
                )
            except RetryJob as e_retry:  # Handler explicitly requests retry
                exc = e_retry
                logger.info(f"Job {job.id} requested retry: {e_retry}")
            except Exception as e_other:  # Any other exception from the handler itself
                exc = e_other
                logger.error(
                    f"Job {job.id} handler '{job.function_name}' raised unhandled exception:",
                    exc_info=e_other,
                )

            # --- Process Outcome ---
            duration = time.monotonic() - start_time
            if exc is None:  # Success
                await self._handle_job_success(job, result)
                logger.info(f"Job {job.id} completed successfully in {duration:.2f}s.")
            elif isinstance(exc, RetryJob):
                await self._process_retry_job(job, exc, queue_name)
                # Logging done within _process_retry_job
            elif isinstance(exc, asyncio.TimeoutError):
                error_msg = (
                    str(exc)
                    if str(exc)
                    else f"Job timed out after {actual_job_timeout}s."
                )
                await self._handle_job_timeout(job, queue_name, error_msg)
                # Logging done within _handle_job_timeout
            else:  # Other unhandled exception from handler
                await self._process_other_failure(job, exc, queue_name)
                # Logging done within _process_other_failure

        except ValueError as ve:  # Catches "handler not found"
            logger.error(f"Job {job.id} fatal error: {ve}. Moving to DLQ.")
            await self._handle_fatal_job_error(job, queue_name, str(ve))
        except asyncio.CancelledError:
            # Catches cancellation of this _execute_job task (e.g., worker shutdown)
            logger.warning(
                f"Job {job.id} execution was cancelled (likely worker shutdown). Handling cancellation."
            )
            await self._handle_job_cancellation_on_shutdown(job, queue_name)
            # Do not re-raise; cancellation is handled.
        except (
            Exception
        ) as critical_exc:  # Safety net for unexpected errors in this method
            logger.critical(
                f"Job {job.id} encountered an unexpected critical error during execution logic: {critical_exc}",
                exc_info=critical_exc,
            )
            # Fallback: Try to move to DLQ to avoid losing the job entirely
            await self._handle_fatal_job_error(
                job, queue_name, f"Critical worker error: {critical_exc}"
            )
        finally:
            # CRITICAL: Ensure the lock is released regardless of outcome
            await self.job_store.release_job_lock(job.id)
            # Logger call moved inside release_job_lock for context

    async def _handle_job_success(self, job: Job, result: Any) -> None:
        """Handles successful job completion: saves result, sets TTL, updates status, and releases unique lock."""
        try:
            ttl = (
                job.result_ttl_seconds
                if job.result_ttl_seconds is not None
                else self.settings.default_result_ttl_seconds
            )
            await self.job_store.save_job_result(job.id, result, ttl_seconds=int(ttl))
            # Status is set to COMPLETED within save_job_result

            if job.job_unique_key:
                logger.debug(
                    f"Job {job.id} completed successfully, releasing unique key: {job.job_unique_key}"
                )
                await self.job_store.release_unique_job_lock(job.job_unique_key)

        except Exception as e_success:
            logger.error(
                f"Error during post-success handling for job {job.id}: {e_success}",
                exc_info=True,
            )
            # Job finished, but result/unique lock release failed.
            # Lock is released in _execute_job's finally. Unique lock might persist.

    async def _process_retry_job(
        self, job: Job, exc: RetryJob, queue_name: str
    ) -> None:
        """Handles job failures where the handler explicitly raised RetryJob.

        Increments retry count, checks against max_retries, and re-queues with
        appropriate delay (custom or exponential backoff) or moves to DLQ.
        """
        log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
        job_key = f"{JOB_KEY_PREFIX}{job.id}"
        try:
            # Atomically increment retries in the store.
            new_retry_count = await self.job_store.increment_job_retries(job.id)
            max_retries = (
                job.max_retries
            )  # Use max_retries from the job object passed in

            if new_retry_count < max_retries:
                # Update status and error atomically
                await self.job_store.redis.hset(
                    job_key,
                    mapping={
                        "status": JobStatus.RETRYING.value,
                        "last_error": str(exc),
                    },
                )
                logger.debug(f"{log_prefix} status set to RETRYING, error saved.")

                # Determine deferral time
                defer_seconds = exc.defer_seconds
                if defer_seconds is None:
                    # Create a temporary job representation for backoff calculation
                    # using the *new* retry count.
                    temp_job_for_backoff = Job(
                        id=job.id,
                        function_name=job.function_name,
                        current_retries=new_retry_count,  # Use updated count
                        max_retries=max_retries,  # Ensure this is passed
                    )
                    defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
                    defer_seconds = defer_ms / 1000.0
                else:
                    logger.debug(
                        f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
                    )

                retry_at_score = (time.time() + defer_seconds) * 1000
                target_queue = job.queue_name or self.settings.default_queue_name
                await self.job_store.add_job_to_queue(
                    target_queue, job.id, retry_at_score
                )
                logger.info(
                    f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
                    f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
                )
            else:
                # Max retries exceeded even though RetryJob was raised
                logger.warning(
                    f"{log_prefix} max retries ({max_retries}) exceeded "
                    f"despite RetryJob exception. Moving to DLQ."
                )
                # _move_to_dlq handles setting FAILED status etc.
                error_msg = (
                    str(exc) or f"Max retries ({max_retries}) exceeded after RetryJob"
                )
                await self._move_to_dlq(job, queue_name, error_msg)
        except Exception as e_handle:
            logger.exception(
                f"{log_prefix} CRITICAL error during RetryJob processing: {e_handle}"
            )

    async def _process_other_failure(
        self, job: Job, exc: Exception, queue_name: str
    ) -> None:
        """Handles general job failures (any exception other than RetryJob or timeout/cancellation).

        Increments retry count, checks against max_retries, and re-queues with
        exponential backoff or moves to DLQ.
        """
        log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
        logger.debug(f"{log_prefix} processing general failure: {type(exc).__name__}")

        try:
            new_retry_count = await self.job_store.increment_job_retries(job.id)
            # Re-fetch job state after incrementing retries might be safer if fields changed?
            # For now, assume the job object passed in is mostly accurate except for retry count.
            # Use max_retries from the job object passed in.
            max_retries = job.max_retries
            last_error_str = str(exc)

            if new_retry_count < max_retries:
                # Re-queue for standard retry with backoff
                defer_ms = self._calculate_backoff_ms(
                    Job(
                        id=job.id,
                        function_name=job.function_name,
                        current_retries=new_retry_count,
                        max_retries=max_retries,
                    )
                )
                retry_at_score = (time.time() * 1000) + defer_ms
                target_queue = job.queue_name or self.settings.default_queue_name

                # Atomically update status/error and re-add to queue (if possible, else separate)
                # For now, separate HSET and ZADD
                await self.job_store.redis.hset(
                    f"{JOB_KEY_PREFIX}{job.id}",
                    mapping={
                        "status": JobStatus.RETRYING.value,
                        "last_error": last_error_str,
                    },
                )
                await self.job_store.add_job_to_queue(
                    target_queue, job.id, retry_at_score
                )
                logger.info(
                    f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
                    f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
                )
            else:  # Max retries reached
                logger.warning(
                    f"{log_prefix} failed after max retries ({max_retries}). Moving to DLQ. Error: {str(exc)[:100]}..."
                )
                # _move_to_dlq handles setting FAILED status, completion time, and last error.
                await self._move_to_dlq(job, queue_name, last_error_str)

        except Exception as e_handle:
            logger.exception(
                f"{log_prefix} CRITICAL error during general failure processing (original exc: {type(exc).__name__}): {e_handle}"
            )

    async def _move_to_dlq(self, job: Job, queue_name: str, error_message: str) -> None:
        """Moves a job to the Dead Letter Queue (DLQ) and releases its unique lock if present."""

        dlq_name = self.settings.default_dlq_name  # Or derive from original queue_name
        completion_time = datetime.now(UTC)
        try:
            await self.job_store.move_job_to_dlq(
                job_id=job.id,
                dlq_name=dlq_name,
                error_message=error_message,
                completion_time=completion_time,
            )
            logger.warning(
                f"Worker {self.worker_id} moved job {job.id} from queue '{queue_name}' to DLQ '{dlq_name}'. Reason: {error_message}"
            )

            if job.job_unique_key:
                logger.debug(
                    f"Job {job.id} moved to DLQ, releasing unique key: {job.job_unique_key}"
                )
                await self.job_store.release_unique_job_lock(job.job_unique_key)

        except Exception as e_dlq:
            logger.error(
                f"Worker {self.worker_id} critical error trying to move job {job.id} to DLQ '{dlq_name}': {e_dlq}",
                exc_info=True,
            )
            # If moving to DLQ fails, the job might be stuck.
            # The processing lock is released in _execute_job's finally. Unique lock might persist.

    def _task_cleanup(self, task: asyncio.Task, semaphore: asyncio.Semaphore) -> None:
        """Callback executed when a job task finishes or is cancelled.

        Removes the task from the running set and releases the concurrency semaphore.
        Also logs any unexpected exceptions raised by the task itself.

        Args:
            task: The completed or cancelled asyncio Task.
            semaphore: The worker's concurrency semaphore.
        """
        task_name = "N/A"
        try:
            if hasattr(task, "get_name"):  # Ensure get_name exists
                task_name = task.get_name()
            elif hasattr(task, "_coro") and hasattr(task._coro, "__name__"):  # Fallback
                task_name = task._coro.__name__
        except Exception:
            pass  # Ignore errors getting name

        logger.debug(
            f"Worker {self.worker_id} cleaning up task '{task_name}'. Releasing semaphore."
        )
        if task in self._running_tasks:
            self._running_tasks.remove(task)
        else:
            logger.warning(
                f"Worker {self.worker_id} task '{task_name}' already removed during cleanup callback? This might indicate an issue."
            )

        semaphore.release()

        try:
            task.result()  # Check for unexpected exceptions from the task future itself
        except asyncio.CancelledError:
            logger.debug(
                f"Task '{task_name}' in worker {self.worker_id} was cancelled."
            )
        except Exception as e:
            logger.error(
                f"Task '{task_name}' in worker {self.worker_id} raised an unhandled exception during cleanup check: {e}",
                exc_info=True,
            )

    def _setup_signal_handlers(self) -> None:
        """Sets up POSIX signal handlers for graceful shutdown."""
        for sig in self.SIGNALS:
            try:
                self._loop.add_signal_handler(sig, self._request_shutdown)
                logger.debug(
                    f"Worker {self.worker_id} registered signal handler for {sig.name}."
                )
            except (NotImplementedError, AttributeError):
                logger.warning(
                    f"Worker {self.worker_id} could not set signal handler for {sig.name} (likely Windows or unsupported environment). Graceful shutdown via signals may not work."
                )

    def _request_shutdown(self) -> None:
        """Callback triggered by a signal to initiate graceful shutdown."""
        if not self._shutdown_event.is_set():
            logger.info(
                f"Worker {self.worker_id} received shutdown signal. Initiating graceful shutdown..."
            )
            self._shutdown_event.set()
        else:
            logger.info(
                f"Worker {self.worker_id} received another shutdown signal, already shutting down."
            )

    async def _drain_tasks(self) -> None:
        """Waits for currently running job tasks to complete, up to a grace period.

        Tasks that do not complete within the grace period are cancelled.
        """
        if not self._running_tasks:
            logger.debug(f"Worker {self.worker_id}: No active tasks to drain.")
            return

        logger.info(
            f"Worker {self.worker_id}: Waiting for {len(self._running_tasks)} active tasks to complete (grace period: {self.settings.worker_shutdown_grace_period_seconds}s)..."
        )
        grace_period = self.settings.worker_shutdown_grace_period_seconds

        # Use asyncio.shield if we want to prevent cancellation of _drain_tasks itself?
        # For now, assume it runs to completion or the main loop handles its cancellation.
        tasks_to_wait_on = list(self._running_tasks)

        # Wait for tasks with timeout
        done, pending = await asyncio.wait(tasks_to_wait_on, timeout=grace_period)

        if done:
            logger.info(
                f"Worker {self.worker_id}: {len(done)} tasks completed within grace period."
            )
        if pending:
            logger.warning(
                f"Worker {self.worker_id}: {len(pending)} tasks did not complete within grace period. Cancelling remaining tasks..."
            )
            for task in pending:
                task_name = "N/A"
                try:
                    if hasattr(task, "get_name"):
                        task_name = task.get_name()
                except Exception:
                    pass
                logger.warning(
                    f"Worker {self.worker_id}: Cancelling task '{task_name}'."
                )
                task.cancel()

            # Wait for the cancelled tasks to finish propagating the cancellation
            await asyncio.gather(*pending, return_exceptions=True)
            logger.info(
                f"Worker {self.worker_id}: Finished waiting for cancelled tasks."
            )

    async def _heartbeat_loop(self) -> None:
        """Periodically updates the worker's health status key in Redis with a TTL."""
        logger.debug(f"Worker {self.worker_id} starting heartbeat loop.")
        while not self._shutdown_event.is_set():
            try:
                health_data = {
                    "worker_id": self.worker_id,
                    "timestamp": datetime.now(UTC).isoformat(),
                    "status": self.status,
                    "active_jobs": len(self._running_tasks),
                    "concurrency_limit": self.settings.worker_concurrency,
                    "queues": self.queues,
                }
                ttl = (
                    self.settings.worker_health_check_interval_seconds + 10
                )  # Add buffer
                await self.job_store.set_worker_health(
                    self.worker_id, health_data, int(ttl)
                )
                # Logger call moved into set_worker_health
            except Exception as e:
                # Log error but continue the loop
                logger.error(
                    f"Error updating health check for worker {self.worker_id}: {e}",
                    exc_info=True,
                )

            try:
                # Sleep until the next interval, but wake up if shutdown is requested
                await asyncio.wait_for(
                    self._shutdown_event.wait(),
                    timeout=min(60, self.settings.worker_health_check_interval_seconds),
                )
                # If wait_for doesn't time out, shutdown was requested
                logger.debug(
                    f"Worker {self.worker_id} heartbeat loop exiting due to shutdown event."
                )
                break  # Exit loop if shutdown event is set
            except TimeoutError:
                # This is the normal case, continue loop
                pass
            except Exception as sleep_err:
                # Handle potential errors from wait_for itself
                logger.error(
                    f"Worker {self.worker_id} error during heartbeat sleep: {sleep_err}",
                    exc_info=True,
                )
                await asyncio.sleep(1)  # Avoid tight loop

        logger.debug(f"Worker {self.worker_id} heartbeat loop finished.")

    async def _close_resources(self) -> None:
        """Closes the worker's resources, primarily the JobStore connection."""
        logger.info(f"Worker {self.worker_id} closing resources...")
        try:
            await self.job_store.aclose()
            logger.info(f"Worker {self.worker_id} JobStore Redis connection closed.")
        except Exception as e_close:
            logger.error(
                f"Worker {self.worker_id} error closing JobStore: {e_close}",
                exc_info=True,
            )

    def _calculate_backoff_ms(self, job: Job) -> int:
        """Calculates exponential backoff delay in milliseconds based on retry count.

        Uses `base_retry_delay_seconds` and `max_retry_delay_seconds` from settings.

        Args:
            job: The Job object (specifically needs `current_retries`).

        Returns:
            The calculated delay in milliseconds.
        """
        # Simple exponential backoff: base * (2^(retries-1))
        # current_retries is 1-based for calculation after increment.
        retry_attempt = job.current_retries
        if retry_attempt <= 0:
            # Should not happen if called after increment, but safeguard
            retry_attempt = 1

        base_delay = self.settings.base_retry_delay_seconds
        max_delay = self.settings.max_retry_delay_seconds

        delay_seconds = min(max_delay, base_delay * (2 ** (retry_attempt - 1)))
        delay_ms = int(delay_seconds * 1000)
        logger.debug(
            f"Calculated backoff for job {job.id} (attempt {retry_attempt}): {delay_ms}ms"
        )
        return delay_ms

    async def _handle_job_timeout(
        self, job: Job, queue_name: str, error_message: str
    ) -> None:
        """Handles job timeouts by moving them directly to the DLQ."""
        log_message_prefix = f"Worker {self.worker_id} job {job.id} {queue_name}"
        logger.warning(f"{log_message_prefix} processing timeout: {error_message}")

        try:
            # Increment retries as an attempt was made.
            # Even though it's a timeout, it did consume a slot and attempt execution.
            # This also ensures that if _move_to_dlq relies on current_retries for anything, it's accurate.
            await self.job_store.increment_job_retries(job.id)

            # Update the job object with the error message before moving to DLQ
            # _move_to_dlq will set FAILED status and completion_time
            await self._move_to_dlq(job, queue_name, error_message)
            logger.info(f"{log_message_prefix} moved to DLQ due to timeout.")
        except Exception as e_timeout_handle:
            logger.exception(
                f"{log_message_prefix} CRITICAL error in _handle_job_timeout: {e_timeout_handle}"
            )

    async def _handle_fatal_job_error(
        self, job: Job, queue_name: str, error_message: str
    ) -> None:
        """Handles fatal job errors (e.g., handler not found) by moving to DLQ without retries."""
        log_message_prefix = f"Worker {self.worker_id} job {job.id} {queue_name}"
        logger.error(
            f"{log_message_prefix} fatal error: {error_message}. Moving to DLQ."
        )
        try:
            # Increment retries as an attempt was made to process/find handler.
            await self.job_store.increment_job_retries(job.id)
            # Note: _move_to_dlq handles setting FAILED status, completion_time, and last_error.
            await self._move_to_dlq(job, queue_name, error_message)
            logger.info(f"{log_message_prefix} moved to DLQ due to fatal error.")
        except Exception as e_fatal_handle:
            logger.exception(
                f"{log_message_prefix} CRITICAL error in _handle_fatal_job_error: {e_fatal_handle}"
            )

    async def _handle_job_cancellation_on_shutdown(self, job: Job, queue_name: str):
        logger.warning(
            f"Job {job.id} ({job.function_name}) was cancelled. Assuming worker shutdown. Re-queueing."
        )
        try:
            job.status = JobStatus.PENDING
            job.next_scheduled_run_time = datetime.now(UTC)  # Re-queue immediately
            job.last_error = "Job execution interrupted by worker shutdown. Re-queued."
            # Do not increment retries for shutdown interruption

            await self.job_store.save_job_definition(job)
            await self.job_store.add_job_to_queue(
                queue_name, job.id, job.next_scheduled_run_time.timestamp() * 1000
            )
            await self.job_store.release_job_lock(job.id)  # Ensure lock is released

            logger.info(f"Successfully re-queued job {job.id} to {queue_name}.")
        except Exception as e_requeue:
            logger.exception(
                f"Failed to re-queue job {job.id} on cancellation/shutdown: {e_requeue}"
            )
            # Fallback: try to move to DLQ if re-queueing fails catastrophically
            try:
                await self.job_store.move_job_to_dlq(
                    job.id,
                    self.settings.default_dlq_name,
                    f"Failed to re-queue during cancellation: {e_requeue}",
                    datetime.now(UTC),
                )
                logger.info(
                    f"Successfully moved job {job.id} to DLQ due to re-queueing failure."
                )
            except Exception as e_move_to_dlq:
                logger.exception(
                    f"Failed to move job {job.id} to DLQ after re-queueing failure: {e_move_to_dlq}"
                )

    async def close(self) -> None:
        """Gracefully close worker resources."""
        logger.info(f"[{self.worker_id}] Closing RRQ worker...")
        if self.client:  # Check if client exists before closing
            await self.client.close()
        if self.job_store:
            await self.job_store.close()
        logger.info(f"[{self.worker_id}] RRQ worker closed.")
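The retry paths above (`_process_retry_job`, `_process_other_failure`, `_calculate_backoff_ms`) are easiest to see from the handler's side. Below is a minimal, non-authoritative sketch: the `RetryJob(defer_seconds=...)` constructor signature is an assumption (the worker only reads `exc.defer_seconds`), `TransientNetworkError` is a made-up stand-in, and the concrete delay numbers in the comments assume placeholder settings values, not the package's actual defaults.

```python
# Sketch of how a handler interacts with the retry machinery in worker.py.
# Assumption: RetryJob accepts defer_seconds as a keyword argument; the worker
# itself only reads exc.defer_seconds, so the real constructor may differ.
import random

from rrq.exc import RetryJob


class TransientNetworkError(Exception):
    """Stand-in for a recoverable error raised by some I/O layer."""


async def flaky_fetch(context: dict, url: str) -> str:
    try:
        if random.random() < 0.5:  # simulate an intermittent failure
            raise TransientNetworkError(url)
        return "payload"
    except TransientNetworkError:
        # Explicit retry with a custom delay; _process_retry_job re-queues the
        # job with this deferral instead of the exponential backoff.
        raise RetryJob(defer_seconds=30)

# Any other exception from a handler goes through _process_other_failure, which
# re-queues with exponential backoff until job.max_retries is exhausted:
#
#     delay_seconds = min(max_retry_delay_seconds,
#                         base_retry_delay_seconds * 2 ** (attempt - 1))
#
# For example, with base_retry_delay_seconds=5 and max_retry_delay_seconds=300
# (placeholder values, not the package defaults), attempts 1..5 wait
# 5s, 10s, 20s, 40s, 80s, and the delay is capped at 300s from attempt 7 onward.
```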