rrq 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- rrq/cli.py +303 -29
- rrq/cron.py +9 -8
- rrq/store.py +122 -2
- rrq/worker.py +193 -133
- {rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/METADATA +3 -2
- rrq-0.5.0.dist-info/RECORD +16 -0
- rrq-0.4.0.dist-info/RECORD +0 -16
- {rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/WHEEL +0 -0
- {rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/entry_points.txt +0 -0
- {rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/licenses/LICENSE +0 -0
rrq/cli.py CHANGED

```diff
@@ -7,6 +7,9 @@ import os
 import signal
 import subprocess
 import sys
+import time
+
+# import multiprocessing # No longer needed directly, os.cpu_count() is sufficient
 from contextlib import suppress
 
 import click
@@ -30,6 +33,29 @@ logger = logging.getLogger(__name__)
 
 
 # Helper to load settings for commands
+def _resolve_settings_source(
+    settings_object_path: str | None = None,
+) -> tuple[str | None, str]:
+    """Resolve the settings path and its source.
+
+    Returns:
+        A tuple of (settings_path, source_description)
+    """
+    if settings_object_path is not None:
+        return settings_object_path, "--settings parameter"
+
+    env_setting = os.getenv("RRQ_SETTINGS")
+    if env_setting is not None:
+        # Check if a .env file exists to give more specific info
+        if DOTENV_AVAILABLE and find_dotenv(usecwd=True):
+            # We can't definitively know if it came from .env or system env,
+            # but we can indicate both are possible
+            return env_setting, "RRQ_SETTINGS env var (system or .env)"
+        return env_setting, "RRQ_SETTINGS env var"
+
+    return None, "built-in defaults"
+
+
 def _load_app_settings(settings_object_path: str | None = None) -> RRQSettings:
     """Load the settings object from the given path.
     If not provided, the RRQ_SETTINGS environment variable will be used.
@@ -142,7 +168,12 @@ async def check_health_async_impl(settings_object_path: str | None = None) -> bo
         )
         return True
     except redis.exceptions.ConnectionError as e:
-
+        click.echo(
+            click.style(
+                f"ERROR: Redis connection failed during health check: {e}", fg="red"
+            ),
+            err=True,
+        )
         click.echo(
             click.style(
                 f"Worker Health Check: FAIL - Redis connection error: {e}", fg="red"
@@ -150,8 +181,12 @@ async def check_health_async_impl(settings_object_path: str | None = None) -> bo
         )
         return False
     except Exception as e:
-
-
+        click.echo(
+            click.style(
+                f"ERROR: An unexpected error occurred during health check: {e}",
+                fg="red",
+            ),
+            err=True,
         )
         click.echo(
             click.style(f"Worker Health Check: FAIL - Unexpected error: {e}", fg="red")
@@ -169,7 +204,7 @@ def start_rrq_worker_subprocess(
     queues: list[str] | None = None,
 ) -> subprocess.Popen | None:
     """Start an RRQ worker process, optionally for specific queues."""
-    command = ["rrq", "worker", "run"]
+    command = ["rrq", "worker", "run", "--num-workers", "1"]
 
     if settings_object_path:
         command.extend(["--settings", settings_object_path])
@@ -219,15 +254,25 @@ def terminate_worker_process(
             f"Terminating worker process group for PID {process.pid} (PGID {pgid})..."
         )
         os.killpg(pgid, signal.SIGTERM)
-        process.wait(timeout=
+        process.wait(timeout=10)
     except subprocess.TimeoutExpired:
-
-
+        click.echo(
+            click.style(
+                f"WARNING: Worker process {process.pid} did not terminate gracefully (SIGTERM timeout), sending SIGKILL.",
+                fg="yellow",
+            ),
+            err=True,
         )
         with suppress(ProcessLookupError):
            os.killpg(os.getpgid(process.pid), signal.SIGKILL)
     except Exception as e:
-
+        click.echo(
+            click.style(
+                f"ERROR: Unexpected error checking worker process {process.pid}: {e}",
+                fg="red",
+            ),
+            err=True,
+        )
 
 
 async def watch_rrq_worker_impl(
@@ -236,9 +281,19 @@ async def watch_rrq_worker_impl(
     queues: list[str] | None = None,
 ) -> None:
     abs_watch_path = os.path.abspath(watch_path)
-    click.echo(
-
-
+    click.echo(f"Watching for file changes in {abs_watch_path}...")
+
+    # Load settings and display source
+    click.echo("Loading RRQ Settings... ", nl=False)
+
+    if settings_object_path:
+        click.echo(f"from --settings parameter ({settings_object_path}).")
+    elif os.getenv("RRQ_SETTINGS"):
+        click.echo(f"from RRQ_SETTINGS env var ({os.getenv('RRQ_SETTINGS')}).")
+    elif DOTENV_AVAILABLE and find_dotenv(usecwd=True):
+        click.echo("found in .env file.")
+    else:
+        click.echo("using defaults.")
     worker_process: subprocess.Popen | None = None
     loop = asyncio.get_event_loop()
     shutdown_event = asyncio.Event()
@@ -278,7 +333,9 @@ async def watch_rrq_worker_impl(
                 queues=queues,
             )
     except Exception as e:
-
+        click.echo(
+            click.style(f"ERROR: Error in watch_rrq_worker: {e}", fg="red"), err=True
+        )
     finally:
         logger.info("Exiting watch mode. Ensuring worker process is terminated.")
         if not shutdown_event.is_set():
@@ -338,28 +395,85 @@ def worker_cli():
         "The specified settings object must include a `job_registry: JobRegistry`."
     ),
 )
+@click.option(
+    "--num-workers",
+    type=int,
+    default=None,
+    help="Number of parallel worker processes to start. Defaults to the number of CPU cores.",
+)
 def worker_run_command(
     burst: bool,
     queues: tuple[str, ...],
     settings_object_path: str,
+    num_workers: int | None,
 ):
-    """Run
+    """Run RRQ worker processes.
     Requires an application-specific settings object.
     """
-
+    if num_workers is None:
+        num_workers = (
+            os.cpu_count() or 1
+        )  # Default to CPU cores, or 1 if cpu_count() is None
+        click.echo(
+            f"No --num-workers specified, defaulting to {num_workers} (CPU cores)."
+        )
+    elif num_workers <= 0:
+        click.echo(
+            click.style("ERROR: --num-workers must be a positive integer.", fg="red"),
+            err=True,
+        )
+        sys.exit(1)
 
-    #
-
-
+    # Restrict burst mode with multiple workers
+    if num_workers > 1 and burst:
+        click.echo(
+            click.style(
+                "ERROR: --burst mode is not supported with multiple workers (--num-workers > 1). "
+                "Burst mode cannot coordinate across multiple processes.",
+                fg="red",
+            ),
+            err=True,
+        )
+        sys.exit(1)
 
-
-
-
+    # Display settings source
+    click.echo("Loading RRQ Settings... ", nl=False)
+    if settings_object_path:
+        click.echo(f"from --settings parameter ({settings_object_path}).")
+    elif os.getenv("RRQ_SETTINGS"):
+        click.echo(f"from RRQ_SETTINGS env var ({os.getenv('RRQ_SETTINGS')}).")
+    elif DOTENV_AVAILABLE and find_dotenv(usecwd=True):
+        click.echo("found in .env file.")
+    else:
+        click.echo("using defaults.")
+
+    if num_workers == 1:
+        # Run a single worker in the current process
+        click.echo(f"Starting 1 RRQ worker process (Burst: {burst})")
+        _run_single_worker(
+            burst, list(queues) if queues else None, settings_object_path
+        )
+    else:
+        # Run multiple worker subprocesses
+        click.echo(f"Starting {num_workers} RRQ worker processes")
+        # Burst is guaranteed to be False here
+        _run_multiple_workers(
+            num_workers, list(queues) if queues else None, settings_object_path
+        )
+
+
+def _run_single_worker(
+    burst: bool,
+    queues_arg: list[str] | None,
+    settings_object_path: str | None,
+):
+    """Helper function to run a single RRQ worker instance."""
+    rrq_settings = _load_app_settings(settings_object_path)
 
     if not rrq_settings.job_registry:
         click.echo(
             click.style(
-                "ERROR: No '
+                "ERROR: No 'job_registry'. You must provide a JobRegistry instance in settings.",
                 fg="red",
             ),
             err=True,
@@ -378,22 +492,182 @@ def worker_run_command(
         burst=burst,
     )
 
-    loop = asyncio.get_event_loop()
     try:
-        logger.info("Starting worker run loop...")
-
+        logger.info("Starting worker run loop for single worker...")
+        asyncio.run(worker_instance.run())
     except KeyboardInterrupt:
         logger.info("RRQ Worker run interrupted by user (KeyboardInterrupt).")
     except Exception as e:
-
+        click.echo(
+            click.style(f"ERROR: Exception during RRQ Worker run: {e}", fg="red"),
+            err=True,
+        )
+        # Consider re-raising or sys.exit(1) if the exception means failure
     finally:
-
-
-        loop.run_until_complete(loop.shutdown_asyncgens())
-        loop.close()
+        # asyncio.run handles loop cleanup.
+        logger.info("RRQ Worker run finished or exited.")
     logger.info("RRQ Worker has shut down.")
 
 
+def _run_multiple_workers(
+    num_workers: int,
+    queues: list[str] | None,
+    settings_object_path: str | None,
+):
+    """Manages multiple worker subprocesses."""
+    processes: list[subprocess.Popen] = []
+    # loop = asyncio.get_event_loop() # Not needed here, this function is synchronous
+
+    original_sigint_handler = signal.getsignal(signal.SIGINT)
+    original_sigterm_handler = signal.getsignal(signal.SIGTERM)
+
+    def sig_handler(signum, frame):
+        click.echo(
+            f"\nSignal {signal.Signals(signum).name} received. Terminating child workers..."
+        )
+        # Send SIGTERM to all processes
+        for i, p in enumerate(processes):
+            if p.poll() is None:  # Process is still running
+                try:
+                    pgid = os.getpgid(p.pid)
+                    click.echo(f"Sending SIGTERM to worker {i + 1} (PID {p.pid})...")
+                    os.killpg(pgid, signal.SIGTERM)
+                except (ProcessLookupError, OSError):
+                    pass  # Process already dead
+        # Restore original handlers before exiting or re-raising
+        signal.signal(signal.SIGINT, original_sigint_handler)
+        signal.signal(signal.SIGTERM, original_sigterm_handler)
+        # Propagate signal to ensure parent exits if it was, e.g., a Ctrl+C
+        # This is a bit tricky; for now, just exit.
+        # A more robust way might involve re-raising the signal if not handled by click/asyncio.
+        sys.exit(0)
+
+    signal.signal(signal.SIGINT, sig_handler)
+    signal.signal(signal.SIGTERM, sig_handler)
+
+    try:
+        for i in range(num_workers):
+            # Construct the command for the subprocess.
+            # Each subprocess runs 'rrq worker run' for a single worker.
+            # We pass along relevant flags like --settings, --queue, and --burst.
+            # Crucially, we do *not* pass --num-workers to the child,
+            # or rather, we could conceptually pass --num-workers 1.
+            # Use the rrq executable from the same venv
+            venv_bin_dir = os.path.dirname(sys.executable)
+            rrq_executable = os.path.join(venv_bin_dir, "rrq")
+            cmd = [rrq_executable, "worker", "run", "--num-workers=1"]
+            if settings_object_path:
+                cmd.extend(["--settings", settings_object_path])
+            elif os.getenv("RRQ_SETTINGS"):
+                # Pass the RRQ_SETTINGS env var as explicit parameter to subprocess
+                cmd.extend(["--settings", os.getenv("RRQ_SETTINGS")])
+            else:
+                # Default to app.config.rrq.rrq_settings for ResQ
+                cmd.extend(["--settings", "app.config.rrq.rrq_settings"])
+            if queues:
+                for q_name in queues:
+                    cmd.extend(["--queue", q_name])
+            click.echo(f"Starting worker subprocess {i + 1}/{num_workers}...")
+
+            # Set up environment - add current directory to PYTHONPATH
+            env = os.environ.copy()
+            current_pythonpath = env.get("PYTHONPATH", "")
+            current_dir = os.getcwd()
+            if current_pythonpath:
+                env["PYTHONPATH"] = f"{current_dir}:{current_pythonpath}"
+            else:
+                env["PYTHONPATH"] = current_dir
+
+            # Configure output redirection
+            is_testing = "PYTEST_CURRENT_TEST" in os.environ
+            stdout_dest = None if not is_testing else subprocess.DEVNULL
+            stderr_dest = None if not is_testing else subprocess.DEVNULL
+
+            process = subprocess.Popen(
+                cmd,
+                start_new_session=True,
+                stdout=stdout_dest,
+                stderr=stderr_dest,
+                cwd=os.getcwd(),
+                env=env,
+            )
+            processes.append(process)
+            click.echo(f"Worker subprocess {i + 1} started with PID {process.pid}")
+
+        # Wait for all processes to complete
+        click.echo(f"All {num_workers} workers started. Press Ctrl+C to stop.")
+        exit_codes = []
+
+        try:
+            for p in processes:
+                exit_code = p.wait()
+                exit_codes.append(exit_code)
+        except KeyboardInterrupt:
+            # Signal handler has already sent SIGTERM, now wait with timeout
+            max_wait = 10
+            check_interval = 0.1
+            elapsed = 0
+
+            while elapsed < max_wait:
+                time.sleep(check_interval)
+                elapsed += check_interval
+
+                # Check if all processes have terminated
+                all_terminated = all(p.poll() is not None for p in processes)
+                if all_terminated:
+                    click.echo("All workers terminated gracefully.")
+                    break
+            else:
+                # Timeout reached, force kill remaining processes
+                for i, p in enumerate(processes):
+                    if p.poll() is None:
+                        try:
+                            click.echo(
+                                click.style(
+                                    f"WARNING: Worker {i + 1} did not terminate gracefully, sending SIGKILL.",
+                                    fg="yellow",
+                                ),
+                                err=True,
+                            )
+                            os.killpg(os.getpgid(p.pid), signal.SIGKILL)
+                        except (ProcessLookupError, OSError):
+                            pass
+
+            # Collect exit codes
+            for p in processes:
+                exit_codes.append(p.wait())
+
+        # Report results
+        for i, exit_code in enumerate(exit_codes):
+            click.echo(f"Worker subprocess {i + 1} exited with code {exit_code}")
+            if exit_code != 0:
+                click.echo(
+                    click.style(
+                        f"Worker subprocess {i + 1} failed with exit code {exit_code}",
+                        fg="red",
+                    ),
+                    err=True,
+                )
+
+    except Exception as e:
+        click.echo(
+            click.style(f"ERROR: Error managing worker subprocesses: {e}", fg="red"),
+            err=True,
+        )
+        # Terminate any running processes if an error occurs in the manager
+        for p in processes:
+            if p.poll() is None:  # If process is still running
+                terminate_worker_process(p, logger)
+    finally:
+        logger.info("All worker subprocesses terminated or completed.")
+        # Restore original signal handlers
+        signal.signal(signal.SIGINT, original_sigint_handler)
+        signal.signal(signal.SIGTERM, original_sigterm_handler)
+        # Any other cleanup for the parent process
+        # No loop to check or close here as this part is synchronous
+        logger.info("Parent process for multi-worker management is exiting.")
+
+
 @worker_cli.command("watch")
 @click.option(
     "--path",
```
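One detail worth calling out in `_run_multiple_workers`: each child is spawned with `start_new_session=True` and terminated via `os.killpg`, so a single signal reaches the worker and everything it spawned. A minimal standalone sketch of that POSIX pattern (the sleeping child is just a stand-in for a worker, not rrq code):

```python
import os
import signal
import subprocess
import sys

# A child started with start_new_session=True becomes the leader of a new
# process group, so os.killpg() can signal the worker and all its descendants.
proc = subprocess.Popen(
    [sys.executable, "-c", "import time; time.sleep(60)"],  # stand-in for a worker
    start_new_session=True,
)
pgid = os.getpgid(proc.pid)  # for a new session leader, pgid equals the child's pid
os.killpg(pgid, signal.SIGTERM)  # delivered to every process in the group
proc.wait(timeout=10)
print("child exited with", proc.returncode)
```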
rrq/cron.py CHANGED

```diff
@@ -42,22 +42,24 @@ def _parse_value(value: str, names: dict[str, int], min_val: int, max_val: int)
     return num
 
 
-def _parse_field(
+def _parse_field(
+    field: str, *, names: dict[str, int] | None, min_val: int, max_val: int
+) -> Sequence[int]:
     names = names or {}
     if field == "*":
         return list(range(min_val, max_val + 1))
     values: set[int] = set()
-    for part in field.split(
+    for part in field.split(","):
         step = 1
-        if
-            base, step_str = part.split(
+        if "/" in part:
+            base, step_str = part.split("/", 1)
             step = int(step_str)
         else:
             base = part
         if base == "*":
             start, end = min_val, max_val
-        elif
-            a, b = base.split(
+        elif "-" in base:
+            a, b = base.split("-", 1)
             start = _parse_value(a, names, min_val, max_val)
             end = _parse_value(b, names, min_val, max_val)
         else:
@@ -102,7 +104,7 @@ class CronSchedule:
         python_weekday = dt.weekday()
         cron_weekday = (python_weekday + 1) % 7
         dow_match = cron_weekday in self.dow
-
+
         if self.dom_all and self.dow_all:
             condition = True
         elif self.dom_all:
@@ -119,7 +121,6 @@ class CronSchedule:
             dt += timedelta(minutes=1)
 
 
-
 class CronJob(BaseModel):
     """Simple cron job specification based on a cron schedule."""
 
```
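For reference, the grammar `_parse_field` accepts (`*`, comma-separated parts, `a-b` ranges, and `/step` suffixes) can be exercised with this simplified standalone re-implementation; it drops the named-value lookup that `_parse_value` performs (names like `mon` or `jan`) to stay self-contained:

```python
# Illustrative re-implementation of the cron field grammar from rrq/cron.py.
def parse_field(field: str, min_val: int, max_val: int) -> list[int]:
    if field == "*":
        return list(range(min_val, max_val + 1))
    values: set[int] = set()
    for part in field.split(","):          # comma-separated sub-expressions
        step = 1
        if "/" in part:                    # "base/step" syntax
            base, step_str = part.split("/", 1)
            step = int(step_str)
        else:
            base = part
        if base == "*":
            start, end = min_val, max_val
        elif "-" in base:                  # "a-b" range
            a, b = base.split("-", 1)
            start, end = int(a), int(b)
        else:                              # single value
            start = end = int(base)
        values.update(range(start, end + 1, step))
    return sorted(values)

print(parse_field("*/15", 0, 59))    # [0, 15, 30, 45]
print(parse_field("1-5,30", 0, 59))  # [1, 2, 3, 4, 5, 30]
```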
rrq/store.py CHANGED

```diff
@@ -21,6 +21,7 @@ from .settings import RRQSettings
 
 logger = logging.getLogger(__name__)
 
+
 class JobStore:
     """Provides an abstraction layer for interacting with Redis for RRQ operations.
 
@@ -38,7 +39,33 @@ class JobStore:
         self.redis = AsyncRedis.from_url(
             settings.redis_dsn, decode_responses=False
         )  # Work with bytes initially
-
+
+        # LUA scripts for atomic operations
+        self._atomic_lock_and_remove_script = """
+            -- KEYS: [1] = lock_key, [2] = queue_key
+            -- ARGV: [1] = worker_id, [2] = lock_timeout_ms, [3] = job_id
+            local lock_result = redis.call('SET', KEYS[1], ARGV[1], 'NX', 'PX', ARGV[2])
+            if lock_result then
+                local removed_count = redis.call('ZREM', KEYS[2], ARGV[3])
+                if removed_count == 0 then
+                    redis.call('DEL', KEYS[1]) -- Release lock if job wasn't in queue
+                    return {0, 0} -- {lock_acquired, removed_count}
+                end
+                return {1, removed_count}
+            else
+                return {0, 0}
+            end
+        """
+
+        self._atomic_retry_script = """
+            -- KEYS: [1] = job_key, [2] = queue_key
+            -- ARGV: [1] = job_id, [2] = retry_at_score, [3] = error_message, [4] = status
+            local new_retry_count = redis.call('HINCRBY', KEYS[1], 'current_retries', 1)
+            redis.call('HMSET', KEYS[1], 'status', ARGV[4], 'last_error', ARGV[3])
+            redis.call('ZADD', KEYS[2], ARGV[2], ARGV[1])
+            return new_retry_count
+        """
+
     def _format_queue_key(self, queue_name: str) -> str:
         """Normalize a queue name or key into a Redis key for ZSET queues."""
 
@@ -308,6 +335,99 @@ class JobStore:
             logger.debug(f"Released lock for job {job_id} ({lock_key}).")
         # No need to log if lock didn't exist
 
+    async def atomic_lock_and_remove_job(
+        self, job_id: str, queue_name: str, worker_id: str, lock_timeout_ms: int
+    ) -> tuple[bool, int]:
+        """Atomically acquires a job lock and removes the job from the queue.
+
+        This is a critical operation that prevents race conditions between multiple
+        workers trying to process the same job.
+
+        Args:
+            job_id: The ID of the job to lock and remove.
+            queue_name: The name of the queue to remove the job from.
+            worker_id: The ID of the worker attempting to acquire the lock.
+            lock_timeout_ms: The lock timeout/TTL in milliseconds.
+
+        Returns:
+            A tuple of (lock_acquired: bool, removed_count: int).
+            - lock_acquired: True if the lock was successfully acquired
+            - removed_count: Number of jobs removed from the queue (0 or 1)
+        """
+        lock_key = f"{LOCK_KEY_PREFIX}{job_id}"
+        queue_key = self._format_queue_key(queue_name)
+
+        result = await self.redis.eval(
+            self._atomic_lock_and_remove_script,
+            2,  # Number of keys
+            lock_key,
+            queue_key,
+            worker_id.encode("utf-8"),
+            str(lock_timeout_ms),
+            job_id.encode("utf-8"),
+        )
+
+        lock_acquired = bool(result[0])
+        removed_count = int(result[1])
+
+        if lock_acquired and removed_count > 0:
+            logger.debug(
+                f"Worker {worker_id} atomically acquired lock and removed job {job_id} from queue '{queue_name}'."
+            )
+        elif not lock_acquired:
+            logger.debug(
+                f"Worker {worker_id} failed to acquire lock for job {job_id} (already locked by another worker)."
+            )
+        else:
+            logger.warning(
+                f"Worker {worker_id} acquired lock for job {job_id} but job was already removed from queue '{queue_name}'."
+            )
+
+        return lock_acquired, removed_count
+
+    async def atomic_retry_job(
+        self,
+        job_id: str,
+        queue_name: str,
+        retry_at_score: float,
+        error_message: str,
+        status: JobStatus,
+    ) -> int:
+        """Atomically increments job retry count, updates status/error, and re-queues the job.
+
+        This prevents race conditions in the retry logic where multiple operations
+        need to be performed atomically.
+
+        Args:
+            job_id: The ID of the job to retry.
+            queue_name: The name of the queue to add the job back to.
+            retry_at_score: The score (timestamp) when the job should be retried.
+            error_message: The error message to store.
+            status: The job status to set (usually RETRYING).
+
+        Returns:
+            The new retry count after incrementing.
+        """
+        job_key = f"{JOB_KEY_PREFIX}{job_id}"
+        queue_key = self._format_queue_key(queue_name)
+
+        new_retry_count = await self.redis.eval(
+            self._atomic_retry_script,
+            2,  # Number of keys
+            job_key,
+            queue_key,
+            job_id.encode("utf-8"),
+            str(retry_at_score),
+            error_message.encode("utf-8"),
+            status.value.encode("utf-8"),
+        )
+
+        new_count = int(new_retry_count)
+        logger.debug(
+            f"Atomically incremented retries for job {job_id} to {new_count} and re-queued for retry."
+        )
+        return new_count
+
     async def update_job_status(self, job_id: str, status: JobStatus) -> None:
         """Updates only the status field of a job in its Redis hash.
 
@@ -368,7 +488,7 @@ class JobStore:
         pipe.expire(job_key, DEFAULT_DLQ_RESULT_TTL_SECONDS)
         results = await pipe.execute()
         logger.info(f"Moved job {job_id} to DLQ '{dlq_redis_key}'. Results: {results}")
-
+
     async def requeue_dlq(
         self,
         dlq_name: str,
```
rrq/worker.py CHANGED

```diff
@@ -7,6 +7,7 @@ import asyncio
 # Use standard logging instead of custom one if appropriate
 import logging
 import os
+import random
 import signal
 import time
 import uuid
@@ -21,7 +22,6 @@ from rrq.client import RRQClient
 
 from .constants import (
     DEFAULT_WORKER_ID_PREFIX,
-    JOB_KEY_PREFIX,
 )
 from .exc import RetryJob
 from .job import Job, JobStatus
@@ -91,6 +91,30 @@ class RRQWorker:
             f"Initializing RRQWorker {self.worker_id} for queues: {self.queues}"
         )
 
+    def _calculate_jittered_delay(
+        self, base_delay: float, jitter_factor: float = 0.5
+    ) -> float:
+        """Calculate a jittered delay to prevent thundering herd effects.
+
+        Args:
+            base_delay: The base delay in seconds.
+            jitter_factor: Factor for jitter (0.0 to 1.0). Default 0.5 means ±50% jitter.
+
+        Returns:
+            The jittered delay in seconds.
+        """
+        # Clamp jitter_factor to safe range to prevent negative delays
+        jitter_factor = max(0.0, min(jitter_factor, 0.99))
+
+        # Calculate jitter range: base_delay * (1 ± jitter_factor)
+        min_delay = base_delay * (1 - jitter_factor)
+        max_delay = base_delay * (1 + jitter_factor)
+
+        # Ensure min_delay is always positive
+        min_delay = max(0.001, min_delay)
+
+        return random.uniform(min_delay, max_delay)
+
     async def _call_startup_hook(self) -> None:
         if self.settings.on_startup:
             logger.info(f"Worker {self.worker_id} calling on_startup hook...")
@@ -171,14 +195,19 @@ class RRQWorker:
                 self.status = "idle (concurrency limit)"
                 # At concurrency limit, wait for tasks to finish or poll delay
 
-
+                # Use jittered delay to prevent thundering herd effects
+                jittered_delay = self._calculate_jittered_delay(
+                    self.settings.default_poll_delay_seconds
+                )
+                await asyncio.sleep(jittered_delay)
             except Exception as e:
                 logger.error(
                     f"Worker {self.worker_id} encountered error in main run loop: {e}",
                     exc_info=True,
                 )
-                # Avoid tight loop on persistent errors
-
+                # Avoid tight loop on persistent errors with jittered delay
+                jittered_delay = self._calculate_jittered_delay(1.0)
+                await asyncio.sleep(jittered_delay)
 
         logger.info(
             f"Worker {self.worker_id} shutdown signal received. Draining tasks..."
@@ -222,53 +251,65 @@ class RRQWorker:
                 if fetched_count >= count or self._shutdown_event.is_set():
                     break
 
-                # Attempt to acquire semaphore *before* trying to process
-                await self._semaphore.acquire()
                 try:
-                    #
-
-                    if
-
-
-
-
+                    # Try to acquire lock and remove from queue first (without semaphore)
+                    job_acquired = await self._try_acquire_job(job_id, queue_name)
+                    if job_acquired:
+                        # Only acquire semaphore after successfully getting the job
+                        await self._semaphore.acquire()
+                        try:
+                            # Process the job (we already have the lock and removed from queue)
+                            # The semaphore will be released when the job task completes
+                            await self._process_acquired_job(
+                                job_acquired, queue_name
+                            )
+                            fetched_count += 1
+                        except Exception as e_process:
+                            logger.error(
+                                f"Worker {self.worker_id} exception processing acquired job {job_id}: {e_process}",
+                                exc_info=True,
+                            )
+                            # Release lock and semaphore since processing failed
+                            await self.job_store.release_job_lock(job_id)
+                            self._semaphore.release()
+                    # If job_acquired is None, another worker got it - continue to next job
                 except Exception as e_try:
-                    # Catch errors during the
+                    # Catch errors during the job acquisition itself
                     logger.error(
-                        f"Worker {self.worker_id} exception trying to
+                        f"Worker {self.worker_id} exception trying to acquire job {job_id}: {e_try}",
                         exc_info=True,
                     )
-                    self._semaphore.release()  # Ensure semaphore is released on error
 
         except Exception as e_poll:
             logger.error(
                 f"Worker {self.worker_id} error polling queue '{queue_name}': {e_poll}",
                 exc_info=True,
             )
-
+            # Avoid tight loop on polling error with jittered delay
+            jittered_delay = self._calculate_jittered_delay(1.0)
+            await asyncio.sleep(jittered_delay)
         # For burst mode, return number of jobs fetched in this poll
         return fetched_count
 
-    async def
-    """Attempts to lock
+    async def _try_acquire_job(self, job_id: str, queue_name: str) -> Optional[Job]:
+        """Attempts to atomically lock and remove a job from the queue.
 
         Args:
-            job_id: The ID of the job to attempt
+            job_id: The ID of the job to attempt acquiring.
             queue_name: The name of the queue the job ID was retrieved from.
 
         Returns:
-
-            (e.g., lock conflict, job definition not found, already removed).
+            The Job object if successfully acquired, None otherwise.
         """
         logger.debug(
-            f"Worker {self.worker_id} attempting to
+            f"Worker {self.worker_id} attempting to acquire job {job_id} from queue '{queue_name}'"
         )
         job = await self.job_store.get_job_definition(job_id)
         if not job:
             logger.warning(
-                f"Worker {self.worker_id} job definition {job_id} not found during
+                f"Worker {self.worker_id} job definition {job_id} not found during _try_acquire_job from queue {queue_name}."
             )
-            return
+            return None  # Job vanished between poll and fetch?
 
         # Determine job-specific timeout and calculate lock timeout
         job_timeout = (
@@ -280,32 +321,28 @@ class RRQWorker:
             job_timeout + self.settings.default_lock_timeout_extension_seconds
         ) * 1000
 
-        #
-        lock_acquired = await self.job_store.
-            job.id, self.worker_id, int(lock_timeout_ms)
+        # Atomically acquire the processing lock and remove from queue
+        lock_acquired, removed_count = await self.job_store.atomic_lock_and_remove_job(
+            job.id, queue_name, self.worker_id, int(lock_timeout_ms)
         )
-        if not lock_acquired:
-            logger.debug(
-                f"Worker {self.worker_id} failed to acquire lock for job {job.id} (already locked by another worker)."
-            )
-            return False  # Another worker got there first
 
-
+        if not lock_acquired or removed_count == 0:
+            return None  # Another worker got there first
 
-        #
-
-
-
-
-        )
-
-
-
-        )
-        await self.job_store.release_job_lock(job.id)  # Release the acquired lock
-        return False  # Job processed by another worker between our poll and lock
+        # Successfully acquired the job
+        logger.debug(f"Worker {self.worker_id} successfully acquired job {job.id}")
+        return job
+
+    async def _process_acquired_job(self, job: Job, queue_name: str) -> None:
+        """Processes a job that has already been acquired (locked and removed from queue).
+
+        Note: This method assumes the worker has already acquired the concurrency semaphore.
+        The semaphore will be released when the job task completes via _task_cleanup.
 
-
+        Args:
+            job: The Job object that was successfully acquired.
+            queue_name: The name of the queue the job was retrieved from.
+        """
         try:
             await self.job_store.update_job_status(job.id, JobStatus.ACTIVE)
             logger.debug(
@@ -313,21 +350,58 @@ class RRQWorker:
             )
 
             # Create and track the execution task
+            # The semaphore will be released when this task completes
             task = self._loop.create_task(self._execute_job(job, queue_name))
             self._running_tasks.add(task)
             task.add_done_callback(lambda t: self._task_cleanup(t, self._semaphore))
             logger.info(
                 f"Worker {self.worker_id} started job {job.id} ('{job.function_name}') from queue '{queue_name}'"
             )
-            return True
         except Exception as e_start:
             # Catch errors during status update or task creation
             logger.error(
-                f"Worker {self.worker_id} failed to start task for job {job.id} after
+                f"Worker {self.worker_id} failed to start task for job {job.id} after acquisition: {e_start}",
                 exc_info=True,
             )
-            #
+            # Release the lock since task wasn't started
             await self.job_store.release_job_lock(job.id)
+            raise  # Re-raise to be handled by caller
+
+    async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
+        """Attempts to lock, fetch definition, and start the execution task for a specific job.
+
+        This method is kept for backward compatibility and uses the optimized approach internally.
+        For new code, prefer using _try_acquire_job and _process_acquired_job separately.
+
+        Note: This method handles semaphore acquisition internally for backward compatibility.
+
+        Args:
+            job_id: The ID of the job to attempt processing.
+            queue_name: The name of the queue the job ID was retrieved from.
+
+        Returns:
+            True if the job processing task was successfully started, False otherwise
+            (e.g., lock conflict, job definition not found, already removed).
+        """
+        # Use the optimized approach: acquire job first, then process
+        job_acquired = await self._try_acquire_job(job_id, queue_name)
+        if not job_acquired:
+            return False
+
+        # For backward compatibility, acquire semaphore here since old callers expect it
+        await self._semaphore.acquire()
+        try:
+            # Process the acquired job
+            await self._process_acquired_job(job_acquired, queue_name)
+            return True
+        except Exception as e_process:
+            logger.error(
+                f"Worker {self.worker_id} failed to process acquired job {job_id}: {e_process}",
+                exc_info=True,
+            )
+            # Release semaphore on error since _process_acquired_job doesn't handle it
+            self._semaphore.release()
+            # Lock is already released in _process_acquired_job on error
             return False
 
     async def _execute_job(self, job: Job, queue_name: str) -> None:
@@ -475,63 +549,54 @@ class RRQWorker:
         appropriate delay (custom or exponential backoff) or moves to DLQ.
         """
         log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
-
-        try:
-            # Atomically increment retries in the store.
-            new_retry_count = await self.job_store.increment_job_retries(job.id)
-            max_retries = (
-                job.max_retries
-            )  # Use max_retries from the job object passed in
-
-            if new_retry_count < max_retries:
-                # Update status and error atomically
-                await self.job_store.redis.hset(
-                    job_key,
-                    mapping={
-                        "status": JobStatus.RETRYING.value,
-                        "last_error": str(exc),
-                    },
-                )
-                logger.debug(f"{log_prefix} status set to RETRYING, error saved.")
-
-                # Determine deferral time
-                defer_seconds = exc.defer_seconds
-                if defer_seconds is None:
-                    # Create a temporary job representation for backoff calculation
-                    # using the *new* retry count.
-                    temp_job_for_backoff = Job(
-                        id=job.id,
-                        function_name=job.function_name,
-                        current_retries=new_retry_count,  # Use updated count
-                        max_retries=max_retries,  # Ensure this is passed
-                    )
-                    defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
-                    defer_seconds = defer_ms / 1000.0
-                else:
-                    logger.debug(
-                        f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
-                    )
+        max_retries = job.max_retries
 
-
-
-
-
-
-                logger.info(
-                    f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
-                    f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
-                )
-            else:
-                # Max retries exceeded even though RetryJob was raised
+        try:
+            # Check if we would exceed max retries
+            anticipated_retry_count = job.current_retries + 1
+            if anticipated_retry_count >= max_retries:
+                # Max retries exceeded, increment retry count and move directly to DLQ
                 logger.warning(
                     f"{log_prefix} max retries ({max_retries}) exceeded "
-                    f"
+                    f"with RetryJob exception. Moving to DLQ."
                 )
-                #
+                # Increment retry count before moving to DLQ
+                await self.job_store.increment_job_retries(job.id)
                 error_msg = (
                     str(exc) or f"Max retries ({max_retries}) exceeded after RetryJob"
                 )
                 await self._move_to_dlq(job, queue_name, error_msg)
+                return
+
+            # Determine deferral time
+            defer_seconds = exc.defer_seconds
+            if defer_seconds is None:
+                # Create a temporary job representation for backoff calculation
+                temp_job_for_backoff = Job(
+                    id=job.id,
+                    function_name=job.function_name,
+                    current_retries=anticipated_retry_count,  # Use anticipated count
+                    max_retries=max_retries,
+                )
+                defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
+                defer_seconds = defer_ms / 1000.0
+            else:
+                logger.debug(
+                    f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
+                )
+
+            retry_at_score = (time.time() + defer_seconds) * 1000
+            target_queue = job.queue_name or self.settings.default_queue_name
+
+            # Atomically increment retries, update status/error, and re-queue
+            new_retry_count = await self.job_store.atomic_retry_job(
+                job.id, target_queue, retry_at_score, str(exc), JobStatus.RETRYING
+            )
+
+            logger.info(
+                f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
+                f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
+            )
         except Exception as e_handle:
             logger.exception(
                 f"{log_prefix} CRITICAL error during RetryJob processing: {e_handle}"
@@ -549,48 +614,43 @@ class RRQWorker:
         logger.debug(f"{log_prefix} processing general failure: {type(exc).__name__}")
 
         try:
-            new_retry_count = await self.job_store.increment_job_retries(job.id)
-            # Re-fetch job state after incrementing retries might be safer if fields changed?
-            # For now, assume the job object passed in is mostly accurate except for retry count.
-            # Use max_retries from the job object passed in.
             max_retries = job.max_retries
             last_error_str = str(exc)
 
-            if
-
-
-
-                        id=job.id,
-                        function_name=job.function_name,
-                        current_retries=new_retry_count,
-                        max_retries=max_retries,
-                    )
-                )
-                retry_at_score = (time.time() * 1000) + defer_ms
-                target_queue = job.queue_name or self.settings.default_queue_name
-
-                # Atomically update status/error and re-add to queue (if possible, else separate)
-                # For now, separate HSET and ZADD
-                await self.job_store.redis.hset(
-                    f"{JOB_KEY_PREFIX}{job.id}",
-                    mapping={
-                        "status": JobStatus.RETRYING.value,
-                        "last_error": last_error_str,
-                    },
-                )
-                await self.job_store.add_job_to_queue(
-                    target_queue, job.id, retry_at_score
-                )
-                logger.info(
-                    f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
-                    f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
-                )
-            else:  # Max retries reached
+            # Check if we would exceed max retries
+            anticipated_retry_count = job.current_retries + 1
+            if anticipated_retry_count >= max_retries:
+                # Max retries exceeded, increment retry count and move directly to DLQ
                 logger.warning(
                     f"{log_prefix} failed after max retries ({max_retries}). Moving to DLQ. Error: {str(exc)[:100]}..."
                 )
+                # Increment retry count before moving to DLQ
+                await self.job_store.increment_job_retries(job.id)
                 # _move_to_dlq handles setting FAILED status, completion time, and last error.
                 await self._move_to_dlq(job, queue_name, last_error_str)
+                return
+
+            # Calculate backoff delay using anticipated retry count
+            defer_ms = self._calculate_backoff_ms(
+                Job(
+                    id=job.id,
+                    function_name=job.function_name,
+                    current_retries=anticipated_retry_count,  # Use anticipated count
+                    max_retries=max_retries,
+                )
+            )
+            retry_at_score = (time.time() * 1000) + defer_ms
+            target_queue = job.queue_name or self.settings.default_queue_name
+
+            # Atomically increment retries, update status/error, and re-queue
+            new_retry_count = await self.job_store.atomic_retry_job(
+                job.id, target_queue, retry_at_score, last_error_str, JobStatus.RETRYING
+            )
+
+            logger.info(
+                f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
+                f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
+            )
 
         except Exception as e_handle:
             logger.exception(
```
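The jitter used throughout the new polling code is plain uniform jitter around a base delay. This standalone sketch mirrors the arithmetic of `_calculate_jittered_delay` (an illustrative re-implementation, not an import from rrq): a base of 1.0 s with the default factor 0.5 yields a delay drawn uniformly from [0.5 s, 1.5 s], so concurrent workers wake at staggered times instead of stampeding Redis together.

```python
import random

def jittered_delay(base_delay: float, jitter_factor: float = 0.5) -> float:
    # Clamp the factor so the lower bound never goes negative.
    jitter_factor = max(0.0, min(jitter_factor, 0.99))
    lo = max(0.001, base_delay * (1 - jitter_factor))  # keep delay strictly positive
    hi = base_delay * (1 + jitter_factor)
    return random.uniform(lo, hi)

print(jittered_delay(1.0))  # e.g. 1.23 (anywhere in 0.5..1.5)
```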
{rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/METADATA CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rrq
-Version: 0.4.0
+Version: 0.5.0
 Summary: RRQ is a Python library for creating reliable job queues using Redis and asyncio
 Project-URL: Homepage, https://github.com/getresq/rrq
 Project-URL: Bug Tracker, https://github.com/getresq/rrq/issues
@@ -265,7 +265,8 @@ RRQ provides a command-line interface (CLI) for managing workers and performing
 - **`rrq worker run`** - Run an RRQ worker process.
   - `--settings` (optional): Specify the Python path to your settings object (e.g., `myapp.worker_config.rrq_settings`). If not provided, it will use the `RRQ_SETTINGS` environment variable or default to a basic `RRQSettings` object.
   - `--queue` (optional, multiple): Specify queue(s) to poll. Defaults to the `default_queue_name` in settings.
-  - `--burst` (flag): Run the worker in burst mode to process one job or batch and then exit.
+  - `--burst` (flag): Run the worker in burst mode to process one job or batch and then exit. Cannot be used with `--num-workers > 1`.
+  - `--num-workers` (optional, integer): Number of parallel worker processes to start. Defaults to the number of CPU cores available on the machine. Cannot be used with `--burst` mode.
 - **`rrq worker watch`** - Run an RRQ worker with auto-restart on file changes.
   - `--path` (optional): Directory path to watch for changes. Defaults to the current directory.
   - `--settings` (optional): Same as above.
```
rrq-0.5.0.dist-info/RECORD ADDED

```diff
@@ -0,0 +1,16 @@
+rrq/__init__.py,sha256=3WYv9UkvnCbjKXrvmqiLm7yuVVQiLclbVCOXq5wb6ZM,290
+rrq/cli.py,sha256=7wLO0gRl8Qe1Tf6dyELJnVfJc6rr5pw6m6Mj7qMl3bk,27550
+rrq/client.py,sha256=5_bmZ05LKIfY9WFSKU-nYawEupsnrnHT2HewXfC2Ahg,7831
+rrq/constants.py,sha256=F_uZgBI3h00MctnEjBjiCGMrg5jUaz5Bz9I1vkyqNrs,1654
+rrq/cron.py,sha256=etDwnOXr5Ys1Vt08oYQsMjtLbPsjMWMvbund4bWOlCA,5237
+rrq/exc.py,sha256=NJq3C7pUfcd47AB8kghIN8vdY0l90UrsHQEg4McBHP8,1281
+rrq/job.py,sha256=eUbl33QDqDMXPKpo-0dl0Mp29LWWmtbBgRw0sclcwJ4,4011
+rrq/registry.py,sha256=E9W_zx3QiKTBwMOGearaNpDKBDB87JIn0RlMQ3sAcP0,2925
+rrq/settings.py,sha256=AxzSe_rw7-yduKST2c9mPunQWqPE2537XcC_XlMoHWM,4535
+rrq/store.py,sha256=TrtVojnT7wJNV1jaXsjHXQa3IDeQJ4-0PKDCEjZuDi0,29537
+rrq/worker.py,sha256=1bbZkUCSHwFzpsxcsc84RU_7h8dCnFItJCZ4SG4zASc,44940
+rrq-0.5.0.dist-info/METADATA,sha256=vud54ZneWCUMJ0pjg_FmUHaBo1oxqOBbw2yC63gMKy0,13140
+rrq-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+rrq-0.5.0.dist-info/entry_points.txt,sha256=f8eFjk2ygDSyu9USwXGj5IM8xeyQqZgDa1rSrCj4Mis,36
+rrq-0.5.0.dist-info/licenses/LICENSE,sha256=XDvu5hKdS2-_ByiSj3tiu_3zSsrXXoJsgbILGoMpKCw,554
+rrq-0.5.0.dist-info/RECORD,,
```
rrq-0.4.0.dist-info/RECORD DELETED

```diff
@@ -1,16 +0,0 @@
-rrq/__init__.py,sha256=3WYv9UkvnCbjKXrvmqiLm7yuVVQiLclbVCOXq5wb6ZM,290
-rrq/cli.py,sha256=_LbaAH_w2a0VNRR0EctuE4afl-wccvMY2w2VbehFDEQ,16980
-rrq/client.py,sha256=5_bmZ05LKIfY9WFSKU-nYawEupsnrnHT2HewXfC2Ahg,7831
-rrq/constants.py,sha256=F_uZgBI3h00MctnEjBjiCGMrg5jUaz5Bz9I1vkyqNrs,1654
-rrq/cron.py,sha256=9lxJ1OnrTbavJvbIdPp6u5ncYgyD35vRPsSulpVrQko,5244
-rrq/exc.py,sha256=NJq3C7pUfcd47AB8kghIN8vdY0l90UrsHQEg4McBHP8,1281
-rrq/job.py,sha256=eUbl33QDqDMXPKpo-0dl0Mp29LWWmtbBgRw0sclcwJ4,4011
-rrq/registry.py,sha256=E9W_zx3QiKTBwMOGearaNpDKBDB87JIn0RlMQ3sAcP0,2925
-rrq/settings.py,sha256=AxzSe_rw7-yduKST2c9mPunQWqPE2537XcC_XlMoHWM,4535
-rrq/store.py,sha256=teO0Af8hzBiu7-dFn6_2lz5X90LAZXmtg0VDZuQoAwk,24972
-rrq/worker.py,sha256=KspmZOL6i_dfIypcBi0UpQDpz2NrCj3vEl6CwTNlLKo,42479
-rrq-0.4.0.dist-info/METADATA,sha256=2SFZJlfgwFSpmWfylQ6rSV072HGXlA2MBcECJppV_DY,12914
-rrq-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-rrq-0.4.0.dist-info/entry_points.txt,sha256=f8eFjk2ygDSyu9USwXGj5IM8xeyQqZgDa1rSrCj4Mis,36
-rrq-0.4.0.dist-info/licenses/LICENSE,sha256=XDvu5hKdS2-_ByiSj3tiu_3zSsrXXoJsgbILGoMpKCw,554
-rrq-0.4.0.dist-info/RECORD,,
```
{rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/WHEEL
File without changes

{rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/entry_points.txt
File without changes

{rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/licenses/LICENSE
File without changes