rrq 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- rrq/cli.py +303 -29
- rrq/cron.py +9 -8
- rrq/store.py +122 -2
- rrq/worker.py +193 -133
- {rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/METADATA +3 -2
- rrq-0.5.0.dist-info/RECORD +16 -0
- rrq-0.4.0.dist-info/RECORD +0 -16
- {rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/WHEEL +0 -0
- {rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/entry_points.txt +0 -0
- {rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/licenses/LICENSE +0 -0
rrq/cli.py CHANGED

```diff
@@ -7,6 +7,9 @@ import os
 import signal
 import subprocess
 import sys
+import time
+
+# import multiprocessing # No longer needed directly, os.cpu_count() is sufficient
 from contextlib import suppress
 
 import click
@@ -30,6 +33,29 @@ logger = logging.getLogger(__name__)
 
 
 # Helper to load settings for commands
+def _resolve_settings_source(
+    settings_object_path: str | None = None,
+) -> tuple[str | None, str]:
+    """Resolve the settings path and its source.
+
+    Returns:
+        A tuple of (settings_path, source_description)
+    """
+    if settings_object_path is not None:
+        return settings_object_path, "--settings parameter"
+
+    env_setting = os.getenv("RRQ_SETTINGS")
+    if env_setting is not None:
+        # Check if a .env file exists to give more specific info
+        if DOTENV_AVAILABLE and find_dotenv(usecwd=True):
+            # We can't definitively know if it came from .env or system env,
+            # but we can indicate both are possible
+            return env_setting, "RRQ_SETTINGS env var (system or .env)"
+        return env_setting, "RRQ_SETTINGS env var"
+
+    return None, "built-in defaults"
+
+
 def _load_app_settings(settings_object_path: str | None = None) -> RRQSettings:
     """Load the settings object from the given path.
     If not provided, the RRQ_SETTINGS environment variable will be used.
@@ -142,7 +168,12 @@ async def check_health_async_impl(settings_object_path: str | None = None) -> bo
         )
         return True
     except redis.exceptions.ConnectionError as e:
-
+        click.echo(
+            click.style(
+                f"ERROR: Redis connection failed during health check: {e}", fg="red"
+            ),
+            err=True,
+        )
         click.echo(
             click.style(
                 f"Worker Health Check: FAIL - Redis connection error: {e}", fg="red"
@@ -150,8 +181,12 @@ async def check_health_async_impl(settings_object_path: str | None = None) -> bo
         )
         return False
     except Exception as e:
-
-
+        click.echo(
+            click.style(
+                f"ERROR: An unexpected error occurred during health check: {e}",
+                fg="red",
+            ),
+            err=True,
         )
         click.echo(
             click.style(f"Worker Health Check: FAIL - Unexpected error: {e}", fg="red")
@@ -169,7 +204,7 @@ def start_rrq_worker_subprocess(
     queues: list[str] | None = None,
 ) -> subprocess.Popen | None:
     """Start an RRQ worker process, optionally for specific queues."""
-    command = ["rrq", "worker", "run"]
+    command = ["rrq", "worker", "run", "--num-workers", "1"]
 
     if settings_object_path:
         command.extend(["--settings", settings_object_path])
@@ -219,15 +254,25 @@ def terminate_worker_process(
             f"Terminating worker process group for PID {process.pid} (PGID {pgid})..."
         )
         os.killpg(pgid, signal.SIGTERM)
-        process.wait(timeout=
+        process.wait(timeout=10)
     except subprocess.TimeoutExpired:
-
-
+        click.echo(
+            click.style(
+                f"WARNING: Worker process {process.pid} did not terminate gracefully (SIGTERM timeout), sending SIGKILL.",
+                fg="yellow",
+            ),
+            err=True,
         )
         with suppress(ProcessLookupError):
            os.killpg(os.getpgid(process.pid), signal.SIGKILL)
     except Exception as e:
-
+        click.echo(
+            click.style(
+                f"ERROR: Unexpected error checking worker process {process.pid}: {e}",
+                fg="red",
+            ),
+            err=True,
+        )
 
 
 async def watch_rrq_worker_impl(
@@ -236,9 +281,19 @@ async def watch_rrq_worker_impl(
     queues: list[str] | None = None,
 ) -> None:
     abs_watch_path = os.path.abspath(watch_path)
-    click.echo(
-
-
+    click.echo(f"Watching for file changes in {abs_watch_path}...")
+
+    # Load settings and display source
+    click.echo("Loading RRQ Settings... ", nl=False)
+
+    if settings_object_path:
+        click.echo(f"from --settings parameter ({settings_object_path}).")
+    elif os.getenv("RRQ_SETTINGS"):
+        click.echo(f"from RRQ_SETTINGS env var ({os.getenv('RRQ_SETTINGS')}).")
+    elif DOTENV_AVAILABLE and find_dotenv(usecwd=True):
+        click.echo("found in .env file.")
+    else:
+        click.echo("using defaults.")
     worker_process: subprocess.Popen | None = None
     loop = asyncio.get_event_loop()
     shutdown_event = asyncio.Event()
@@ -278,7 +333,9 @@ async def watch_rrq_worker_impl(
                 queues=queues,
             )
     except Exception as e:
-
+        click.echo(
+            click.style(f"ERROR: Error in watch_rrq_worker: {e}", fg="red"), err=True
+        )
     finally:
         logger.info("Exiting watch mode. Ensuring worker process is terminated.")
         if not shutdown_event.is_set():
@@ -338,28 +395,85 @@ def worker_cli():
         "The specified settings object must include a `job_registry: JobRegistry`."
     ),
 )
+@click.option(
+    "--num-workers",
+    type=int,
+    default=None,
+    help="Number of parallel worker processes to start. Defaults to the number of CPU cores.",
+)
 def worker_run_command(
     burst: bool,
     queues: tuple[str, ...],
     settings_object_path: str,
+    num_workers: int | None,
 ):
-    """Run
+    """Run RRQ worker processes.
     Requires an application-specific settings object.
     """
-
+    if num_workers is None:
+        num_workers = (
+            os.cpu_count() or 1
+        )  # Default to CPU cores, or 1 if cpu_count() is None
+        click.echo(
+            f"No --num-workers specified, defaulting to {num_workers} (CPU cores)."
+        )
+    elif num_workers <= 0:
+        click.echo(
+            click.style("ERROR: --num-workers must be a positive integer.", fg="red"),
+            err=True,
+        )
+        sys.exit(1)
 
-    #
-
-
+    # Restrict burst mode with multiple workers
+    if num_workers > 1 and burst:
+        click.echo(
+            click.style(
+                "ERROR: --burst mode is not supported with multiple workers (--num-workers > 1). "
+                "Burst mode cannot coordinate across multiple processes.",
+                fg="red",
+            ),
+            err=True,
+        )
+        sys.exit(1)
 
-
-
-
+    # Display settings source
+    click.echo("Loading RRQ Settings... ", nl=False)
+    if settings_object_path:
+        click.echo(f"from --settings parameter ({settings_object_path}).")
+    elif os.getenv("RRQ_SETTINGS"):
+        click.echo(f"from RRQ_SETTINGS env var ({os.getenv('RRQ_SETTINGS')}).")
+    elif DOTENV_AVAILABLE and find_dotenv(usecwd=True):
+        click.echo("found in .env file.")
+    else:
+        click.echo("using defaults.")
+
+    if num_workers == 1:
+        # Run a single worker in the current process
+        click.echo(f"Starting 1 RRQ worker process (Burst: {burst})")
+        _run_single_worker(
+            burst, list(queues) if queues else None, settings_object_path
+        )
+    else:
+        # Run multiple worker subprocesses
+        click.echo(f"Starting {num_workers} RRQ worker processes")
+        # Burst is guaranteed to be False here
+        _run_multiple_workers(
+            num_workers, list(queues) if queues else None, settings_object_path
+        )
+
+
+def _run_single_worker(
+    burst: bool,
+    queues_arg: list[str] | None,
+    settings_object_path: str | None,
+):
+    """Helper function to run a single RRQ worker instance."""
+    rrq_settings = _load_app_settings(settings_object_path)
 
     if not rrq_settings.job_registry:
         click.echo(
             click.style(
-                "ERROR: No '
+                "ERROR: No 'job_registry'. You must provide a JobRegistry instance in settings.",
                 fg="red",
             ),
             err=True,
@@ -378,22 +492,182 @@ def worker_run_command(
         burst=burst,
     )
 
-    loop = asyncio.get_event_loop()
     try:
-        logger.info("Starting worker run loop...")
-
+        logger.info("Starting worker run loop for single worker...")
+        asyncio.run(worker_instance.run())
     except KeyboardInterrupt:
         logger.info("RRQ Worker run interrupted by user (KeyboardInterrupt).")
     except Exception as e:
-
+        click.echo(
+            click.style(f"ERROR: Exception during RRQ Worker run: {e}", fg="red"),
+            err=True,
+        )
+        # Consider re-raising or sys.exit(1) if the exception means failure
     finally:
-
-
-        loop.run_until_complete(loop.shutdown_asyncgens())
-        loop.close()
+        # asyncio.run handles loop cleanup.
+        logger.info("RRQ Worker run finished or exited.")
     logger.info("RRQ Worker has shut down.")
 
 
+def _run_multiple_workers(
+    num_workers: int,
+    queues: list[str] | None,
+    settings_object_path: str | None,
+):
+    """Manages multiple worker subprocesses."""
+    processes: list[subprocess.Popen] = []
+    # loop = asyncio.get_event_loop() # Not needed here, this function is synchronous
+
+    original_sigint_handler = signal.getsignal(signal.SIGINT)
+    original_sigterm_handler = signal.getsignal(signal.SIGTERM)
+
+    def sig_handler(signum, frame):
+        click.echo(
+            f"\nSignal {signal.Signals(signum).name} received. Terminating child workers..."
+        )
+        # Send SIGTERM to all processes
+        for i, p in enumerate(processes):
+            if p.poll() is None:  # Process is still running
+                try:
+                    pgid = os.getpgid(p.pid)
+                    click.echo(f"Sending SIGTERM to worker {i + 1} (PID {p.pid})...")
+                    os.killpg(pgid, signal.SIGTERM)
+                except (ProcessLookupError, OSError):
+                    pass  # Process already dead
+        # Restore original handlers before exiting or re-raising
+        signal.signal(signal.SIGINT, original_sigint_handler)
+        signal.signal(signal.SIGTERM, original_sigterm_handler)
+        # Propagate signal to ensure parent exits if it was, e.g., a Ctrl+C
+        # This is a bit tricky; for now, just exit.
+        # A more robust way might involve re-raising the signal if not handled by click/asyncio.
+        sys.exit(0)
+
+    signal.signal(signal.SIGINT, sig_handler)
+    signal.signal(signal.SIGTERM, sig_handler)
+
+    try:
+        for i in range(num_workers):
+            # Construct the command for the subprocess.
+            # Each subprocess runs 'rrq worker run' for a single worker.
+            # We pass along relevant flags like --settings, --queue, and --burst.
+            # Crucially, we do *not* pass --num-workers to the child,
+            # or rather, we could conceptually pass --num-workers 1.
+            # Use the rrq executable from the same venv
+            venv_bin_dir = os.path.dirname(sys.executable)
+            rrq_executable = os.path.join(venv_bin_dir, "rrq")
+            cmd = [rrq_executable, "worker", "run", "--num-workers=1"]
+            if settings_object_path:
+                cmd.extend(["--settings", settings_object_path])
+            elif os.getenv("RRQ_SETTINGS"):
+                # Pass the RRQ_SETTINGS env var as explicit parameter to subprocess
+                cmd.extend(["--settings", os.getenv("RRQ_SETTINGS")])
+            else:
+                # Default to app.config.rrq.rrq_settings for ResQ
+                cmd.extend(["--settings", "app.config.rrq.rrq_settings"])
+            if queues:
+                for q_name in queues:
+                    cmd.extend(["--queue", q_name])
+            click.echo(f"Starting worker subprocess {i + 1}/{num_workers}...")
+
+            # Set up environment - add current directory to PYTHONPATH
+            env = os.environ.copy()
+            current_pythonpath = env.get("PYTHONPATH", "")
+            current_dir = os.getcwd()
+            if current_pythonpath:
+                env["PYTHONPATH"] = f"{current_dir}:{current_pythonpath}"
+            else:
+                env["PYTHONPATH"] = current_dir
+
+            # Configure output redirection
+            is_testing = "PYTEST_CURRENT_TEST" in os.environ
+            stdout_dest = None if not is_testing else subprocess.DEVNULL
+            stderr_dest = None if not is_testing else subprocess.DEVNULL
+
+            process = subprocess.Popen(
+                cmd,
+                start_new_session=True,
+                stdout=stdout_dest,
+                stderr=stderr_dest,
+                cwd=os.getcwd(),
+                env=env,
+            )
+            processes.append(process)
+            click.echo(f"Worker subprocess {i + 1} started with PID {process.pid}")
+
+        # Wait for all processes to complete
+        click.echo(f"All {num_workers} workers started. Press Ctrl+C to stop.")
+        exit_codes = []
+
+        try:
+            for p in processes:
+                exit_code = p.wait()
+                exit_codes.append(exit_code)
+        except KeyboardInterrupt:
+            # Signal handler has already sent SIGTERM, now wait with timeout
+            max_wait = 10
+            check_interval = 0.1
+            elapsed = 0
+
+            while elapsed < max_wait:
+                time.sleep(check_interval)
+                elapsed += check_interval
+
+                # Check if all processes have terminated
+                all_terminated = all(p.poll() is not None for p in processes)
+                if all_terminated:
+                    click.echo("All workers terminated gracefully.")
+                    break
+            else:
+                # Timeout reached, force kill remaining processes
+                for i, p in enumerate(processes):
+                    if p.poll() is None:
+                        try:
+                            click.echo(
+                                click.style(
+                                    f"WARNING: Worker {i + 1} did not terminate gracefully, sending SIGKILL.",
+                                    fg="yellow",
+                                ),
+                                err=True,
+                            )
+                            os.killpg(os.getpgid(p.pid), signal.SIGKILL)
+                        except (ProcessLookupError, OSError):
+                            pass
+
+            # Collect exit codes
+            for p in processes:
+                exit_codes.append(p.wait())
+
+        # Report results
+        for i, exit_code in enumerate(exit_codes):
+            click.echo(f"Worker subprocess {i + 1} exited with code {exit_code}")
+            if exit_code != 0:
+                click.echo(
+                    click.style(
+                        f"Worker subprocess {i + 1} failed with exit code {exit_code}",
+                        fg="red",
+                    ),
+                    err=True,
+                )
+
+    except Exception as e:
+        click.echo(
+            click.style(f"ERROR: Error managing worker subprocesses: {e}", fg="red"),
+            err=True,
+        )
+        # Terminate any running processes if an error occurs in the manager
+        for p in processes:
+            if p.poll() is None:  # If process is still running
+                terminate_worker_process(p, logger)
+    finally:
+        logger.info("All worker subprocesses terminated or completed.")
+        # Restore original signal handlers
+        signal.signal(signal.SIGINT, original_sigint_handler)
+        signal.signal(signal.SIGTERM, original_sigterm_handler)
+        # Any other cleanup for the parent process
+        # No loop to check or close here as this part is synchronous
+        logger.info("Parent process for multi-worker management is exiting.")
+
+
 @worker_cli.command("watch")
 @click.option(
     "--path",
```
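One detail worth calling out in `_run_multiple_workers`: each child is spawned with `start_new_session=True` and terminated via `os.killpg`, so a single signal reaches the worker and everything it spawned. A minimal standalone sketch of that POSIX pattern (the sleeping child is just a stand-in for a worker, not rrq code):

```python
import os
import signal
import subprocess
import sys

# A child started with start_new_session=True becomes the leader of a new
# process group, so os.killpg() can signal the worker and all its descendants.
proc = subprocess.Popen(
    [sys.executable, "-c", "import time; time.sleep(60)"],  # stand-in for a worker
    start_new_session=True,
)
pgid = os.getpgid(proc.pid)  # for a new session leader, pgid equals the child's pid
os.killpg(pgid, signal.SIGTERM)  # delivered to every process in the group
proc.wait(timeout=10)
print("child exited with", proc.returncode)
```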
rrq/cron.py CHANGED

```diff
@@ -42,22 +42,24 @@ def _parse_value(value: str, names: dict[str, int], min_val: int, max_val: int)
     return num
 
 
-def _parse_field(
+def _parse_field(
+    field: str, *, names: dict[str, int] | None, min_val: int, max_val: int
+) -> Sequence[int]:
     names = names or {}
     if field == "*":
         return list(range(min_val, max_val + 1))
     values: set[int] = set()
-    for part in field.split(
+    for part in field.split(","):
         step = 1
-        if
-            base, step_str = part.split(
+        if "/" in part:
+            base, step_str = part.split("/", 1)
             step = int(step_str)
         else:
             base = part
         if base == "*":
             start, end = min_val, max_val
-        elif
-            a, b = base.split(
+        elif "-" in base:
+            a, b = base.split("-", 1)
             start = _parse_value(a, names, min_val, max_val)
             end = _parse_value(b, names, min_val, max_val)
         else:
@@ -102,7 +104,7 @@ class CronSchedule:
         python_weekday = dt.weekday()
         cron_weekday = (python_weekday + 1) % 7
         dow_match = cron_weekday in self.dow
-
+
         if self.dom_all and self.dow_all:
             condition = True
         elif self.dom_all:
@@ -119,7 +121,6 @@ class CronSchedule:
             dt += timedelta(minutes=1)
 
 
-
 class CronJob(BaseModel):
     """Simple cron job specification based on a cron schedule."""
 
```
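For reference, the grammar `_parse_field` accepts (`*`, comma-separated parts, `a-b` ranges, and `/step` suffixes) can be exercised with this simplified standalone re-implementation; it drops the named-value lookup that `_parse_value` performs (names like `mon` or `jan`) to stay self-contained:

```python
# Illustrative re-implementation of the cron field grammar from rrq/cron.py.
def parse_field(field: str, min_val: int, max_val: int) -> list[int]:
    if field == "*":
        return list(range(min_val, max_val + 1))
    values: set[int] = set()
    for part in field.split(","):          # comma-separated sub-expressions
        step = 1
        if "/" in part:                    # "base/step" syntax
            base, step_str = part.split("/", 1)
            step = int(step_str)
        else:
            base = part
        if base == "*":
            start, end = min_val, max_val
        elif "-" in base:                  # "a-b" range
            a, b = base.split("-", 1)
            start, end = int(a), int(b)
        else:                              # single value
            start = end = int(base)
        values.update(range(start, end + 1, step))
    return sorted(values)

print(parse_field("*/15", 0, 59))    # [0, 15, 30, 45]
print(parse_field("1-5,30", 0, 59))  # [1, 2, 3, 4, 5, 30]
```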
rrq/store.py CHANGED

```diff
@@ -21,6 +21,7 @@ from .settings import RRQSettings
 
 logger = logging.getLogger(__name__)
 
+
 class JobStore:
     """Provides an abstraction layer for interacting with Redis for RRQ operations.
 
@@ -38,7 +39,33 @@ class JobStore:
         self.redis = AsyncRedis.from_url(
             settings.redis_dsn, decode_responses=False
         )  # Work with bytes initially
-
+
+        # LUA scripts for atomic operations
+        self._atomic_lock_and_remove_script = """
+            -- KEYS: [1] = lock_key, [2] = queue_key
+            -- ARGV: [1] = worker_id, [2] = lock_timeout_ms, [3] = job_id
+            local lock_result = redis.call('SET', KEYS[1], ARGV[1], 'NX', 'PX', ARGV[2])
+            if lock_result then
+                local removed_count = redis.call('ZREM', KEYS[2], ARGV[3])
+                if removed_count == 0 then
+                    redis.call('DEL', KEYS[1]) -- Release lock if job wasn't in queue
+                    return {0, 0} -- {lock_acquired, removed_count}
+                end
+                return {1, removed_count}
+            else
+                return {0, 0}
+            end
+        """
+
+        self._atomic_retry_script = """
+            -- KEYS: [1] = job_key, [2] = queue_key
+            -- ARGV: [1] = job_id, [2] = retry_at_score, [3] = error_message, [4] = status
+            local new_retry_count = redis.call('HINCRBY', KEYS[1], 'current_retries', 1)
+            redis.call('HMSET', KEYS[1], 'status', ARGV[4], 'last_error', ARGV[3])
+            redis.call('ZADD', KEYS[2], ARGV[2], ARGV[1])
+            return new_retry_count
+        """
+
     def _format_queue_key(self, queue_name: str) -> str:
         """Normalize a queue name or key into a Redis key for ZSET queues."""
 
@@ -308,6 +335,99 @@ class JobStore:
             logger.debug(f"Released lock for job {job_id} ({lock_key}).")
         # No need to log if lock didn't exist
 
+    async def atomic_lock_and_remove_job(
+        self, job_id: str, queue_name: str, worker_id: str, lock_timeout_ms: int
+    ) -> tuple[bool, int]:
+        """Atomically acquires a job lock and removes the job from the queue.
+
+        This is a critical operation that prevents race conditions between multiple
+        workers trying to process the same job.
+
+        Args:
+            job_id: The ID of the job to lock and remove.
+            queue_name: The name of the queue to remove the job from.
+            worker_id: The ID of the worker attempting to acquire the lock.
+            lock_timeout_ms: The lock timeout/TTL in milliseconds.
+
+        Returns:
+            A tuple of (lock_acquired: bool, removed_count: int).
+            - lock_acquired: True if the lock was successfully acquired
+            - removed_count: Number of jobs removed from the queue (0 or 1)
+        """
+        lock_key = f"{LOCK_KEY_PREFIX}{job_id}"
+        queue_key = self._format_queue_key(queue_name)
+
+        result = await self.redis.eval(
+            self._atomic_lock_and_remove_script,
+            2,  # Number of keys
+            lock_key,
+            queue_key,
+            worker_id.encode("utf-8"),
+            str(lock_timeout_ms),
+            job_id.encode("utf-8"),
+        )
+
+        lock_acquired = bool(result[0])
+        removed_count = int(result[1])
+
+        if lock_acquired and removed_count > 0:
+            logger.debug(
+                f"Worker {worker_id} atomically acquired lock and removed job {job_id} from queue '{queue_name}'."
+            )
+        elif not lock_acquired:
+            logger.debug(
+                f"Worker {worker_id} failed to acquire lock for job {job_id} (already locked by another worker)."
+            )
+        else:
+            logger.warning(
+                f"Worker {worker_id} acquired lock for job {job_id} but job was already removed from queue '{queue_name}'."
+            )
+
+        return lock_acquired, removed_count
+
+    async def atomic_retry_job(
+        self,
+        job_id: str,
+        queue_name: str,
+        retry_at_score: float,
+        error_message: str,
+        status: JobStatus,
+    ) -> int:
+        """Atomically increments job retry count, updates status/error, and re-queues the job.
+
+        This prevents race conditions in the retry logic where multiple operations
+        need to be performed atomically.
+
+        Args:
+            job_id: The ID of the job to retry.
+            queue_name: The name of the queue to add the job back to.
+            retry_at_score: The score (timestamp) when the job should be retried.
+            error_message: The error message to store.
+            status: The job status to set (usually RETRYING).
+
+        Returns:
+            The new retry count after incrementing.
+        """
+        job_key = f"{JOB_KEY_PREFIX}{job_id}"
+        queue_key = self._format_queue_key(queue_name)
+
+        new_retry_count = await self.redis.eval(
+            self._atomic_retry_script,
+            2,  # Number of keys
+            job_key,
+            queue_key,
+            job_id.encode("utf-8"),
+            str(retry_at_score),
+            error_message.encode("utf-8"),
+            status.value.encode("utf-8"),
+        )
+
+        new_count = int(new_retry_count)
+        logger.debug(
+            f"Atomically incremented retries for job {job_id} to {new_count} and re-queued for retry."
+        )
+        return new_count
+
     async def update_job_status(self, job_id: str, status: JobStatus) -> None:
         """Updates only the status field of a job in its Redis hash.
 
@@ -368,7 +488,7 @@ class JobStore:
         pipe.expire(job_key, DEFAULT_DLQ_RESULT_TTL_SECONDS)
         results = await pipe.execute()
         logger.info(f"Moved job {job_id} to DLQ '{dlq_redis_key}'. Results: {results}")
-
+
     async def requeue_dlq(
         self,
         dlq_name: str,
```
rrq/worker.py CHANGED

```diff
@@ -7,6 +7,7 @@ import asyncio
 # Use standard logging instead of custom one if appropriate
 import logging
 import os
+import random
 import signal
 import time
 import uuid
@@ -21,7 +22,6 @@ from rrq.client import RRQClient
 
 from .constants import (
     DEFAULT_WORKER_ID_PREFIX,
-    JOB_KEY_PREFIX,
 )
 from .exc import RetryJob
 from .job import Job, JobStatus
@@ -91,6 +91,30 @@ class RRQWorker:
             f"Initializing RRQWorker {self.worker_id} for queues: {self.queues}"
         )
 
+    def _calculate_jittered_delay(
+        self, base_delay: float, jitter_factor: float = 0.5
+    ) -> float:
+        """Calculate a jittered delay to prevent thundering herd effects.
+
+        Args:
+            base_delay: The base delay in seconds.
+            jitter_factor: Factor for jitter (0.0 to 1.0). Default 0.5 means ±50% jitter.
+
+        Returns:
+            The jittered delay in seconds.
+        """
+        # Clamp jitter_factor to safe range to prevent negative delays
+        jitter_factor = max(0.0, min(jitter_factor, 0.99))
+
+        # Calculate jitter range: base_delay * (1 ± jitter_factor)
+        min_delay = base_delay * (1 - jitter_factor)
+        max_delay = base_delay * (1 + jitter_factor)
+
+        # Ensure min_delay is always positive
+        min_delay = max(0.001, min_delay)
+
+        return random.uniform(min_delay, max_delay)
+
     async def _call_startup_hook(self) -> None:
         if self.settings.on_startup:
             logger.info(f"Worker {self.worker_id} calling on_startup hook...")
@@ -171,14 +195,19 @@ class RRQWorker:
                 self.status = "idle (concurrency limit)"
                 # At concurrency limit, wait for tasks to finish or poll delay
 
-
+                # Use jittered delay to prevent thundering herd effects
+                jittered_delay = self._calculate_jittered_delay(
+                    self.settings.default_poll_delay_seconds
+                )
+                await asyncio.sleep(jittered_delay)
             except Exception as e:
                 logger.error(
                     f"Worker {self.worker_id} encountered error in main run loop: {e}",
                     exc_info=True,
                 )
-                # Avoid tight loop on persistent errors
-
+                # Avoid tight loop on persistent errors with jittered delay
+                jittered_delay = self._calculate_jittered_delay(1.0)
+                await asyncio.sleep(jittered_delay)
 
         logger.info(
             f"Worker {self.worker_id} shutdown signal received. Draining tasks..."
@@ -222,53 +251,65 @@ class RRQWorker:
                 if fetched_count >= count or self._shutdown_event.is_set():
                     break
 
-                # Attempt to acquire semaphore *before* trying to process
-                await self._semaphore.acquire()
                 try:
-                    #
-
-                    if
-
-
-
-
+                    # Try to acquire lock and remove from queue first (without semaphore)
+                    job_acquired = await self._try_acquire_job(job_id, queue_name)
+                    if job_acquired:
+                        # Only acquire semaphore after successfully getting the job
+                        await self._semaphore.acquire()
+                        try:
+                            # Process the job (we already have the lock and removed from queue)
+                            # The semaphore will be released when the job task completes
+                            await self._process_acquired_job(
+                                job_acquired, queue_name
+                            )
+                            fetched_count += 1
+                        except Exception as e_process:
+                            logger.error(
+                                f"Worker {self.worker_id} exception processing acquired job {job_id}: {e_process}",
+                                exc_info=True,
+                            )
+                            # Release lock and semaphore since processing failed
+                            await self.job_store.release_job_lock(job_id)
+                            self._semaphore.release()
+                    # If job_acquired is None, another worker got it - continue to next job
                 except Exception as e_try:
-                    # Catch errors during the
+                    # Catch errors during the job acquisition itself
                     logger.error(
-                        f"Worker {self.worker_id} exception trying to
+                        f"Worker {self.worker_id} exception trying to acquire job {job_id}: {e_try}",
                         exc_info=True,
                     )
-                    self._semaphore.release()  # Ensure semaphore is released on error
 
         except Exception as e_poll:
             logger.error(
                 f"Worker {self.worker_id} error polling queue '{queue_name}': {e_poll}",
                 exc_info=True,
             )
-
+            # Avoid tight loop on polling error with jittered delay
+            jittered_delay = self._calculate_jittered_delay(1.0)
+            await asyncio.sleep(jittered_delay)
         # For burst mode, return number of jobs fetched in this poll
         return fetched_count
 
-    async def
-    """Attempts to lock
+    async def _try_acquire_job(self, job_id: str, queue_name: str) -> Optional[Job]:
+        """Attempts to atomically lock and remove a job from the queue.
 
         Args:
-            job_id: The ID of the job to attempt
+            job_id: The ID of the job to attempt acquiring.
             queue_name: The name of the queue the job ID was retrieved from.
 
         Returns:
-
-            (e.g., lock conflict, job definition not found, already removed).
+            The Job object if successfully acquired, None otherwise.
         """
         logger.debug(
-            f"Worker {self.worker_id} attempting to
+            f"Worker {self.worker_id} attempting to acquire job {job_id} from queue '{queue_name}'"
         )
         job = await self.job_store.get_job_definition(job_id)
         if not job:
             logger.warning(
-                f"Worker {self.worker_id} job definition {job_id} not found during
+                f"Worker {self.worker_id} job definition {job_id} not found during _try_acquire_job from queue {queue_name}."
             )
-            return
+            return None  # Job vanished between poll and fetch?
 
         # Determine job-specific timeout and calculate lock timeout
         job_timeout = (
@@ -280,32 +321,28 @@ class RRQWorker:
             job_timeout + self.settings.default_lock_timeout_extension_seconds
         ) * 1000
 
-        #
-        lock_acquired = await self.job_store.
-            job.id, self.worker_id, int(lock_timeout_ms)
+        # Atomically acquire the processing lock and remove from queue
+        lock_acquired, removed_count = await self.job_store.atomic_lock_and_remove_job(
+            job.id, queue_name, self.worker_id, int(lock_timeout_ms)
         )
-        if not lock_acquired:
-            logger.debug(
-                f"Worker {self.worker_id} failed to acquire lock for job {job.id} (already locked by another worker)."
-            )
-            return False  # Another worker got there first
 
-
+        if not lock_acquired or removed_count == 0:
+            return None  # Another worker got there first
 
-        #
-
-
-
-
-        )
-
-
-
-        )
-        await self.job_store.release_job_lock(job.id)  # Release the acquired lock
-        return False  # Job processed by another worker between our poll and lock
+        # Successfully acquired the job
+        logger.debug(f"Worker {self.worker_id} successfully acquired job {job.id}")
+        return job
+
+    async def _process_acquired_job(self, job: Job, queue_name: str) -> None:
+        """Processes a job that has already been acquired (locked and removed from queue).
+
+        Note: This method assumes the worker has already acquired the concurrency semaphore.
+        The semaphore will be released when the job task completes via _task_cleanup.
 
-
+        Args:
+            job: The Job object that was successfully acquired.
+            queue_name: The name of the queue the job was retrieved from.
+        """
         try:
             await self.job_store.update_job_status(job.id, JobStatus.ACTIVE)
             logger.debug(
@@ -313,21 +350,58 @@ class RRQWorker:
             )
 
             # Create and track the execution task
+            # The semaphore will be released when this task completes
             task = self._loop.create_task(self._execute_job(job, queue_name))
             self._running_tasks.add(task)
             task.add_done_callback(lambda t: self._task_cleanup(t, self._semaphore))
             logger.info(
                 f"Worker {self.worker_id} started job {job.id} ('{job.function_name}') from queue '{queue_name}'"
             )
-            return True
         except Exception as e_start:
             # Catch errors during status update or task creation
             logger.error(
-                f"Worker {self.worker_id} failed to start task for job {job.id} after
+                f"Worker {self.worker_id} failed to start task for job {job.id} after acquisition: {e_start}",
                 exc_info=True,
             )
-            #
+            # Release the lock since task wasn't started
             await self.job_store.release_job_lock(job.id)
+            raise  # Re-raise to be handled by caller
+
+    async def _try_process_job(self, job_id: str, queue_name: str) -> bool:
+        """Attempts to lock, fetch definition, and start the execution task for a specific job.
+
+        This method is kept for backward compatibility and uses the optimized approach internally.
+        For new code, prefer using _try_acquire_job and _process_acquired_job separately.
+
+        Note: This method handles semaphore acquisition internally for backward compatibility.
+
+        Args:
+            job_id: The ID of the job to attempt processing.
+            queue_name: The name of the queue the job ID was retrieved from.
+
+        Returns:
+            True if the job processing task was successfully started, False otherwise
+            (e.g., lock conflict, job definition not found, already removed).
+        """
+        # Use the optimized approach: acquire job first, then process
+        job_acquired = await self._try_acquire_job(job_id, queue_name)
+        if not job_acquired:
+            return False
+
+        # For backward compatibility, acquire semaphore here since old callers expect it
+        await self._semaphore.acquire()
+        try:
+            # Process the acquired job
+            await self._process_acquired_job(job_acquired, queue_name)
+            return True
+        except Exception as e_process:
+            logger.error(
+                f"Worker {self.worker_id} failed to process acquired job {job_id}: {e_process}",
+                exc_info=True,
+            )
+            # Release semaphore on error since _process_acquired_job doesn't handle it
+            self._semaphore.release()
+            # Lock is already released in _process_acquired_job on error
             return False
 
     async def _execute_job(self, job: Job, queue_name: str) -> None:
@@ -475,63 +549,54 @@ class RRQWorker:
         appropriate delay (custom or exponential backoff) or moves to DLQ.
         """
         log_prefix = f"Worker {self.worker_id} job {job.id} (queue '{queue_name}')"
-
-        try:
-            # Atomically increment retries in the store.
-            new_retry_count = await self.job_store.increment_job_retries(job.id)
-            max_retries = (
-                job.max_retries
-            )  # Use max_retries from the job object passed in
-
-            if new_retry_count < max_retries:
-                # Update status and error atomically
-                await self.job_store.redis.hset(
-                    job_key,
-                    mapping={
-                        "status": JobStatus.RETRYING.value,
-                        "last_error": str(exc),
-                    },
-                )
-                logger.debug(f"{log_prefix} status set to RETRYING, error saved.")
-
-                # Determine deferral time
-                defer_seconds = exc.defer_seconds
-                if defer_seconds is None:
-                    # Create a temporary job representation for backoff calculation
-                    # using the *new* retry count.
-                    temp_job_for_backoff = Job(
-                        id=job.id,
-                        function_name=job.function_name,
-                        current_retries=new_retry_count,  # Use updated count
-                        max_retries=max_retries,  # Ensure this is passed
-                    )
-                    defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
-                    defer_seconds = defer_ms / 1000.0
-                else:
-                    logger.debug(
-                        f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
-                    )
+        max_retries = job.max_retries
 
-
-
-
-
-
-                logger.info(
-                    f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
-                    f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
-                )
-            else:
-                # Max retries exceeded even though RetryJob was raised
+        try:
+            # Check if we would exceed max retries
+            anticipated_retry_count = job.current_retries + 1
+            if anticipated_retry_count >= max_retries:
+                # Max retries exceeded, increment retry count and move directly to DLQ
                 logger.warning(
                     f"{log_prefix} max retries ({max_retries}) exceeded "
-                    f"
+                    f"with RetryJob exception. Moving to DLQ."
                 )
-                #
+                # Increment retry count before moving to DLQ
+                await self.job_store.increment_job_retries(job.id)
                 error_msg = (
                     str(exc) or f"Max retries ({max_retries}) exceeded after RetryJob"
                 )
                 await self._move_to_dlq(job, queue_name, error_msg)
+                return
+
+            # Determine deferral time
+            defer_seconds = exc.defer_seconds
+            if defer_seconds is None:
+                # Create a temporary job representation for backoff calculation
+                temp_job_for_backoff = Job(
+                    id=job.id,
+                    function_name=job.function_name,
+                    current_retries=anticipated_retry_count,  # Use anticipated count
+                    max_retries=max_retries,
+                )
+                defer_ms = self._calculate_backoff_ms(temp_job_for_backoff)
+                defer_seconds = defer_ms / 1000.0
+            else:
+                logger.debug(
+                    f"{log_prefix} using custom deferral of {defer_seconds}s from RetryJob exception."
+                )
+
+            retry_at_score = (time.time() + defer_seconds) * 1000
+            target_queue = job.queue_name or self.settings.default_queue_name
+
+            # Atomically increment retries, update status/error, and re-queue
+            new_retry_count = await self.job_store.atomic_retry_job(
+                job.id, target_queue, retry_at_score, str(exc), JobStatus.RETRYING
+            )
+
+            logger.info(
+                f"{log_prefix} explicitly retrying in {defer_seconds:.2f}s "
+                f"(attempt {new_retry_count}/{max_retries}) due to RetryJob."
+            )
         except Exception as e_handle:
             logger.exception(
                 f"{log_prefix} CRITICAL error during RetryJob processing: {e_handle}"
@@ -549,48 +614,43 @@ class RRQWorker:
         logger.debug(f"{log_prefix} processing general failure: {type(exc).__name__}")
 
         try:
-            new_retry_count = await self.job_store.increment_job_retries(job.id)
-            # Re-fetch job state after incrementing retries might be safer if fields changed?
-            # For now, assume the job object passed in is mostly accurate except for retry count.
-            # Use max_retries from the job object passed in.
             max_retries = job.max_retries
             last_error_str = str(exc)
 
-            if
-
-
-
-                        id=job.id,
-                        function_name=job.function_name,
-                        current_retries=new_retry_count,
-                        max_retries=max_retries,
-                    )
-                )
-                retry_at_score = (time.time() * 1000) + defer_ms
-                target_queue = job.queue_name or self.settings.default_queue_name
-
-                # Atomically update status/error and re-add to queue (if possible, else separate)
-                # For now, separate HSET and ZADD
-                await self.job_store.redis.hset(
-                    f"{JOB_KEY_PREFIX}{job.id}",
-                    mapping={
-                        "status": JobStatus.RETRYING.value,
-                        "last_error": last_error_str,
-                    },
-                )
-                await self.job_store.add_job_to_queue(
-                    target_queue, job.id, retry_at_score
-                )
-                logger.info(
-                    f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
-                    f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
-                )
-            else:  # Max retries reached
+            # Check if we would exceed max retries
+            anticipated_retry_count = job.current_retries + 1
+            if anticipated_retry_count >= max_retries:
+                # Max retries exceeded, increment retry count and move directly to DLQ
                 logger.warning(
                     f"{log_prefix} failed after max retries ({max_retries}). Moving to DLQ. Error: {str(exc)[:100]}..."
                 )
+                # Increment retry count before moving to DLQ
+                await self.job_store.increment_job_retries(job.id)
                 # _move_to_dlq handles setting FAILED status, completion time, and last error.
                 await self._move_to_dlq(job, queue_name, last_error_str)
+                return
+
+            # Calculate backoff delay using anticipated retry count
+            defer_ms = self._calculate_backoff_ms(
+                Job(
+                    id=job.id,
+                    function_name=job.function_name,
+                    current_retries=anticipated_retry_count,  # Use anticipated count
+                    max_retries=max_retries,
+                )
+            )
+            retry_at_score = (time.time() * 1000) + defer_ms
+            target_queue = job.queue_name or self.settings.default_queue_name
+
+            # Atomically increment retries, update status/error, and re-queue
+            new_retry_count = await self.job_store.atomic_retry_job(
+                job.id, target_queue, retry_at_score, last_error_str, JobStatus.RETRYING
+            )
+
+            logger.info(
+                f"{log_prefix} failed, retrying in {defer_ms / 1000.0:.2f}s "
+                f"(attempt {new_retry_count}/{max_retries}). Error: {str(exc)[:100]}..."
+            )
 
         except Exception as e_handle:
             logger.exception(
```
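The jitter used throughout the new polling code is plain uniform jitter around a base delay. This standalone sketch mirrors the arithmetic of `_calculate_jittered_delay` (an illustrative re-implementation, not an import from rrq): a base of 1.0 s with the default factor 0.5 yields a delay drawn uniformly from [0.5 s, 1.5 s], so concurrent workers wake at staggered times instead of stampeding Redis together.

```python
import random

def jittered_delay(base_delay: float, jitter_factor: float = 0.5) -> float:
    # Clamp the factor so the lower bound never goes negative.
    jitter_factor = max(0.0, min(jitter_factor, 0.99))
    lo = max(0.001, base_delay * (1 - jitter_factor))  # keep delay strictly positive
    hi = base_delay * (1 + jitter_factor)
    return random.uniform(lo, hi)

print(jittered_delay(1.0))  # e.g. 1.23 (anywhere in 0.5..1.5)
```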
{rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/METADATA CHANGED

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: rrq
-Version: 0.4.0
+Version: 0.5.0
 Summary: RRQ is a Python library for creating reliable job queues using Redis and asyncio
 Project-URL: Homepage, https://github.com/getresq/rrq
 Project-URL: Bug Tracker, https://github.com/getresq/rrq/issues
@@ -265,7 +265,8 @@ RRQ provides a command-line interface (CLI) for managing workers and performing
 - **`rrq worker run`** - Run an RRQ worker process.
   - `--settings` (optional): Specify the Python path to your settings object (e.g., `myapp.worker_config.rrq_settings`). If not provided, it will use the `RRQ_SETTINGS` environment variable or default to a basic `RRQSettings` object.
   - `--queue` (optional, multiple): Specify queue(s) to poll. Defaults to the `default_queue_name` in settings.
-  - `--burst` (flag): Run the worker in burst mode to process one job or batch and then exit.
+  - `--burst` (flag): Run the worker in burst mode to process one job or batch and then exit. Cannot be used with `--num-workers > 1`.
+  - `--num-workers` (optional, integer): Number of parallel worker processes to start. Defaults to the number of CPU cores available on the machine. Cannot be used with `--burst` mode.
 - **`rrq worker watch`** - Run an RRQ worker with auto-restart on file changes.
   - `--path` (optional): Directory path to watch for changes. Defaults to the current directory.
   - `--settings` (optional): Same as above.
```
rrq-0.5.0.dist-info/RECORD ADDED

```diff
@@ -0,0 +1,16 @@
+rrq/__init__.py,sha256=3WYv9UkvnCbjKXrvmqiLm7yuVVQiLclbVCOXq5wb6ZM,290
+rrq/cli.py,sha256=7wLO0gRl8Qe1Tf6dyELJnVfJc6rr5pw6m6Mj7qMl3bk,27550
+rrq/client.py,sha256=5_bmZ05LKIfY9WFSKU-nYawEupsnrnHT2HewXfC2Ahg,7831
+rrq/constants.py,sha256=F_uZgBI3h00MctnEjBjiCGMrg5jUaz5Bz9I1vkyqNrs,1654
+rrq/cron.py,sha256=etDwnOXr5Ys1Vt08oYQsMjtLbPsjMWMvbund4bWOlCA,5237
+rrq/exc.py,sha256=NJq3C7pUfcd47AB8kghIN8vdY0l90UrsHQEg4McBHP8,1281
+rrq/job.py,sha256=eUbl33QDqDMXPKpo-0dl0Mp29LWWmtbBgRw0sclcwJ4,4011
+rrq/registry.py,sha256=E9W_zx3QiKTBwMOGearaNpDKBDB87JIn0RlMQ3sAcP0,2925
+rrq/settings.py,sha256=AxzSe_rw7-yduKST2c9mPunQWqPE2537XcC_XlMoHWM,4535
+rrq/store.py,sha256=TrtVojnT7wJNV1jaXsjHXQa3IDeQJ4-0PKDCEjZuDi0,29537
+rrq/worker.py,sha256=1bbZkUCSHwFzpsxcsc84RU_7h8dCnFItJCZ4SG4zASc,44940
+rrq-0.5.0.dist-info/METADATA,sha256=vud54ZneWCUMJ0pjg_FmUHaBo1oxqOBbw2yC63gMKy0,13140
+rrq-0.5.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+rrq-0.5.0.dist-info/entry_points.txt,sha256=f8eFjk2ygDSyu9USwXGj5IM8xeyQqZgDa1rSrCj4Mis,36
+rrq-0.5.0.dist-info/licenses/LICENSE,sha256=XDvu5hKdS2-_ByiSj3tiu_3zSsrXXoJsgbILGoMpKCw,554
+rrq-0.5.0.dist-info/RECORD,,
```
rrq-0.4.0.dist-info/RECORD DELETED

```diff
@@ -1,16 +0,0 @@
-rrq/__init__.py,sha256=3WYv9UkvnCbjKXrvmqiLm7yuVVQiLclbVCOXq5wb6ZM,290
-rrq/cli.py,sha256=_LbaAH_w2a0VNRR0EctuE4afl-wccvMY2w2VbehFDEQ,16980
-rrq/client.py,sha256=5_bmZ05LKIfY9WFSKU-nYawEupsnrnHT2HewXfC2Ahg,7831
-rrq/constants.py,sha256=F_uZgBI3h00MctnEjBjiCGMrg5jUaz5Bz9I1vkyqNrs,1654
-rrq/cron.py,sha256=9lxJ1OnrTbavJvbIdPp6u5ncYgyD35vRPsSulpVrQko,5244
-rrq/exc.py,sha256=NJq3C7pUfcd47AB8kghIN8vdY0l90UrsHQEg4McBHP8,1281
-rrq/job.py,sha256=eUbl33QDqDMXPKpo-0dl0Mp29LWWmtbBgRw0sclcwJ4,4011
-rrq/registry.py,sha256=E9W_zx3QiKTBwMOGearaNpDKBDB87JIn0RlMQ3sAcP0,2925
-rrq/settings.py,sha256=AxzSe_rw7-yduKST2c9mPunQWqPE2537XcC_XlMoHWM,4535
-rrq/store.py,sha256=teO0Af8hzBiu7-dFn6_2lz5X90LAZXmtg0VDZuQoAwk,24972
-rrq/worker.py,sha256=KspmZOL6i_dfIypcBi0UpQDpz2NrCj3vEl6CwTNlLKo,42479
-rrq-0.4.0.dist-info/METADATA,sha256=2SFZJlfgwFSpmWfylQ6rSV072HGXlA2MBcECJppV_DY,12914
-rrq-0.4.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-rrq-0.4.0.dist-info/entry_points.txt,sha256=f8eFjk2ygDSyu9USwXGj5IM8xeyQqZgDa1rSrCj4Mis,36
-rrq-0.4.0.dist-info/licenses/LICENSE,sha256=XDvu5hKdS2-_ByiSj3tiu_3zSsrXXoJsgbILGoMpKCw,554
-rrq-0.4.0.dist-info/RECORD,,
```
{rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/WHEEL
File without changes

{rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/entry_points.txt
File without changes

{rrq-0.4.0.dist-info → rrq-0.5.0.dist-info}/licenses/LICENSE
File without changes