redis-message-queue 6.0.1__tar.gz → 7.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (24) hide show
  1. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/PKG-INFO +161 -14
  2. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/README.md +159 -12
  3. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/pyproject.toml +2 -2
  4. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/__init__.py +2 -0
  5. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/_abstract_redis_gateway.py +7 -6
  6. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/_config.py +109 -8
  7. redis_message_queue-7.0.0/redis_message_queue/_event.py +66 -0
  8. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/_exceptions.py +4 -0
  9. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/_queue_key_manager.py +4 -1
  10. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/_redis_cluster.py +8 -2
  11. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/_redis_gateway.py +30 -6
  12. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/asyncio/__init__.py +2 -0
  13. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/asyncio/_abstract_redis_gateway.py +7 -6
  14. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/asyncio/_redis_gateway.py +30 -6
  15. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/asyncio/redis_message_queue.py +86 -37
  16. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/interrupt_handler/_implementation.py +16 -3
  17. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/redis_message_queue.py +62 -31
  18. redis_message_queue-6.0.1/redis_message_queue/_event.py +0 -39
  19. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/LICENSE +0 -0
  20. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/_callable_utils.py +0 -0
  21. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/_stored_message.py +0 -0
  22. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/interrupt_handler/__init__.py +0 -0
  23. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/interrupt_handler/_interface.py +0 -0
  24. {redis_message_queue-6.0.1 → redis_message_queue-7.0.0}/redis_message_queue/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: redis-message-queue
3
- Version: 6.0.1
3
+ Version: 7.0.0
4
4
  Summary: Python message queuing with Redis and message deduplication
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -17,7 +17,7 @@ Classifier: Programming Language :: Python :: 3.13
17
17
  Classifier: Programming Language :: Python :: 3.14
18
18
  Classifier: Topic :: Software Development :: Libraries
19
19
  Classifier: Topic :: System :: Distributed Computing
20
- Requires-Dist: redis (>=5.0.0)
20
+ Requires-Dist: redis (>=5.0.0,<8.0.0)
21
21
  Requires-Dist: tenacity (>=8.1.0)
22
22
  Project-URL: Homepage, https://github.com/Elijas/redis-message-queue
23
23
  Project-URL: Issues, https://github.com/Elijas/redis-message-queue/issues
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
26
26
 
27
27
  # redis-message-queue
28
28
 
29
- [![PyPI Version](https://img.shields.io/badge/v6.0.1-version?color=43cd0f&style=flat&label=pypi)](https://pypi.org/project/redis-message-queue)
29
+ [![PyPI Version](https://img.shields.io/badge/v7.0.0-version?color=43cd0f&style=flat&label=pypi)](https://pypi.org/project/redis-message-queue)
30
30
  [![PyPI Downloads](https://img.shields.io/pypi/dm/redis-message-queue?color=43cd0f&style=flat&label=downloads)](https://pypistats.org/packages/redis-message-queue)
31
31
  [![License: MIT](https://img.shields.io/badge/License-MIT-43cd0f.svg?style=flat&label=license)](LICENSE)
32
32
  [![Maintained: yes](https://img.shields.io/badge/yes-43cd0f.svg?style=flat&label=maintained)](https://github.com/Elijas/redis-message-queue/issues)
@@ -260,6 +260,22 @@ consumer whose claim request Redis executes next. There is no round-robin,
260
260
  equal-share, or starvation-freedom guarantee; faster consumers can receive more
261
261
  than 1/N of messages.
262
262
 
263
+ ### If you need stronger ordering or fairness guarantees
264
+
265
+ - **Strict queue-wide processing order** — use a single consumer per queue.
266
+ Multiple consumers will interleave handler completions.
267
+ - **Per-key processing order** — partition by key into multiple queues
268
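+ (`queue_<stable_hash(key) % N>`, using a process-stable hash such as `zlib.crc32` rather than Python's per-process salted `hash()`), and consume each partition with a single consumer.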
269
+ - **Equal-share / round-robin fairness across consumers** — choose a different
270
+ scheduler. This queue does not guarantee that any individual consumer makes
271
+ forward progress at any specific rate.
272
+ - **Cross-batch ordering after reclaim** — accept that reclaimed messages will
273
+ reappear after newer un-reclaimed messages have been consumed. If your handler
274
+ must observe original publish order, persist that order in the payload (for
275
+ example, a sequence number set by the producer). For clock-related operator
276
+ detail behind reclaim behavior, see
277
+ [production readiness R11](docs/production-readiness.md#r11-redis-clock-dependencies).
278
+
263
279
  ### Dead-letter queue
264
280
 
265
281
  ```python
@@ -310,14 +326,14 @@ There are three distinct shutdown shapes; pick the one that matches your runtime
310
326
  |---|---|---|---|
311
327
  | **Flag-based soft drain** (`GracefulInterruptHandler`) | First SIGINT/SIGTERM flips a flag | Runs to completion | Drained on the next claim call, not on signal arrival |
312
328
  | **Async task cancellation** (`asyncio.CancelledError`) | Framework cancels the worker task (Uvicorn/K8s SIGTERM in many setups) | **Hard abort** — message stays in `processing`; with VT it is reclaimed at deadline expiry, without VT it is orphaned | Not drained |
313
- | **Explicit drain** (`drain()` / `aclose()`) | You call the method | Caller's responsibility to let it finish (drain does **not** cancel) | Drained synchronously via the gateway recovery path |
329
+ | **Explicit drain** (`drain()` / `aclose()`) | You call the method | Caller's responsibility to let it finish (drain does **not** cancel) | Drained synchronously via the gateway recovery path; new publishes are refused |
314
330
 
315
331
  Use `drain()` / `aclose()` to bridge K8s `preStop` / SIGTERM grace windows without
316
332
  relying on signal interception:
317
333
 
318
334
  ```python
319
335
  # sync — in your SIGTERM handler or preStop hook
320
- queue.drain(timeout=25) # refuses new claims, recovers pending claim IDs
336
+ queue.drain(timeout=25) # refuses new publishes/claims, recovers pending claim IDs
321
337
  worker_thread.join() # wait for in-flight process_message to finish
322
338
 
323
339
  # async — same shape
@@ -326,12 +342,19 @@ await worker_task # task observes ``_draining`` and exits its loop
326
342
  ```
327
343
 
328
344
  `drain()` / `aclose()` set a queue-local flag so subsequent `process_message()`
329
- calls yield `None` immediately. They do not cancel in-flight handlers — the
330
- caller must arrange handler exit through normal thread/task coordination.
331
- Returns `True` if all in-memory pending claim IDs were recovered within the
332
- timeout; `False` if the deadline fired or transient Redis errors left claim
333
- IDs pending (call again to retry). `timeout=0` reports current state without
334
- attempting recovery.
345
+ calls yield `None` immediately and subsequent `publish()` calls raise
346
+ `QueueDrainedError("queue is drained")`. Drain also gates the publish path:
347
+ if a publish is already inside the queue instance's publish path, drain waits
348
+ for that publish to finish before it returns; publishes that arrive after the
349
+ drained flag is set are rejected. The drained state is local to that Python
350
+ queue object and is not written to Redis, so constructing a fresh
351
+ `RedisMessageQueue(...)` over the same keys remains usable.
352
+
353
+ Drain does not cancel in-flight handlers — the caller must arrange handler
354
+ exit through normal thread/task coordination. It returns `True` if all in-memory
355
+ pending claim IDs were recovered within the timeout; `False` if the deadline
356
+ fired or transient Redis errors left claim IDs pending (call again to retry).
357
+ `timeout=0` reports current state without attempting recovery.
335
358
 
336
359
  > **Heartbeat caveat (best-effort stop):** when `heartbeat_interval_seconds` is
337
360
  > set, the heartbeat sidecar's `stop()` is bounded but not strictly quiescent —
@@ -472,6 +495,32 @@ Notes:
472
495
  message. Do not call `fork()` from inside active message handlers unless the
473
496
  child exits without using the inherited queue/client.
474
497
 
498
+ #### Forking after constructing GracefulInterruptHandler
499
+
500
+ If your application constructed `GracefulInterruptHandler` in the parent process
501
+ before `os.fork()` (for example, via module import in a pre-fork app server),
502
+ forked children cannot construct a fresh handler for the same signal because the
503
+ inherited signal table still routes to the parent-process handler.
504
+
505
+ In each child process, call `parent_handler.reset()` before constructing a fresh
506
+ handler:
507
+
508
+ ```python
509
+ def worker_main():
510
+ # Inherited handler from parent - reset it.
511
+ if shared.interrupt_handler is not None:
512
+ shared.interrupt_handler.reset()
513
+
514
+ # Now safe to construct a fresh handler for this child.
515
+ interrupt = GracefulInterruptHandler()
516
+ queue = RedisMessageQueue("jobs", client=redis.Redis(), interrupt=interrupt)
517
+ ...
518
+ ```
519
+
520
+ Alternatively, defer all construction (handler and queue) to inside
521
+ `worker_main()` and pass `--no-preload` (or equivalent) to your app server. That
522
+ avoids the parent-construct hazard entirely.
523
+
475
524
  ### Redis memory sizing for deduplication and replay metadata
476
525
 
477
526
  When deduplication is enabled, each distinct dedup key creates one Redis string
@@ -530,6 +579,7 @@ Package logs remain diagnostic; use `on_event` rather than log parsing for
530
579
  metrics.
531
580
 
532
581
  ```python
582
+ from opentelemetry import trace
533
583
  from prometheus_client import Counter
534
584
  from redis_message_queue import QueueEvent, RedisMessageQueue
535
585
 
@@ -543,17 +593,78 @@ def observe(event: QueueEvent) -> None:
543
593
  events_total.labels(
544
594
  event.queue, event.operation, event.outcome, event.exception_type or ""
545
595
  ).inc()
596
+ if event.error is not None:
597
+ trace.get_current_span().record_exception(event.error)
546
598
 
547
599
  queue = RedisMessageQueue("jobs", client=client, on_event=observe)
548
600
  ```
549
601
 
602
+ #### Event dispatch context
603
+
604
+ Callbacks fire inline:
605
+
606
+ - **Sync queue:** the callback runs in the caller's thread. It sees
607
+ contextvars, the OpenTelemetry current span, and structlog contextvars bound
608
+ by the caller.
609
+ - **Async queue:** the callback is awaited in the current asyncio task. It has
610
+ the same contextvars, span, and structlog visibility.
611
+ - **Sync heartbeat:** heartbeat events fire from a separate
612
+ `threading.Thread`. That thread does not inherit caller contextvars or the
613
+ caller's OpenTelemetry current span. Use `event.message_id` and
614
+ `event.lease_token_hash` for correlation.
615
+ - **Async heartbeat:** heartbeat events fire from an asyncio task. The task
616
+ copies the context present when the heartbeat was started, so contextvars and
617
+ OpenTelemetry spans bound at handler entry are visible.
618
+
619
+ #### Event timing vs. Redis commit
620
+
621
+ Most events are post-commit, emitted after the Redis command or Lua script
622
+ returned: `publish/success`, `publish_dedup_hit`, `claim/success`,
623
+ `claim_empty`, `claim_reclaim`, `ack`, `nack`, `completed`, `dlq`,
624
+ `lease_renew`, `trim_failed`, and `stale_lease_*`.
625
+
626
+ Pre-commit and mid-flight exceptions:
627
+
628
+ - `failed/failure` fires after the handler raises but before failed-queue
629
+ cleanup completes. Use `nack` for cleanup-commit metrics; use `failed` for
630
+ handler-exception attribution.
631
+ - `retry_attempt/failure` and `retry_exhausted` fire on the claim-loop retry
632
+ path. The first Redis attempt may or may not have committed.
633
+ - `publish/failure`, `claim/failure`, and `cleanup_failed/failure` follow
634
+ exceptions. Under an ambiguous lost response, Redis may have committed
635
+ despite the exception. Treat them as "operation did not succeed from the
636
+ caller's perspective", not "Redis did not commit".
637
+
638
+ #### Intentionally silent paths
639
+
640
+ The following operations have no `on_event` surface by design:
641
+
642
+ - **B1 Cluster `pcall` cleanup failure:** three lease-aware Lua scripts wrap a
643
+ data-derived `DEL` in `redis.pcall(...)` and ignore the result. This
644
+ preserves queue safety on Cluster `CROSSSLOT` rejection but cannot be
645
+ observed through `on_event`. Operators watching key-TTL behavior or Redis
646
+ slow logs can detect orphans.
647
+ - **VT claim-store OOM compensation:** if the visibility-timeout Lua script
648
+ cannot store the claim result, it removes the message from processing, pushes
649
+ it back to pending, and returns `false`. Python translates that into
650
+ `claim_empty/skipped`, the same shape as an empty poll. This is intentional
651
+ fail-safe behavior; the message is not lost.
652
+ - **`drain()` / `close()` / `aclose()` lifecycle:** explicit shutdown
653
+ operations do not emit lifecycle events. Pending-claim-drain recovery work
654
+ counts as `claim_reclaim` events when reached.
655
+ - **Non-claim-loop retry attempts:** tenacity retries in deduplicated publish,
656
+ ack/remove, move-to-completed/failed, and lease renewal collapse into the
657
+ terminal operation's failure event. There is no per-attempt event for those
658
+ paths.
659
+
550
660
  The public exception hierarchy is rooted at `RedisMessageQueueError`.
551
661
  Invalid configuration values or combinations raise `ConfigurationError` (also a
552
662
  `ValueError`), custom gateway contract violations raise `GatewayContractError`
553
663
  (also a `TypeError`), and Lua `redis.error_reply(...)` failures raise
554
664
  `LuaScriptError` (also a redis-py `ResponseError`). Publish overload raises
555
- `QueueBackpressureError`. `CleanupFailedError` and `RetryBudgetExhaustedError`
556
- are reserved categories for cleanup and retry surfaces.
665
+ `QueueBackpressureError`; publish after explicit drain raises
666
+ `QueueDrainedError`. `CleanupFailedError` and `RetryBudgetExhaustedError` are
667
+ reserved categories for cleanup and retry surfaces.
557
668
 
558
669
  ## Known limitations
559
670
 
@@ -564,13 +675,46 @@ are reserved categories for cleanup and retry surfaces.
564
675
  - **Cluster detection uses `isinstance(client, RedisCluster)`.** Wrapped or instrumented cluster clients that delegate without inheriting will bypass hash-tag validation. Custom gateways should set `is_redis_cluster = True` explicitly.
565
676
  - **Redis Cluster requires hash tags.** The built-in queue uses multiple Redis keys per operation. Wrap the queue name in hash tags (for example `{myqueue}`) so every generated key lands in the same slot. When you pass a Redis Cluster client to the built-in queue/gateway path, incompatible names are rejected early.
566
677
  - **Non-ASCII payloads use ~2x storage.** The default `ensure_ascii=True` in JSON serialization encodes non-ASCII characters as `\uXXXX` escape sequences. This is a deliberate compatibility choice.
567
- - **Client-side `Retry` can duplicate non-deduplicated publishes.** If you construct your `redis.Redis` client with `retry=Retry(...)`, redis-py retries `ConnectionError` / `TimeoutError` at the connection layer — *below* this library. Idempotent operations (deduplicated `publish()`, lease-scoped cleanup) are safe because their Lua scripts replay the original result. `add_message()` (used by `publish()` when `deduplication=False`) is a bare `LPUSH` by default, or a single non-idempotent Lua enqueue when `max_pending_length` is set: this library deliberately does not retry it, but a client-level `Retry` will, and if the server executed the command before the response was lost the message is enqueued twice. Leave `retry=None` (the default) if you need strict at-most-once semantics for non-deduplicated publishes, or accept the duplication risk. More broadly, any non-idempotent enqueue path is vulnerable if the connection drops after server execution but before the client receives the response; all other built-in operations (deduplicated publish, lease-scoped ack/move, lease renewal) use replay markers and are safe under client-level `Retry`.
678
+ - **Client-side `Retry` can duplicate non-deduplicated publishes.** If you construct your `redis.Redis` or `redis.asyncio.Redis` client with `retry=Retry(...)`, redis-py retries `ConnectionError` / `TimeoutError` at the connection layer — *below* this library. Idempotent operations (deduplicated `publish()`, lease-scoped cleanup) are safe because their Lua scripts replay the original result. `add_message()` (used by `publish()` when `deduplication=False`) is a bare `LPUSH` by default, or a single non-idempotent Lua enqueue when `max_pending_length` is set: this library deliberately does not retry it, but a client-level `Retry` will, and if the server executed the command before the response was lost the message is enqueued twice. redis-py 6.0+ changed the default standalone `Redis()` / `redis.asyncio.Redis()` retry policy from `None` (no retry) to a 3-attempt `ExponentialWithJitterBackoff`; pass `retry=None` explicitly if you need strict at-most-once semantics for non-deduplicated publishes, or accept the duplication risk. More broadly, any non-idempotent enqueue path is vulnerable if the connection drops after server execution but before the client receives the response; all other built-in operations (deduplicated publish, lease-scoped ack/move, lease renewal) use replay markers and are safe under client-level `Retry`.
679
+
680
+ ```python
681
+ import redis
682
+ from redis_message_queue import RedisMessageQueue
683
+
684
+ # Strict at-most-once for non-dedup messages: disable redis-py's
685
+ # default 3-retry policy explicitly.
686
+ client = redis.Redis(retry=None)
687
+ queue = RedisMessageQueue("jobs", client=client)
688
+ ```
689
+
690
+ ```python
691
+ import redis.asyncio as redis
692
+ from redis_message_queue.asyncio import RedisMessageQueue
693
+
694
+ # Strict at-most-once for non-dedup messages: disable redis-py's
695
+ # default 3-retry policy explicitly.
696
+ client = redis.Redis(retry=None)
697
+ queue = RedisMessageQueue("jobs", client=client)
698
+ ```
568
699
  - **Redis Cluster default retry can stack with this library's retry budget.** In redis-py 6.0+, `RedisCluster()` constructs a default `ExponentialWithJitterBackoff` retry below this library's `retry_budget_seconds`. If you need a single retry surface, pass `retry=Retry(NoBackoff(), 0)` to the cluster client or reduce `retry_budget_seconds` to account for the lower-level retry window.
569
700
 
570
701
  For a full analysis, see [docs/production-readiness.md](docs/production-readiness.md).
571
702
 
572
703
  ## Upgrading
573
704
 
705
+ ### v6 to v7 migration
706
+
707
+ v7.0.0 changes explicit drain shutdown semantics. After `queue.drain()` /
708
+ `queue.close()` (sync) or `await queue.drain()` / `await queue.aclose()`
709
+ (async), the same queue instance rejects `publish()` with
710
+ `QueueDrainedError("queue is drained")`.
711
+
712
+ This state is queue-local and process-local; it is not stored in Redis. If a
713
+ producer must continue publishing after a worker has drained, use a separate
714
+ `RedisMessageQueue(...)` instance for that producer lifecycle. During
715
+ shutdown, catch `QueueDrainedError` only at boundaries where late publishes are
716
+ expected and safe to drop or reschedule.
717
+
574
718
  ### Configuration changes on live queues
575
719
 
576
720
  > **Warning:** These changes are destructive on live queues. Drain the queue completely before applying them.
@@ -596,6 +740,9 @@ v6.0.0 is a non-breaking-defaults release that adds new public APIs. v5 code con
596
740
  - `max_pending_length=N` caps pending-list depth; with `pending_overload_policy="raise"` (default) producers see `QueueBackpressureError` when the cap is hit; `"block"` waits up to `pending_overload_block_timeout_seconds`; `"drop_oldest"` evicts silently, so use it only when data loss is acceptable.
597
741
  - `queue.drain(timeout=...)` (sync) and `await queue.aclose(timeout=...)` (async) are explicit graceful-shutdown hooks. They refuse new claims and recover pending claim IDs but do not cancel in-flight handlers; join or await your worker separately.
598
742
  - `on_event=callback` receives a `QueueEvent` dataclass for every publish/claim/ack/reclaim/dedup/cleanup lifecycle event. Use it for metrics, tracing, and structured logging. See [`examples/production/observability.py`](examples/production/observability.py) for the adapter pattern.
743
+ - See [`examples/production/backpressure.py`](examples/production/backpressure.py) and [`examples/production/graceful_shutdown.py`](examples/production/graceful_shutdown.py) for sync production patterns, with async siblings under [`examples/production/asyncio/`](examples/production/asyncio/).
744
+
745
+ > When using a pre-fork app server (gunicorn `--preload`, uvicorn workers that import the app at master startup), call `make_queue()` from your worker startup hook - NOT at module import. See [Fork safety](#fork-safety-and-pre-fork-servers) for why.
599
746
 
600
747
  **New constructor rejections:**
601
748
 
@@ -1,6 +1,6 @@
1
1
  # redis-message-queue
2
2
 
3
- [![PyPI Version](https://img.shields.io/badge/v6.0.1-version?color=43cd0f&style=flat&label=pypi)](https://pypi.org/project/redis-message-queue)
3
+ [![PyPI Version](https://img.shields.io/badge/v7.0.0-version?color=43cd0f&style=flat&label=pypi)](https://pypi.org/project/redis-message-queue)
4
4
  [![PyPI Downloads](https://img.shields.io/pypi/dm/redis-message-queue?color=43cd0f&style=flat&label=downloads)](https://pypistats.org/packages/redis-message-queue)
5
5
  [![License: MIT](https://img.shields.io/badge/License-MIT-43cd0f.svg?style=flat&label=license)](LICENSE)
6
6
  [![Maintained: yes](https://img.shields.io/badge/yes-43cd0f.svg?style=flat&label=maintained)](https://github.com/Elijas/redis-message-queue/issues)
@@ -234,6 +234,22 @@ consumer whose claim request Redis executes next. There is no round-robin,
234
234
  equal-share, or starvation-freedom guarantee; faster consumers can receive more
235
235
  than 1/N of messages.
236
236
 
237
+ ### If you need stronger ordering or fairness guarantees
238
+
239
+ - **Strict queue-wide processing order** — use a single consumer per queue.
240
+ Multiple consumers will interleave handler completions.
241
+ - **Per-key processing order** — partition by key into multiple queues
242
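+ (`queue_<stable_hash(key) % N>`, using a process-stable hash such as `zlib.crc32` rather than Python's per-process salted `hash()`), and consume each partition with a single consumer.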
243
+ - **Equal-share / round-robin fairness across consumers** — choose a different
244
+ scheduler. This queue does not guarantee that any individual consumer makes
245
+ forward progress at any specific rate.
246
+ - **Cross-batch ordering after reclaim** — accept that reclaimed messages will
247
+ reappear after newer un-reclaimed messages have been consumed. If your handler
248
+ must observe original publish order, persist that order in the payload (for
249
+ example, a sequence number set by the producer). For clock-related operator
250
+ detail behind reclaim behavior, see
251
+ [production readiness R11](docs/production-readiness.md#r11-redis-clock-dependencies).
252
+
237
253
  ### Dead-letter queue
238
254
 
239
255
  ```python
@@ -284,14 +300,14 @@ There are three distinct shutdown shapes; pick the one that matches your runtime
284
300
  |---|---|---|---|
285
301
  | **Flag-based soft drain** (`GracefulInterruptHandler`) | First SIGINT/SIGTERM flips a flag | Runs to completion | Drained on the next claim call, not on signal arrival |
286
302
  | **Async task cancellation** (`asyncio.CancelledError`) | Framework cancels the worker task (Uvicorn/K8s SIGTERM in many setups) | **Hard abort** — message stays in `processing`; with VT it is reclaimed at deadline expiry, without VT it is orphaned | Not drained |
287
- | **Explicit drain** (`drain()` / `aclose()`) | You call the method | Caller's responsibility to let it finish (drain does **not** cancel) | Drained synchronously via the gateway recovery path |
303
+ | **Explicit drain** (`drain()` / `aclose()`) | You call the method | Caller's responsibility to let it finish (drain does **not** cancel) | Drained synchronously via the gateway recovery path; new publishes are refused |
288
304
 
289
305
  Use `drain()` / `aclose()` to bridge K8s `preStop` / SIGTERM grace windows without
290
306
  relying on signal interception:
291
307
 
292
308
  ```python
293
309
  # sync — in your SIGTERM handler or preStop hook
294
- queue.drain(timeout=25) # refuses new claims, recovers pending claim IDs
310
+ queue.drain(timeout=25) # refuses new publishes/claims, recovers pending claim IDs
295
311
  worker_thread.join() # wait for in-flight process_message to finish
296
312
 
297
313
  # async — same shape
@@ -300,12 +316,19 @@ await worker_task # task observes ``_draining`` and exits its loop
300
316
  ```
301
317
 
302
318
  `drain()` / `aclose()` set a queue-local flag so subsequent `process_message()`
303
- calls yield `None` immediately. They do not cancel in-flight handlers — the
304
- caller must arrange handler exit through normal thread/task coordination.
305
- Returns `True` if all in-memory pending claim IDs were recovered within the
306
- timeout; `False` if the deadline fired or transient Redis errors left claim
307
- IDs pending (call again to retry). `timeout=0` reports current state without
308
- attempting recovery.
319
+ calls yield `None` immediately and subsequent `publish()` calls raise
320
+ `QueueDrainedError("queue is drained")`. Drain also gates the publish path:
321
+ if a publish is already inside the queue instance's publish path, drain waits
322
+ for that publish to finish before it returns; publishes that arrive after the
323
+ drained flag is set are rejected. The drained state is local to that Python
324
+ queue object and is not written to Redis, so constructing a fresh
325
+ `RedisMessageQueue(...)` over the same keys remains usable.
326
+
327
+ Drain does not cancel in-flight handlers — the caller must arrange handler
328
+ exit through normal thread/task coordination. It returns `True` if all in-memory
329
+ pending claim IDs were recovered within the timeout; `False` if the deadline
330
+ fired or transient Redis errors left claim IDs pending (call again to retry).
331
+ `timeout=0` reports current state without attempting recovery.
309
332
 
310
333
  > **Heartbeat caveat (best-effort stop):** when `heartbeat_interval_seconds` is
311
334
  > set, the heartbeat sidecar's `stop()` is bounded but not strictly quiescent —
@@ -446,6 +469,32 @@ Notes:
446
469
  message. Do not call `fork()` from inside active message handlers unless the
447
470
  child exits without using the inherited queue/client.
448
471
 
472
+ #### Forking after constructing GracefulInterruptHandler
473
+
474
+ If your application constructed `GracefulInterruptHandler` in the parent process
475
+ before `os.fork()` (for example, via module import in a pre-fork app server),
476
+ forked children cannot construct a fresh handler for the same signal because the
477
+ inherited signal table still routes to the parent-process handler.
478
+
479
+ In each child process, call `parent_handler.reset()` before constructing a fresh
480
+ handler:
481
+
482
+ ```python
483
+ def worker_main():
484
+ # Inherited handler from parent - reset it.
485
+ if shared.interrupt_handler is not None:
486
+ shared.interrupt_handler.reset()
487
+
488
+ # Now safe to construct a fresh handler for this child.
489
+ interrupt = GracefulInterruptHandler()
490
+ queue = RedisMessageQueue("jobs", client=redis.Redis(), interrupt=interrupt)
491
+ ...
492
+ ```
493
+
494
+ Alternatively, defer all construction (handler and queue) to inside
495
+ `worker_main()` and pass `--no-preload` (or equivalent) to your app server. That
496
+ avoids the parent-construct hazard entirely.
497
+
449
498
  ### Redis memory sizing for deduplication and replay metadata
450
499
 
451
500
  When deduplication is enabled, each distinct dedup key creates one Redis string
@@ -504,6 +553,7 @@ Package logs remain diagnostic; use `on_event` rather than log parsing for
504
553
  metrics.
505
554
 
506
555
  ```python
556
+ from opentelemetry import trace
507
557
  from prometheus_client import Counter
508
558
  from redis_message_queue import QueueEvent, RedisMessageQueue
509
559
 
@@ -517,17 +567,78 @@ def observe(event: QueueEvent) -> None:
517
567
  events_total.labels(
518
568
  event.queue, event.operation, event.outcome, event.exception_type or ""
519
569
  ).inc()
570
+ if event.error is not None:
571
+ trace.get_current_span().record_exception(event.error)
520
572
 
521
573
  queue = RedisMessageQueue("jobs", client=client, on_event=observe)
522
574
  ```
523
575
 
576
+ #### Event dispatch context
577
+
578
+ Callbacks fire inline:
579
+
580
+ - **Sync queue:** the callback runs in the caller's thread. It sees
581
+ contextvars, the OpenTelemetry current span, and structlog contextvars bound
582
+ by the caller.
583
+ - **Async queue:** the callback is awaited in the current asyncio task. It has
584
+ the same contextvars, span, and structlog visibility.
585
+ - **Sync heartbeat:** heartbeat events fire from a separate
586
+ `threading.Thread`. That thread does not inherit caller contextvars or the
587
+ caller's OpenTelemetry current span. Use `event.message_id` and
588
+ `event.lease_token_hash` for correlation.
589
+ - **Async heartbeat:** heartbeat events fire from an asyncio task. The task
590
+ copies the context present when the heartbeat was started, so contextvars and
591
+ OpenTelemetry spans bound at handler entry are visible.
592
+
593
+ #### Event timing vs. Redis commit
594
+
595
+ Most events are post-commit, emitted after the Redis command or Lua script
596
+ returned: `publish/success`, `publish_dedup_hit`, `claim/success`,
597
+ `claim_empty`, `claim_reclaim`, `ack`, `nack`, `completed`, `dlq`,
598
+ `lease_renew`, `trim_failed`, and `stale_lease_*`.
599
+
600
+ Pre-commit and mid-flight exceptions:
601
+
602
+ - `failed/failure` fires after the handler raises but before failed-queue
603
+ cleanup completes. Use `nack` for cleanup-commit metrics; use `failed` for
604
+ handler-exception attribution.
605
+ - `retry_attempt/failure` and `retry_exhausted` fire on the claim-loop retry
606
+ path. The first Redis attempt may or may not have committed.
607
+ - `publish/failure`, `claim/failure`, and `cleanup_failed/failure` follow
608
+ exceptions. Under an ambiguous lost response, Redis may have committed
609
+ despite the exception. Treat them as "operation did not succeed from the
610
+ caller's perspective", not "Redis did not commit".
611
+
612
+ #### Intentionally silent paths
613
+
614
+ The following operations have no `on_event` surface by design:
615
+
616
+ - **B1 Cluster `pcall` cleanup failure:** three lease-aware Lua scripts wrap a
617
+ data-derived `DEL` in `redis.pcall(...)` and ignore the result. This
618
+ preserves queue safety on Cluster `CROSSSLOT` rejection but cannot be
619
+ observed through `on_event`. Operators watching key-TTL behavior or Redis
620
+ slow logs can detect orphans.
621
+ - **VT claim-store OOM compensation:** if the visibility-timeout Lua script
622
+ cannot store the claim result, it removes the message from processing, pushes
623
+ it back to pending, and returns `false`. Python translates that into
624
+ `claim_empty/skipped`, the same shape as an empty poll. This is intentional
625
+ fail-safe behavior; the message is not lost.
626
+ - **`drain()` / `close()` / `aclose()` lifecycle:** explicit shutdown
627
+ operations do not emit lifecycle events. Pending-claim-drain recovery work
628
+ counts as `claim_reclaim` events when reached.
629
+ - **Non-claim-loop retry attempts:** tenacity retries in deduplicated publish,
630
+ ack/remove, move-to-completed/failed, and lease renewal collapse into the
631
+ terminal operation's failure event. There is no per-attempt event for those
632
+ paths.
633
+
524
634
  The public exception hierarchy is rooted at `RedisMessageQueueError`.
525
635
  Invalid configuration values or combinations raise `ConfigurationError` (also a
526
636
  `ValueError`), custom gateway contract violations raise `GatewayContractError`
527
637
  (also a `TypeError`), and Lua `redis.error_reply(...)` failures raise
528
638
  `LuaScriptError` (also a redis-py `ResponseError`). Publish overload raises
529
- `QueueBackpressureError`. `CleanupFailedError` and `RetryBudgetExhaustedError`
530
- are reserved categories for cleanup and retry surfaces.
639
+ `QueueBackpressureError`; publish after explicit drain raises
640
+ `QueueDrainedError`. `CleanupFailedError` and `RetryBudgetExhaustedError` are
641
+ reserved categories for cleanup and retry surfaces.
531
642
 
532
643
  ## Known limitations
533
644
 
@@ -538,13 +649,46 @@ are reserved categories for cleanup and retry surfaces.
538
649
  - **Cluster detection uses `isinstance(client, RedisCluster)`.** Wrapped or instrumented cluster clients that delegate without inheriting will bypass hash-tag validation. Custom gateways should set `is_redis_cluster = True` explicitly.
539
650
  - **Redis Cluster requires hash tags.** The built-in queue uses multiple Redis keys per operation. Wrap the queue name in hash tags (for example `{myqueue}`) so every generated key lands in the same slot. When you pass a Redis Cluster client to the built-in queue/gateway path, incompatible names are rejected early.
540
651
  - **Non-ASCII payloads use ~2x storage.** The default `ensure_ascii=True` in JSON serialization encodes non-ASCII characters as `\uXXXX` escape sequences. This is a deliberate compatibility choice.
541
- - **Client-side `Retry` can duplicate non-deduplicated publishes.** If you construct your `redis.Redis` client with `retry=Retry(...)`, redis-py retries `ConnectionError` / `TimeoutError` at the connection layer — *below* this library. Idempotent operations (deduplicated `publish()`, lease-scoped cleanup) are safe because their Lua scripts replay the original result. `add_message()` (used by `publish()` when `deduplication=False`) is a bare `LPUSH` by default, or a single non-idempotent Lua enqueue when `max_pending_length` is set: this library deliberately does not retry it, but a client-level `Retry` will, and if the server executed the command before the response was lost the message is enqueued twice. Leave `retry=None` (the default) if you need strict at-most-once semantics for non-deduplicated publishes, or accept the duplication risk. More broadly, any non-idempotent enqueue path is vulnerable if the connection drops after server execution but before the client receives the response; all other built-in operations (deduplicated publish, lease-scoped ack/move, lease renewal) use replay markers and are safe under client-level `Retry`.
652
+ - **Client-side `Retry` can duplicate non-deduplicated publishes.** If you construct your `redis.Redis` or `redis.asyncio.Redis` client with `retry=Retry(...)`, redis-py retries `ConnectionError` / `TimeoutError` at the connection layer — *below* this library. Idempotent operations (deduplicated `publish()`, lease-scoped cleanup) are safe because their Lua scripts replay the original result. `add_message()` (used by `publish()` when `deduplication=False`) is a bare `LPUSH` by default, or a single non-idempotent Lua enqueue when `max_pending_length` is set: this library deliberately does not retry it, but a client-level `Retry` will, and if the server executed the command before the response was lost the message is enqueued twice. redis-py 6.0+ changed the default standalone `Redis()` / `redis.asyncio.Redis()` retry policy from `None` (no retry) to a 3-attempt `ExponentialWithJitterBackoff`; pass `retry=None` explicitly if you need strict at-most-once semantics for non-deduplicated publishes, or accept the duplication risk. More broadly, any non-idempotent enqueue path is vulnerable if the connection drops after server execution but before the client receives the response; all other built-in operations (deduplicated publish, lease-scoped ack/move, lease renewal) use replay markers and are safe under client-level `Retry`.
653
+
654
+ ```python
655
+ import redis
656
+ from redis_message_queue import RedisMessageQueue
657
+
658
+ # Strict at-most-once for non-dedup messages: disable redis-py's
659
+ # default 3-retry policy explicitly.
660
+ client = redis.Redis(retry=None)
661
+ queue = RedisMessageQueue("jobs", client=client)
662
+ ```
663
+
664
+ ```python
665
+ import redis.asyncio as redis
666
+ from redis_message_queue.asyncio import RedisMessageQueue
667
+
668
+ # Strict at-most-once for non-dedup messages: disable redis-py's
669
+ # default 3-retry policy explicitly.
670
+ client = redis.Redis(retry=None)
671
+ queue = RedisMessageQueue("jobs", client=client)
672
+ ```
542
673
  - **Redis Cluster default retry can stack with this library's retry budget.** In redis-py 6.0+, `RedisCluster()` constructs a default `ExponentialWithJitterBackoff` retry below this library's `retry_budget_seconds`. If you need a single retry surface, pass `retry=Retry(NoBackoff(), 0)` to the cluster client or reduce `retry_budget_seconds` to account for the lower-level retry window.
543
674
 
544
675
  For a full analysis, see [docs/production-readiness.md](docs/production-readiness.md).
545
676
 
546
677
  ## Upgrading
547
678
 
679
+ ### v6 to v7 migration
680
+
681
+ v7.0.0 changes explicit drain shutdown semantics. After `queue.drain()` /
682
+ `queue.close()` (sync) or `await queue.drain()` / `await queue.aclose()`
683
+ (async), the same queue instance rejects `publish()` with
684
+ `QueueDrainedError("queue is drained")`.
685
+
686
+ This state is queue-local and process-local; it is not stored in Redis. If a
687
+ producer must continue publishing after a worker has drained, use a separate
688
+ `RedisMessageQueue(...)` instance for that producer lifecycle. During
689
+ shutdown, catch `QueueDrainedError` only at boundaries where late publishes are
690
+ expected and safe to drop or reschedule.
691
+
548
692
  ### Configuration changes on live queues
549
693
 
550
694
  > **Warning:** These changes are destructive on live queues. Drain the queue completely before applying them.
@@ -570,6 +714,9 @@ v6.0.0 is a non-breaking-defaults release that adds new public APIs. v5 code con
570
714
  - `max_pending_length=N` caps pending-list depth; with `pending_overload_policy="raise"` (default) producers see `QueueBackpressureError` when the cap is hit; `"block"` waits up to `pending_overload_block_timeout_seconds`; `"drop_oldest"` evicts silently, so use it only when data loss is acceptable.
571
715
  - `queue.drain(timeout=...)` (sync) and `await queue.aclose(timeout=...)` (async) are explicit graceful-shutdown hooks. They refuse new claims and recover pending claim IDs but do not cancel in-flight handlers; join or await your worker separately.
572
716
  - `on_event=callback` receives a `QueueEvent` dataclass for every publish/claim/ack/reclaim/dedup/cleanup lifecycle event. Use it for metrics, tracing, and structured logging. See [`examples/production/observability.py`](examples/production/observability.py) for the adapter pattern.
717
+ - See [`examples/production/backpressure.py`](examples/production/backpressure.py) and [`examples/production/graceful_shutdown.py`](examples/production/graceful_shutdown.py) for sync production patterns, with async siblings under [`examples/production/asyncio/`](examples/production/asyncio/).
718
+
719
+ > When using a pre-fork app server (gunicorn `--preload`, uvicorn workers that import the app at master startup), call `make_queue()` from your worker startup hook — NOT at module import. See [Fork safety](#fork-safety-and-pre-fork-servers) for why.
573
720
 
574
721
  **New constructor rejections:**
575
722
 
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "redis-message-queue"
3
- version = "6.0.1"
3
+ version = "7.0.0"
4
4
  description = "Python message queuing with Redis and message deduplication"
5
5
  authors = ["Elijas <4084885+Elijas@users.noreply.github.com>"]
6
6
  readme = "README.md"
@@ -24,7 +24,7 @@ Issues = "https://github.com/Elijas/redis-message-queue/issues"
24
24
 
25
25
  [tool.poetry.dependencies]
26
26
  python = "^3.12"
27
- redis = ">=5.0.0"
27
+ redis = ">=5.0.0,<8.0.0"
28
28
  tenacity = ">=8.1.0"
29
29
 
30
30
  [tool.poetry.group.test.dependencies]
@@ -6,6 +6,7 @@ from redis_message_queue._exceptions import (
6
6
  GatewayContractError,
7
7
  LuaScriptError,
8
8
  QueueBackpressureError,
9
+ QueueDrainedError,
9
10
  RedisMessageQueueError,
10
11
  RetryBudgetExhaustedError,
11
12
  )
@@ -33,6 +34,7 @@ __all__ = [
33
34
  "GatewayContractError",
34
35
  "LuaScriptError",
35
36
  "QueueBackpressureError",
37
+ "QueueDrainedError",
36
38
  "CleanupFailedError",
37
39
  "RetryBudgetExhaustedError",
38
40
  ]
@@ -83,12 +83,13 @@ class AbstractRedisGateway(ABC):
83
83
  command can silently duplicate the message. The caller can still
84
84
  retry (accepting duplicates).
85
85
 
86
- Note: a client-level retry policy bypasses this guarantee. If the
87
- underlying ``redis.Redis`` / ``redis.asyncio.Redis`` client was
88
- constructed with ``retry=Retry(...)``, redis-py retries on
89
- ``ConnectionError`` / ``TimeoutError`` below this call and may
90
- duplicate. Pass ``retry=None`` (the default) when strict at-most-once
91
- is required for non-deduplicated publishes.
86
+ Note on retries: redis-py 6.0+ changed the default standalone
87
+ ``Redis()`` / ``redis.asyncio.Redis()`` retry policy from ``None`` (no
88
+ retry) to a 3-attempt ``ExponentialWithJitterBackoff``. If you need
89
+ strict at-most-once for non-deduplicated publishes, pass ``retry=None``
90
+ explicitly when constructing the redis-py client. This library does
91
+ not configure the redis-py client retry; it only controls its own
92
+ retry budget on top of the client.
92
93
  """
93
94
 
94
95
  @abstractmethod