redis-message-queue 5.0.0__tar.gz → 6.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (23) hide show
  1. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/PKG-INFO +234 -7
  2. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/README.md +233 -6
  3. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/pyproject.toml +1 -1
  4. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/__init__.py +20 -0
  5. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/_abstract_redis_gateway.py +17 -1
  6. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/_config.py +130 -18
  7. redis_message_queue-6.0.0/redis_message_queue/_event.py +39 -0
  8. redis_message_queue-6.0.0/redis_message_queue/_exceptions.py +36 -0
  9. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/_queue_key_manager.py +9 -4
  10. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/_redis_cluster.py +4 -3
  11. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/_redis_gateway.py +250 -15
  12. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/_stored_message.py +17 -0
  13. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/asyncio/__init__.py +20 -0
  14. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/asyncio/_abstract_redis_gateway.py +17 -1
  15. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/asyncio/_redis_gateway.py +245 -15
  16. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/asyncio/redis_message_queue.py +371 -47
  17. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/redis_message_queue.py +381 -45
  18. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/LICENSE +0 -0
  19. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/_callable_utils.py +0 -0
  20. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/interrupt_handler/__init__.py +0 -0
  21. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/interrupt_handler/_implementation.py +0 -0
  22. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/interrupt_handler/_interface.py +0 -0
  23. {redis_message_queue-5.0.0 → redis_message_queue-6.0.0}/redis_message_queue/py.typed +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: redis-message-queue
3
- Version: 5.0.0
3
+ Version: 6.0.0
4
4
  Summary: Python message queuing with Redis and message deduplication
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -26,7 +26,7 @@ Description-Content-Type: text/markdown
26
26
 
27
27
  # redis-message-queue
28
28
 
29
- [![PyPI Version](https://img.shields.io/badge/v5.0.0-version?color=43cd0f&style=flat&label=pypi)](https://pypi.org/project/redis-message-queue)
29
+ [![PyPI Version](https://img.shields.io/badge/v6.0.0-version?color=43cd0f&style=flat&label=pypi)](https://pypi.org/project/redis-message-queue)
30
30
  [![PyPI Downloads](https://img.shields.io/pypi/dm/redis-message-queue?color=43cd0f&style=flat&label=downloads)](https://pypistats.org/packages/redis-message-queue)
31
31
  [![License: MIT](https://img.shields.io/badge/License-MIT-43cd0f.svg?style=flat&label=license)](LICENSE)
32
32
  [![Maintained: yes](https://img.shields.io/badge/yes-43cd0f.svg?style=flat&label=maintained)](https://github.com/Elijas/redis-message-queue/issues)
@@ -37,7 +37,7 @@ Description-Content-Type: text/markdown
37
37
  **Lightweight Python message queuing with Redis and built-in publish-side deduplication.** Deduplicate publishes within a TTL window, with optional crash recovery — across any number of producers and consumers.
38
38
 
39
39
  ```bash
40
- pip install "redis-message-queue>=3.0.0,<4.0.0"
40
+ pip install "redis-message-queue>=6.0.0,<7.0.0"
41
41
  ```
42
42
 
43
43
  Requires Redis server >= 6.2.
@@ -151,6 +151,43 @@ When set, `LTRIM` is called after each message is moved to the completed/failed
151
151
  Pass `max_completed_length=None` or `max_failed_length=None` explicitly if you
152
152
  want unbounded tracking queues.
153
153
 
154
+ ### Publish backpressure
155
+
156
+ By default, the pending queue is unbounded (`max_pending_length=None`), matching
157
+ the v5 behavior. Set `max_pending_length` when producers can outrun consumers
158
+ and publishing must fail closed before Redis memory is exhausted:
159
+
160
+ ```python
161
+ queue = RedisMessageQueue(
162
+ "q",
163
+ client=client,
164
+ max_pending_length=100_000,
165
+ pending_overload_policy="raise", # "raise", "drop_oldest", or "block"
166
+ )
167
+ ```
168
+
169
+ The built-in Redis path checks pending depth and enqueues in the same Lua script,
170
+ so concurrent publishers cannot race above the configured cap. Overload policies:
171
+
172
+ - `raise` raises `QueueBackpressureError` and leaves the pending list unchanged.
173
+ - `drop_oldest` removes the oldest pending message (`RPOP`) before enqueueing the
174
+ new message. This is silent data loss by design; deduplication markers for
175
+ dropped messages are not removed, so a republish of a dropped message may still be
176
+ suppressed until its dedup TTL expires.
177
+ - `block` retries the atomic check until space opens or
178
+ `pending_overload_block_timeout_seconds` elapses (default: 1.0), then raises
179
+ `QueueBackpressureError`.
180
+
181
+ These limits apply only to the pending list at publish time. They do not cap
182
+ messages already in `processing`, dead-letter queues, deduplication keys, or
183
+ replay metadata. `max_completed_length` and `max_failed_length` only bound the
184
+ completed/failed history lists. Size pending payload memory separately from the
185
+ dedup/replay metadata described in
186
+ [Redis memory sizing](#redis-memory-sizing-for-deduplication-and-replay-metadata).
187
+
188
+ When using `gateway=`, configure backpressure on the gateway directly, for
189
+ example `RedisGateway(redis_client=client, max_pending_length=100_000)`.
190
+
154
191
  ### Crash recovery with visibility timeout
155
192
 
156
193
  ```python
@@ -186,6 +223,43 @@ The callback is **advisory** — it may fire briefly after a successful `process
186
223
 
187
224
  Without a visibility timeout, messages already moved to `processing` remain there indefinitely after a consumer crash and are not redelivered, even if the crash happened before your handler started running.
188
225
 
226
+ ### Ordering and multi-consumer fairness
227
+
228
+ The built-in queue is a shared-pull Redis list. Successful publishes push to the
229
+ left side of the pending list, and claims pop from the right side, so Redis
230
+ grants claims in enqueue order in the no-failure path.
231
+
232
+ This is a claim-order guarantee only. It is not a completion-order guarantee:
233
+ multiple consumers process concurrently, handlers can run for different
234
+ durations, and younger messages can finish before older messages.
235
+
236
+ With `visibility_timeout_seconds` enabled, expired messages from `processing`
237
+ are reclaimed before fresh pending work on the next consumer poll. A reclaimed
238
+ message may be delivered after younger messages were already processed, and may
239
+ be processed concurrently with a stale original handler if that handler keeps
240
+ running after its lease expires.
241
+
242
+ Expired reclaims are ordered by lease deadline within one reclaim batch.
243
+ `CLAIM_MESSAGE_WITH_VISIBILITY_TIMEOUT_LUA_SCRIPT` selects expired leases with
244
+ `ZRANGEBYSCORE ... LIMIT 0, 100` to bound Redis Lua execution time. When more
245
+ than 100 messages expire together, the next poll can append a later reclaim
246
+ batch at the claimable end of the pending list ahead of leftovers from the
247
+ previous batch, so cross-batch redelivery order is not guaranteed.
248
+
249
+ `max_delivery_count` can skip over poison messages during a claim poll by moving
250
+ over-limit messages to the dead-letter queue and returning a later pending
251
+ message. Deduplication is publish-side only: duplicate publishes are not
252
+ enqueued and therefore do not occupy a queue position.
253
+
254
+ Handler exceptions do not trigger retries: the default behavior removes the message
255
+ from `processing`, or moves it to the failed queue when enabled. Redelivery is
256
+ for crash, stall, or stale-lease paths where cleanup does not complete.
257
+
258
+ Multiple consumers contend for the same queue. The next message goes to the
259
+ consumer whose claim request Redis executes next. There is no round-robin,
260
+ equal-share, or starvation-freedom guarantee; faster consumers can receive more
261
+ than 1/N of messages.
262
+
189
263
  ### Dead-letter queue
190
264
 
191
265
  ```python
@@ -230,6 +304,42 @@ while not interrupt.is_interrupted():
230
304
  > (for example, a second Ctrl+C raises `KeyboardInterrupt`). If you need multiple
231
305
  > shutdown hooks, use a single handler and fan out in your own code.
232
306
 
307
+ There are three distinct shutdown shapes; pick the one that matches your runtime:
308
+
309
+ | Shape | Trigger | In-flight handler | Pending claim IDs |
310
+ |---|---|---|---|
311
+ | **Flag-based soft drain** (`GracefulInterruptHandler`) | First SIGINT/SIGTERM flips a flag | Runs to completion | Drained on the next claim call, not on signal arrival |
312
+ | **Async task cancellation** (`asyncio.CancelledError`) | Framework cancels the worker task (Uvicorn/K8s SIGTERM in many setups) | **Hard abort** — message stays in `processing`; with VT it is reclaimed at deadline expiry, without VT it is orphaned | Not drained |
313
+ | **Explicit drain** (`drain()` / `aclose()`) | You call the method | Caller's responsibility to let it finish (drain does **not** cancel) | Drained synchronously via the gateway recovery path |
314
+
315
+ Use `drain()` / `aclose()` to bridge K8s `preStop` / SIGTERM grace windows without
316
+ relying on signal interception:
317
+
318
+ ```python
319
+ # sync — in your SIGTERM handler or preStop hook
320
+ queue.drain(timeout=25) # refuses new claims, recovers pending claim IDs
321
+ worker_thread.join() # wait for in-flight process_message to finish
322
+
323
+ # async — same shape
324
+ await queue.aclose(timeout=25)
325
+ await worker_task # task observes ``_draining`` and exits its loop
326
+ ```
327
+
328
+ `drain()` / `aclose()` set a queue-local flag so subsequent `process_message()`
329
+ calls yield `None` immediately. They do not cancel in-flight handlers — the
330
+ caller must arrange handler exit through normal thread/task coordination.
331
+ Both methods return `True` if all in-memory pending claim IDs were recovered within the
332
+ timeout; `False` if the deadline fired or transient Redis errors left claim
333
+ IDs pending (call again to retry). `timeout=0` reports current state without
334
+ attempting recovery.
335
+
336
+ > **Heartbeat caveat (best-effort stop):** when `heartbeat_interval_seconds` is
337
+ > set, the heartbeat sidecar's `stop()` is bounded but not strictly quiescent —
338
+ > a slow renewal in flight when `process_message` exits may still write to
339
+ > Redis after the caller believes shutdown is complete. The renewal is bounded
340
+ > by the configured visibility timeout and the lease token check on the Redis
341
+ > side, but plan for a small post-shutdown overlap rather than instant quiesce.
342
+
233
343
  ### Custom gateway
234
344
 
235
345
  ```python
@@ -250,12 +360,12 @@ queue = RedisMessageQueue("q", gateway=gateway)
250
360
 
251
361
  The retry knobs configure an internal `tenacity` strategy: exponential
252
362
  backoff with jitter, retry on transient Redis errors only, capped at
253
- `retry_budget_seconds`. The budget is wall-clock time from the first attempt (including attempt duration), not inter-attempt delay; a single attempt that takes longer than the budget results in zero retries. Setting `retry_budget_seconds=0` disables retry
363
+ `retry_budget_seconds`. The budget is monotonic elapsed time from the first attempt (including attempt duration), not inter-attempt delay; it is unaffected by Python-host NTP jumps. A single attempt that takes longer than the budget results in zero retries. Setting `retry_budget_seconds=0` disables retry
254
364
  entirely (single attempt; exceptions propagate). The library uses
255
365
  `retry_budget_seconds` to size the operation-result cache TTL automatically,
256
366
  so the previous footgun of an over-long retry budget out-living the cache
257
367
  and producing misleading "cleanup was a no-op" warnings is now structurally
258
- impossible. Note: tenacity may allow one additional attempt beyond the budget if the budget check passes at attempt start total wall-clock time can exceed `retry_budget_seconds` by the duration of that final attempt.
368
+ impossible. Note: tenacity may allow one additional attempt beyond the budget if the budget check passes at attempt start, so total monotonic elapsed time can exceed `retry_budget_seconds` by the duration of that final attempt.
259
369
 
260
370
  To plug in a different retry library (`backoff`, `asyncstdlib.retry`, or your
261
371
  own logic) or fundamentally different semantics, subclass
@@ -327,9 +437,126 @@ await client.aclose()
327
437
  For the sync Redis client, call `client.close()` during application shutdown when
328
438
  you own the client lifecycle.
329
439
 
440
+ ## Production notes
441
+
442
+ ### Fork safety and pre-fork servers
443
+
444
+ Construct Redis clients and `RedisMessageQueue` instances after a process forks.
445
+ This is the recommended pattern for `multiprocessing`, `ProcessPoolExecutor`,
446
+ and pre-fork servers such as gunicorn with `--preload`.
447
+
448
+ ```python
449
+ def worker_main():
450
+ client = redis.Redis()
451
+ queue = RedisMessageQueue("jobs", client=client)
452
+ ...
453
+ ```
454
+
455
+ Avoid constructing a queue/client in a parent process and then using that same
456
+ object in forked children, especially if the parent has already run any Redis
457
+ command. The queue stores the user-provided Redis client and process-local
458
+ claim-recovery state. Inherited Redis sockets can corrupt the Redis protocol if
459
+ two processes use the same file descriptor.
460
+
461
+ Notes:
462
+
463
+ - The sync redis-py pooled client attempts to reset its connection pool after
464
+ fork, but this does not apply to every client shape.
465
+ - The built-in sync gateway rejects `redis.Redis(single_connection_client=True)`
466
+ because that mode pins one socket instead of using the pool.
467
+ - Do not share `redis.asyncio.Redis` or async queues across fork; create or
468
+ reconnect them in the child process.
469
+ - If you use `GracefulInterruptHandler`, create it in the worker process after
470
+ fork so signal ownership is local to that worker.
471
+ - The heartbeat sidecar is lazy and starts only while processing a leased
472
+ message. Do not call `fork()` from inside active message handlers unless the
473
+ child exits without using the inherited queue/client.
474
+
475
+ ### Redis memory sizing for deduplication and replay metadata
476
+
477
+ When deduplication is enabled, each distinct dedup key creates one Redis string
478
+ for `message_deduplication_log_ttl_seconds` (default: 3600 seconds). The default
479
+ dedup key is a SHA-256 hash of the canonical message payload, so distinct
480
+ payloads are distinct keys. Size Redis for:
481
+
482
+ ```text
483
+ peak_unique_publish_rate_per_second
484
+ * message_deduplication_log_ttl_seconds
485
+ * bytes_per_dedup_key
486
+ ```
487
+
488
+ Use 200 bytes per dedup key as a conservative starting point for short queue
489
+ names, then validate with `MEMORY USAGE` in your Redis version. Example:
490
+ 1,000 unique messages/s * 3,600s * 200 B ~= 720 MB for dedup markers alone.
491
+ A 24h dedup window at the same rate is 86.4M keys, or roughly 17 GB before
492
+ message payload lists, lease metadata, completed/failed queues, and allocator
493
+ fragmentation.
494
+
495
+ Operation-result replay keys are normally deleted after a successful call, but
496
+ may live until their TTL after ambiguous connection drops or failed cleanup
497
+ deletes. With visibility timeouts, active claims also store replay metadata
498
+ until ack or reclaim. Without visibility timeouts, abandoned claims leave
499
+ `claim_result_ids` and `claim_result_backrefs` fields until the message is
500
+ acked or manually cleaned.
501
+
502
+ `max_completed_length` and `max_failed_length` only bound the completed/failed
503
+ lists. They do not bound deduplication keys or replay metadata.
504
+
505
+ Avoid sharing queue Redis DBs with unrelated high-cardinality workloads. If
506
+ idempotency matters, prefer explicit capacity planning and `noeviction` with
507
+ alerts over LRU/random eviction policies: evicting dedup/replay keys before
508
+ their TTL can weaken duplicate suppression and retry result replay.
509
+
510
+ ## Observability
511
+
512
+ Queue instances accept an optional `on_event` callback for metrics, tracing, or
513
+ structured logging. The sync queue expects a regular callable; the async queue
514
+ expects an async callable:
515
+
516
+ ```python
517
+ from redis_message_queue import QueueEvent, RedisMessageQueue
518
+
519
+ def on_event(event: QueueEvent) -> None:
520
+ ...
521
+
522
+ queue = RedisMessageQueue("jobs", client=client, on_event=on_event)
523
+ ```
524
+
525
+ Events cover publish, dedup hits, claim/empty polls, reclaim, ack/nack,
526
+ completed/failed cleanup, DLQ moves, heartbeat renewal, stale leases, cleanup
527
+ and trim failures, and retry attempts. Callback exceptions are logged and
528
+ reported with `RuntimeWarning`, but never propagate into queue operations.
529
+ Package logs remain diagnostic; use `on_event` rather than log parsing for
530
+ metrics.
531
+
532
+ ```python
533
+ from prometheus_client import Counter
534
+ from redis_message_queue import QueueEvent, RedisMessageQueue
535
+
536
+ events_total = Counter(
537
+ "rmq_events_total",
538
+ "redis-message-queue lifecycle events",
539
+ ["queue", "operation", "outcome", "exception_type"],
540
+ )
541
+
542
+ def observe(event: QueueEvent) -> None:
543
+ events_total.labels(
544
+ event.queue, event.operation, event.outcome, event.exception_type or ""
545
+ ).inc()
546
+
547
+ queue = RedisMessageQueue("jobs", client=client, on_event=observe)
548
+ ```
549
+
550
+ The public exception hierarchy is rooted at `RedisMessageQueueError`.
551
+ Invalid configuration values or combinations raise `ConfigurationError` (also a
552
+ `ValueError`), custom gateway contract violations raise `GatewayContractError`
553
+ (also a `TypeError`), and Lua `redis.error_reply(...)` failures raise
554
+ `LuaScriptError` (also a redis-py `ResponseError`). Publish overload raises
555
+ `QueueBackpressureError`. `CleanupFailedError` and `RetryBudgetExhaustedError`
556
+ are reserved categories for cleanup and retry surfaces.
557
+
330
558
  ## Known limitations
331
559
 
332
- - **No metrics or observability hooks.** The library logs warnings (stale leases, heartbeat failures, transient errors) via Python's `logging` module but does not expose callbacks, event hooks, or metric counters. To monitor queue health, inspect the underlying Redis keys directly or parse log output.
333
560
  - **Timed waits use polling claim loops.** To make claims recoverable after ambiguous connection drops, `wait_for_message_and_move()` uses idempotent Lua claim polling instead of raw blocking list-move commands. This adds a small polling cadence during timed waits.
334
561
  - **Redis Lua is atomic, not rollback-transactional.** The built-in scripts now preflight queue key types and fail closed on `WRONGTYPE` before mutating queue state, but Redis does not undo earlier writes if a later script command fails for another reason (for example `OOM` under severe memory pressure).
335
562
  - **Batch reclaim limit of 100.** The visibility-timeout reclaim Lua script processes at most 100 expired messages per consumer poll. Under extreme backlog this may delay recovery, but prevents any single poll from blocking Redis.
@@ -337,7 +564,7 @@ you own the client lifecycle.
337
564
  - **Cluster detection uses `isinstance(client, RedisCluster)`.** Wrapped or instrumented cluster clients that delegate without inheriting will bypass hash-tag validation. Custom gateways should set `is_redis_cluster = True` explicitly.
338
565
  - **Redis Cluster requires hash tags.** The built-in queue uses multiple Redis keys per operation. Wrap the queue name in hash tags (for example `{myqueue}`) so every generated key lands in the same slot. When you pass a Redis Cluster client to the built-in queue/gateway path, incompatible names are rejected early.
339
566
  - **Non-ASCII payloads use ~2x storage.** The default `ensure_ascii=True` in JSON serialization encodes non-ASCII characters as `\uXXXX` escape sequences. This is a deliberate compatibility choice.
340
- - **Client-side `Retry` can duplicate non-deduplicated publishes.** If you construct your `redis.Redis` client with `retry=Retry(...)`, redis-py retries `ConnectionError` / `TimeoutError` at the connection layer — *below* this library. Idempotent operations (deduplicated `publish()`, lease-scoped cleanup) are safe because their Lua scripts replay the original result. `add_message()` (used by `publish()` when `deduplication=False`) is a bare `LPUSH`: this library deliberately does not retry it, but a client-level `Retry` will, and if the server executed the command before the response was lost the message is enqueued twice. Leave `retry=None` (the default) if you need strict at-most-once semantics for non-deduplicated publishes, or accept the duplication risk. More broadly, any non-idempotent `LPUSH` path is vulnerable if the connection drops after server execution but before the client receives the response; all other built-in operations (deduplicated publish, lease-scoped ack/move, lease renewal) use replay markers and are safe under client-level `Retry`.
567
+ - **Client-side `Retry` can duplicate non-deduplicated publishes.** If you construct your `redis.Redis` client with `retry=Retry(...)`, redis-py retries `ConnectionError` / `TimeoutError` at the connection layer — *below* this library. Idempotent operations (deduplicated `publish()`, lease-scoped cleanup) are safe because their Lua scripts replay the original result. `add_message()` (used by `publish()` when `deduplication=False`) is a bare `LPUSH` by default, or a single non-idempotent Lua enqueue when `max_pending_length` is set: this library deliberately does not retry it, but a client-level `Retry` will, and if the server executed the command before the response was lost the message is enqueued twice. Leave `retry=None` (the default) if you need strict at-most-once semantics for non-deduplicated publishes, or accept the duplication risk. More broadly, any non-idempotent enqueue path is vulnerable if the connection drops after server execution but before the client receives the response; all other built-in operations (deduplicated publish, lease-scoped ack/move, lease renewal) use replay markers and are safe under client-level `Retry`.
341
568
  - **Redis Cluster default retry can stack with this library's retry budget.** In redis-py 6.0+, `RedisCluster()` constructs a default `ExponentialWithJitterBackoff` retry below this library's `retry_budget_seconds`. If you need a single retry surface, pass `retry=Retry(NoBackoff(), 0)` to the cluster client or reduce `retry_budget_seconds` to account for the lower-level retry window.
342
569
 
343
570
  For a full analysis, see [docs/production-readiness.md](docs/production-readiness.md).
@@ -1,6 +1,6 @@
1
1
  # redis-message-queue
2
2
 
3
- [![PyPI Version](https://img.shields.io/badge/v5.0.0-version?color=43cd0f&style=flat&label=pypi)](https://pypi.org/project/redis-message-queue)
3
+ [![PyPI Version](https://img.shields.io/badge/v6.0.0-version?color=43cd0f&style=flat&label=pypi)](https://pypi.org/project/redis-message-queue)
4
4
  [![PyPI Downloads](https://img.shields.io/pypi/dm/redis-message-queue?color=43cd0f&style=flat&label=downloads)](https://pypistats.org/packages/redis-message-queue)
5
5
  [![License: MIT](https://img.shields.io/badge/License-MIT-43cd0f.svg?style=flat&label=license)](LICENSE)
6
6
  [![Maintained: yes](https://img.shields.io/badge/yes-43cd0f.svg?style=flat&label=maintained)](https://github.com/Elijas/redis-message-queue/issues)
@@ -11,7 +11,7 @@
11
11
  **Lightweight Python message queuing with Redis and built-in publish-side deduplication.** Deduplicate publishes within a TTL window, with optional crash recovery — across any number of producers and consumers.
12
12
 
13
13
  ```bash
14
- pip install "redis-message-queue>=3.0.0,<4.0.0"
14
+ pip install "redis-message-queue>=6.0.0,<7.0.0"
15
15
  ```
16
16
 
17
17
  Requires Redis server >= 6.2.
@@ -125,6 +125,43 @@ When set, `LTRIM` is called after each message is moved to the completed/failed
125
125
  Pass `max_completed_length=None` or `max_failed_length=None` explicitly if you
126
126
  want unbounded tracking queues.
127
127
 
128
+ ### Publish backpressure
129
+
130
+ By default, the pending queue is unbounded (`max_pending_length=None`), matching
131
+ the v5 behavior. Set `max_pending_length` when producers can outrun consumers
132
+ and publishing must fail closed before Redis memory is exhausted:
133
+
134
+ ```python
135
+ queue = RedisMessageQueue(
136
+ "q",
137
+ client=client,
138
+ max_pending_length=100_000,
139
+ pending_overload_policy="raise", # "raise", "drop_oldest", or "block"
140
+ )
141
+ ```
142
+
143
+ The built-in Redis path checks pending depth and enqueues in the same Lua script,
144
+ so concurrent publishers cannot race above the configured cap. Overload policies:
145
+
146
+ - `raise` raises `QueueBackpressureError` and leaves the pending list unchanged.
147
+ - `drop_oldest` removes the oldest pending message (`RPOP`) before enqueueing the
148
+ new message. This is silent data loss by design; deduplication markers for
149
+ dropped messages are not removed, so a republish of a dropped message may still be
150
+ suppressed until its dedup TTL expires.
151
+ - `block` retries the atomic check until space opens or
152
+ `pending_overload_block_timeout_seconds` elapses (default: 1.0), then raises
153
+ `QueueBackpressureError`.
154
+
155
+ These limits apply only to the pending list at publish time. They do not cap
156
+ messages already in `processing`, dead-letter queues, deduplication keys, or
157
+ replay metadata. `max_completed_length` and `max_failed_length` only bound the
158
+ completed/failed history lists. Size pending payload memory separately from the
159
+ dedup/replay metadata described in
160
+ [Redis memory sizing](#redis-memory-sizing-for-deduplication-and-replay-metadata).
161
+
162
+ When using `gateway=`, configure backpressure on the gateway directly, for
163
+ example `RedisGateway(redis_client=client, max_pending_length=100_000)`.
164
+
128
165
  ### Crash recovery with visibility timeout
129
166
 
130
167
  ```python
@@ -160,6 +197,43 @@ The callback is **advisory** — it may fire briefly after a successful `process
160
197
 
161
198
  Without a visibility timeout, messages already moved to `processing` remain there indefinitely after a consumer crash and are not redelivered, even if the crash happened before your handler started running.
162
199
 
200
+ ### Ordering and multi-consumer fairness
201
+
202
+ The built-in queue is a shared-pull Redis list. Successful publishes push to the
203
+ left side of the pending list, and claims pop from the right side, so Redis
204
+ grants claims in enqueue order in the no-failure path.
205
+
206
+ This is a claim-order guarantee only. It is not a completion-order guarantee:
207
+ multiple consumers process concurrently, handlers can run for different
208
+ durations, and younger messages can finish before older messages.
209
+
210
+ With `visibility_timeout_seconds` enabled, expired messages from `processing`
211
+ are reclaimed before fresh pending work on the next consumer poll. A reclaimed
212
+ message may be delivered after younger messages were already processed, and may
213
+ be processed concurrently with a stale original handler if that handler keeps
214
+ running after its lease expires.
215
+
216
+ Expired reclaims are ordered by lease deadline within one reclaim batch.
217
+ `CLAIM_MESSAGE_WITH_VISIBILITY_TIMEOUT_LUA_SCRIPT` selects expired leases with
218
+ `ZRANGEBYSCORE ... LIMIT 0, 100` to bound Redis Lua execution time. When more
219
+ than 100 messages expire together, the next poll can append a later reclaim
220
+ batch at the claimable end of the pending list ahead of leftovers from the
221
+ previous batch, so cross-batch redelivery order is not guaranteed.
222
+
223
+ `max_delivery_count` can skip over poison messages during a claim poll by moving
224
+ over-limit messages to the dead-letter queue and returning a later pending
225
+ message. Deduplication is publish-side only: duplicate publishes are not
226
+ enqueued and therefore do not occupy a queue position.
227
+
228
+ Handler exceptions do not trigger retries: the default behavior removes the message
229
+ from `processing`, or moves it to the failed queue when enabled. Redelivery is
230
+ for crash, stall, or stale-lease paths where cleanup does not complete.
231
+
232
+ Multiple consumers contend for the same queue. The next message goes to the
233
+ consumer whose claim request Redis executes next. There is no round-robin,
234
+ equal-share, or starvation-freedom guarantee; faster consumers can receive more
235
+ than 1/N of messages.
236
+
163
237
  ### Dead-letter queue
164
238
 
165
239
  ```python
@@ -204,6 +278,42 @@ while not interrupt.is_interrupted():
204
278
  > (for example, a second Ctrl+C raises `KeyboardInterrupt`). If you need multiple
205
279
  > shutdown hooks, use a single handler and fan out in your own code.
206
280
 
281
+ There are three distinct shutdown shapes; pick the one that matches your runtime:
282
+
283
+ | Shape | Trigger | In-flight handler | Pending claim IDs |
284
+ |---|---|---|---|
285
+ | **Flag-based soft drain** (`GracefulInterruptHandler`) | First SIGINT/SIGTERM flips a flag | Runs to completion | Drained on the next claim call, not on signal arrival |
286
+ | **Async task cancellation** (`asyncio.CancelledError`) | Framework cancels the worker task (Uvicorn/K8s SIGTERM in many setups) | **Hard abort** — message stays in `processing`; with VT it is reclaimed at deadline expiry, without VT it is orphaned | Not drained |
287
+ | **Explicit drain** (`drain()` / `aclose()`) | You call the method | Caller's responsibility to let it finish (drain does **not** cancel) | Drained synchronously via the gateway recovery path |
288
+
289
+ Use `drain()` / `aclose()` to bridge K8s `preStop` / SIGTERM grace windows without
290
+ relying on signal interception:
291
+
292
+ ```python
293
+ # sync — in your SIGTERM handler or preStop hook
294
+ queue.drain(timeout=25) # refuses new claims, recovers pending claim IDs
295
+ worker_thread.join() # wait for in-flight process_message to finish
296
+
297
+ # async — same shape
298
+ await queue.aclose(timeout=25)
299
+ await worker_task # task observes ``_draining`` and exits its loop
300
+ ```
301
+
302
+ `drain()` / `aclose()` set a queue-local flag so subsequent `process_message()`
303
+ calls yield `None` immediately. They do not cancel in-flight handlers — the
304
+ caller must arrange handler exit through normal thread/task coordination.
305
+ Both methods return `True` if all in-memory pending claim IDs were recovered within the
306
+ timeout; `False` if the deadline fired or transient Redis errors left claim
307
+ IDs pending (call again to retry). `timeout=0` reports current state without
308
+ attempting recovery.
309
+
310
+ > **Heartbeat caveat (best-effort stop):** when `heartbeat_interval_seconds` is
311
+ > set, the heartbeat sidecar's `stop()` is bounded but not strictly quiescent —
312
+ > a slow renewal in flight when `process_message` exits may still write to
313
+ > Redis after the caller believes shutdown is complete. The renewal is bounded
314
+ > by the configured visibility timeout and the lease token check on the Redis
315
+ > side, but plan for a small post-shutdown overlap rather than instant quiesce.
316
+
207
317
  ### Custom gateway
208
318
 
209
319
  ```python
@@ -224,12 +334,12 @@ queue = RedisMessageQueue("q", gateway=gateway)
224
334
 
225
335
  The retry knobs configure an internal `tenacity` strategy: exponential
226
336
  backoff with jitter, retry on transient Redis errors only, capped at
227
- `retry_budget_seconds`. The budget is wall-clock time from the first attempt (including attempt duration), not inter-attempt delay; a single attempt that takes longer than the budget results in zero retries. Setting `retry_budget_seconds=0` disables retry
337
+ `retry_budget_seconds`. The budget is monotonic elapsed time from the first attempt (including attempt duration), not inter-attempt delay; it is unaffected by Python-host NTP jumps. A single attempt that takes longer than the budget results in zero retries. Setting `retry_budget_seconds=0` disables retry
228
338
  entirely (single attempt; exceptions propagate). The library uses
229
339
  `retry_budget_seconds` to size the operation-result cache TTL automatically,
230
340
  so the previous footgun of an over-long retry budget out-living the cache
231
341
  and producing misleading "cleanup was a no-op" warnings is now structurally
232
- impossible. Note: tenacity may allow one additional attempt beyond the budget if the budget check passes at attempt start total wall-clock time can exceed `retry_budget_seconds` by the duration of that final attempt.
342
+ impossible. Note: tenacity may allow one additional attempt beyond the budget if the budget check passes at attempt start, so total monotonic elapsed time can exceed `retry_budget_seconds` by the duration of that final attempt.
233
343
 
234
344
  To plug in a different retry library (`backoff`, `asyncstdlib.retry`, or your
235
345
  own logic) or fundamentally different semantics, subclass
@@ -301,9 +411,126 @@ await client.aclose()
301
411
  For the sync Redis client, call `client.close()` during application shutdown when
302
412
  you own the client lifecycle.
303
413
 
414
+ ## Production notes
415
+
416
+ ### Fork safety and pre-fork servers
417
+
418
+ Construct Redis clients and `RedisMessageQueue` instances in the child process, after it forks.
419
+ This is the recommended pattern for `multiprocessing`, `ProcessPoolExecutor`,
420
+ and pre-fork servers such as gunicorn with `--preload`.
421
+
422
+ ```python
423
+ def worker_main():
424
+ client = redis.Redis()
425
+ queue = RedisMessageQueue("jobs", client=client)
426
+ ...
427
+ ```
428
+
429
+ Avoid constructing a queue/client in a parent process and then using that same
430
+ object in forked children, especially if the parent has already run any Redis
431
+ command. The queue stores the user-provided Redis client and process-local
432
+ claim-recovery state. Inherited Redis sockets can corrupt the Redis protocol if
433
+ two processes use the same file descriptor.
434
+
435
+ Notes:
436
+
437
+ - The sync redis-py pooled client attempts to reset its connection pool after
438
+ fork, but this does not apply to every client shape.
439
+ - The built-in sync gateway rejects `redis.Redis(single_connection_client=True)`
440
+ because that mode pins one socket instead of using the pool.
441
+ - Do not share `redis.asyncio.Redis` or async queues across fork; create or
442
+ reconnect them in the child process.
443
+ - If you use `GracefulInterruptHandler`, create it in the worker process after
444
+ fork so signal ownership is local to that worker.
445
+ - The heartbeat sidecar is lazy and starts only while processing a leased
446
+ message. Do not call `fork()` from inside active message handlers unless the
447
+ child exits without using the inherited queue/client.
448
+
449
+ ### Redis memory sizing for deduplication and replay metadata
450
+
451
+ When deduplication is enabled, each distinct dedup key creates one Redis string
452
+ for `message_deduplication_log_ttl_seconds` (default: 3600 seconds). The default
453
+ dedup key is a SHA-256 hash of the canonical message payload, so distinct
454
+ payloads are distinct keys. Size Redis for:
455
+
456
+ ```text
457
+ peak_unique_publish_rate_per_second
458
+ * message_deduplication_log_ttl_seconds
459
+ * bytes_per_dedup_key
460
+ ```
461
+
462
+ Use 200 bytes per dedup key as a conservative starting point for short queue
463
+ names, then validate with `MEMORY USAGE` in your Redis version. Example:
464
+ 1,000 unique messages/s * 3,600s * 200 B ~= 720 MB for dedup markers alone.
465
+ A 24h dedup window at the same rate is 86.4M keys, or roughly 17 GB before
466
+ message payload lists, lease metadata, completed/failed queues, and allocator
467
+ fragmentation.
468
+
469
+ Operation-result replay keys are normally deleted after a successful call, but
470
+ may live until their TTL after ambiguous connection drops or failed cleanup
471
+ deletes. With visibility timeouts, active claims also store replay metadata
472
+ until ack or reclaim. Without visibility timeouts, abandoned claims leave
473
+ `claim_result_ids` and `claim_result_backrefs` fields until the message is
474
+ acked or manually cleaned.
475
+
476
+ `max_completed_length` and `max_failed_length` only bound the completed/failed
477
+ lists. They do not bound deduplication keys or replay metadata.
478
+
479
+ Avoid sharing queue Redis DBs with unrelated high-cardinality workloads. If
480
+ idempotency matters, prefer explicit capacity planning and `noeviction` with
481
+ alerts over LRU/random eviction policies: evicting dedup/replay keys before
482
+ their TTL can weaken duplicate suppression and retry result replay.
483
+
484
+ ## Observability
485
+
486
+ Queue instances accept an optional `on_event` callback for metrics, tracing, or
487
+ structured logging. The sync queue expects a regular callable; the async queue
488
+ expects an async callable:
489
+
490
+ ```python
491
+ from redis_message_queue import QueueEvent, RedisMessageQueue
492
+
493
+ def on_event(event: QueueEvent) -> None:
494
+ ...
495
+
496
+ queue = RedisMessageQueue("jobs", client=client, on_event=on_event)
497
+ ```
498
+
499
+ Events cover publish, dedup hits, claim/empty polls, reclaim, ack/nack,
500
+ completed/failed cleanup, DLQ moves, heartbeat renewal, stale leases, cleanup
501
+ and trim failures, and retry attempts. Callback exceptions are logged and
502
+ reported with `RuntimeWarning`, but never propagate into queue operations.
503
+ Package logs remain diagnostic; use `on_event` rather than log parsing for
504
+ metrics.
505
+
506
+ ```python
507
+ from prometheus_client import Counter
508
+ from redis_message_queue import QueueEvent, RedisMessageQueue
509
+
510
+ events_total = Counter(
511
+ "rmq_events_total",
512
+ "redis-message-queue lifecycle events",
513
+ ["queue", "operation", "outcome", "exception_type"],
514
+ )
515
+
516
+ def observe(event: QueueEvent) -> None:
517
+ events_total.labels(
518
+ event.queue, event.operation, event.outcome, event.exception_type or ""
519
+ ).inc()
520
+
521
+ queue = RedisMessageQueue("jobs", client=client, on_event=observe)
522
+ ```
523
+
524
+ The public exception hierarchy is rooted at `RedisMessageQueueError`.
525
+ Invalid configuration values or combinations raise `ConfigurationError` (also a
526
+ `ValueError`), custom gateway contract violations raise `GatewayContractError`
527
+ (also a `TypeError`), and Lua `redis.error_reply(...)` failures raise
528
+ `LuaScriptError` (also a redis-py `ResponseError`). Publish overload raises
529
+ `QueueBackpressureError`. `CleanupFailedError` and `RetryBudgetExhaustedError`
530
+ are reserved categories for cleanup and retry surfaces.
531
+
304
532
  ## Known limitations
305
533
 
306
- - **No metrics or observability hooks.** The library logs warnings (stale leases, heartbeat failures, transient errors) via Python's `logging` module but does not expose callbacks, event hooks, or metric counters. To monitor queue health, inspect the underlying Redis keys directly or parse log output.
307
534
  - **Timed waits use polling claim loops.** To make claims recoverable after ambiguous connection drops, `wait_for_message_and_move()` uses idempotent Lua claim polling instead of raw blocking list-move commands. This adds a small polling cadence during timed waits.
308
535
  - **Redis Lua is atomic, not rollback-transactional.** The built-in scripts now preflight queue key types and fail closed on `WRONGTYPE` before mutating queue state, but Redis does not undo earlier writes if a later script command fails for another reason (for example `OOM` under severe memory pressure).
309
536
  - **Batch reclaim limit of 100.** The visibility-timeout reclaim Lua script processes at most 100 expired messages per consumer poll. Under extreme backlog this may delay recovery, but prevents any single poll from blocking Redis.
@@ -311,7 +538,7 @@ you own the client lifecycle.
311
538
  - **Cluster detection uses `isinstance(client, RedisCluster)`.** Wrapped or instrumented cluster clients that delegate without inheriting will bypass hash-tag validation. Custom gateways should set `is_redis_cluster = True` explicitly.
312
539
  - **Redis Cluster requires hash tags.** The built-in queue uses multiple Redis keys per operation. Wrap the queue name in hash tags (for example `{myqueue}`) so every generated key lands in the same slot. When you pass a Redis Cluster client to the built-in queue/gateway path, incompatible names are rejected early.
313
540
  - **Non-ASCII payloads use ~2x storage.** The default `ensure_ascii=True` in JSON serialization encodes non-ASCII characters as `\uXXXX` escape sequences. This is a deliberate compatibility choice.
314
- - **Client-side `Retry` can duplicate non-deduplicated publishes.** If you construct your `redis.Redis` client with `retry=Retry(...)`, redis-py retries `ConnectionError` / `TimeoutError` at the connection layer — *below* this library. Idempotent operations (deduplicated `publish()`, lease-scoped cleanup) are safe because their Lua scripts replay the original result. `add_message()` (used by `publish()` when `deduplication=False`) is a bare `LPUSH`: this library deliberately does not retry it, but a client-level `Retry` will, and if the server executed the command before the response was lost the message is enqueued twice. Leave `retry=None` (the default) if you need strict at-most-once semantics for non-deduplicated publishes, or accept the duplication risk. More broadly, any non-idempotent `LPUSH` path is vulnerable if the connection drops after server execution but before the client receives the response; all other built-in operations (deduplicated publish, lease-scoped ack/move, lease renewal) use replay markers and are safe under client-level `Retry`.
541
+ - **Client-side `Retry` can duplicate non-deduplicated publishes.** If you construct your `redis.Redis` client with `retry=Retry(...)`, redis-py retries `ConnectionError` / `TimeoutError` at the connection layer — *below* this library. Idempotent operations (deduplicated `publish()`, lease-scoped cleanup) are safe because their Lua scripts replay the original result. `add_message()` (used by `publish()` when `deduplication=False`) is a bare `LPUSH` by default, or a single non-idempotent Lua enqueue when `max_pending_length` is set: this library deliberately does not retry it, but a client-level `Retry` will, and if the server executed the command before the response was lost the message is enqueued twice. Leave `retry=None` (the default) if you need strict at-most-once semantics for non-deduplicated publishes, or accept the duplication risk. More broadly, any non-idempotent enqueue path is vulnerable if the connection drops after server execution but before the client receives the response; all other built-in operations (deduplicated publish, lease-scoped ack/move, lease renewal) use replay markers and are safe under client-level `Retry`.
315
542
  - **Redis Cluster default retry can stack with this library's retry budget.** In redis-py 6.0+, `RedisCluster()` constructs a default `ExponentialWithJitterBackoff` retry below this library's `retry_budget_seconds`. If you need a single retry surface, pass `retry=Retry(NoBackoff(), 0)` to the cluster client or reduce `retry_budget_seconds` to account for the lower-level retry window.
316
543
 
317
544
  For a full analysis, see [docs/production-readiness.md](docs/production-readiness.md).
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "redis-message-queue"
3
- version = "5.0.0"
3
+ version = "6.0.0"
4
4
  description = "Python message queuing with Redis and message deduplication"
5
5
  authors = ["Elijas <4084885+Elijas@users.noreply.github.com>"]
6
6
  readme = "README.md"
@@ -1,4 +1,14 @@
1
1
  from redis_message_queue._abstract_redis_gateway import AbstractRedisGateway
2
+ from redis_message_queue._event import EventOperation, EventOutcome, QueueEvent
3
+ from redis_message_queue._exceptions import (
4
+ CleanupFailedError,
5
+ ConfigurationError,
6
+ GatewayContractError,
7
+ LuaScriptError,
8
+ QueueBackpressureError,
9
+ RedisMessageQueueError,
10
+ RetryBudgetExhaustedError,
11
+ )
2
12
  from redis_message_queue._redis_gateway import RedisGateway
3
13
  from redis_message_queue._stored_message import ClaimedMessage, MessageData
4
14
  from redis_message_queue.interrupt_handler import (
@@ -15,4 +25,14 @@ __all__ = [
15
25
  "MessageData",
16
26
  "GracefulInterruptHandler",
17
27
  "BaseGracefulInterruptHandler",
28
+ "QueueEvent",
29
+ "EventOperation",
30
+ "EventOutcome",
31
+ "RedisMessageQueueError",
32
+ "ConfigurationError",
33
+ "GatewayContractError",
34
+ "LuaScriptError",
35
+ "QueueBackpressureError",
36
+ "CleanupFailedError",
37
+ "RetryBudgetExhaustedError",
18
38
  ]