nodus-queue 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nodus_queue/__init__.py +58 -0
- nodus_queue/backends.py +776 -0
- nodus_queue/metrics.py +52 -0
- nodus_queue/payload.py +58 -0
- nodus_queue/queue.py +242 -0
- nodus_queue-0.1.0.dist-info/METADATA +136 -0
- nodus_queue-0.1.0.dist-info/RECORD +10 -0
- nodus_queue-0.1.0.dist-info/WHEEL +5 -0
- nodus_queue-0.1.0.dist-info/licenses/LICENSE +21 -0
- nodus_queue-0.1.0.dist-info/top_level.txt +1 -0
nodus_queue/__init__.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""nodus-queue — distributed job queue with DLQ, delayed jobs, and in-flight tracking.
|
|
2
|
+
|
|
3
|
+
Backends:
|
|
4
|
+
RedisQueueBackend — LPUSH/BRPOP; single-consumer atomic; Lua-script capacity guard
|
|
5
|
+
InMemoryQueueBackend — thread-safe; Timer-based delayed enqueue; for tests and dev
|
|
6
|
+
|
|
7
|
+
Payload:
|
|
8
|
+
QueueJobPayload — serialisable job envelope with idempotency key
|
|
9
|
+
|
|
10
|
+
Metrics hook:
|
|
11
|
+
QueueMetrics — optional noop base class; subclass to wire Prometheus
|
|
12
|
+
|
|
13
|
+
Errors:
|
|
14
|
+
QueueSaturatedError — raised when the queue rejects work at capacity
|
|
15
|
+
|
|
16
|
+
Factory:
|
|
17
|
+
get_queue() — return the singleton backend (Redis or in-memory fallback)
|
|
18
|
+
reset_queue() — reset singleton for test isolation
|
|
19
|
+
validate_queue_backend() — fail fast if backend is unavailable
|
|
20
|
+
get_queue_health_snapshot() — health dict for monitoring
|
|
21
|
+
attempt_queue_backend_reconnect() — try to restore Redis after degraded fallback
|
|
22
|
+
"""
|
|
23
|
+
from .backends import (
|
|
24
|
+
QUEUE_NAME_DEFAULT,
|
|
25
|
+
DistributedQueueBackend,
|
|
26
|
+
InMemoryQueueBackend,
|
|
27
|
+
QueueSaturatedError,
|
|
28
|
+
RedisQueueBackend,
|
|
29
|
+
)
|
|
30
|
+
from .metrics import QueueMetrics
|
|
31
|
+
from .payload import QueueJobPayload
|
|
32
|
+
from .queue import (
|
|
33
|
+
attempt_queue_backend_reconnect,
|
|
34
|
+
get_queue,
|
|
35
|
+
get_queue_health_snapshot,
|
|
36
|
+
reset_queue,
|
|
37
|
+
validate_queue_backend,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
__all__ = [
|
|
41
|
+
# Backends
|
|
42
|
+
"DistributedQueueBackend",
|
|
43
|
+
"InMemoryQueueBackend",
|
|
44
|
+
"RedisQueueBackend",
|
|
45
|
+
"QUEUE_NAME_DEFAULT",
|
|
46
|
+
# Payload
|
|
47
|
+
"QueueJobPayload",
|
|
48
|
+
# Metrics
|
|
49
|
+
"QueueMetrics",
|
|
50
|
+
# Errors
|
|
51
|
+
"QueueSaturatedError",
|
|
52
|
+
# Factory
|
|
53
|
+
"get_queue",
|
|
54
|
+
"reset_queue",
|
|
55
|
+
"validate_queue_backend",
|
|
56
|
+
"get_queue_health_snapshot",
|
|
57
|
+
"attempt_queue_backend_reconnect",
|
|
58
|
+
]
|
nodus_queue/backends.py
ADDED
|
@@ -0,0 +1,776 @@
|
|
|
1
|
+
"""DistributedQueueBackend, RedisQueueBackend, InMemoryQueueBackend."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
import logging
|
|
6
|
+
import os
|
|
7
|
+
import queue
|
|
8
|
+
import threading
|
|
9
|
+
import time
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from typing import Optional
|
|
13
|
+
|
|
14
|
+
import tenacity
|
|
15
|
+
|
|
16
|
+
from .metrics import QueueMetrics
|
|
17
|
+
from .payload import QueueJobPayload
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
QUEUE_NAME_DEFAULT = "nodus:jobs"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
# ---------------------------------------------------------------------------
|
|
25
|
+
# Saturated error
|
|
26
|
+
# ---------------------------------------------------------------------------
|
|
27
|
+
|
|
28
|
+
class QueueSaturatedError(RuntimeError):
    """Signal that the queue refused a job because it is at capacity.

    Attributes:
        status_code: Status code carried with the error (503 unless overridden).
        retry_after_seconds: Suggested wait before retrying (5 unless overridden).
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: int = 503,
        retry_after_seconds: int = 5,
    ) -> None:
        """Store *message* on the exception and attach the two hint attributes."""
        RuntimeError.__init__(self, message)
        self.status_code, self.retry_after_seconds = status_code, retry_after_seconds
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ---------------------------------------------------------------------------
|
|
44
|
+
# Capacity helpers (env-var based — no config object dependency)
|
|
45
|
+
# ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
def _queue_capacity_limit() -> int:
    """Resolve the queue capacity from the environment.

    The variables ``NODUS_QUEUE_MAXSIZE``, ``MAX_QUEUE_SIZE`` and
    ``AINDY_ASYNC_QUEUE_MAXSIZE`` are consulted in that order. The first one
    that is set *and* parses as an integer wins (clamped to at least 1); a set
    but unparseable value is skipped in favour of the next name.

    Returns:
        The configured capacity, or 100 when no variable yields an integer.
    """
    candidates = ("NODUS_QUEUE_MAXSIZE", "MAX_QUEUE_SIZE", "AINDY_ASYNC_QUEUE_MAXSIZE")
    for env_name in candidates:
        value = os.getenv(env_name)
        if value is None:
            continue
        try:
            parsed = int(value)
        except (TypeError, ValueError):
            # Malformed value: fall through to the next candidate variable.
            continue
        return max(1, parsed)
    return 100
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _saturation_threshold() -> int:
    """Resolve the saturation threshold from the environment.

    Reads ``NODUS_QUEUE_SATURATION_THRESHOLD``. A parseable value is clamped
    into the range ``1 .. capacity``; an unset, empty or unparseable value
    yields the capacity itself.

    Returns:
        The threshold, never above ``_queue_capacity_limit()``.
    """
    capacity = _queue_capacity_limit()
    configured = os.getenv("NODUS_QUEUE_SATURATION_THRESHOLD", "")
    if not configured:
        return capacity
    try:
        requested = int(configured)
    except (TypeError, ValueError):
        return capacity
    return max(1, min(requested, capacity))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
# ---------------------------------------------------------------------------
|
|
70
|
+
# Redis retry decorator
|
|
71
|
+
# ---------------------------------------------------------------------------
|
|
72
|
+
|
|
73
|
+
def _log_redis_retry(retry_state: tenacity.RetryCallState) -> None:
    """Log a warning for a retry whose previous attempt raised.

    Wired in as the ``before_sleep`` hook of the retry policy. Nothing is
    logged when the state carries no outcome or the outcome holds no exception.
    """
    outcome = retry_state.outcome
    error = outcome.exception() if outcome is not None else None
    if error is None:
        return
    logger.warning(
        "RedisQueueBackend: retry attempt=%s exception=%s",
        retry_state.attempt_number,
        error,
    )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def _redis_retry():
    """Build the tenacity decorator used for transient Redis failures.

    Retries only on ``redis.ConnectionError``, ``redis.TimeoutError`` and
    ``redis.BusyLoadingError``; stops after three attempts, waits with an
    exponential backoff bounded to 0.1–2.0 seconds, logs before each sleep, and
    re-raises the last exception once attempts are exhausted.
    """
    import redis  # noqa: PLC0415

    transient_errors = (
        redis.ConnectionError,
        redis.TimeoutError,
        redis.BusyLoadingError,
    )
    backoff = tenacity.wait_exponential(multiplier=2, min=0.1, max=2.0)
    give_up = tenacity.stop_after_attempt(3)
    return tenacity.retry(
        retry=tenacity.retry_if_exception_type(transient_errors),
        wait=backoff,
        stop=give_up,
        before_sleep=_log_redis_retry,
        reraise=True,
    )
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# ---------------------------------------------------------------------------
|
|
102
|
+
# Abstract interface
|
|
103
|
+
# ---------------------------------------------------------------------------
|
|
104
|
+
|
|
105
|
+
class DistributedQueueBackend(ABC):
    """Contract every queue transport must satisfy.

    Subclasses must implement ``enqueue``, ``dequeue``, ``ack``, ``fail`` and
    ``get_dlq_depth``. The remaining methods ship with inert defaults and may
    be overridden: ``enqueue_delayed``, ``process_delayed_jobs``,
    ``requeue_stale_jobs``, ``get_metrics``, ``assert_ready``,
    ``remove_dead_letter``, ``drain_dead_letters``.
    """

    # ── Required operations ────────────────────────────────────────────────

    @abstractmethod
    def enqueue(self, payload: QueueJobPayload) -> None:
        """Add *payload* to the queue."""

    @abstractmethod
    def dequeue(self, timeout: int = 5) -> Optional[QueueJobPayload]:
        """Wait up to *timeout* seconds for a job and return it.

        Returns ``None`` if nothing arrived in time. Implementations record
        the returned job as in-flight so ``requeue_stale_jobs`` can recover it
        if the worker dies before acknowledging.
        """

    @abstractmethod
    def ack(self, job_id: str) -> None:
        """Record successful completion and drop the job from in-flight."""

    @abstractmethod
    def fail(self, job_id: str, error: str = "") -> None:
        """Record terminal failure and move the job to the Dead Letter Queue."""

    @abstractmethod
    def get_dlq_depth(self) -> int:
        """Return how many jobs are currently dead-lettered."""

    # ── Optional operations (inert defaults) ───────────────────────────────

    def remove_dead_letter(self, job_id: str) -> bool:
        """Delete one dead-lettered job by id. Default: removes nothing."""
        return False

    def drain_dead_letters(self) -> int:
        """Delete every dead-lettered job. Default: removes nothing, returns 0."""
        return 0

    def enqueue_delayed(self, payload: QueueJobPayload, delay_seconds: float) -> None:
        """Schedule *payload* for later. Default: ignore the delay and enqueue now."""
        self.enqueue(payload)

    def process_delayed_jobs(self) -> int:
        """Promote delayed jobs that are due. Default: nothing to do, returns 0."""
        return 0

    def requeue_stale_jobs(self, timeout_seconds: int = 300) -> int:
        """Recover in-flight jobs older than *timeout_seconds*. Default: returns 0."""
        return 0

    def get_metrics(self) -> dict:
        """Return a metrics dict; the default reports zero for every counter."""
        snapshot = {
            "queue_depth": 0,
            "in_flight_count": 0,
            "failed_jobs": 0,
            "delayed_jobs": 0,
            "dlq_depth": 0,
            "max_queue_size": _queue_capacity_limit(),
            "total_pending_jobs": 0,
            "saturation_threshold": _saturation_threshold(),
        }
        return snapshot

    def assert_ready(self) -> None:
        """Check that the backend can serve requests. Default: always ready."""

    # ── Introspection ──────────────────────────────────────────────────────

    @property
    def backend_name(self) -> str:
        """Lower-cased class name with the ``QueueBackend`` part removed."""
        class_name = self.__class__.__name__
        return class_name.replace("QueueBackend", "").lower()

    @property
    def degraded(self) -> bool:
        """Whether the backend runs in a degraded mode. Default: ``False``."""
        return False

    @property
    def redis_available(self) -> bool:
        """``True`` exactly when ``backend_name`` is ``"redis"``."""
        return self.backend_name == "redis"

    @property
    def fallback_reason(self) -> str | None:
        """Why a fallback backend is in use, if any. Default: ``None``."""
        return None

    def health_snapshot(self) -> dict:
        """Return a monitoring dict combining identity flags and metrics.

        ``dlq_depth`` falls back to the ``failed_jobs`` metric when the
        metrics dict has no ``dlq_depth`` entry.
        """
        metrics = self.get_metrics()
        kind = "redis" if self.backend_name == "redis" else "memory"
        snapshot = {
            "backend": kind,
            "backend_name": self.backend_name,
            "degraded": self.degraded,
            "redis_available": self.redis_available,
            "metrics": metrics,
        }
        snapshot["queue_depth"] = metrics.get("queue_depth", 0)
        snapshot["in_flight_count"] = metrics.get("in_flight_count", 0)
        snapshot["dlq_depth"] = metrics.get("dlq_depth", metrics.get("failed_jobs", 0))
        snapshot["delayed_jobs"] = metrics.get("delayed_jobs", 0)
        return snapshot
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
# ---------------------------------------------------------------------------
|
|
206
|
+
# Redis backend
|
|
207
|
+
# ---------------------------------------------------------------------------
|
|
208
|
+
|
|
209
|
+
class RedisQueueBackend(DistributedQueueBackend):
    """Redis-backed FIFO queue using LPUSH / BRPOP.

    BRPOP is atomic — only one worker receives each message regardless of how
    many worker processes are running.

    Key layout (``queue_name`` = ``nodus:jobs`` by default)::

        nodus:jobs            — main job list  (LPUSH left / BRPOP right)
        nodus:jobs:inflight   — hash  { job_id → {payload, dequeued_at} }
        nodus:jobs:delayed    — sorted set; score = execute_at Unix timestamp
        nodus:jobs:dead       — dead letter list

    Reliability notes:
        * Metrics snapshots are emitted best-effort: a failure while gathering
          or publishing them is logged and never fails a queue operation that
          has already been applied in Redis.
        * Messages that cannot be deserialised on ``dequeue`` are moved to the
          dead letter list instead of being dropped.
        * ``fail`` writes the dead-letter entry *before* removing the in-flight
          entry, and ``requeue_stale_jobs`` moves entries with a Lua script, so
          a Redis error part-way through leaves the job recoverable.

    Args:
        url: Redis connection URL (e.g. ``redis://localhost:6379/0``).
        queue_name: Key prefix for all queue-related Redis keys.
        max_queue_size: Hard capacity limit. Defaults to ``NODUS_QUEUE_MAXSIZE``
            env var or 100.
        metrics: Optional ``QueueMetrics`` instance for observability hooks.
        circuit_breaker_threshold: Consecutive Redis failures before the
            circuit opens.
        circuit_breaker_open_seconds: How long to reject calls when open.
    """

    # Promote up to 100 due members of the delayed sorted set (KEYS[1]) into
    # the main list (KEYS[2]); ARGV[1] is the current Unix timestamp.
    _PROCESS_DELAYED_LUA = """
    local ready = redis.call('ZRANGEBYSCORE', KEYS[1], '-inf', ARGV[1], 'LIMIT', 0, 100)
    for _, v in ipairs(ready) do
        redis.call('LPUSH', KEYS[2], v)
        redis.call('ZREM', KEYS[1], v)
    end
    return #ready
    """

    # LPUSH ARGV[1] onto KEYS[1] unless list + delayed set already hold
    # ARGV[2] or more entries; returns -1 when rejected.
    _ENQUEUE_WITH_CAPACITY_LUA = """
    local total = redis.call('LLEN', KEYS[1]) + redis.call('ZCARD', KEYS[2])
    if total >= tonumber(ARGV[2]) then
        return -1
    end
    return redis.call('LPUSH', KEYS[1], ARGV[1])
    """

    # ZADD ARGV[1] with score ARGV[2] into KEYS[2] under the same capacity
    # rule (limit in ARGV[3]); returns -1 when rejected.
    _ENQUEUE_DELAYED_WITH_CAPACITY_LUA = """
    local total = redis.call('LLEN', KEYS[1]) + redis.call('ZCARD', KEYS[2])
    if total >= tonumber(ARGV[3]) then
        return -1
    end
    return redis.call('ZADD', KEYS[2], ARGV[2], ARGV[1])
    """

    # Atomically move one in-flight entry back to the main list. The move only
    # happens while the hash field ARGV[1] still holds exactly the value we
    # inspected (ARGV[2]), so a concurrent ack / requeue / re-dequeue wins.
    # KEYS[1] = in-flight hash, KEYS[2] = main list, ARGV[3] = job payload.
    _REQUEUE_STALE_LUA = """
    if redis.call('HGET', KEYS[1], ARGV[1]) ~= ARGV[2] then
        return 0
    end
    redis.call('HDEL', KEYS[1], ARGV[1])
    redis.call('LPUSH', KEYS[2], ARGV[3])
    return 1
    """

    def __init__(
        self,
        url: str,
        queue_name: str = QUEUE_NAME_DEFAULT,
        max_queue_size: int | None = None,
        metrics: Optional[QueueMetrics] = None,
        circuit_breaker_threshold: int = 5,
        circuit_breaker_open_seconds: float = 30.0,
    ) -> None:
        """Create the Redis client, derive key names and register Lua scripts.

        Raises:
            ImportError: If the ``redis`` package is not installed.
        """
        try:
            import redis  # noqa: PLC0415
        except ImportError as exc:
            raise ImportError(
                "redis package is required for RedisQueueBackend. "
                "Install with: pip install 'nodus-queue[redis]'"
            ) from exc
        self._redis = redis.from_url(url, decode_responses=True, socket_timeout=10)
        self._queue_name = queue_name
        self._max_queue_size = max_queue_size or _queue_capacity_limit()
        self._inflight_key = f"{queue_name}:inflight"
        self._delayed_key = f"{queue_name}:delayed"
        self._dlq_key = f"{queue_name}:dead"
        self._process_delayed = self._redis.register_script(self._PROCESS_DELAYED_LUA)
        self._enqueue_with_capacity = self._redis.register_script(
            self._ENQUEUE_WITH_CAPACITY_LUA
        )
        self._enqueue_delayed_with_capacity = self._redis.register_script(
            self._ENQUEUE_DELAYED_WITH_CAPACITY_LUA
        )
        self._requeue_stale = self._redis.register_script(self._REQUEUE_STALE_LUA)
        # Exception types treated as transient: they are retried and counted
        # towards the circuit breaker.
        self._redis_exceptions = (
            redis.ConnectionError,
            redis.TimeoutError,
            redis.BusyLoadingError,
        )
        self._failure_count = 0
        self._open_until = 0.0
        self._circuit_breaker_threshold = circuit_breaker_threshold
        self._circuit_breaker_open_seconds = circuit_breaker_open_seconds
        self._metrics = metrics or QueueMetrics()

    # ── Circuit breaker / retry plumbing ───────────────────────────────────

    def _check_circuit_breaker(self) -> None:
        """Raise ``redis.ConnectionError`` while the circuit breaker is open."""
        import redis  # noqa: PLC0415

        if time.monotonic() < self._open_until:
            raise redis.ConnectionError("Circuit breaker open")

    def _record_success(self) -> None:
        """Reset the failure counter and close the breaker after a success."""
        if self._failure_count or self._open_until:
            logger.info("RedisQueueBackend: circuit breaker CLOSED (connection restored)")
        self._failure_count = 0
        self._open_until = 0.0

    def _record_failure(self, exc: Exception) -> None:
        """Count a transient failure and open the breaker at the threshold."""
        if not isinstance(exc, self._redis_exceptions):
            return
        self._failure_count += 1
        if self._failure_count >= self._circuit_breaker_threshold:
            self._open_until = time.monotonic() + self._circuit_breaker_open_seconds
            logger.error(
                "RedisQueueBackend: circuit breaker OPEN for %.1fs after %d failures",
                self._circuit_breaker_open_seconds,
                self._failure_count,
            )

    def _run_redis_operation(self, operation_name: str, fn):
        """Run *fn* under the retry policy and feed the circuit breaker.

        Args:
            operation_name: Label used in the failure log line.
            fn: Zero-argument callable performing the Redis call.

        Returns:
            Whatever *fn* returns.

        Raises:
            The transient Redis exception from the final attempt, re-raised
            after it has been recorded and logged.
        """
        @_redis_retry()
        def _call():
            return fn()

        try:
            result = _call()
        except self._redis_exceptions as exc:
            self._record_failure(exc)
            logger.warning(
                "RedisQueueBackend: operation=%s failed error=%s", operation_name, exc
            )
            raise
        self._record_success()
        return result

    def _emit_snapshot(self) -> None:
        """Publish a metrics snapshot without letting telemetry errors escape.

        ``get_metrics`` talks to Redis directly; if it (or the metrics hook)
        fails after the queue mutation has already been applied, raising would
        make callers believe the operation failed and retry it.
        """
        try:
            self._metrics.on_snapshot(self.backend_name, self.get_metrics())
        except Exception:  # noqa: BLE001 — telemetry boundary; logged, never fatal
            logger.warning("RedisQueueBackend: metrics snapshot failed", exc_info=True)

    def _raise_saturated(self) -> None:
        """Record a rejected enqueue and raise ``QueueSaturatedError``."""
        self._metrics.on_enqueue(self.backend_name, "rejected")
        raise QueueSaturatedError(
            f"Queue is saturated (capacity={self._max_queue_size}). Retry later."
        )

    def _dead_letter_unparseable(self, raw: str, exc: Exception) -> None:
        """Move a message that failed deserialisation to the dead letter list.

        Best-effort: a transient Redis failure here is logged, not raised.
        """
        entry = json.dumps({
            "job_id": "",
            "payload_raw": raw,
            "error": f"deserialise failed: {exc}",
            "failed_at": datetime.now(timezone.utc).isoformat(),
        })
        try:
            self._run_redis_operation(
                "dequeue_unparseable_lpush_dlq",
                lambda: self._redis.lpush(self._dlq_key, entry),
            )
        except self._redis_exceptions:
            logger.error("[Queue:redis] could not dead-letter unparseable message")
            return
        self._metrics.on_failure(self.backend_name, "deserialise")

    def assert_ready(self) -> None:
        """Fail fast if the breaker is open or Redis does not answer PING."""
        self._check_circuit_breaker()
        self._run_redis_operation("ping", lambda: self._redis.ping())

    # ── Core operations ────────────────────────────────────────────────────

    def enqueue(self, payload: QueueJobPayload) -> None:
        """Push *payload* onto the main list, subject to the capacity guard.

        Raises:
            QueueSaturatedError: If list + delayed set are at capacity.
            redis.ConnectionError: If the circuit breaker is open.
        """
        self._check_circuit_breaker()
        raw = payload.to_json()
        result = self._run_redis_operation(
            "enqueue",
            lambda: self._enqueue_with_capacity(
                keys=[self._queue_name, self._delayed_key],
                args=[raw, str(self._max_queue_size)],
            ),
        )
        if int(result) == -1:
            self._raise_saturated()
        self._metrics.on_enqueue(self.backend_name, "accepted")
        logger.debug(
            "[Queue:redis] enqueued job_id=%s task=%s idempotency_key=%s",
            payload.job_id, payload.task_name, payload.idempotency_key,
        )
        self._emit_snapshot()

    def dequeue(self, timeout: int = 5) -> Optional[QueueJobPayload]:
        """Pop the next job, waiting up to *timeout* seconds (0 = don't wait).

        The popped job is recorded in the in-flight hash before it is
        returned. A message that cannot be deserialised is dead-lettered and
        ``None`` is returned.

        NOTE(review): the client is created with ``socket_timeout=10``; a
        blocking *timeout* of 10 s or more will presumably surface as a Redis
        timeout error before BRPOP returns — confirm and keep it below that.
        """
        self._check_circuit_breaker()
        if timeout == 0:
            raw = self._run_redis_operation(
                "dequeue", lambda: self._redis.rpop(self._queue_name)
            )
        else:
            popped = self._run_redis_operation(
                "dequeue",
                lambda: self._redis.brpop(self._queue_name, timeout=timeout),
            )
            # BRPOP answers with a (key, value) pair.
            raw = None if popped is None else popped[1]
        if raw is None:
            return None
        try:
            job = QueueJobPayload.from_json(raw)
        except Exception as exc:  # noqa: BLE001 — any parse failure marks the message as poison
            logger.error("[Queue:redis] deserialise failed: %s — raw=%r", exc, raw[:200])
            self._dead_letter_unparseable(raw, exc)
            return None
        inflight_entry = json.dumps({
            "payload": raw,
            "dequeued_at": datetime.now(timezone.utc).isoformat(),
        })
        # NOTE: the pop and this HSET are separate commands; if the HSET fails
        # the exception propagates and the popped message is not tracked.
        self._run_redis_operation(
            "dequeue_inflight_hset",
            lambda: self._redis.hset(self._inflight_key, job.job_id, inflight_entry),
        )
        self._metrics.on_dequeue(self.backend_name)
        self._emit_snapshot()
        return job

    def ack(self, job_id: str) -> None:
        """Remove *job_id* from the in-flight hash."""
        self._run_redis_operation(
            "ack",
            lambda: self._redis.hdel(self._inflight_key, job_id),
        )
        logger.debug("[Queue:redis] ack job_id=%s", job_id)
        self._emit_snapshot()

    def fail(self, job_id: str, error: str = "") -> None:
        """Move *job_id* from in-flight to the dead letter list.

        The dead-letter entry is written first and the in-flight entry removed
        afterwards, so a Redis failure between the two leaves the job in-flight
        (recoverable) rather than lost. If the job is not in-flight, an entry
        with an empty ``payload_raw`` is still recorded.
        """
        inflight_raw = self._run_redis_operation(
            "fail_hget",
            lambda: self._redis.hget(self._inflight_key, job_id),
        )
        try:
            payload_raw = json.loads(inflight_raw or "{}").get("payload", "")
        except (TypeError, ValueError, AttributeError):
            # Corrupt or non-object in-flight entry: keep the DLQ record anyway.
            payload_raw = ""
        dlq_entry = json.dumps({
            "job_id": job_id,
            "payload_raw": payload_raw,
            "error": error,
            "failed_at": datetime.now(timezone.utc).isoformat(),
        })
        self._run_redis_operation(
            "fail_lpush_dlq",
            lambda: self._redis.lpush(self._dlq_key, dlq_entry),
        )
        self._run_redis_operation(
            "fail_hdel",
            lambda: self._redis.hdel(self._inflight_key, job_id),
        )
        self._metrics.on_failure(self.backend_name, "job")
        self._emit_snapshot()
        logger.warning("[Queue:redis] fail→DLQ job_id=%s error=%s", job_id, error)

    # ── Delayed enqueue ────────────────────────────────────────────────────

    def enqueue_delayed(self, payload: QueueJobPayload, delay_seconds: float) -> None:
        """Schedule *payload* for execution after *delay_seconds*.

        Uses a Redis sorted set; call ``process_delayed_jobs()`` periodically
        to promote ready jobs into the main queue.

        Raises:
            QueueSaturatedError: If list + delayed set are at capacity.
            redis.ConnectionError: If the circuit breaker is open.
        """
        # Same fail-fast admission check as ``enqueue``.
        self._check_circuit_breaker()
        raw = payload.to_json()
        execute_at = datetime.now(timezone.utc).timestamp() + delay_seconds
        result = self._run_redis_operation(
            "enqueue_delayed",
            lambda: self._enqueue_delayed_with_capacity(
                keys=[self._queue_name, self._delayed_key],
                args=[raw, str(execute_at), str(self._max_queue_size)],
            ),
        )
        if int(result) == -1:
            self._raise_saturated()
        self._metrics.on_enqueue(self.backend_name, "accepted")
        logger.debug(
            "[Queue:redis] delayed enqueue job_id=%s delay=%.1fs",
            payload.job_id, delay_seconds,
        )
        self._emit_snapshot()

    def process_delayed_jobs(self) -> int:
        """Promote delayed jobs whose execute_at ≤ now into the main queue.

        Uses a Lua script for atomicity. At most 100 jobs are promoted per
        call (the script's ``LIMIT``); call again to continue draining a
        larger backlog. Returns the number of jobs promoted.
        """
        now_ts = datetime.now(timezone.utc).timestamp()
        count = self._run_redis_operation(
            "process_delayed_jobs",
            lambda: self._process_delayed(
                keys=[self._delayed_key, self._queue_name],
                args=[str(now_ts)],
            ),
        )
        if count:
            logger.info("[Queue:redis] promoted %d delayed jobs", count)
            self._emit_snapshot()
        return int(count)

    # ── Visibility timeout recovery ────────────────────────────────────────

    def requeue_stale_jobs(self, timeout_seconds: int = 300) -> int:
        """Re-enqueue in-flight jobs dequeued more than *timeout_seconds* ago.

        Each stale entry is moved with a Lua script that removes the in-flight
        field and pushes the payload in one step, and only while the field is
        unchanged — so concurrent callers cannot requeue the same entry twice
        and a failure cannot drop the job. A problem with one entry is logged
        and does not stop the scan. Returns the number of jobs re-enqueued.
        """
        now = datetime.now(timezone.utc)
        entries = self._run_redis_operation(
            "requeue_stale_jobs_hgetall",
            lambda: self._redis.hgetall(self._inflight_key),
        )
        requeued = 0
        for job_id, entry_raw in entries.items():
            try:
                entry = json.loads(entry_raw)
                dequeued_at = datetime.fromisoformat(entry["dequeued_at"])
                age_seconds = (now - dequeued_at).total_seconds()
                if age_seconds <= timeout_seconds:
                    continue
                # Defaults bind the loop variables at lambda creation time.
                moved = self._run_redis_operation(
                    "requeue_stale_jobs_move",
                    lambda job_id=job_id, entry_raw=entry_raw, payload=entry["payload"]: (
                        self._requeue_stale(
                            keys=[self._inflight_key, self._queue_name],
                            args=[job_id, entry_raw, payload],
                        )
                    ),
                )
                if not int(moved):
                    # Someone else acked or requeued it in the meantime.
                    continue
                requeued += 1
                logger.info(
                    "[Queue:redis] requeued stale job_id=%s age=%.0fs", job_id, age_seconds
                )
            except Exception as exc:  # noqa: BLE001 — one bad entry must not abort the scan
                logger.warning(
                    "[Queue:redis] stale check failed job_id=%s: %s", job_id, exc
                )
                self._metrics.on_failure(self.backend_name, "stale_recovery")
        self._emit_snapshot()
        return requeued

    # ── Metrics ───────────────────────────────────────────────────────────

    def get_metrics(self) -> dict:
        """Return current depths read from Redis plus the configured limits."""
        queue_depth = self._redis.llen(self._queue_name)
        delayed_jobs = self._redis.zcard(self._delayed_key)
        in_flight_count = self._redis.hlen(self._inflight_key)
        # Read once so ``failed_jobs`` and ``dlq_depth`` always agree.
        dlq_depth = self.get_dlq_depth()
        return {
            "queue_depth": queue_depth,
            "in_flight_count": in_flight_count,
            "failed_jobs": dlq_depth,
            "delayed_jobs": delayed_jobs,
            "dlq_depth": dlq_depth,
            "max_queue_size": self._max_queue_size,
            "total_pending_jobs": queue_depth + delayed_jobs,
            "saturation_threshold": _saturation_threshold(),
        }

    def get_dlq_depth(self) -> int:
        """Return the length of the dead letter list."""
        return int(self._redis.llen(self._dlq_key))

    def peek_dead_letters(self, n: int) -> list[dict]:
        """Return up to *n* dead-letter entries from the head, without removing them.

        A non-positive *n* yields an empty list. (An end index of ``-1`` would
        make LRANGE return the whole list, so that case is handled up front.)
        """
        if n <= 0:
            return []
        entries = self._redis.lrange(self._dlq_key, 0, n - 1)
        return [json.loads(entry) for entry in entries]

    def remove_dead_letter(self, job_id: str) -> bool:
        """Remove the first dead-letter entry whose ``job_id`` matches.

        Returns:
            ``True`` if an entry was removed, ``False`` otherwise.
        """
        entries = self._run_redis_operation(
            "remove_dead_letter_lrange",
            lambda: self._redis.lrange(self._dlq_key, 0, -1),
        )
        target_raw = None
        for entry_raw in entries:
            try:
                if json.loads(entry_raw).get("job_id") == job_id:
                    target_raw = entry_raw
                    break
            except (TypeError, ValueError, AttributeError):
                # Skip entries that are not JSON objects.
                continue
        if target_raw is None:
            return False
        removed = int(
            self._run_redis_operation(
                "remove_dead_letter_lrem",
                lambda: self._redis.lrem(self._dlq_key, 1, target_raw),
            )
        )
        if removed:
            self._emit_snapshot()
        return removed > 0

    def drain_dead_letters(self) -> int:
        """Delete the dead letter list and return how many entries it held.

        The length read and the delete run in one MULTI/EXEC transaction so
        the returned count matches what was actually removed.
        """
        def _count_and_delete():
            pipe = self._redis.pipeline(transaction=True)
            pipe.llen(self._dlq_key)
            pipe.delete(self._dlq_key)
            return pipe.execute()

        results = self._run_redis_operation("drain_dead_letters_del", _count_and_delete)
        count = int(results[0])
        if count <= 0:
            return 0
        self._emit_snapshot()
        return count
|
|
577
|
+
|
|
578
|
+
|
|
579
|
+
# ---------------------------------------------------------------------------
|
|
580
|
+
# In-memory backend (tests / single-process dev)
|
|
581
|
+
# ---------------------------------------------------------------------------
|
|
582
|
+
|
|
583
|
+
class InMemoryQueueBackend(DistributedQueueBackend):
|
|
584
|
+
"""Thread-safe in-process FIFO queue backed by ``queue.Queue``.
|
|
585
|
+
|
|
586
|
+
Implements all reliability features of ``RedisQueueBackend``:
|
|
587
|
+
in-flight tracking, Dead Letter Queue, delayed enqueue (via
|
|
588
|
+
``threading.Timer``), and full ``get_metrics()``.
|
|
589
|
+
|
|
590
|
+
Suitable for unit tests and single-process development without Redis.
|
|
591
|
+
**NOT usable across OS processes** — items live in this process's heap only.
|
|
592
|
+
"""
|
|
593
|
+
|
|
594
|
+
def __init__(
    self,
    max_queue_size: int | None = None,
    *,
    metrics: Optional[QueueMetrics] = None,
    degraded: bool = False,
    fallback_reason: str | None = None,
) -> None:
    """Set up the bounded queue and the in-flight / DLQ / timer bookkeeping.

    Args:
        max_queue_size: Capacity; a falsy value falls back to
            ``_queue_capacity_limit()``.
        metrics: Observability hook; a falsy value falls back to a fresh
            ``QueueMetrics()``.
        degraded: Value reported by the ``degraded`` property.
        fallback_reason: Value reported by the ``fallback_reason`` property.
    """
    capacity = max_queue_size or _queue_capacity_limit()
    self._max_queue_size = capacity
    self._metrics = metrics or QueueMetrics()
    self._degraded = degraded
    self._fallback_reason = fallback_reason

    # Pending jobs; bounded by the resolved capacity.
    self._q: queue.Queue[QueueJobPayload] = queue.Queue(maxsize=capacity)

    # job_id -> (job, time it was dequeued); guarded by _inflight_lock.
    self._inflight: dict[str, tuple[QueueJobPayload, datetime]] = {}
    self._inflight_lock = threading.Lock()

    # Dead-letter records; guarded by _dlq_lock.
    self._dlq: list[dict] = []
    self._dlq_lock = threading.Lock()

    # Timers of scheduled delayed jobs and their count; guarded by _timers_lock.
    self._timers: list[threading.Timer] = []
    self._timers_lock = threading.Lock()
    self._delayed_count = 0
|
|
614
|
+
|
|
615
|
+
@property
def degraded(self) -> bool:
    """Return the ``degraded`` flag stored on this instance."""
    flag = self._degraded
    return flag
|
|
618
|
+
|
|
619
|
+
@property
def redis_available(self) -> bool:
    """Always ``False``: this backend never talks to Redis."""
    available = False
    return available
|
|
622
|
+
|
|
623
|
+
@property
def fallback_reason(self) -> str | None:
    """Return the fallback reason stored on this instance (may be ``None``)."""
    reason = self._fallback_reason
    return reason
|
|
626
|
+
|
|
627
|
+
def _pending_depth(self) -> int:
    """Return queued jobs plus delayed jobs that have not fired yet."""
    # Read the delayed counter under its lock; qsize() is read afterwards.
    with self._timers_lock:
        waiting_on_timers = self._delayed_count
    queued_now = self._q.qsize()
    return queued_now + waiting_on_timers
|
|
631
|
+
|
|
632
|
+
def _reject_if_full(self) -> None:
    """Raise ``QueueSaturatedError`` when pending depth has reached capacity.

    A rejection is reported to the metrics hook before raising.
    """
    has_room = self._pending_depth() < self._max_queue_size
    if has_room:
        return
    self._metrics.on_enqueue(self.backend_name, "rejected")
    raise QueueSaturatedError(
        f"Queue is saturated (capacity={self._max_queue_size}). Retry later."
    )
|
|
638
|
+
|
|
639
|
+
def enqueue(self, payload: QueueJobPayload) -> None:
    """Add *payload* to the in-process queue.

    Raises:
        QueueSaturatedError: If the queue is at capacity — either detected by
            the up-front check or because the underlying bounded queue filled
            between that check and the insert.
    """
    self._reject_if_full()
    try:
        self._q.put_nowait(payload)
    except queue.Full:
        # The capacity check and the insert are not one atomic step; surface a
        # lost race as the documented saturation error rather than queue.Full.
        self._metrics.on_enqueue(self.backend_name, "rejected")
        raise QueueSaturatedError(
            f"Queue is saturated (capacity={self._max_queue_size}). Retry later."
        ) from None
    self._metrics.on_enqueue(self.backend_name, "accepted")
    logger.debug(
        "[Queue:mem] enqueued job_id=%s task=%s", payload.job_id, payload.task_name
    )
    self._metrics.on_snapshot(self.backend_name, self.get_metrics())
|
|
647
|
+
|
|
648
|
+
def dequeue(self, timeout: int = 5) -> Optional[QueueJobPayload]:
    """Take the next job, waiting up to *timeout* seconds.

    Returns ``None`` when the queue stays empty for the whole wait. A job
    that is returned is first recorded as in-flight together with the time it
    was taken, then reported to the metrics hook.
    """
    try:
        taken = self._q.get(timeout=timeout)
    except queue.Empty:
        return None

    with self._inflight_lock:
        record = (taken, datetime.now(timezone.utc))
        self._inflight[taken.job_id] = record

    hook, name = self._metrics, self.backend_name
    hook.on_dequeue(name)
    hook.on_snapshot(self.backend_name, self.get_metrics())
    return taken
|
|
658
|
+
|
|
659
|
+
def ack(self, job_id: str) -> None:
    """Forget *job_id* as in-flight; unknown ids are ignored."""
    with self._inflight_lock:
        if job_id in self._inflight:
            del self._inflight[job_id]
    logger.debug("[Queue:mem] ack job_id=%s", job_id)
    current = self.get_metrics()
    self._metrics.on_snapshot(self.backend_name, current)
|
|
664
|
+
|
|
665
|
+
def fail(self, job_id: str, error: str = "") -> None:
    """Move *job_id* from in-flight to the in-memory dead letter list.

    A record is appended even when the job is not in-flight; in that case
    its ``payload_raw`` is the empty string.
    """
    with self._inflight_lock:
        tracked = self._inflight.pop(job_id, None)

    with self._dlq_lock:
        serialised = tracked[0].to_json() if tracked else ""
        record = {
            "job_id": job_id,
            "payload_raw": serialised,
            "error": error,
            "failed_at": datetime.now(timezone.utc).isoformat(),
        }
        self._dlq.append(record)

    self._metrics.on_failure(self.backend_name, "job")
    logger.warning("[Queue:mem] fail→DLQ job_id=%s error=%s", job_id, error)
    self._metrics.on_snapshot(self.backend_name, self.get_metrics())
|
|
678
|
+
|
|
679
|
+
def enqueue_delayed(self, payload: QueueJobPayload, delay_seconds: float) -> None:
    """Schedule enqueue after *delay_seconds* using a daemon Timer.

    The job counts towards capacity while it waits. If the queue turns out to
    be full when the timer fires, the job is recorded in the dead letter list
    instead of being lost to an unhandled exception in the timer thread.

    Raises:
        QueueSaturatedError: If the queue is already at capacity when called.
    """
    self._reject_if_full()
    with self._timers_lock:
        self._delayed_count += 1

    def _fire() -> None:
        """Timer callback: move the job into the queue, or dead-letter it."""
        try:
            self._q.put_nowait(payload)
            delivered = True
        except queue.Full:
            delivered = False
        finally:
            # The job stops being "delayed" whether or not it was delivered.
            with self._timers_lock:
                self._delayed_count = max(0, self._delayed_count - 1)
        if not delivered:
            with self._dlq_lock:
                self._dlq.append({
                    "job_id": payload.job_id,
                    "payload_raw": payload.to_json(),
                    "error": "queue full when delayed job became due",
                    "failed_at": datetime.now(timezone.utc).isoformat(),
                })
            self._metrics.on_failure(self.backend_name, "delayed_enqueue")
            logger.error(
                "[Queue:mem] delayed enqueue dropped to DLQ (queue full) job_id=%s",
                payload.job_id,
            )
        self._metrics.on_snapshot(self.backend_name, self.get_metrics())
        if delivered:
            logger.debug("[Queue:mem] delayed enqueue fired job_id=%s", payload.job_id)

    t = threading.Timer(delay_seconds, _fire)
    t.daemon = True
    t.start()
    with self._timers_lock:
        # Drop references to timers that have already finished.
        self._timers = [x for x in self._timers if x.is_alive()]
        self._timers.append(t)
    logger.debug(
        "[Queue:mem] delayed enqueue job_id=%s delay=%.1fs", payload.job_id, delay_seconds
    )
    self._metrics.on_enqueue(self.backend_name, "accepted")
    self._metrics.on_snapshot(self.backend_name, self.get_metrics())
|
|
705
|
+
|
|
706
|
+
def requeue_stale_jobs(self, timeout_seconds: int = 300) -> int:
    """Put jobs that have been in-flight longer than *timeout_seconds* back on the queue.

    Stale entries are removed from the in-flight table while its lock is
    held; the payloads are then re-queued and logged.  A metrics snapshot
    is pushed afterwards.  Returns the number of jobs re-queued.
    """
    reference = datetime.now(timezone.utc)

    with self._inflight_lock:
        stale = [
            (jid, payload)
            for jid, (payload, taken_at) in self._inflight.items()
            if (reference - taken_at).total_seconds() > timeout_seconds
        ]
        for jid, _ in stale:
            del self._inflight[jid]

    for _, payload in stale:
        self._q.put(payload)
        logger.info("[Queue:mem] requeued stale job_id=%s", payload.job_id)

    self._metrics.on_snapshot(self.backend_name, self.get_metrics())
    return len(stale)
|
|
721
|
+
|
|
722
|
+
def get_metrics(self) -> dict:
    """Return a point-in-time metrics snapshot for this backend.

    Keys: ``queue_depth``, ``in_flight_count``, ``failed_jobs``,
    ``delayed_jobs``, ``dlq_depth``, ``max_queue_size``,
    ``total_pending_jobs`` and ``saturation_threshold``.  ``failed_jobs``
    and ``dlq_depth`` both report the dead-letter list length.
    """
    with self._inflight_lock:
        inflight = len(self._inflight)
    with self._dlq_lock:
        dlq = len(self._dlq)

    # Sample each moving counter exactly once so that the derived
    # ``total_pending_jobs`` always equals ``queue_depth + delayed_jobs``
    # within a single snapshot, even while other threads enqueue/dequeue.
    depth = self._q.qsize()
    delayed = self._delayed_count

    return {
        "queue_depth": depth,
        "in_flight_count": inflight,
        "failed_jobs": dlq,
        "delayed_jobs": delayed,
        "dlq_depth": dlq,
        "max_queue_size": self._max_queue_size,
        "total_pending_jobs": depth + delayed,
        "saturation_threshold": _saturation_threshold(),
    }
|
|
737
|
+
|
|
738
|
+
def get_dlq_depth(self) -> int:
    """Return how many records the dead-letter list currently holds."""
    with self._dlq_lock:
        depth = len(self._dlq)
    return depth
|
|
741
|
+
|
|
742
|
+
# ── Test helpers ──────────────────────────────────────────────────────
|
|
743
|
+
|
|
744
|
+
def qsize(self) -> int:
    """Report how many jobs are waiting in the internal queue (test helper)."""
    waiting = self._q.qsize()
    return waiting
|
|
747
|
+
|
|
748
|
+
def get_dead_letters(self) -> list[dict]:
    """Return a shallow copy of the dead-letter list (test helper)."""
    with self._dlq_lock:
        copied = list(self._dlq)
    return copied
|
|
752
|
+
|
|
753
|
+
def get_inflight_ids(self) -> list[str]:
    """Return the ids of the jobs currently in-flight (test helper)."""
    with self._inflight_lock:
        ids = [job_id for job_id in self._inflight.keys()]
    return ids
|
|
757
|
+
|
|
758
|
+
def remove_dead_letter(self, job_id: str) -> bool:
    """Delete the first dead-letter record whose ``job_id`` matches.

    Returns ``True`` when a record was removed (a metrics snapshot is then
    pushed) and ``False`` when no record matched.
    """
    with self._dlq_lock:
        position = next(
            (pos for pos, record in enumerate(self._dlq) if record.get("job_id") == job_id),
            None,
        )
        if position is not None:
            del self._dlq[position]

    if position is None:
        return False

    # The snapshot is taken after the lock is released.
    self._metrics.on_snapshot(self.backend_name, self.get_metrics())
    return True
|
|
769
|
+
|
|
770
|
+
def drain_dead_letters(self) -> int:
    """Empty the dead-letter list and return how many records were dropped.

    A metrics snapshot is pushed only when at least one record was removed.
    """
    with self._dlq_lock:
        dropped = len(self._dlq)
        self._dlq.clear()

    if dropped != 0:
        self._metrics.on_snapshot(self.backend_name, self.get_metrics())
    return dropped
|
nodus_queue/metrics.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
"""Optional metrics integration hook.
|
|
2
|
+
|
|
3
|
+
Subclass ``QueueMetrics`` and pass an instance to backend constructors or
|
|
4
|
+
``get_queue()`` to wire up Prometheus (or any other metrics system) without
|
|
5
|
+
adding a hard dependency on ``prometheus-client``.
|
|
6
|
+
|
|
7
|
+
Example — Prometheus integration::
|
|
8
|
+
|
|
9
|
+
from prometheus_client import CollectorRegistry, Counter, Gauge
|
|
10
|
+
from nodus_queue import QueueMetrics
|
|
11
|
+
|
|
12
|
+
REGISTRY = CollectorRegistry()
|
|
13
|
+
enqueue_total = Counter("queue_enqueue_total", "...", ["backend", "outcome"], registry=REGISTRY)
|
|
14
|
+
# ... define other metrics ...
|
|
15
|
+
|
|
16
|
+
class PrometheusQueueMetrics(QueueMetrics):
|
|
17
|
+
def on_enqueue(self, backend: str, outcome: str) -> None:
|
|
18
|
+
enqueue_total.labels(backend=backend, outcome=outcome).inc()
|
|
19
|
+
# override other methods as needed
|
|
20
|
+
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class QueueMetrics:
    """Base class for queue metrics hooks; every hook does nothing.

    Subclass and override only the hooks you care about.
    """

    def on_enqueue(self, backend: str, outcome: str) -> None:
        """Hook for an enqueue attempt; ``outcome`` is ``"accepted"`` or ``"rejected"``."""
        return None

    def on_dequeue(self, backend: str) -> None:
        """Hook for a successful dequeue."""
        return None

    def on_failure(self, backend: str, stage: str) -> None:
        """Hook for a job moved to the DLQ or a failed stale-recovery.

        ``stage`` is ``"job"`` or ``"stale_recovery"``.
        """
        return None

    def on_fallback(self) -> None:
        """Hook for the queue falling back to in-memory mode because Redis is unavailable."""
        return None

    def on_backend_mode_changed(self, is_redis: bool) -> None:
        """Hook for a change of the active backend type (redis ↔ memory)."""
        return None

    def on_snapshot(self, backend: str, snapshot: dict) -> None:
        """Hook that receives the current metrics snapshot after an operation
        that changes queue depth.  Intended for updating gauges.
        """
        return None
|
nodus_queue/payload.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""QueueJobPayload — serialisable representation of one distributed async job."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import json
|
|
5
|
+
from dataclasses import asdict, dataclass, field
|
|
6
|
+
from datetime import datetime, timezone
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class QueueJobPayload:
    """Serialisable envelope describing one distributed async job.

    Kept deliberately small: workers are expected to look the full job
    record up in their own data store via ``job_id`` instead of shipping
    large blobs through the queue.
    """

    # Primary key used to look up the full record in the data store.
    job_id: str

    # Registered handler key (e.g. "agent.create_run").
    task_name: str

    # Deduplication key.  Falls back to ``job_id`` when left empty (see
    # ``__post_init__``).  Workers may use it to guard against
    # double-execution after a visibility-timeout re-enqueue.
    idempotency_key: str = ""

    # Execution context carried across the worker boundary:
    # trace_id, eu_id, user_id, capabilities.
    context: dict = field(default_factory=dict)

    # attempt_count, max_attempts, is_retry -- carried across re-enqueues
    # so the worker can restore the correct retry state.
    retry_metadata: dict = field(default_factory=dict)

    # Wall-clock timestamp at construction time (UTC, ISO 8601).
    enqueued_at: str = field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )

    def __post_init__(self) -> None:
        """Default ``idempotency_key`` to ``job_id`` when it is empty."""
        if self.idempotency_key:
            return
        self.idempotency_key = self.job_id

    def to_json(self) -> str:
        """Serialise all fields to a JSON string (wire format)."""
        as_mapping = asdict(self)
        return json.dumps(as_mapping)

    @classmethod
    def from_json(cls, raw: str) -> "QueueJobPayload":
        """Build a payload from a JSON string, ignoring keys that are not fields."""
        decoded = json.loads(raw)
        field_names = set(cls.__dataclass_fields__)
        accepted = {}
        for name, value in decoded.items():
            if name in field_names:
                accepted[name] = value
        return cls(**accepted)
|
nodus_queue/queue.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
"""Singleton factory for the active queue backend."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
import os
|
|
6
|
+
import threading
|
|
7
|
+
from typing import Callable, Optional
|
|
8
|
+
|
|
9
|
+
from .backends import (
|
|
10
|
+
QUEUE_NAME_DEFAULT,
|
|
11
|
+
DistributedQueueBackend,
|
|
12
|
+
InMemoryQueueBackend,
|
|
13
|
+
QueueSaturatedError,
|
|
14
|
+
RedisQueueBackend,
|
|
15
|
+
_queue_capacity_limit,
|
|
16
|
+
)
|
|
17
|
+
from .metrics import QueueMetrics
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
_QUEUE_INSTANCE: Optional[DistributedQueueBackend] = None
|
|
22
|
+
_QUEUE_METRICS: Optional[QueueMetrics] = None
|
|
23
|
+
_QUEUE_ON_BACKEND_CHANGE: Optional[Callable[[str, dict], None]] = None
|
|
24
|
+
_QUEUE_LOCK = threading.Lock()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _is_test_mode() -> bool:
|
|
28
|
+
for var in ("TESTING", "TEST_MODE"):
|
|
29
|
+
if os.getenv(var, "false").lower() in {"1", "true", "yes"}:
|
|
30
|
+
return True
|
|
31
|
+
return False
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _is_production() -> bool:
|
|
35
|
+
return os.getenv("ENV", "").lower() in {"production", "prod"}
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _require_redis() -> bool:
|
|
39
|
+
return os.getenv("NODUS_REQUIRE_REDIS", os.getenv("AINDY_REQUIRE_REDIS", "false")).lower() in {
|
|
40
|
+
"1", "true", "yes"
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _fallback_to_memory(
    exc: Exception,
    *,
    metrics: QueueMetrics,
    on_backend_change: Optional[Callable[[str, dict], None]],
    queue_name: str,
) -> InMemoryQueueBackend:
    """Build the degraded in-memory backend used when Redis cannot be reached.

    Args:
        exc: The error that made the Redis backend unusable; its text is
            logged and stored as the fallback reason.
        metrics: Hook object; ``on_fallback()`` is invoked and the object is
            passed on to the new backend.
        on_backend_change: Optional callback invoked as
            ``fn("degraded", {"reason": ..., "fallback": "memory"})``.
        queue_name: Accepted for the caller's convenience; not used here.

    Returns:
        An ``InMemoryQueueBackend`` created with ``degraded=True``.

    Raises:
        RuntimeError: When ``_require_redis()`` is true, chained from *exc*.
    """
    if _require_redis():
        raise RuntimeError(
            f"NODUS_REQUIRE_REDIS=true but Redis is unavailable: {exc}. "
            "Set NODUS_REQUIRE_REDIS=false to allow in-memory fallback."
        ) from exc

    metrics.on_fallback()
    logger.warning(
        "[Queue] Redis unavailable (%s) — falling back to in-memory queue. "
        "In multi-instance mode jobs will NOT be shared across instances. "
        "Set NODUS_REQUIRE_REDIS=true to prevent degraded-mode startup.",
        exc,
    )
    if on_backend_change is not None:
        try:
            on_backend_change("degraded", {"reason": str(exc), "fallback": "memory"})
        except Exception:
            # The callback is best-effort and must never block the fallback,
            # but a failure is recorded instead of vanishing silently.
            logger.exception("[Queue] on_backend_change callback failed during fallback")
    return InMemoryQueueBackend(
        metrics=metrics,
        degraded=True,
        fallback_reason=str(exc),
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_queue(
|
|
77
|
+
*,
|
|
78
|
+
force_memory: bool = False,
|
|
79
|
+
metrics: Optional[QueueMetrics] = None,
|
|
80
|
+
on_backend_change: Optional[Callable[[str, dict], None]] = None,
|
|
81
|
+
) -> DistributedQueueBackend:
|
|
82
|
+
"""Return the process-level singleton queue backend.
|
|
83
|
+
|
|
84
|
+
Selection order
|
|
85
|
+
---------------
|
|
86
|
+
1. ``force_memory=True`` → fresh ``InMemoryQueueBackend`` (not cached).
|
|
87
|
+
2. ``TESTING=1`` / ``TEST_MODE=1`` → ``InMemoryQueueBackend`` (cached).
|
|
88
|
+
3. ``REDIS_URL`` is set → ``RedisQueueBackend``.
|
|
89
|
+
4. Fallback → ``InMemoryQueueBackend`` with a warning.
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
force_memory: Return a fresh in-memory backend, bypassing the singleton.
|
|
93
|
+
Use in tests that need isolated queues.
|
|
94
|
+
metrics: Optional ``QueueMetrics`` hook. Ignored after the singleton is
|
|
95
|
+
created; pass it on the first call.
|
|
96
|
+
on_backend_change: Optional callback fired when the backend changes mode
|
|
97
|
+
(e.g. Redis → memory fallback or reconnect). Signature:
|
|
98
|
+
``fn(event: str, payload: dict) -> None``.
|
|
99
|
+
|
|
100
|
+
Call ``reset_queue()`` between tests to get a clean instance.
|
|
101
|
+
"""
|
|
102
|
+
global _QUEUE_INSTANCE, _QUEUE_METRICS, _QUEUE_ON_BACKEND_CHANGE
|
|
103
|
+
|
|
104
|
+
_metrics = metrics or QueueMetrics()
|
|
105
|
+
|
|
106
|
+
if force_memory:
|
|
107
|
+
backend = InMemoryQueueBackend(metrics=_metrics)
|
|
108
|
+
_metrics.on_backend_mode_changed(False)
|
|
109
|
+
return backend
|
|
110
|
+
|
|
111
|
+
if _QUEUE_INSTANCE is not None:
|
|
112
|
+
return _QUEUE_INSTANCE
|
|
113
|
+
|
|
114
|
+
with _QUEUE_LOCK:
|
|
115
|
+
if _QUEUE_INSTANCE is not None:
|
|
116
|
+
return _QUEUE_INSTANCE
|
|
117
|
+
|
|
118
|
+
_QUEUE_METRICS = _metrics
|
|
119
|
+
_QUEUE_ON_BACKEND_CHANGE = on_backend_change
|
|
120
|
+
|
|
121
|
+
if _is_test_mode():
|
|
122
|
+
_QUEUE_INSTANCE = InMemoryQueueBackend(metrics=_metrics)
|
|
123
|
+
_metrics.on_backend_mode_changed(False)
|
|
124
|
+
return _QUEUE_INSTANCE
|
|
125
|
+
|
|
126
|
+
redis_url = os.getenv("REDIS_URL", "")
|
|
127
|
+
queue_name = os.getenv("NODUS_QUEUE_NAME", os.getenv("AINDY_QUEUE_NAME", QUEUE_NAME_DEFAULT))
|
|
128
|
+
|
|
129
|
+
if _is_production() and not redis_url:
|
|
130
|
+
raise RuntimeError(
|
|
131
|
+
"Production deployments require RedisQueueBackend for job durability. "
|
|
132
|
+
"Set REDIS_URL before startup."
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
if redis_url:
|
|
136
|
+
try:
|
|
137
|
+
candidate = RedisQueueBackend(
|
|
138
|
+
url=redis_url,
|
|
139
|
+
queue_name=queue_name,
|
|
140
|
+
metrics=_metrics,
|
|
141
|
+
)
|
|
142
|
+
candidate.assert_ready()
|
|
143
|
+
_QUEUE_INSTANCE = candidate
|
|
144
|
+
logger.info("[Queue] Redis backend url=%s queue=%s", redis_url, queue_name)
|
|
145
|
+
except Exception as exc:
|
|
146
|
+
_QUEUE_INSTANCE = _fallback_to_memory(
|
|
147
|
+
exc,
|
|
148
|
+
metrics=_metrics,
|
|
149
|
+
on_backend_change=on_backend_change,
|
|
150
|
+
queue_name=queue_name,
|
|
151
|
+
)
|
|
152
|
+
else:
|
|
153
|
+
if os.getenv("EXECUTION_MODE", "thread").lower() == "distributed":
|
|
154
|
+
raise RuntimeError(
|
|
155
|
+
"EXECUTION_MODE=distributed requires REDIS_URL. "
|
|
156
|
+
"Jobs would be lost on process restart with an in-memory queue. "
|
|
157
|
+
"Set REDIS_URL or switch to EXECUTION_MODE=thread."
|
|
158
|
+
)
|
|
159
|
+
logger.warning(
|
|
160
|
+
"[Queue] REDIS_URL not set — using in-memory queue. "
|
|
161
|
+
"Multi-process distributed execution requires Redis."
|
|
162
|
+
)
|
|
163
|
+
_QUEUE_INSTANCE = InMemoryQueueBackend(metrics=_metrics)
|
|
164
|
+
|
|
165
|
+
_metrics.on_backend_mode_changed(_QUEUE_INSTANCE.backend_name == "redis")
|
|
166
|
+
_metrics.on_snapshot(_QUEUE_INSTANCE.backend_name, _QUEUE_INSTANCE.get_metrics())
|
|
167
|
+
return _QUEUE_INSTANCE
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def reset_queue() -> None:
    """Drop the cached backend so the next ``get_queue()`` call rebuilds it.

    Intended for test teardown, or after ``REDIS_URL`` has been changed.
    Only the backend instance is cleared.
    """
    global _QUEUE_INSTANCE
    _QUEUE_LOCK.acquire()
    try:
        _QUEUE_INSTANCE = None
    finally:
        _QUEUE_LOCK.release()
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def validate_queue_backend() -> DistributedQueueBackend:
    """Fail fast when the configured queue backend is unavailable.

    Obtains the singleton via ``get_queue()``; for a Redis backend
    ``assert_ready()`` is called and any error it raises propagates.  When
    a metrics hook is registered it receives a snapshot and the current
    backend mode.

    Returns:
        The active backend.
    """
    backend = get_queue()
    is_redis = backend.backend_name == "redis"
    if is_redis:
        backend.assert_ready()
    # Identity check: a registered hook object must be used even if it
    # happens to evaluate as falsy (e.g. it defines ``__len__``).
    if _QUEUE_METRICS is not None:
        _QUEUE_METRICS.on_snapshot(backend.backend_name, backend.get_metrics())
        _QUEUE_METRICS.on_backend_mode_changed(is_redis)
    return backend
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def get_queue_health_snapshot() -> dict:
    """Return the active backend's health dict with its fallback reason added.

    The backend's ``health_snapshot()`` result is extended with a
    ``"reason"`` key holding ``backend.fallback_reason``.
    """
    active = get_queue()
    health = active.health_snapshot()
    reason = active.fallback_reason
    health["reason"] = reason
    return health
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def attempt_queue_backend_reconnect() -> bool:
    """Try to promote a degraded in-memory backend back to Redis.

    Returns True if the reconnect succeeded and the singleton was replaced.
    Returns False when ``REDIS_URL`` is unset, when there is no singleton
    yet, when the current backend is already Redis or is not marked
    degraded, or when the new Redis backend could not be made ready.

    Note: the singleton is simply swapped; this function does not migrate
    jobs still held by the in-memory backend that is being replaced.
    """
    global _QUEUE_INSTANCE

    redis_url = os.getenv("REDIS_URL", "")
    if not redis_url:
        return False

    with _QUEUE_LOCK:
        current = _QUEUE_INSTANCE
        if current is None or current.backend_name == "redis" or not current.degraded:
            return False

        queue_name = os.getenv("NODUS_QUEUE_NAME", os.getenv("AINDY_QUEUE_NAME", QUEUE_NAME_DEFAULT))
        # Identity check so a registered (possibly falsy) hook is not
        # silently replaced by a no-op one.
        _metrics = _QUEUE_METRICS if _QUEUE_METRICS is not None else QueueMetrics()
        try:
            candidate = RedisQueueBackend(
                url=redis_url,
                queue_name=queue_name,
                metrics=_metrics,
            )
            candidate.assert_ready()
        except Exception as exc:
            logger.debug("[Queue] Redis reconnect attempt failed: %s", exc)
            return False

        _QUEUE_INSTANCE = candidate

        _metrics.on_backend_mode_changed(True)
        _metrics.on_snapshot(candidate.backend_name, candidate.get_metrics())

        if _QUEUE_ON_BACKEND_CHANGE is not None:
            try:
                _QUEUE_ON_BACKEND_CHANGE("recovered", {"backend": "redis"})
            except Exception:
                # Best-effort notification: the switch has already happened,
                # so record the callback failure rather than hiding it.
                logger.exception("[Queue] on_backend_change callback failed during reconnect")

        logger.info("[Queue] Redis connection restored — queue backend switched to Redis.")
        return True
|
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: nodus-queue
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Distributed job queue with DLQ, delayed jobs, in-flight tracking, Redis backend, and in-memory fallback
|
|
5
|
+
Author: Shawn Knight
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Masterplanner25/nodus-queue
|
|
8
|
+
Project-URL: Repository, https://github.com/Masterplanner25/nodus-queue
|
|
9
|
+
Keywords: queue,redis,distributed,jobs,dlq,nodus
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Requires-Python: >=3.11
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
License-File: LICENSE
|
|
18
|
+
Requires-Dist: tenacity>=8.0.0
|
|
19
|
+
Provides-Extra: redis
|
|
20
|
+
Requires-Dist: redis>=4.0.0; extra == "redis"
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
23
|
+
Requires-Dist: fakeredis>=2.0.0; extra == "dev"
|
|
24
|
+
Requires-Dist: redis>=4.0.0; extra == "dev"
|
|
25
|
+
Dynamic: license-file
|
|
26
|
+
|
|
27
|
+
# nodus-queue
|
|
28
|
+
|
|
29
|
+
Distributed job queue with Dead Letter Queue, delayed jobs, in-flight tracking, and visibility-timeout recovery. Redis backend for multi-instance production; in-memory fallback for dev and tests. Zero hard dependencies beyond `tenacity`.
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install nodus-queue # core + in-memory backend
|
|
35
|
+
pip install "nodus-queue[redis]" # + Redis backend
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Quickstart
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
from nodus_queue import QueueJobPayload, get_queue, reset_queue
|
|
42
|
+
|
|
43
|
+
# In dev/test — in-memory backend (automatic when REDIS_URL is unset)
|
|
44
|
+
q = get_queue()
|
|
45
|
+
|
|
46
|
+
job = QueueJobPayload(job_id="run-123", task_name="agent.run")
|
|
47
|
+
q.enqueue(job)
|
|
48
|
+
|
|
49
|
+
# Worker side
|
|
50
|
+
job = q.dequeue(timeout=5) # blocks up to 5 seconds
|
|
51
|
+
if job:
|
|
52
|
+
try:
|
|
53
|
+
# ... process job ...
|
|
54
|
+
q.ack(job.job_id) # remove from in-flight
|
|
55
|
+
except Exception as e:
|
|
56
|
+
q.fail(job.job_id, str(e)) # move to DLQ
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
## Redis backend
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
REDIS_URL=redis://localhost:6379/0
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from nodus_queue import get_queue
|
|
67
|
+
|
|
68
|
+
q = get_queue() # picks up REDIS_URL automatically
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Delayed jobs
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
# Schedule a job to run after 30 seconds
|
|
75
|
+
q.enqueue_delayed(job, delay_seconds=30)
|
|
76
|
+
|
|
77
|
+
# Promote ready jobs (call periodically in Redis mode)
|
|
78
|
+
count = q.process_delayed_jobs()
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Crash recovery
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
# On worker startup — re-enqueue jobs stuck in-flight for > 5 minutes
|
|
85
|
+
q.requeue_stale_jobs(timeout_seconds=300)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Dead Letter Queue
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
depth = q.get_dlq_depth()
|
|
92
|
+
q.drain_dead_letters() # clear all
|
|
93
|
+
q.remove_dead_letter("job-id-123") # remove one
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
## Optional Prometheus metrics
|
|
97
|
+
|
|
98
|
+
```python
|
|
99
|
+
from prometheus_client import CollectorRegistry, Counter, Gauge
|
|
100
|
+
from nodus_queue import QueueMetrics, get_queue
|
|
101
|
+
|
|
102
|
+
REGISTRY = CollectorRegistry()
|
|
103
|
+
enq = Counter("queue_enqueue_total", "...", ["backend", "outcome"], registry=REGISTRY)
|
|
104
|
+
|
|
105
|
+
class MyMetrics(QueueMetrics):
|
|
106
|
+
def on_enqueue(self, backend, outcome):
|
|
107
|
+
enq.labels(backend=backend, outcome=outcome).inc()
|
|
108
|
+
# override other hooks as needed
|
|
109
|
+
|
|
110
|
+
q = get_queue(metrics=MyMetrics())
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
## Backend change callback
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
def on_change(event: str, payload: dict) -> None:
|
|
117
|
+
print(f"Queue backend changed: {event} {payload}")
|
|
118
|
+
|
|
119
|
+
q = get_queue(on_backend_change=on_change)
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Environment variables
|
|
123
|
+
|
|
124
|
+
| Variable | Default | Purpose |
|
|
125
|
+
|---|---|---|
|
|
126
|
+
| `REDIS_URL` | — | Redis connection URL |
|
|
127
|
+
| `NODUS_QUEUE_NAME` | `nodus:jobs` | Key prefix for all queue Redis keys |
|
|
128
|
+
| `NODUS_QUEUE_MAXSIZE` | `100` | Hard capacity limit |
|
|
129
|
+
| `NODUS_REQUIRE_REDIS` | `false` | Fail on startup if Redis is unavailable |
|
|
130
|
+
| `EXECUTION_MODE` | `thread` | `distributed` requires `REDIS_URL` |
|
|
131
|
+
| `ENV` | — | `production`/`prod` requires `REDIS_URL` |
|
|
132
|
+
| `TESTING` / `TEST_MODE` | — | Auto-select in-memory backend |
|
|
133
|
+
|
|
134
|
+
## Extracted from
|
|
135
|
+
|
|
136
|
+
`AINDY/core/distributed_queue.py` in the A.I.N.D.Y. runtime.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
nodus_queue/__init__.py,sha256=z2fXlGIxazCY52StbTzn4CpHmDXA7bbB08MJLaYxoaw,1734
|
|
2
|
+
nodus_queue/backends.py,sha256=6zZfSsYCEjoJPiBo7t71o2zJ81vnsFrAySh-sGfvNFU,29690
|
|
3
|
+
nodus_queue/metrics.py,sha256=QewCwIz3q2HmP_aPe4Q2Qrs5_i4g8NNCmOiANw5Jw_Q,1950
|
|
4
|
+
nodus_queue/payload.py,sha256=oE7hWnCjYgp3erulkSeXYnBGoIBspFDVQER9T0HJAXM,2040
|
|
5
|
+
nodus_queue/queue.py,sha256=J-QI3RHtwEOZjGIXeIwp7YkZp4Pybb4isXCCvNoio54,8254
|
|
6
|
+
nodus_queue-0.1.0.dist-info/licenses/LICENSE,sha256=fPXtVFKk3GVjyErjlaH2F56cuzEfzln8OP_ElB7cXTI,1069
|
|
7
|
+
nodus_queue-0.1.0.dist-info/METADATA,sha256=6Kcsss0Zwgz2oi7sjhp7_-At-YEY6_3v_12LP54vKO4,3999
|
|
8
|
+
nodus_queue-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
9
|
+
nodus_queue-0.1.0.dist-info/top_level.txt,sha256=oI5yBO0ILWsfpsL554n68uOnegNyQVOUwmXZDWwFeDc,12
|
|
10
|
+
nodus_queue-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Shawn Knight
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
nodus_queue
|