toro-queue 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toro/__init__.py +9 -0
- toro/connection.py +31 -0
- toro/errors.py +15 -0
- toro/job.py +154 -0
- toro/keys.py +108 -0
- toro/py.typed +0 -0
- toro/queue.py +545 -0
- toro/scheduler.py +37 -0
- toro/scripts.py +433 -0
- toro/worker.py +525 -0
- toro_queue-0.1.0.dist-info/METADATA +127 -0
- toro_queue-0.1.0.dist-info/RECORD +14 -0
- toro_queue-0.1.0.dist-info/WHEEL +4 -0
- toro_queue-0.1.0.dist-info/licenses/LICENSE +21 -0
toro/worker.py
ADDED
|
@@ -0,0 +1,525 @@
|
|
|
1
|
+
"""Worker: the consumer side. Pulls jobs and runs a processor over them.
|
|
2
|
+
|
|
3
|
+
Reliability model (this is the core — see DESIGN.md):
|
|
4
|
+
* A blocking BZPOPMIN on the `marker` key only wakes the worker; `MOVE_TO_ACTIVE`
  then pops the highest-priority job id from `prioritized` into `active` and locks + loads it.
|
|
6
|
+
* Job acquisition (lock + load) funnels through ONE Lua routine, shared by the
|
|
7
|
+
blocking path and by fetch-next. That routine is the seed of a future
|
|
8
|
+
`moveToActive`: to add priorities/markers we change only which job it picks.
|
|
9
|
+
* Fetch-next: the finish scripts commit the current job AND acquire the next
|
|
10
|
+
one in the same round trip, so a busy worker loops without going back to the
|
|
11
|
+
blocking pop. It only re-blocks when the queue drains.
|
|
12
|
+
* On pickup the worker locks the job (`<id>:lock = <token> PX lockDuration`)
|
|
13
|
+
and a renewer extends it. If a worker dies, its lock expires and a background
|
|
14
|
+
mark-and-sweep recovers the job. Token-guarded finishes guarantee a result
|
|
15
|
+
is committed exactly once even though a handler may run more than once.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations

import asyncio
import contextlib
import json
import logging
import os
import socket
import time
import traceback
import uuid
from collections.abc import Awaitable, Callable
from typing import Any, cast

from redis.asyncio import Redis

from . import scripts
from .connection import connect
from .job import Job, JobContext, JobOptions
from .keys import Keys
from .scheduler import next_run
|
|
38
|
+
|
|
39
|
+
# A job handler: an async callable that the worker awaits with the claimed `Job`.
# The value it returns is JSON-encoded and committed as the job's result; an
# exception it raises fails the attempt (retry or final failure).
Processor = Callable[[Job], Awaitable[Any]]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _now_ms() -> int:
|
|
43
|
+
return int(time.time() * 1000)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _pairs(flat: list | None) -> dict:
|
|
47
|
+
"""Turn a flat HGETALL array [k, v, k, v, ...] into a dict."""
|
|
48
|
+
if not flat:
|
|
49
|
+
return {}
|
|
50
|
+
it = iter(flat)
|
|
51
|
+
return dict(zip(it, it, strict=False))
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def compute_backoff(backoff: Any, attempts_made: int) -> int:
    """Return the delay in milliseconds to wait before the next attempt.

    Pure function so it can be unit-tested without a Redis-bound Worker.

    Args:
        backoff: The job's backoff option. ``None``, ``0`` or an empty dict means no
            delay. An ``int``/``float`` is a fixed delay in ms. A dict has the shape
            ``{"type": "fixed" | "exponential", "delay": ms}``; a missing or ``None``
            ``delay`` counts as 0, and any ``type`` other than ``"exponential"`` is
            treated as fixed.
        attempts_made: Attempts made so far (1 on the first attempt).

    Returns:
        The delay in ms, never negative. For ``"exponential"`` the delay doubles per
        attempt: ``delay * 2 ** (attempts_made - 1)``.
    """
    if not backoff:
        return 0
    if isinstance(backoff, (int, float)):
        return max(0, int(backoff))
    delay = backoff.get("delay") or 0
    if backoff.get("type") == "exponential":
        # Clamp the exponent: with attempts_made < 1 the factor would be fractional
        # (2 ** -1) and silently halve the very first delay.
        delay = delay * (2 ** max(attempts_made - 1, 0))
    return max(0, int(delay))
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
class Worker:
    """The consumer side: claims jobs, runs the processor, and recovers stalls.

    ``run()`` starts ``concurrency`` process loops plus up to three background loops
    (delayed-job promotion, stalled-job sweep, heartbeat) as asyncio tasks and awaits
    them; ``stop()`` drains and tears them down. Every queue state transition is
    delegated to a Lua script registered in ``__init__`` (see ``scripts``).

    Events, subscribed with ``on(event, fn)`` and called synchronously:

    * ``"completed"`` ``(job, result)`` - the result was committed.
    * ``"retrying"`` / ``"failed"`` ``(job, exc)`` - the processor raised; which of the
      two fires is decided by the outcome the finish script returns.
    * ``"failed"`` ``(job_id, RuntimeError)`` - from the stalled sweep. NOTE: here the
      first argument is the job id string, not a ``Job``.
    * ``"stalled"`` ``(job_id,)`` - the stalled sweep recovered the job.
    * ``"lock-lost"`` ``(job_id,)`` - a finish or a lock renewal found the lock gone.
    * ``"rate-limited"`` ``(retry_ms,)`` - acquisition hit the queue-wide rate limit.
    * ``"error"`` ``(exc,)`` - an unexpected exception was caught in a process loop or
      a lock renewer, and the loop carried on.

    An exception raised by an event handler is logged and never reaches the worker.
    """

    _log = logging.getLogger(__name__)

    def __init__(
        self,
        name: str,
        processor: Processor,
        *,
        connection: Redis | None = None,
        url: str = "redis://localhost:6379",
        prefix: str = "toro",
        concurrency: int = 1,
        rate_limit: dict | None = None,
        block_timeout: float = 5.0,
        lock_duration: int = 30000,
        lock_renew_time: int | None = None,
        renew_locks: bool = True,
        stalled_interval: int = 30000,
        max_stalled_count: int = 1,
        grace_period: float = 30.0,
        heartbeat_interval: int = 5000,
    ) -> None:
        """Configure the worker and register its Lua scripts. Starts no tasks.

        Args:
            name: Queue name; together with ``prefix`` it selects the Redis keys.
            processor: Async callable awaited with each claimed ``Job``.
            connection: Redis client to use. When ``None`` one is created from ``url``.
            url: Redis URL, used only when ``connection`` is ``None``.
            prefix: Key prefix passed to ``Keys``.
            concurrency: Number of process loops ``run()`` starts.
            rate_limit: ``{"max": N, "duration": ms}``, or ``None`` for no limit.
            block_timeout: Seconds; the timeout handed to BZPOPMIN. Also caps how long
                a rate-limited worker sleeps before re-checking.
            lock_duration: Milliseconds a per-job lock lives (``PX lockDuration``).
            lock_renew_time: Milliseconds between lock renewals. ``None`` or ``0``
                falls back to ``lock_duration // 2``.
            renew_locks: Whether to run a lock renewer next to each job.
            stalled_interval: Milliseconds between stalled sweeps, and the default
                sweep throttle. ``<= 0`` disables the sweep loop.
            max_stalled_count: Passed to the stalled-sweep script.
                NOTE(review): presumably the number of stalls tolerated before a job
                is failed - confirm against ``scripts.MOVE_STALLED``.
            grace_period: Seconds ``stop()`` waits for in-flight jobs by default.
            heartbeat_interval: Milliseconds between periodic heartbeats. ``<= 0``
                disables the loop (``run()`` and ``stop()`` still write one each).

        Raises:
            ValueError: If ``rate_limit`` is given without a positive ``max`` and a
                positive ``duration``.
        """
        self.name = name
        self.processor = processor
        self.keys = Keys(name, prefix)
        self.redis = connection if connection is not None else connect(url)
        self.concurrency = concurrency
        # Queue-wide rate limit, shared by all workers via one token bucket in Redis.
        # `{"max": N, "duration": ms}` = at most N jobs per duration. All workers on a
        # queue should pass the SAME config so the shared bucket behaves consistently.
        if rate_limit is not None and (
            int(rate_limit.get("max", 0)) <= 0 or int(rate_limit.get("duration", 0)) <= 0
        ):
            raise ValueError("rate_limit needs {'max': positive, 'duration': positive ms}")
        self.rl_max = int(rate_limit["max"]) if rate_limit else 0
        self.rl_duration = int(rate_limit["duration"]) if rate_limit else 0
        self.block_timeout = block_timeout

        # Reliability knobs.
        self.token = uuid.uuid4().hex
        self.lock_duration = lock_duration
        self.lock_renew_time = lock_renew_time or lock_duration // 2
        self.renew_locks = renew_locks
        self.stalled_interval = stalled_interval
        self.max_stalled_count = max_stalled_count
        self.grace_period = grace_period
        self.heartbeat_interval = heartbeat_interval

        self._running = False
        self._tasks: list[asyncio.Task] = []
        self._process_tasks: list[asyncio.Task] = []

        # Presence + throughput for the "workers" view; flushed to Redis each heartbeat.
        self.started_at = 0
        self._processed = 0
        self._failed = 0
        self._current: set[str] = set()
        # "running" until a graceful stop flips it to "stopping" — the dashboard shows
        # a live "draining" state, and a worker that then vanishes was mid-shutdown,
        # not a crash. (The only honest way to know graceful; absence can't say why.)
        self._state = "running"

        self._move_to_active = self.redis.register_script(scripts.MOVE_TO_ACTIVE)
        self._extend_lock = self.redis.register_script(scripts.EXTEND_LOCK)
        self._move_to_completed = self.redis.register_script(scripts.MOVE_TO_COMPLETED)
        self._move_to_failed = self.redis.register_script(scripts.MOVE_TO_FAILED)
        self._move_stalled = self.redis.register_script(scripts.MOVE_STALLED)
        self._promote_delayed = self.redis.register_script(scripts.PROMOTE_DELAYED)
        self._add_scheduled = self.redis.register_script(scripts.ADD_SCHEDULED)

        # Simple event callbacks: worker.on("completed", fn)
        self._handlers: dict[str, list[Callable]] = {}

    def on(self, event: str, fn: Callable) -> None:
        """Register ``fn`` to be called, synchronously, each time ``event`` is emitted."""
        self._handlers.setdefault(event, []).append(fn)

    def _emit(self, event: str, *args: Any) -> None:
        """Call every handler registered for ``event``; a raising handler is logged."""
        for fn in self._handlers.get(event, []):
            try:
                fn(*args)
            except Exception:
                # A listener bug must not unwind the job state machine: "completed" is
                # emitted after the result is committed, and the finish script may
                # already have handed us the next job.
                self._log.exception("toro worker %r: %r handler raised", self.name, event)

    def _report_error(self, where: str, exc: BaseException) -> None:
        """Log ``exc`` and emit ``"error"`` so a failure we recover from stays visible."""
        self._log.error("toro worker %r: %s failed", self.name, where, exc_info=exc)
        self._emit("error", exc)

    async def run(self) -> None:
        """Start processing until stop() is called. Awaitable forever.

        Writes one heartbeat, then starts the process loops and the background loops
        and awaits them all.
        """
        self._running = True
        self.started_at = _now_ms()
        await self._write_heartbeat()  # register at once so the worker shows up immediately
        self._process_tasks = [
            asyncio.create_task(self._process_loop()) for _ in range(self.concurrency)
        ]
        bg = [asyncio.create_task(self._promote_loop())]
        if self.stalled_interval > 0:
            bg.append(asyncio.create_task(self._stalled_loop()))
        if self.heartbeat_interval > 0:
            bg.append(asyncio.create_task(self._heartbeat_loop()))
        self._tasks = [*self._process_tasks, *bg]
        # stop() cancels these tasks, which surfaces here as CancelledError; returning
        # normally is the intended end of run().
        # NOTE(review): this also swallows a cancellation of the caller's own task
        # (one not initiated by stop()) — confirm that is intended.
        with contextlib.suppress(asyncio.CancelledError):
            await asyncio.gather(*self._tasks)

    async def stop(self, grace_period: float | None = None) -> None:
        """Graceful shutdown: stop fetching new jobs, let in-flight jobs finish
        (up to `grace_period` seconds), then cancel the rest and disconnect.

        Args:
            grace_period: Seconds to wait for in-flight jobs. ``None`` uses the
                ``grace_period`` given to the constructor.
        """
        grace = self.grace_period if grace_period is None else grace_period
        self._running = False
        # Flip to "stopping" (shown as "draining" in the dashboard) and flush it now, so
        # this worker reads as shutting down in real time (a later vanish = graceful, not crash).
        self._state = "stopping"
        with contextlib.suppress(Exception):
            await self._write_heartbeat()
        # Wake an idle worker parked on BZPOPMIN so it notices the shutdown.
        with contextlib.suppress(Exception):  # pragma: no cover
            await self.redis.zadd(self.keys.marker, {"0": 0})
        # Let process loops drain their current job and exit on their own.
        if self._process_tasks:
            await asyncio.wait(self._process_tasks, timeout=grace)
        # Force-cancel anything left (jobs past the grace period + background loops).
        for t in self._tasks:
            t.cancel()
        await asyncio.gather(*self._tasks, return_exceptions=True)
        with contextlib.suppress(Exception):
            await self._deregister()  # drop our presence record so we vanish at once
        # NOTE(review): this also closes a caller-supplied `connection` — confirm
        # callers that share one client between Queue and Worker expect that.
        await self.redis.aclose()

    # ---- presence / heartbeat ---------------------------------------------

    async def _heartbeat_loop(self) -> None:
        """Write a heartbeat every ``heartbeat_interval`` ms while running (best effort)."""
        while self._running:
            await asyncio.sleep(self.heartbeat_interval / 1000)
            with contextlib.suppress(Exception):
                await self._write_heartbeat()

    async def _write_heartbeat(self) -> None:
        """Flush this worker's presence record and register it as live.

        Writes the presence hash at ``keys.worker(token)`` and sets this token's score
        in the ``keys.workers`` sorted set to the current time in ms.
        """
        now = _now_ms()
        await self.redis.hset(
            self.keys.worker(self.token),
            mapping={
                "id": self.token,
                "host": socket.gethostname(),
                "pid": os.getpid(),
                "queue": self.name,
                "concurrency": self.concurrency,
                "started": self.started_at,
                "heartbeat": now,
                "processed": self._processed,
                "failed": self._failed,
                "current": json.dumps(sorted(self._current)),
                "state": self._state,
            },
        )
        await self.redis.zadd(self.keys.workers, {self.token: now})

    async def _deregister(self) -> None:
        """Record a "stopped" departure, then remove this worker's presence from Redis."""
        await self._record_departure("stopped")  # graceful shutdown
        await self.redis.zrem(self.keys.workers, self.token)
        await self.redis.delete(self.keys.worker(self.token))

    async def _record_departure(self, reason: str) -> None:
        """Append to the capped death-log so the dashboard can show what left and why.

        The record is pushed to the head of ``keys.departed``, which is then trimmed
        to its 50 newest entries.
        """
        now = _now_ms()
        rec = json.dumps(
            {
                "id": self.token,
                "host": socket.gethostname(),
                "pid": os.getpid(),
                "queue": self.name,
                "concurrency": self.concurrency,
                "processed": self._processed,
                "failed": self._failed,
                "started": self.started_at,
                "last_seen": now,
                "current": sorted(self._current),  # what it was running at the end
                "reason": reason,
                "at": now,
            }
        )
        await self.redis.lpush(self.keys.departed, rec)
        await self.redis.ltrim(self.keys.departed, 0, 49)

    # ---- the hot path -----------------------------------------------------

    async def _process_loop(self) -> None:
        """Wait for a wake-up, claim a job, and keep handling jobs until none is handed back.

        An unexpected error while claiming or handling is reported (log + ``"error"``
        event) and the loop carries on, so one Redis blip cannot permanently remove a
        process loop while the heartbeat still shows the worker as running.
        """
        while self._running:
            try:
                # The marker only wakes us; the real claim is the atomic
                # MOVE_TO_ACTIVE below. A timeout (None) is fine — we still try
                # to acquire, so a missed marker can never strand a job.
                await self.redis.bzpopmin(self.keys.marker, self.block_timeout)
            except asyncio.CancelledError:
                raise
            except Exception:
                await asyncio.sleep(0.1)
                continue
            if not self._running:
                break  # shutting down — don't claim a new job
            try:
                loaded = await self._acquire()
                # Keep processing as long as each finish hands us the next job.
                # Deliberately NOT gated on `_running`: a job we were handed is already
                # in `active` under our lock, so dropping it because stop() landed
                # during the await would strand it until the lock expires. The chain
                # still ends promptly, because `_fetch_flag()` is "0" once stopping.
                while loaded is not None:
                    loaded = await self._handle(loaded)
            except asyncio.CancelledError:
                raise
            except Exception as exc:
                self._report_error("process loop", exc)
                await asyncio.sleep(0.1)

    async def _acquire(self) -> tuple[str, dict] | None:
        """Pop the highest-priority job into `active`, lock + load it.

        Returns:
            ``(job_id, fields)``, or ``None`` when the script returned nothing,
            returned the rate-limit sentinel, or returned an empty field list.
        """
        res = await self._move_to_active(
            keys=[
                self.keys.prioritized,
                self.keys.active,
                self.keys.marker,
                self.keys.stalled,
                self.keys.base,
                self.keys.pc,
                self.keys.meta_paused,
                self.keys.limiter,
            ],
            args=[self.token, self.lock_duration, _now_ms(), self.rl_max, self.rl_duration],
        )
        if res and res[0] == scripts.RL_SENTINEL:
            await self._on_rate_limited(int(res[1]))
            return None
        return self._loaded(res)

    async def _on_rate_limited(self, retry_ms: int) -> None:
        """Rate limited: wait until a token frees up (the job stays queued, no
        attempt consumed), then re-arm the marker so we re-check immediately.
        Capped at block_timeout so shutdown stays responsive on long waits.
        """
        self._emit("rate-limited", retry_ms)
        await asyncio.sleep(min(retry_ms, self.block_timeout * 1000) / 1000)
        if self._running:
            with contextlib.suppress(Exception):  # pragma: no cover
                await self.redis.zadd(self.keys.marker, {"0": 0})

    def _loaded(self, res: list | None) -> tuple[str, dict] | None:
        """Unpack a MOVE_TO_ACTIVE reply ``[flat_fields, job_id]`` into ``(job_id, fields)``.

        Returns ``None`` for an empty reply or an empty field list.
        """
        if not res:
            return None
        fields = _pairs(res[0])
        if not fields:
            return None
        return (res[1], fields)

    async def _handle(self, loaded: tuple[str, dict]) -> tuple[str, dict] | None:
        """Run the processor on one claimed job and commit the outcome.

        Args:
            loaded: ``(job_id, fields)`` as returned by ``_acquire``.

        Returns:
            The next ``(job_id, fields)`` handed back by the finish script, or ``None``.
        """
        job_id, fields = loaded
        job = Job.from_hash(job_id, fields)
        # Give the handler the ability to report progress and append logs.
        job._ctx = JobContext(  # noqa: SLF001 — the worker injects the job's runtime context
            redis=self.redis,
            job_key=self.keys.job(job_id),
            events_key=self.keys.events,
            logs_key=self.keys.logs(job_id),
            job_id=job_id,
        )
        # A scheduler job mints its successor on first pickup, so the schedule
        # stays on time regardless of how long (or whether) this run succeeds.
        if fields.get("schedulerId") and job.attempts_made == 1:
            await self._schedule_next(fields["schedulerId"])
        renewer = asyncio.create_task(self._renew_loop(job_id)) if self.renew_locks else None
        self._current.add(job_id)  # so the heartbeat reports what we're running
        try:
            result = await self.processor(job)
        except Exception as exc:
            await self.redis.hset(self.keys.job(job_id), "stacktrace", traceback.format_exc())
            self._failed += 1
            nxt = await self._finish_failed(job, exc)
        else:
            self._processed += 1
            nxt = await self._finish_completed(job, result)
        finally:
            self._current.discard(job_id)
            if renewer is not None:
                renewer.cancel()
        return nxt

    async def _finish_completed(self, job: Job, result: Any) -> tuple[str, dict] | None:
        """Commit ``result`` through MOVE_TO_COMPLETED and emit ``"completed"``.

        Emits ``"lock-lost"`` instead, and commits nothing here, when the script
        answers with its LOCK_LOST / NOT_ACTIVE sentinel.

        Returns:
            The next job handed back by the script, or ``None``.

        Raises:
            TypeError: If ``result`` is not JSON-serialisable (from ``json.dumps``).
        """
        res = await self._move_to_completed(
            keys=[
                self.keys.active,
                self.keys.completed,
                self.keys.job(job.id),
                self.keys.lock(job.id),
                self.keys.prioritized,
                self.keys.marker,
                self.keys.stalled,
                self.keys.base,
                self.keys.pc,
                self.keys.events,
                self.keys.meta_paused,
                self.keys.limiter,
            ],
            args=[
                job.id,
                json.dumps(result),
                _now_ms(),
                self.token,
                self._fetch_flag(),
                self.lock_duration,
                *JobOptions.keep_args(job.opts.remove_on_complete),
                self.rl_max,
                self.rl_duration,
            ],
        )
        if res in (scripts.LOCK_LOST, scripts.NOT_ACTIVE):  # finish script's int sentinel
            self._emit("lock-lost", job.id)
            return None
        job.returnvalue = result
        self._emit("completed", job, result)
        return self._next_from(res)

    async def _finish_failed(self, job: Job, exc: Exception) -> tuple[str, dict] | None:
        """Record a failed attempt through MOVE_TO_FAILED and emit ``"failed"`` or ``"retrying"``.

        ``"failed"`` is emitted when the script's outcome is ``OUTCOME_FAILED``,
        ``"retrying"`` otherwise. Emits ``"lock-lost"`` instead when the script answers
        with its LOCK_LOST / NOT_ACTIVE sentinel.

        Returns:
            The next job handed back by the script, or ``None``.
        """
        res = await self._move_to_failed(
            keys=[
                self.keys.active,
                self.keys.prioritized,
                self.keys.delayed,
                self.keys.failed,
                self.keys.job(job.id),
                self.keys.lock(job.id),
                self.keys.marker,
                self.keys.stalled,
                self.keys.base,
                self.keys.pc,
                self.keys.events,
                self.keys.meta_paused,
                self.keys.limiter,
            ],
            args=[
                job.id,
                str(exc),
                _now_ms(),
                job.attempts_made,
                job.opts.attempts,
                self._backoff_delay(job),
                self.token,
                self._fetch_flag(),
                self.lock_duration,
                *JobOptions.keep_args(job.opts.remove_on_fail),
                self.rl_max,
                self.rl_duration,
            ],
        )
        if res in (scripts.LOCK_LOST, scripts.NOT_ACTIVE):  # finish script's int sentinel
            self._emit("lock-lost", job.id)
            return None
        job.failed_reason = str(exc)
        self._emit("failed" if res[0] == scripts.OUTCOME_FAILED else "retrying", job, exc)
        return self._next_from(res)

    async def _schedule_next(self, scheduler_id: str) -> None:
        """Enqueue the next occurrence of a scheduler (idempotent, stops if removed)."""
        template = await self.redis.hgetall(self.keys.scheduler(scheduler_id))
        if not template or await self.redis.zscore(self.keys.repeat, scheduler_id) is None:
            return  # scheduler was removed — stop the chain
        every = int(template["every"]) if template.get("every") else None
        cron = cast("str | None", template.get("cron") or None)
        # Parse everything that can raise BEFORE touching Redis, so a malformed
        # template cannot advance the schedule without enqueueing its occurrence.
        opts = json.loads(template["opts"])
        priority = opts.get("priority", 0)
        now = _now_ms()
        when = next_run(now, every=every, cron=cron)
        await self.redis.zadd(self.keys.repeat, {scheduler_id: when})
        await self._add_scheduled(
            keys=[self.keys.delayed, self.keys.base],
            args=[
                f"repeat:{scheduler_id}:{when}",
                template["name"],
                template["data"],
                template["opts"],
                now,
                when,
                priority,
                scheduler_id,
            ],
        )

    def _fetch_flag(self) -> str:
        """Return ``"1"`` to ask the finish script for the next job, ``"0"`` when stopping."""
        # Don't fetch a next job while shutting down — let the queue drain cleanly.
        return "1" if self._running else "0"

    def _next_from(self, res: Any) -> tuple[str, dict] | None:
        """Unpack the fetched-next job from a finish reply ``[outcome, flat_fields, job_id]``.

        Returns ``None`` when the reply carries no next job or its field list is empty
        (the same guard ``_loaded`` applies to a fresh claim).
        """
        if isinstance(res, (list, tuple)) and len(res) >= 3:
            fields = _pairs(res[1])
            if fields:
                return (res[2], fields)
        return None

    # ---- locks & recovery -------------------------------------------------

    async def _renew_loop(self, job_id: str) -> None:
        """Extend the job's lock every ``lock_renew_time`` ms until cancelled or lost.

        Only a falsy reply from EXTEND_LOCK means the lock is gone: ``"lock-lost"`` is
        emitted and the loop ends. An exception while calling the script is reported
        and retried after a quarter of the interval, because the lock may still be
        held and has roughly ``lock_duration - lock_renew_time`` ms left to live.
        """
        interval = self.lock_renew_time / 1000
        delay = interval
        while True:
            await asyncio.sleep(delay)
            try:
                ok = await self._extend_lock(
                    keys=[self.keys.lock(job_id), self.keys.stalled],
                    args=[self.token, self.lock_duration, job_id],
                )
            except asyncio.CancelledError:
                raise
            except Exception as exc:
                self._report_error(f"lock renewal for job {job_id}", exc)
                delay = interval / 4
                continue
            delay = interval
            if not ok:
                self._emit("lock-lost", job_id)
                return

    async def _promote_loop(self) -> None:
        """Run PROMOTE_DELAYED once a second while running (best effort)."""
        while self._running:
            try:
                await self._promote_delayed(
                    keys=[
                        self.keys.delayed,
                        self.keys.prioritized,
                        self.keys.marker,
                        self.keys.base,
                        self.keys.pc,
                    ],
                    args=[_now_ms()],
                )
            except asyncio.CancelledError:
                raise
            except Exception:  # pragma: no cover - best-effort background sweep
                pass
            await asyncio.sleep(1.0)

    async def _stalled_loop(self) -> None:
        """Sweep for stalled jobs every ``stalled_interval`` ms and emit the results.

        Emits ``"stalled"`` for each recovered job id and ``"failed"`` (with the job
        id and a ``RuntimeError``) for each job id the sweep failed.
        """
        while self._running:
            await asyncio.sleep(self.stalled_interval / 1000)
            try:
                failed, recovered = await self.check_stalled()
            except asyncio.CancelledError:
                raise
            except Exception:  # pragma: no cover - best-effort background sweep
                continue
            for job_id in recovered:
                self._emit("stalled", job_id)
            for job_id in failed:
                self._emit("failed", job_id, RuntimeError("job stalled too many times"))

    async def check_stalled(self, throttle_ms: int | None = None) -> tuple[list[str], list[str]]:
        """Run one mark-and-sweep pass. Returns (failed_ids, recovered_ids).

        `throttle_ms=0` bypasses the cross-worker throttle (used by tests); by
        default the throttle is `stalled_interval` so concurrent workers don't
        all sweep at once.
        """
        throttle = self.stalled_interval if throttle_ms is None else throttle_ms
        res = await self._move_stalled(
            keys=[
                self.keys.stalled,
                self.keys.active,
                self.keys.prioritized,
                self.keys.failed,
                self.keys.stalled_check,
                self.keys.base,
                self.keys.marker,
                self.keys.pc,
            ],
            args=[self.max_stalled_count, _now_ms(), throttle],
        )
        failed = list(res[0]) if res else []
        recovered = list(res[1]) if res and len(res) > 1 else []
        return failed, recovered

    def _backoff_delay(self, job: Job) -> int:
        """Return the retry delay in ms for ``job`` from its backoff option and attempt count."""
        return compute_backoff(job.opts.backoff, job.attempts_made)
|
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: toro-queue
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: An async-first, Redis-backed job queue for Python.
|
|
5
|
+
Project-URL: Homepage, https://github.com/ilovepixelart/toro
|
|
6
|
+
Project-URL: Repository, https://github.com/ilovepixelart/toro
|
|
7
|
+
Project-URL: Issues, https://github.com/ilovepixelart/toro/issues
|
|
8
|
+
Author: ilovepixelart
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: async,asyncio,jobs,queue,redis,scheduler,task-queue,worker
|
|
12
|
+
Classifier: Development Status :: 3 - Alpha
|
|
13
|
+
Classifier: Framework :: AsyncIO
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
21
|
+
Classifier: Topic :: System :: Distributed Computing
|
|
22
|
+
Classifier: Typing :: Typed
|
|
23
|
+
Requires-Python: >=3.10
|
|
24
|
+
Requires-Dist: croniter>=2.0
|
|
25
|
+
Requires-Dist: redis>=5.0
|
|
26
|
+
Description-Content-Type: text/markdown
|
|
27
|
+
|
|
28
|
+
# toro 🐂
|
|
29
|
+
|
|
30
|
+
An **async-first**, Redis-backed job queue for Python. Every state transition is
|
|
31
|
+
an atomic Lua script; producing and processing are `asyncio` end to end.
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install toro-queue # the import name is `toro`
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
> Installed as **`toro-queue`** on PyPI (the name `toro` was taken), but you
|
|
38
|
+
> `import toro`. See [DESIGN.md](https://github.com/ilovepixelart/toro/blob/main/DESIGN.md) for the architecture and the
|
|
39
|
+
> at-least-once reliability model.
|
|
40
|
+
|
|
41
|
+
## Why toro
|
|
42
|
+
|
|
43
|
+
- **Async-native.** Enqueue and process with `async`/`await` — no thread pools,
|
|
44
|
+
no sync bridge. A natural fit for FastAPI, aiohttp, or any asyncio app.
|
|
45
|
+
- **Atomic by construction.** Claims, retries, promotions and finishes are Lua
|
|
46
|
+
scripts, so a job can't be lost or double-committed between two round trips.
|
|
47
|
+
- **At-least-once delivery.** Per-job locks + a background mark-and-sweep recover
|
|
48
|
+
jobs from workers that crashed — without the visibility-timeout double-delivery
|
|
49
|
+
trap of some other queues.
|
|
50
|
+
- **Typed.** Ships `py.typed`; the public API is fully annotated.
|
|
51
|
+
|
|
52
|
+
## Features
|
|
53
|
+
|
|
54
|
+
| | |
|
|
55
|
+
|---|---|
|
|
56
|
+
| **Enqueue** | delayed jobs, global **priorities** (FIFO within a band) |
|
|
57
|
+
| **Retries** | fixed or exponential **backoff**, capped attempts |
|
|
58
|
+
| **Schedules** | repeatable **cron** and fixed-interval (`every`) jobs |
|
|
59
|
+
| **Rate limiting** | queue-wide token bucket shared across all workers |
|
|
60
|
+
| **Dedup** | custom (idempotent) job ids + a throttle window (`{id, ttl}`) |
|
|
61
|
+
| **Auto-removal** | keep the last N and/or finished-within-age completed/failed |
|
|
62
|
+
| **Reliability** | per-job locks, lock renewal, stalled-job recovery |
|
|
63
|
+
| **Observability** | progress, per-job logs, lifecycle events, `await result()` |
|
|
64
|
+
| **Lifecycle** | pause / resume, graceful shutdown that drains in-flight jobs |
|
|
65
|
+
| **Dashboard** | [matador](https://github.com/ilovepixelart/matador) — a live web UI |
|
|
66
|
+
|
|
67
|
+
## Quick start
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
import asyncio
|
|
71
|
+
from toro import Queue, Worker
|
|
72
|
+
|
|
73
|
+
async def main():
|
|
74
|
+
queue = Queue("emails")
|
|
75
|
+
await queue.add("welcome", {"to": "ada@example.com"})
|
|
76
|
+
|
|
77
|
+
async def process(job):
|
|
78
|
+
print("sending", job.data)
|
|
79
|
+
return {"ok": True}
|
|
80
|
+
|
|
81
|
+
worker = Worker("emails", process, concurrency=8)
|
|
82
|
+
worker.on("completed", lambda job, result: print("done", job.id))
|
|
83
|
+
await worker.run()
|
|
84
|
+
|
|
85
|
+
asyncio.run(main())
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## A taste of the options
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
# Priorities, delay, and retry-with-backoff
|
|
92
|
+
await queue.add("report", data, priority=10, delay=5000,
|
|
93
|
+
attempts=5, backoff={"type": "exponential", "delay": 1000})
|
|
94
|
+
|
|
95
|
+
# Idempotent custom id (a second add with the same id is ignored)
|
|
96
|
+
await queue.add("charge", data, job_id="order-1234")
|
|
97
|
+
|
|
98
|
+
# A repeatable schedule (cron or every-N-ms); "run now" with trigger_scheduler
|
|
99
|
+
await queue.add_scheduler("nightly-rollup", cron="0 0 * * *")
|
|
100
|
+
|
|
101
|
+
# Queue-wide rate limit: at most 100 jobs / second across every worker
|
|
102
|
+
worker = Worker("emails", process, rate_limit={"max": 100, "duration": 1000})
|
|
103
|
+
|
|
104
|
+
# Wait for a result from the producer side
|
|
105
|
+
job = await queue.add("resize", {"src": "a.png"})
|
|
106
|
+
print(await job.result(timeout=30))
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Develop
|
|
110
|
+
|
|
111
|
+
Managed with [uv](https://astral.sh/uv); the Astral toolchain throughout.
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
uv sync # venv + deps + dev group
|
|
115
|
+
uv run ruff check . # lint (strict: select = ALL)
|
|
116
|
+
uv run ruff format . # format
|
|
117
|
+
uv run ty check # type check
|
|
118
|
+
uv run pytest -m "unit or integration" # tests (integration needs Redis on :6379)
|
|
119
|
+
uv run python examples/basic.py
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
The suite is a pyramid — `-m unit` (fast, no Redis), `-m integration` (Redis),
|
|
123
|
+
and `-m load` (the open-loop benchmark harness in `tests/load/`).
|
|
124
|
+
|
|
125
|
+
## License
|
|
126
|
+
|
|
127
|
+
[MIT](https://github.com/ilovepixelart/toro/blob/main/LICENSE)
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
toro/__init__.py,sha256=oR4z6jIOh3ZyIZg1tjV_RR1RWFucXiQJVnAv_TDci1Y,325
|
|
2
|
+
toro/connection.py,sha256=zpVYmcTd73Im8b518PoCNSDhkcVSpsWYwW5R8WZuD4o,1251
|
|
3
|
+
toro/errors.py,sha256=B1wAPzQtnIjz9ihHbhsmVur7zUcvB_lSnL2VnsMmCcs,370
|
|
4
|
+
toro/job.py,sha256=M1f6MqKcV1rnNxlRs_pbX1B9hBK_vLNNApu0NSXBqqk,6126
|
|
5
|
+
toro/keys.py,sha256=VhtYGdS6P1Q-1xbigptD7m_EgdYG6NfPFbagCdv_2SA,3333
|
|
6
|
+
toro/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
7
|
+
toro/queue.py,sha256=2UZsaPc4gxKlER_RP7cOJ15nnDNolKO07EXEvnmSJgE,23117
|
|
8
|
+
toro/scheduler.py,sha256=CQTNgQbd4SDhRL2seN-jpt8cpl3gGTrLX2k4h6-Wgc4,1619
|
|
9
|
+
toro/scripts.py,sha256=N3tIIg7GVyQ_WrpuVZpMGMYP_QDyyISZtSt-F6VMqRI,18473
|
|
10
|
+
toro/worker.py,sha256=khv7uFe3KmaFzpXfuf6aid3IaN1wyisKfiAQslTxOSs,21515
|
|
11
|
+
toro_queue-0.1.0.dist-info/METADATA,sha256=09M4Bg1W2ZYrPw54zw5rqDKcMS5eFrtZ752dPoPEm6k,4800
|
|
12
|
+
toro_queue-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
13
|
+
toro_queue-0.1.0.dist-info/licenses/LICENSE,sha256=CM5Sh4ZaqlQbqgsjTUvl-A7O0YipFw4V5aZt1K6sLC4,1070
|
|
14
|
+
toro_queue-0.1.0.dist-info/RECORD,,
|