toro-queue 0.1.0__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toro_queue-0.2.0/.github/workflows/pr-check.yaml +88 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/.github/workflows/release.yml +4 -4
- {toro_queue-0.1.0 → toro_queue-0.2.0}/.gitignore +1 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/.pre-commit-config.yaml +1 -1
- toro_queue-0.2.0/.vscode/extensions.json +8 -0
- toro_queue-0.2.0/.vscode/settings.json +19 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/PKG-INFO +18 -3
- {toro_queue-0.1.0 → toro_queue-0.2.0}/README.md +17 -2
- toro_queue-0.2.0/docs/architecture.md +143 -0
- toro_queue-0.2.0/docs/concepts.md +102 -0
- toro_queue-0.2.0/docs/data-model.md +63 -0
- toro_queue-0.2.0/docs/index.md +20 -0
- toro_queue-0.2.0/docs/processing.md +112 -0
- toro_queue-0.2.0/docs/producing.md +107 -0
- toro_queue-0.2.0/docs/reliability.md +108 -0
- toro_queue-0.2.0/docs/scheduling.md +71 -0
- toro_queue-0.2.0/examples/README.md +19 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/pyproject.toml +6 -2
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/conftest.py +16 -0
- toro_queue-0.2.0/tests/integration/test_admin_ordering.py +66 -0
- toro_queue-0.2.0/tests/integration/test_finished_retention.py +84 -0
- toro_queue-0.2.0/tests/integration/test_result_dispatcher.py +162 -0
- toro_queue-0.2.0/tests/integration/test_worker_resilience.py +82 -0
- toro_queue-0.2.0/tests/load/test_active_list_cost.py +118 -0
- toro_queue-0.2.0/tests/load/test_admin_scaling.py +83 -0
- toro_queue-0.2.0/tests/load/test_enqueue_rtt.py +65 -0
- toro_queue-0.2.0/tests/load/test_promote_blocking.py +107 -0
- toro_queue-0.2.0/tests/load/test_result_fanout.py +112 -0
- toro_queue-0.2.0/tests/load/test_worker_concurrency.py +52 -0
- toro_queue-0.2.0/toro/__init__.py +22 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/connection.py +11 -2
- {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/job.py +30 -10
- {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/queue.py +165 -53
- {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/scheduler.py +4 -2
- {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/scripts.py +33 -7
- {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/worker.py +79 -42
- {toro_queue-0.1.0 → toro_queue-0.2.0}/uv.lock +21 -21
- toro_queue-0.1.0/.github/workflows/ci.yml +0 -41
- toro_queue-0.1.0/DESIGN.md +0 -114
- toro_queue-0.1.0/toro/__init__.py +0 -9
- {toro_queue-0.1.0 → toro_queue-0.2.0}/LICENSE +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/bench/bench.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/examples/basic.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/examples/stalled.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_admin.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_connection.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_introspection.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_processing.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_reliability.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_retries.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_scheduler.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_workers.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/load/harness.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/load/test_load.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_backoff.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_job.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_job_options.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_keys.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_priority.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_scheduler.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/errors.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/keys.py +0 -0
- {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/py.typed +0 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
name: PR check
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
workflow_dispatch:
|
|
5
|
+
push:
|
|
6
|
+
branches:
|
|
7
|
+
- main
|
|
8
|
+
- feature*
|
|
9
|
+
pull_request:
|
|
10
|
+
types: [opened, synchronize, reopened]
|
|
11
|
+
|
|
12
|
+
# Read-only by default — nothing here writes to the repo. SonarCloud PR
|
|
13
|
+
# decoration comes from the SonarCloud GitHub App, not GITHUB_TOKEN write scopes.
|
|
14
|
+
permissions:
|
|
15
|
+
contents: read
|
|
16
|
+
|
|
17
|
+
# A new push to the same branch/PR supersedes the previous run.
|
|
18
|
+
concurrency:
|
|
19
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
20
|
+
cancel-in-progress: true
|
|
21
|
+
|
|
22
|
+
jobs:
|
|
23
|
+
check:
|
|
24
|
+
name: Lint, types, tests (py${{ matrix.python-version }})
|
|
25
|
+
runs-on: ubuntu-latest
|
|
26
|
+
strategy:
|
|
27
|
+
fail-fast: false
|
|
28
|
+
matrix:
|
|
29
|
+
# Everything the classifiers claim to support.
|
|
30
|
+
python-version: ["3.10", "3.11", "3.12", "3.13"]
|
|
31
|
+
permissions:
|
|
32
|
+
contents: read
|
|
33
|
+
services:
|
|
34
|
+
redis:
|
|
35
|
+
image: redis:7-alpine
|
|
36
|
+
ports:
|
|
37
|
+
- 6379:6379
|
|
38
|
+
options: >-
|
|
39
|
+
--health-cmd "redis-cli ping"
|
|
40
|
+
--health-interval 10s
|
|
41
|
+
--health-timeout 5s
|
|
42
|
+
--health-retries 5
|
|
43
|
+
steps:
|
|
44
|
+
- uses: actions/checkout@v6.0.3
|
|
45
|
+
with:
|
|
46
|
+
fetch-depth: 0 # full history improves Sonar new-code/blame relevancy
|
|
47
|
+
persist-credentials: false
|
|
48
|
+
|
|
49
|
+
- name: Install uv
|
|
50
|
+
uses: astral-sh/setup-uv@v8.2.0
|
|
51
|
+
with:
|
|
52
|
+
enable-cache: true
|
|
53
|
+
python-version: ${{ matrix.python-version }}
|
|
54
|
+
|
|
55
|
+
- name: Sync dependencies
|
|
56
|
+
run: uv sync --locked
|
|
57
|
+
|
|
58
|
+
- name: Lint (ruff)
|
|
59
|
+
run: uv run ruff check .
|
|
60
|
+
|
|
61
|
+
- name: Format (ruff)
|
|
62
|
+
run: uv run ruff format --check .
|
|
63
|
+
|
|
64
|
+
- name: Type check (ty)
|
|
65
|
+
run: uv run ty check
|
|
66
|
+
|
|
67
|
+
- name: Tests (unit + integration)
|
|
68
|
+
run: uv run pytest -m "unit or integration" --cov=toro --cov-report=xml
|
|
69
|
+
|
|
70
|
+
# Runs only when SONAR_TOKEN is set (skipped on forks / before setup, so the
|
|
71
|
+
# check stays green), and only once per matrix — one coverage upload.
|
|
72
|
+
# Config is passed inline; there is no sonar-project.properties.
|
|
73
|
+
- name: SonarCloud scan
|
|
74
|
+
if: matrix.python-version == '3.13' && env.SONAR_TOKEN
|
|
75
|
+
uses: SonarSource/sonarqube-scan-action@v8.2.0
|
|
76
|
+
env:
|
|
77
|
+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
78
|
+
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
|
|
79
|
+
with:
|
|
80
|
+
args: >
|
|
81
|
+
-Dsonar.organization=ilovepixelart
|
|
82
|
+
-Dsonar.projectName=toro
|
|
83
|
+
-Dsonar.projectKey=ilovepixelart_toro
|
|
84
|
+
-Dsonar.python.coverage.reportPaths=coverage.xml
|
|
85
|
+
-Dsonar.sources=toro
|
|
86
|
+
-Dsonar.tests=tests
|
|
87
|
+
-Dsonar.test.exclusions=tests/**
|
|
88
|
+
-Dsonar.coverage.exclusions=tests/**
|
|
@@ -12,12 +12,12 @@ jobs:
|
|
|
12
12
|
build:
|
|
13
13
|
runs-on: ubuntu-latest
|
|
14
14
|
steps:
|
|
15
|
-
- uses: actions/checkout@
|
|
15
|
+
- uses: actions/checkout@v6.0.3
|
|
16
16
|
- name: Install uv
|
|
17
|
-
uses: astral-sh/setup-uv@
|
|
17
|
+
uses: astral-sh/setup-uv@v8.2.0
|
|
18
18
|
- name: Build sdist + wheel
|
|
19
19
|
run: uv build
|
|
20
|
-
- uses: actions/upload-artifact@
|
|
20
|
+
- uses: actions/upload-artifact@v7.0.1
|
|
21
21
|
with:
|
|
22
22
|
name: dist
|
|
23
23
|
path: dist/
|
|
@@ -29,7 +29,7 @@ jobs:
|
|
|
29
29
|
permissions:
|
|
30
30
|
id-token: write
|
|
31
31
|
steps:
|
|
32
|
-
- uses: actions/download-artifact@
|
|
32
|
+
- uses: actions/download-artifact@v8.0.1
|
|
33
33
|
with:
|
|
34
34
|
name: dist
|
|
35
35
|
path: dist/
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
{
|
|
2
|
+
"[python]": {
|
|
3
|
+
"editor.defaultFormatter": "charliermarsh.ruff",
|
|
4
|
+
"editor.formatOnSave": true,
|
|
5
|
+
"editor.codeActionsOnSave": {
|
|
6
|
+
"source.fixAll.ruff": "explicit",
|
|
7
|
+
"source.organizeImports.ruff": "explicit"
|
|
8
|
+
}
|
|
9
|
+
},
|
|
10
|
+
"editor.rulers": [100],
|
|
11
|
+
"python.testing.pytestEnabled": true,
|
|
12
|
+
"python.testing.unittestEnabled": false,
|
|
13
|
+
"cSpell.words": [
|
|
14
|
+
"BZPOPMIN",
|
|
15
|
+
"keepalive",
|
|
16
|
+
"ZPOPMIN",
|
|
17
|
+
"ZSET"
|
|
18
|
+
]
|
|
19
|
+
}
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: toro-queue
|
|
3
|
-
Version: 0.1.0
|
|
3
|
+
Version: 0.2.0
|
|
4
4
|
Summary: An async-first, Redis-backed job queue for Python.
|
|
5
5
|
Project-URL: Homepage, https://github.com/ilovepixelart/toro
|
|
6
6
|
Project-URL: Repository, https://github.com/ilovepixelart/toro
|
|
@@ -30,13 +30,28 @@ Description-Content-Type: text/markdown
|
|
|
30
30
|
An **async-first**, Redis-backed job queue for Python. Every state transition is
|
|
31
31
|
an atomic Lua script; producing and processing are `asyncio` end to end.
|
|
32
32
|
|
|
33
|
+
[](https://pypi.org/project/toro-queue/)
|
|
34
|
+
\
|
|
35
|
+
[](https://pypi.org/project/toro-queue/)
|
|
36
|
+
[](https://pepy.tech/project/toro-queue)
|
|
37
|
+
[](https://github.com/ilovepixelart/toro/blob/main/LICENSE)
|
|
38
|
+
\
|
|
39
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
40
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
41
|
+
\
|
|
42
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
43
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
44
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
45
|
+
|
|
33
46
|
```bash
|
|
34
47
|
pip install toro-queue # the import name is `toro`
|
|
35
48
|
```
|
|
36
49
|
|
|
37
50
|
> Installed as **`toro-queue`** on PyPI (the name `toro` was taken), but you
|
|
38
|
-
> `import toro`. See [
|
|
39
|
-
>
|
|
51
|
+
> `import toro`. See the [docs](https://github.com/ilovepixelart/toro/tree/main/docs) for the
|
|
52
|
+
> architecture, the reliability model, and the detailed guides.
|
|
53
|
+
|
|
54
|
+
Pairs with **[matador](https://github.com/ilovepixelart/matador)**, a live web dashboard for your queues.
|
|
40
55
|
|
|
41
56
|
## Why toro
|
|
42
57
|
|
|
@@ -3,13 +3,28 @@
|
|
|
3
3
|
An **async-first**, Redis-backed job queue for Python. Every state transition is
|
|
4
4
|
an atomic Lua script; producing and processing are `asyncio` end to end.
|
|
5
5
|
|
|
6
|
+
[](https://pypi.org/project/toro-queue/)
|
|
7
|
+
\
|
|
8
|
+
[](https://pypi.org/project/toro-queue/)
|
|
9
|
+
[](https://pepy.tech/project/toro-queue)
|
|
10
|
+
[](https://github.com/ilovepixelart/toro/blob/main/LICENSE)
|
|
11
|
+
\
|
|
12
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
13
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
14
|
+
\
|
|
15
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
16
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
17
|
+
[](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
|
|
18
|
+
|
|
6
19
|
```bash
|
|
7
20
|
pip install toro-queue # the import name is `toro`
|
|
8
21
|
```
|
|
9
22
|
|
|
10
23
|
> Installed as **`toro-queue`** on PyPI (the name `toro` was taken), but you
|
|
11
|
-
> `import toro`. See [
|
|
12
|
-
>
|
|
24
|
+
> `import toro`. See the [docs](https://github.com/ilovepixelart/toro/tree/main/docs) for the
|
|
25
|
+
> architecture, the reliability model, and the detailed guides.
|
|
26
|
+
|
|
27
|
+
Pairs with **[matador](https://github.com/ilovepixelart/matador)**, a live web dashboard for your queues.
|
|
13
28
|
|
|
14
29
|
## Why toro
|
|
15
30
|
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# Architecture
|
|
2
|
+
|
|
3
|
+
How toro's core works, and why. Every state transition is an atomic Lua script;
|
|
4
|
+
every job is durable in Redis.
|
|
5
|
+
|
|
6
|
+
> Prior art: the atomic-Lua and lock/stalled-recovery patterns come from the
|
|
7
|
+
> Node.js Redis-queue ecosystem; the specifics are toro's own.
|
|
8
|
+
|
|
9
|
+
## Atomic state transitions via Lua
|
|
10
|
+
|
|
11
|
+
Every state move (`wait→active`, `active→completed/failed/delayed`,
|
|
12
|
+
`delayed→wait`) is a single Redis Lua script, run atomically, so multi-key
|
|
13
|
+
"check-then-act" sequences can't interleave. That removes whole classes of race:
|
|
14
|
+
|
|
15
|
+
- **pop-then-lock gap** — two workers claiming the same job: the claim pops from
|
|
16
|
+
the priority set and sets the lock inside one script.
|
|
17
|
+
- **finish-after-steal** — a worker committing a result for a job a stalled sweep
|
|
18
|
+
already re-queued: guarded by a token check plus `LREM active` returning 0.
|
|
19
|
+
|
|
20
|
+
Scripts live in `scripts.py`, registered with `redis.asyncio`'s `register_script`.
|
|
21
|
+
The Python side only assembles KEYS/ARGV; the guarantees live in the Lua.
|
|
22
|
+
|
|
23
|
+
## Claiming a job: the prioritized set + a wakeup marker
|
|
24
|
+
|
|
25
|
+
All waiting jobs live in one `prioritized` ZSET, scored
|
|
26
|
+
`(PRIORITY_OFFSET - priority) * 2^32 + seq` — a single global order where higher
|
|
27
|
+
priority is more urgent and ties stay FIFO (`seq` is a per-queue counter). This
|
|
28
|
+
*is* the `wait` state; there is no separate fast-lane list, so a low-priority job
|
|
29
|
+
can't starve a high-priority one.
|
|
30
|
+
|
|
31
|
+
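The atomic claim runs as a Lua script, and a script can't block waiting for a job (blocking pops such as `BZPOPMIN` don't block inside Lua), so wakeup uses a small **base marker**: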
|
|
32
|
+
producers `ZADD marker 0 "0"` (idempotent) on enqueue, and idle workers park on
|
|
33
|
+
`BZPOPMIN marker`. The marker only wakes a worker; the real claim is the atomic
|
|
34
|
+
`MOVE_TO_ACTIVE` (`ZPOPMIN prioritized` → push to `active` → set the lock → load
|
|
35
|
+
the job). Because the claim is atomic and idempotent, a missed marker can never
|
|
36
|
+
strand a job.
|
|
37
|
+
|
|
38
|
+
## Fetch-next inside finish
|
|
39
|
+
|
|
40
|
+
A busy worker doesn't go back to the blocking wait between jobs. The finish
|
|
41
|
+
scripts (`MOVE_TO_COMPLETED` / `MOVE_TO_FAILED`) commit the current job **and**
|
|
42
|
+
claim the next one in the same round trip; the worker only re-parks on the marker
|
|
43
|
+
when the queue is empty (or it's shutting down, signaled by a fetch flag, so it
|
|
44
|
+
drains cleanly). All claiming funnels through one shared Lua routine
|
|
45
|
+
(`lockAndLoad` / `acquireNext`), used by both the wakeup path and fetch-next.
|
|
46
|
+
|
|
47
|
+
It's mainly a round-trip win: at concurrency 20, process throughput is roughly
|
|
48
|
+
2.3× a claim-per-job design, because the separate per-job claim and load collapse
|
|
49
|
+
into the finish call.
|
|
50
|
+
|
|
51
|
+
## At-least-once: locks, tokens, and stalled recovery
|
|
52
|
+
|
|
53
|
+
The reliability core ([Reliability](reliability.md) is the full guide):
|
|
54
|
+
|
|
55
|
+
- On claim, the job gets a lock `<id>:lock = <token>` with `PX lockDuration`
|
|
56
|
+
(default 30s). The token is the claiming worker's; only it can renew or finish.
|
|
57
|
+
- A per-job renewer extends the lock on a timer and clears the job from the
|
|
58
|
+
`stalled` set while it's alive.
|
|
59
|
+
- A background sweep runs every `stalled_interval` (throttled cluster-wide by a
|
|
60
|
+
`stalled-check` key): any job in `stalled` whose lock has expired is recovered
|
|
61
|
+
(`LREM active`, back to the prioritized set, or failed after
|
|
62
|
+
`max_stalled_count`), then the current `active` list is re-marked as stalled.
|
|
63
|
+
|
|
64
|
+
The guarantee is **at-least-once**: a job is never lost while Redis persists, but
|
|
65
|
+
its handler can run more than once (bounded by `max_stalled_count`) if a worker
|
|
66
|
+
dies mid-job. Exactly-once *result commit* is enforced by the token-guarded lock
|
|
67
|
+
at finish, not by preventing duplicate handler runs.
|
|
68
|
+
|
|
69
|
+
## Delayed jobs
|
|
70
|
+
|
|
71
|
+
Delayed jobs and retries with backoff sit in a `delayed` ZSET scored by their
|
|
72
|
+
process-at timestamp (ms). A one-second promotion loop in the worker moves any
|
|
73
|
+
due jobs into the prioritized set.
|
|
74
|
+
|
|
75
|
+
## Higher-level features
|
|
76
|
+
|
|
77
|
+
- **Priorities** — every job is in the one prioritized ZSET above, so priority is
|
|
78
|
+
a single global order with no starvation, FIFO within a band.
|
|
79
|
+
- **Repeatable / cron** — `add_scheduler(every=ms | cron=...)` stores a template
|
|
80
|
+
and enqueues the first occurrence as a delayed job; each occurrence mints its
|
|
81
|
+
successor with a deterministic id when a worker picks it up. `trigger_scheduler`
|
|
82
|
+
runs one now, `remove_scheduler` stops the chain. See [Scheduling](scheduling.md).
|
|
83
|
+
- **Rate limiting** — a queue-wide token bucket in Redis
|
|
84
|
+
(`Worker(rate_limit={"max": N, "duration": ms})`), shared by every worker on the
|
|
85
|
+
queue. An over-limit claim returns a sentinel and the worker waits out the window.
|
|
86
|
+
- **Events** — Redis pub/sub on an `events` channel (`added`, `progress`,
|
|
87
|
+
`completed`, `failed`); `Queue.result()` awaits the terminal event and
|
|
88
|
+
`Worker.on(event, fn)` exposes in-process hooks. See [Concepts](concepts.md).
|
|
89
|
+
- **Auto-removal** — `remove_on_complete` / `remove_on_fail` (bool / count /
|
|
90
|
+
`{count, age}`) enforced inside the finish script, not by a separate sweeper.
|
|
91
|
+
|
|
92
|
+
## The Lua scripts
|
|
93
|
+
|
|
94
|
+
Every state change is a Lua script in `scripts.py`, registered once per process
|
|
95
|
+
with `register_script` (run by `EVALSHA`). Python only assembles `KEYS`/`ARGV`.
|
|
96
|
+
|
|
97
|
+
The scripts share a small library of routines:
|
|
98
|
+
|
|
99
|
+
| Routine | Does |
|
|
100
|
+
|---|---|
|
|
101
|
+
| `priorityScore` | Packs `(PRIORITY_OFFSET - priority) * 2^32 + seq` for the prioritized ZSET. |
|
|
102
|
+
| `enqueue` | Adds a job to `prioritized` at its score and arms the marker. |
|
|
103
|
+
| `lockAndLoad` | Sets the lock token and loads the hash for a just-claimed id. |
|
|
104
|
+
| `acquireNext` | Pops the top prioritized job into `active` and locks it, honoring the rate limit. |
|
|
105
|
+
| `tryRateLimit` | Token bucket: ms until a token frees, or 0 to proceed. |
|
|
106
|
+
| `recordFinished` | Records a terminal job in `completed`/`failed` and applies auto-removal. |
|
|
107
|
+
|
|
108
|
+
And the scripts themselves:
|
|
109
|
+
|
|
110
|
+
| Script | Caller | Does |
|
|
111
|
+
|---|---|---|
|
|
112
|
+
| `ADD_JOB` | producer | Mint/accept an id, write the hash, enqueue or delay, dedup, publish `added`. |
|
|
113
|
+
| `MOVE_TO_ACTIVE` | worker wakeup | Claim the next job: `ZPOPMIN prioritized` → `active` → lock + load. |
|
|
114
|
+
| `MOVE_TO_COMPLETED` | worker finish | Commit the result and fetch-next in one round trip. |
|
|
115
|
+
| `MOVE_TO_FAILED` | worker finish | Retry (to `wait`/`delayed`) or terminally fail, and fetch-next. |
|
|
116
|
+
| `EXTEND_LOCK` | renewer | Token-guarded lock renewal; clears the job from `stalled`. |
|
|
117
|
+
| `MOVE_STALLED` | sweep | Mark-and-sweep recovery of jobs whose lock expired. |
|
|
118
|
+
| `PROMOTE_DELAYED` | promote loop | Move up to `PROMOTE_BATCH` (1000) due delayed jobs to `prioritized`. |
|
|
119
|
+
| `ADD_SCHEDULED` | scheduler | Enqueue a scheduler occurrence under a deterministic id (idempotent). |
|
|
120
|
+
| `PROMOTE_JOB` / `RETRY_JOB` / `REMOVE_JOB` | dashboard | Run a delayed job now / re-enqueue a failed one / delete a job with its lock and logs. |
|
|
121
|
+
|
|
122
|
+
### Lua → Python return protocol
|
|
123
|
+
|
|
124
|
+
Scripts signal outcomes with sentinels the worker decodes:
|
|
125
|
+
|
|
126
|
+
- `RL_SENTINEL` (`"__rl__"`) — a claim hit the rate limiter; the second value is
|
|
127
|
+
ms until a token frees, so the worker waits instead of busy-spinning.
|
|
128
|
+
- `LOCK_LOST` (`-2`) — a finish ran but the worker no longer held the lock (the
|
|
129
|
+
job was reclaimed); the result is dropped.
|
|
130
|
+
- `NOT_ACTIVE` (`-3`) — a finish ran but the job was no longer in `active`.
|
|
131
|
+
- `OUTCOME_FAILED` (`1`) vs `0` — `MOVE_TO_FAILED` telling the worker whether the
|
|
132
|
+
job terminally failed or will retry.
|
|
133
|
+
|
|
134
|
+
Scores are packed under 2^53 (`PRIORITY_OFFSET = 2^20`, `SEQ_MOD = 2^32`) so ZSET
|
|
135
|
+
double scores stay exact, and the scripts use only plain JSON and integer ARGV —
|
|
136
|
+
no `cmsgpack` / `bit` / `cjson` — so they run on any Redis build.
|
|
137
|
+
|
|
138
|
+
## Python-specific choices
|
|
139
|
+
|
|
140
|
+
- **async-first** — `redis.asyncio`, `async def` processors, one event loop;
|
|
141
|
+
concurrency is N `asyncio` tasks sharing the loop.
|
|
142
|
+
- **Cluster** — a `{braces}` hash-tag in the prefix keeps all of a queue's keys on
|
|
143
|
+
one slot, which the multi-key Lua scripts require.
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Concepts
|
|
2
|
+
|
|
3
|
+
The mental model behind toro.
|
|
4
|
+
|
|
5
|
+
## Queue, Worker, Job
|
|
6
|
+
|
|
7
|
+
toro has a clean producer/consumer split, and both talk to the same Redis.
|
|
8
|
+
|
|
9
|
+
- A **`Queue`** is the *producer* handle. You use it to enqueue jobs
|
|
10
|
+
(`queue.add(...)`), schedule repeatable ones, and inspect state (counts,
|
|
11
|
+
listing, search). Creating a `Queue` opens (or shares) a Redis connection but
|
|
12
|
+
starts no background work.
|
|
13
|
+
- A **`Worker`** is the *consumer*. You give it a queue name and an `async`
|
|
14
|
+
processor function; calling `worker.run()` starts claiming jobs, running the
|
|
15
|
+
processor over each, and recovering jobs from workers that died. A worker also
|
|
16
|
+
runs small background loops (delayed-job promotion, stalled-job sweep,
|
|
17
|
+
heartbeat) while it's alive.
|
|
18
|
+
- A **`Job`** is one unit of work. It carries an `id`, a `name` (a label you
|
|
19
|
+
choose, e.g. `"welcome"`), a JSON-serializable `data` payload, its options, and
|
|
20
|
+
bookkeeping the system fills in: `state`, `attempts_made`, timestamps
|
|
21
|
+
(`timestamp`, `processed_on`, `finished_on`), `progress`, `stacktrace`, and
|
|
22
|
+
either a `returnvalue` or a `failed_reason`. (A job's log lines and its lock
|
|
23
|
+
live in separate Redis keys, not as fields on the `Job` — see the
|
|
24
|
+
[data model](data-model.md).)
|
|
25
|
+
|
|
26
|
+
Producers and consumers never call each other. They coordinate only through
|
|
27
|
+
Redis, which is what lets you run them in different processes or on different
|
|
28
|
+
machines.
|
|
29
|
+
|
|
30
|
+
## Job states
|
|
31
|
+
|
|
32
|
+
Every job is in exactly one state at a time. toro exposes them as a `Literal`
|
|
33
|
+
type, `JobState`:
|
|
34
|
+
|
|
35
|
+
| State | Meaning |
|
|
36
|
+
|---|---|
|
|
37
|
+
| `wait` | Ready to run, waiting for a free worker. (Stored in the priority-ordered set, so "wait" and "prioritized" are the same place.) |
|
|
38
|
+
| `delayed` | Scheduled for the future; not yet runnable. Promoted to `wait` when due. |
|
|
39
|
+
| `active` | Claimed by a worker and currently running. |
|
|
40
|
+
| `completed` | Finished successfully; `returnvalue` holds the result. |
|
|
41
|
+
| `failed` | Exhausted its retry attempts; `failed_reason` holds the error. |
|
|
42
|
+
|
|
43
|
+
The normal path is `wait → active → completed`. A failure with retries left goes
|
|
44
|
+
`active → wait` (or `active → delayed`, if a backoff delay applies) and tries
|
|
45
|
+
again; only after the last attempt does it land in `failed`. A delayed or
|
|
46
|
+
repeatable job starts in `delayed`. See [Architecture](architecture.md) for the
|
|
47
|
+
exact transitions and [Producing jobs](producing.md) for how delay and retries
|
|
48
|
+
are configured.
|
|
49
|
+
|
|
50
|
+
## Workers vs. slots
|
|
51
|
+
|
|
52
|
+
These are easy to conflate but distinct, and the dashboard shows both.
|
|
53
|
+
|
|
54
|
+
- A **worker** is a running `Worker` instance (one heartbeat, one identity). It
|
|
55
|
+
lives inside an OS process, but it is *not* the process: you can run several
|
|
56
|
+
workers in one process, one per process, or spread across machines.
|
|
57
|
+
- A **slot** is one unit of *parallel* work *inside* a worker. A worker created
|
|
58
|
+
with `concurrency=N` runs N async processing loops, so it can have up to N jobs
|
|
59
|
+
in flight at once. "Slots" on the dashboard is the sum of every live worker's
|
|
60
|
+
concurrency: your total throughput capacity.
|
|
61
|
+
|
|
62
|
+
So `live` counts workers, `slots` counts concurrent capacity. With the default
|
|
63
|
+
`concurrency=1` they happen to match; bump concurrency and slots climb while the
|
|
64
|
+
worker count stays put.
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
host (machine)
|
|
68
|
+
└── process (pid)
|
|
69
|
+
└── worker (a Worker instance, unique id) ← "live"
|
|
70
|
+
└── slots (concurrency async loops) ← "slots"
|
|
71
|
+
└── jobs (one per slot at a time)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Because slots are `asyncio` tasks sharing one event loop (not threads or
|
|
75
|
+
processes), a processor that blocks the loop blocks its sibling slots. Keep
|
|
76
|
+
processors `await`-y.
|
|
77
|
+
|
|
78
|
+
## Events
|
|
79
|
+
|
|
80
|
+
toro publishes events to a Redis pub/sub channel: `added` when a job is enqueued
|
|
81
|
+
(published by the add script, atomically with the enqueue), `progress` from a running processor
|
|
82
|
+
(`job.update_progress`), and `completed` / `failed`, which the finish Lua scripts
|
|
83
|
+
publish atomically with the state change. `failed` fires only on terminal failure,
|
|
84
|
+
not on a retry. Two things consume the channel:
|
|
85
|
+
|
|
86
|
+
- **`await job.result()`** (or `queue.result(job_id)`) on the producer side
|
|
87
|
+
subscribes and waits for the terminal event, returning the value or raising
|
|
88
|
+
`JobFailedError`.
|
|
89
|
+
- **A dashboard** (such as [matador](https://github.com/ilovepixelart/matador))
|
|
90
|
+
subscribes to refresh live as state changes.
|
|
91
|
+
|
|
92
|
+
`Worker.on(event, fn)` lets a worker react to its own lifecycle with in-process
|
|
93
|
+
callbacks (`completed`, `failed`, `retrying`, `stalled`, `lock-lost`,
|
|
94
|
+
`rate-limited`) — separate from the pub/sub channel above. See
|
|
95
|
+
[Processing jobs](processing.md).
|
|
96
|
+
|
|
97
|
+
## Reliability in one sentence
|
|
98
|
+
|
|
99
|
+
toro is **at-least-once**: a job is never lost while Redis persists, but its
|
|
100
|
+
handler can run more than once (bounded) if a worker dies mid-job. Exactly-once
|
|
101
|
+
*result commit* is enforced by a per-job lock token. The full story is in
|
|
102
|
+
[Reliability](reliability.md).
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# Data model
|
|
2
|
+
|
|
3
|
+
Everything toro stores lives in Redis under a per-queue prefix. All key names are
|
|
4
|
+
computed in one place (`toro/keys.py`) so the Lua scripts and the Python side can
|
|
5
|
+
never disagree about where something lives.
|
|
6
|
+
|
|
7
|
+
## Key prefix
|
|
8
|
+
|
|
9
|
+
For a queue named `<name>` with prefix `<prefix>` (default `toro`), every key
|
|
10
|
+
starts with:
|
|
11
|
+
|
|
12
|
+
```
|
|
13
|
+
<prefix>:<name>:
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
So `Queue("emails")` (default prefix) stores everything under `toro:emails:`.
|
|
17
|
+
Using a `{braces}` hash-tag in the prefix forces all of a queue's keys onto one
|
|
18
|
+
Redis Cluster slot, which the multi-key Lua scripts require.
|
|
19
|
+
|
|
20
|
+
## Queue-wide keys
|
|
21
|
+
|
|
22
|
+
| Key suffix | Type | Holds |
|
|
23
|
+
|---|---|---|
|
|
24
|
+
| `id` | string (counter) | `INCR`-ed to mint auto job ids. |
|
|
25
|
+
| `prioritized` | ZSET | Waiting jobs in global priority order; score packs (priority, sequence). This *is* the `wait` state. |
|
|
26
|
+
| `marker` | ZSET | A single idempotent base member (`"0"`); idle workers `BZPOPMIN` it to wake. It only signals; the real claim is atomic. |
|
|
27
|
+
| `pc` | string (counter) | Priority sequence counter, so same-priority jobs stay FIFO. |
|
|
28
|
+
| `active` | LIST | Ids currently claimed by a worker and running. |
|
|
29
|
+
| `delayed` | ZSET | Ids scored by their process-at timestamp (ms); promoted to `prioritized` when due. |
|
|
30
|
+
| `completed` | ZSET | Successfully-finished ids, scored by finish time (for auto-removal + listing). |
|
|
31
|
+
| `failed` | ZSET | Terminally-failed ids, scored by finish time. |
|
|
32
|
+
| `meta-paused` | string (flag) | Exists only while the queue is paused; workers stop claiming new jobs. |
|
|
33
|
+
| `events` | pub/sub channel | Carries `added` / `progress` / `completed` / `failed`; drives `result()` and live dashboards. |
|
|
34
|
+
| `limiter` | HASH | The queue-wide rate-limit token bucket (`{tokens, ts}`), shared by every worker. |
|
|
35
|
+
| `stalled` | SET | Candidate ids for the mark-and-sweep recovery pass. |
|
|
36
|
+
| `stalled-check` | string (PX) | Throttle key so the stalled sweep runs about once per interval cluster-wide. |
|
|
37
|
+
| `repeat` | ZSET | Scheduler id -> next-run timestamp. |
|
|
38
|
+
| `workers` | ZSET | Live worker id -> last-heartbeat ms; stale entries pruned lazily on read. |
|
|
39
|
+
| `departed` | LIST (capped) | Recent worker departures: graceful `stopped` or `lost` (crashed). |
|
|
40
|
+
|
|
41
|
+
## Per-scheduler, per-worker, per-job keys
|
|
42
|
+
|
|
43
|
+
| Key | Type | Holds |
|
|
44
|
+
|---|---|---|
|
|
45
|
+
| `repeat:<schedulerId>` | HASH | A scheduler's template: `name`, `every`/`cron`, `data`, `opts`. |
|
|
46
|
+
| `worker:<workerId>` | HASH | A worker's presence record: host, pid, concurrency, current jobs, processed/failed counts, state. |
|
|
47
|
+
| `<jobId>` | HASH | The job itself: `name`, `data`, `opts`, `state`, `attemptsMade`, timestamps, `returnvalue`/`failedReason`, `progress`, `stacktrace`, ... |
|
|
48
|
+
| `<jobId>:lock` | string (token, PX) | The per-job lock: the owning worker's token with an expiry. Only the holder may finish or renew it. |
|
|
49
|
+
| `<jobId>:logs` | LIST | Log lines appended by `job.log(...)` from inside a processor. |
|
|
50
|
+
|
|
51
|
+
Note the job hash key is just `<prefix>:<name>:<jobId>` (no extra segment), so a job
|
|
52
|
+
`5` on `toro:emails:` is the hash `toro:emails:5`, with `toro:emails:5:lock` and
|
|
53
|
+
`toro:emails:5:logs` beside it.
|
|
54
|
+
|
|
55
|
+
## How the pieces connect
|
|
56
|
+
|
|
57
|
+
- A job moves between `prioritized` / `active` / `delayed` / `completed` / `failed`
|
|
58
|
+
as its state changes; the move and the hash update happen in one Lua script. See
|
|
59
|
+
[Architecture](architecture.md).
|
|
60
|
+
- The `lock` + `stalled` keys are the at-least-once machinery. See
|
|
61
|
+
[Reliability](reliability.md).
|
|
62
|
+
- `repeat` + `repeat:<id>` drive [scheduling](scheduling.md); `workers` +
|
|
63
|
+
`worker:<id>` + `departed` drive worker presence in the dashboard.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
# toro documentation
|
|
2
|
+
|
|
3
|
+
Reference docs for how toro works. The [README](../README.md) is the quick start.
|
|
4
|
+
|
|
5
|
+
## Pages
|
|
6
|
+
|
|
7
|
+
- **[Concepts](concepts.md)** — the mental model: queues, workers, jobs, the five
|
|
8
|
+
job states, and the difference between *workers* and *slots*.
|
|
9
|
+
- **[Data model](data-model.md)** — the exact Redis keys a queue uses and what
|
|
10
|
+
each one stores.
|
|
11
|
+
- **[Reliability](reliability.md)** — the at-least-once guarantee: per-job locks,
|
|
12
|
+
worker tokens, and stalled-job recovery.
|
|
13
|
+
- **[Producing jobs](producing.md)** — `Queue.add()` and every option (priority,
|
|
14
|
+
delay, retries/backoff, deduplication, custom ids).
|
|
15
|
+
- **[Processing jobs](processing.md)** — `Worker`: concurrency, lifecycle events,
|
|
16
|
+
rate limiting, and graceful shutdown.
|
|
17
|
+
- **[Scheduling](scheduling.md)** — repeatable and cron jobs, and how each
|
|
18
|
+
occurrence schedules the next.
|
|
19
|
+
- **[Architecture](architecture.md)** — the atomic-Lua core and the design
|
|
20
|
+
decisions behind the queue.
|