toro-queue 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. toro_queue-0.2.0/.github/workflows/pr-check.yaml +88 -0
  2. {toro_queue-0.1.0 → toro_queue-0.2.0}/.github/workflows/release.yml +4 -4
  3. {toro_queue-0.1.0 → toro_queue-0.2.0}/.gitignore +1 -0
  4. {toro_queue-0.1.0 → toro_queue-0.2.0}/.pre-commit-config.yaml +1 -1
  5. toro_queue-0.2.0/.vscode/extensions.json +8 -0
  6. toro_queue-0.2.0/.vscode/settings.json +19 -0
  7. {toro_queue-0.1.0 → toro_queue-0.2.0}/PKG-INFO +18 -3
  8. {toro_queue-0.1.0 → toro_queue-0.2.0}/README.md +17 -2
  9. toro_queue-0.2.0/docs/architecture.md +143 -0
  10. toro_queue-0.2.0/docs/concepts.md +102 -0
  11. toro_queue-0.2.0/docs/data-model.md +63 -0
  12. toro_queue-0.2.0/docs/index.md +20 -0
  13. toro_queue-0.2.0/docs/processing.md +112 -0
  14. toro_queue-0.2.0/docs/producing.md +107 -0
  15. toro_queue-0.2.0/docs/reliability.md +108 -0
  16. toro_queue-0.2.0/docs/scheduling.md +71 -0
  17. toro_queue-0.2.0/examples/README.md +19 -0
  18. {toro_queue-0.1.0 → toro_queue-0.2.0}/pyproject.toml +6 -2
  19. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/conftest.py +16 -0
  20. toro_queue-0.2.0/tests/integration/test_admin_ordering.py +66 -0
  21. toro_queue-0.2.0/tests/integration/test_finished_retention.py +84 -0
  22. toro_queue-0.2.0/tests/integration/test_result_dispatcher.py +162 -0
  23. toro_queue-0.2.0/tests/integration/test_worker_resilience.py +82 -0
  24. toro_queue-0.2.0/tests/load/test_active_list_cost.py +118 -0
  25. toro_queue-0.2.0/tests/load/test_admin_scaling.py +83 -0
  26. toro_queue-0.2.0/tests/load/test_enqueue_rtt.py +65 -0
  27. toro_queue-0.2.0/tests/load/test_promote_blocking.py +107 -0
  28. toro_queue-0.2.0/tests/load/test_result_fanout.py +112 -0
  29. toro_queue-0.2.0/tests/load/test_worker_concurrency.py +52 -0
  30. toro_queue-0.2.0/toro/__init__.py +22 -0
  31. {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/connection.py +11 -2
  32. {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/job.py +30 -10
  33. {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/queue.py +165 -53
  34. {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/scheduler.py +4 -2
  35. {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/scripts.py +33 -7
  36. {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/worker.py +79 -42
  37. {toro_queue-0.1.0 → toro_queue-0.2.0}/uv.lock +21 -21
  38. toro_queue-0.1.0/.github/workflows/ci.yml +0 -41
  39. toro_queue-0.1.0/DESIGN.md +0 -114
  40. toro_queue-0.1.0/toro/__init__.py +0 -9
  41. {toro_queue-0.1.0 → toro_queue-0.2.0}/LICENSE +0 -0
  42. {toro_queue-0.1.0 → toro_queue-0.2.0}/bench/bench.py +0 -0
  43. {toro_queue-0.1.0 → toro_queue-0.2.0}/examples/basic.py +0 -0
  44. {toro_queue-0.1.0 → toro_queue-0.2.0}/examples/stalled.py +0 -0
  45. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_admin.py +0 -0
  46. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_connection.py +0 -0
  47. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_introspection.py +0 -0
  48. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_processing.py +0 -0
  49. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_reliability.py +0 -0
  50. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_retries.py +0 -0
  51. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_scheduler.py +0 -0
  52. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/integration/test_workers.py +0 -0
  53. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/load/harness.py +0 -0
  54. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/load/test_load.py +0 -0
  55. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_backoff.py +0 -0
  56. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_job.py +0 -0
  57. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_job_options.py +0 -0
  58. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_keys.py +0 -0
  59. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_priority.py +0 -0
  60. {toro_queue-0.1.0 → toro_queue-0.2.0}/tests/unit/test_scheduler.py +0 -0
  61. {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/errors.py +0 -0
  62. {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/keys.py +0 -0
  63. {toro_queue-0.1.0 → toro_queue-0.2.0}/toro/py.typed +0 -0
@@ -0,0 +1,88 @@
1
+ name: PR check
2
+
3
+ on:
4
+ workflow_dispatch:
5
+ push:
6
+ branches:
7
+ - main
8
+ - feature*
9
+ pull_request:
10
+ types: [opened, synchronize, reopened]
11
+
12
+ # Read-only by default — nothing here writes to the repo. SonarCloud PR
13
+ # decoration comes from the SonarCloud GitHub App, not GITHUB_TOKEN write scopes.
14
+ permissions:
15
+ contents: read
16
+
17
+ # A new push to the same branch/PR supersedes the previous run.
18
+ concurrency:
19
+ group: ${{ github.workflow }}-${{ github.ref }}
20
+ cancel-in-progress: true
21
+
22
+ jobs:
23
+ check:
24
+ name: Lint, types, tests (py${{ matrix.python-version }})
25
+ runs-on: ubuntu-latest
26
+ strategy:
27
+ fail-fast: false
28
+ matrix:
29
+ # Everything the classifiers claim to support.
30
+ python-version: ["3.10", "3.11", "3.12", "3.13"]
31
+ permissions:
32
+ contents: read
33
+ services:
34
+ redis:
35
+ image: redis:7-alpine
36
+ ports:
37
+ - 6379:6379
38
+ options: >-
39
+ --health-cmd "redis-cli ping"
40
+ --health-interval 10s
41
+ --health-timeout 5s
42
+ --health-retries 5
43
+ steps:
44
+ - uses: actions/checkout@v6.0.3
45
+ with:
46
+ fetch-depth: 0 # full history improves Sonar new-code/blame relevancy
47
+ persist-credentials: false
48
+
49
+ - name: Install uv
50
+ uses: astral-sh/setup-uv@v8.2.0
51
+ with:
52
+ enable-cache: true
53
+ python-version: ${{ matrix.python-version }}
54
+
55
+ - name: Sync dependencies
56
+ run: uv sync --locked
57
+
58
+ - name: Lint (ruff)
59
+ run: uv run ruff check .
60
+
61
+ - name: Format (ruff)
62
+ run: uv run ruff format --check .
63
+
64
+ - name: Type check (ty)
65
+ run: uv run ty check
66
+
67
+ - name: Tests (unit + integration)
68
+ run: uv run pytest -m "unit or integration" --cov=toro --cov-report=xml
69
+
70
+ # Runs only when SONAR_TOKEN is set (skipped on forks / before setup, so the
71
+ # check stays green), and only once per matrix — one coverage upload.
72
+ # Config is passed inline; there is no sonar-project.properties.
73
+ - name: SonarCloud scan
74
+ if: matrix.python-version == '3.13' && env.SONAR_TOKEN
75
+ uses: SonarSource/sonarqube-scan-action@v8.2.0
76
+ env:
77
+ GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
78
+ SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
79
+ with:
80
+ args: >
81
+ -Dsonar.organization=ilovepixelart
82
+ -Dsonar.projectName=toro
83
+ -Dsonar.projectKey=ilovepixelart_toro
84
+ -Dsonar.python.coverage.reportPaths=coverage.xml
85
+ -Dsonar.sources=toro
86
+ -Dsonar.tests=tests
87
+ -Dsonar.test.exclusions=tests/**
88
+ -Dsonar.coverage.exclusions=tests/**
@@ -12,12 +12,12 @@ jobs:
12
12
  build:
13
13
  runs-on: ubuntu-latest
14
14
  steps:
15
- - uses: actions/checkout@v4
15
+ - uses: actions/checkout@v6.0.3
16
16
  - name: Install uv
17
- uses: astral-sh/setup-uv@v5
17
+ uses: astral-sh/setup-uv@v8.2.0
18
18
  - name: Build sdist + wheel
19
19
  run: uv build
20
- - uses: actions/upload-artifact@v4
20
+ - uses: actions/upload-artifact@v7.0.1
21
21
  with:
22
22
  name: dist
23
23
  path: dist/
@@ -29,7 +29,7 @@ jobs:
29
29
  permissions:
30
30
  id-token: write
31
31
  steps:
32
- - uses: actions/download-artifact@v4
32
+ - uses: actions/download-artifact@v8.0.1
33
33
  with:
34
34
  name: dist
35
35
  path: dist/
@@ -8,3 +8,4 @@ __pycache__/
8
8
  /build/
9
9
 
10
10
  .coverage
11
+ coverage.xml
@@ -2,7 +2,7 @@
2
2
  # Install once with: uvx pre-commit install
3
3
  repos:
4
4
  - repo: https://github.com/astral-sh/ruff-pre-commit
5
- rev: v0.15.15
5
+ rev: v0.15.16
6
6
  hooks:
7
7
  - id: ruff-check
8
8
  args: [--fix]
@@ -0,0 +1,8 @@
1
+ {
2
+ "recommendations": [
3
+ "ms-python.python",
4
+ "charliermarsh.ruff",
5
+ "astral-sh.ty",
6
+ "streetsidesoftware.code-spell-checker"
7
+ ]
8
+ }
@@ -0,0 +1,19 @@
1
+ {
2
+ "[python]": {
3
+ "editor.defaultFormatter": "charliermarsh.ruff",
4
+ "editor.formatOnSave": true,
5
+ "editor.codeActionsOnSave": {
6
+ "source.fixAll.ruff": "explicit",
7
+ "source.organizeImports.ruff": "explicit"
8
+ }
9
+ },
10
+ "editor.rulers": [100],
11
+ "python.testing.pytestEnabled": true,
12
+ "python.testing.unittestEnabled": false,
13
+ "cSpell.words": [
14
+ "BZPOPMIN",
15
+ "keepalive",
16
+ "ZPOPMIN",
17
+ "ZSET"
18
+ ]
19
+ }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: toro-queue
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: An async-first, Redis-backed job queue for Python.
5
5
  Project-URL: Homepage, https://github.com/ilovepixelart/toro
6
6
  Project-URL: Repository, https://github.com/ilovepixelart/toro
@@ -30,13 +30,28 @@ Description-Content-Type: text/markdown
30
30
  An **async-first**, Redis-backed job queue for Python. Every state transition is
31
31
  an atomic Lua script; producing and processing are `asyncio` end to end.
32
32
 
33
+ [![Python](https://img.shields.io/pypi/pyversions/toro-queue)](https://pypi.org/project/toro-queue/)
34
+ \
35
+ [![PyPI](https://img.shields.io/pypi/v/toro-queue)](https://pypi.org/project/toro-queue/)
36
+ [![Downloads](https://static.pepy.tech/badge/toro-queue)](https://pepy.tech/project/toro-queue)
37
+ [![License](https://img.shields.io/github/license/ilovepixelart/toro)](https://github.com/ilovepixelart/toro/blob/main/LICENSE)
38
+ \
39
+ [![Coverage](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=coverage)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
40
+ [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
41
+ \
42
+ [![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=reliability_rating)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
43
+ [![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=sqale_rating)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
44
+ [![Security Rating](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=security_rating)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
45
+
33
46
  ```bash
34
47
  pip install toro-queue # the import name is `toro`
35
48
  ```
36
49
 
37
50
  > Installed as **`toro-queue`** on PyPI (the name `toro` was taken), but you
38
- > `import toro`. See [DESIGN.md](https://github.com/ilovepixelart/toro/blob/main/DESIGN.md) for the architecture and the
39
- > at-least-once reliability model.
51
+ > `import toro`. See the [docs](https://github.com/ilovepixelart/toro/tree/main/docs) for the
52
+ > architecture, the reliability model, and the detailed guides.
53
+
54
+ Pairs with **[matador](https://github.com/ilovepixelart/matador)**, a live web dashboard for your queues.
40
55
 
41
56
  ## Why toro
42
57
 
@@ -3,13 +3,28 @@
3
3
  An **async-first**, Redis-backed job queue for Python. Every state transition is
4
4
  an atomic Lua script; producing and processing are `asyncio` end to end.
5
5
 
6
+ [![Python](https://img.shields.io/pypi/pyversions/toro-queue)](https://pypi.org/project/toro-queue/)
7
+ \
8
+ [![PyPI](https://img.shields.io/pypi/v/toro-queue)](https://pypi.org/project/toro-queue/)
9
+ [![Downloads](https://static.pepy.tech/badge/toro-queue)](https://pepy.tech/project/toro-queue)
10
+ [![License](https://img.shields.io/github/license/ilovepixelart/toro)](https://github.com/ilovepixelart/toro/blob/main/LICENSE)
11
+ \
12
+ [![Coverage](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=coverage)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
13
+ [![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
14
+ \
15
+ [![Reliability Rating](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=reliability_rating)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
16
+ [![Maintainability Rating](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=sqale_rating)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
17
+ [![Security Rating](https://sonarcloud.io/api/project_badges/measure?project=ilovepixelart_toro&metric=security_rating)](https://sonarcloud.io/summary/new_code?id=ilovepixelart_toro)
18
+
6
19
  ```bash
7
20
  pip install toro-queue # the import name is `toro`
8
21
  ```
9
22
 
10
23
  > Installed as **`toro-queue`** on PyPI (the name `toro` was taken), but you
11
- > `import toro`. See [DESIGN.md](https://github.com/ilovepixelart/toro/blob/main/DESIGN.md) for the architecture and the
12
- > at-least-once reliability model.
24
+ > `import toro`. See the [docs](https://github.com/ilovepixelart/toro/tree/main/docs) for the
25
+ > architecture, the reliability model, and the detailed guides.
26
+
27
+ Pairs with **[matador](https://github.com/ilovepixelart/matador)**, a live web dashboard for your queues.
13
28
 
14
29
  ## Why toro
15
30
 
@@ -0,0 +1,143 @@
1
+ # Architecture
2
+
3
+ How toro's core works, and why. Every state transition is an atomic Lua script;
4
+ every job is durable in Redis.
5
+
6
+ > Prior art: the atomic-Lua and lock/stalled-recovery patterns come from the
7
+ > Node.js Redis-queue ecosystem; the specifics are toro's own.
8
+
9
+ ## Atomic state transitions via Lua
10
+
11
+ Every state move (`wait→active`, `active→completed/failed/delayed`,
12
+ `delayed→wait`) is a single Redis Lua script, run atomically, so multi-key
13
+ "check-then-act" sequences can't interleave. That removes whole classes of race:
14
+
15
+ - **pop-then-lock gap** — two workers claiming the same job: the claim pops from
16
+ the priority set and sets the lock inside one script.
17
+ - **finish-after-steal** — a worker committing a result for a job a stalled sweep
18
+ already re-queued: guarded by a token check plus `LREM active` returning 0.
19
+
20
+ Scripts live in `scripts.py`, registered with `redis.asyncio`'s `register_script`.
21
+ The Python side only assembles KEYS/ARGV; the guarantees live in the Lua.
22
+
23
+ ## Claiming a job: the prioritized set + a wakeup marker
24
+
25
+ All waiting jobs live in one `prioritized` ZSET, scored
26
+ `(PRIORITY_OFFSET - priority) * 2^32 + seq` — a single global order where higher
27
+ priority is more urgent and ties stay FIFO (`seq` is a per-queue counter). This
28
+ *is* the `wait` state; there is no separate fast-lane list, so a low-priority job
29
+ can't starve a high-priority one.
30
+
31
+ A single ZSET can't be blocking-popped, so wakeup uses a small **base marker**:
32
+ producers `ZADD marker 0 "0"` (idempotent) on enqueue, and idle workers park on
33
+ `BZPOPMIN marker`. The marker only wakes a worker; the real claim is the atomic
34
+ `MOVE_TO_ACTIVE` (`ZPOPMIN prioritized` → push to `active` → set the lock → load
35
+ the job). Because the claim is atomic and idempotent, a missed marker can never
36
+ strand a job.
37
+
38
+ ## Fetch-next inside finish
39
+
40
+ A busy worker doesn't go back to the blocking wait between jobs. The finish
41
+ scripts (`MOVE_TO_COMPLETED` / `MOVE_TO_FAILED`) commit the current job **and**
42
+ claim the next one in the same round trip; the worker only re-parks on the marker
43
+ when the queue is empty (or it's shutting down, signaled by a fetch flag, so it
44
+ drains cleanly). All claiming funnels through one shared Lua routine
45
+ (`lockAndLoad` / `acquireNext`), used by both the wakeup path and fetch-next.
46
+
47
+ It's mainly a round-trip win: at concurrency 20, process throughput is roughly
48
+ 2.3× a claim-per-job design, because the separate per-job claim and load collapse
49
+ into the finish call.
50
+
51
+ ## At-least-once: locks, tokens, and stalled recovery
52
+
53
+ The reliability core ([Reliability](reliability.md) is the full guide):
54
+
55
+ - On claim, the job gets a lock `<id>:lock = <token>` with `PX lockDuration`
56
+ (default 30s). The token is the claiming worker's; only it can renew or finish.
57
+ - A per-job renewer extends the lock on a timer and clears the job from the
58
+ `stalled` set while it's alive.
59
+ - A background sweep runs every `stalled_interval` (throttled cluster-wide by a
60
+ `stalled-check` key): any job in `stalled` whose lock has expired is recovered
61
+ (`LREM active`, back to the prioritized set, or failed after
62
+ `max_stalled_count`), then the current `active` list is re-marked as stalled.
63
+
64
+ The guarantee is **at-least-once**: a job is never lost while Redis persists, but
65
+ its handler can run more than once (bounded by `max_stalled_count`) if a worker
66
+ dies mid-job. Exactly-once *result commit* is enforced by the token-guarded lock
67
+ at finish, not by preventing duplicate handler runs.
68
+
69
+ ## Delayed jobs
70
+
71
+ Delayed jobs and retries with backoff sit in a `delayed` ZSET scored by their
72
+ process-at timestamp (ms). A one-second promotion loop in the worker moves any
73
+ due jobs (up to `PROMOTE_BATCH`, 1000, per pass) into the prioritized set.
74
+
75
+ ## Higher-level features
76
+
77
+ - **Priorities** — every job is in the one prioritized ZSET above, so priority is
78
+ a single global order with no starvation, FIFO within a band.
79
+ - **Repeatable / cron** — `add_scheduler(every=ms | cron=...)` stores a template
80
+ and enqueues the first occurrence as a delayed job; each occurrence mints its
81
+ successor with a deterministic id when a worker picks it up. `trigger_scheduler`
82
+ runs one now, `remove_scheduler` stops the chain. See [Scheduling](scheduling.md).
83
+ - **Rate limiting** — a queue-wide token bucket in Redis
84
+ (`Worker(rate_limit={"max": N, "duration": ms})`), shared by every worker on the
85
+ queue. An over-limit claim returns a sentinel and the worker waits out the window.
86
+ - **Events** — Redis pub/sub on an `events` channel (`added`, `progress`,
87
+ `completed`, `failed`); `Queue.result()` awaits the terminal event and
88
+ `Worker.on(event, fn)` exposes in-process hooks. See [Concepts](concepts.md).
89
+ - **Auto-removal** — `remove_on_complete` / `remove_on_fail` (bool / count /
90
+ `{count, age}`) enforced inside the finish script, not by a separate sweeper.
91
+
92
+ ## The Lua scripts
93
+
94
+ Every state change is a Lua script in `scripts.py`, registered once per process
95
+ with `register_script` (run by `EVALSHA`). Python only assembles `KEYS`/`ARGV`.
96
+
97
+ The scripts share a small library of routines:
98
+
99
+ | Routine | Does |
100
+ |---|---|
101
+ | `priorityScore` | Packs `(PRIORITY_OFFSET - priority) * 2^32 + seq` for the prioritized ZSET. |
102
+ | `enqueue` | Adds a job to `prioritized` at its score and arms the marker. |
103
+ | `lockAndLoad` | Sets the lock token and loads the hash for a just-claimed id. |
104
+ | `acquireNext` | Pops the top prioritized job into `active` and locks it, honoring the rate limit. |
105
+ | `tryRateLimit` | Token bucket: ms until a token frees, or 0 to proceed. |
106
+ | `recordFinished` | Records a terminal job in `completed`/`failed` and applies auto-removal. |
107
+
108
+ And the scripts themselves:
109
+
110
+ | Script | Caller | Does |
111
+ |---|---|---|
112
+ | `ADD_JOB` | producer | Mint/accept an id, write the hash, enqueue or delay, dedup, publish `added`. |
113
+ | `MOVE_TO_ACTIVE` | worker wakeup | Claim the next job: `ZPOPMIN prioritized` → `active` → lock + load. |
114
+ | `MOVE_TO_COMPLETED` | worker finish | Commit the result and fetch-next in one round trip. |
115
+ | `MOVE_TO_FAILED` | worker finish | Retry (to `wait`/`delayed`) or terminally fail, and fetch-next. |
116
+ | `EXTEND_LOCK` | renewer | Token-guarded lock renewal; clears the job from `stalled`. |
117
+ | `MOVE_STALLED` | sweep | Mark-and-sweep recovery of jobs whose lock expired. |
118
+ | `PROMOTE_DELAYED` | promote loop | Move up to `PROMOTE_BATCH` (1000) due delayed jobs to `prioritized`. |
119
+ | `ADD_SCHEDULED` | scheduler | Enqueue a scheduler occurrence under a deterministic id (idempotent). |
120
+ | `PROMOTE_JOB` / `RETRY_JOB` / `REMOVE_JOB` | dashboard | Run a delayed job now / re-enqueue a failed one / delete a job with its lock and logs. |
121
+
122
+ ### Lua → Python return protocol
123
+
124
+ Scripts signal outcomes with sentinels the worker decodes:
125
+
126
+ - `RL_SENTINEL` (`"__rl__"`) — a claim hit the rate limiter; the second value is
127
+ ms until a token frees, so the worker waits instead of busy-spinning.
128
+ - `LOCK_LOST` (`-2`) — a finish ran but the worker no longer held the lock (the
129
+ job was reclaimed); the result is dropped.
130
+ - `NOT_ACTIVE` (`-3`) — a finish ran but the job was no longer in `active`.
131
+ - `OUTCOME_FAILED` (`1`) vs `0` — `MOVE_TO_FAILED` telling the worker whether the
132
+ job terminally failed or will retry.
133
+
134
+ Scores are packed under 2^53 (`PRIORITY_OFFSET = 2^20`, `SEQ_MOD = 2^32`) so ZSET
135
+ double scores stay exact, and the scripts use only plain JSON and integer ARGV —
136
+ no `cmsgpack` / `bit` / `cjson` — so they run on any Redis build.
137
+
138
+ ## Python-specific choices
139
+
140
+ - **async-first** — `redis.asyncio`, `async def` processors, one event loop;
141
+ concurrency is N `asyncio` tasks sharing the loop.
142
+ - **Cluster** — a `{braces}` hash-tag in the prefix keeps all of a queue's keys on
143
+ one slot, which the multi-key Lua scripts require.
@@ -0,0 +1,102 @@
1
+ # Concepts
2
+
3
+ The mental model behind toro.
4
+
5
+ ## Queue, Worker, Job
6
+
7
+ toro has a clean producer/consumer split, and both talk to the same Redis.
8
+
9
+ - A **`Queue`** is the *producer* handle. You use it to enqueue jobs
10
+ (`queue.add(...)`), schedule repeatable ones, and inspect state (counts,
11
+ listing, search). Creating a `Queue` opens (or shares) a Redis connection but
12
+ starts no background work.
13
+ - A **`Worker`** is the *consumer*. You give it a queue name and an `async`
14
+ processor function; calling `worker.run()` starts claiming jobs, running the
15
+ processor over each, and recovering jobs from workers that died. A worker also
16
+ runs small background loops (delayed-job promotion, stalled-job sweep,
17
+ heartbeat) while it's alive.
18
+ - A **`Job`** is one unit of work. It carries an `id`, a `name` (a label you
19
+ choose, e.g. `"welcome"`), a JSON-serializable `data` payload, its options, and
20
+ bookkeeping the system fills in: `state`, `attempts_made`, timestamps
21
+ (`timestamp`, `processed_on`, `finished_on`), `progress`, `stacktrace`, and
22
+ either a `returnvalue` or a `failed_reason`. (A job's log lines and its lock
23
+ live in separate Redis keys, not as fields on the `Job` — see the
24
+ [data model](data-model.md).)
25
+
26
+ Producers and consumers never call each other. They coordinate only through
27
+ Redis, which is what lets you run them in different processes or on different
28
+ machines.
29
+
30
+ ## Job states
31
+
32
+ Every job is in exactly one state at a time. toro exposes them as a `Literal`
33
+ type, `JobState`:
34
+
35
+ | State | Meaning |
36
+ |---|---|
37
+ | `wait` | Ready to run, waiting for a free worker. (Stored in the priority-ordered set, so "wait" and "prioritized" are the same place.) |
38
+ | `delayed` | Scheduled for the future; not yet runnable. Promoted to `wait` when due. |
39
+ | `active` | Claimed by a worker and currently running. |
40
+ | `completed` | Finished successfully; `returnvalue` holds the result. |
41
+ | `failed` | Exhausted its retry attempts; `failed_reason` holds the error. |
42
+
43
+ The normal path is `wait → active → completed`. A failure with retries left goes
44
+ `active → wait` (or `active → delayed`, if a backoff delay applies) and tries
45
+ again; only after the last attempt does it land in `failed`. A delayed or
46
+ repeatable job starts in `delayed`. See [Architecture](architecture.md) for the
47
+ exact transitions and [Producing jobs](producing.md) for how delay and retries
48
+ are configured.
49
+
50
+ ## Workers vs. slots
51
+
52
+ These are easy to conflate but distinct, and the dashboard shows both.
53
+
54
+ - A **worker** is a running `Worker` instance (one heartbeat, one identity). It
55
+ lives inside an OS process, but it is *not* the process: you can run several
56
+ workers in one process, one per process, or spread across machines.
57
+ - A **slot** is one unit of *parallel* work *inside* a worker. A worker created
58
+ with `concurrency=N` runs N async processing loops, so it can have up to N jobs
59
+ in flight at once. "Slots" on the dashboard is the sum of every live worker's
60
+ concurrency: your total throughput capacity.
61
+
62
+ So `live` counts workers; `slots` counts concurrent capacity. With the default
63
+ `concurrency=1` they happen to match; bump concurrency and slots climb while the
64
+ worker count stays put.
65
+
66
+ ```
67
+ host (machine)
68
+ └── process (pid)
69
+ └── worker (a Worker instance, unique id) ← "live"
70
+ └── slots (concurrency async loops) ← "slots"
71
+ └── jobs (one per slot at a time)
72
+ ```
73
+
74
+ Because slots are `asyncio` tasks sharing one event loop (not threads or
75
+ processes), a processor that blocks the loop blocks its sibling slots. Keep
76
+ processors `await`-y.
77
+
78
+ ## Events
79
+
80
+ toro publishes events to a Redis pub/sub channel: `added` when a job is enqueued
81
+ (published by the add script, atomically with the enqueue), `progress` from a running processor
82
+ (`job.update_progress`), and `completed` / `failed`, which the finish Lua scripts
83
+ publish atomically with the state change. `failed` fires only on terminal failure,
84
+ not on a retry. Two things consume the channel:
85
+
86
+ - **`await job.result()`** (or `queue.result(job_id)`) on the producer side
87
+ subscribes and waits for the terminal event, returning the value or raising
88
+ `JobFailedError`.
89
+ - **A dashboard** (such as [matador](https://github.com/ilovepixelart/matador))
90
+ subscribes to refresh live as state changes.
91
+
92
+ `Worker.on(event, fn)` lets a worker react to its own lifecycle with in-process
93
+ callbacks (`completed`, `failed`, `retrying`, `stalled`, `lock-lost`,
94
+ `rate-limited`) — separate from the pub/sub channel above. See
95
+ [Processing jobs](processing.md).
96
+
97
+ ## Reliability in one sentence
98
+
99
+ toro is **at-least-once**: a job is never lost while Redis persists, but its
100
+ handler can run more than once (bounded) if a worker dies mid-job. Exactly-once
101
+ *result commit* is enforced by a per-job lock token. The full story is in
102
+ [Reliability](reliability.md).
@@ -0,0 +1,63 @@
1
+ # Data model
2
+
3
+ Everything toro stores lives in Redis under a per-queue prefix. All key names are
4
+ computed in one place (`toro/keys.py`) so the Lua scripts and the Python side can
5
+ never disagree about where something lives.
6
+
7
+ ## Key prefix
8
+
9
+ For a queue named `<name>` with prefix `<prefix>` (default `toro`), every key
10
+ starts with:
11
+
12
+ ```
13
+ <prefix>:<name>:
14
+ ```
15
+
16
+ So `Queue("emails")` (default prefix) stores everything under `toro:emails:`.
17
+ Using a `{braces}` hash-tag in the prefix forces all of a queue's keys onto one
18
+ Redis Cluster slot, which the multi-key Lua scripts require.
19
+
20
+ ## Queue-wide keys
21
+
22
+ | Key suffix | Type | Holds |
23
+ |---|---|---|
24
+ | `id` | string (counter) | `INCR`-ed to mint auto job ids. |
25
+ | `prioritized` | ZSET | Waiting jobs in global priority order; score packs (priority, sequence). This *is* the `wait` state. |
26
+ | `marker` | ZSET | A single idempotent base member (`"0"`); idle workers `BZPOPMIN` it to wake. It only signals; the real claim is atomic. |
27
+ | `pc` | string (counter) | Priority sequence counter, so same-priority jobs stay FIFO. |
28
+ | `active` | LIST | Ids currently claimed by a worker and running. |
29
+ | `delayed` | ZSET | Ids scored by their process-at timestamp (ms); promoted to `prioritized` when due. |
30
+ | `completed` | ZSET | Successfully-finished ids, scored by finish time (for auto-removal + listing). |
31
+ | `failed` | ZSET | Terminally-failed ids, scored by finish time. |
32
+ | `meta-paused` | string (flag) | Exists only while the queue is paused; workers stop claiming new jobs. |
33
+ | `events` | pub/sub channel | Carries `added` / `progress` / `completed` / `failed`; drives `result()` and live dashboards. |
34
+ | `limiter` | HASH | The queue-wide rate-limit token bucket (`{tokens, ts}`), shared by every worker. |
35
+ | `stalled` | SET | Candidate ids for the mark-and-sweep recovery pass. |
36
+ | `stalled-check` | string (PX) | Throttle key so the stalled sweep runs about once per interval cluster-wide. |
37
+ | `repeat` | ZSET | Scheduler id -> next-run timestamp. |
38
+ | `workers` | ZSET | Live worker id -> last-heartbeat ms; stale entries pruned lazily on read. |
39
+ | `departed` | LIST (capped) | Recent worker departures: graceful `stopped` or `lost` (crashed). |
40
+
41
+ ## Per-scheduler, per-worker, per-job keys
42
+
43
+ | Key | Type | Holds |
44
+ |---|---|---|
45
+ | `repeat:<schedulerId>` | HASH | A scheduler's template: `name`, `every`/`cron`, `data`, `opts`. |
46
+ | `worker:<workerId>` | HASH | A worker's presence record: host, pid, concurrency, current jobs, processed/failed counts, state. |
47
+ | `<jobId>` | HASH | The job itself: `name`, `data`, `opts`, `state`, `attemptsMade`, timestamps, `returnvalue`/`failedReason`, `progress`, `stacktrace`, ... |
48
+ | `<jobId>:lock` | string (token, PX) | The per-job lock: the owning worker's token with an expiry. Only the holder may finish or renew it. |
49
+ | `<jobId>:logs` | LIST | Log lines appended by `job.log(...)` from inside a processor. |
50
+
51
+ Note the job hash key is just `<prefix>:<name>:<jobId>` (no extra segment), so a job
52
+ `5` on `toro:emails:` is the hash `toro:emails:5`, with `toro:emails:5:lock` and
53
+ `toro:emails:5:logs` beside it.
54
+
55
+ ## How the pieces connect
56
+
57
+ - A job moves between `prioritized` / `active` / `delayed` / `completed` / `failed`
58
+ as its state changes; the move and the hash update happen in one Lua script. See
59
+ [Architecture](architecture.md).
60
+ - The `lock` + `stalled` keys are the at-least-once machinery. See
61
+ [Reliability](reliability.md).
62
+ - `repeat` + `repeat:<id>` drive [scheduling](scheduling.md); `workers` +
63
+ `worker:<id>` + `departed` drive worker presence in the dashboard.
@@ -0,0 +1,20 @@
1
+ # toro documentation
2
+
3
+ Reference docs for how toro works. The [README](../README.md) is the quick start.
4
+
5
+ ## Pages
6
+
7
+ - **[Concepts](concepts.md)** — the mental model: queues, workers, jobs, the five
8
+ job states, and the difference between *workers* and *slots*.
9
+ - **[Data model](data-model.md)** — the exact Redis keys a queue uses and what
10
+ each one stores.
11
+ - **[Reliability](reliability.md)** — the at-least-once guarantee: per-job locks,
12
+ worker tokens, and stalled-job recovery.
13
+ - **[Producing jobs](producing.md)** — `Queue.add()` and every option (priority,
14
+ delay, retries/backoff, deduplication, custom ids).
15
+ - **[Processing jobs](processing.md)** — `Worker`: concurrency, lifecycle events,
16
+ rate limiting, and graceful shutdown.
17
+ - **[Scheduling](scheduling.md)** — repeatable and cron jobs, and how each
18
+ occurrence schedules the next.
19
+ - **[Architecture](architecture.md)** — the atomic-Lua core and the design
20
+ decisions behind the queue.