swarmq 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swarmq-0.1.0/.gitignore +42 -0
- swarmq-0.1.0/LICENSE +21 -0
- swarmq-0.1.0/PKG-INFO +334 -0
- swarmq-0.1.0/README.md +290 -0
- swarmq-0.1.0/pyproject.toml +179 -0
- swarmq-0.1.0/src/swarmq/__init__.py +162 -0
- swarmq-0.1.0/src/swarmq/_http.py +214 -0
- swarmq-0.1.0/src/swarmq/agent_spec.py +535 -0
- swarmq-0.1.0/src/swarmq/app.py +248 -0
- swarmq-0.1.0/src/swarmq/backend/__init__.py +0 -0
- swarmq-0.1.0/src/swarmq/backend/valkey.py +904 -0
- swarmq-0.1.0/src/swarmq/backoff.py +34 -0
- swarmq-0.1.0/src/swarmq/broker/__init__.py +0 -0
- swarmq-0.1.0/src/swarmq/broker/rabbitmq.py +952 -0
- swarmq-0.1.0/src/swarmq/cancellation.py +89 -0
- swarmq-0.1.0/src/swarmq/cli/__init__.py +338 -0
- swarmq-0.1.0/src/swarmq/cli/__main__.py +24 -0
- swarmq-0.1.0/src/swarmq/cli/_shared.py +261 -0
- swarmq-0.1.0/src/swarmq/cli/check_cmd.py +210 -0
- swarmq-0.1.0/src/swarmq/cli/dlq_cmd.py +287 -0
- swarmq-0.1.0/src/swarmq/cli/inspect_cmd.py +83 -0
- swarmq-0.1.0/src/swarmq/cli/parser.py +555 -0
- swarmq-0.1.0/src/swarmq/cli/schedule_cmd.py +313 -0
- swarmq-0.1.0/src/swarmq/cli/worker_cmd.py +102 -0
- swarmq-0.1.0/src/swarmq/client.py +1262 -0
- swarmq-0.1.0/src/swarmq/config.py +361 -0
- swarmq-0.1.0/src/swarmq/dishka.py +325 -0
- swarmq-0.1.0/src/swarmq/exceptions.py +77 -0
- swarmq-0.1.0/src/swarmq/headers.py +32 -0
- swarmq-0.1.0/src/swarmq/health.py +220 -0
- swarmq-0.1.0/src/swarmq/hot_reload.py +341 -0
- swarmq-0.1.0/src/swarmq/limits.py +127 -0
- swarmq-0.1.0/src/swarmq/locking.py +208 -0
- swarmq-0.1.0/src/swarmq/logging.py +136 -0
- swarmq-0.1.0/src/swarmq/lua/__init__.py +33 -0
- swarmq-0.1.0/src/swarmq/lua/chord_complete.lua +38 -0
- swarmq-0.1.0/src/swarmq/lua/lock_acquire.lua +66 -0
- swarmq-0.1.0/src/swarmq/lua/lock_release.lua +61 -0
- swarmq-0.1.0/src/swarmq/lua/lock_renew.lua +23 -0
- swarmq-0.1.0/src/swarmq/lua/sched_commit.lua +43 -0
- swarmq-0.1.0/src/swarmq/lua/sched_disable.lua +24 -0
- swarmq-0.1.0/src/swarmq/lua/sched_poll.lua +36 -0
- swarmq-0.1.0/src/swarmq/lua/sliding_window_counter.lua +65 -0
- swarmq-0.1.0/src/swarmq/lua/unique_release.lua +31 -0
- swarmq-0.1.0/src/swarmq/metrics.py +375 -0
- swarmq-0.1.0/src/swarmq/middleware.py +175 -0
- swarmq-0.1.0/src/swarmq/process_manager.py +1145 -0
- swarmq-0.1.0/src/swarmq/progress.py +109 -0
- swarmq-0.1.0/src/swarmq/registry.py +57 -0
- swarmq-0.1.0/src/swarmq/scheduler.py +1176 -0
- swarmq-0.1.0/src/swarmq/serialization.py +260 -0
- swarmq-0.1.0/src/swarmq/signals.py +200 -0
- swarmq-0.1.0/src/swarmq/task.py +503 -0
- swarmq-0.1.0/src/swarmq/worker.py +2808 -0
- swarmq-0.1.0/src/swarmq/workflow.py +281 -0
- swarmq-0.1.0/src/swarmq/workflow_orchestration.py +740 -0
swarmq-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
# Python
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.egg-info/
|
|
6
|
+
*.egg
|
|
7
|
+
dist/
|
|
8
|
+
build/
|
|
9
|
+
*.whl
|
|
10
|
+
|
|
11
|
+
# Virtual environments
|
|
12
|
+
.venv/
|
|
13
|
+
venv/
|
|
14
|
+
benchmarks/.venv/
|
|
15
|
+
|
|
16
|
+
# IDE
|
|
17
|
+
.idea/
|
|
18
|
+
.vscode/
|
|
19
|
+
*.swp
|
|
20
|
+
*.swo
|
|
21
|
+
|
|
22
|
+
# OS
|
|
23
|
+
.DS_Store
|
|
24
|
+
Thumbs.db
|
|
25
|
+
|
|
26
|
+
# Testing
|
|
27
|
+
.coverage
|
|
28
|
+
htmlcov/
|
|
29
|
+
.pytest_cache/
|
|
30
|
+
.mypy_cache/
|
|
31
|
+
|
|
32
|
+
# Claude
|
|
33
|
+
.claude/
|
|
34
|
+
|
|
35
|
+
# Compound-engineering review/work artifacts (machine-generated, run-scoped)
|
|
36
|
+
.context/
|
|
37
|
+
|
|
38
|
+
# Benchmark comparison run outputs (per-run JSON, not source)
|
|
39
|
+
benchmarks/comparison/results/
|
|
40
|
+
|
|
41
|
+
# mkdocs build output (generated; source lives in website/)
|
|
42
|
+
site/
|
swarmq-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Georg Stricker
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
swarmq-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,334 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: swarmq
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Async task manager for Python 3.14+ using RabbitMQ and Valkey
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
License-File: LICENSE
|
|
7
|
+
Requires-Python: >=3.14
|
|
8
|
+
Requires-Dist: aio-pika>=9
|
|
9
|
+
Requires-Dist: croniter>=2
|
|
10
|
+
Requires-Dist: msgpack>=1.0
|
|
11
|
+
Requires-Dist: msgspec>=0.18
|
|
12
|
+
Requires-Dist: structlog>=24
|
|
13
|
+
Requires-Dist: valkey[libvalkey]>=6
|
|
14
|
+
Provides-Extra: dishka
|
|
15
|
+
Requires-Dist: dishka>=1.10.1; extra == 'dishka'
|
|
16
|
+
Provides-Extra: docs
|
|
17
|
+
Requires-Dist: mkdocs-llmstxt>=0.3; extra == 'docs'
|
|
18
|
+
Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
|
|
19
|
+
Requires-Dist: mkdocs-static-i18n>=1.2; extra == 'docs'
|
|
20
|
+
Requires-Dist: mkdocstrings[python]>=0.26; extra == 'docs'
|
|
21
|
+
Requires-Dist: pymdown-extensions>=10.11; extra == 'docs'
|
|
22
|
+
Provides-Extra: fast
|
|
23
|
+
Requires-Dist: orjson; extra == 'fast'
|
|
24
|
+
Provides-Extra: metrics
|
|
25
|
+
Requires-Dist: prometheus-client; extra == 'metrics'
|
|
26
|
+
Provides-Extra: reload
|
|
27
|
+
Requires-Dist: watchfiles>=0.21; extra == 'reload'
|
|
28
|
+
Provides-Extra: test
|
|
29
|
+
Requires-Dist: dishka>=1.10.1; extra == 'test'
|
|
30
|
+
Requires-Dist: freezegun; extra == 'test'
|
|
31
|
+
Requires-Dist: httpx; extra == 'test'
|
|
32
|
+
Requires-Dist: hypothesis; extra == 'test'
|
|
33
|
+
Requires-Dist: mutmut; extra == 'test'
|
|
34
|
+
Requires-Dist: mypy; extra == 'test'
|
|
35
|
+
Requires-Dist: prometheus-client; extra == 'test'
|
|
36
|
+
Requires-Dist: psutil; extra == 'test'
|
|
37
|
+
Requires-Dist: pytest; extra == 'test'
|
|
38
|
+
Requires-Dist: pytest-asyncio; extra == 'test'
|
|
39
|
+
Requires-Dist: pytest-cov; extra == 'test'
|
|
40
|
+
Requires-Dist: pytest-rerunfailures; extra == 'test'
|
|
41
|
+
Requires-Dist: pytest-timeout; extra == 'test'
|
|
42
|
+
Requires-Dist: ruff; extra == 'test'
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
|
|
45
|
+
# SwarmQ
|
|
46
|
+
|
|
47
|
+
Async-first Python task queue on RabbitMQ + Valkey.
|
|
48
|
+
|
|
49
|
+
> **Status:** Phase 3 + E2E hardening complete; Phase 5 in progress
|
|
50
|
+
> (metrics, health endpoints, hot reload, and agent spec landed; bulk-throughput optimisation to come).
|
|
51
|
+
> 1280+ tests across unit / integration / e2e / chaos / property / fuzz
|
|
52
|
+
> / stress suites. Not yet production-tested by anyone but the author.
|
|
53
|
+
|
|
54
|
+
## Quick Start
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
# 1. Start RabbitMQ + Valkey (compose file lives in this repo)
|
|
58
|
+
docker compose up -d
|
|
59
|
+
|
|
60
|
+
# 2. Define a task
|
|
61
|
+
cat > myapp/tasks.py <<'EOF'
|
|
62
|
+
from swarmq import Task, TaskInfo
|
|
63
|
+
|
|
64
|
+
class SendEmail(Task):
|
|
65
|
+
task_info = TaskInfo(name="send_email", queues=["default"])
|
|
66
|
+
|
|
67
|
+
async def run(self, to: str, subject: str) -> str:
|
|
68
|
+
# ... your code here ...
|
|
69
|
+
return f"sent to {to}"
|
|
70
|
+
EOF
|
|
71
|
+
|
|
72
|
+
# 3. Tell SwarmQ where the broker and backend live
|
|
73
|
+
export SWARMQ_BROKER_URL="amqp://guest:guest@localhost:5672/"
|
|
74
|
+
export SWARMQ_BACKEND_URL="valkey://localhost:6379/0"
|
|
75
|
+
|
|
76
|
+
# 4. Run a worker
|
|
77
|
+
swarmq worker --module myapp.tasks --queues default --concurrency 10
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
The worker handles SIGTERM / SIGINT for graceful shutdown. Multiple
|
|
81
|
+
queues: `--queues default,emails,priority`. Structured logs:
|
|
82
|
+
`--log-format json`.
|
|
83
|
+
|
|
84
|
+
## Hello World — Client side
|
|
85
|
+
|
|
86
|
+
```python
|
|
87
|
+
import asyncio
|
|
88
|
+
from swarmq import SwarmQ, RabbitMQBroker, ValkeyBackend
|
|
89
|
+
import myapp.tasks # noqa — auto-registers SendEmail
|
|
90
|
+
|
|
91
|
+
async def main():
|
|
92
|
+
app = SwarmQ(
|
|
93
|
+
RabbitMQBroker("amqp://guest:guest@localhost:5672/"),
|
|
94
|
+
ValkeyBackend("valkey://localhost:6379/0"),
|
|
95
|
+
queues=["default"],
|
|
96
|
+
)
|
|
97
|
+
async with app:
|
|
98
|
+
task_id = await app.schedule(
|
|
99
|
+
"send_email", to="op@example.com", subject="hi",
|
|
100
|
+
)
|
|
101
|
+
result = await app.get_result(task_id, timeout=10)
|
|
102
|
+
print(result) # sent to op@example.com
|
|
103
|
+
|
|
104
|
+
asyncio.run(main())
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## Reusing an existing connection pool (producer)
|
|
108
|
+
|
|
109
|
+
If your app already holds its own RabbitMQ connection and/or Valkey
|
|
110
|
+
client, hand them to SwarmQ instead of a URL — SwarmQ reuses them rather
|
|
111
|
+
than opening a second, parallel connection to the same server. You can
|
|
112
|
+
pass either a high-level client/connection **or** a low-level pool, and
|
|
113
|
+
mix sources per backend:
|
|
114
|
+
|
|
115
|
+
```python
|
|
116
|
+
import aio_pika
|
|
117
|
+
import valkey.asyncio
|
|
118
|
+
from swarmq import SwarmQ, RabbitMQBroker, ValkeyBackend
|
|
119
|
+
|
|
120
|
+
# objects your app already owns
|
|
121
|
+
connection = await aio_pika.connect_robust("amqp://guest:guest@localhost:5672/")
|
|
122
|
+
valkey_client = valkey.asyncio.Valkey.from_url("valkey://localhost:6379/0")
|
|
123
|
+
|
|
124
|
+
app = SwarmQ(
|
|
125
|
+
RabbitMQBroker(connection=connection), # or connection_pool=<aio_pika.pool.Pool>
|
|
126
|
+
ValkeyBackend(client=valkey_client), # or pool=<valkey ConnectionPool>
|
|
127
|
+
# no broker_url / backend_url needed when a connection object is injected
|
|
128
|
+
)
|
|
129
|
+
async with app:
|
|
130
|
+
await app.schedule("send_email", to="op@example.com", subject="hi")
|
|
131
|
+
# closing `app` does NOT close `connection` or `valkey_client` — your app keeps them
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
Things to know:
|
|
135
|
+
|
|
136
|
+
- **Ownership:** SwarmQ never closes a connection/client/pool you inject —
|
|
137
|
+
it belongs to your app. SwarmQ does close the publish channel it opens
|
|
138
|
+
on a borrowed RabbitMQ connection (that channel is SwarmQ's).
|
|
139
|
+
- **Producer-only:** injection is for the task-*sending* path. Calling
|
|
140
|
+
`start_worker()` on an injected broker/backend raises
|
|
141
|
+
`ConfigurationError` — workers are spawned as subprocesses and
|
|
142
|
+
reconstruct their broker/backend from `SWARMQ_BROKER_URL` /
|
|
143
|
+
`SWARMQ_BACKEND_URL`, which a borrowed connection can't cross.
|
|
144
|
+
- **Robustness is yours:** an injected Valkey client/pool does **not**
|
|
145
|
+
inherit SwarmQ's retry / health-check policy, and an injected aio-pika
|
|
146
|
+
connection should be a robust one (`connect_robust`) for SwarmQ's
|
|
147
|
+
channel-reconnect handling to behave as designed.
|
|
148
|
+
- **Same namespace assumed:** the injected object must point at the same
|
|
149
|
+
server and logical namespace (RabbitMQ vhost, Valkey DB) the rest of
|
|
150
|
+
your config expects — SwarmQ does not validate this, and a mismatch
|
|
151
|
+
silently sends/reads tasks in the wrong namespace.
|
|
152
|
+
|
|
153
|
+
## What's in the box
|
|
154
|
+
|
|
155
|
+
### Core (Phase 1+2+3)
|
|
156
|
+
|
|
157
|
+
- **At-least-once delivery** with Quorum-Queue durability and
|
|
158
|
+
`x-delivery-limit=20` against pathological redelivery loops.
|
|
159
|
+
- **Retries** with exponential / linear / fixed backoff +
|
|
160
|
+
configurable `max_retries`, `NoRetry`, `Retry(delay=...)`.
|
|
161
|
+
- **DLQ** routing for retry-exhaustion + `swarmq dlq list/inspect/
|
|
162
|
+
retry/purge` CLI for operator recovery.
|
|
163
|
+
- **Middleware** stack with 6 hooks (pre/post enqueue, pre/post
|
|
164
|
+
execute, on_error, on_retry).
|
|
165
|
+
- **Signals** — 14 lifecycle events with `@app.on_signal(...)`
|
|
166
|
+
decorator.
|
|
167
|
+
- **Rate limiting** — Cloudflare 2-period sliding window via Lua
|
|
168
|
+
(`TaskInfo(rate_limit="100/m")`).
|
|
169
|
+
- **Locking** — mutex + semaphore via Valkey atomic primitives, with
|
|
170
|
+
format-string keys (`lock="user:{user_id}"`).
|
|
171
|
+
- **Unique tasks** — dedup by canonical-encoded args, with
|
|
172
|
+
`unique_until="start"` or `"completion"`.
|
|
173
|
+
- **Cancellation** — pre-pickup + during-execution via
|
|
174
|
+
`Client.cancel(task_id)`, dedicated `TASK_CANCELLED` signal.
|
|
175
|
+
- **Scheduler** — cron, delayed (`eta=...`), recurring with
|
|
176
|
+
leader-election failover.
|
|
177
|
+
- **Progress tracking** — `Task.update_progress(current, total, msg)`
|
|
178
|
+
+ `Client.get_progress` / `Client.watch_progress` AsyncIterator.
|
|
179
|
+
- **CLI** — `swarmq worker`, `swarmq schedule`, `swarmq dlq *`,
|
|
180
|
+
`swarmq inspect`.
|
|
181
|
+
|
|
182
|
+
### Workflow primitives
|
|
183
|
+
|
|
184
|
+
- **`chain` / `group` / `chord`** with `Signature` building blocks
|
|
185
|
+
(`Task.s()`, `sig("name")`, immutable `Task.si()`) and operator
|
|
186
|
+
syntax (`A.s() | B.s()`, `group(...) | merge.s()`). Flat-DAG
|
|
187
|
+
composition with construction-time limits (`max_workflow_depth=10`,
|
|
188
|
+
`max_group_size=1000`).
|
|
189
|
+
- **Fire-and-forget `apply()`** returns a reattachable handle:
|
|
190
|
+
`await app.apply(chain(...))` → `handle.get(timeout=...)`, `cancel()`,
|
|
191
|
+
`children`, stable `workflow_id` reattach via `app.workflow(id)`.
|
|
192
|
+
- **Worker-driven orchestration** — workers advance the workflow on each
|
|
193
|
+
task completion (chain result-injection, atomic+idempotent chord
|
|
194
|
+
fan-in via Lua). Fail-fast with liveness: a failed step marks the
|
|
195
|
+
workflow failed and wakes `get()` instead of hanging.
|
|
196
|
+
|
|
197
|
+
```python
|
|
198
|
+
result = await (await app.apply(
|
|
199
|
+
chord(group(Download.s(u) for u in urls), Merge.s())
|
|
200
|
+
)).get(timeout=30)
|
|
201
|
+
```
|
|
202
|
+
|
|
203
|
+
### Operations (Phase 4 + Phase 5)
|
|
204
|
+
|
|
205
|
+
- **Multi-process workers** (`--processes N`) — supervisor starts N
|
|
206
|
+
subprocesses, restarts crashed workers (5/hr rolling-window limit),
|
|
207
|
+
reloads on SIGHUP via `os.execvp`. SIGHUP replays the `sys.argv`
|
|
208
|
+
snapshot captured at supervisor start (by design — in-process
|
|
209
|
+
argv-mutation cannot influence the re-exec vector). Secure
|
|
210
|
+
launch entrypoints (container CMD, systemd ExecStart) remain the
|
|
211
|
+
trust boundary for the initial argv.
|
|
212
|
+
- **Autoscaling** (`--autoscale=MIN,MAX`) — queue-depth-driven
|
|
213
|
+
scale up/down with 30s cooldown, 60s idle window, and a 30s
|
|
214
|
+
minimum worker age that suppresses spawn→immediate-kill churn
|
|
215
|
+
when a crashed worker is restarted during an idle period.
|
|
216
|
+
- **Prometheus metrics** (`pip install swarmq[metrics]`) — 7 metrics
|
|
217
|
+
exposed on `/metrics`, drop-in no-op when the dep is absent.
|
|
218
|
+
- **Health endpoints** — `/health` (liveness) + `/ready` (broker +
|
|
219
|
+
backend reachable, structured 503 body).
|
|
220
|
+
- **Hot reload** (`--reload`, `pip install swarmq[reload]`) — file-
|
|
221
|
+
watch trigger that fires the existing SIGHUP drain-and-execvpe
|
|
222
|
+
pipeline. Developer-only (WARNING log on every start); see
|
|
223
|
+
`website/guides/hot-reload.md`.
|
|
224
|
+
|
|
225
|
+
### Reliability hardening (E2E + Chaos suites)
|
|
226
|
+
|
|
227
|
+
- **Channel-recreate** survives RabbitMQ broker restarts mid-publish.
|
|
228
|
+
- **Pub/Sub resubscribe** survives Valkey restarts mid-stream.
|
|
229
|
+
- **Toxiproxy chaos suite** verifies behavior under network latency,
|
|
230
|
+
bandwidth limits, slicer (packet loss), full disconnect, and
|
|
231
|
+
partial partitions.
|
|
232
|
+
- **Property-based tests** on dedup-hash, serialization, backoff,
|
|
233
|
+
rate-limiter; **fuzz-tests** on message parsers and external
|
|
234
|
+
publishers.
|
|
235
|
+
|
|
236
|
+
## Configuration
|
|
237
|
+
|
|
238
|
+
Worker behaviour is configured via env vars and CLI flags (CLI flag
|
|
239
|
+
wins where both are set).
|
|
240
|
+
|
|
241
|
+
| Variable / flag | Default | Purpose |
|
|
242
|
+
|---|---|---|
|
|
243
|
+
| `SWARMQ_BROKER_URL` | required | AMQP URL for RabbitMQ |
|
|
244
|
+
| `SWARMQ_BACKEND_URL` | required | Valkey/Redis URL for results |
|
|
245
|
+
| `--queues a,b,c` | `default` | Queues to consume |
|
|
246
|
+
| `--concurrency N` | `10` | Max parallel tasks per worker |
|
|
247
|
+
| `--log-level` | `INFO` | DEBUG / INFO / WARNING / ERROR |
|
|
248
|
+
| `--log-format` | `human` | `human` or `json` |
|
|
249
|
+
|
|
250
|
+
Other env vars: `SWARMQ_RESULT_TTL`, `SWARMQ_QUEUES`,
|
|
251
|
+
`SWARMQ_CONCURRENCY`, `SWARMQ_LOG_LEVEL`, `SWARMQ_LOG_JSON`. Full
|
|
252
|
+
list in `src/swarmq/config.py`.
|
|
253
|
+
|
|
254
|
+
## What's planned next
|
|
255
|
+
|
|
256
|
+
Phase 4 operator features complete (Priorities, Expiry, Burst,
|
|
257
|
+
ProcessManager, Autoscaling). Phase 5 Wave 1 complete (Metrics,
|
|
258
|
+
Health, Hot reload). Workflow primitives (`chain` / `group` / `chord`
|
|
259
|
+
+ Signatures) implemented — see "Workflow primitives" above.
|
|
260
|
+
Machine-readable agent spec (R5.5) implemented — generated JSON for
|
|
261
|
+
errors, config, CLI, and message schemas plus an `llms.txt` index
|
|
262
|
+
under `reference/agent/spec/`, kept in sync with the code by tests
|
|
263
|
+
(`python -m swarmq.agent_spec`).
|
|
264
|
+
Remaining — bulk-throughput optimisation (`schedule_many` fast path)
|
|
265
|
+
— see `docs/brainstorms/implementation-order-requirements.md`.
|
|
266
|
+
|
|
267
|
+
## Project layout
|
|
268
|
+
|
|
269
|
+
```
|
|
270
|
+
src/swarmq/
|
|
271
|
+
app.py SwarmQ application
|
|
272
|
+
worker.py Consumer loop
|
|
273
|
+
client.py Schedule API + result retrieval
|
|
274
|
+
task.py Task base + TaskInfo
|
|
275
|
+
middleware.py Middleware base + chain
|
|
276
|
+
signals.py Signal enum + dispatcher
|
|
277
|
+
limits.py RateLimiter + parser
|
|
278
|
+
locking.py Mutex / semaphore
|
|
279
|
+
cancellation.py Cancel-flag protocol
|
|
280
|
+
progress.py ProgressInfo (R3.7)
|
|
281
|
+
scheduler.py Embedded cron + delayed scheduler
|
|
282
|
+
metrics.py Prometheus exporter (R5.1)
|
|
283
|
+
health.py /health + /ready (R5.2)
|
|
284
|
+
cli/ argparse entrypoint + subcommands (package)
|
|
285
|
+
lua/ Lua scripts loaded into Valkey
|
|
286
|
+
broker/rabbitmq.py
|
|
287
|
+
backend/valkey.py
|
|
288
|
+
tests/
|
|
289
|
+
unit/ ~895 fast, no infra
|
|
290
|
+
integration/ broker + backend, real Docker
|
|
291
|
+
e2e/ full client + worker round-trip
|
|
292
|
+
chaos/ toxiproxy + container restarts (opt-in, nightly)
|
|
293
|
+
property/ hypothesis property-based (opt-in)
|
|
294
|
+
fuzz/ byte + JSON fuzz (opt-in)
|
|
295
|
+
stress/ high-parallelism (opt-in)
|
|
296
|
+
soak/ 1h leak detection (opt-in, weekly)
|
|
297
|
+
docs/
|
|
298
|
+
brainstorms/ requirements (per feature)
|
|
299
|
+
plans/ implementation plans (per feature)
|
|
300
|
+
solutions/ postmortem-style learnings
|
|
301
|
+
testing/ test strategy + feature-parity matrix
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
## Development
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
# Fast dev loop — unit only
|
|
308
|
+
uv run --extra test pytest tests/unit -q
|
|
309
|
+
|
|
310
|
+
# E2E suite (needs docker compose up -d)
|
|
311
|
+
SWARMQ_TEST_RABBITMQ_PORT=5673 SWARMQ_TEST_VALKEY_PORT=6380 \
|
|
312
|
+
uv run --extra test pytest tests/e2e -q
|
|
313
|
+
|
|
314
|
+
# Chaos suite (starts its own toxiproxy stack per test class)
|
|
315
|
+
uv run --extra test pytest tests/chaos -q -m chaos
|
|
316
|
+
|
|
317
|
+
# E2E feature-combinations suite (31 combos × 3 failure tiers)
|
|
318
|
+
# Tier-1 only — no Toxiproxy needed, runs as part of the regular e2e
|
|
319
|
+
# suite above. Tier-2/3 (chaos-marked) needs the Toxiproxy stack:
|
|
320
|
+
docker compose -f docker-compose.yml -f docker-compose.chaos.yml up -d
|
|
321
|
+
uv run --extra test pytest tests/e2e/combinations -v -m chaos
|
|
322
|
+
|
|
323
|
+
# Property + fuzz
|
|
324
|
+
uv run --extra test pytest tests/property tests/fuzz -q -m "property or fuzz"
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
Three-tier CI in `.github/workflows/`: pre-merge (≤5 min, every PR),
|
|
328
|
+
nightly (chaos + stress + fuzz), weekly (mutation + 1h soak).
|
|
329
|
+
|
|
330
|
+
For coding conventions and TDD discipline see `AGENTS.md`.
|
|
331
|
+
|
|
332
|
+
## License
|
|
333
|
+
|
|
334
|
+
[MIT](LICENSE).
|
swarmq-0.1.0/README.md
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
# SwarmQ
|
|
2
|
+
|
|
3
|
+
Async-first Python task queue on RabbitMQ + Valkey.
|
|
4
|
+
|
|
5
|
+
> **Status:** Phase 3 + E2E hardening complete; Phase 5 in progress
|
|
6
|
+
> (metrics, health endpoints, hot reload, and agent spec landed; bulk-throughput optimisation to come).
|
|
7
|
+
> 1280+ tests across unit / integration / e2e / chaos / property / fuzz
|
|
8
|
+
> / stress suites. Not yet production-tested by anyone but the author.
|
|
9
|
+
|
|
10
|
+
## Quick Start
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
# 1. Start RabbitMQ + Valkey (compose file lives in this repo)
|
|
14
|
+
docker compose up -d
|
|
15
|
+
|
|
16
|
+
# 2. Define a task
|
|
17
|
+
cat > myapp/tasks.py <<'EOF'
|
|
18
|
+
from swarmq import Task, TaskInfo
|
|
19
|
+
|
|
20
|
+
class SendEmail(Task):
|
|
21
|
+
task_info = TaskInfo(name="send_email", queues=["default"])
|
|
22
|
+
|
|
23
|
+
async def run(self, to: str, subject: str) -> str:
|
|
24
|
+
# ... your code here ...
|
|
25
|
+
return f"sent to {to}"
|
|
26
|
+
EOF
|
|
27
|
+
|
|
28
|
+
# 3. Tell SwarmQ where the broker and backend live
|
|
29
|
+
export SWARMQ_BROKER_URL="amqp://guest:guest@localhost:5672/"
|
|
30
|
+
export SWARMQ_BACKEND_URL="valkey://localhost:6379/0"
|
|
31
|
+
|
|
32
|
+
# 4. Run a worker
|
|
33
|
+
swarmq worker --module myapp.tasks --queues default --concurrency 10
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
The worker handles SIGTERM / SIGINT for graceful shutdown. Multiple
|
|
37
|
+
queues: `--queues default,emails,priority`. Structured logs:
|
|
38
|
+
`--log-format json`.
|
|
39
|
+
|
|
40
|
+
## Hello World — Client side
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
import asyncio
|
|
44
|
+
from swarmq import SwarmQ, RabbitMQBroker, ValkeyBackend
|
|
45
|
+
import myapp.tasks # noqa — auto-registers SendEmail
|
|
46
|
+
|
|
47
|
+
async def main():
|
|
48
|
+
app = SwarmQ(
|
|
49
|
+
RabbitMQBroker("amqp://guest:guest@localhost:5672/"),
|
|
50
|
+
ValkeyBackend("valkey://localhost:6379/0"),
|
|
51
|
+
queues=["default"],
|
|
52
|
+
)
|
|
53
|
+
async with app:
|
|
54
|
+
task_id = await app.schedule(
|
|
55
|
+
"send_email", to="op@example.com", subject="hi",
|
|
56
|
+
)
|
|
57
|
+
result = await app.get_result(task_id, timeout=10)
|
|
58
|
+
print(result) # sent to op@example.com
|
|
59
|
+
|
|
60
|
+
asyncio.run(main())
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## Reusing an existing connection pool (producer)
|
|
64
|
+
|
|
65
|
+
If your app already holds its own RabbitMQ connection and/or Valkey
|
|
66
|
+
client, hand them to SwarmQ instead of a URL — SwarmQ reuses them rather
|
|
67
|
+
than opening a second, parallel connection to the same server. You can
|
|
68
|
+
pass either a high-level client/connection **or** a low-level pool, and
|
|
69
|
+
mix sources per backend:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
import aio_pika
|
|
73
|
+
import valkey.asyncio
|
|
74
|
+
from swarmq import SwarmQ, RabbitMQBroker, ValkeyBackend
|
|
75
|
+
|
|
76
|
+
# objects your app already owns
|
|
77
|
+
connection = await aio_pika.connect_robust("amqp://guest:guest@localhost:5672/")
|
|
78
|
+
valkey_client = valkey.asyncio.Valkey.from_url("valkey://localhost:6379/0")
|
|
79
|
+
|
|
80
|
+
app = SwarmQ(
|
|
81
|
+
RabbitMQBroker(connection=connection), # or connection_pool=<aio_pika.pool.Pool>
|
|
82
|
+
ValkeyBackend(client=valkey_client), # or pool=<valkey ConnectionPool>
|
|
83
|
+
# no broker_url / backend_url needed when a connection object is injected
|
|
84
|
+
)
|
|
85
|
+
async with app:
|
|
86
|
+
await app.schedule("send_email", to="op@example.com", subject="hi")
|
|
87
|
+
# closing `app` does NOT close `connection` or `valkey_client` — your app keeps them
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Things to know:
|
|
91
|
+
|
|
92
|
+
- **Ownership:** SwarmQ never closes a connection/client/pool you inject —
|
|
93
|
+
it belongs to your app. SwarmQ does close the publish channel it opens
|
|
94
|
+
on a borrowed RabbitMQ connection (that channel is SwarmQ's).
|
|
95
|
+
- **Producer-only:** injection is for the task-*sending* path. Calling
|
|
96
|
+
`start_worker()` on an injected broker/backend raises
|
|
97
|
+
`ConfigurationError` — workers are spawned as subprocesses and
|
|
98
|
+
reconstruct their broker/backend from `SWARMQ_BROKER_URL` /
|
|
99
|
+
`SWARMQ_BACKEND_URL`, which a borrowed connection can't cross.
|
|
100
|
+
- **Robustness is yours:** an injected Valkey client/pool does **not**
|
|
101
|
+
inherit SwarmQ's retry / health-check policy, and an injected aio-pika
|
|
102
|
+
connection should be a robust one (`connect_robust`) for SwarmQ's
|
|
103
|
+
channel-reconnect handling to behave as designed.
|
|
104
|
+
- **Same namespace assumed:** the injected object must point at the same
|
|
105
|
+
server and logical namespace (RabbitMQ vhost, Valkey DB) the rest of
|
|
106
|
+
your config expects — SwarmQ does not validate this, and a mismatch
|
|
107
|
+
silently sends/reads tasks in the wrong namespace.
|
|
108
|
+
|
|
109
|
+
## What's in the box
|
|
110
|
+
|
|
111
|
+
### Core (Phase 1+2+3)
|
|
112
|
+
|
|
113
|
+
- **At-least-once delivery** with Quorum-Queue durability and
|
|
114
|
+
`x-delivery-limit=20` against pathological redelivery loops.
|
|
115
|
+
- **Retries** with exponential / linear / fixed backoff +
|
|
116
|
+
configurable `max_retries`, `NoRetry`, `Retry(delay=...)`.
|
|
117
|
+
- **DLQ** routing for retry-exhaustion +
|
|
118
|
+
`swarmq dlq list/inspect/retry/purge` CLI for operator recovery.
|
|
119
|
+
- **Middleware** stack with 6 hooks (pre/post enqueue, pre/post
|
|
120
|
+
execute, on_error, on_retry).
|
|
121
|
+
- **Signals** — 14 lifecycle events with `@app.on_signal(...)`
|
|
122
|
+
decorator.
|
|
123
|
+
- **Rate limiting** — Cloudflare 2-period sliding window via Lua
|
|
124
|
+
(`TaskInfo(rate_limit="100/m")`).
|
|
125
|
+
- **Locking** — mutex + semaphore via Valkey atomic primitives, with
|
|
126
|
+
format-string keys (`lock="user:{user_id}"`).
|
|
127
|
+
- **Unique tasks** — dedup by canonical-encoded args, with
|
|
128
|
+
`unique_until="start"` or `"completion"`.
|
|
129
|
+
- **Cancellation** — pre-pickup + during-execution via
|
|
130
|
+
`Client.cancel(task_id)`, dedicated `TASK_CANCELLED` signal.
|
|
131
|
+
- **Scheduler** — cron, delayed (`eta=...`), recurring with
|
|
132
|
+
leader-election failover.
|
|
133
|
+
- **Progress tracking** — `Task.update_progress(current, total, msg)`
|
|
134
|
+
plus `Client.get_progress` / `Client.watch_progress` AsyncIterator.
|
|
135
|
+
- **CLI** — `swarmq worker`, `swarmq schedule`, `swarmq dlq *`,
|
|
136
|
+
`swarmq inspect`.
|
|
137
|
+
|
|
138
|
+
### Workflow primitives
|
|
139
|
+
|
|
140
|
+
- **`chain` / `group` / `chord`** with `Signature` building blocks
|
|
141
|
+
(`Task.s()`, `sig("name")`, immutable `Task.si()`) and operator
|
|
142
|
+
syntax (`A.s() | B.s()`, `group(...) | merge.s()`). Flat-DAG
|
|
143
|
+
composition with construction-time limits (`max_workflow_depth=10`,
|
|
144
|
+
`max_group_size=1000`).
|
|
145
|
+
- **Fire-and-forget `apply()`** returns a reattachable handle:
|
|
146
|
+
`await app.apply(chain(...))` → `handle.get(timeout=...)`, `cancel()`,
|
|
147
|
+
`children`, stable `workflow_id` reattach via `app.workflow(id)`.
|
|
148
|
+
- **Worker-driven orchestration** — workers advance the workflow on each
|
|
149
|
+
task completion (chain result-injection, atomic+idempotent chord
|
|
150
|
+
fan-in via Lua). Fail-fast with liveness: a failed step marks the
|
|
151
|
+
workflow failed and wakes `get()` instead of hanging.
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
result = await (await app.apply(
|
|
155
|
+
chord(group(Download.s(u) for u in urls), Merge.s())
|
|
156
|
+
)).get(timeout=30)
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
### Operations (Phase 4 + Phase 5)
|
|
160
|
+
|
|
161
|
+
- **Multi-process workers** (`--processes N`) — supervisor starts N
|
|
162
|
+
subprocesses, restarts crashed workers (5/hr rolling-window limit),
|
|
163
|
+
reloads on SIGHUP via `os.execvp`. SIGHUP replays the `sys.argv`
|
|
164
|
+
snapshot captured at supervisor start (by design — in-process
|
|
165
|
+
argv-mutation cannot influence the re-exec vector). Secure launch
|
|
165
|
+
entrypoints (container CMD, systemd ExecStart) remain the trust
|
|
166
|
+
boundary for the initial argv.
|
|
168
|
+
- **Autoscaling** (`--autoscale=MIN,MAX`) — queue-depth-driven
|
|
169
|
+
scale up/down with 30s cooldown, 60s idle window, and a 30s
|
|
170
|
+
minimum worker age that suppresses spawn→immediate-kill churn
|
|
171
|
+
when a crashed worker is restarted during an idle period.
|
|
172
|
+
- **Prometheus metrics** (`pip install swarmq[metrics]`) — 7 metrics
|
|
173
|
+
exposed on `/metrics`, drop-in no-op when the dep is absent.
|
|
174
|
+
- **Health endpoints** — `/health` (liveness) + `/ready` (broker +
|
|
175
|
+
backend reachable, structured 503 body).
|
|
176
|
+
- **Hot reload** (`--reload`, `pip install swarmq[reload]`) —
|
|
177
|
+
file-watch trigger that fires the existing SIGHUP drain-and-execvpe
|
|
178
|
+
pipeline. Developer-only (WARNING log on every start); see
|
|
179
|
+
`website/guides/hot-reload.md`.
|
|
180
|
+
|
|
181
|
+
### Reliability hardening (E2E + Chaos suites)
|
|
182
|
+
|
|
183
|
+
- **Channel-recreate** survives RabbitMQ broker restarts mid-publish.
|
|
184
|
+
- **Pub/Sub resubscribe** survives Valkey restarts mid-stream.
|
|
185
|
+
- **Toxiproxy chaos suite** verifies behavior under network latency,
|
|
186
|
+
bandwidth limits, slicer (packet loss), full disconnect, and
|
|
187
|
+
partial partitions.
|
|
188
|
+
- **Property-based tests** on dedup-hash, serialization, backoff,
|
|
189
|
+
rate-limiter; **fuzz-tests** on message parsers and external
|
|
190
|
+
publishers.
|
|
191
|
+
|
|
192
|
+
## Configuration
|
|
193
|
+
|
|
194
|
+
Worker behaviour is configured via env vars and CLI flags (CLI flag
|
|
195
|
+
wins where both are set).
|
|
196
|
+
|
|
197
|
+
| Variable / flag | Default | Purpose |
|
|
198
|
+
|---|---|---|
|
|
199
|
+
| `SWARMQ_BROKER_URL` | required | AMQP URL for RabbitMQ |
|
|
200
|
+
| `SWARMQ_BACKEND_URL` | required | Valkey/Redis URL for results |
|
|
201
|
+
| `--queues a,b,c` | `default` | Queues to consume |
|
|
202
|
+
| `--concurrency N` | `10` | Max parallel tasks per worker |
|
|
203
|
+
| `--log-level` | `INFO` | DEBUG / INFO / WARNING / ERROR |
|
|
204
|
+
| `--log-format` | `human` | `human` or `json` |
|
|
205
|
+
|
|
206
|
+
Other env vars: `SWARMQ_RESULT_TTL`, `SWARMQ_QUEUES`,
|
|
207
|
+
`SWARMQ_CONCURRENCY`, `SWARMQ_LOG_LEVEL`, `SWARMQ_LOG_JSON`. Full
|
|
208
|
+
list in `src/swarmq/config.py`.
|
|
209
|
+
|
|
210
|
+
## What's planned next
|
|
211
|
+
|
|
212
|
+
Phase 4 operator features complete (Priorities, Expiry, Burst,
|
|
213
|
+
ProcessManager, Autoscaling). Phase 5 Wave 1 complete (Metrics,
|
|
214
|
+
Health, Hot reload). Workflow primitives (`chain` / `group` / `chord`
|
|
215
|
+
and Signatures) implemented — see "Workflow primitives" above.
|
|
216
|
+
Machine-readable agent spec (R5.5) implemented — generated JSON for
|
|
217
|
+
errors, config, CLI, and message schemas plus an `llms.txt` index
|
|
218
|
+
under `reference/agent/spec/`, kept in sync with the code by tests
|
|
219
|
+
(`python -m swarmq.agent_spec`).
|
|
220
|
+
Remaining — bulk-throughput optimisation (`schedule_many` fast path)
|
|
221
|
+
— see `docs/brainstorms/implementation-order-requirements.md`.
|
|
222
|
+
|
|
223
|
+
## Project layout
|
|
224
|
+
|
|
225
|
+
```
|
|
226
|
+
src/swarmq/
|
|
227
|
+
app.py SwarmQ application
|
|
228
|
+
worker.py Consumer loop
|
|
229
|
+
client.py Schedule API + result retrieval
|
|
230
|
+
task.py Task base + TaskInfo
|
|
231
|
+
middleware.py Middleware base + chain
|
|
232
|
+
signals.py Signal enum + dispatcher
|
|
233
|
+
limits.py RateLimiter + parser
|
|
234
|
+
locking.py Mutex / semaphore
|
|
235
|
+
cancellation.py Cancel-flag protocol
|
|
236
|
+
progress.py ProgressInfo (R3.7)
|
|
237
|
+
scheduler.py Embedded cron + delayed scheduler
|
|
238
|
+
metrics.py Prometheus exporter (R5.1)
|
|
239
|
+
health.py /health + /ready (R5.2)
|
|
240
|
+
cli/ argparse entrypoint (package)
|
|
241
|
+
lua/ Lua scripts loaded into Valkey
|
|
242
|
+
broker/rabbitmq.py
|
|
243
|
+
backend/valkey.py
|
|
244
|
+
tests/
|
|
245
|
+
unit/ ~895 fast, no infra
|
|
246
|
+
integration/ broker + backend, real Docker
|
|
247
|
+
e2e/ full client + worker round-trip
|
|
248
|
+
chaos/ toxiproxy + container restarts (opt-in, nightly)
|
|
249
|
+
property/ hypothesis property-based (opt-in)
|
|
250
|
+
fuzz/ byte + JSON fuzz (opt-in)
|
|
251
|
+
stress/ high-parallelism (opt-in)
|
|
252
|
+
soak/ 1h leak detection (opt-in, weekly)
|
|
253
|
+
docs/
|
|
254
|
+
brainstorms/ requirements (per feature)
|
|
255
|
+
plans/ implementation plans (per feature)
|
|
256
|
+
solutions/ postmortem-style learnings
|
|
257
|
+
testing/ test strategy + feature-parity matrix
|
|
258
|
+
```
|
|
259
|
+
|
|
260
|
+
## Development
|
|
261
|
+
|
|
262
|
+
```bash
|
|
263
|
+
# Fast dev loop — unit only
|
|
264
|
+
uv run --extra test pytest tests/unit -q
|
|
265
|
+
|
|
266
|
+
# E2E suite (needs docker compose up -d)
|
|
267
|
+
SWARMQ_TEST_RABBITMQ_PORT=5673 SWARMQ_TEST_VALKEY_PORT=6380 \
|
|
268
|
+
uv run --extra test pytest tests/e2e -q
|
|
269
|
+
|
|
270
|
+
# Chaos suite (starts its own toxiproxy stack per test class)
|
|
271
|
+
uv run --extra test pytest tests/chaos -q -m chaos
|
|
272
|
+
|
|
273
|
+
# E2E feature-combinations suite (31 combos × 3 failure tiers)
|
|
274
|
+
# Tier-1 only — no Toxiproxy needed, runs as part of the regular e2e
|
|
275
|
+
# suite above. Tier-2/3 (chaos-marked) needs the Toxiproxy stack:
|
|
276
|
+
docker compose -f docker-compose.yml -f docker-compose.chaos.yml up -d
|
|
277
|
+
uv run --extra test pytest tests/e2e/combinations -v -m chaos
|
|
278
|
+
|
|
279
|
+
# Property + fuzz
|
|
280
|
+
uv run --extra test pytest tests/property tests/fuzz -q -m "property or fuzz"
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
Three-tier CI in `.github/workflows/`: pre-merge (≤5 min, every PR),
|
|
284
|
+
nightly (chaos + stress + fuzz), weekly (mutation + 1h soak).
|
|
285
|
+
|
|
286
|
+
For coding conventions and TDD discipline see `AGENTS.md`.
|
|
287
|
+
|
|
288
|
+
## License
|
|
289
|
+
|
|
290
|
+
[MIT](LICENSE).
|