swarmq 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. swarmq-0.1.0/.gitignore +42 -0
  2. swarmq-0.1.0/LICENSE +21 -0
  3. swarmq-0.1.0/PKG-INFO +334 -0
  4. swarmq-0.1.0/README.md +290 -0
  5. swarmq-0.1.0/pyproject.toml +179 -0
  6. swarmq-0.1.0/src/swarmq/__init__.py +162 -0
  7. swarmq-0.1.0/src/swarmq/_http.py +214 -0
  8. swarmq-0.1.0/src/swarmq/agent_spec.py +535 -0
  9. swarmq-0.1.0/src/swarmq/app.py +248 -0
  10. swarmq-0.1.0/src/swarmq/backend/__init__.py +0 -0
  11. swarmq-0.1.0/src/swarmq/backend/valkey.py +904 -0
  12. swarmq-0.1.0/src/swarmq/backoff.py +34 -0
  13. swarmq-0.1.0/src/swarmq/broker/__init__.py +0 -0
  14. swarmq-0.1.0/src/swarmq/broker/rabbitmq.py +952 -0
  15. swarmq-0.1.0/src/swarmq/cancellation.py +89 -0
  16. swarmq-0.1.0/src/swarmq/cli/__init__.py +338 -0
  17. swarmq-0.1.0/src/swarmq/cli/__main__.py +24 -0
  18. swarmq-0.1.0/src/swarmq/cli/_shared.py +261 -0
  19. swarmq-0.1.0/src/swarmq/cli/check_cmd.py +210 -0
  20. swarmq-0.1.0/src/swarmq/cli/dlq_cmd.py +287 -0
  21. swarmq-0.1.0/src/swarmq/cli/inspect_cmd.py +83 -0
  22. swarmq-0.1.0/src/swarmq/cli/parser.py +555 -0
  23. swarmq-0.1.0/src/swarmq/cli/schedule_cmd.py +313 -0
  24. swarmq-0.1.0/src/swarmq/cli/worker_cmd.py +102 -0
  25. swarmq-0.1.0/src/swarmq/client.py +1262 -0
  26. swarmq-0.1.0/src/swarmq/config.py +361 -0
  27. swarmq-0.1.0/src/swarmq/dishka.py +325 -0
  28. swarmq-0.1.0/src/swarmq/exceptions.py +77 -0
  29. swarmq-0.1.0/src/swarmq/headers.py +32 -0
  30. swarmq-0.1.0/src/swarmq/health.py +220 -0
  31. swarmq-0.1.0/src/swarmq/hot_reload.py +341 -0
  32. swarmq-0.1.0/src/swarmq/limits.py +127 -0
  33. swarmq-0.1.0/src/swarmq/locking.py +208 -0
  34. swarmq-0.1.0/src/swarmq/logging.py +136 -0
  35. swarmq-0.1.0/src/swarmq/lua/__init__.py +33 -0
  36. swarmq-0.1.0/src/swarmq/lua/chord_complete.lua +38 -0
  37. swarmq-0.1.0/src/swarmq/lua/lock_acquire.lua +66 -0
  38. swarmq-0.1.0/src/swarmq/lua/lock_release.lua +61 -0
  39. swarmq-0.1.0/src/swarmq/lua/lock_renew.lua +23 -0
  40. swarmq-0.1.0/src/swarmq/lua/sched_commit.lua +43 -0
  41. swarmq-0.1.0/src/swarmq/lua/sched_disable.lua +24 -0
  42. swarmq-0.1.0/src/swarmq/lua/sched_poll.lua +36 -0
  43. swarmq-0.1.0/src/swarmq/lua/sliding_window_counter.lua +65 -0
  44. swarmq-0.1.0/src/swarmq/lua/unique_release.lua +31 -0
  45. swarmq-0.1.0/src/swarmq/metrics.py +375 -0
  46. swarmq-0.1.0/src/swarmq/middleware.py +175 -0
  47. swarmq-0.1.0/src/swarmq/process_manager.py +1145 -0
  48. swarmq-0.1.0/src/swarmq/progress.py +109 -0
  49. swarmq-0.1.0/src/swarmq/registry.py +57 -0
  50. swarmq-0.1.0/src/swarmq/scheduler.py +1176 -0
  51. swarmq-0.1.0/src/swarmq/serialization.py +260 -0
  52. swarmq-0.1.0/src/swarmq/signals.py +200 -0
  53. swarmq-0.1.0/src/swarmq/task.py +503 -0
  54. swarmq-0.1.0/src/swarmq/worker.py +2808 -0
  55. swarmq-0.1.0/src/swarmq/workflow.py +281 -0
  56. swarmq-0.1.0/src/swarmq/workflow_orchestration.py +740 -0
@@ -0,0 +1,42 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.egg-info/
6
+ *.egg
7
+ dist/
8
+ build/
9
+ *.whl
10
+
11
+ # Virtual environments
12
+ .venv/
13
+ venv/
14
+ benchmarks/.venv/
15
+
16
+ # IDE
17
+ .idea/
18
+ .vscode/
19
+ *.swp
20
+ *.swo
21
+
22
+ # OS
23
+ .DS_Store
24
+ Thumbs.db
25
+
26
+ # Testing
27
+ .coverage
28
+ htmlcov/
29
+ .pytest_cache/
30
+ .mypy_cache/
31
+
32
+ # Claude
33
+ .claude/
34
+
35
+ # Compound-engineering review/work artifacts (machine-generated, run-scoped)
36
+ .context/
37
+
38
+ # Benchmark comparison run outputs (per-run JSON, not source)
39
+ benchmarks/comparison/results/
40
+
41
+ # mkdocs build output (generated; source lives in website/)
42
+ site/
swarmq-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Georg Stricker
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
swarmq-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,334 @@
1
+ Metadata-Version: 2.4
2
+ Name: swarmq
3
+ Version: 0.1.0
4
+ Summary: Async task manager for Python 3.14+ using RabbitMQ and Valkey
5
+ License-Expression: MIT
6
+ License-File: LICENSE
7
+ Requires-Python: >=3.14
8
+ Requires-Dist: aio-pika>=9
9
+ Requires-Dist: croniter>=2
10
+ Requires-Dist: msgpack>=1.0
11
+ Requires-Dist: msgspec>=0.18
12
+ Requires-Dist: structlog>=24
13
+ Requires-Dist: valkey[libvalkey]>=6
14
+ Provides-Extra: dishka
15
+ Requires-Dist: dishka>=1.10.1; extra == 'dishka'
16
+ Provides-Extra: docs
17
+ Requires-Dist: mkdocs-llmstxt>=0.3; extra == 'docs'
18
+ Requires-Dist: mkdocs-material>=9.5; extra == 'docs'
19
+ Requires-Dist: mkdocs-static-i18n>=1.2; extra == 'docs'
20
+ Requires-Dist: mkdocstrings[python]>=0.26; extra == 'docs'
21
+ Requires-Dist: pymdown-extensions>=10.11; extra == 'docs'
22
+ Provides-Extra: fast
23
+ Requires-Dist: orjson; extra == 'fast'
24
+ Provides-Extra: metrics
25
+ Requires-Dist: prometheus-client; extra == 'metrics'
26
+ Provides-Extra: reload
27
+ Requires-Dist: watchfiles>=0.21; extra == 'reload'
28
+ Provides-Extra: test
29
+ Requires-Dist: dishka>=1.10.1; extra == 'test'
30
+ Requires-Dist: freezegun; extra == 'test'
31
+ Requires-Dist: httpx; extra == 'test'
32
+ Requires-Dist: hypothesis; extra == 'test'
33
+ Requires-Dist: mutmut; extra == 'test'
34
+ Requires-Dist: mypy; extra == 'test'
35
+ Requires-Dist: prometheus-client; extra == 'test'
36
+ Requires-Dist: psutil; extra == 'test'
37
+ Requires-Dist: pytest; extra == 'test'
38
+ Requires-Dist: pytest-asyncio; extra == 'test'
39
+ Requires-Dist: pytest-cov; extra == 'test'
40
+ Requires-Dist: pytest-rerunfailures; extra == 'test'
41
+ Requires-Dist: pytest-timeout; extra == 'test'
42
+ Requires-Dist: ruff; extra == 'test'
43
+ Description-Content-Type: text/markdown
44
+
45
+ # SwarmQ
46
+
47
+ Async-first Python task queue on RabbitMQ + Valkey.
48
+
49
+ > **Status:** Phase 3 + E2E hardening complete; Phase 5 in progress
50
+ > (metrics, health endpoints, hot reload + agent spec landed; bulk-throughput optimisation to come).
51
+ > 1280+ tests across unit / integration / e2e / chaos / property / fuzz
52
+ > / stress suites. Not yet production-tested by anyone but the author.
53
+
54
+ ## Quick Start
55
+
56
+ ```bash
57
+ # 1. Start RabbitMQ + Valkey (compose file lives in this repo)
58
+ docker compose up -d
59
+
60
+ # 2. Define a task
61
+ cat > myapp/tasks.py <<'EOF'
62
+ from swarmq import Task, TaskInfo
63
+
64
+ class SendEmail(Task):
65
+ task_info = TaskInfo(name="send_email", queues=["default"])
66
+
67
+ async def run(self, to: str, subject: str) -> str:
68
+ # ... your code here ...
69
+ return f"sent to {to}"
70
+ EOF
71
+
72
+ # 3. Tell SwarmQ where the broker and backend live
73
+ export SWARMQ_BROKER_URL="amqp://guest:guest@localhost:5672/"
74
+ export SWARMQ_BACKEND_URL="valkey://localhost:6379/0"
75
+
76
+ # 4. Run a worker
77
+ swarmq worker --module myapp.tasks --queues default --concurrency 10
78
+ ```
79
+
80
+ The worker handles SIGTERM / SIGINT for graceful shutdown. Multiple
81
+ queues: `--queues default,emails,priority`. Structured logs:
82
+ `--log-format json`.
83
+
84
+ ## Hello World — Client side
85
+
86
+ ```python
87
+ import asyncio
88
+ from swarmq import SwarmQ, RabbitMQBroker, ValkeyBackend
89
+ import myapp.tasks # noqa — auto-registers SendEmail
90
+
91
+ async def main():
92
+ app = SwarmQ(
93
+ RabbitMQBroker("amqp://guest:guest@localhost:5672/"),
94
+ ValkeyBackend("valkey://localhost:6379/0"),
95
+ queues=["default"],
96
+ )
97
+ async with app:
98
+ task_id = await app.schedule(
99
+ "send_email", to="op@example.com", subject="hi",
100
+ )
101
+ result = await app.get_result(task_id, timeout=10)
102
+ print(result) # sent to op@example.com
103
+
104
+ asyncio.run(main())
105
+ ```
106
+
107
+ ## Reusing an existing connection pool (producer)
108
+
109
+ If your app already holds its own RabbitMQ connection and/or Valkey
110
+ client, hand them to SwarmQ instead of a URL — SwarmQ reuses them rather
111
+ than opening a second, parallel connection to the same server. You can
112
+ pass either a high-level client/connection **or** a low-level pool, and
113
+ mix sources per backend:
114
+
115
+ ```python
116
+ import aio_pika
117
+ import valkey.asyncio
118
+ from swarmq import SwarmQ, RabbitMQBroker, ValkeyBackend
119
+
120
+ # objects your app already owns
121
+ connection = await aio_pika.connect_robust("amqp://guest:guest@localhost:5672/")
122
+ valkey_client = valkey.asyncio.Valkey.from_url("valkey://localhost:6379/0")
123
+
124
+ app = SwarmQ(
125
+ RabbitMQBroker(connection=connection), # or connection_pool=<aio_pika.pool.Pool>
126
+ ValkeyBackend(client=valkey_client), # or pool=<valkey ConnectionPool>
127
+ # no broker_url / backend_url needed when a connection object is injected
128
+ )
129
+ async with app:
130
+ await app.schedule("send_email", to="op@example.com", subject="hi")
131
+ # closing `app` does NOT close `connection` or `valkey_client` — your app keeps them
132
+ ```
133
+
134
+ Things to know:
135
+
136
+ - **Ownership:** SwarmQ never closes a connection/client/pool you inject —
137
+ it belongs to your app. SwarmQ does close the publish channel it opens
138
+ on a borrowed RabbitMQ connection (that channel is SwarmQ's).
139
+ - **Producer-only:** injection is for the task-*sending* path. Calling
140
+ `start_worker()` on an injected broker/backend raises
141
+ `ConfigurationError` — workers are spawned as subprocesses and
142
+ reconstruct their broker/backend from `SWARMQ_BROKER_URL` /
143
+ `SWARMQ_BACKEND_URL`, which a borrowed connection can't cross.
144
+ - **Robustness is yours:** an injected Valkey client/pool does **not**
145
+ inherit SwarmQ's retry / health-check policy, and an injected aio-pika
146
+ connection should be a robust one (`connect_robust`) for SwarmQ's
147
+ channel-reconnect handling to behave as designed.
148
+ - **Same namespace assumed:** the injected object must point at the same
149
+ server and logical namespace (RabbitMQ vhost, Valkey DB) the rest of
150
+ your config expects — SwarmQ does not validate this, and a mismatch
151
+ silently sends/reads tasks in the wrong namespace.
152
+
153
+ ## What's in the box
154
+
155
+ ### Core (Phase 1+2+3)
156
+
157
+ - **At-least-once delivery** with Quorum-Queue durability and
158
+ `x-delivery-limit=20` against pathological redelivery loops.
159
+ - **Retries** with exponential / linear / fixed backoff +
160
+ configurable `max_retries`, `NoRetry`, `Retry(delay=...)`.
161
+ - **DLQ** routing for retry-exhaustion +
162
+ `swarmq dlq list/inspect/retry/purge` CLI for operator recovery.
163
+ - **Middleware** stack with 6 hooks (pre/post enqueue, pre/post
164
+ execute, on_error, on_retry).
165
+ - **Signals** — 14 lifecycle events with `@app.on_signal(...)`
166
+ decorator.
167
+ - **Rate limiting** — Cloudflare 2-period sliding window via Lua
168
+ (`TaskInfo(rate_limit="100/m")`).
169
+ - **Locking** — mutex + semaphore via Valkey atomic primitives, with
170
+ format-string keys (`lock="user:{user_id}"`).
171
+ - **Unique tasks** — dedup by canonical-encoded args, with
172
+ `unique_until="start"` or `"completion"`.
173
+ - **Cancellation** — pre-pickup + during-execution via
174
+ `Client.cancel(task_id)`, dedicated `TASK_CANCELLED` signal.
175
+ - **Scheduler** — cron, delayed (`eta=...`), recurring with
176
+ leader-election failover.
177
+ - **Progress tracking** — `Task.update_progress(current, total, msg)`
178
+ + `Client.get_progress` / `Client.watch_progress` AsyncIterator.
179
+ - **CLI** — `swarmq worker`, `swarmq schedule`, `swarmq dlq *`,
180
+ `swarmq inspect`.
181
+
182
+ ### Workflow primitives
183
+
184
+ - **`chain` / `group` / `chord`** with `Signature` building blocks
185
+ (`Task.s()`, `sig("name")`, immutable `Task.si()`) and operator
186
+ syntax (`A.s() | B.s()`, `group(...) | merge.s()`). Flat-DAG
187
+ composition with construction-time limits (`max_workflow_depth=10`,
188
+ `max_group_size=1000`).
189
+ - **Fire-and-forget `apply()`** returns a reattachable handle:
190
+ `await app.apply(chain(...))` → `handle.get(timeout=...)`, `cancel()`,
191
+ `children`, stable `workflow_id` reattach via `app.workflow(id)`.
192
+ - **Worker-driven orchestration** — workers advance the workflow on each
193
+ task completion (chain result-injection, atomic+idempotent chord
194
+ fan-in via Lua). Fail-fast with liveness: a failed step marks the
195
+ workflow failed and wakes `get()` instead of hanging.
196
+
197
+ ```python
198
+ result = await (await app.apply(
199
+ chord(group(Download.s(u) for u in urls), Merge.s())
200
+ )).get(timeout=30)
201
+ ```
202
+
203
+ ### Operations (Phase 4 + Phase 5)
204
+
205
+ - **Multi-process workers** (`--processes N`) — supervisor starts N
206
+ subprocesses, restarts crashed workers (5/hr rolling-window limit),
207
+ reloads on SIGHUP via `os.execvp`. SIGHUP replays the `sys.argv`
208
+ snapshot captured at supervisor start (by design — in-process
209
+ argv-mutation cannot influence the re-exec vector). Secure
210
+ launch entrypoints (container CMD, systemd ExecStart) remain the
211
+ trust boundary for the initial argv.
212
+ - **Autoscaling** (`--autoscale=MIN,MAX`) — queue-depth-driven
213
+ scale up/down with 30s cooldown, 60s idle window, and a 30s
214
+ minimum worker age that suppresses spawn→immediate-kill churn
215
+ when a crashed worker is restarted during an idle period.
216
+ - **Prometheus metrics** (`pip install swarmq[metrics]`) — 7 metrics
217
+ exposed on `/metrics`, drop-in no-op when the dep is absent.
218
+ - **Health endpoints** — `/health` (liveness) + `/ready` (broker +
219
+ backend reachable, structured 503 body).
220
+ - **Hot reload** (`--reload`, `pip install swarmq[reload]`) —
221
+ file-watch trigger that fires the existing SIGHUP drain-and-execvpe
222
+ pipeline. Developer-only (WARNING log on every start); see
223
+ `website/guides/hot-reload.md`.
224
+
225
+ ### Reliability hardening (E2E + Chaos suites)
226
+
227
+ - **Channel-recreate** survives RabbitMQ broker restarts mid-publish.
228
+ - **Pub/Sub resubscribe** survives Valkey restarts mid-stream.
229
+ - **Toxiproxy chaos suite** verifies behavior under network latency,
230
+ bandwidth limits, slicer (packet loss), full disconnect, and
231
+ partial partitions.
232
+ - **Property-based tests** on dedup-hash, serialization, backoff,
233
+ rate-limiter; **fuzz-tests** on message parsers and external
234
+ publishers.
235
+
236
+ ## Configuration
237
+
238
+ Worker behaviour is configured via env vars and CLI flags (CLI flag
239
+ wins where both are set).
240
+
241
+ | Variable / flag | Default | Purpose |
242
+ |---|---|---|
243
+ | `SWARMQ_BROKER_URL` | required | AMQP URL for RabbitMQ |
244
+ | `SWARMQ_BACKEND_URL` | required | Valkey/Redis URL for results |
245
+ | `--queues a,b,c` | `default` | Queues to consume |
246
+ | `--concurrency N` | `10` | Max parallel tasks per worker |
247
+ | `--log-level` | `INFO` | DEBUG / INFO / WARNING / ERROR |
248
+ | `--log-format` | `human` | `human` or `json` |
249
+
250
+ Other env vars: `SWARMQ_RESULT_TTL`, `SWARMQ_QUEUES`,
251
+ `SWARMQ_CONCURRENCY`, `SWARMQ_LOG_LEVEL`, `SWARMQ_LOG_JSON`. Full
252
+ list in `src/swarmq/config.py`.
253
+
254
+ ## What's planned next
255
+
256
+ Phase 4 operator features complete (Priorities, Expiry, Burst,
257
+ ProcessManager, Autoscaling). Phase 5 Wave 1 complete (Metrics,
258
+ Health, Hot reload). Workflow primitives (`chain` / `group` / `chord`
259
+ + Signatures) implemented — see "Workflow primitives" above.
260
+ Machine-readable agent spec (R5.5) implemented — generated JSON for
261
+ errors, config, CLI, and message schemas plus an `llms.txt` index
262
+ under `reference/agent/spec/`, kept in sync with the code by tests
263
+ (`python -m swarmq.agent_spec`).
264
+ Remaining — bulk-throughput optimisation (`schedule_many` fast path)
265
+ — see `docs/brainstorms/implementation-order-requirements.md`.
266
+
267
+ ## Project layout
268
+
269
+ ```
270
+ src/swarmq/
271
+ app.py SwarmQ application
272
+ worker.py Consumer loop
273
+ client.py Schedule API + result retrieval
274
+ task.py Task base + TaskInfo
275
+ middleware.py Middleware base + chain
276
+ signals.py Signal enum + dispatcher
277
+ limits.py RateLimiter + parser
278
+ locking.py Mutex / semaphore
279
+ cancellation.py Cancel-flag protocol
280
+ progress.py ProgressInfo (R3.7)
281
+ scheduler.py Embedded cron + delayed scheduler
282
+ metrics.py Prometheus exporter (R5.1)
283
+ health.py /health + /ready (R5.2)
284
+ cli/ argparse entrypoint
285
+ lua/ Lua scripts loaded into Valkey
286
+ broker/rabbitmq.py
287
+ backend/valkey.py
288
+ tests/
289
+ unit/ ~895 fast, no infra
290
+ integration/ broker + backend, real Docker
291
+ e2e/ full client + worker round-trip
292
+ chaos/ toxiproxy + container restarts (opt-in, nightly)
293
+ property/ hypothesis property-based (opt-in)
294
+ fuzz/ byte + JSON fuzz (opt-in)
295
+ stress/ high-parallelism (opt-in)
296
+ soak/ 1h leak detection (opt-in, weekly)
297
+ docs/
298
+ brainstorms/ requirements (per feature)
299
+ plans/ implementation plans (per feature)
300
+ solutions/ postmortem-style learnings
301
+ testing/ test strategy + feature-parity matrix
302
+ ```
303
+
304
+ ## Development
305
+
306
+ ```bash
307
+ # Fast dev loop — unit only
308
+ uv run --extra test pytest tests/unit -q
309
+
310
+ # E2E suite (needs docker compose up -d)
311
+ SWARMQ_TEST_RABBITMQ_PORT=5673 SWARMQ_TEST_VALKEY_PORT=6380 \
312
+ uv run --extra test pytest tests/e2e -q
313
+
314
+ # Chaos suite (starts its own toxiproxy stack per test class)
315
+ uv run --extra test pytest tests/chaos -q -m chaos
316
+
317
+ # E2E feature-combinations suite (31 combos × 3 failure tiers)
318
+ # Tier-1 only — no Toxiproxy needed, runs as part of the regular e2e
319
+ # suite above. Tier-2/3 (chaos-marked) needs the Toxiproxy stack:
320
+ docker compose -f docker-compose.yml -f docker-compose.chaos.yml up -d
321
+ uv run --extra test pytest tests/e2e/combinations -v -m chaos
322
+
323
+ # Property + fuzz
324
+ uv run --extra test pytest tests/property tests/fuzz -q -m "property or fuzz"
325
+ ```
326
+
327
+ Three-tier CI in `.github/workflows/`: pre-merge (≤5 min, every PR),
328
+ nightly (chaos + stress + fuzz), weekly (mutation + 1h soak).
329
+
330
+ For coding conventions and TDD discipline see `AGENTS.md`.
331
+
332
+ ## License
333
+
334
+ [MIT](LICENSE).
swarmq-0.1.0/README.md ADDED
@@ -0,0 +1,290 @@
1
+ # SwarmQ
2
+
3
+ Async-first Python task queue on RabbitMQ + Valkey.
4
+
5
+ > **Status:** Phase 3 + E2E hardening complete; Phase 5 in progress
6
+ > (metrics, health endpoints, hot reload + agent spec landed; bulk-throughput optimisation to come).
7
+ > 1280+ tests across unit / integration / e2e / chaos / property / fuzz
8
+ > / stress suites. Not yet production-tested by anyone but the author.
9
+
10
+ ## Quick Start
11
+
12
+ ```bash
13
+ # 1. Start RabbitMQ + Valkey (compose file lives in this repo)
14
+ docker compose up -d
15
+
16
+ # 2. Define a task
17
+ cat > myapp/tasks.py <<'EOF'
18
+ from swarmq import Task, TaskInfo
19
+
20
+ class SendEmail(Task):
21
+ task_info = TaskInfo(name="send_email", queues=["default"])
22
+
23
+ async def run(self, to: str, subject: str) -> str:
24
+ # ... your code here ...
25
+ return f"sent to {to}"
26
+ EOF
27
+
28
+ # 3. Tell SwarmQ where the broker and backend live
29
+ export SWARMQ_BROKER_URL="amqp://guest:guest@localhost:5672/"
30
+ export SWARMQ_BACKEND_URL="valkey://localhost:6379/0"
31
+
32
+ # 4. Run a worker
33
+ swarmq worker --module myapp.tasks --queues default --concurrency 10
34
+ ```
35
+
36
+ The worker handles SIGTERM / SIGINT for graceful shutdown. Multiple
37
+ queues: `--queues default,emails,priority`. Structured logs:
38
+ `--log-format json`.
39
+
40
+ ## Hello World — Client side
41
+
42
+ ```python
43
+ import asyncio
44
+ from swarmq import SwarmQ, RabbitMQBroker, ValkeyBackend
45
+ import myapp.tasks # noqa — auto-registers SendEmail
46
+
47
+ async def main():
48
+ app = SwarmQ(
49
+ RabbitMQBroker("amqp://guest:guest@localhost:5672/"),
50
+ ValkeyBackend("valkey://localhost:6379/0"),
51
+ queues=["default"],
52
+ )
53
+ async with app:
54
+ task_id = await app.schedule(
55
+ "send_email", to="op@example.com", subject="hi",
56
+ )
57
+ result = await app.get_result(task_id, timeout=10)
58
+ print(result) # sent to op@example.com
59
+
60
+ asyncio.run(main())
61
+ ```
62
+
63
+ ## Reusing an existing connection pool (producer)
64
+
65
+ If your app already holds its own RabbitMQ connection and/or Valkey
66
+ client, hand them to SwarmQ instead of a URL — SwarmQ reuses them rather
67
+ than opening a second, parallel connection to the same server. You can
68
+ pass either a high-level client/connection **or** a low-level pool, and
69
+ mix sources per backend:
70
+
71
+ ```python
72
+ import aio_pika
73
+ import valkey.asyncio
74
+ from swarmq import SwarmQ, RabbitMQBroker, ValkeyBackend
75
+
76
+ # objects your app already owns
77
+ connection = await aio_pika.connect_robust("amqp://guest:guest@localhost:5672/")
78
+ valkey_client = valkey.asyncio.Valkey.from_url("valkey://localhost:6379/0")
79
+
80
+ app = SwarmQ(
81
+ RabbitMQBroker(connection=connection), # or connection_pool=<aio_pika.pool.Pool>
82
+ ValkeyBackend(client=valkey_client), # or pool=<valkey ConnectionPool>
83
+ # no broker_url / backend_url needed when a connection object is injected
84
+ )
85
+ async with app:
86
+ await app.schedule("send_email", to="op@example.com", subject="hi")
87
+ # closing `app` does NOT close `connection` or `valkey_client` — your app keeps them
88
+ ```
89
+
90
+ Things to know:
91
+
92
+ - **Ownership:** SwarmQ never closes a connection/client/pool you inject —
93
+ it belongs to your app. SwarmQ does close the publish channel it opens
94
+ on a borrowed RabbitMQ connection (that channel is SwarmQ's).
95
+ - **Producer-only:** injection is for the task-*sending* path. Calling
96
+ `start_worker()` on an injected broker/backend raises
97
+ `ConfigurationError` — workers are spawned as subprocesses and
98
+ reconstruct their broker/backend from `SWARMQ_BROKER_URL` /
99
+ `SWARMQ_BACKEND_URL`, which a borrowed connection can't cross.
100
+ - **Robustness is yours:** an injected Valkey client/pool does **not**
101
+ inherit SwarmQ's retry / health-check policy, and an injected aio-pika
102
+ connection should be a robust one (`connect_robust`) for SwarmQ's
103
+ channel-reconnect handling to behave as designed.
104
+ - **Same namespace assumed:** the injected object must point at the same
105
+ server and logical namespace (RabbitMQ vhost, Valkey DB) the rest of
106
+ your config expects — SwarmQ does not validate this, and a mismatch
107
+ silently sends/reads tasks in the wrong namespace.
108
+
109
+ ## What's in the box
110
+
111
+ ### Core (Phase 1+2+3)
112
+
113
+ - **At-least-once delivery** with Quorum-Queue durability and
114
+ `x-delivery-limit=20` against pathological redelivery loops.
115
+ - **Retries** with exponential / linear / fixed backoff +
116
+ configurable `max_retries`, `NoRetry`, `Retry(delay=...)`.
117
+ - **DLQ** routing for retry-exhaustion +
118
+ `swarmq dlq list/inspect/retry/purge` CLI for operator recovery.
119
+ - **Middleware** stack with 6 hooks (pre/post enqueue, pre/post
120
+ execute, on_error, on_retry).
121
+ - **Signals** — 14 lifecycle events with `@app.on_signal(...)`
122
+ decorator.
123
+ - **Rate limiting** — Cloudflare 2-period sliding window via Lua
124
+ (`TaskInfo(rate_limit="100/m")`).
125
+ - **Locking** — mutex + semaphore via Valkey atomic primitives, with
126
+ format-string keys (`lock="user:{user_id}"`).
127
+ - **Unique tasks** — dedup by canonical-encoded args, with
128
+ `unique_until="start"` or `"completion"`.
129
+ - **Cancellation** — pre-pickup + during-execution via
130
+ `Client.cancel(task_id)`, dedicated `TASK_CANCELLED` signal.
131
+ - **Scheduler** — cron, delayed (`eta=...`), recurring with
132
+ leader-election failover.
133
+ - **Progress tracking** — `Task.update_progress(current, total, msg)`
134
+ + `Client.get_progress` / `Client.watch_progress` AsyncIterator.
135
+ - **CLI** — `swarmq worker`, `swarmq schedule`, `swarmq dlq *`,
136
+ `swarmq inspect`.
137
+
138
+ ### Workflow primitives
139
+
140
+ - **`chain` / `group` / `chord`** with `Signature` building blocks
141
+ (`Task.s()`, `sig("name")`, immutable `Task.si()`) and operator
142
+ syntax (`A.s() | B.s()`, `group(...) | merge.s()`). Flat-DAG
143
+ composition with construction-time limits (`max_workflow_depth=10`,
144
+ `max_group_size=1000`).
145
+ - **Fire-and-forget `apply()`** returns a reattachable handle:
146
+ `await app.apply(chain(...))` → `handle.get(timeout=...)`, `cancel()`,
147
+ `children`, stable `workflow_id` reattach via `app.workflow(id)`.
148
+ - **Worker-driven orchestration** — workers advance the workflow on each
149
+ task completion (chain result-injection, atomic+idempotent chord
150
+ fan-in via Lua). Fail-fast with liveness: a failed step marks the
151
+ workflow failed and wakes `get()` instead of hanging.
152
+
153
+ ```python
154
+ result = await (await app.apply(
155
+ chord(group(Download.s(u) for u in urls), Merge.s())
156
+ )).get(timeout=30)
157
+ ```
158
+
159
+ ### Operations (Phase 4 + Phase 5)
160
+
161
+ - **Multi-process workers** (`--processes N`) — supervisor starts N
162
+ subprocesses, restarts crashed workers (5/hr rolling-window limit),
163
+ reloads on SIGHUP via `os.execvp`. SIGHUP replays the `sys.argv`
164
+ snapshot captured at supervisor start (by design — in-process
165
+ argv-mutation cannot influence the re-exec vector). Secure
166
+ launch entrypoints (container CMD, systemd ExecStart) remain the
167
+ trust boundary for the initial argv.
168
+ - **Autoscaling** (`--autoscale=MIN,MAX`) — queue-depth-driven
169
+ scale up/down with 30s cooldown, 60s idle window, and a 30s
170
+ minimum worker age that suppresses spawn→immediate-kill churn
171
+ when a crashed worker is restarted during an idle period.
172
+ - **Prometheus metrics** (`pip install swarmq[metrics]`) — 7 metrics
173
+ exposed on `/metrics`, drop-in no-op when the dep is absent.
174
+ - **Health endpoints** — `/health` (liveness) + `/ready` (broker +
175
+ backend reachable, structured 503 body).
176
+ - **Hot reload** (`--reload`, `pip install swarmq[reload]`) —
177
+ file-watch trigger that fires the existing SIGHUP drain-and-execvpe
178
+ pipeline. Developer-only (WARNING log on every start); see
179
+ `website/guides/hot-reload.md`.
180
+
181
+ ### Reliability hardening (E2E + Chaos suites)
182
+
183
+ - **Channel-recreate** survives RabbitMQ broker restarts mid-publish.
184
+ - **Pub/Sub resubscribe** survives Valkey restarts mid-stream.
185
+ - **Toxiproxy chaos suite** verifies behavior under network latency,
186
+ bandwidth limits, slicer (packet loss), full disconnect, and
187
+ partial partitions.
188
+ - **Property-based tests** on dedup-hash, serialization, backoff,
189
+ rate-limiter; **fuzz-tests** on message parsers and external
190
+ publishers.
191
+
192
+ ## Configuration
193
+
194
+ Worker behaviour is configured via env vars and CLI flags (CLI flag
195
+ wins where both are set).
196
+
197
+ | Variable / flag | Default | Purpose |
198
+ |---|---|---|
199
+ | `SWARMQ_BROKER_URL` | required | AMQP URL for RabbitMQ |
200
+ | `SWARMQ_BACKEND_URL` | required | Valkey/Redis URL for results |
201
+ | `--queues a,b,c` | `default` | Queues to consume |
202
+ | `--concurrency N` | `10` | Max parallel tasks per worker |
203
+ | `--log-level` | `INFO` | DEBUG / INFO / WARNING / ERROR |
204
+ | `--log-format` | `human` | `human` or `json` |
205
+
206
+ Other env vars: `SWARMQ_RESULT_TTL`, `SWARMQ_QUEUES`,
207
+ `SWARMQ_CONCURRENCY`, `SWARMQ_LOG_LEVEL`, `SWARMQ_LOG_JSON`. Full
208
+ list in `src/swarmq/config.py`.
209
+
210
+ ## What's planned next
211
+
212
+ Phase 4 operator features complete (Priorities, Expiry, Burst,
213
+ ProcessManager, Autoscaling). Phase 5 Wave 1 complete (Metrics,
214
+ Health, Hot reload). Workflow primitives (`chain` / `group` / `chord`
215
+ + Signatures) implemented — see "Workflow primitives" above.
216
+ Machine-readable agent spec (R5.5) implemented — generated JSON for
217
+ errors, config, CLI, and message schemas plus an `llms.txt` index
218
+ under `reference/agent/spec/`, kept in sync with the code by tests
219
+ (`python -m swarmq.agent_spec`).
220
+ Remaining — bulk-throughput optimisation (`schedule_many` fast path)
221
+ — see `docs/brainstorms/implementation-order-requirements.md`.
222
+
223
+ ## Project layout
224
+
225
+ ```
226
+ src/swarmq/
227
+ app.py SwarmQ application
228
+ worker.py Consumer loop
229
+ client.py Schedule API + result retrieval
230
+ task.py Task base + TaskInfo
231
+ middleware.py Middleware base + chain
232
+ signals.py Signal enum + dispatcher
233
+ limits.py RateLimiter + parser
234
+ locking.py Mutex / semaphore
235
+ cancellation.py Cancel-flag protocol
236
+ progress.py ProgressInfo (R3.7)
237
+ scheduler.py Embedded cron + delayed scheduler
238
+ metrics.py Prometheus exporter (R5.1)
239
+ health.py /health + /ready (R5.2)
240
+ cli/ argparse entrypoint
241
+ lua/ Lua scripts loaded into Valkey
242
+ broker/rabbitmq.py
243
+ backend/valkey.py
244
+ tests/
245
+ unit/ ~895 fast, no infra
246
+ integration/ broker + backend, real Docker
247
+ e2e/ full client + worker round-trip
248
+ chaos/ toxiproxy + container restarts (opt-in, nightly)
249
+ property/ hypothesis property-based (opt-in)
250
+ fuzz/ byte + JSON fuzz (opt-in)
251
+ stress/ high-parallelism (opt-in)
252
+ soak/ 1h leak detection (opt-in, weekly)
253
+ docs/
254
+ brainstorms/ requirements (per feature)
255
+ plans/ implementation plans (per feature)
256
+ solutions/ postmortem-style learnings
257
+ testing/ test strategy + feature-parity matrix
258
+ ```
259
+
260
+ ## Development
261
+
262
+ ```bash
263
+ # Fast dev loop — unit only
264
+ uv run --extra test pytest tests/unit -q
265
+
266
+ # E2E suite (needs docker compose up -d)
267
+ SWARMQ_TEST_RABBITMQ_PORT=5673 SWARMQ_TEST_VALKEY_PORT=6380 \
268
+ uv run --extra test pytest tests/e2e -q
269
+
270
+ # Chaos suite (starts its own toxiproxy stack per test class)
271
+ uv run --extra test pytest tests/chaos -q -m chaos
272
+
273
+ # E2E feature-combinations suite (31 combos × 3 failure tiers)
274
+ # Tier-1 only — no Toxiproxy needed, runs as part of the regular e2e
275
+ # suite above. Tier-2/3 (chaos-marked) needs the Toxiproxy stack:
276
+ docker compose -f docker-compose.yml -f docker-compose.chaos.yml up -d
277
+ uv run --extra test pytest tests/e2e/combinations -v -m chaos
278
+
279
+ # Property + fuzz
280
+ uv run --extra test pytest tests/property tests/fuzz -q -m "property or fuzz"
281
+ ```
282
+
283
+ Three-tier CI in `.github/workflows/`: pre-merge (≤5 min, every PR),
284
+ nightly (chaos + stress + fuzz), weekly (mutation + 1h soak).
285
+
286
+ For coding conventions and TDD discipline see `AGENTS.md`.
287
+
288
+ ## License
289
+
290
+ [MIT](LICENSE).