relier 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- relier/__init__.py +111 -0
- relier/chaos/__init__.py +19 -0
- relier/chaos/engine.py +49 -0
- relier/chaos/load_spike.py +92 -0
- relier/chaos/network.py +130 -0
- relier/chaos/slow_task.py +48 -0
- relier/chaos/task_corrupt.py +74 -0
- relier/chaos/tasks.py +98 -0
- relier/chaos/worker_kill.py +72 -0
- relier/cli/__init__.py +5 -0
- relier/cli/admin.py +80 -0
- relier/cli/admission.py +46 -0
- relier/cli/base.py +52 -0
- relier/cli/chaos.py +176 -0
- relier/cli/cluster.py +145 -0
- relier/cli/config.py +117 -0
- relier/cli/dlq.py +118 -0
- relier/cli/main.py +322 -0
- relier/cli/manual.py +255 -0
- relier/cli/slo.py +173 -0
- relier/cli/tasks.py +562 -0
- relier/cli/ui/__init__.py +5 -0
- relier/cli/ui/live.py +20 -0
- relier/cli/ui/panels.py +39 -0
- relier/cli/ui/tables.py +201 -0
- relier/cli/utils.py +55 -0
- relier/cli/workers.py +209 -0
- relier/config.py +490 -0
- relier/core/__init__.py +32 -0
- relier/core/admission.py +172 -0
- relier/core/checkpoint.py +333 -0
- relier/core/dlq.py +315 -0
- relier/core/exceptions.py +227 -0
- relier/core/idempotency.py +243 -0
- relier/core/keys.py +199 -0
- relier/core/phoenix.py +1052 -0
- relier/core/schema.py +205 -0
- relier/core/shutdown.py +193 -0
- relier/core/slo.py +144 -0
- relier/core/timeouts.py +197 -0
- relier/core/validation.py +209 -0
- relier/py.typed +0 -0
- relier/storage/__init__.py +13 -0
- relier/storage/lua/__init__.py +5 -0
- relier/storage/lua/scripts.py +140 -0
- relier/storage/redis.py +335 -0
- relier/tasks/__init__.py +16 -0
- relier/tasks/app.py +466 -0
- relier/tasks/context.py +180 -0
- relier/tasks/decorator.py +1121 -0
- relier/tasks/signals.py +101 -0
- relier/telemetry/__init__.py +16 -0
- relier/telemetry/logging.py +98 -0
- relier/telemetry/metrics.py +151 -0
- relier/telemetry/setup.py +82 -0
- relier/telemetry/spans.py +107 -0
- relier-0.1.0.dist-info/METADATA +366 -0
- relier-0.1.0.dist-info/RECORD +61 -0
- relier-0.1.0.dist-info/WHEEL +4 -0
- relier-0.1.0.dist-info/entry_points.txt +2 -0
- relier-0.1.0.dist-info/licenses/LICENSE +21 -0
relier/__init__.py
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Relier — Reliability layer for Celery. Zero job loss.
|
|
3
|
+
|
|
4
|
+
Core Guarantees
|
|
5
|
+
---------------
|
|
6
|
+
- **Zero job loss** — every dispatched task is tracked in Redis and
|
|
7
|
+
automatically resurrected if its worker crashes mid-execution.
|
|
8
|
+
- **Atomic execution** — SHA-256 idempotency keys ensure tasks run exactly
|
|
9
|
+
once, even under concurrent retries.
|
|
10
|
+
- **Schema safety** — versioned envelopes with checksums protect against
|
|
11
|
+
payload corruption during rolling deployments.
|
|
12
|
+
- **Observability** — full OpenTelemetry span hierarchy and Prometheus
|
|
13
|
+
metrics with no extra wiring required.
|
|
14
|
+
|
|
15
|
+
Quick Start
|
|
16
|
+
-----------
|
|
17
|
+
>>> from relier import rl_task
|
|
18
|
+
>>> from relier.tasks.context import TaskContext
|
|
19
|
+
>>>
|
|
20
|
+
>>> @rl_task(
|
|
21
|
+
... queue="high_priority",
|
|
22
|
+
... idempotent=True,
|
|
23
|
+
... soft_timeout=25,
|
|
24
|
+
... hard_timeout=30,
|
|
25
|
+
... )
|
|
26
|
+
... async def process_order(order_id: str, ctx: TaskContext) -> dict:
|
|
27
|
+
... if ctx.partial_result:
|
|
28
|
+
... resume_from = ctx.partial_result["step"]
|
|
29
|
+
... result = await charge_and_fulfil(order_id)
|
|
30
|
+
... return result
|
|
31
|
+
>>>
|
|
32
|
+
>>> # FastAPI / async Django — dispatch without blocking:
|
|
33
|
+
>>> receipt = await process_order.apush(order_id="ord-001")
|
|
34
|
+
>>>
|
|
35
|
+
>>> # Django views / Flask routes / scripts — sync dispatch:
|
|
36
|
+
>>> receipt = process_order.push(order_id="ord-001")
|
|
37
|
+
|
|
38
|
+
Configuration
|
|
39
|
+
-------------
|
|
40
|
+
All settings are read from environment variables prefixed ``RELIER_``.
|
|
41
|
+
See :class:`~relier.config.Settings` for the full reference, or call
|
|
42
|
+
:func:`~relier.config.get_settings` to inspect the active configuration.
|
|
43
|
+
|
|
44
|
+
Error Handling
|
|
45
|
+
--------------
|
|
46
|
+
All exceptions raised by Relier inherit from :class:`~relier.core.exceptions.RelierError`.
|
|
47
|
+
Catch that base class to handle any framework error uniformly:
|
|
48
|
+
|
|
49
|
+
>>> from relier import RelierError, AdmissionRejectedError
|
|
50
|
+
>>> try:
|
|
51
|
+
... receipt = await process_order.apush(order_id="ord-001")
|
|
52
|
+
... except AdmissionRejectedError as exc:
|
|
53
|
+
... # Back off and retry after exc.retry_after seconds.
|
|
54
|
+
... await asyncio.sleep(exc.retry_after)
|
|
55
|
+
... except RelierError:
|
|
56
|
+
... # Unexpected framework error — log and alert.
|
|
57
|
+
... raise
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
__version__ = "0.1.0"
|
|
61
|
+
|
|
62
|
+
# ---------------------------------------------------------------------------
|
|
63
|
+
# Public API re-exports
|
|
64
|
+
#
|
|
65
|
+
# Everything a user needs for day-to-day Relier usage is importable from
|
|
66
|
+
# the top-level ``relier`` package. Internal subsystem modules remain
|
|
67
|
+
# accessible via their full dotted paths for advanced use cases.
|
|
68
|
+
# ---------------------------------------------------------------------------
|
|
69
|
+
|
|
70
|
+
from relier.config import Settings, get_settings
|
|
71
|
+
from relier.core.exceptions import (
|
|
72
|
+
AdmissionRejectedError,
|
|
73
|
+
ConfigurationError,
|
|
74
|
+
IdempotencyInFlightError,
|
|
75
|
+
MaxResurrectionsExceededError,
|
|
76
|
+
PayloadIntegrityError,
|
|
77
|
+
RedisConnectionError,
|
|
78
|
+
RelierError,
|
|
79
|
+
SchemaMigrationError,
|
|
80
|
+
WorkerInitializationError,
|
|
81
|
+
)
|
|
82
|
+
from relier.tasks.context import TaskContext, task_context
|
|
83
|
+
from relier.tasks.decorator import PUBLIC_QUEUES, RelierTask, rl_task
|
|
84
|
+
|
|
85
|
+
__all__ = [
|
|
86
|
+
# Version
|
|
87
|
+
"__version__",
|
|
88
|
+
# Core decorator & typed handle
|
|
89
|
+
"rl_task",
|
|
90
|
+
"RelierTask",
|
|
91
|
+
# Task context
|
|
92
|
+
"TaskContext",
|
|
93
|
+
"task_context",
|
|
94
|
+
# Configuration
|
|
95
|
+
"Settings",
|
|
96
|
+
"get_settings",
|
|
97
|
+
# Queue topology
|
|
98
|
+
"PUBLIC_QUEUES",
|
|
99
|
+
# Exceptions — base
|
|
100
|
+
"RelierError",
|
|
101
|
+
# Exceptions — configuration & infrastructure
|
|
102
|
+
"ConfigurationError",
|
|
103
|
+
"RedisConnectionError",
|
|
104
|
+
"WorkerInitializationError",
|
|
105
|
+
# Exceptions — task lifecycle
|
|
106
|
+
"AdmissionRejectedError",
|
|
107
|
+
"IdempotencyInFlightError",
|
|
108
|
+
"MaxResurrectionsExceededError",
|
|
109
|
+
"PayloadIntegrityError",
|
|
110
|
+
"SchemaMigrationError",
|
|
111
|
+
]
|
relier/chaos/__init__.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Relier Chaos — Reliability validation suite.
|
|
3
|
+
|
|
4
|
+
Importing this package registers every chaos scenario with the
|
|
5
|
+
``ChaosEngine`` singleton via side effects. Without this, the CLI would
|
|
6
|
+
resolve ``chaos_engine.run("worker-kill", ...)`` against an empty registry
|
|
7
|
+
and raise ``ValueError``.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from relier.chaos import ( # noqa: F401
|
|
11
|
+
load_spike,
|
|
12
|
+
network,
|
|
13
|
+
slow_task,
|
|
14
|
+
task_corrupt,
|
|
15
|
+
worker_kill,
|
|
16
|
+
)
|
|
17
|
+
from relier.chaos.engine import chaos_engine
|
|
18
|
+
|
|
19
|
+
__all__ = ["chaos_engine"]
|
relier/chaos/engine.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Relier Chaos — The Engine of Destruction.
|
|
3
|
+
|
|
4
|
+
Registry and orchestration for chaos experiments designed to validate
|
|
5
|
+
Relier's reliability mechanisms (Phoenix, timeouts, circuit breakers).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import logging
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ChaosEngine:
    """Name-indexed catalogue of chaos scenarios plus the coroutine that runs them.

    Scenario classes live in the class-level ``_scenarios`` mapping, so the
    catalogue is shared by the class and by every instance created from it.
    """

    # Scenario name -> scenario class. Populated through ``register``.
    _scenarios: dict[str, type] = {}

    @classmethod
    def register(cls, name: str) -> Any:
        """Build a class decorator that files a scenario under ``name``.

        Args:
            name: Key under which the decorated class is stored. An entry
                already stored under the same name is replaced.

        Returns:
            A decorator that records the class and hands it back unchanged.
        """

        def _remember(scenario_class: type) -> type:
            cls._scenarios[name] = scenario_class
            return scenario_class

        return _remember

    @classmethod
    async def run(cls, name: str, **kwargs: Any) -> Any:
        """Look up ``name``, construct the scenario with ``kwargs`` and await it.

        Args:
            name: Registered scenario name (e.g. "worker-kill").
            kwargs: Keyword arguments forwarded to the scenario constructor.

        Returns:
            Whatever the scenario's ``execute()`` coroutine returns.

        Raises:
            ValueError: If nothing is registered under ``name``.
        """
        catalogue = cls._scenarios
        if name not in catalogue:
            raise ValueError(f"Chaos scenario {name!r} not found.")

        # Order matters for the log trail: construct, announce, then run.
        scenario_cls = catalogue[name]
        instance = scenario_cls(**kwargs)
        logger.warning(
            "Unleashing chaos scenario.",
            extra={"scenario": name, "params": kwargs},
        )
        outcome = await instance.execute()
        return outcome
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Global Registry Singleton
|
|
49
|
+
chaos_engine = ChaosEngine()
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Relier Chaos — Load Spike Scenario.
|
|
3
|
+
|
|
4
|
+
Drives a sustained burst of task dispatches through the Relier admission
|
|
5
|
+
controller. Each dispatch goes through ``apush()``, which means the
|
|
6
|
+
fixed-window admission limiter (``admission_limit`` / ``admission_window``)
|
|
7
|
+
is the actual choke point, not a fake HTTP layer.
|
|
8
|
+
|
|
9
|
+
When the cluster is at capacity, ``apush`` raises ``AdmissionRejectedError``.
|
|
10
|
+
We tally accepts vs. rejects so the operator can confirm that admission
|
|
11
|
+
control engages and that workers are not flooded with unbounded backlog.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
import logging
|
|
16
|
+
|
|
17
|
+
from relier.chaos.engine import chaos_engine
|
|
18
|
+
from relier.core.exceptions import AdmissionRejectedError
|
|
19
|
+
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@chaos_engine.register("load-spike")
class LoadSpikeScenario:
    """Flood Relier with task dispatches to trigger admission rejections.

    Args:
        rps: Target number of dispatches per second. Values below 1 produce
            no dispatches at all.
        duration: How long, in seconds, to keep dispatching.
    """

    # Each second is cut into this many ticks so a burst is spread out
    # instead of being fired through one huge ``gather()``.
    _TICKS_PER_SECOND = 10

    def __init__(self, rps: int = 100, duration: int = 10) -> None:
        self.rps = rps
        self.duration = duration

    async def execute(self) -> dict[str, int]:
        """Burst ``rps`` dispatches per second for ``duration`` seconds.

        Every dispatch is a ``chaos_noop.apush(...)`` call. The per-tick batch
        size is derived with an integer carry, so the requested rate is
        honoured even when ``rps`` is not a multiple of the tick count
        (e.g. ``rps=15`` alternates batches of 1 and 2 rather than being
        truncated to 10/s, and ``rps=5`` sends a dispatch every other tick
        rather than being rounded up to 10/s).

        Returns:
            A tally with the keys ``accepted`` (dispatch returned normally),
            ``rejected`` (``AdmissionRejectedError`` was raised) and
            ``errored`` (any other exception).
        """
        # Imported lazily so the chaos package does not pull in the Celery
        # runtime at import time (matters for `rl chaos --help`).
        from relier.chaos.tasks import chaos_noop

        logger.critical(
            "Triggering load spike.",
            extra={"rps": self.rps, "duration_s": self.duration},
        )

        tally = {"ok": 0, "rejected": 0, "error": 0}

        async def _dispatch_one() -> str:
            """Dispatch a single no-op task and classify the outcome."""
            try:
                await chaos_noop.apush("chaos-load-spike")
                return "ok"
            except AdmissionRejectedError:
                return "rejected"
            except Exception as exc:
                # Deliberately broad: one failed dispatch must not abort the
                # whole spike, it is only counted.
                logger.debug(
                    "Dispatch failed unexpectedly.",
                    extra={"error_type": type(exc).__name__},
                )
                return "error"

        loop = asyncio.get_running_loop()
        ticks = self._TICKS_PER_SECOND
        tick_interval = 1.0 / ticks
        per_second = max(0, int(self.rps))
        # Dispatches still owed, in units of 1/ticks of a dispatch. Carrying
        # the remainder between ticks keeps the long-run rate exact.
        owed = 0
        start = loop.time()

        while (loop.time() - start) < self.duration:
            tick_start = loop.time()
            owed += per_second
            batch_size, owed = divmod(owed, ticks)

            if batch_size:
                results = await asyncio.gather(
                    *(_dispatch_one() for _ in range(batch_size))
                )
                for outcome in results:
                    tally[outcome] += 1

            # Sleep away whatever is left of this tick; never a negative delay.
            elapsed = loop.time() - tick_start
            await asyncio.sleep(max(0.0, tick_interval - elapsed))

        accepted = tally["ok"]
        rejected = tally["rejected"]
        errored = tally["error"]

        logger.info(
            "Load spike simulation complete.",
            extra={
                "accepted": accepted,
                "rejected": rejected,
                "errored": errored,
                "duration_s": self.duration,
                "target_rps": self.rps,
            },
        )
        return {"accepted": accepted, "rejected": rejected, "errored": errored}
|
relier/chaos/network.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Relier Chaos — Network Partition Scenario.
|
|
3
|
+
|
|
4
|
+
Simulates a network disconnect between the Relier cluster and its Redis
|
|
5
|
+
broker, exercising worker resilience, "fail-open" admission control, and
|
|
6
|
+
Phoenix's behaviour when heartbeat refreshes cannot land.
|
|
7
|
+
|
|
8
|
+
The Compose project does not declare a custom network, so the Redis
|
|
9
|
+
container is attached to ``<project>_default`` (typically
|
|
10
|
+
``relier-cluster_default``). Rather than hard-coding that name, we discover
|
|
11
|
+
every network the ``relier-redis`` container is attached to at runtime and
|
|
12
|
+
detach/reattach all of them.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import json
|
|
17
|
+
import logging
|
|
18
|
+
import subprocess
|
|
19
|
+
|
|
20
|
+
from relier.chaos.engine import chaos_engine
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
REDIS_CONTAINER = "relier-redis"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _discover_redis_networks(
    container: str = REDIS_CONTAINER,
) -> dict[str, list[str]]:
    """Return a mapping of network name -> list of DNS aliases for ``container``.

    Aliases must be captured *before* detaching so they can be restored on
    reconnect. ``docker network connect`` does not preserve the original
    compose service-name alias (e.g. ``redis``); without it, other containers
    can still reach Redis by container name (``relier-redis``) but DNS
    lookups for the service name fail.

    Args:
        container: Name of the container passed to ``docker inspect``.

    Returns:
        The network -> aliases mapping. An empty dict is returned when the
        container reports no networks, and also when inspection fails for
        any reason: the ``docker`` executable cannot be started, it exits
        non-zero, or its output cannot be decoded/parsed. Failures are logged.
    """
    inspect_cmd = [
        "docker",
        "inspect",
        container,
        "--format",
        "{{json .NetworkSettings.Networks}}",
    ]
    try:
        raw = subprocess.check_output(inspect_cmd).decode().strip()
        if not raw or raw == "null":
            return {}
        data = json.loads(raw)
    except (OSError, subprocess.CalledProcessError, ValueError) as exc:
        # OSError: docker is missing or not executable.
        # ValueError: covers json.JSONDecodeError and UnicodeDecodeError.
        logger.error(
            "Failed to inspect Redis container networks.",
            extra={"container": container, "error": str(exc)},
        )
        return {}

    if not isinstance(data, dict):
        # Anything but a JSON object cannot describe networks.
        return {}
    # A missing or null "Aliases" entry is normalised to an empty list.
    return {net: ((cfg or {}).get("Aliases") or []) for net, cfg in data.items()}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
@chaos_engine.register("network-partition")
class NetworkPartitionScenario:
    """Isolate the Redis broker from every network it is attached to.

    Args:
        duration: Seconds to keep Redis detached before reattaching it.
    """

    def __init__(self, duration: int = 15) -> None:
        self.duration = duration

    @staticmethod
    def _reconnect_command(net: str, aliases: list[str]) -> list[str]:
        """Build the ``docker network connect`` argv that restores ``aliases``."""
        cmd = ["docker", "network", "connect"]
        for alias in aliases:
            # Skip empty entries and the alias equal to the container name.
            # NOTE(review): the previous comment described this as skipping
            # the auto-assigned container-ID alias, but the comparison is
            # against the container name; confirm which was intended.
            if not alias or alias == REDIS_CONTAINER:
                continue
            cmd.extend(["--alias", alias])
        cmd.extend([net, REDIS_CONTAINER])
        return cmd

    async def execute(self) -> None:
        """Disconnect Redis from all its networks, then reconnect after `duration`.

        Networks and their aliases are discovered first; if none are found the
        scenario logs an error and returns without touching anything. Only
        networks that were actually detached are reattached, and reattachment
        happens in a ``finally`` block so it also runs when the wait is
        interrupted. "Network partition healed." is logged only when every
        detached network was reconnected.
        """
        net_aliases = _discover_redis_networks()
        if not net_aliases:
            logger.error(
                "No docker networks discovered for Redis; cannot partition.",
                extra={"container": REDIS_CONTAINER},
            )
            return

        logger.critical(
            "Executing network partition (isolating Redis).",
            extra={
                "duration_s": self.duration,
                "networks": list(net_aliases.keys()),
            },
        )

        # Networks we actually removed, with the aliases to restore later.
        detached: dict[str, list[str]] = {}
        try:
            for net, aliases in net_aliases.items():
                try:
                    subprocess.run(
                        ["docker", "network", "disconnect", net, REDIS_CONTAINER],
                        check=True,
                    )
                except subprocess.CalledProcessError as exc:
                    logger.error(
                        "Failed to disconnect Redis from network.",
                        extra={"network": net, "error": str(exc)},
                    )
                else:
                    detached[net] = aliases

            await asyncio.sleep(self.duration)

        finally:
            # Reattach every network we successfully removed, restoring the
            # original DNS aliases (in particular the compose service name;
            # without it, other containers cannot resolve ``redis``).
            healed: list[str] = []
            failed: list[str] = []
            for net, aliases in detached.items():
                try:
                    subprocess.run(self._reconnect_command(net, aliases), check=True)
                except Exception as exc:
                    # Deliberately broad: one failed reconnect must not stop
                    # the remaining networks from being reattached.
                    failed.append(net)
                    logger.critical(
                        "FAILED TO HEAL NETWORK PARTITION.",
                        extra={"network": net, "error": str(exc)},
                    )
                else:
                    healed.append(net)

            # Do not claim success while a network is still detached.
            if not failed:
                logger.info(
                    "Network partition healed.",
                    extra={"networks": healed},
                )
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Relier Chaos — Slow Task Scenario.
|
|
3
|
+
|
|
4
|
+
Dispatches a task that intentionally sleeps for longer than its configured
|
|
5
|
+
``hard_timeout`` so the two-tier (soft/hard) timeout machinery in
|
|
6
|
+
``relier.core.timeouts`` is forced to fire.
|
|
7
|
+
|
|
8
|
+
We use the internal ``relier.chaos.tasks.chaos_slow`` task, which is
|
|
9
|
+
decorated with ``hard_timeout=2``. Any ``duration`` >= 2s exercises:
|
|
10
|
+
|
|
11
|
+
1. Soft-timeout cleanup hook (if registered)
|
|
12
|
+
2. Hard-timeout cancellation
|
|
13
|
+
3. DLQ quarantine for ``TimeoutError`` in the decorator
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import logging
|
|
17
|
+
import uuid
|
|
18
|
+
|
|
19
|
+
from relier.chaos.engine import chaos_engine
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@chaos_engine.register("slow-task")
class SlowTaskScenario:
    """Scenario that enqueues a single ``chaos_slow`` task.

    Args:
        duration: Number of seconds the dispatched task is asked to sleep.
    """

    def __init__(self, duration: int = 60) -> None:
        self.duration = duration

    async def execute(self) -> str:
        """Push one ``chaos_slow`` task and return the marker generated for it.

        Returns:
            The marker string (``chaos-slow-`` plus eight hex characters)
            that is passed to the task as ``marker_key``.
        """
        # Lazy import: avoids pulling Celery into `rl chaos --help`.
        from relier.chaos.tasks import chaos_slow

        suffix = uuid.uuid4().hex[:8]
        marker = f"chaos-slow-{suffix}"
        sleep_seconds = self.duration

        logger.critical(
            "Dispatching slow task to provoke timeout enforcement.",
            extra={"duration_s": sleep_seconds, "marker": marker},
        )

        await chaos_slow.apush(duration=sleep_seconds, marker_key=marker)

        logger.info(
            "Slow task dispatched; expect a hard-timeout + DLQ quarantine.",
            extra={"marker": marker},
        )
        return marker
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Relier Chaos — Task Corruption Scenario.
|
|
3
|
+
|
|
4
|
+
Injects a 'poison pill' envelope whose payload checksum does not match
|
|
5
|
+
its body. The producer side (``apush``) normally guarantees envelope
|
|
6
|
+
integrity, so this scenario bypasses it by calling
|
|
7
|
+
``celery_app.send_task`` directly with a hand-crafted envelope.
|
|
8
|
+
|
|
9
|
+
The receiving worker will:
|
|
10
|
+
1. Unwrap the envelope through ``SchemaRegistry.unwrap_and_migrate``
|
|
11
|
+
2. Detect the checksum mismatch and raise ``PayloadIntegrityError``
|
|
12
|
+
3. Quarantine the task to the DLQ via ``DeadLetterQueue.quarantine``
|
|
13
|
+
|
|
14
|
+
This validates the full integrity-failure path end-to-end.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import logging
|
|
19
|
+
import uuid
|
|
20
|
+
from datetime import UTC, datetime
|
|
21
|
+
|
|
22
|
+
from relier.chaos.engine import chaos_engine
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger(__name__)
|
|
25
|
+
|
|
26
|
+
# Any registered task name will do: the worker rejects the envelope before
|
|
27
|
+
# the user function ever runs. We target an internal chaos task that the
|
|
28
|
+
# Celery app loads in every environment (see ``relier.chaos.tasks``).
|
|
29
|
+
POISON_TARGET_TASK = "relier.chaos.tasks.chaos_noop"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@chaos_engine.register("task-corrupt")
class TaskCorruptScenario:
    """Scenario that publishes an envelope carrying a checksum that cannot match."""

    async def execute(self) -> None:
        """Publish a hand-built envelope to ``POISON_TARGET_TASK`` via ``send_task``.

        The envelope has every field filled in, but its ``checksum`` value is
        a fixed bogus string. The message is sent to the ``default`` queue
        under a freshly generated ``chaos-poison-…`` task id.
        """
        from relier.tasks.app import celery_app

        task_id = f"chaos-poison-{uuid.uuid4().hex[:8]}"

        # All fields are present so the envelope is well-formed; only the
        # checksum is deliberately wrong, which is what the consumer side is
        # expected to reject.
        bad_envelope = dict(
            schema_version=1,
            task_id=task_id,
            payload={"args": ["poison"], "kwargs": {}},
            checksum="sha256:TAMPERED_INVALID_CHECKSUM_VALUE",
            enqueued_at=datetime.now(UTC).isoformat(),
        )

        logger.critical(
            "Injecting corrupted poison pill task.",
            extra={"task_id": task_id, "target": POISON_TARGET_TASK},
        )

        def _publish() -> object:
            """Blocking publish, run on the default executor below."""
            return celery_app.send_task(
                POISON_TARGET_TASK,
                args=(bad_envelope,),
                queue="default",
                task_id=task_id,
            )

        # Celery publishes synchronously, so hand the call to a worker thread
        # rather than stalling the event loop on the broker round-trip.
        await asyncio.get_running_loop().run_in_executor(None, _publish)

        logger.info(
            "Poison pill injected; expect a DLQ quarantine for PayloadIntegrityError.",
            extra={"task_id": task_id},
        )
|
relier/chaos/tasks.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Relier Chaos — Internal target tasks.
|
|
3
|
+
|
|
4
|
+
The chaos suite needs tasks it can dispatch against in every environment
|
|
5
|
+
(including production), so it cannot depend on ``relier.tasks.debug``
|
|
6
|
+
which is only included by the Celery app outside production.
|
|
7
|
+
|
|
8
|
+
These tasks are deliberately minimal: they exist solely to exercise
|
|
9
|
+
Relier's reliability machinery (admission control, hard timeouts,
|
|
10
|
+
Phoenix resurrection, payload integrity) from the chaos CLI. They do no
|
|
11
|
+
useful application work and write only to a chaos-scoped Redis namespace.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
|
|
16
|
+
from relier.storage.redis import get_relier_redis
|
|
17
|
+
from relier.tasks.context import task_context
|
|
18
|
+
from relier.tasks.decorator import rl_task
|
|
19
|
+
|
|
20
|
+
# Namespaced so chaos markers never collide with application state.
|
|
21
|
+
_MARKER_PREFIX = "rl:chaos:marker"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@rl_task(idempotent=True)
async def chaos_counter(key: str) -> int:
    """Bump the chaos-namespaced counter for ``key`` and return its new value.

    The task is registered with ``idempotent=True``.
    NOTE(review): the intent is that repeated dispatches with the same
    arguments increment only once; that is enforced by the decorator, not by
    this body. Confirm against ``rl_task``.
    """
    client = await get_relier_redis()
    counter_key = f"{_MARKER_PREFIX}:counter:{key}"
    return int(await client.incr(counter_key))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@rl_task()
async def chaos_noop(tag: str = "chaos") -> str:
    """Do-nothing target used by ``load-spike`` and ``task-corrupt``.

    The only effect is incrementing a per-``tag`` counter in the chaos Redis
    namespace, which lets an operator verify that dispatches really reached
    a worker.

    Returns:
        The literal string ``"ok"``.
    """
    client = await get_relier_redis()
    noop_key = f"{_MARKER_PREFIX}:noop:{tag}"
    await client.incr(noop_key)
    return "ok"
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@rl_task(hard_timeout=2)
async def chaos_slow(duration: int, marker_key: str) -> str:
    """Sleep for ``duration`` seconds between two Redis markers (``slow-task``).

    A ``…:started`` key is written before the sleep and a ``…:finished`` key
    after it, both with a 300-second expiry. The task is registered with
    ``hard_timeout=2``, so a ``duration`` of two seconds or more is meant to
    be cut short by the timeout machinery.

    Returns:
        The literal string ``"done"`` if the sleep runs to completion.
    """
    client = await get_relier_redis()
    base = f"{_MARKER_PREFIX}:slow:{marker_key}"
    await client.set(f"{base}:started", "1", ex=300)
    await asyncio.sleep(duration)
    await client.set(f"{base}:finished", "1", ex=300)
    return "done"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@rl_task()
async def chaos_long_running(duration: int, marker_key: str) -> str:
    """Stay busy for ``duration`` seconds; the seed task for ``worker-kill``.

    No hard timeout is configured on this task. The wait is split into
    0.1-second sleeps so control returns to the event loop frequently, and a
    ``…:started`` / ``…:finished`` key pair (600-second expiry) brackets it.

    Returns:
        The literal string ``"done"`` once all sleeps have completed.
    """
    client = await get_relier_redis()
    base = f"{_MARKER_PREFIX}:longrun:{marker_key}"
    await client.set(f"{base}:started", "1", ex=600)

    # Ten short naps per requested second.
    for _tick in range(duration * 10):
        await asyncio.sleep(0.1)

    await client.set(f"{base}:finished", "1", ex=600)
    return "done"
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@rl_task()
async def chaos_checkpoint(steps: int, marker: str) -> str:
    """Walk through ``steps`` steps, persisting progress after each one.

    If ``task_context.partial_result`` is populated, work resumes from its
    ``last_step`` value (defaulting to 0); otherwise it starts at step 0.
    After every 0.1-second step the new ``last_step`` is stored through
    ``task_context.set_partial``, and a ``…:finished`` key (300-second
    expiry) is written once the loop ends.

    Returns:
        The literal string ``"done"``.
    """
    client = await get_relier_redis()

    saved = task_context.partial_result
    first_step = saved.get("last_step", 0) if saved else 0

    for step in range(first_step, steps):
        await asyncio.sleep(0.1)
        await task_context.set_partial({"last_step": step + 1})

    finished_key = f"{_MARKER_PREFIX}:checkpoint:{marker}:finished"
    await client.set(finished_key, "1", ex=300)
    return "done"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
@rl_task()
async def chaos_fail() -> None:
    """Fail unconditionally; exists to drive the failure and DLQ path.

    Raises:
        ValueError: Always, with the message ``Intentional failure``.
    """
    failure = ValueError("Intentional failure")
    raise failure
|