relier 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. relier/__init__.py +111 -0
  2. relier/chaos/__init__.py +19 -0
  3. relier/chaos/engine.py +49 -0
  4. relier/chaos/load_spike.py +92 -0
  5. relier/chaos/network.py +130 -0
  6. relier/chaos/slow_task.py +48 -0
  7. relier/chaos/task_corrupt.py +74 -0
  8. relier/chaos/tasks.py +98 -0
  9. relier/chaos/worker_kill.py +72 -0
  10. relier/cli/__init__.py +5 -0
  11. relier/cli/admin.py +80 -0
  12. relier/cli/admission.py +46 -0
  13. relier/cli/base.py +52 -0
  14. relier/cli/chaos.py +176 -0
  15. relier/cli/cluster.py +145 -0
  16. relier/cli/config.py +117 -0
  17. relier/cli/dlq.py +118 -0
  18. relier/cli/main.py +322 -0
  19. relier/cli/manual.py +255 -0
  20. relier/cli/slo.py +173 -0
  21. relier/cli/tasks.py +562 -0
  22. relier/cli/ui/__init__.py +5 -0
  23. relier/cli/ui/live.py +20 -0
  24. relier/cli/ui/panels.py +39 -0
  25. relier/cli/ui/tables.py +201 -0
  26. relier/cli/utils.py +55 -0
  27. relier/cli/workers.py +209 -0
  28. relier/config.py +490 -0
  29. relier/core/__init__.py +32 -0
  30. relier/core/admission.py +172 -0
  31. relier/core/checkpoint.py +333 -0
  32. relier/core/dlq.py +315 -0
  33. relier/core/exceptions.py +227 -0
  34. relier/core/idempotency.py +243 -0
  35. relier/core/keys.py +199 -0
  36. relier/core/phoenix.py +1052 -0
  37. relier/core/schema.py +205 -0
  38. relier/core/shutdown.py +193 -0
  39. relier/core/slo.py +144 -0
  40. relier/core/timeouts.py +197 -0
  41. relier/core/validation.py +209 -0
  42. relier/py.typed +0 -0
  43. relier/storage/__init__.py +13 -0
  44. relier/storage/lua/__init__.py +5 -0
  45. relier/storage/lua/scripts.py +140 -0
  46. relier/storage/redis.py +335 -0
  47. relier/tasks/__init__.py +16 -0
  48. relier/tasks/app.py +466 -0
  49. relier/tasks/context.py +180 -0
  50. relier/tasks/decorator.py +1121 -0
  51. relier/tasks/signals.py +101 -0
  52. relier/telemetry/__init__.py +16 -0
  53. relier/telemetry/logging.py +98 -0
  54. relier/telemetry/metrics.py +151 -0
  55. relier/telemetry/setup.py +82 -0
  56. relier/telemetry/spans.py +107 -0
  57. relier-0.1.0.dist-info/METADATA +366 -0
  58. relier-0.1.0.dist-info/RECORD +61 -0
  59. relier-0.1.0.dist-info/WHEEL +4 -0
  60. relier-0.1.0.dist-info/entry_points.txt +2 -0
  61. relier-0.1.0.dist-info/licenses/LICENSE +21 -0
relier/__init__.py ADDED
@@ -0,0 +1,111 @@
1
+ """
2
+ Relier — Reliability layer for Celery. Zero job loss.
3
+
4
+ Core Guarantees
5
+ ---------------
6
+ - **Zero job loss** — every dispatched task is tracked in Redis and
7
+ automatically resurrected if its worker crashes mid-execution.
8
+ - **Atomic execution** — SHA-256 idempotency keys ensure tasks run exactly
9
+ once, even under concurrent retries.
10
+ - **Schema safety** — versioned envelopes with checksums protect against
11
+ payload corruption during rolling deployments.
12
+ - **Observability** — full OpenTelemetry span hierarchy and Prometheus
13
+ metrics with no extra wiring required.
14
+
15
+ Quick Start
16
+ -----------
17
+ >>> from relier import rl_task
18
+ >>> from relier.tasks.context import TaskContext
19
+ >>>
20
+ >>> @rl_task(
21
+ ... queue="high_priority",
22
+ ... idempotent=True,
23
+ ... soft_timeout=25,
24
+ ... hard_timeout=30,
25
+ ... )
26
+ ... async def process_order(order_id: str, ctx: TaskContext) -> dict:
27
+ ... if ctx.partial_result:
28
+ ... resume_from = ctx.partial_result["step"]
29
+ ... result = await charge_and_fulfil(order_id)
30
+ ... return result
31
+ >>>
32
+ >>> # FastAPI / async Django — dispatch without blocking:
33
+ >>> receipt = await process_order.apush(order_id="ord-001")
34
+ >>>
35
+ >>> # Django views / Flask routes / scripts — sync dispatch:
36
+ >>> receipt = process_order.push(order_id="ord-001")
37
+
38
+ Configuration
39
+ -------------
40
+ All settings are read from environment variables prefixed ``RELIER_``.
41
+ See :class:`~relier.config.Settings` for the full reference, or call
42
+ :func:`~relier.config.get_settings` to inspect the active configuration.
43
+
44
+ Error Handling
45
+ --------------
46
+ All exceptions raised by Relier inherit from :class:`~relier.core.exceptions.RelierError`.
47
+ Catch that base class to handle any framework error uniformly:
48
+
49
+ >>> from relier import RelierError, AdmissionRejectedError
50
+ >>> try:
51
+ ... receipt = await process_order.apush(order_id="ord-001")
52
+ ... except AdmissionRejectedError as exc:
53
+ ... # Back off and retry after exc.retry_after seconds.
54
+ ... await asyncio.sleep(exc.retry_after)
55
+ ... except RelierError:
56
+ ... # Unexpected framework error — log and alert.
57
+ ... raise
58
+ """
59
+
60
+ __version__ = "0.1.0"
61
+
62
+ # ---------------------------------------------------------------------------
63
+ # Public API re-exports
64
+ #
65
+ # Everything a user needs for day-to-day Relier usage is importable from
66
+ # the top-level ``relier`` package. Internal subsystem modules remain
67
+ # accessible via their full dotted paths for advanced use cases.
68
+ # ---------------------------------------------------------------------------
69
+
70
+ from relier.config import Settings, get_settings
71
+ from relier.core.exceptions import (
72
+ AdmissionRejectedError,
73
+ ConfigurationError,
74
+ IdempotencyInFlightError,
75
+ MaxResurrectionsExceededError,
76
+ PayloadIntegrityError,
77
+ RedisConnectionError,
78
+ RelierError,
79
+ SchemaMigrationError,
80
+ WorkerInitializationError,
81
+ )
82
+ from relier.tasks.context import TaskContext, task_context
83
+ from relier.tasks.decorator import PUBLIC_QUEUES, RelierTask, rl_task
84
+
85
+ __all__ = [
86
+ # Version
87
+ "__version__",
88
+ # Core decorator & typed handle
89
+ "rl_task",
90
+ "RelierTask",
91
+ # Task context
92
+ "TaskContext",
93
+ "task_context",
94
+ # Configuration
95
+ "Settings",
96
+ "get_settings",
97
+ # Queue topology
98
+ "PUBLIC_QUEUES",
99
+ # Exceptions — base
100
+ "RelierError",
101
+ # Exceptions — configuration & infrastructure
102
+ "ConfigurationError",
103
+ "RedisConnectionError",
104
+ "WorkerInitializationError",
105
+ # Exceptions — task lifecycle
106
+ "AdmissionRejectedError",
107
+ "IdempotencyInFlightError",
108
+ "MaxResurrectionsExceededError",
109
+ "PayloadIntegrityError",
110
+ "SchemaMigrationError",
111
+ ]
@@ -0,0 +1,19 @@
1
+ """
2
+ Relier Chaos, Reliability validation suite.
3
+
4
+ Importing this package registers every chaos scenario with the
5
+ ``ChaosEngine`` singleton via side effects. Without this, the CLI would
6
+ resolve ``chaos_engine.run("worker-kill", ...)`` against an empty registry
7
+ and raise ``ValueError``.
8
+ """
9
+
10
+ from relier.chaos import ( # noqa: F401
11
+ load_spike,
12
+ network,
13
+ slow_task,
14
+ task_corrupt,
15
+ worker_kill,
16
+ )
17
+ from relier.chaos.engine import chaos_engine
18
+
19
+ __all__ = ["chaos_engine"]
relier/chaos/engine.py ADDED
@@ -0,0 +1,49 @@
1
+ """
2
+ Relier Chaos, The Engine of Destruction.
3
+
4
+ Registry and orchestration for chaos experiments designed to validate
5
+ Relier's reliability mechanisms (Phoenix, timeouts, circuit breakers).
6
+ """
7
+
8
+ import logging
9
+ from typing import Any
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
class ChaosEngine:
    """Registry and runner for all chaos scenarios.

    Scenario classes register themselves under a string name with
    :meth:`register`. :meth:`run` looks the class up, instantiates it with
    the supplied keyword arguments and awaits its ``execute()`` coroutine.
    """

    # Class-level registry shared by the class and every instance:
    # scenario name -> scenario class.
    _scenarios: dict[str, type] = {}

    @classmethod
    def register(cls, name: str) -> Any:
        """Return a class decorator that registers a scenario under ``name``.

        Args:
            name: Registry key for the scenario (e.g. "worker-kill").
                Registering a second class under the same name replaces
                the first.

        Returns:
            A decorator that stores the class in the registry and returns
            it unchanged.
        """

        def decorator(scenario_class: type) -> type:
            cls._scenarios[name] = scenario_class
            return scenario_class

        return decorator

    @classmethod
    async def run(cls, name: str, **kwargs: Any) -> Any:
        """Instantiate and execute a named chaos scenario.

        Args:
            name: Name of the scenario (e.g. "worker-kill").
            kwargs: Parameters passed to the scenario constructor.

        Returns:
            Whatever the scenario's ``execute()`` coroutine returns.

        Raises:
            ValueError: If no scenario is registered under ``name``. The
                message lists the registered names to aid diagnosis.
        """
        scenario_class = cls._scenarios.get(name)
        if scenario_class is None:
            # Keep the original message prefix and append the known names so
            # a typo on the CLI is immediately diagnosable.
            available = ", ".join(sorted(cls._scenarios)) or "<none registered>"
            raise ValueError(
                f"Chaos scenario {name!r} not found. Available: {available}."
            )

        scenario = scenario_class(**kwargs)
        logger.warning(
            "Unleashing chaos scenario.",
            extra={"scenario": name, "params": kwargs},
        )
        return await scenario.execute()


# Global Registry Singleton
chaos_engine = ChaosEngine()
@@ -0,0 +1,92 @@
1
+ """
2
+ Relier Chaos, Load Spike Scenario.
3
+
4
+ Drives a sustained burst of task dispatches through the Relier admission
5
+ controller. Each dispatch goes through ``apush()``, which means the
6
+ fixed-window admission limiter (``admission_limit`` / ``admission_window``)
7
+ is the actual choke point, not a fake HTTP layer.
8
+
9
+ When the cluster is at capacity, ``apush`` raises ``AdmissionRejectedError``.
10
+ We tally accepts vs. rejects so the operator can confirm that admission
11
+ control engages and that workers are not flooded with unbounded backlog.
12
+ """
13
+
14
+ import asyncio
15
+ import logging
16
+
17
+ from relier.chaos.engine import chaos_engine
18
+ from relier.core.exceptions import AdmissionRejectedError
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
@chaos_engine.register("load-spike")
class LoadSpikeScenario:
    """Flood Relier with task dispatches to trigger admission rejections.

    Args:
        rps: Target dispatches per second; must be at least 1.
        duration: How long to sustain the burst, in seconds; must not be
            negative.

    Raises:
        ValueError: If ``rps`` is below 1 or ``duration`` is negative.
    """

    def __init__(self, rps: int = 100, duration: int = 10) -> None:
        # Reject values the pacing loop cannot honour; previously rps <= 0
        # silently dispatched ten tasks per second.
        if rps < 1:
            raise ValueError(f"rps must be >= 1, got {rps!r}")
        if duration < 0:
            raise ValueError(f"duration must be >= 0, got {duration!r}")
        self.rps = rps
        self.duration = duration

    async def execute(self) -> dict[str, int]:
        """Burst ``rps`` dispatches per second for ``duration`` seconds.

        Returns:
            A dict with the number of ``accepted``, ``rejected`` and
            ``errored`` dispatches.
        """
        # Imported lazily so the chaos package does not pull in the Celery
        # runtime at import time (matters for `rl chaos --help`).
        from relier.chaos.tasks import chaos_noop

        logger.critical(
            "Triggering load spike.",
            extra={"rps": self.rps, "duration_s": self.duration},
        )

        tally = {"accepted": 0, "rejected": 0, "errored": 0}

        async def _dispatch_one() -> str:
            """Dispatch one no-op task and return the tally key it counts under."""
            try:
                await chaos_noop.apush("chaos-load-spike")
            except AdmissionRejectedError:
                return "rejected"
            except Exception as exc:
                logger.debug(
                    "Dispatch failed unexpectedly.",
                    extra={"error_type": type(exc).__name__},
                )
                return "errored"
            return "accepted"

        # Slice each second into sub-intervals instead of spawning all
        # dispatches in a single huge gather(). The tick interval is derived
        # from the batch size so that batch_size / tick_interval == rps for
        # every rate, including rates below 10 and non-multiples of 10.
        batch_size = max(1, self.rps // 10)
        tick_interval = batch_size / self.rps

        loop = asyncio.get_running_loop()
        start = loop.time()

        while (loop.time() - start) < self.duration:
            tick_start = loop.time()
            outcomes = await asyncio.gather(
                *(_dispatch_one() for _ in range(batch_size))
            )
            for outcome in outcomes:
                tally[outcome] += 1

            # Sleep only for what is left of this tick; if the dispatches
            # overran the tick, continue immediately.
            elapsed = loop.time() - tick_start
            await asyncio.sleep(max(0.0, tick_interval - elapsed))

        logger.info(
            "Load spike simulation complete.",
            extra={
                **tally,
                "duration_s": self.duration,
                "target_rps": self.rps,
            },
        )
        return dict(tally)
@@ -0,0 +1,130 @@
1
+ """
2
+ Relier Chaos — Network Partition Scenario.
3
+
4
+ Simulates a network disconnect between the Relier cluster and its Redis
5
+ broker, exercising worker resilience, "fail-open" admission control, and
6
+ Phoenix's behaviour when heartbeat refreshes cannot land.
7
+
8
+ The Compose project does not declare a custom network, so the Redis
9
+ container is attached to ``<project>_default`` (typically
10
+ ``relier-cluster_default``). Rather than hard-coding that name, we discover
11
+ every network the ``relier-redis`` container is attached to at runtime and
12
+ detach/reattach all of them.
13
+ """
14
+
15
+ import asyncio
16
+ import json
17
+ import logging
18
+ import subprocess
19
+
20
+ from relier.chaos.engine import chaos_engine
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ REDIS_CONTAINER = "relier-redis"
25
+
26
+
27
def _discover_redis_networks(
    container: str = REDIS_CONTAINER,
) -> dict[str, list[str]]:
    """Return a mapping of network-name -> list of DNS aliases for the container.

    Aliases must be captured *before* detaching so we can restore them on
    reconnect. ``docker network connect`` does not preserve the original
    compose service-name alias (e.g. ``redis``); without it, other
    containers can still reach Redis by container name (``relier-redis``)
    but DNS lookups for the service name fail.

    Args:
        container: Name of the container to inspect.

    Returns:
        ``{network_name: [alias, ...]}``. An empty dict is returned when the
        container has no networks, when ``docker inspect`` fails or cannot
        be launched, or when its output is not a JSON object.
    """
    try:
        raw = (
            subprocess.check_output(
                [
                    "docker",
                    "inspect",
                    container,
                    "--format",
                    "{{json .NetworkSettings.Networks}}",
                ]
            )
            .decode()
            .strip()
        )
        if not raw or raw == "null":
            return {}
        data = json.loads(raw)
    except (subprocess.CalledProcessError, OSError, json.JSONDecodeError) as exc:
        # OSError covers a missing or non-executable ``docker`` binary;
        # JSONDecodeError covers unexpected inspect output. Both are treated
        # like a failed inspect so the caller sees "no networks" instead of
        # an unhandled exception.
        logger.error(
            "Failed to inspect Redis container networks.",
            extra={"container": container, "error": str(exc)},
        )
        return {}

    if not isinstance(data, dict):
        return {}
    # "Aliases" may be absent or null; normalise both to an empty list.
    return {net: (cfg.get("Aliases") or []) for net, cfg in data.items()}
62
+
63
+
64
@chaos_engine.register("network-partition")
class NetworkPartitionScenario:
    """Isolate the Redis broker from every network it is attached to.

    Args:
        duration: Seconds to keep Redis detached before reconnecting.
    """

    def __init__(self, duration: int = 15) -> None:
        self.duration = duration

    async def execute(self) -> None:
        """Disconnect Redis from all its networks, then reconnect after `duration`.

        Networks are discovered at runtime. Only networks that were
        successfully disconnected are reattached, and reattachment runs in a
        ``finally`` block so it is attempted even if the wait is interrupted.
        """
        net_aliases = _discover_redis_networks()
        if not net_aliases:
            logger.error(
                "No docker networks discovered for Redis; cannot partition.",
                extra={"container": REDIS_CONTAINER},
            )
            return

        logger.critical(
            "Executing network partition (isolating Redis).",
            extra={
                "duration_s": self.duration,
                "networks": list(net_aliases.keys()),
            },
        )

        detached: dict[str, list[str]] = {}
        try:
            for net, aliases in net_aliases.items():
                try:
                    subprocess.run(
                        ["docker", "network", "disconnect", net, REDIS_CONTAINER],
                        check=True,
                    )
                    detached[net] = aliases
                except subprocess.CalledProcessError as exc:
                    logger.error(
                        "Failed to disconnect Redis from network.",
                        extra={"network": net, "error": str(exc)},
                    )

            await asyncio.sleep(self.duration)

        finally:
            # Reattach every network we successfully removed, restoring the
            # original DNS aliases (in particular the compose service name;
            # without it, other containers cannot resolve ``redis``).
            unhealed: list[str] = []
            for net, aliases in detached.items():
                cmd = ["docker", "network", "connect"]
                for alias in aliases:
                    # Skip empty entries and the container's own name; the
                    # remaining aliases are passed back explicitly.
                    # NOTE(review): the auto-assigned short-container-ID
                    # alias is not filtered here - confirm re-passing it via
                    # --alias is acceptable.
                    if not alias or alias == REDIS_CONTAINER:
                        continue
                    cmd.extend(["--alias", alias])
                cmd.extend([net, REDIS_CONTAINER])
                try:
                    subprocess.run(cmd, check=True)
                except Exception as exc:
                    # Best effort: keep going so the other networks still
                    # get reattached, but remember the failure.
                    unhealed.append(net)
                    logger.critical(
                        "FAILED TO HEAL NETWORK PARTITION.",
                        extra={"network": net, "error": str(exc)},
                    )

            # Only claim success when every reconnect actually succeeded.
            if unhealed:
                logger.error(
                    "Network partition NOT fully healed; manual reconnect required.",
                    extra={"networks": unhealed},
                )
            else:
                logger.info(
                    "Network partition healed.",
                    extra={"networks": list(detached.keys())},
                )
@@ -0,0 +1,48 @@
1
+ """
2
+ Relier Chaos — Slow Task Scenario.
3
+
4
+ Dispatches a task that intentionally sleeps for longer than its configured
5
+ ``hard_timeout`` so the two-tier (soft/hard) timeout machinery in
6
+ ``relier.core.timeouts`` is forced to fire.
7
+
8
+ We use the internal ``relier.chaos.tasks.chaos_slow`` task, which is
9
+ decorated with ``hard_timeout=2``. Any ``duration`` >= 2s exercises:
10
+
11
+ 1. Soft-timeout cleanup hook (if registered)
12
+ 2. Hard-timeout cancellation
13
+ 3. DLQ quarantine for ``TimeoutError`` in the decorator
14
+ """
15
+
16
+ import logging
17
+ import uuid
18
+
19
+ from relier.chaos.engine import chaos_engine
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
@chaos_engine.register("slow-task")
class SlowTaskScenario:
    """Dispatch a deliberately slow task so its hard timeout is exceeded.

    Args:
        duration: Number of seconds the dispatched task is asked to sleep.
    """

    def __init__(self, duration: int = 60) -> None:
        self.duration = duration

    async def execute(self) -> str:
        """Push one ``chaos_slow`` task and return its unique marker.

        Returns:
            The ``chaos-slow-<8 hex chars>`` marker passed to the task as
            ``marker_key``.
        """
        # Deferred import keeps Celery out of `rl chaos --help`.
        from relier.chaos.tasks import chaos_slow

        suffix = uuid.uuid4().hex[:8]
        marker = f"chaos-slow-{suffix}"

        announce_fields = {"duration_s": self.duration, "marker": marker}
        logger.critical(
            "Dispatching slow task to provoke timeout enforcement.",
            extra=announce_fields,
        )

        await chaos_slow.apush(duration=self.duration, marker_key=marker)

        done_fields = {"marker": marker}
        logger.info(
            "Slow task dispatched; expect a hard-timeout + DLQ quarantine.",
            extra=done_fields,
        )
        return marker
@@ -0,0 +1,74 @@
1
+ """
2
+ Relier Chaos — Task Corruption Scenario.
3
+
4
+ Injects a 'poison pill' envelope whose payload checksum does not match
5
+ its body. The producer side (``apush``) normally guarantees envelope
6
+ integrity, so this scenario bypasses it by calling
7
+ ``celery_app.send_task`` directly with a hand-crafted envelope.
8
+
9
+ The receiving worker will:
10
+ 1. Unwrap the envelope through ``SchemaRegistry.unwrap_and_migrate``
11
+ 2. Detect the checksum mismatch and raise ``PayloadIntegrityError``
12
+ 3. Quarantine the task to the DLQ via ``DeadLetterQueue.quarantine``
13
+
14
+ This validates the full integrity-failure path end-to-end.
15
+ """
16
+
17
+ import asyncio
18
+ import logging
19
+ import uuid
20
+ from datetime import UTC, datetime
21
+
22
+ from relier.chaos.engine import chaos_engine
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ # Any registered task name will do; the worker rejects the envelope before
27
+ # the user function ever runs. We target an internal chaos task that the
28
+ # Celery app loads in every environment (see ``relier.chaos.tasks``).
29
+ POISON_TARGET_TASK = "relier.chaos.tasks.chaos_noop"
30
+
31
+
32
@chaos_engine.register("task-corrupt")
class TaskCorruptScenario:
    """Publish a poison-pill task whose checksum cannot match its payload."""

    async def execute(self) -> None:
        """Hand-craft a bad envelope and send it straight through Celery.

        The envelope carries every field shown below, but its checksum is a
        fixed bogus value, so the receiving side is expected to reject it
        with ``PayloadIntegrityError`` and quarantine it.
        """
        from relier.tasks.app import celery_app

        poison_id = f"chaos-poison-{uuid.uuid4().hex[:8]}"

        # All fields present, checksum deliberately wrong.
        envelope = dict(
            schema_version=1,
            task_id=poison_id,
            payload={"args": ["poison"], "kwargs": {}},
            checksum="sha256:TAMPERED_INVALID_CHECKSUM_VALUE",
            enqueued_at=datetime.now(UTC).isoformat(),
        )

        logger.critical(
            "Injecting corrupted poison pill task.",
            extra={"task_id": poison_id, "target": POISON_TARGET_TASK},
        )

        def _publish():
            """Blocking Celery publish; executed on the default executor."""
            return celery_app.send_task(
                POISON_TARGET_TASK,
                args=(envelope,),
                queue="default",
                task_id=poison_id,
            )

        # send_task is synchronous, so run it off the event loop thread.
        event_loop = asyncio.get_running_loop()
        await event_loop.run_in_executor(None, _publish)

        logger.info(
            "Poison pill injected; expect a DLQ quarantine for PayloadIntegrityError.",
            extra={"task_id": poison_id},
        )
relier/chaos/tasks.py ADDED
@@ -0,0 +1,98 @@
1
+ """
2
+ Relier Chaos — Internal target tasks.
3
+
4
+ The chaos suite needs tasks it can dispatch against in every environment
5
+ (including production), so it cannot depend on ``relier.tasks.debug``
6
+ which is only included by the Celery app outside production.
7
+
8
+ These tasks are deliberately minimal: they exist solely to exercise
9
+ Relier's reliability machinery (admission control, hard timeouts,
10
+ Phoenix resurrection, payload integrity) from the chaos CLI. They do no
11
+ useful application work and write only to a chaos-scoped Redis namespace.
12
+ """
13
+
14
+ import asyncio
15
+
16
+ from relier.storage.redis import get_relier_redis
17
+ from relier.tasks.context import task_context
18
+ from relier.tasks.decorator import rl_task
19
+
20
+ # Namespaced so chaos markers never collide with application state.
21
+ _MARKER_PREFIX = "rl:chaos:marker"
22
+
23
+
24
@rl_task(idempotent=True)
async def chaos_counter(key: str) -> int:
    """Increment the chaos counter for ``key`` and return its new value.

    The task is declared ``idempotent=True``; the intent is that repeated
    dispatches with the same arguments bump the counter only once.
    """
    counter_key = f"{_MARKER_PREFIX}:counter:{key}"
    client = await get_relier_redis()
    return int(await client.incr(counter_key))
31
+
32
+
33
@rl_task()
async def chaos_noop(tag: str = "chaos") -> str:
    """Do nothing except count the call; target for ``load-spike`` and ``task-corrupt``.

    A per-tag Redis counter is incremented on every execution so operators
    can confirm that dispatches actually reached a worker.

    Returns:
        The literal string ``"ok"``.
    """
    counter_key = f"{_MARKER_PREFIX}:noop:{tag}"
    client = await get_relier_redis()
    await client.incr(counter_key)
    return "ok"
44
+
45
+
46
@rl_task(hard_timeout=2)
async def chaos_slow(duration: int, marker_key: str) -> str:
    """Sleep for ``duration`` seconds; target for ``slow-task``.

    The task is registered with ``hard_timeout=2``, so a ``duration`` beyond
    that limit is meant to make the timeout machinery fire. A ``started``
    marker is written before the sleep and a ``finished`` marker after it,
    each expiring after 300 seconds.

    Returns:
        The literal string ``"done"`` if the sleep completes.
    """
    marker_base = f"{_MARKER_PREFIX}:slow:{marker_key}"
    client = await get_relier_redis()

    await client.set(f"{marker_base}:started", "1", ex=300)
    await asyncio.sleep(duration)
    await client.set(f"{marker_base}:finished", "1", ex=300)

    return "done"
58
+
59
+
60
@rl_task()
async def chaos_long_running(duration: int, marker_key: str) -> str:
    """Stay busy for ``duration`` seconds with no hard timeout configured.

    Seed task for ``worker-kill``: it needs to be mid-flight when the worker
    is killed so there is an orphan to resurrect. The wait is split into
    0.1-second sleeps so the coroutine yields to the event loop often.
    ``started`` and ``finished`` markers are written with a 600-second expiry.

    Returns:
        The literal string ``"done"`` if every sleep completes.
    """
    marker_base = f"{_MARKER_PREFIX}:longrun:{marker_key}"
    client = await get_relier_redis()

    await client.set(f"{marker_base}:started", "1", ex=600)

    # Ten short naps per requested second.
    nap_count = duration * 10
    for _nap in range(nap_count):
        await asyncio.sleep(0.1)

    await client.set(f"{marker_base}:finished", "1", ex=600)
    return "done"
75
+
76
+
77
@rl_task()
async def chaos_checkpoint(steps: int, marker: str) -> str:
    """Run ``steps`` short steps, saving a checkpoint after each one.

    If the task context already holds a partial result, work resumes from
    its ``last_step`` value instead of step 0. This lets a resurrection be
    checked for resuming, not restarting. A ``finished`` marker
    (300-second expiry) is written once all steps are done.

    Returns:
        The literal string ``"done"``.
    """
    client = await get_relier_redis()

    saved = task_context.partial_result
    first_step = saved.get("last_step", 0) if saved else 0

    for step in range(first_step, steps):
        await asyncio.sleep(0.1)
        # Persist the count of completed steps (1-based).
        await task_context.set_partial({"last_step": step + 1})

    finished_key = f"{_MARKER_PREFIX}:checkpoint:{marker}:finished"
    await client.set(finished_key, "1", ex=300)
    return "done"
93
+
94
+
95
@rl_task()
async def chaos_fail() -> None:
    """Fail unconditionally; used to exercise the failure and DLQ path.

    Raises:
        ValueError: Always.
    """
    reason = "Intentional failure"
    raise ValueError(reason)