rabbitkit 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- rabbitkit/__init__.py +201 -0
- rabbitkit/_version.py +3 -0
- rabbitkit/aio/__init__.py +31 -0
- rabbitkit/async_/__init__.py +9 -0
- rabbitkit/async_/batch.py +213 -0
- rabbitkit/async_/broker.py +1123 -0
- rabbitkit/async_/connection.py +274 -0
- rabbitkit/async_/pool.py +363 -0
- rabbitkit/async_/transport.py +877 -0
- rabbitkit/asyncapi/__init__.py +5 -0
- rabbitkit/asyncapi/generator.py +219 -0
- rabbitkit/asyncapi/schema.py +98 -0
- rabbitkit/cli/__init__.py +77 -0
- rabbitkit/cli/_utils.py +38 -0
- rabbitkit/cli/commands/__init__.py +0 -0
- rabbitkit/cli/commands/dlq.py +190 -0
- rabbitkit/cli/commands/health.py +34 -0
- rabbitkit/cli/commands/migrate.py +570 -0
- rabbitkit/cli/commands/routes.py +88 -0
- rabbitkit/cli/commands/run.py +144 -0
- rabbitkit/cli/commands/shell.py +72 -0
- rabbitkit/cli/commands/topology.py +346 -0
- rabbitkit/concurrency.py +451 -0
- rabbitkit/core/__init__.py +5 -0
- rabbitkit/core/app.py +323 -0
- rabbitkit/core/config.py +849 -0
- rabbitkit/core/env_config.py +251 -0
- rabbitkit/core/errors.py +199 -0
- rabbitkit/core/logging.py +261 -0
- rabbitkit/core/message.py +235 -0
- rabbitkit/core/path.py +53 -0
- rabbitkit/core/pipeline.py +1289 -0
- rabbitkit/core/protocols.py +349 -0
- rabbitkit/core/registry.py +284 -0
- rabbitkit/core/route.py +329 -0
- rabbitkit/core/router.py +142 -0
- rabbitkit/core/topology.py +261 -0
- rabbitkit/core/topology_dispatch.py +74 -0
- rabbitkit/core/types.py +324 -0
- rabbitkit/dashboard/__init__.py +5 -0
- rabbitkit/dashboard/app.py +212 -0
- rabbitkit/di/__init__.py +19 -0
- rabbitkit/di/context.py +193 -0
- rabbitkit/di/depends.py +42 -0
- rabbitkit/di/resolver.py +503 -0
- rabbitkit/dlq.py +320 -0
- rabbitkit/experimental/__init__.py +50 -0
- rabbitkit/fastapi.py +91 -0
- rabbitkit/health.py +654 -0
- rabbitkit/highload/__init__.py +10 -0
- rabbitkit/highload/backpressure.py +514 -0
- rabbitkit/highload/batch.py +448 -0
- rabbitkit/locking.py +277 -0
- rabbitkit/management.py +470 -0
- rabbitkit/middleware/__init__.py +27 -0
- rabbitkit/middleware/base.py +125 -0
- rabbitkit/middleware/circuit_breaker.py +131 -0
- rabbitkit/middleware/compression.py +267 -0
- rabbitkit/middleware/deduplication.py +651 -0
- rabbitkit/middleware/error_classifier.py +43 -0
- rabbitkit/middleware/exception.py +105 -0
- rabbitkit/middleware/metrics.py +440 -0
- rabbitkit/middleware/otel.py +203 -0
- rabbitkit/middleware/rate_limit.py +247 -0
- rabbitkit/middleware/retry.py +540 -0
- rabbitkit/middleware/signing.py +682 -0
- rabbitkit/middleware/timeout.py +291 -0
- rabbitkit/py.typed +0 -0
- rabbitkit/queue_metrics.py +174 -0
- rabbitkit/results/__init__.py +6 -0
- rabbitkit/results/backend.py +102 -0
- rabbitkit/results/middleware.py +123 -0
- rabbitkit/rpc.py +632 -0
- rabbitkit/serialization/__init__.py +25 -0
- rabbitkit/serialization/base.py +35 -0
- rabbitkit/serialization/json.py +122 -0
- rabbitkit/serialization/msgspec.py +136 -0
- rabbitkit/serialization/pipeline.py +255 -0
- rabbitkit/streams.py +139 -0
- rabbitkit/sync/__init__.py +11 -0
- rabbitkit/sync/batch.py +595 -0
- rabbitkit/sync/broker.py +996 -0
- rabbitkit/sync/connection.py +209 -0
- rabbitkit/sync/pool.py +262 -0
- rabbitkit/sync/transport.py +1085 -0
- rabbitkit/testing/__init__.py +20 -0
- rabbitkit/testing/app.py +99 -0
- rabbitkit/testing/broker.py +540 -0
- rabbitkit/testing/fixtures.py +56 -0
- rabbitkit-0.9.0.dist-info/METADATA +575 -0
- rabbitkit-0.9.0.dist-info/RECORD +95 -0
- rabbitkit-0.9.0.dist-info/WHEEL +5 -0
- rabbitkit-0.9.0.dist-info/entry_points.txt +2 -0
- rabbitkit-0.9.0.dist-info/licenses/LICENSE +21 -0
- rabbitkit-0.9.0.dist-info/top_level.txt +1 -0
rabbitkit/health.py
ADDED
|
@@ -0,0 +1,654 @@
|
|
|
1
|
+
"""Health check utilities for rabbitkit brokers.
|
|
2
|
+
|
|
3
|
+
Provides callables suitable for use with any monitoring or health-check
|
|
4
|
+
framework.
|
|
5
|
+
|
|
6
|
+
Usage::
|
|
7
|
+
|
|
8
|
+
from rabbitkit.health import broker_health_check, BrokerStatus
|
|
9
|
+
|
|
10
|
+
# Standalone
|
|
11
|
+
status = broker_health_check(broker)
|
|
12
|
+
print(status.status, status.details)
|
|
13
|
+
|
|
14
|
+
# With any health-router framework
|
|
15
|
+
register_check(name="rabbitmq", check=lambda: broker_health_check(broker))
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import enum
|
|
21
|
+
import logging
|
|
22
|
+
import time
|
|
23
|
+
import typing
|
|
24
|
+
import warnings
|
|
25
|
+
from dataclasses import dataclass, field, replace
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
from rabbitkit.core.config import HealthCheckConfig
|
|
29
|
+
from rabbitkit.core.protocols import HealthProvider
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class HealthStatus(str, enum.Enum):
    """Coarse health level reported for a broker.

    The ``str`` mix-in makes each member compare equal to its lowercase
    string value (``HealthStatus.HEALTHY == "healthy"``).
    """

    # Member values are the lowercase spelling of the member name.
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True, slots=True)
|
|
43
|
+
class BrokerHealthResult:
|
|
44
|
+
"""Result of a broker health check."""
|
|
45
|
+
|
|
46
|
+
status: HealthStatus
|
|
47
|
+
started: bool = False
|
|
48
|
+
connected: bool = False
|
|
49
|
+
consumer_count: int = 0
|
|
50
|
+
route_count: int = 0
|
|
51
|
+
worker_pool_pending: int = 0
|
|
52
|
+
blocked: bool = False
|
|
53
|
+
details: dict[str, Any] = field(default_factory=dict)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ── Public-property / private-attr fallback helper ──────────────────────
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class _Missing:
|
|
60
|
+
"""Sentinel for "attribute absent" (distinct from a real False/None)."""
|
|
61
|
+
|
|
62
|
+
__slots__ = ()
|
|
63
|
+
|
|
64
|
+
def __repr__(self) -> str: # pragma: no cover — debug only
|
|
65
|
+
return "<missing>"
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
_MISSING = _Missing()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _get(broker: Any, public: str, private: str, default: Any = False) -> Any:
    """Read *public* from *broker*, falling back to *private*, then *default*.

    This lets brokers migrate gradually from private attributes (``_started``)
    to typed properties (``started``): once the property exists, it wins.

    Args:
        broker: Object to inspect.
        public: Name of the preferred (typed property) attribute.
        private: Name of the legacy private attribute.
        default: Value returned when neither attribute exists.

    Returns:
        The public attribute's value if present, otherwise the private
        attribute's value if present, otherwise *default*. Values are returned
        as-is (no coercion), so ``False``/``None`` stored on the broker are
        honoured rather than treated as "missing".

    Warns:
        DeprecationWarning: each time the private attribute is used because
            the public one is absent. This function does no de-duplication of
            its own; how often the warning is actually shown depends on the
            active ``warnings`` filters. ``stacklevel=3`` attributes the
            warning to the caller of the function that called ``_get``.
    """
    found = getattr(broker, public, _MISSING)
    if found is not _MISSING:
        return found

    legacy = getattr(broker, private, _MISSING)
    if legacy is _MISSING:
        return default

    warnings.warn(
        f"Broker {type(broker).__name__} does not expose the typed "
        f"property {public!r}; falling back to private attribute "
        f"{private!r}. Implement {public!r} on the broker to silence this.",
        DeprecationWarning,
        stacklevel=3,
    )
    return legacy
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def _get_started(broker: Any) -> bool:
    """Return whether *broker* reports itself started.

    Reads ``started`` (or legacy ``_started``) and coerces it to ``bool``;
    a broker exposing neither counts as not started.
    """
    started = _get(broker, "started", "_started", False)
    return bool(started)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _get_connected(broker: Any) -> bool:
    """Return whether *broker*'s transport is connected.

    Resolution order:

    1. ``broker.connected`` (or legacy ``broker._connected``), coerced to bool.
    2. Otherwise the transport's ``is_connected``. The transport is looked up
       the same way as everywhere else in this module (public ``transport``
       first, then ``_transport``), so a broker that exposes only the public
       attribute is not misreported as disconnected.
    3. ``False`` when there is no transport or it has no ``is_connected``.

    ``is_connected`` may be a zero-argument callable or a plain bool-like
    attribute/property; both shapes are accepted.
    """
    connected = _get(broker, "connected", "_connected", _MISSING)
    if connected is not _MISSING:
        return bool(connected)

    # Fallback: ask the transport directly.
    transport = _get_transport(broker)
    if transport is None:
        return False
    probe = getattr(transport, "is_connected", False)
    # Call it when it is a method; otherwise treat it as the value itself.
    return bool(probe() if callable(probe) else probe)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _get_transport(broker: Any) -> Any:
    """Return *broker*'s transport object, or ``None`` if it has none.

    The public ``transport`` attribute wins whenever it exists (even if its
    value is ``None``); only when it is absent is ``_transport`` consulted.
    """
    public = getattr(broker, "transport", _MISSING)
    return getattr(broker, "_transport", None) if public is _MISSING else public
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def _get_consumer_count(broker: Any, transport: Any) -> int:
    """Return the number of routes considered to have an active consumer.

    A broker-reported ``consumer_count`` (or legacy ``_consumer_count``) is
    returned as-is. Otherwise the count is derived from ``broker.routes`` by
    counting routes with a truthy ``consumer_tag``, and is forced to 0 when
    the transport says its consumers are no longer alive.
    """
    reported = _get(broker, "consumer_count", "_consumer_count", _MISSING)
    if reported is not _MISSING:
        return int(reported)

    # Fallback: derive the figure from the route list.
    tagged = len(
        [route for route in getattr(broker, "routes", []) if getattr(route, "consumer_tag", None)]
    )

    # M-SRE3: consumer_tag is assigned at registration time and may stay set
    # after the channel has died, so the tag count is trusted only while the
    # transport does not report its consumers as dead.
    if transport is None or _transport_consumers_alive(transport):
        return tagged
    return 0
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _get_route_count(broker: Any) -> int:
    """Return how many routes *broker* has registered.

    Uses ``route_count`` (or legacy ``_route_count``) when the broker exposes
    it; otherwise the length of ``broker.routes`` (0 if that is absent too).
    """
    reported = _get(broker, "route_count", "_route_count", _MISSING)
    if reported is _MISSING:
        return len(getattr(broker, "routes", []))
    return int(reported)
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _get_worker_pool_pending(broker: Any) -> int:
    """Return the worker-pool backlog size reported for *broker*.

    Uses ``worker_pool_pending`` (or legacy ``_worker_pool_pending``) when
    available; otherwise reads ``pending_count`` from the private
    ``_worker_pool`` object. Returns 0 when no figure can be found.
    """
    reported = _get(broker, "worker_pool_pending", "_worker_pool_pending", _MISSING)
    if reported is not _MISSING:
        return int(reported)

    # Fallback: inspect the private worker pool directly.
    pool = getattr(broker, "_worker_pool", None)
    return 0 if pool is None else int(getattr(pool, "pending_count", 0))
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _get_is_blocked(broker: Any, transport: Any) -> bool:
    """Return whether the connection is blocked by broker flow control (L15).

    "Blocked" (e.g. a memory/disk alarm on the broker) is orthogonal to
    "connected": ``connection.blocked`` is a soft, publish-side flow-control
    notification rather than a disconnect, so an open connection can be
    blocked while still reporting connected.

    Two sources are consulted in priority order, and the first one that has
    an ``is_blocked`` attribute decides:

    1. the opt-in ``broker.flow_controller``;
    2. the transport's own passive blocked-state tracking, so the condition
       is visible even without an explicit ``FlowController`` wired in.

    Returns ``False`` when neither source exposes ``is_blocked``.
    """
    for source in (getattr(broker, "flow_controller", None), transport):
        if source is None:
            continue
        flag = getattr(source, "is_blocked", _MISSING)
        if flag is not _MISSING:
            return bool(flag)
    return False
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _get_last_heartbeat(broker: Any) -> float | None:
    """Return the broker's last heartbeat timestamp as a float.

    Reads ``last_heartbeat`` (or legacy ``_last_heartbeat``). Returns ``None``
    when the attribute is absent or explicitly ``None``.
    """
    stamp = _get(broker, "last_heartbeat", "_last_heartbeat", None)
    return None if stamp is None else float(stamp)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def _local_broker_health_check(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
) -> BrokerHealthResult:
    """Process-local health check — see :func:`broker_health_check`.

    Looks only at *this process's* view of the broker: its own connection,
    consumers and worker pool. It therefore cannot notice a network partition
    in which this process keeps a live connection to one node while the rest
    of the cluster is unreachable. That needs an independent signal, which
    :func:`broker_health_check` obtains from its optional
    ``management_client``.

    Verdicts, in evaluation order:

    * not started                      -> UNHEALTHY
    * transport not connected          -> UNHEALTHY
    * connection blocked               -> DEGRADED
    * fewer consumers than routes      -> DEGRADED
    * backlog > ``pending_threshold``  -> DEGRADED
    * otherwise                        -> HEALTHY
    """
    cfg = config or HealthCheckConfig()

    # Hard failure 1: the broker was never started.
    if not _get_started(broker):
        return BrokerHealthResult(
            status=HealthStatus.UNHEALTHY,
            started=False,
            details={"reason": "broker not started"},
        )

    # Hard failure 2: started, but the transport is not connected.
    transport = _get_transport(broker)
    connected = _get_connected(broker)
    if not connected:
        return BrokerHealthResult(
            status=HealthStatus.UNHEALTHY,
            started=True,
            connected=False,
            details={"reason": "transport not connected"},
        )

    # Gather the soft signals. L15: a connection can be blocked (broker
    # memory/disk alarm pausing publishes) while still reporting connected.
    blocked = _get_is_blocked(broker, transport)
    route_count = _get_route_count(broker)
    consumer_count = _get_consumer_count(broker, transport)
    worker_pool_pending = _get_worker_pool_pending(broker)

    # Pick the first applicable degradation reason. Blocked is tested first
    # because it is the more actionable root cause.
    reason: str | None
    if blocked:
        reason = "connection blocked by broker flow control (memory/disk alarm); publishes will stall"
    elif consumer_count < route_count:
        reason = f"only {consumer_count}/{route_count} consumers active"
    elif worker_pool_pending > cfg.pending_threshold:
        reason = f"worker pool backlog: {worker_pool_pending}"
    else:
        reason = None

    healthy = reason is None
    return BrokerHealthResult(
        status=HealthStatus.HEALTHY if healthy else HealthStatus.DEGRADED,
        started=True,
        connected=connected,
        consumer_count=consumer_count,
        route_count=route_count,
        worker_pool_pending=worker_pool_pending,
        blocked=blocked,
        details={} if healthy else {"reason": reason},
    )
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _apply_management_check(result: BrokerHealthResult, ok: bool) -> BrokerHealthResult:
    """Fold the management-API verdict *ok* into a local health *result*.

    A passing check, or a result that is already UNHEALTHY, is returned
    untouched. Otherwise a copy is returned with status DEGRADED and a
    ``"management_check"`` entry added to ``details`` (existing entries are
    kept).
    """
    if not ok and result.status != HealthStatus.UNHEALTHY:
        note = (
            "failed — this process has a live broker connection, but the "
            "management API reports the node unhealthy (possible cluster partition)"
        )
        return replace(
            result,
            status=HealthStatus.DEGRADED,
            details={**result.details, "management_check": note},
        )
    return result
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def broker_health_check(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
    management_client: Any = None,
) -> BrokerHealthResult:
    """Check broker health status (sync).

    Args:
        broker: A broker implementing :class:`HealthProvider` (typed
            properties) or a legacy broker exposing private attributes
            (``_started``, ``_transport``, ``_worker_pool``).
        config: Optional :class:`HealthCheckConfig` to tune thresholds.
        management_client: Optional
            :class:`~rabbitkit.management.RabbitManagementClient` exposing a
            sync ``.health_check()``. When supplied, its verdict is an extra
            signal on top of the process-local checks: this process may hold
            a perfectly live connection to one node while the rest of a
            partitioned cluster is unreachable, which the local checks alone
            cannot see. A failing management check turns a non-UNHEALTHY
            result into DEGRADED. It is not consulted at all when the local
            result is already UNHEALTHY. Omit it for process-local-only
            behaviour.

    Returns:
        A :class:`BrokerHealthResult` whose status is

        * HEALTHY — started, connected, not blocked, all consumers active;
        * DEGRADED — started, but the connection is blocked (L15), consumers
          are missing, the pool backlog is high, or (with
          ``management_client``) the management API reports the node
          unhealthy;
        * UNHEALTHY — not started or not connected.
    """
    local = _local_broker_health_check(broker, config=config)
    if management_client is None or local.status == HealthStatus.UNHEALTHY:
        return local
    return _apply_management_check(local, management_client.health_check())
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
async def broker_health_check_async(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
    management_client: Any = None,
) -> BrokerHealthResult:
    """Async variant of :func:`broker_health_check`.

    The process-local part is the same synchronous logic as the sync variant
    (``transport.is_connected()`` is always sync). Only the management check
    is awaited: ``management_client``, if given, must expose an async
    ``.health_check_async()``. See :func:`broker_health_check` for the
    rationale and semantics.
    """
    local = _local_broker_health_check(broker, config=config)
    if management_client is None or local.status == HealthStatus.UNHEALTHY:
        return local
    return _apply_management_check(local, await management_client.health_check_async())
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
# Transport contract (I-5): a transport MAY expose any of these optional
|
|
326
|
+
# attributes/properties to advertise live consumer state. Each is checked
|
|
327
|
+
# independently; if any present one reports False, registered consumer_tags
|
|
328
|
+
# are treated as stale. When NONE of these exist on the transport, we cannot
|
|
329
|
+
# prove the channels are dead, so we fall back to trusting the registered
|
|
330
|
+
# consumer_tag (current/backwards-compatible behaviour).
|
|
331
|
+
#
|
|
332
|
+
# has_open_channels -> bool | () -> bool (e.g. SyncTransport exposes this)
|
|
333
|
+
# is_consuming -> bool | () -> bool
|
|
334
|
+
# consumers_active -> bool | () -> bool
|
|
335
|
+
_TRANSPORT_LIVENESS_ATTRS: tuple[str, ...] = (
|
|
336
|
+
"has_open_channels",
|
|
337
|
+
"is_consuming",
|
|
338
|
+
"consumers_active",
|
|
339
|
+
)
|
|
340
|
+
|
|
341
|
+
|
|
342
|
+
def _transport_consumers_alive(transport: Any) -> bool:
    """Best-effort check that *transport* still has live consumers.

    Probes each optional attribute named in
    :data:`_TRANSPORT_LIVENESS_ATTRS`. An attribute may be a plain value or a
    zero-argument callable, in which case it is called and its result used.

    Returns:
        ``False`` as soon as any present attribute reports the literal
        ``False`` (other falsy values such as ``0`` or ``None`` are not
        treated as "dead"). ``True`` otherwise. That includes the case where
        none of the attributes exist, because then there is no evidence that
        the channels are dead and the registered ``consumer_tag`` is trusted
        (backwards-compatible behaviour).

    A probe that raises is logged at DEBUG level and skipped, i.e. treated as
    "no information" rather than as a failure.
    """
    for attr in _TRANSPORT_LIVENESS_ATTRS:
        # Sentinel default: an attribute that exists and is False must be
        # distinguishable from one that does not exist at all.
        flag = getattr(transport, attr, _MISSING)
        if flag is _MISSING:
            continue
        if callable(flag):
            try:
                flag = flag()
            except Exception:
                logger.debug("transport %s check raised", attr, exc_info=True)
                continue
        if flag is False:
            return False
    # Either no signal was available or none of them reported False.
    return True
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def mark_heartbeat(broker: Any) -> None:
    """Stamp *broker* with a fresh liveness heartbeat.

    Intended to be called by brokers/transports from their consume callback
    or I/O loop (or any other "I made forward progress" point), so that
    :func:`broker_liveness` can spot a broker whose process is alive but has
    stopped draining the network.

    The stamp is the current ``time.monotonic()`` value, stored by plain
    attribute assignment on ``broker.last_heartbeat``. The attribute does not
    need to exist beforehand.
    """
    now = time.monotonic()
    broker.last_heartbeat = now
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def broker_liveness(broker: HealthProvider | Any, wedged_timeout: float = 60.0) -> bool:
    """Liveness probe — is the broker process alive and not hard-wedged?

    Args:
        broker: A broker implementing :class:`HealthProvider` (typed
            properties) or a legacy broker exposing private attributes
            (``_started``, ``_wedged``, ``last_heartbeat``).
        wedged_timeout: Seconds without a heartbeat before liveness fails.

    Returns:
        ``False`` when any of the following holds, ``True`` otherwise:

        - the broker is not started (``started``/``_started`` false or
          absent);
        - a ``wedged``/``_wedged`` flag is truthy (transports/brokers may set
          this on a hard fault);
        - a heartbeat has been recorded (see :func:`mark_heartbeat`) and it
          is older than *wedged_timeout* seconds on the monotonic clock,
          meaning the I/O loop is wedged.

    A transient broker/transport disconnect is deliberately *not* a liveness
    failure: the process is still running and can recover. Use
    :func:`broker_readiness` to decide whether to route traffic.

    When no heartbeat has been recorded, only the started and wedged checks
    apply (backwards compatible).
    """
    if not _get_started(broker):
        return False
    if _get(broker, "wedged", "_wedged", False):
        return False

    beat = _get_last_heartbeat(broker)
    if beat is None:
        # No heartbeat tracking on this broker, so there is nothing to time out.
        return True
    return not (time.monotonic() - beat > wedged_timeout)
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def broker_readiness(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
    management_client: Any = None,
) -> bool:
    """Readiness probe — is the broker ready to serve traffic right now?

    Args:
        broker: A broker implementing :class:`HealthProvider` (typed
            properties) or a legacy broker exposing private attributes.
        config: Optional :class:`HealthCheckConfig` to tune thresholds.
        management_client: Optional
            :class:`~rabbitkit.management.RabbitManagementClient` — see
            :func:`broker_health_check`. A failing management check fails
            readiness even if this process's own connection looks fine.

    Returns:
        ``True`` only when the health check is not UNHEALTHY, the transport
        is connected, the connection is not blocked by broker flow control
        (L15), no management-check failure was recorded, and the consumer
        count equals the route count.

    Use this for load-balancer / ingress gating; use :func:`broker_liveness`
    for restart decisions.
    """
    result = broker_health_check(broker, config=config, management_client=management_client)

    not_ready = (
        result.status == HealthStatus.UNHEALTHY
        or not result.connected
        # L15: a blocked connection can't publish, so it is not ready for
        # traffic even though it is still "connected" and may still have
        # live consumers.
        or result.blocked
        # A failing management check only downgrades to DEGRADED (this
        # process's own connection may be fine), but a partitioned or
        # unreachable node is still not ready for traffic.
        or "management_check" in result.details
    )
    if not_ready:
        return False

    # M-SRE3: every route must have a live consumer. Transport connectivity
    # was already verified by the health check.
    return result.consumer_count == result.route_count
|
|
470
|
+
|
|
471
|
+
|
|
472
|
+
async def broker_liveness_async(broker: HealthProvider | Any, wedged_timeout: float = 60.0) -> bool:
    """Async variant of :func:`broker_liveness`.

    Delegates directly to the sync implementation and awaits nothing.
    """
    alive = broker_liveness(broker, wedged_timeout=wedged_timeout)
    return alive
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
async def broker_readiness_async(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
    management_client: Any = None,
) -> bool:
    """Async variant of :func:`broker_readiness`.

    Goes through :func:`broker_health_check_async` (and therefore
    ``management_client.health_check_async()``) instead of delegating to the
    sync ``broker_readiness``. The sync management check makes a blocking
    network call, which must not run on the event loop.

    The readiness criteria are the same as in the sync variant.
    """
    result = await broker_health_check_async(broker, config=config, management_client=management_client)

    # Any one of these conditions makes the broker not ready.
    not_ready = (
        result.status == HealthStatus.UNHEALTHY
        or not result.connected
        or result.blocked
        or "management_check" in result.details
    )
    if not_ready:
        return False

    # Every registered route must have a consumer counted as active.
    return result.consumer_count == result.route_count
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
# ── Health-transition watcher ────────────────────────────────────────────
|
|
502
|
+
|
|
503
|
+
|
|
504
|
+
class HealthWatcher:
    """Opt-in push-style health notifications (sync, daemon-thread poller).

    Polls :func:`broker_health_check` every *interval* seconds and fires
    ``on_change(old, new, result)`` when the status transitions -- but only
    after *debounce* consecutive identical readings, so a single flapping
    poll never pages anyone. Callback exceptions are logged, never raised,
    and never stall the loop.

    Positioning: for deployments that aren't (only) Kubernetes -- bare
    metal, VMs, direct pager/webhook wiring. On k8s, keep
    :func:`broker_liveness`/:func:`broker_readiness` probes as the primary
    signal; this watcher complements, never replaces, them.

    When *collector* is given (any ``MetricsCollector`` with ``set_gauge``),
    every poll also emits a gauge named *gauge_name* (default
    ``rabbitkit_health_state``; 0 healthy / 1 degraded / 2 unhealthy), so
    Prometheus users get a state series without writing a callback.

    Notes:
        * The first poll happens one *interval* after :meth:`start`, not
          immediately.
        * Until the first debounce-confirmed reading, :attr:`current_status`
          is ``None``, and that ``None`` is what ``on_change`` receives as
          ``old`` on the first notification.
        * The constructor takes no clock or sleeper hook. To exercise the
          state machine without waiting, call ``_tick()`` (one poll) or
          ``_apply(result)`` (state machine only) directly.

    Args:
        broker: Broker to watch; passed to :func:`broker_health_check`.
        interval: Seconds between polls; must be > 0.
        on_change: Optional callable invoked as ``on_change(old, new, result)``
            on each confirmed transition.
        management_client: Forwarded to :func:`broker_health_check`.
        config: Forwarded to :func:`broker_health_check`.
        debounce: Consecutive identical readings needed to confirm a
            transition; must be >= 1.
        collector: Optional object with ``set_gauge(name, labels, value)``.
        gauge_name: Name used for the emitted gauge.

    Raises:
        ValueError: If *interval* <= 0 or *debounce* < 1.
    """

    # Numeric encoding of each status for the metrics gauge.
    _GAUGE_VALUES: typing.ClassVar[dict[HealthStatus, int]] = {
        HealthStatus.HEALTHY: 0,
        HealthStatus.DEGRADED: 1,
        HealthStatus.UNHEALTHY: 2,
    }

    def __init__(
        self,
        broker: HealthProvider | Any,
        *,
        interval: float = 10.0,
        on_change: typing.Callable[[HealthStatus | None, HealthStatus, BrokerHealthResult], Any] | None = None,
        management_client: Any = None,
        config: HealthCheckConfig | None = None,
        debounce: int = 2,
        collector: Any = None,
        gauge_name: str = "rabbitkit_health_state",
    ) -> None:
        if interval <= 0:
            raise ValueError(f"HealthWatcher interval must be > 0, got {interval}")
        if debounce < 1:
            raise ValueError(f"HealthWatcher debounce must be >= 1, got {debounce}")
        self._broker = broker
        self._interval = interval
        self._on_change = on_change
        self._management_client = management_client
        self._config = config
        self._debounce = debounce
        self._collector = collector
        self._gauge_name = gauge_name

        self._current: HealthStatus | None = None  # last CONFIRMED status
        self._candidate: HealthStatus | None = None  # status being debounced
        self._candidate_count = 0  # consecutive sightings of the candidate
        self._thread: Any = None  # poller thread, created by start()
        self._stop_event: Any = None  # threading.Event, created by start()

    @property
    def current_status(self) -> HealthStatus | None:
        """Last debounce-confirmed status (None until the first confirmation)."""
        return self._current

    def _tick(self) -> None:
        """One poll: read health, then run the shared debounced state machine.

        Exceptions raised by the health check itself are not caught here;
        the polling loop catches and logs them.
        """
        result = broker_health_check(
            self._broker, config=self._config, management_client=self._management_client
        )
        self._apply(result)

    def _apply(self, result: BrokerHealthResult) -> None:
        """Run the debounced state machine on an already-obtained *result*.

        Shared with the async variant. Emits the gauge (if a collector is
        configured) on every call, then confirms a transition once the same
        new status has been seen ``debounce`` times in a row.
        """
        if self._collector is not None:
            try:
                self._collector.set_gauge(self._gauge_name, {}, self._GAUGE_VALUES[result.status])
            except Exception:  # pragma: no cover — collector bugs never stall the loop
                logger.exception("HealthWatcher gauge emission raised")

        status = result.status
        if status == self._current:
            # Confirmed state re-observed; reset any half-built candidate.
            self._candidate = None
            self._candidate_count = 0
            return
        if status != self._candidate:
            # A different contender appeared: start counting it from scratch.
            self._candidate = status
            self._candidate_count = 0
        self._candidate_count += 1
        if self._candidate_count < self._debounce:
            return

        # Transition confirmed: commit it before notifying.
        old, self._current = self._current, status
        self._candidate = None
        self._candidate_count = 0
        if self._on_change is not None:
            try:
                self._on_change(old, status, result)
            except Exception:
                logger.exception("HealthWatcher on_change callback raised")

    def start(self) -> None:
        """Start the daemon poller thread. Idempotent while a poller is alive."""
        import threading

        if self._thread is not None and self._thread.is_alive():
            return
        self._stop_event = threading.Event()
        # Bind this run's event locally so that a later start() creating a
        # new event cannot un-stop an older loop.
        stop = self._stop_event

        def _loop() -> None:
            # Event.wait returns False on timeout (-> poll) and True once
            # stop() has set the event (-> exit).
            while not stop.wait(timeout=self._interval):
                try:
                    self._tick()
                except Exception:  # pragma: no cover — defensive; a failing poll must not kill the thread
                    logger.exception("HealthWatcher tick raised")

        self._thread = threading.Thread(target=_loop, name="rabbitkit-health-watcher", daemon=True)
        self._thread.start()

    def stop(self, timeout: float = 5.0) -> None:
        """Stop the poller (bounded join). Idempotent.

        Waits at most *timeout* seconds for the thread to finish. The thread
        reference is dropped either way, so a poll still in flight when the
        join times out finishes on its own and then exits.
        """
        if self._stop_event is not None:
            self._stop_event.set()
        if self._thread is not None:
            self._thread.join(timeout=timeout)
            self._thread = None
|
|
632
|
+
|
|
633
|
+
|
|
634
|
+
class AsyncHealthWatcher(HealthWatcher):
    """Async variant of :class:`HealthWatcher`.

    Polling runs inside an asyncio task (:meth:`run`) instead of a thread,
    and the management check, if any, is awaited through
    :func:`broker_health_check_async` so it never blocks the event loop.
    The debounce/notification state machine is inherited unchanged.
    """

    async def _tick_async(self) -> None:
        """One async poll: fetch health, then feed the shared state machine."""
        self._apply(
            await broker_health_check_async(
                self._broker, config=self._config, management_client=self._management_client
            )
        )

    async def run(self) -> None:
        """Poll forever (cancel the task to stop).

        Each cycle sleeps for the configured interval first and polls
        afterwards. Exceptions from a poll are logged and the loop continues.
        """
        import asyncio

        while True:
            await asyncio.sleep(self._interval)
            try:
                await self._tick_async()
            except Exception:  # pragma: no cover — defensive; _tick guards itself
                logger.exception("HealthWatcher tick raised")
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""High-load infrastructure module — backpressure, batch publish/ack."""
|
|
2
|
+
|
|
3
|
+
from rabbitkit.highload.backpressure import FlowController
|
|
4
|
+
from rabbitkit.highload.batch import BatchAcker, BatchPublisher
|
|
5
|
+
|
|
6
|
+
__all__ = [
|
|
7
|
+
"BatchAcker",
|
|
8
|
+
"BatchPublisher",
|
|
9
|
+
"FlowController",
|
|
10
|
+
]
|