rabbitkit 0.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95) hide show
  1. rabbitkit/__init__.py +201 -0
  2. rabbitkit/_version.py +3 -0
  3. rabbitkit/aio/__init__.py +31 -0
  4. rabbitkit/async_/__init__.py +9 -0
  5. rabbitkit/async_/batch.py +213 -0
  6. rabbitkit/async_/broker.py +1123 -0
  7. rabbitkit/async_/connection.py +274 -0
  8. rabbitkit/async_/pool.py +363 -0
  9. rabbitkit/async_/transport.py +877 -0
  10. rabbitkit/asyncapi/__init__.py +5 -0
  11. rabbitkit/asyncapi/generator.py +219 -0
  12. rabbitkit/asyncapi/schema.py +98 -0
  13. rabbitkit/cli/__init__.py +77 -0
  14. rabbitkit/cli/_utils.py +38 -0
  15. rabbitkit/cli/commands/__init__.py +0 -0
  16. rabbitkit/cli/commands/dlq.py +190 -0
  17. rabbitkit/cli/commands/health.py +34 -0
  18. rabbitkit/cli/commands/migrate.py +570 -0
  19. rabbitkit/cli/commands/routes.py +88 -0
  20. rabbitkit/cli/commands/run.py +144 -0
  21. rabbitkit/cli/commands/shell.py +72 -0
  22. rabbitkit/cli/commands/topology.py +346 -0
  23. rabbitkit/concurrency.py +451 -0
  24. rabbitkit/core/__init__.py +5 -0
  25. rabbitkit/core/app.py +323 -0
  26. rabbitkit/core/config.py +849 -0
  27. rabbitkit/core/env_config.py +251 -0
  28. rabbitkit/core/errors.py +199 -0
  29. rabbitkit/core/logging.py +261 -0
  30. rabbitkit/core/message.py +235 -0
  31. rabbitkit/core/path.py +53 -0
  32. rabbitkit/core/pipeline.py +1289 -0
  33. rabbitkit/core/protocols.py +349 -0
  34. rabbitkit/core/registry.py +284 -0
  35. rabbitkit/core/route.py +329 -0
  36. rabbitkit/core/router.py +142 -0
  37. rabbitkit/core/topology.py +261 -0
  38. rabbitkit/core/topology_dispatch.py +74 -0
  39. rabbitkit/core/types.py +324 -0
  40. rabbitkit/dashboard/__init__.py +5 -0
  41. rabbitkit/dashboard/app.py +212 -0
  42. rabbitkit/di/__init__.py +19 -0
  43. rabbitkit/di/context.py +193 -0
  44. rabbitkit/di/depends.py +42 -0
  45. rabbitkit/di/resolver.py +503 -0
  46. rabbitkit/dlq.py +320 -0
  47. rabbitkit/experimental/__init__.py +50 -0
  48. rabbitkit/fastapi.py +91 -0
  49. rabbitkit/health.py +654 -0
  50. rabbitkit/highload/__init__.py +10 -0
  51. rabbitkit/highload/backpressure.py +514 -0
  52. rabbitkit/highload/batch.py +448 -0
  53. rabbitkit/locking.py +277 -0
  54. rabbitkit/management.py +470 -0
  55. rabbitkit/middleware/__init__.py +27 -0
  56. rabbitkit/middleware/base.py +125 -0
  57. rabbitkit/middleware/circuit_breaker.py +131 -0
  58. rabbitkit/middleware/compression.py +267 -0
  59. rabbitkit/middleware/deduplication.py +651 -0
  60. rabbitkit/middleware/error_classifier.py +43 -0
  61. rabbitkit/middleware/exception.py +105 -0
  62. rabbitkit/middleware/metrics.py +440 -0
  63. rabbitkit/middleware/otel.py +203 -0
  64. rabbitkit/middleware/rate_limit.py +247 -0
  65. rabbitkit/middleware/retry.py +540 -0
  66. rabbitkit/middleware/signing.py +682 -0
  67. rabbitkit/middleware/timeout.py +291 -0
  68. rabbitkit/py.typed +0 -0
  69. rabbitkit/queue_metrics.py +174 -0
  70. rabbitkit/results/__init__.py +6 -0
  71. rabbitkit/results/backend.py +102 -0
  72. rabbitkit/results/middleware.py +123 -0
  73. rabbitkit/rpc.py +632 -0
  74. rabbitkit/serialization/__init__.py +25 -0
  75. rabbitkit/serialization/base.py +35 -0
  76. rabbitkit/serialization/json.py +122 -0
  77. rabbitkit/serialization/msgspec.py +136 -0
  78. rabbitkit/serialization/pipeline.py +255 -0
  79. rabbitkit/streams.py +139 -0
  80. rabbitkit/sync/__init__.py +11 -0
  81. rabbitkit/sync/batch.py +595 -0
  82. rabbitkit/sync/broker.py +996 -0
  83. rabbitkit/sync/connection.py +209 -0
  84. rabbitkit/sync/pool.py +262 -0
  85. rabbitkit/sync/transport.py +1085 -0
  86. rabbitkit/testing/__init__.py +20 -0
  87. rabbitkit/testing/app.py +99 -0
  88. rabbitkit/testing/broker.py +540 -0
  89. rabbitkit/testing/fixtures.py +56 -0
  90. rabbitkit-0.9.0.dist-info/METADATA +575 -0
  91. rabbitkit-0.9.0.dist-info/RECORD +95 -0
  92. rabbitkit-0.9.0.dist-info/WHEEL +5 -0
  93. rabbitkit-0.9.0.dist-info/entry_points.txt +2 -0
  94. rabbitkit-0.9.0.dist-info/licenses/LICENSE +21 -0
  95. rabbitkit-0.9.0.dist-info/top_level.txt +1 -0
rabbitkit/health.py ADDED
@@ -0,0 +1,654 @@
1
+ """Health check utilities for rabbitkit brokers.
2
+
3
+ Provides callables suitable for use with any monitoring or health-check
4
+ framework.
5
+
6
+ Usage::
7
+
8
+ from rabbitkit.health import broker_health_check, BrokerHealthResult
9
+
10
+ # Standalone
11
+ status = broker_health_check(broker)
12
+ print(status.status, status.details)
13
+
14
+ # With any health-router framework
15
+ register_check(name="rabbitmq", check=lambda: broker_health_check(broker))
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import enum
21
+ import logging
22
+ import time
23
+ import typing
24
+ import warnings
25
+ from dataclasses import dataclass, field, replace
26
+ from typing import Any
27
+
28
+ from rabbitkit.core.config import HealthCheckConfig
29
+ from rabbitkit.core.protocols import HealthProvider
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
class HealthStatus(str, enum.Enum):
    """Coarse health level reported by the broker health checks.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value.
    """

    # Listed from best to worst.
    HEALTHY = "healthy"
    DEGRADED = "degraded"
    UNHEALTHY = "unhealthy"
40
+
41
+
42
+ @dataclass(frozen=True, slots=True)
43
+ class BrokerHealthResult:
44
+ """Result of a broker health check."""
45
+
46
+ status: HealthStatus
47
+ started: bool = False
48
+ connected: bool = False
49
+ consumer_count: int = 0
50
+ route_count: int = 0
51
+ worker_pool_pending: int = 0
52
+ blocked: bool = False
53
+ details: dict[str, Any] = field(default_factory=dict)
54
+
55
+
56
+ # ── Public-property / private-attr fallback helper ──────────────────────
57
+
58
+
59
class _Missing:
    """Sentinel type for "attribute absent" (distinct from a real False/None)."""

    __slots__ = ()

    def __repr__(self) -> str:  # pragma: no cover — debug only
        return "<missing>"


_MISSING = _Missing()

# (public, private) pairs whose private-attribute fallback warning has already
# been issued in this process. Tracked explicitly because the ``warnings``
# module only de-duplicates under some filter settings (never under "always").
_FALLBACK_WARNED: set[tuple[str, str]] = set()


def _get(broker: Any, public: str, private: str, default: Any = False) -> Any:
    """Try the public property first, then the private attr, then the default.

    This makes the transition from private attributes (``_started``) to typed
    properties (``started``) gradual: brokers can add the ``@property`` when
    ready, and health checks pick it up automatically. When only the private
    attribute exists, a ``DeprecationWarning`` is emitted once per process
    per (public, private) pair so callers know to migrate.

    Args:
        broker: Object to inspect.
        public: Name of the preferred (public) attribute.
        private: Name of the legacy (private) attribute.
        default: Value returned when neither attribute exists.

    Returns:
        The public attribute's value if it exists (even when falsy), else the
        private attribute's value, else *default*.
    """
    value = getattr(broker, public, _MISSING)
    if value is not _MISSING:
        return value

    value = getattr(broker, private, _MISSING)
    if value is _MISSING:
        return default

    pair = (public, private)
    if pair not in _FALLBACK_WARNED:
        # Record the pair before warning so an "error" warnings filter cannot
        # make every later health check raise as well.
        _FALLBACK_WARNED.add(pair)
        warnings.warn(
            f"Broker {type(broker).__name__} does not expose the typed "
            f"property {public!r}; falling back to private attribute "
            f"{private!r}. Implement {public!r} on the broker to silence this.",
            DeprecationWarning,
            stacklevel=3,
        )
    return value
94
+
95
+
96
def _get_started(broker: Any) -> bool:
    """Report whether *broker* considers itself started.

    Reads ``started`` (or the legacy ``_started``) through :func:`_get` and
    coerces the value to ``bool``; an absent attribute counts as not started.
    """
    started_flag = _get(broker, "started", "_started", False)
    return bool(started_flag)
98
+
99
+
100
def _get_connected(broker: Any) -> bool:
    """Check transport connectivity via the typed property or private attr.

    Resolution order:

    1. ``broker.connected`` / ``broker._connected`` (through :func:`_get`).
    2. The transport's ``is_connected`` — taken from ``broker.transport``
       and, when that is absent or ``None``, from ``broker._transport``.

    ``is_connected`` may be either a zero-arg callable or a plain
    bool/property; both forms are accepted so that a property-style
    transport does not make the health check raise ``TypeError``.

    Returns:
        ``True`` only when one of the sources above reports a truthy value;
        ``False`` when no source is available.
    """
    connected = _get(broker, "connected", "_connected", _MISSING)
    if connected is not _MISSING:
        return bool(connected)

    # Fallback: probe the transport directly, public name first.
    transport = getattr(broker, "transport", None)
    if transport is None:
        transport = getattr(broker, "_transport", None)
    if transport is None:
        return False

    probe = getattr(transport, "is_connected", False)
    if callable(probe):
        probe = probe()
    return bool(probe)
110
+
111
+
112
def _get_transport(broker: Any) -> Any:
    """Locate the broker's transport object.

    The public ``transport`` attribute wins whenever it exists (whatever its
    value, including ``None``); otherwise the private ``_transport``
    attribute is returned, or ``None`` when that is absent too.
    """
    try:
        return broker.transport
    except AttributeError:
        pass
    return getattr(broker, "_transport", None)
118
+
119
+
120
def _get_consumer_count(broker: Any, transport: Any) -> int:
    """Count routes with an active consumer, cross-checked against transport liveness.

    A count exposed by the broker (``consumer_count`` / ``_consumer_count``)
    is returned as-is. Otherwise the count is derived from ``broker.routes``
    by counting routes with a truthy ``consumer_tag``.
    """
    reported = _get(broker, "consumer_count", "_consumer_count", _MISSING)
    if reported is not _MISSING:
        return int(reported)

    # The broker exposes no count: derive one from its route list.
    tagged = 0
    for route in getattr(broker, "routes", []):
        if getattr(route, "consumer_tag", None):
            tagged += 1

    # M-SRE3: consumer_tag is set at registration time and may remain set
    # even after the channel dies, so registered consumers only count while
    # the transport still reports its consumers as alive.
    if transport is None or _transport_consumers_alive(transport):
        return tagged
    return 0
135
+
136
+
137
def _get_route_count(broker: Any) -> int:
    """Return the number of routes registered on *broker*.

    Prefers ``route_count`` / ``_route_count``; when neither exists, falls
    back to the length of ``broker.routes`` (an absent list counts as empty).
    """
    reported = _get(broker, "route_count", "_route_count", _MISSING)
    if reported is _MISSING:
        return len(getattr(broker, "routes", []))
    return int(reported)
142
+
143
+
144
def _get_worker_pool_pending(broker: Any) -> int:
    """Return the number of items pending in the broker's worker pool.

    Prefers ``worker_pool_pending`` / ``_worker_pool_pending``; otherwise
    reads ``pending_count`` from the private ``_worker_pool`` attribute.
    Returns 0 when no pool (or no count) is available.
    """
    reported = _get(broker, "worker_pool_pending", "_worker_pool_pending", _MISSING)
    if reported is not _MISSING:
        return int(reported)

    # Fallback: probe the private worker pool attribute.
    worker_pool = getattr(broker, "_worker_pool", None)
    if worker_pool is None:
        return 0
    return int(getattr(worker_pool, "pending_count", 0))
153
+
154
+
155
def _get_is_blocked(broker: Any, transport: Any) -> bool:
    """L15: True if the connection is currently blocked by broker-side flow
    control (e.g. a memory/disk alarm). This is orthogonal to
    ``connected`` -- an open connection can be blocked while still
    reporting connected, since ``connection.blocked`` is a soft
    publish-side flow-control notification, not a disconnect.

    Checks the opt-in ``FlowController`` first
    (``broker.flow_controller.is_blocked``); falls back to the transport's
    own passive blocked-state tracking so this is visible even without an
    explicit ``FlowController`` wired in.

    ``is_blocked`` may be a plain bool/property or a zero-arg callable. A
    callable is invoked rather than truth-tested, because a bound method is
    always truthy and would otherwise read as "permanently blocked".

    Args:
        broker: Broker whose optional ``flow_controller`` is consulted first.
        transport: Transport to consult next; may be ``None``.

    Returns:
        The first available ``is_blocked`` reading coerced to ``bool``, or
        ``False`` when neither source exposes ``is_blocked``.
    """
    for source in (getattr(broker, "flow_controller", None), transport):
        if source is None:
            continue
        try:
            flag = source.is_blocked
        except AttributeError:
            # This source does not track blocked state; try the next one.
            continue
        if callable(flag):
            flag = flag()
        # The first source that exposes ``is_blocked`` is authoritative, even
        # when it reports "not blocked".
        return bool(flag)
    return False
177
+
178
+
179
def _get_last_heartbeat(broker: Any) -> float | None:
    """Return the broker's last recorded heartbeat, if any.

    Reads ``last_heartbeat`` (or the legacy ``_last_heartbeat``) through
    :func:`_get`. ``None`` — absent or explicitly unset — is passed through;
    any other value is converted to ``float``.
    """
    raw = _get(broker, "last_heartbeat", "_last_heartbeat", None)
    return None if raw is None else float(raw)
184
+
185
+
186
def _local_broker_health_check(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
) -> BrokerHealthResult:
    """Process-local health check — see :func:`broker_health_check`.

    Only inspects *this process's* view of the broker (its own connection,
    consumers, worker pool). It cannot detect a network partition in which
    this process still holds a live connection to one node while the rest of
    the cluster is unreachable; that needs an independent signal, which
    :func:`broker_health_check`'s optional ``management_client`` provides.

    Decision order:

    1. not started            -> UNHEALTHY
    2. not connected          -> UNHEALTHY
    3. connection blocked     -> DEGRADED
    4. fewer consumers than routes -> DEGRADED
    5. worker-pool backlog above ``config.pending_threshold`` -> DEGRADED
    6. otherwise              -> HEALTHY
    """
    cfg = config or HealthCheckConfig()

    # Gate 1: the broker must have been started.
    if not _get_started(broker):
        return BrokerHealthResult(
            status=HealthStatus.UNHEALTHY,
            started=False,
            details={"reason": "broker not started"},
        )

    # Gate 2: the transport must be connected.
    transport = _get_transport(broker)
    connected = _get_connected(broker)
    if not connected:
        return BrokerHealthResult(
            status=HealthStatus.UNHEALTHY,
            started=True,
            connected=False,
            details={"reason": "transport not connected"},
        )

    # L15: a connection can be blocked (broker memory/disk alarm pausing
    # publishes) while still reporting connected. It is read before the
    # route/consumer figures because it is the more actionable root cause.
    blocked = _get_is_blocked(broker, transport)
    route_count = _get_route_count(broker)
    consumer_count = _get_consumer_count(broker, transport)
    worker_pool_pending = _get_worker_pool_pending(broker)

    # The first matching condition supplies the reason; no reason means healthy.
    reason: str | None = None
    if blocked:
        reason = "connection blocked by broker flow control (memory/disk alarm); publishes will stall"
    elif consumer_count < route_count:
        reason = f"only {consumer_count}/{route_count} consumers active"
    elif worker_pool_pending > cfg.pending_threshold:
        reason = f"worker pool backlog: {worker_pool_pending}"

    details: dict[str, Any]
    if reason is None:
        status = HealthStatus.HEALTHY
        details = {}
    else:
        status = HealthStatus.DEGRADED
        details = {"reason": reason}

    return BrokerHealthResult(
        status=status,
        started=True,
        connected=connected,
        consumer_count=consumer_count,
        route_count=route_count,
        worker_pool_pending=worker_pool_pending,
        blocked=blocked,
        details=details,
    )
258
+
259
+
260
def _apply_management_check(result: BrokerHealthResult, ok: bool) -> BrokerHealthResult:
    """Fold the management-API verdict *ok* into a process-local *result*.

    A passing check, or a result that is already UNHEALTHY, is returned
    unchanged. Otherwise a copy is returned with status DEGRADED and a
    ``"management_check"`` entry added to its details.
    """
    if ok:
        return result
    if result.status == HealthStatus.UNHEALTHY:
        # The local verdict is already the worst one; never soften it.
        return result

    explanation = (
        "failed — this process has a live broker connection, but the "
        "management API reports the node unhealthy (possible cluster partition)"
    )
    merged_details = dict(result.details)
    merged_details["management_check"] = explanation
    return replace(result, status=HealthStatus.DEGRADED, details=merged_details)
272
+
273
+
274
def broker_health_check(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
    management_client: Any = None,
) -> BrokerHealthResult:
    """Check broker health status (sync).

    Args:
        broker: A broker implementing :class:`HealthProvider` (typed
            properties) or a legacy broker exposing private attributes
            (``_started``, ``_transport``, ``_worker_pool``).
        config: Optional :class:`HealthCheckConfig` to tune thresholds.
        management_client: Optional
            :class:`~rabbitkit.management.RabbitManagementClient` offering a
            sync ``.health_check()``. Its verdict is folded in as an extra
            signal, because this process may hold a perfectly live
            connection to one node while the rest of a partitioned cluster
            is unreachable — something the process-local checks cannot see.
            A failing management check downgrades an otherwise-HEALTHY
            result to DEGRADED and never overrides an UNHEALTHY local
            result. The client is not called at all when the local result is
            already UNHEALTHY. Omit it for process-local-only behavior.

    Returns:
        BrokerHealthResult with status HEALTHY (started, connected, not
        blocked, all consumers active), DEGRADED (started but connection
        blocked (L15), consumers missing, pool backlog high, or — with
        ``management_client`` — the management API reports the node
        unhealthy), or UNHEALTHY (not started or not connected).
    """
    local = _local_broker_health_check(broker, config=config)
    if management_client is None or local.status == HealthStatus.UNHEALTHY:
        return local
    return _apply_management_check(local, management_client.health_check())
306
+
307
+
308
async def broker_health_check_async(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
    management_client: Any = None,
) -> BrokerHealthResult:
    """Async variant of :func:`broker_health_check`.

    The process-local part is the same synchronous logic as the sync
    variant. Only the management check differs: ``management_client`` (if
    given) must expose an awaitable ``.health_check_async()``, which is
    awaited here. See :func:`broker_health_check` for the rationale and for
    how its verdict is merged into the result.
    """
    local = _local_broker_health_check(broker, config=config)
    if management_client is None or local.status == HealthStatus.UNHEALTHY:
        return local
    management_ok = await management_client.health_check_async()
    return _apply_management_check(local, management_ok)
323
+
324
+
325
+ # Transport contract (I-5): a transport MAY expose any of these optional
326
+ # attributes/properties to advertise live consumer state. Each is checked
327
+ # independently; if any present one reports False, registered consumer_tags
328
+ # are treated as stale. When NONE of these exist on the transport, we cannot
329
+ # prove the channels are dead, so we fall back to trusting the registered
330
+ # consumer_tag (current/backwards-compatible behaviour).
331
+ #
332
+ # has_open_channels -> bool | () -> bool (e.g. SyncTransport exposes this)
333
+ # is_consuming -> bool | () -> bool
334
+ # consumers_active -> bool | () -> bool
335
_TRANSPORT_LIVENESS_ATTRS: tuple[str, ...] = (
    "has_open_channels",
    "is_consuming",
    "consumers_active",
)


def _transport_consumers_alive(transport: Any) -> bool:
    """Best-effort check that the transport still has live consumers.

    Each name in :data:`_TRANSPORT_LIVENESS_ATTRS` is probed independently
    and may be a plain bool or a zero-arg callable returning bool. As soon
    as one present attribute reports exactly ``False``, the consumer
    channels are considered dead and ``False`` is returned.

    ``True`` is returned in every other case: when no attribute reported
    ``False``, and also when none of the attributes exist at all — without a
    signal the channels cannot be proven dead, so the registered
    ``consumer_tag`` is trusted (backwards-compatible behaviour). A probe
    that raises is logged at debug level and ignored.
    """
    for name in _TRANSPORT_LIVENESS_ATTRS:
        try:
            probe = getattr(transport, name)
        except AttributeError:
            # The transport does not advertise this signal.
            continue

        if not callable(probe):
            reading = probe
        else:
            try:
                reading = probe()
            except Exception:
                logger.debug("transport %s check raised", name, exc_info=True)
                continue

        # Identity test: only a literal False marks the consumers as dead.
        if reading is False:
            return False
    return True
378
+
379
+
380
def mark_heartbeat(broker: Any) -> None:
    """Record a liveness heartbeat on *broker*.

    Brokers/transports should call this from their consume callback / I/O loop
    (or any other "I made forward progress" signal) so :func:`broker_liveness`
    can detect a wedged broker whose process is alive but is no longer
    draining the network. Sets ``broker.last_heartbeat`` to the current
    ``time.monotonic()`` value.

    Safe to call when the broker does not yet expose ``last_heartbeat`` — it
    simply sets the attribute. When ``last_heartbeat`` cannot be assigned
    (a read-only property, or a class whose ``__slots__`` omit it), the
    timestamp is stored on the private ``_last_heartbeat`` attribute
    instead, which the liveness check also knows how to read.

    Raises:
        AttributeError: If neither ``last_heartbeat`` nor ``_last_heartbeat``
            can be assigned on *broker*.
    """
    now = time.monotonic()
    try:
        broker.last_heartbeat = now
    except AttributeError:
        # The public name is not assignable; fall back to the private one.
        broker._last_heartbeat = now
393
+
394
+
395
def broker_liveness(broker: HealthProvider | Any, wedged_timeout: float = 60.0) -> bool:
    """Liveness probe — is the broker process alive and not hard-wedged?

    Args:
        broker: A broker implementing :class:`HealthProvider` (typed
            properties) or a legacy broker exposing private attributes
            (``_started``, ``_wedged``, ``last_heartbeat``).
        wedged_timeout: Seconds without a heartbeat before liveness fails.

    Returns:
        ``False`` when any of the following holds, ``True`` otherwise:

        - the broker is not started (``started``/``_started`` falsy or absent);
        - a ``wedged``/``_wedged`` flag is truthy (brokers/transports may set
          it on a hard fault);
        - a heartbeat recorded via :func:`mark_heartbeat` is older than
          *wedged_timeout* seconds on the ``time.monotonic()`` clock.

    A transient broker/transport disconnect is *not* a liveness failure: the
    process is still running and can recover. Use :func:`broker_readiness` to
    decide whether to route traffic.

    With no heartbeat recorded, only the started/wedged checks apply
    (backwards compatible).
    """
    if not _get_started(broker):
        return False
    if _get(broker, "wedged", "_wedged", False):
        return False

    heartbeat = _get_last_heartbeat(broker)
    if heartbeat is None:
        # Nothing recorded, so there is no staleness to judge.
        return True

    age = time.monotonic() - heartbeat
    return not (age > wedged_timeout)
430
+
431
+
432
def broker_readiness(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
    management_client: Any = None,
) -> bool:
    """Readiness probe — is the broker ready to serve traffic right now?

    Args:
        broker: A broker implementing :class:`HealthProvider` (typed
            properties) or a legacy broker exposing private attributes.
        config: Optional :class:`HealthCheckConfig` to tune thresholds.
        management_client: Optional :class:`~rabbitkit.management.RabbitManagementClient`
            — see :func:`broker_health_check`. A failing management check
            fails readiness even if this process's own connection looks fine.

    Returns:
        ``True`` only when the health check is not UNHEALTHY, the transport
        is connected, the connection is not blocked by broker flow control
        (L15), no management-check failure was recorded, and there are at
        least as many active consumers as registered routes.

    Use this for load-balancer / ingress gating; use :func:`broker_liveness`
    for restart decisions.
    """
    result = broker_health_check(broker, config=config, management_client=management_client)
    if result.status == HealthStatus.UNHEALTHY:
        return False
    if not result.connected:
        return False
    # L15: a blocked connection can't publish -- not ready for traffic even
    # though it's technically still "connected" and may still have live
    # consumers.
    if result.blocked:
        return False
    # A failing management check downgrades to DEGRADED rather than
    # UNHEALTHY (this process's own connection may be fine), but a
    # partitioned/unreachable node is still not ready for traffic.
    if "management_check" in result.details:
        return False
    # M-SRE3: every route must have a live consumer. ``>=`` rather than
    # ``==`` mirrors the health check, which only degrades on
    # ``consumer_count < route_count``; a broker-supplied count above the
    # route count must not read as "not ready".
    return result.consumer_count >= result.route_count
470
+
471
+
472
async def broker_liveness_async(broker: HealthProvider | Any, wedged_timeout: float = 60.0) -> bool:
    """Async variant of :func:`broker_liveness`.

    Delegates to the synchronous probe and returns its verdict unchanged.
    """
    alive = broker_liveness(broker, wedged_timeout=wedged_timeout)
    return alive
475
+
476
+
477
async def broker_readiness_async(
    broker: HealthProvider | Any,
    config: HealthCheckConfig | None = None,
    management_client: Any = None,
) -> bool:
    """Async variant of :func:`broker_readiness`.

    Uses :func:`broker_health_check_async` (``management_client.health_check_async()``)
    rather than delegating to the sync ``broker_readiness`` — the sync
    management check makes a blocking network call, which must not run on
    the event loop.

    Returns:
        ``True`` only when the health check is not UNHEALTHY, the transport
        is connected, the connection is not blocked, no management-check
        failure was recorded, and there are at least as many active
        consumers as registered routes.
    """
    result = await broker_health_check_async(broker, config=config, management_client=management_client)
    if result.status == HealthStatus.UNHEALTHY:
        return False
    if not result.connected:
        return False
    # A blocked connection cannot publish, so it is not ready for traffic.
    if result.blocked:
        return False
    # The management API reported the node unhealthy.
    if "management_check" in result.details:
        return False
    # ``>=`` rather than ``==`` mirrors the health check, which only degrades
    # on ``consumer_count < route_count``.
    return result.consumer_count >= result.route_count
499
+
500
+
501
+ # ── Health-transition watcher ────────────────────────────────────────────
502
+
503
+
504
class HealthWatcher:
    """Opt-in push-style health notifications (sync, daemon-thread poller).

    Polls :func:`broker_health_check` every *interval* seconds and fires
    ``on_change(old, new, result)`` when the status transitions -- but only
    after *debounce* consecutive identical readings, so a single flapping
    poll never pages anyone. Callback exceptions are logged, never raised,
    and never stall the loop. On the first confirmed status, ``old`` is
    ``None``.

    Positioning: for deployments that aren't (only) Kubernetes -- bare
    metal, VMs, direct pager/webhook wiring. On k8s, keep
    :func:`broker_liveness`/:func:`broker_readiness` probes as the primary
    signal; this watcher complements, never replaces, them.

    When *collector* is given (any ``MetricsCollector`` with ``set_gauge``),
    every poll also emits a ``rabbitkit_health_state`` gauge
    (0 healthy / 1 degraded / 2 unhealthy), so Prometheus users get a state
    series without writing a callback.

    Timing: the poller thread waits one full *interval* before its first
    poll. No clock or sleeper can be injected; to exercise the debounce
    logic without sleeping, feed results directly to :meth:`_apply`.

    Raises:
        ValueError: If *interval* is not > 0 or *debounce* is < 1.
    """

    # Gauge value emitted for each status.
    _GAUGE_VALUES: typing.ClassVar[dict[HealthStatus, int]] = {
        HealthStatus.HEALTHY: 0,
        HealthStatus.DEGRADED: 1,
        HealthStatus.UNHEALTHY: 2,
    }

    def __init__(
        self,
        broker: HealthProvider | Any,
        *,
        interval: float = 10.0,
        on_change: Any = None,
        management_client: Any = None,
        config: HealthCheckConfig | None = None,
        debounce: int = 2,
        collector: Any = None,
        gauge_name: str = "rabbitkit_health_state",
    ) -> None:
        if interval <= 0:
            raise ValueError(f"HealthWatcher interval must be > 0, got {interval}")
        if debounce < 1:
            raise ValueError(f"HealthWatcher debounce must be >= 1, got {debounce}")
        self._broker = broker
        self._interval = interval
        self._on_change = on_change
        self._management_client = management_client
        self._config = config
        self._debounce = debounce
        self._collector = collector
        self._gauge_name = gauge_name

        self._current: HealthStatus | None = None  # last CONFIRMED status
        self._candidate: HealthStatus | None = None  # status awaiting confirmation
        self._candidate_count = 0  # consecutive readings of the candidate
        self._thread: Any = None
        self._stop_event: Any = None

    @property
    def current_status(self) -> HealthStatus | None:
        """Last debounce-confirmed status (None until the first confirmation)."""
        return self._current

    def _tick(self) -> None:
        """One poll: read health, then run the shared debounced state machine."""
        result = broker_health_check(
            self._broker, config=self._config, management_client=self._management_client
        )
        self._apply(result)

    def _apply(self, result: BrokerHealthResult) -> None:
        """Debounced state machine on an already-obtained result (shared with
        the async variant).

        Emits the gauge (when a collector is configured), then confirms a
        status change only once the same new status has been observed
        ``debounce`` times in a row, at which point ``on_change`` fires.
        """
        if self._collector is not None:
            try:
                self._collector.set_gauge(self._gauge_name, {}, self._GAUGE_VALUES[result.status])
            except Exception:  # pragma: no cover — collector bugs never stall the loop
                logger.exception("HealthWatcher gauge emission raised")

        status = result.status
        if status == self._current:
            # Confirmed state re-observed; reset any half-built candidate.
            self._candidate = None
            self._candidate_count = 0
            return
        if status != self._candidate:
            # A different new status restarts the confirmation count.
            self._candidate = status
            self._candidate_count = 0
        self._candidate_count += 1
        if self._candidate_count < self._debounce:
            return
        old, self._current = self._current, status
        self._candidate = None
        self._candidate_count = 0
        if self._on_change is not None:
            try:
                self._on_change(old, status, result)
            except Exception:
                logger.exception("HealthWatcher on_change callback raised")

    def start(self) -> None:
        """Start the daemon poller thread. Idempotent."""
        import threading

        if self._thread is not None and self._thread.is_alive():
            return
        self._stop_event = threading.Event()
        # Bind this run's event locally so a later start() cannot swap the
        # event out from under an already-running loop.
        stop = self._stop_event

        def _loop() -> None:
            # wait() returns True once the event is set, which ends the loop;
            # a timeout (False) means it is time for the next poll.
            while not stop.wait(timeout=self._interval):
                try:
                    self._tick()
                except Exception:  # pragma: no cover — defensive; _tick guards itself
                    logger.exception("HealthWatcher tick raised")

        self._thread = threading.Thread(target=_loop, name="rabbitkit-health-watcher", daemon=True)
        self._thread.start()

    def stop(self, timeout: float = 5.0) -> None:
        """Stop the poller (bounded join). Idempotent.

        Safe to call from an ``on_change`` callback: when invoked on the
        poller thread itself the join is skipped (a thread cannot join
        itself) and the loop exits after the current poll.
        """
        import threading

        if self._stop_event is not None:
            self._stop_event.set()
        thread = self._thread
        if thread is not None:
            if thread is not threading.current_thread():
                thread.join(timeout=timeout)
            self._thread = None
632
+
633
+
634
class AsyncHealthWatcher(HealthWatcher):
    """Async variant of :class:`HealthWatcher`.

    Polling is done by awaiting :meth:`run` (typically wrapped in an asyncio
    task) instead of by a thread, and the management check (if any) is
    awaited via :func:`broker_health_check_async` so it never blocks the
    event loop. The debounce state machine is the inherited ``_apply``.
    """

    async def _tick_async(self) -> None:
        """One async poll: obtain a result, then run the debounced state machine."""
        outcome = await broker_health_check_async(
            self._broker,
            config=self._config,
            management_client=self._management_client,
        )
        self._apply(outcome)

    async def run(self) -> None:
        """Poll forever (cancel the task to stop).

        Sleeps one interval before each poll, the first one included.
        Exceptions raised by a poll are logged and the loop keeps going.
        """
        import asyncio

        while True:
            await asyncio.sleep(self._interval)
            try:
                await self._tick_async()
            except Exception:  # pragma: no cover — defensive; _tick guards itself
                logger.exception("HealthWatcher tick raised")
@@ -0,0 +1,10 @@
1
+ """High-load infrastructure module — backpressure, batch publish/ack."""
2
+
3
+ from rabbitkit.highload.backpressure import FlowController
4
+ from rabbitkit.highload.batch import BatchAcker, BatchPublisher
5
+
6
+ __all__ = [
7
+ "BatchAcker",
8
+ "BatchPublisher",
9
+ "FlowController",
10
+ ]