messagefoundry 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. messagefoundry/__init__.py +108 -0
  2. messagefoundry/__main__.py +1155 -0
  3. messagefoundry/api/__init__.py +27 -0
  4. messagefoundry/api/app.py +1581 -0
  5. messagefoundry/api/approvals.py +184 -0
  6. messagefoundry/api/auth_models.py +211 -0
  7. messagefoundry/api/auth_routes.py +655 -0
  8. messagefoundry/api/field_authz.py +96 -0
  9. messagefoundry/api/models.py +374 -0
  10. messagefoundry/api/security.py +247 -0
  11. messagefoundry/api/tls.py +47 -0
  12. messagefoundry/auth/__init__.py +39 -0
  13. messagefoundry/auth/data/common_passwords.NOTICE +13 -0
  14. messagefoundry/auth/data/common_passwords.txt +10000 -0
  15. messagefoundry/auth/identity.py +71 -0
  16. messagefoundry/auth/ldap.py +264 -0
  17. messagefoundry/auth/notifications.py +68 -0
  18. messagefoundry/auth/passwords.py +53 -0
  19. messagefoundry/auth/permissions.py +120 -0
  20. messagefoundry/auth/policy.py +153 -0
  21. messagefoundry/auth/ratelimit.py +55 -0
  22. messagefoundry/auth/service.py +1323 -0
  23. messagefoundry/auth/tokens.py +26 -0
  24. messagefoundry/auth/totp.py +174 -0
  25. messagefoundry/checks.py +174 -0
  26. messagefoundry/config/__init__.py +30 -0
  27. messagefoundry/config/active_environment.py +80 -0
  28. messagefoundry/config/ai_policy.py +140 -0
  29. messagefoundry/config/code_sets.py +260 -0
  30. messagefoundry/config/connections_edit.py +200 -0
  31. messagefoundry/config/connections_file.py +287 -0
  32. messagefoundry/config/db_lookup.py +117 -0
  33. messagefoundry/config/environments.py +116 -0
  34. messagefoundry/config/ingest_time.py +83 -0
  35. messagefoundry/config/models.py +240 -0
  36. messagefoundry/config/reference.py +158 -0
  37. messagefoundry/config/response.py +83 -0
  38. messagefoundry/config/run_context.py +153 -0
  39. messagefoundry/config/settings.py +1311 -0
  40. messagefoundry/config/state.py +99 -0
  41. messagefoundry/config/tls_policy.py +110 -0
  42. messagefoundry/config/wiring.py +1918 -0
  43. messagefoundry/console/__init__.py +20 -0
  44. messagefoundry/console/__main__.py +274 -0
  45. messagefoundry/console/_async.py +107 -0
  46. messagefoundry/console/change_password.py +111 -0
  47. messagefoundry/console/client.py +552 -0
  48. messagefoundry/console/connections.py +324 -0
  49. messagefoundry/console/login.py +107 -0
  50. messagefoundry/console/mfa.py +205 -0
  51. messagefoundry/console/reauth.py +94 -0
  52. messagefoundry/console/search.py +57 -0
  53. messagefoundry/console/service_control.py +137 -0
  54. messagefoundry/console/sessions.py +122 -0
  55. messagefoundry/console/shell.py +410 -0
  56. messagefoundry/console/status.py +377 -0
  57. messagefoundry/console/users_page.py +282 -0
  58. messagefoundry/console/widgets.py +553 -0
  59. messagefoundry/generators/README.md +27 -0
  60. messagefoundry/generators/__init__.py +15 -0
  61. messagefoundry/generators/_core.py +589 -0
  62. messagefoundry/generators/_hl7data.py +428 -0
  63. messagefoundry/generators/adt.py +286 -0
  64. messagefoundry/generators/all_types.py +24 -0
  65. messagefoundry/generators/bar.py +28 -0
  66. messagefoundry/generators/dft.py +20 -0
  67. messagefoundry/generators/mdm.py +39 -0
  68. messagefoundry/generators/mfn.py +46 -0
  69. messagefoundry/generators/oml.py +32 -0
  70. messagefoundry/generators/orl.py +30 -0
  71. messagefoundry/generators/orm.py +23 -0
  72. messagefoundry/generators/oru.py +21 -0
  73. messagefoundry/generators/ras.py +20 -0
  74. messagefoundry/generators/rde.py +54 -0
  75. messagefoundry/generators/siu.py +64 -0
  76. messagefoundry/generators/vxu.py +20 -0
  77. messagefoundry/hl7schema.py +75 -0
  78. messagefoundry/last_resort.py +55 -0
  79. messagefoundry/logging_setup.py +332 -0
  80. messagefoundry/parsing/__init__.py +64 -0
  81. messagefoundry/parsing/consistency.py +166 -0
  82. messagefoundry/parsing/groups.py +228 -0
  83. messagefoundry/parsing/message.py +453 -0
  84. messagefoundry/parsing/peek.py +237 -0
  85. messagefoundry/parsing/split.py +120 -0
  86. messagefoundry/parsing/summary.py +46 -0
  87. messagefoundry/parsing/tree.py +128 -0
  88. messagefoundry/parsing/validate.py +95 -0
  89. messagefoundry/parsing/x12/__init__.py +46 -0
  90. messagefoundry/parsing/x12/delimiters.py +140 -0
  91. messagefoundry/parsing/x12/errors.py +30 -0
  92. messagefoundry/parsing/x12/interchange.py +232 -0
  93. messagefoundry/parsing/x12/message.py +200 -0
  94. messagefoundry/parsing/x12/peek.py +207 -0
  95. messagefoundry/pipeline/__init__.py +21 -0
  96. messagefoundry/pipeline/alert_sinks.py +486 -0
  97. messagefoundry/pipeline/alerts.py +100 -0
  98. messagefoundry/pipeline/cert_expiry.py +219 -0
  99. messagefoundry/pipeline/cluster.py +955 -0
  100. messagefoundry/pipeline/cluster_sqlserver.py +444 -0
  101. messagefoundry/pipeline/config_convergence.py +137 -0
  102. messagefoundry/pipeline/dryrun.py +450 -0
  103. messagefoundry/pipeline/engine.py +756 -0
  104. messagefoundry/pipeline/leader_tasks.py +158 -0
  105. messagefoundry/pipeline/reference_sync.py +369 -0
  106. messagefoundry/pipeline/retention.py +289 -0
  107. messagefoundry/pipeline/security_notify.py +168 -0
  108. messagefoundry/pipeline/state_convergence.py +143 -0
  109. messagefoundry/pipeline/wiring_runner.py +1722 -0
  110. messagefoundry/py.typed +0 -0
  111. messagefoundry/redaction.py +71 -0
  112. messagefoundry/scaffold.py +321 -0
  113. messagefoundry/secrets_dpapi.py +129 -0
  114. messagefoundry/store/__init__.py +46 -0
  115. messagefoundry/store/audit_tee.py +67 -0
  116. messagefoundry/store/base.py +758 -0
  117. messagefoundry/store/crypto.py +166 -0
  118. messagefoundry/store/keyprovider.py +192 -0
  119. messagefoundry/store/postgres.py +3447 -0
  120. messagefoundry/store/sqlserver.py +3014 -0
  121. messagefoundry/store/store.py +3790 -0
  122. messagefoundry/timezone.py +207 -0
  123. messagefoundry/transports/__init__.py +50 -0
  124. messagefoundry/transports/base.py +269 -0
  125. messagefoundry/transports/database.py +693 -0
  126. messagefoundry/transports/file.py +551 -0
  127. messagefoundry/transports/framing.py +164 -0
  128. messagefoundry/transports/loopback.py +53 -0
  129. messagefoundry/transports/mllp.py +644 -0
  130. messagefoundry/transports/remotefile.py +664 -0
  131. messagefoundry/transports/rest.py +281 -0
  132. messagefoundry/transports/signing.py +321 -0
  133. messagefoundry/transports/soap.py +507 -0
  134. messagefoundry/transports/tcp.py +307 -0
  135. messagefoundry/transports/timer.py +146 -0
  136. messagefoundry/transports/x12.py +323 -0
  137. messagefoundry-0.1.0.dist-info/METADATA +212 -0
  138. messagefoundry-0.1.0.dist-info/RECORD +142 -0
  139. messagefoundry-0.1.0.dist-info/WHEEL +4 -0
  140. messagefoundry-0.1.0.dist-info/entry_points.txt +2 -0
  141. messagefoundry-0.1.0.dist-info/licenses/LICENSE +662 -0
  142. messagefoundry-0.1.0.dist-info/licenses/NOTICE +27 -0
@@ -0,0 +1,756 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 MessageFoundry Organization and contributors
3
+ """The engine: owns the store and supervises the code-first :class:`RegistryRunner`.
4
+
5
+ This is the object the API layer (and tests) drive. It opens the durable store, recovers
6
+ any deliveries left ``inflight`` by a previous crash, and runs the wired Connection/Router/
7
+ Handler graph.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import asyncio
13
+ import logging
14
+ import time
15
+ from collections.abc import Callable, Mapping, Sequence
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ from messagefoundry.config.models import (
20
+ AckAfter,
21
+ BuildupThreshold,
22
+ InternalErrorPolicy,
23
+ OrderingMode,
24
+ RetryPolicy,
25
+ )
26
+ from messagefoundry.config.settings import (
27
+ CertMonitorSettings,
28
+ ClusterSettings,
29
+ EgressSettings,
30
+ ReferenceSettings,
31
+ RetentionSettings,
32
+ ShadowSettings,
33
+ )
34
+ from messagefoundry.config.wiring import Registry, WiringError, load_config
35
+ from messagefoundry.pipeline.alerts import AlertSink
36
+ from messagefoundry.pipeline.cert_expiry import CertExpiryRunner, MonitoredCert, certs_from_registry
37
+ from messagefoundry.pipeline.cluster import ClusterCoordinator, NullCoordinator
38
+ from messagefoundry.pipeline.config_convergence import ConfigConvergenceRunner
39
+ from messagefoundry.pipeline.leader_tasks import LeaderMaintenanceRunner
40
+ from messagefoundry.pipeline.reference_sync import ReferenceSyncRunner
41
+ from messagefoundry.pipeline.retention import RetentionRunner
42
+ from messagefoundry.pipeline.state_convergence import StateConvergenceRunner
43
+ from messagefoundry.pipeline.wiring_runner import RegistryRunner
44
+ from messagefoundry.store import MessageStore, Store
45
+
46
+ __all__ = ["Engine", "ConfigReloadDenied"]
47
+
48
+ log = logging.getLogger(__name__)
49
+
50
+
51
class ConfigReloadDenied(Exception):
    """Raised when a ``/config/reload`` target falls outside the permitted reload roots.

    This is the remote-code-execution guard for reloads: the config loader executes Python found
    in the target directory, so the only acceptable sources are the server's startup ``--config``
    directory and any explicitly configured ``config_reload_roots`` entry. A client-supplied path
    that resolves anywhere else is refused. The API layer translates this error into a 403.
    """
57
+
58
+
59
+ def _within(path: Path, root: Path) -> bool:
60
+ """True if ``path`` is ``root`` itself or nested under it (both already resolved)."""
61
+ return path == root or root in path.parents
62
+
63
+
64
+ class Engine:
65
def __init__(
    self,
    store: Store,
    *,
    poll_interval: float = 0.25,
    max_correlation_depth: int = 8,
    config_dir: str | Path | None = None,
    config_reload_roots: Sequence[str | Path] = (),
    inbound_bind_host: str = "127.0.0.1",
    allow_insecure_bind: bool = False,
    delivery_defaults: RetryPolicy | None = None,
    ordering_default: OrderingMode | None = None,
    internal_error_default: InternalErrorPolicy | None = None,
    buildup_default: BuildupThreshold | None = None,
    ack_after_default: AckAfter | None = None,
    alert_sink: AlertSink | None = None,
    retention_settings: RetentionSettings | None = None,
    cert_monitor_settings: CertMonitorSettings | None = None,
    api_tls_cert_file: str | None = None,
    reference_settings: ReferenceSettings | None = None,
    egress_settings: EgressSettings | None = None,
    shadow_settings: ShadowSettings | None = None,
    active_environment: str | None = None,
    env_values: Mapping[str, Any] | None = None,
    env_values_provider: Callable[[], Mapping[str, Any]] | None = None,
    coordinator: ClusterCoordinator | None = None,
    cluster_settings: ClusterSettings | None = None,
) -> None:
    """Record the engine's collaborators and defaults; no task is started here.

    Everything after ``store`` is keyword-only. Most values are simply remembered and later
    handed to the runners this engine builds. The only work done eagerly is: falling back to a
    ``NullCoordinator`` / default ``ClusterSettings`` / default ``ShadowSettings`` when those
    are not supplied, invoking ``env_values_provider`` once (when given) to seed the environment
    values, creating the graph-supervisor event and lock, and resolving the reload roots.
    """
    self.store = store

    # --- cluster coordination (Track B) -----------------------------------------------------
    # No coordinator supplied means the no-op NullCoordinator: is_leader()/owns_lane() are always
    # True and start()/stop() do nothing, so single-node (SQLite and single-node Postgres) stays
    # byte-identical. A DbCoordinator (built by build_coordinator on an enabled [cluster]
    # Postgres store) registers the node, heartbeats and (Step 4) elects a leader; its
    # owns_lane() still reports True until Step 5. Every runner built here receives it.
    self._coordinator: ClusterCoordinator = coordinator if coordinator else NullCoordinator()
    # [cluster] knobs (Step 4). This class only reads reclaim_interval_seconds (the cadence of
    # the leader's lease-reclaim sweep); the remaining knobs feed build_coordinator upstream.
    # Defaults are fine when omitted because the sweep is only spawned for a coordinator that
    # reclaims inflight rows (a DbCoordinator), never for the single-node default.
    self._cluster_settings = cluster_settings if cluster_settings else ClusterSettings()
    self._leader_maintenance: LeaderMaintenanceRunner | None = None
    # Config-reload convergence (Step 6), spawned ONLY when is_clustered() so single-node never
    # pays for it. _applied_config_version is the shared config version this node has applied:
    # seeded at start() from the coordinator's current version (a fresh node must not
    # self-reload) and advanced when this node bumps (operator reload) or converges (follower
    # reload). The bumping node advances it itself, so its own loop sees no change.
    self._config_convergence: ConfigConvergenceRunner | None = None
    self._applied_config_version: int = 0
    # Transform-state read-through convergence (Step 6b), likewise clustered-only. Each tick it
    # read-throughs any namespace a sibling wrote/purged into this node's local _state_cache,
    # off the hot path, so state_get remains a pure sync dict lookup. Its lifecycle mirrors
    # _config_convergence.
    self._state_convergence: StateConvergenceRunner | None = None

    # --- values inherited by every runner ---------------------------------------------------
    # Active environment name ([ai].environment / serve --env): a Handler's
    # current_environment() resolves to it (per-face transform logic).
    self._active_environment = active_environment
    self._poll_interval = poll_interval
    # [pipeline] re-ingress loop-prevention cap (ADR 0013 Increment 2).
    self._max_correlation_depth = max_correlation_depth
    # Destination for operational alerts; None leaves the runner on its default logging sink.
    self._alert_sink = alert_sink

    # --- maintenance tasks ------------------------------------------------------------------
    # [retention] enforcement. None (embedding/tests) means no retention task at all; default
    # settings are equally safe because the runner no-ops when nothing is configured.
    self._retention_settings = retention_settings
    self._retention_runner: RetentionRunner | None = None
    # [cert_monitor] TLS-cert expiry monitor (Q5c). None (embedding/tests) means no monitor
    # task. The watched certs are derived at scan time from the [api] TLS cert plus the wired
    # graph's MLLP certs, read live so a reload that adds/removes a TLS connection is noticed.
    self._cert_monitor_settings = cert_monitor_settings
    self._api_tls_cert_file = api_tls_cert_file
    self._cert_expiry_runner: CertExpiryRunner | None = None
    # [reference] enforcement (ADR 0006). None (embedding/tests) falls back to default settings
    # at use; the sync runner no-ops when the graph declares no reference sets.
    self._reference_settings = reference_settings
    self._reference_runner: ReferenceSyncRunner | None = None

    # --- egress / binding policy ------------------------------------------------------------
    # Fail-closed outbound destination allowlist (WP-11c): given to every runner and to the
    # reload dry-run checker, so a denied destination is refused at start and on reload.
    self._egress_settings = egress_settings
    # [shadow] parallel-run egress suppression (#15): simulate_all_egress reaches every runner
    # so a shadow instance suppresses all delivery. Omitted means the defaults (off).
    self._shadow_settings = shadow_settings if shadow_settings else ShadowSettings()
    # Interface the inbound listeners bind to.
    self._inbound_bind_host = inbound_bind_host
    # The serve --allow-insecure-bind dev escape, consulted by the §0 exposed-gate.
    self._allow_insecure_bind = allow_insecure_bind

    # --- global defaults --------------------------------------------------------------------
    # [delivery] defaults (retry, ordering, internal-error action, buildup thresholds). A
    # connection's own retry=/ordering=/internal_error=/buildup= takes precedence.
    self._delivery_defaults = delivery_defaults
    self._ordering_default = ordering_default
    self._internal_error_default = internal_error_default
    self._buildup_default = buildup_default
    # [inbound] ACK-timing default (ADR 0001).
    self._ack_after_default = ack_after_default

    # --- environment values -----------------------------------------------------------------
    # This instance's environment values (DEV/PROD), shared with every runner so env()
    # references in a reloaded graph resolve against THIS environment, and a missing value is
    # refused on this engine rather than on the box the graph was authored on. The optional
    # provider is re-invoked on each reload so a promote picks up edited values files without a
    # restart (review M-23); without one the values are static (embedding/tests).
    self._env_values_provider = env_values_provider
    if env_values_provider is None:
        seed_values = env_values
    else:
        seed_values = env_values_provider()
    self._env_values: dict[str, Any] = dict(seed_values or {})
    self._registry_runner: RegistryRunner | None = None

    # --- active-passive graph supervisor (Workstream A1) ------------------------------------
    # In CLUSTERED mode the wired graph (listeners + workers) runs ONLY while this node holds
    # leadership: the supervisor task polls leadership and starts/stops the graph on
    # acquire/lose, so a standby stays warm without binding listeners or processing. It is
    # NEVER spawned single-node (NullCoordinator always leads, so start() brings the graph up
    # directly — byte-identical). The lock serializes reconciles; the event stops the loop.
    # NOTE: this gate is NOT the hard guarantee against concurrent double-processing of a row.
    # That is the store's row/lane leases: a standby's reclaim only takes EXPIRED leases, so it
    # can never claim a row the old leader still holds (Track B Step 2/5). The gate only stops a
    # demoted/fenced node from accepting NEW inbound work and initiating NEW processing
    # promptly; the poll interval is bounded (at start()) to keep that stop prompt.
    self._graph_supervisor: asyncio.Task[None] | None = None
    self._graph_stop = asyncio.Event()
    self._graph_lock = asyncio.Lock()
    self._graph_reconcile_interval = 1.0

    # Stamped by start(); the "since" for since-engine-start metric counts.
    self.started_at: float = 0.0

    # --- reload roots -----------------------------------------------------------------------
    # The startup config dir is both the default reload target and an implicit allowed root.
    self.config_dir: Path | None = None
    if config_dir:
        self.config_dir = Path(config_dir).resolve()
    # An insertion-ordered dict de-duplicates while preserving first-seen order: explicit roots
    # first, then the startup dir unless it is already listed.
    allowed: dict[Path, None] = dict.fromkeys(
        Path(entry).resolve() for entry in config_reload_roots
    )
    if self.config_dir is not None:
        allowed.setdefault(self.config_dir)
    # Empty => unconstrained (embedding/tests). The served path always sets config_dir.
    self._reload_roots: tuple[Path, ...] = tuple(allowed)
    # Resolved directory the most recent reload loaded from — kept for audit by the API.
    self.last_reload_dir: Path | None = None
192
+
193
@classmethod
async def create(
    cls,
    db_path: str | Path,
    *,
    poll_interval: float = 0.25,
    max_correlation_depth: int = 8,
    synchronous: str = "NORMAL",
    config_dir: str | Path | None = None,
    config_reload_roots: Sequence[str | Path] = (),
    inbound_bind_host: str = "127.0.0.1",
    allow_insecure_bind: bool = False,
    delivery_defaults: RetryPolicy | None = None,
    ordering_default: OrderingMode | None = None,
    internal_error_default: InternalErrorPolicy | None = None,
    buildup_default: BuildupThreshold | None = None,
    ack_after_default: AckAfter | None = None,
    alert_sink: AlertSink | None = None,
    retention_settings: RetentionSettings | None = None,
    cert_monitor_settings: CertMonitorSettings | None = None,
    api_tls_cert_file: str | None = None,
    reference_settings: ReferenceSettings | None = None,
    egress_settings: EgressSettings | None = None,
    shadow_settings: ShadowSettings | None = None,
    active_environment: str | None = None,
    env_values: Mapping[str, Any] | None = None,
    env_values_provider: Callable[[], Mapping[str, Any]] | None = None,
    coordinator: ClusterCoordinator | None = None,
    cluster_settings: ClusterSettings | None = None,
) -> "Engine":
    """Build an engine on a SQLite store opened from ``db_path`` (tests/embedding helper).

    ``synchronous`` is consumed by :meth:`MessageStore.open`; every other keyword is forwarded
    unchanged to the constructor. The service path does not come through here — it uses the
    backend-agnostic :func:`~messagefoundry.store.open_store`. When ``coordinator`` is left
    unset the engine falls back to the no-op :class:`NullCoordinator` (single-node), so this
    convenience path behaves exactly as it did before the coordination seam existed.
    """
    return cls(
        # The store is opened first, then handed over as the engine's only positional argument.
        await MessageStore.open(db_path, synchronous=synchronous),
        poll_interval=poll_interval,
        max_correlation_depth=max_correlation_depth,
        config_dir=config_dir,
        config_reload_roots=config_reload_roots,
        inbound_bind_host=inbound_bind_host,
        allow_insecure_bind=allow_insecure_bind,
        delivery_defaults=delivery_defaults,
        ordering_default=ordering_default,
        internal_error_default=internal_error_default,
        buildup_default=buildup_default,
        ack_after_default=ack_after_default,
        alert_sink=alert_sink,
        retention_settings=retention_settings,
        cert_monitor_settings=cert_monitor_settings,
        api_tls_cert_file=api_tls_cert_file,
        reference_settings=reference_settings,
        egress_settings=egress_settings,
        shadow_settings=shadow_settings,
        active_environment=active_environment,
        env_values=env_values,
        env_values_provider=env_values_provider,
        coordinator=coordinator,
        cluster_settings=cluster_settings,
    )
254
+
255
+ # --- code-first wiring ---------------------------------------------------
256
+
257
def add_registry(self, registry: Registry) -> RegistryRunner:
    """Wire a code-first Connection/Router/Handler graph and return its runner.

    A single :class:`RegistryRunner` serves the whole graph. It is built with the engine's
    store, coordinator and inherited defaults, remembered as the engine's current runner, and
    returned to the caller.
    """
    self._registry_runner = RegistryRunner(
        registry,
        self.store,
        # Timing and loop-prevention.
        poll_interval=self._poll_interval,
        max_correlation_depth=self._max_correlation_depth,
        # Inbound binding policy.
        inbound_bind_host=self._inbound_bind_host,
        allow_insecure_bind=self._allow_insecure_bind,
        # Global defaults that a connection may override.
        delivery_defaults=self._delivery_defaults,
        ordering_default=self._ordering_default,
        internal_error_default=self._internal_error_default,
        buildup_default=self._buildup_default,
        ack_after_default=self._ack_after_default,
        # Alerting, egress policy and shadow-mode suppression.
        alert_sink=self._alert_sink,
        egress=self._egress_settings,
        simulate_all=self._shadow_settings.simulate_all_egress,
        # Environment and cluster context.
        env_values=self._env_values,
        active_environment=self._active_environment,
        coordinator=self._coordinator,
    )
    return self._registry_runner
280
+
281
@property
def registry_runner(self) -> RegistryRunner | None:
    """The runner for the currently wired graph, or ``None`` when no graph has been added."""
    return self._registry_runner
284
+
285
def _monitored_certs(self) -> list[MonitoredCert]:
    """Return the TLS certs the engine is serving with at this moment.

    That is the ``[api]`` cert plus the ``tls_cert_file`` certs of the wired graph's MLLP
    connections. The registry is read live on every call, so a config reload is reflected. The
    :class:`CertExpiryRunner` is given this method as its cert source, so each scan sees the
    current graph.
    """
    runner = self._registry_runner
    # With no graph wired there is no registry; only the [api] cert file is passed along.
    live_registry = None if runner is None else runner.registry
    return certs_from_registry(live_registry, self._api_tls_cert_file)
291
+
292
@property
def coordinator(self) -> ClusterCoordinator:
    """Public handle on the cluster coordinator (Track B Step 7).

    This is a ``NullCoordinator`` on a single node and a ``DbCoordinator`` in a cluster. It
    exists so the observability API can read membership/leadership through the coordinator
    contract rather than touching the private ``_coordinator`` attribute.
    """
    return self._coordinator
298
+
299
+ # --- reference sets (ADR 0006) -------------------------------------------
300
+
301
def _make_reference_runner(self) -> ReferenceSyncRunner:
    """Construct the reference sync runner for this engine.

    The runner is handed a callable, not a snapshot, for its specs: each call reads the
    **current** registry, so declarations swapped in by a reload are picked up without
    rebuilding the runner.
    """

    def _declared_references():
        # Evaluated on every call: no graph wired means no reference sets to sync.
        runner = self._registry_runner
        return runner.registry.references.values() if runner is not None else []

    return ReferenceSyncRunner(
        self.store,
        _declared_references,
        self._reference_settings or ReferenceSettings(),
        env_values=self._env_values,
        egress=self._egress_settings,
        alert_sink=self._alert_sink,
        # Track B Step 6: materialize-from-source is gated on the leader, while every node
        # still converges its read cache from the shared snapshot. NullCoordinator
        # (single-node) is always leader, so it materializes from source every pass as before.
        coordinator=self._coordinator,
    )
320
+
321
+ async def _reconcile_reference_sync(self, *, startup: bool) -> None:
322
+ """Ensure the reference runner exists, materialize the declared sets, and (re-)arm the loop.
323
+
324
+ Called at :meth:`start` and after every successful :meth:`reload`, so: a set added by a reload
325
+ materializes **immediately** (not only on the next refresh tick), a graph that goes from zero
326
+ reference sets to ≥1 across a reload actually starts the loop, and an engine started without a
327
+ graph then loaded via reload still gets a runner. ``start()`` is idempotent (a no-op when the
328
+ loop is already up). The pre-sync runs on a reload unconditionally (so a new set resolves on the
329
+ next message); at startup it honors ``[reference].sync_on_startup``. A sync failure is isolated
330
+ per-set (last-good kept) and never blocks start/reload."""
331
+ if self._reference_runner is None:
332
+ self._reference_runner = self._make_reference_runner()
333
+ if not startup or (self._reference_settings or ReferenceSettings()).sync_on_startup:
334
+ await self._reference_runner.sync_all()
335
+ self._reference_runner.start()
336
+
337
+ # --- lifecycle -----------------------------------------------------------
338
+
339
+ async def start(self) -> None:
340
+ """Recover crashed in-flight rows (every stage), dead-letter outbound rows for removed
341
+ outbounds, then start the wired graph."""
342
+ self.started_at = time.time()
343
+ # All-stages recovery: returns any row a crash left `inflight` — ingress rows mid-route and
344
+ # outbound rows mid-delivery alike — to `pending` so the staged workers re-claim them
345
+ # (staged pipeline, ADR 0001). The handoff/delivery transactions make the re-run idempotent.
346
+ if not self._coordinator.reclaims_inflight():
347
+ # Single-node (SQLite / single-node Postgres): the unconditional reset is immediate self-
348
+ # recovery of this node's own crash residue — today's behavior, byte-identical.
349
+ await self.store.reset_stale_inflight()
350
+ # else clustered (Track B Step 4): the leader's periodic reclaim_expired_leases sweep (started
351
+ # below) recovers expired-lease rows; the unconditional reset ignores leases and would steal a
352
+ # live sibling's in-flight rows, so it must NOT run here.
353
+ # Bring cluster membership + leader election up BEFORE the workers run, so the node's heartbeat
354
+ # is registered and leadership is contended the moment it starts processing (Track B Step 3/4).
355
+ # NullCoordinator (the single-node default) is a no-op here, so this line is free for SQLite /
356
+ # single-node Postgres.
357
+ await self._coordinator.start()
358
+ # Track B Step 6b: in a cluster, turn ON the store's per-namespace state-version bumping BEFORE the
359
+ # workers (hence transform_handoff) start, so the very first state write bumps and a sibling's
360
+ # convergence loop can see it. Single-node (NullCoordinator, is_clustered() False) never calls this,
361
+ # so no state_version rows are written and the backend stays byte-identical.
362
+ if self._coordinator.is_clustered():
363
+ self.store.enable_state_convergence()
364
+ if self._registry_runner is not None:
365
+ # Fail loud (not at the first received message) if the configured store can't run the
366
+ # staged ingress pipeline: the inbound path unconditionally calls store.enqueue_ingress,
367
+ # so a backend whose enqueue_ingress/handoff is a NotImplementedError stub (SQL Server,
368
+ # gated on BACKLOG #1) would otherwise wedge every inbound at runtime with no ACK/NAK. This
369
+ # check fails loud on EVERY node (leader or standby) — a misconfigured backend should refuse
370
+ # at startup, not only when this node is promoted.
371
+ if not getattr(self.store, "supports_ingest_stage", True):
372
+ raise RuntimeError(
373
+ "the configured store backend does not support the staged ingress pipeline "
374
+ "(ADR 0001 Step A is SQLite-only; SQL Server staging is gated on BACKLOG #1) — "
375
+ "use the sqlite backend"
376
+ )
377
+ if not self._coordinator.is_clustered():
378
+ # SINGLE-NODE (NullCoordinator, always leader): bring the graph up now, exactly as
379
+ # before — byte-identical. The config-drift sweeps + reference materialize + listener
380
+ # bring-up live in _start_graph (shared with the clustered leader path).
381
+ await self._start_graph()
382
+ else:
383
+ # CLUSTERED (active-passive, Workstream A1): the graph runs ONLY on the leader, so do
384
+ # NOT bring it up here — the graph supervisor (spawned at the end of start()) starts it
385
+ # when this node acquires leadership and stops it on loss. A standby stays warm without
386
+ # binding listeners or running workers. Start the reference-sync loop on EVERY node now
387
+ # so a follower converges its read cache from the leader's snapshot (the leader also
388
+ # materializes before listeners in _start_graph). Idempotent: _start_graph re-ensures it.
389
+ if self._reference_runner is None:
390
+ self._reference_runner = self._make_reference_runner()
391
+ self._reference_runner.start()
392
+ # Retention/purge is independent of the message graph (a store-level maintenance task), so it
393
+ # runs whether or not a graph is wired and survives config reloads. The runner is a no-op when
394
+ # nothing is configured, so this only spawns a task when [retention] is actually set. It is a
395
+ # leader-only WRITE singleton (purges bodies + writes audit rows), so it is gated on the
396
+ # coordinator: in a cluster a follower's runner ticks but no-ops; single-node always leads.
397
+ if self._retention_settings is not None:
398
+ self._retention_runner = RetentionRunner(
399
+ self.store,
400
+ self._retention_settings,
401
+ alert_sink=self._alert_sink,
402
+ coordinator=self._coordinator,
403
+ )
404
+ self._retention_runner.start()
405
+ # [cert_monitor] TLS-cert expiry monitor (Q5c) — a maintenance task like retention, independent
406
+ # of the message graph and surviving reloads; a no-op when warn_days=0. NOT leader-gated: certs
407
+ # are node-local files, so each node alerts on its own (the per-cert realert throttle bounds
408
+ # spam). The served-cert set is recomputed each scan from the live registry + [api] cert.
409
+ if self._cert_monitor_settings is not None:
410
+ self._cert_expiry_runner = CertExpiryRunner(
411
+ self._monitored_certs,
412
+ self._cert_monitor_settings,
413
+ alert_sink=self._alert_sink,
414
+ )
415
+ self._cert_expiry_runner.start()
416
+ # Leader lease-reclaim sweep (Track B Step 4) — only in clustered mode (reclaims_inflight()),
417
+ # so single-node / SQLite never spawns it. It is itself leader-gated each pass, so a follower's
418
+ # runner ticks but no-ops; the current leader recovers crashed nodes' expired-lease rows.
419
+ if self._coordinator.reclaims_inflight() and hasattr(self.store, "reclaim_expired_leases"):
420
+ # Postgres active-active: per-row lease reclaim recovers crashed nodes' EXPIRED-lease rows.
421
+ self._leader_maintenance = LeaderMaintenanceRunner(
422
+ self.store, # type: ignore[arg-type] # reclaim_expired_leases guarded above (Postgres)
423
+ self._coordinator,
424
+ interval_seconds=self._cluster_settings.reclaim_interval_seconds,
425
+ )
426
+ self._leader_maintenance.start()
427
+ # else (SQL Server active-passive): no per-row leases, so there is no reclaim sweep — failover
428
+ # recovery is the on-promotion reset_stale_inflight in _start_graph (the old leader self-fenced
429
+ # before its lease expired, so re-pending its in-flight rows can't steal from a live processor).
430
+ # Config-reload convergence (Track B Step 6) — only in clustered mode (is_clustered()), so
431
+ # single-node / SQLite never spawns it. Seed the applied version to the coordinator's CURRENT
432
+ # shared version BEFORE the loop starts, so a fresh node does not immediately self-reload (it is
433
+ # already in sync with whatever reloads happened before it joined); then poll the cached version
434
+ # each tick and reload this node's own config dir when it falls behind.
435
+ if self._coordinator.is_clustered():
436
+ self._applied_config_version = await self._coordinator.config_version()
437
+ self._config_convergence = ConfigConvergenceRunner(
438
+ self._coordinator,
439
+ applied_version=lambda: self._applied_config_version,
440
+ set_applied_version=self._set_applied_config_version,
441
+ reload=self._converge_reload,
442
+ interval_seconds=self._cluster_settings.heartbeat_seconds,
443
+ )
444
+ self._config_convergence.start()
445
+ # Transform-state read-through convergence (Track B Step 6b) — each tick read-throughs any
446
+ # namespace a sibling wrote/purged into this node's local _state_cache. Reuses the cluster
447
+ # heartbeat interval (owner decision) and the same alert sink as the rest of the engine.
448
+ self._state_convergence = StateConvergenceRunner(
449
+ converge=self.store.converge_state_cache,
450
+ interval_seconds=self._cluster_settings.heartbeat_seconds,
451
+ alert_sink=self._alert_sink,
452
+ )
453
+ self._state_convergence.start()
454
+ # Active-passive graph supervisor (Workstream A1) — spawned LAST (after _leader_maintenance
455
+ # exists, so the on-promotion reclaim can fire) and ONLY in clustered mode with a wired graph.
456
+ # It polls leadership and starts/stops the graph so only the leader binds listeners + runs
457
+ # workers. The poll interval is kept short (relative to the fence/TTL margin) so a demoted/fenced
458
+ # node stops accepting + initiating new work promptly; concurrent double-processing of a given
459
+ # row is independently prevented by the store's row/lane leases (see __init__). Single-node
460
+ # never spawns it (the graph is already running, brought up directly above).
461
+ if self._coordinator.is_clustered() and self._registry_runner is not None:
462
+ ttl = self._cluster_settings.leader_lease_ttl_seconds
463
+ fence = self._cluster_settings.leader_fence_timeout_seconds
464
+ # Stay comfortably inside the (ttl - fence) margin and never slower than ~1s.
465
+ self._graph_reconcile_interval = max(0.1, min(1.0, (ttl - fence) / 3.0))
466
+ self._graph_stop.clear()
467
+ # Reconcile ONCE synchronously before the loop: if this node is already the leader (it
468
+ # acquired the lease on coordinator.start()'s first tick, or in tests a stand-in reports
469
+ # leader immediately), the graph comes up during start() rather than a poll-interval later.
470
+ # A real DbCoordinator is usually not-yet-leader here (the lease is acquired asynchronously),
471
+ # so this is a no-op and the supervisor brings the graph up on promotion.
472
+ await self._reconcile_graph()
473
+ self._graph_supervisor = asyncio.create_task(self._graph_supervisor_loop())
474
+
475
+ # --- active-passive graph gating (Workstream A1/A3/A4) -------------------
476
+
477
async def _start_graph(self) -> None:
    """Bring the wired graph up on this node.

    Steps, in order: (A4) recover the previous leader's stranded in-flight work on promotion,
    (A3) dead-letter rows whose outbound/handler is no longer in the config, materialize reference
    sets, then start the registry runner (listeners + workers). Returns immediately when no graph
    is wired. In a cluster this runs ONLY on the leader and is (re)invoked on each leadership
    acquire; single-node runs it once at startup. Repeated calls rely on the runner's own
    ``running`` guard for idempotence.
    """
    runner = self._registry_runner
    if runner is None:
        return

    # A4 — promotion-time recovery; which path runs depends on the deployment shape.
    maintenance = self._leader_maintenance
    if maintenance is not None:
        # Clustered Postgres: reclaim the prior leader's in-flight rows and take over its lane
        # leases IMMEDIATELY (owner-scoped, lease-blind) rather than waiting out the per-row/lane
        # lease TTL, which was the dominant failover-recovery delay (#293). The periodic,
        # lease-gated sweep keeps running in the background regardless.
        await maintenance.recover_on_promotion()
    elif self._coordinator.is_clustered():
        # Active-passive without per-row leases (SQL Server): re-pend the prior leader's in-flight
        # rows. That leader self-fenced and its leadership lease expired before this node could
        # acquire it, and the graph runs only on the leader, so an unconditional reset cannot
        # steal rows from a live processor. Single-node never reaches here (is_clustered() is
        # False); its boot residue was already recovered by reset_stale_inflight in start().
        await self.store.reset_stale_inflight()

    # A3 — dead-letter OUTBOUND/ROUTED rows that no worker would ever drain because their
    # destination/handler left the config. Because this lives in graph bring-up, only the node
    # that runs the graph sweeps, so a restarting standby never dead-letters the primary's rows.
    # The sweep is keyed off THIS node's registry, so clustered nodes must run identical config.
    store = self.store
    await store.dead_letter_missing_destinations(set(runner.registry.outbound))
    await store.dead_letter_missing_handlers(set(runner.registry.handlers))

    # Reference sets (ADR 0006): materialize declared sets BEFORE listeners accept, since a
    # transform's reference(...) resolves on the very first message.
    await self._reconcile_reference_sync(startup=True)

    await runner.start()
    log.info("engine graph started — this node is processing")
+
521
async def _stop_graph(self) -> None:
    """Tear the graph down after losing leadership.

    Only the registry runner (listeners + workers) is stopped, so a demoted node stops
    binding/processing. The reference-sync loop and the self-gated maintenance/convergence loops
    are deliberately left running so a follower keeps converging its caches.
    """
    runner = self._registry_runner
    if runner is not None:
        await runner.stop()
    log.info("engine graph stopped — this node is now standby")
+
529
async def _reconcile_graph(self) -> None:
    """Make the graph's running state match this node's leadership.

    Starts the graph when this node is leader and the graph is down; stops it when this node is
    not leader and the graph is up. Does nothing when no graph is wired. The whole decision runs
    under ``_graph_lock`` so overlapping triggers cannot act twice.
    """
    if self._registry_runner is None:
        return

    coordinator = self._coordinator
    async with self._graph_lock:
        graph_up = self._registry_runner.running
        if coordinator.is_leader() and not graph_up:
            await self._start_graph()
            # Bring-up can be slow, and leadership may be lost while it runs (a fence mid-start).
            # Re-check and tear down inside the same lock so a demoted node never keeps the graph
            # running for a whole extra poll cycle.
            if not coordinator.is_leader():
                await self._stop_graph()
        elif not coordinator.is_leader() and graph_up:
            await self._stop_graph()
+
546
async def _graph_supervisor_loop(self) -> None:
    """Active-passive graph supervisor (Workstream A1).

    Repeatedly reconciles the graph against leadership so only the leader binds listeners and runs
    workers, sleeping ``_graph_reconcile_interval`` seconds between passes. The sleep is a timed
    wait on ``_graph_stop``, so setting that event wakes the loop early and ends it between
    reconciles (cooperative stop). A failing reconcile is logged and retried on the next pass;
    cancellation is always propagated.
    """
    stop_requested = self._graph_stop
    while True:
        if stop_requested.is_set():
            break
        try:
            await self._reconcile_graph()
        except asyncio.CancelledError:
            # Never swallow cancellation — stop() falls back to cancelling a hung supervisor.
            raise
        except Exception:
            log.exception("engine graph supervisor reconcile failed; will retry")
        pause = self._graph_reconcile_interval
        try:
            # Returns early if a stop is requested; a timeout just means "poll again".
            await asyncio.wait_for(stop_requested.wait(), timeout=pause)
        except asyncio.TimeoutError:
            continue
+
566
def _set_applied_config_version(self, version: int) -> None:
    """Record ``version`` as the config version this node has applied (Track B Step 6).

    Handed to the config-convergence runner as its ``set_applied_version`` callback, which invokes
    it after a successful follower reload.
    """
    self._applied_config_version = version
+
570
async def _converge_reload(self) -> None:
    """Reload THIS node's own startup config dir to catch up with a cluster reload (Track B Step 6).

    No ``config_dir`` is passed, so ``reload`` falls back to the startup ``--config`` dir. The
    call is explicitly non-propagating: this is convergence, not initiation, so it must NOT bump
    the shared version token again (otherwise nodes would chase each other's reloads). The
    returned registry is discarded.
    """
    _ = await self.reload(propagate=False)
+
578
async def reload(
    self,
    config_dir: str | Path | None = None,
    *,
    dry_run: bool = False,
    propagate: bool = False,
) -> Registry:
    """Load the code-first graph from ``config_dir`` and apply it to the running engine.

    ``config_dir`` defaults to the server's startup ``--config`` dir. Any explicit value must
    resolve **within** an allowed reload root (the startup dir + ``config_reload_roots``);
    otherwise :class:`ConfigReloadDenied` is raised — the loader executes Python, so an
    arbitrary client path must never be honoured. The resolved directory is recorded on
    :attr:`last_reload_dir` for auditing (before any further validation).

    Validates first (a bad config raises before anything is swapped, so the running graph is
    left untouched), then swaps via the runner's quiesce-and-swap reload. If the engine was
    started without a graph, this loads and starts one. Returns the new Registry.

    ``dry_run`` performs the full validation **against this instance's environment** — it loads
    the graph and build-checks every connector, which resolves the graph's ``env()`` references
    against *this* engine's values, so a key the target environment doesn't define fails here —
    then returns **without swapping** the live graph. This is the promote pre-flight: it answers
    "will this graph go live cleanly on THIS environment?" without touching running traffic.

    ``propagate`` (Track B Step 6): on a SUCCESSFUL non-dry-run apply in a clustered deployment,
    bump the shared ``cluster_config`` version token so every OTHER node's convergence loop
    reloads its own (identically-deployed) config dir. The OPERATOR-initiated path
    (``/config/reload``) passes ``propagate=True``; the per-node convergence reload passes
    ``False``. A dry_run never bumps, and single-node (``is_clustered()`` False) never bumps. The
    initiator advances its OWN applied version right after bumping, so its convergence loop sees
    no change and does not re-reload.

    Raises ``ConfigReloadDenied`` (path outside the allowed roots), ``FileNotFoundError``
    (missing dir) or ``WiringError`` (invalid / empty config / unresolved env value) — the
    caller maps these to HTTP errors.
    """
    path = self._resolve_reload_target(config_dir)
    self.last_reload_dir = path
    # Name shown in error messages. An explicit ``config_dir`` is echoed exactly as the caller
    # supplied it; on the default path ``config_dir`` is None, so name the resolved startup dir
    # instead of emitting the useless "…: None".
    shown = config_dir if config_dir is not None else str(path)
    if not path.is_dir():
        raise FileNotFoundError(f"config directory not found: {shown}")
    # Re-gather this environment's values so a reload/promote picks up edited
    # environments/<env>.toml (or MEFOR_VALUE_* changes) without a restart — otherwise the
    # WiringError telling the operator to add a missing value would never clear (review M-23).
    if self._env_values_provider is not None:
        self._env_values = dict(self._env_values_provider())
        if self._registry_runner is not None:
            self._registry_runner.set_env_values(self._env_values)
    # Off the event loop: load_config executes user config modules (arbitrary, potentially heavy
    # imports), which would otherwise stall every listener mid-reload (review low-3).
    registry = await asyncio.to_thread(load_config, path)  # raises WiringError on a bad config
    if not registry.inbound and not registry.outbound:
        raise WiringError(
            f"config directory {shown!r} declares no connections — "
            "refusing to reload to an empty graph"
        )
    runner = self._registry_runner
    if dry_run:
        # Validate against THIS environment without swapping: build-check every connector (which
        # resolves env() refs against this instance's values and raises on a missing key or bad
        # spec), then discard. Reuse the live runner if present; else a throwaway one carrying
        # the same bind host + env values, so the check sees exactly what a real reload would.
        # The presence test is an explicit ``is None`` check, like every other runner test in
        # this method, so it never depends on the runner's truthiness.
        if runner is not None:
            checker = runner
        else:
            checker = RegistryRunner(
                registry,
                self.store,
                poll_interval=self._poll_interval,
                inbound_bind_host=self._inbound_bind_host,
                delivery_defaults=self._delivery_defaults,
                ordering_default=self._ordering_default,
                internal_error_default=self._internal_error_default,
                buildup_default=self._buildup_default,
                ack_after_default=self._ack_after_default,
                alert_sink=self._alert_sink,
                egress=self._egress_settings,
                simulate_all=self._shadow_settings.simulate_all_egress,
                env_values=self._env_values,
                coordinator=self._coordinator,
            )
        checker.build_check(registry)
        return registry
    if runner is None:
        runner = self.add_registry(registry)
        try:
            runner.build_check(registry)  # bad connector → WiringError (422), before any start
            await runner.start()
        except Exception:
            # Don't leave a half-started runner: a later reload would take the "runner exists"
            # path and no-op the start, wedging intake. Clear it so a retry re-enters cleanly.
            self._registry_runner = None
            raise
    else:
        await runner.reload(registry)
    # Reference sets (ADR 0006): re-arm + materialize after the swap, so a reference set added by
    # this reload syncs immediately (resolves on the next message, not only after the refresh
    # interval) and a 0->N change actually starts the loop. Idempotent when nothing changed.
    await self._reconcile_reference_sync(startup=False)
    # Config-reload convergence (Track B Step 6): only the OPERATOR-initiated path propagates.
    # Bump the shared version so other nodes converge, and advance THIS node's applied version to
    # the new value so its own convergence loop sees no change (feedback-avoidance). A no-op on
    # single-node (is_clustered() False).
    if propagate and self._coordinator.is_clustered():
        self._applied_config_version = await self._coordinator.bump_config_version()
    return registry
+
683
+ def _resolve_reload_target(self, config_dir: str | Path | None) -> Path:
684
+ """Resolve the reload target and enforce the allow-list (see :class:`ConfigReloadDenied`)."""
685
+ if config_dir is None:
686
+ if self.config_dir is None:
687
+ raise WiringError("no config directory configured; pass one to reload")
688
+ return self.config_dir
689
+ path = Path(config_dir).resolve()
690
+ if self._reload_roots and not any(_within(path, root) for root in self._reload_roots):
691
+ # Don't echo the rejected path back to the client (info disclosure); log it server-side.
692
+ log.warning("rejected /config/reload outside allowed roots: %s", path)
693
+ raise ConfigReloadDenied("config directory is not an allowed reload root")
694
+ return path
695
+
696
async def replay(self, message_id: str) -> int:
    """Re-queue every delivery for ``message_id`` and wake the delivery workers.

    Returns the count reported by the store. Workers are notified whenever a graph is wired and
    running, regardless of that count.
    """
    count = await self.store.replay(message_id)
    runner = self._registry_runner
    if runner is None or not runner.running:
        return count
    runner.notify_work()
    return count
+
703
+ async def replay_dead(
704
+ self, *, channel_id: str | None = None, destination_name: str | None = None
705
+ ) -> int:
706
+ """Re-queue dead-lettered deliveries (optionally scoped) and wake the delivery workers."""
707
+ requeued = await self.store.replay_dead(
708
+ channel_id=channel_id, destination_name=destination_name
709
+ )
710
+ if requeued and self._registry_runner is not None and self._registry_runner.running:
711
+ self._registry_runner.notify_work()
712
+ return requeued
713
+
714
+ async def stop(self) -> None:
715
+ """Stop the retention task + the wired graph, then close the store."""
716
+ log.info("engine stopping")
717
+ # Quiesce the active-passive graph supervisor FIRST (Workstream A1) so it can't reconcile (and
718
+ # re-start the graph) while we tear down. A no-op single-node (never spawned). Cooperative: set
719
+ # the stop event and let any in-flight reconcile finish under the lock (so we never abandon a
720
+ # half-started graph), falling back to cancel only if a reconcile hangs past the timeout. The
721
+ # graph itself is then stopped by the registry_runner.stop() below, as before.
722
+ if self._graph_supervisor is not None:
723
+ self._graph_stop.set()
724
+ supervisor = self._graph_supervisor
725
+ self._graph_supervisor = None
726
+ try:
727
+ await asyncio.wait_for(supervisor, timeout=10.0)
728
+ except asyncio.TimeoutError:
729
+ # wait_for already cancelled the task on timeout; absorb its cancellation.
730
+ await asyncio.gather(supervisor, return_exceptions=True)
731
+ if self._retention_runner is not None:
732
+ await self._retention_runner.stop()
733
+ if self._cert_expiry_runner is not None:
734
+ await self._cert_expiry_runner.stop()
735
+ # Stop the leader sweep before deregistering membership (it consults the coordinator's gate, so
736
+ # it must quiesce while the coordinator is still up). A no-op when single-node (never spawned).
737
+ if self._leader_maintenance is not None:
738
+ await self._leader_maintenance.stop()
739
+ # Stop the config-convergence loop before the coordinator (it polls the coordinator's cached
740
+ # version). A no-op when single-node (never spawned).
741
+ if self._config_convergence is not None:
742
+ await self._config_convergence.stop()
743
+ # Stop the transform-state convergence loop before the coordinator/pool tear down (it polls the
744
+ # store). A no-op when single-node (never spawned). (Track B Step 6b.)
745
+ if self._state_convergence is not None:
746
+ await self._state_convergence.stop()
747
+ self._state_convergence = None
748
+ if self._reference_runner is not None:
749
+ await self._reference_runner.stop()
750
+ if self._registry_runner is not None:
751
+ await self._registry_runner.stop()
752
+ # Deregister cluster membership after the runner has quiesced but before the store closes (the
753
+ # coordinator marks its node left over the same pool). stop() is idempotent and safe even if
754
+ # start() raised (then there's just nothing to cancel). NullCoordinator is a no-op.
755
+ await self._coordinator.stop()
756
+ await self.store.close()