messagefoundry 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. messagefoundry/__init__.py +108 -0
  2. messagefoundry/__main__.py +1155 -0
  3. messagefoundry/api/__init__.py +27 -0
  4. messagefoundry/api/app.py +1581 -0
  5. messagefoundry/api/approvals.py +184 -0
  6. messagefoundry/api/auth_models.py +211 -0
  7. messagefoundry/api/auth_routes.py +655 -0
  8. messagefoundry/api/field_authz.py +96 -0
  9. messagefoundry/api/models.py +374 -0
  10. messagefoundry/api/security.py +247 -0
  11. messagefoundry/api/tls.py +47 -0
  12. messagefoundry/auth/__init__.py +39 -0
  13. messagefoundry/auth/data/common_passwords.NOTICE +13 -0
  14. messagefoundry/auth/data/common_passwords.txt +10000 -0
  15. messagefoundry/auth/identity.py +71 -0
  16. messagefoundry/auth/ldap.py +264 -0
  17. messagefoundry/auth/notifications.py +68 -0
  18. messagefoundry/auth/passwords.py +53 -0
  19. messagefoundry/auth/permissions.py +120 -0
  20. messagefoundry/auth/policy.py +153 -0
  21. messagefoundry/auth/ratelimit.py +55 -0
  22. messagefoundry/auth/service.py +1323 -0
  23. messagefoundry/auth/tokens.py +26 -0
  24. messagefoundry/auth/totp.py +174 -0
  25. messagefoundry/checks.py +174 -0
  26. messagefoundry/config/__init__.py +30 -0
  27. messagefoundry/config/active_environment.py +80 -0
  28. messagefoundry/config/ai_policy.py +140 -0
  29. messagefoundry/config/code_sets.py +260 -0
  30. messagefoundry/config/connections_edit.py +200 -0
  31. messagefoundry/config/connections_file.py +287 -0
  32. messagefoundry/config/db_lookup.py +117 -0
  33. messagefoundry/config/environments.py +116 -0
  34. messagefoundry/config/ingest_time.py +83 -0
  35. messagefoundry/config/models.py +240 -0
  36. messagefoundry/config/reference.py +158 -0
  37. messagefoundry/config/response.py +83 -0
  38. messagefoundry/config/run_context.py +153 -0
  39. messagefoundry/config/settings.py +1311 -0
  40. messagefoundry/config/state.py +99 -0
  41. messagefoundry/config/tls_policy.py +110 -0
  42. messagefoundry/config/wiring.py +1918 -0
  43. messagefoundry/console/__init__.py +20 -0
  44. messagefoundry/console/__main__.py +274 -0
  45. messagefoundry/console/_async.py +107 -0
  46. messagefoundry/console/change_password.py +111 -0
  47. messagefoundry/console/client.py +552 -0
  48. messagefoundry/console/connections.py +324 -0
  49. messagefoundry/console/login.py +107 -0
  50. messagefoundry/console/mfa.py +205 -0
  51. messagefoundry/console/reauth.py +94 -0
  52. messagefoundry/console/search.py +57 -0
  53. messagefoundry/console/service_control.py +137 -0
  54. messagefoundry/console/sessions.py +122 -0
  55. messagefoundry/console/shell.py +410 -0
  56. messagefoundry/console/status.py +377 -0
  57. messagefoundry/console/users_page.py +282 -0
  58. messagefoundry/console/widgets.py +553 -0
  59. messagefoundry/generators/README.md +27 -0
  60. messagefoundry/generators/__init__.py +15 -0
  61. messagefoundry/generators/_core.py +589 -0
  62. messagefoundry/generators/_hl7data.py +428 -0
  63. messagefoundry/generators/adt.py +286 -0
  64. messagefoundry/generators/all_types.py +24 -0
  65. messagefoundry/generators/bar.py +28 -0
  66. messagefoundry/generators/dft.py +20 -0
  67. messagefoundry/generators/mdm.py +39 -0
  68. messagefoundry/generators/mfn.py +46 -0
  69. messagefoundry/generators/oml.py +32 -0
  70. messagefoundry/generators/orl.py +30 -0
  71. messagefoundry/generators/orm.py +23 -0
  72. messagefoundry/generators/oru.py +21 -0
  73. messagefoundry/generators/ras.py +20 -0
  74. messagefoundry/generators/rde.py +54 -0
  75. messagefoundry/generators/siu.py +64 -0
  76. messagefoundry/generators/vxu.py +20 -0
  77. messagefoundry/hl7schema.py +75 -0
  78. messagefoundry/last_resort.py +55 -0
  79. messagefoundry/logging_setup.py +332 -0
  80. messagefoundry/parsing/__init__.py +64 -0
  81. messagefoundry/parsing/consistency.py +166 -0
  82. messagefoundry/parsing/groups.py +228 -0
  83. messagefoundry/parsing/message.py +453 -0
  84. messagefoundry/parsing/peek.py +237 -0
  85. messagefoundry/parsing/split.py +120 -0
  86. messagefoundry/parsing/summary.py +46 -0
  87. messagefoundry/parsing/tree.py +128 -0
  88. messagefoundry/parsing/validate.py +95 -0
  89. messagefoundry/parsing/x12/__init__.py +46 -0
  90. messagefoundry/parsing/x12/delimiters.py +140 -0
  91. messagefoundry/parsing/x12/errors.py +30 -0
  92. messagefoundry/parsing/x12/interchange.py +232 -0
  93. messagefoundry/parsing/x12/message.py +200 -0
  94. messagefoundry/parsing/x12/peek.py +207 -0
  95. messagefoundry/pipeline/__init__.py +21 -0
  96. messagefoundry/pipeline/alert_sinks.py +486 -0
  97. messagefoundry/pipeline/alerts.py +100 -0
  98. messagefoundry/pipeline/cert_expiry.py +219 -0
  99. messagefoundry/pipeline/cluster.py +955 -0
  100. messagefoundry/pipeline/cluster_sqlserver.py +444 -0
  101. messagefoundry/pipeline/config_convergence.py +137 -0
  102. messagefoundry/pipeline/dryrun.py +450 -0
  103. messagefoundry/pipeline/engine.py +756 -0
  104. messagefoundry/pipeline/leader_tasks.py +158 -0
  105. messagefoundry/pipeline/reference_sync.py +369 -0
  106. messagefoundry/pipeline/retention.py +289 -0
  107. messagefoundry/pipeline/security_notify.py +168 -0
  108. messagefoundry/pipeline/state_convergence.py +143 -0
  109. messagefoundry/pipeline/wiring_runner.py +1722 -0
  110. messagefoundry/py.typed +0 -0
  111. messagefoundry/redaction.py +71 -0
  112. messagefoundry/scaffold.py +321 -0
  113. messagefoundry/secrets_dpapi.py +129 -0
  114. messagefoundry/store/__init__.py +46 -0
  115. messagefoundry/store/audit_tee.py +67 -0
  116. messagefoundry/store/base.py +758 -0
  117. messagefoundry/store/crypto.py +166 -0
  118. messagefoundry/store/keyprovider.py +192 -0
  119. messagefoundry/store/postgres.py +3447 -0
  120. messagefoundry/store/sqlserver.py +3014 -0
  121. messagefoundry/store/store.py +3790 -0
  122. messagefoundry/timezone.py +207 -0
  123. messagefoundry/transports/__init__.py +50 -0
  124. messagefoundry/transports/base.py +269 -0
  125. messagefoundry/transports/database.py +693 -0
  126. messagefoundry/transports/file.py +551 -0
  127. messagefoundry/transports/framing.py +164 -0
  128. messagefoundry/transports/loopback.py +53 -0
  129. messagefoundry/transports/mllp.py +644 -0
  130. messagefoundry/transports/remotefile.py +664 -0
  131. messagefoundry/transports/rest.py +281 -0
  132. messagefoundry/transports/signing.py +321 -0
  133. messagefoundry/transports/soap.py +507 -0
  134. messagefoundry/transports/tcp.py +307 -0
  135. messagefoundry/transports/timer.py +146 -0
  136. messagefoundry/transports/x12.py +323 -0
  137. messagefoundry-0.1.0.dist-info/METADATA +212 -0
  138. messagefoundry-0.1.0.dist-info/RECORD +142 -0
  139. messagefoundry-0.1.0.dist-info/WHEEL +4 -0
  140. messagefoundry-0.1.0.dist-info/entry_points.txt +2 -0
  141. messagefoundry-0.1.0.dist-info/licenses/LICENSE +662 -0
  142. messagefoundry-0.1.0.dist-info/licenses/NOTICE +27 -0
@@ -0,0 +1,444 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 MessageFoundry Organization and contributors
3
+ """SQL Server-backed cluster coordinator for **active-passive HA** (SQL Server store, Phase 4).
4
+
5
+ The Postgres :class:`~messagefoundry.pipeline.cluster.DbCoordinator` drives an asyncpg pool; the SQL
6
+ Server store's pool is aioodbc (cursor-based, ``?`` params, tuple rows). This class is the SQL Server
7
+ sibling: it implements the same :class:`~messagefoundry.pipeline.cluster.ClusterCoordinator` contract so
8
+ a hot standby can take over when the primary dies, but on aioodbc + T-SQL.
9
+
10
+ **Scope = active-passive ONLY.** It provides leader election (the self-fencing ``leader_lease``) +
11
+ membership/observability + the cross-node config-version token. It does NOT provide the active-active
12
+ per-lane row leases — :meth:`lane_owner` returns ``None`` and :meth:`owns_lane` returns ``True`` (the
13
+ single active node, the leader, drains every lane on the unchanged no-owner claim path). Per-lane FIFO
14
+ ownership across many active nodes stays Postgres-only (0.2 scale-out).
15
+
16
+ The leadership lease, self-fence watchdog, and cached cheap/synchronous gates are identical in *design*
17
+ to :class:`DbCoordinator` (see that module's docstring) — the in-memory pieces (fence math, ``is_leader``
18
+ caching) are copied verbatim and only the DB layer differs:
19
+
20
+ - **DB clock:** ``DATEDIFF_BIG(millisecond, '1970-01-01', SYSUTCDATETIME()) / 1000.0`` (epoch seconds) —
21
+ the SQL Server analog of PG ``EXTRACT(EPOCH FROM clock_timestamp())``; computed in T-SQL so all nodes
22
+ share one logical clock and inter-node skew is irrelevant to lease correctness.
23
+ - **Atomic acquire/renew:** ``MERGE leader_lease WITH (HOLDLOCK)`` with the take-over predicate
24
+ ``owner = me OR lease_expires_at < @now`` — the single-statement, serializable analog of PG's
25
+ ``INSERT ... ON CONFLICT ... WHERE``.
26
+ - **DDL race guard:** the store's transaction-scoped ``sp_getapplock`` (``store._applock``), the T-SQL
27
+ analog of PG's ``pg_advisory_xact_lock``.
28
+
29
+ It is duck-typed on the store (``store._acquire`` / ``store._applock`` / ``store._fetchone`` /
30
+ ``store._fetchall`` / ``store._execute`` / ``store._settings``) so this module imports cleanly without
31
+ the optional ``aioodbc`` extra and never hard-imports the concrete store.
32
+
33
+ .. note::
34
+ FAILOVER IN-FLIGHT RECOVERY is unresolved here (see :meth:`reclaims_inflight`). A standby becomes
35
+ leader WITHOUT a restart, so the engine's startup ``reset_stale_inflight`` never re-fires for it. The
36
+ planned fix is an **on-acquire** ``Store.reset_stale_inflight`` hook (run once when this node flips
37
+ non-leader→leader, before it drains). That engine-seam wiring + :func:`build_coordinator` dispatch +
38
+ the ``[cluster]``-requires-postgres relaxation are DEFERRED until the cluster.py observability work
39
+ (branch ``ha-cluster-status-failover``) lands on main, to avoid editing cluster.py in parallel.
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ import asyncio
45
+ import logging
46
+ import os
47
+ import socket
48
+ import time
49
+ from collections.abc import Callable
50
+ from typing import TYPE_CHECKING, Any
51
+
52
+ from messagefoundry.pipeline.cluster import ClusterMember, default_node_id
53
+ from messagefoundry.redaction import safe_exc
54
+
55
+ log = logging.getLogger(__name__)
56
+
57
+ __all__ = ["SqlServerCoordinator"]
58
+
59
if TYPE_CHECKING:
    from messagefoundry.pipeline.cluster import ClusterCoordinator

    def _assert_satisfies_protocol(c: "SqlServerCoordinator") -> "ClusterCoordinator":
        """Static-only conformance check; never defined or executed at runtime.

        Type checkers verify that ``SqlServerCoordinator`` structurally satisfies the
        ``ClusterCoordinator`` Protocol owned by cluster.py. If a future increment adds a contract
        method (as #257 added ``leadership_lease``), the assignment below fails mypy until the method
        is implemented on ``SqlServerCoordinator`` too.
        """
        as_protocol: "ClusterCoordinator" = c
        return as_protocol
67
+
68
+
69
+ # epoch seconds from the DB's own UTC clock — the SQL Server analog of PG clock_timestamp().
70
+ _DB_NOW = "DATEDIFF_BIG(millisecond, '1970-01-01', SYSUTCDATETIME()) / 1000.0"
71
+
72
+ _logged_cluster_enabled = False
73
+
74
+
75
class SqlServerCoordinator:
    """Active-passive leader election + membership on the SQL Server store (aioodbc + T-SQL).

    Mirrors :class:`~messagefoundry.pipeline.cluster.DbCoordinator` minus the active-active lane leases.
    On :meth:`start` it idempotently creates the ``nodes`` / ``leader_lease`` / ``cluster_config`` tables
    (under the store's ``sp_getapplock`` DDL guard), upserts this node, and spawns a **maintenance** task
    (heartbeat + lease acquire/renew + config-version refresh each tick) and a DB-free **fence watchdog**
    that demotes this node if it cannot renew within ``leader_fence_timeout`` (< the lease TTL) — so a
    partitioned old leader stops reporting :meth:`is_leader` ``True`` before any standby can acquire the
    lease (the split-brain guard).
    """

    def __init__(
        self,
        store: Any,
        node_id: str,
        *,
        heartbeat_seconds: float = 10.0,
        node_timeout_seconds: float = 30.0,
        leader_lease_ttl_seconds: float = 30.0,
        leader_fence_timeout_seconds: float = 20.0,
        monotonic: Callable[[], float] = time.monotonic,
    ) -> None:
        """Configure the coordinator; performs no DB I/O and spawns no tasks (see :meth:`start`).

        Args:
            store: Duck-typed store exposing ``_acquire`` / ``_applock`` / ``_fetchone`` /
                ``_fetchall`` / ``_execute`` and (optionally) ``_settings.db_schema``.
            node_id: This node's identity in the ``nodes`` table and as the lease ``owner``.
            heartbeat_seconds: Pause between maintenance ticks.
            node_timeout_seconds: Maximum ``last_seen`` age for a node to count as fresh in
                :meth:`cluster_members`.
            leader_lease_ttl_seconds: Seconds added to the DB clock for the lease expiry on each
                acquire/renew.
            leader_fence_timeout_seconds: Maximum age of the last confirmed renewal before the
                watchdog demotes this node.
            monotonic: Clock used for the fence math (injectable for tests).
        """
        self._store = store
        self.node_id = node_id
        self._heartbeat_seconds = heartbeat_seconds
        self._node_timeout_seconds = node_timeout_seconds
        self._lease_ttl = leader_lease_ttl_seconds
        self._fence_timeout = leader_fence_timeout_seconds
        # Small relative to the fence timeout so a fence fires promptly (well before the lease TTL).
        self._fence_tick = max(0.05, min(1.0, leader_fence_timeout_seconds / 5.0))
        self._monotonic = monotonic
        # Schema-namespace the DDL applock + the lease key, exactly as DbCoordinator does, so two
        # deployments sharing one database via different schemas don't contend / co-elect.
        schema = getattr(getattr(store, "_settings", None), "db_schema", None) or "dbo"
        self._lock_key = f"{schema}:mefor_cluster_nodes"
        self._lease_key = f"{schema}:mefor_cluster_leader"
        self._host = socket.gethostname()
        self._pid = os.getpid()
        self._heartbeat_task: asyncio.Task[None] | None = None
        self._fence_task: asyncio.Task[None] | None = None
        self._stop = asyncio.Event()
        # Cached leadership state read by the cheap/synchronous is_leader() gate (no DB on the hot path).
        self._is_leader: bool = False
        # Monotonic time at which the last CONFIRMED lease renewal was *sent*; the fence demotes when
        # now - this > timeout.
        self._last_renew_ok: float | None = None
        self._config_version: int = 0

    # --- lifecycle -----------------------------------------------------------

    async def start(self) -> None:
        """Register this node and begin heartbeating. Idempotent (a second call is a no-op while the
        heartbeat already runs; the row upsert is idempotent too)."""
        if self._heartbeat_task is not None:
            return
        self._log_cluster_enabled_once()
        await self._ensure_tables()
        await self._register()
        self._stop.clear()
        self._heartbeat_task = asyncio.create_task(self._heartbeat_loop())
        # Separate from the maintenance loop and does NO DB I/O, so a hung DB can never block fencing.
        self._fence_task = asyncio.create_task(self._fence_watchdog_loop())

    async def stop(self) -> None:
        """Release leadership, cancel both tasks, mark this node left. Idempotent and never raises."""
        self._stop.set()
        tasks = [t for t in (self._heartbeat_task, self._fence_task) if t is not None]
        self._heartbeat_task = None
        self._fence_task = None
        for t in tasks:
            t.cancel()
        if tasks:
            await asyncio.gather(*tasks, return_exceptions=True)
        # Demote the cached gate FIRST (a concurrent is_leader() reader sees "not leader" at once), then
        # expire the lease row so a standby can take over immediately on a clean shutdown.
        await self._release_leadership()
        try:
            await self._store._execute(
                "UPDATE nodes SET status=?, last_seen=?, is_leader=0 WHERE node_id=?",
                ("left", time.time(), self.node_id),
            )
        except Exception as exc:  # pool may already be closing on shutdown — log, don't raise
            log.warning("cluster: failed to mark node %s left: %s", self.node_id, safe_exc(exc))

    # --- cheap/synchronous gates --------------------------------------------

    def is_leader(self) -> bool:
        """Return the cached leadership flag (no DB access)."""
        return self._is_leader  # cached; no DB. The active-passive gate.

    def owns_lane(self, lane_key: str) -> bool:
        """Always ``True``: no per-lane leasing in active-passive mode."""
        return True  # active-passive: the single active node (leader) owns every lane.

    def lane_owner(self) -> str | None:
        """Always ``None``: there is no per-lane owner to report."""
        return None  # no per-lane leasing → claim_next_fifo takes its unchanged no-owner path.

    def reclaims_inflight(self) -> bool:
        """Always ``True`` (see the OPEN note below about failover in-flight recovery)."""
        # OPEN (deferred to wiring): a standby is promoted WITHOUT a restart, so startup
        # reset_stale_inflight never re-fires. The planned fix is an on-acquire reset_stale_inflight hook
        # (run when _is_leader flips False->True). Reported True (clustered, leader-driven recovery) to
        # match the clustered contract; the exact engine seam is resolved with the cluster.py work.
        return True

    def is_clustered(self) -> bool:
        """Always ``True``: this coordinator is only used in clustered mode."""
        return True

    def config_version_cached(self) -> int:
        """Return the config version last read or bumped by this node (no DB access)."""
        return self._config_version

    # --- config version (cross-node convergence token) ----------------------

    async def config_version(self) -> int:
        """Read (seeding to 0 if absent) and cache the cluster-wide config-reload version."""
        row = await self._store._fetchone(
            "SET NOCOUNT ON;"
            " MERGE cluster_config WITH (HOLDLOCK) AS t USING (SELECT 1 AS id) AS s ON t.id = s.id"
            " WHEN MATCHED THEN UPDATE SET id = t.id"  # no-op update so OUTPUT yields the current row
            " WHEN NOT MATCHED THEN INSERT (id, config_version, updated_at) VALUES (1, 0, ?)"
            " OUTPUT inserted.config_version AS config_version;",
            (time.time(),),
        )
        assert row is not None, "cluster_config upsert returned no row"
        self._config_version = int(row["config_version"])
        return self._config_version

    async def bump_config_version(self) -> int:
        """Atomically increment + cache the cluster config version (operator reload on THIS node)."""
        now = time.time()
        row = await self._store._fetchone(
            "SET NOCOUNT ON;"
            " MERGE cluster_config WITH (HOLDLOCK) AS t USING (SELECT 1 AS id) AS s ON t.id = s.id"
            " WHEN MATCHED THEN UPDATE SET config_version = t.config_version + 1, updated_at = ?"
            " WHEN NOT MATCHED THEN INSERT (id, config_version, updated_at) VALUES (1, 1, ?)"
            " OUTPUT inserted.config_version AS config_version;",
            (now, now),
        )
        assert row is not None, "cluster_config upsert returned no row"
        # feedback-avoidance: see our own new value
        self._config_version = int(row["config_version"])
        return self._config_version

    # --- observability -------------------------------------------------------

    async def cluster_members(self) -> list[ClusterMember]:
        """One :class:`ClusterMember` per node; ``is_leader`` derived as the single freshest fresh
        ``is_leader``-flagged node (so a crashed ex-leader's stale flag is never the live leader)."""
        rows = await self._store._fetchall(
            "SELECT node_id, host, pid, started_at, last_seen, status, is_leader"
            " FROM nodes ORDER BY node_id"
        )
        now = time.time()
        leader_node_id: str | None = None
        leader_last_seen: float = -1.0
        for r in rows:
            last_seen = r["last_seen"]
            fresh = last_seen is not None and (now - last_seen) <= self._node_timeout_seconds
            if bool(r["is_leader"]) and fresh and last_seen > leader_last_seen:
                leader_last_seen = last_seen
                leader_node_id = r["node_id"]
        return [
            ClusterMember(
                node_id=r["node_id"],
                host=r["host"],
                pid=int(r["pid"]) if r["pid"] is not None else None,
                started_at=r["started_at"],
                last_seen=r["last_seen"],
                status=r["status"],
                is_leader=(r["node_id"] == leader_node_id),
            )
            for r in rows
        ]

    async def leadership_lease(self) -> tuple[str | None, float | None]:
        """The authoritative lease state ``(owner, DB-clock expiry)`` for the observability API;
        ``(None, None)`` before any lease row exists."""
        row = await self._store._fetchone(
            "SELECT owner, lease_expires_at FROM leader_lease WHERE lease_key = ?",
            (self._lease_key,),
        )
        if row is None:
            return (None, None)
        return (row["owner"], row["lease_expires_at"])

    # --- internals -----------------------------------------------------------

    def _log_cluster_enabled_once(self) -> None:
        """Emit the "coordination enabled" line at most once per process (module-level latch)."""
        global _logged_cluster_enabled
        if _logged_cluster_enabled:
            return
        _logged_cluster_enabled = True
        log.info(
            "cluster: SQL Server active-passive coordination enabled for node %s — exactly one node "
            "holds the leadership lease and drains the graph; standbys stay leader-gated until failover",
            self.node_id,
        )

    async def _ensure_tables(self) -> None:
        """Create the nodes / leader_lease / cluster_config tables under the store's transaction-scoped
        applock (serializes concurrent first-opens so two nodes can't race the CREATE TABLEs)."""
        async with self._store._acquire() as conn:
            cur = await conn.cursor()
            try:
                # A real statement MUST precede sp_getapplock @LockOwner='Transaction', or it does not
                # release on commit (the Phase-1 audit-chain gotcha). This benign SELECT is that statement.
                await cur.execute("SELECT 1")
                await cur.fetchone()
                await self._store._applock(cur, self._lock_key)
                await cur.execute(
                    "IF OBJECT_ID(N'nodes', N'U') IS NULL"
                    " CREATE TABLE nodes ("
                    " node_id NVARCHAR(256) NOT NULL PRIMARY KEY, host NVARCHAR(256) NULL,"
                    " pid INT NULL, started_at FLOAT NULL, last_seen FLOAT NULL,"
                    " status NVARCHAR(32) NULL,"
                    " is_leader BIT NOT NULL CONSTRAINT DF_nodes_is_leader DEFAULT 0);"
                )
                await cur.execute(
                    "IF OBJECT_ID(N'leader_lease', N'U') IS NULL"
                    " CREATE TABLE leader_lease ("
                    " lease_key NVARCHAR(256) NOT NULL PRIMARY KEY, owner NVARCHAR(256) NULL,"
                    " lease_expires_at FLOAT NOT NULL);"
                )
                await cur.execute(
                    "IF OBJECT_ID(N'cluster_config', N'U') IS NULL"
                    " CREATE TABLE cluster_config ("
                    " id INT NOT NULL PRIMARY KEY, config_version INT NOT NULL,"
                    " updated_at FLOAT NOT NULL);"
                )
                await conn.commit()
            except Exception:
                await conn.rollback()
                raise

    async def _register(self) -> None:
        """Upsert this node's ``nodes`` row as ``active`` with ``is_leader`` cleared."""
        now = time.time()
        await self._store._execute(
            "SET NOCOUNT ON;"
            " MERGE nodes WITH (HOLDLOCK) AS t USING (SELECT ? AS node_id) AS s"
            " ON t.node_id = s.node_id"
            " WHEN MATCHED THEN UPDATE SET host=?, pid=?, started_at=?, last_seen=?, status=?,"
            " is_leader=0"
            " WHEN NOT MATCHED THEN INSERT (node_id, host, pid, started_at, last_seen, status,"
            " is_leader) VALUES (?, ?, ?, ?, ?, ?, 0);",
            (
                self.node_id,
                self._host,
                self._pid,
                now,
                now,
                "active",
                self.node_id,
                self._host,
                self._pid,
                now,
                now,
                "active",
            ),
        )

    async def heartbeat_once(self) -> None:
        """Refresh this node's ``last_seen`` / ``status`` and publish the cached leader flag."""
        # Refresh last_seen (wall clock, like DbCoordinator) and fold the current is_leader flag — the
        # flag lags by at most one tick (the loop beats before _maintain_leadership). Zero extra writes.
        await self._store._execute(
            "UPDATE nodes SET last_seen=?, status=?, is_leader=? WHERE node_id=?",
            (time.time(), "active", 1 if self._is_leader else 0, self.node_id),
        )

    async def _heartbeat_loop(self) -> None:
        """Maintenance loop: heartbeat, lease upkeep and config-version refresh once per tick.

        Each step is isolated: a failure is logged and the remaining steps (and later ticks) still
        run. Cancellation propagates.
        """
        while not self._stop.is_set():
            for step in (self.heartbeat_once, self._maintain_leadership, self.config_version):
                try:
                    await step()
                except asyncio.CancelledError:
                    raise
                except Exception as exc:
                    log.warning(
                        "cluster: %s failed for node %s; will retry: %s",
                        step.__name__,
                        self.node_id,
                        safe_exc(exc),
                    )
            try:
                await asyncio.wait_for(self._stop.wait(), timeout=self._heartbeat_seconds)
            except asyncio.TimeoutError:
                continue

    async def _maintain_leadership(self) -> None:
        """Acquire or renew the lease and update the cached leader flag + fence stamp.

        The fence stamp is the monotonic time taken BEFORE the DB round trip. The DB computes the
        expiry as its own ``now + TTL`` while executing the statement, i.e. no earlier than the moment
        the request was sent, so ``stamp + fence_timeout`` (with fence timeout < TTL) always precedes
        the real expiry. Stamping after the response would let a slow renewal push the fence deadline
        past the lease expiry and reopen the split-brain window the watchdog exists to close.
        """
        attempt_started = self._monotonic()
        held = await self._claim_or_renew_lease()
        if held:
            if self._monotonic() - attempt_started > self._fence_timeout:
                # The round trip alone outlasted the fence window, so the lease just granted can no
                # longer be trusted to outlive a full fence period. Stay (or become) non-leader and
                # let the next tick try for a timely renewal.
                self._is_leader = False
                log.warning(
                    "cluster: node %s lease renewal outlasted the %.1fs fence timeout; not asserting "
                    "leadership until a timely renewal",
                    self.node_id,
                    self._fence_timeout,
                )
                return
            self._last_renew_ok = attempt_started  # stamp for the fence watchdog
            if not self._is_leader:
                self._is_leader = True
                log.info("cluster: node %s acquired leadership (lease)", self.node_id)
        elif self._is_leader:
            self._is_leader = False
            log.info("cluster: node %s lost leadership (lease taken or expired)", self.node_id)

    async def _claim_or_renew_lease(self) -> bool:
        """Atomically acquire (fresh / expired) or renew (already ours) the single leadership lease, all
        against the DB clock. Held iff the OUTPUT row exists AND names us. ``HOLDLOCK`` makes the
        take-over race serializable on the lease key (the analog of PG's single-statement upsert)."""
        row = await self._store._fetchone(
            "SET NOCOUNT ON;"
            f" DECLARE @now FLOAT = {_DB_NOW};"
            " MERGE leader_lease WITH (HOLDLOCK) AS t USING (SELECT ? AS lease_key) AS s"
            " ON t.lease_key = s.lease_key"
            " WHEN MATCHED AND (t.owner = ? OR t.lease_expires_at < @now)"
            " THEN UPDATE SET owner = ?, lease_expires_at = @now + ?"
            " WHEN NOT MATCHED"
            " THEN INSERT (lease_key, owner, lease_expires_at) VALUES (?, ?, @now + ?)"
            " OUTPUT inserted.owner AS owner;",
            (
                self._lease_key,
                self.node_id,
                self.node_id,
                self._lease_ttl,
                self._lease_key,
                self.node_id,
                self._lease_ttl,
            ),
        )
        return row is not None and row["owner"] == self.node_id

    async def _fence_watchdog_loop(self) -> None:
        """Run :meth:`_check_fence` every ``_fence_tick`` seconds until stop is requested (no DB I/O)."""
        while not self._stop.is_set():
            try:
                await asyncio.wait_for(self._stop.wait(), timeout=self._fence_tick)
                return  # stop requested
            except asyncio.TimeoutError:
                pass
            self._check_fence()

    def _check_fence(self) -> None:
        """Demote the cached leader flag if the last confirmed renewal is older than the fence timeout."""
        if not self._is_leader:
            return
        last = self._last_renew_ok
        if last is None:
            return  # defensive: _is_leader is only set alongside _last_renew_ok
        if self._monotonic() - last > self._fence_timeout:
            self._is_leader = False
            log.warning(
                "cluster: node %s SELF-FENCED — leadership lease not renewed within %.1fs (fence "
                "timeout); halting leader work before the lease (TTL %.1fs) can expire",
                self.node_id,
                self._fence_timeout,
                self._lease_ttl,
            )

    async def _release_leadership(self) -> None:
        """Clear the cached leader state and, if this node was leader, expire its lease row.

        A failure to expire the row is logged, not raised; the lease then lapses by TTL.
        """
        was_leader = self._is_leader
        self._is_leader = False
        self._last_renew_ok = None
        if not was_leader:
            return
        try:
            await self._store._execute(
                "UPDATE leader_lease SET lease_expires_at = 0 WHERE lease_key = ? AND owner = ?",
                (self._lease_key, self.node_id),
            )
        except Exception as exc:
            log.warning(
                "cluster: node %s failed to release the leadership lease (it will expire on its "
                "own): %s",
                self.node_id,
                safe_exc(exc),
            )
440
+
441
+
442
def default_sqlserver_node_id() -> str:
    """Return ``default_node_id()`` so callers can build a node id without importing cluster.py.

    A convenience re-export shim: it adds nothing to the value produced by
    :func:`messagefoundry.pipeline.cluster.default_node_id`.
    """
    node_id: str = default_node_id()
    return node_id
@@ -0,0 +1,137 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 MessageFoundry Organization and contributors
3
+ """Cluster-wide config-reload convergence (Track B Step 6).
4
+
5
+ When an operator reloads config on ONE clustered node, that node bumps a shared
6
+ ``cluster_config.config_version`` token (see :class:`~messagefoundry.pipeline.cluster.DbCoordinator`).
7
+ :class:`ConfigConvergenceRunner` is the engine-owned background loop on every node that polls the
8
+ coordinator's *cached* version each tick and, when it observes a version higher than the one this node
9
+ has applied, re-reads **this node's own** (identically-deployed) startup config dir and applies it — so
10
+ a single operator reload propagates to the whole cluster without per-node operator action.
11
+
12
+ **Feedback-avoidance.** The node that initiated the reload already advanced its applied version (the
13
+ engine sets ``_applied_config_version`` right after bumping), so its own loop sees no change and does
14
+ NOT re-reload — only the OTHER nodes, whose applied version is behind, converge.
15
+
16
+ **Homogeneous-config assumption.** The version token coordinates *when* nodes reload; each node reloads
17
+ its OWN config dir. Skewed config dirs would diverge — the same assumption as Step 4's
18
+ dead-letter-missing-destinations/handlers sweeps (clustered nodes run identical config).
19
+
20
+ It mirrors :class:`~messagefoundry.pipeline.leader_tasks.LeaderMaintenanceRunner`'s shape: construct
21
+ with the engine convergence callbacks + coordinator + interval, call :meth:`start`/:meth:`stop` for the
22
+ supervised loop, or :meth:`converge_once` for a single deterministic pass (tests). The engine only
23
+ spawns it in clustered mode (``coordinator.is_clustered()``), so single-node / SQLite never pays for it.
24
+
25
+ Engine-side and dependency-light (stdlib + the cluster seam only), so it never pulls the API or console
26
+ into the engine.
27
+ """
28
+
29
+ from __future__ import annotations
30
+
31
+ import asyncio
32
+ import logging
33
+ from collections.abc import Awaitable, Callable
34
+
35
+ from messagefoundry.pipeline.cluster import ClusterCoordinator
36
+
37
+ __all__ = ["ConfigConvergenceRunner"]
38
+
39
+ log = logging.getLogger(__name__)
40
+
41
+
42
+ class ConfigConvergenceRunner:
43
+ """Polls the shared config version each tick and reloads this node's config when it falls behind.
44
+
45
+ Construct with: ``applied_version`` (a getter for this node's currently-applied config version),
46
+ ``set_applied_version`` (a setter the runner calls after a successful convergence reload), and
47
+ ``reload`` (an awaitable that re-applies this node's local startup config, NON-propagating). The
48
+ runner never imports the engine — it takes these as plain callbacks so the dependency direction
49
+ stays one-way (pipeline only).
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ coordinator: ClusterCoordinator,
55
+ *,
56
+ applied_version: Callable[[], int],
57
+ set_applied_version: Callable[[int], None],
58
+ reload: Callable[[], Awaitable[None]],
59
+ interval_seconds: float,
60
+ ) -> None:
61
+ self._coordinator = coordinator
62
+ self._applied_version = applied_version
63
+ self._set_applied_version = set_applied_version
64
+ self._reload = reload
65
+ self._interval_seconds = interval_seconds
66
+ self._stop = asyncio.Event()
67
+ self._task: asyncio.Task[None] | None = None
68
+
69
+ # --- lifecycle -----------------------------------------------------------
70
+
71
+ def start(self) -> None:
72
+ """Spawn the supervised convergence loop (idempotent: a second call while running is a no-op)."""
73
+ if self._task is not None:
74
+ return
75
+ self._stop.clear()
76
+ self._task = asyncio.create_task(self._run())
77
+ log.info(
78
+ "cluster config convergence enabled: polling the shared config version every %gs",
79
+ self._interval_seconds,
80
+ )
81
+
82
+ async def stop(self) -> None:
83
+ """Signal the loop and await its exit (idempotent)."""
84
+ self._stop.set()
85
+ task = self._task
86
+ self._task = None
87
+ if task is not None:
88
+ task.cancel()
89
+ try:
90
+ await task
91
+ except asyncio.CancelledError:
92
+ pass
93
+
94
+ async def _run(self) -> None:
95
+ # One isolated pass per interval; an error in a pass (e.g. a bad local config) is logged and the
96
+ # loop continues — a convergence hiccup must never take the node down. Cooperatively cancellable.
97
+ while not self._stop.is_set():
98
+ try:
99
+ await self.converge_once()
100
+ except asyncio.CancelledError:
101
+ raise
102
+ except Exception:
103
+ log.exception("cluster: config convergence pass failed; will retry next interval")
104
+ await self._sleep(self._interval_seconds)
105
+
106
+ async def _sleep(self, delay: float) -> None:
107
+ """Sleep up to ``delay``, waking immediately on stop (so shutdown isn't held by the interval)."""
108
+ try:
109
+ await asyncio.wait_for(self._stop.wait(), delay)
110
+ except asyncio.TimeoutError:
111
+ pass
112
+
113
+ # --- one pass ------------------------------------------------------------
114
+
115
+ async def converge_once(self) -> bool:
116
+ """If the shared config version is ahead of this node's applied version, reload this node's own
117
+ config dir to converge and advance the applied version. Returns whether a reload happened.
118
+
119
+ The poll reads the coordinator's CACHED version (cheap/synchronous, refreshed on the
120
+ coordinator's maintenance tick). The node that initiated the reload already advanced its applied
121
+ version (feedback-avoidance), so only nodes that are behind reload."""
122
+ shared = self._coordinator.config_version_cached()
123
+ if shared <= self._applied_version():
124
+ return False
125
+ log.info(
126
+ "cluster: shared config version %d is ahead of this node's applied %d; converging",
127
+ shared,
128
+ self._applied_version(),
129
+ )
130
+ # Re-read THIS node's own startup config dir (NON-propagating: convergence, not initiation — it
131
+ # must not bump the token again, or nodes would chase each other's reloads). A bad local config
132
+ # raises here; the loop isolates it (logged, the node keeps running its current graph).
133
+ await self._reload()
134
+ # Only advance the applied version after a clean reload, so a failed convergence retries next
135
+ # tick rather than silently skipping the version it couldn't apply.
136
+ self._set_applied_version(shared)
137
+ return True