messagefoundry 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- messagefoundry/__init__.py +108 -0
- messagefoundry/__main__.py +1155 -0
- messagefoundry/api/__init__.py +27 -0
- messagefoundry/api/app.py +1581 -0
- messagefoundry/api/approvals.py +184 -0
- messagefoundry/api/auth_models.py +211 -0
- messagefoundry/api/auth_routes.py +655 -0
- messagefoundry/api/field_authz.py +96 -0
- messagefoundry/api/models.py +374 -0
- messagefoundry/api/security.py +247 -0
- messagefoundry/api/tls.py +47 -0
- messagefoundry/auth/__init__.py +39 -0
- messagefoundry/auth/data/common_passwords.NOTICE +13 -0
- messagefoundry/auth/data/common_passwords.txt +10000 -0
- messagefoundry/auth/identity.py +71 -0
- messagefoundry/auth/ldap.py +264 -0
- messagefoundry/auth/notifications.py +68 -0
- messagefoundry/auth/passwords.py +53 -0
- messagefoundry/auth/permissions.py +120 -0
- messagefoundry/auth/policy.py +153 -0
- messagefoundry/auth/ratelimit.py +55 -0
- messagefoundry/auth/service.py +1323 -0
- messagefoundry/auth/tokens.py +26 -0
- messagefoundry/auth/totp.py +174 -0
- messagefoundry/checks.py +174 -0
- messagefoundry/config/__init__.py +30 -0
- messagefoundry/config/active_environment.py +80 -0
- messagefoundry/config/ai_policy.py +140 -0
- messagefoundry/config/code_sets.py +260 -0
- messagefoundry/config/connections_edit.py +200 -0
- messagefoundry/config/connections_file.py +287 -0
- messagefoundry/config/db_lookup.py +117 -0
- messagefoundry/config/environments.py +116 -0
- messagefoundry/config/ingest_time.py +83 -0
- messagefoundry/config/models.py +240 -0
- messagefoundry/config/reference.py +158 -0
- messagefoundry/config/response.py +83 -0
- messagefoundry/config/run_context.py +153 -0
- messagefoundry/config/settings.py +1311 -0
- messagefoundry/config/state.py +99 -0
- messagefoundry/config/tls_policy.py +110 -0
- messagefoundry/config/wiring.py +1918 -0
- messagefoundry/console/__init__.py +20 -0
- messagefoundry/console/__main__.py +274 -0
- messagefoundry/console/_async.py +107 -0
- messagefoundry/console/change_password.py +111 -0
- messagefoundry/console/client.py +552 -0
- messagefoundry/console/connections.py +324 -0
- messagefoundry/console/login.py +107 -0
- messagefoundry/console/mfa.py +205 -0
- messagefoundry/console/reauth.py +94 -0
- messagefoundry/console/search.py +57 -0
- messagefoundry/console/service_control.py +137 -0
- messagefoundry/console/sessions.py +122 -0
- messagefoundry/console/shell.py +410 -0
- messagefoundry/console/status.py +377 -0
- messagefoundry/console/users_page.py +282 -0
- messagefoundry/console/widgets.py +553 -0
- messagefoundry/generators/README.md +27 -0
- messagefoundry/generators/__init__.py +15 -0
- messagefoundry/generators/_core.py +589 -0
- messagefoundry/generators/_hl7data.py +428 -0
- messagefoundry/generators/adt.py +286 -0
- messagefoundry/generators/all_types.py +24 -0
- messagefoundry/generators/bar.py +28 -0
- messagefoundry/generators/dft.py +20 -0
- messagefoundry/generators/mdm.py +39 -0
- messagefoundry/generators/mfn.py +46 -0
- messagefoundry/generators/oml.py +32 -0
- messagefoundry/generators/orl.py +30 -0
- messagefoundry/generators/orm.py +23 -0
- messagefoundry/generators/oru.py +21 -0
- messagefoundry/generators/ras.py +20 -0
- messagefoundry/generators/rde.py +54 -0
- messagefoundry/generators/siu.py +64 -0
- messagefoundry/generators/vxu.py +20 -0
- messagefoundry/hl7schema.py +75 -0
- messagefoundry/last_resort.py +55 -0
- messagefoundry/logging_setup.py +332 -0
- messagefoundry/parsing/__init__.py +64 -0
- messagefoundry/parsing/consistency.py +166 -0
- messagefoundry/parsing/groups.py +228 -0
- messagefoundry/parsing/message.py +453 -0
- messagefoundry/parsing/peek.py +237 -0
- messagefoundry/parsing/split.py +120 -0
- messagefoundry/parsing/summary.py +46 -0
- messagefoundry/parsing/tree.py +128 -0
- messagefoundry/parsing/validate.py +95 -0
- messagefoundry/parsing/x12/__init__.py +46 -0
- messagefoundry/parsing/x12/delimiters.py +140 -0
- messagefoundry/parsing/x12/errors.py +30 -0
- messagefoundry/parsing/x12/interchange.py +232 -0
- messagefoundry/parsing/x12/message.py +200 -0
- messagefoundry/parsing/x12/peek.py +207 -0
- messagefoundry/pipeline/__init__.py +21 -0
- messagefoundry/pipeline/alert_sinks.py +486 -0
- messagefoundry/pipeline/alerts.py +100 -0
- messagefoundry/pipeline/cert_expiry.py +219 -0
- messagefoundry/pipeline/cluster.py +955 -0
- messagefoundry/pipeline/cluster_sqlserver.py +444 -0
- messagefoundry/pipeline/config_convergence.py +137 -0
- messagefoundry/pipeline/dryrun.py +450 -0
- messagefoundry/pipeline/engine.py +756 -0
- messagefoundry/pipeline/leader_tasks.py +158 -0
- messagefoundry/pipeline/reference_sync.py +369 -0
- messagefoundry/pipeline/retention.py +289 -0
- messagefoundry/pipeline/security_notify.py +168 -0
- messagefoundry/pipeline/state_convergence.py +143 -0
- messagefoundry/pipeline/wiring_runner.py +1722 -0
- messagefoundry/py.typed +0 -0
- messagefoundry/redaction.py +71 -0
- messagefoundry/scaffold.py +321 -0
- messagefoundry/secrets_dpapi.py +129 -0
- messagefoundry/store/__init__.py +46 -0
- messagefoundry/store/audit_tee.py +67 -0
- messagefoundry/store/base.py +758 -0
- messagefoundry/store/crypto.py +166 -0
- messagefoundry/store/keyprovider.py +192 -0
- messagefoundry/store/postgres.py +3447 -0
- messagefoundry/store/sqlserver.py +3014 -0
- messagefoundry/store/store.py +3790 -0
- messagefoundry/timezone.py +207 -0
- messagefoundry/transports/__init__.py +50 -0
- messagefoundry/transports/base.py +269 -0
- messagefoundry/transports/database.py +693 -0
- messagefoundry/transports/file.py +551 -0
- messagefoundry/transports/framing.py +164 -0
- messagefoundry/transports/loopback.py +53 -0
- messagefoundry/transports/mllp.py +644 -0
- messagefoundry/transports/remotefile.py +664 -0
- messagefoundry/transports/rest.py +281 -0
- messagefoundry/transports/signing.py +321 -0
- messagefoundry/transports/soap.py +507 -0
- messagefoundry/transports/tcp.py +307 -0
- messagefoundry/transports/timer.py +146 -0
- messagefoundry/transports/x12.py +323 -0
- messagefoundry-0.1.0.dist-info/METADATA +212 -0
- messagefoundry-0.1.0.dist-info/RECORD +142 -0
- messagefoundry-0.1.0.dist-info/WHEEL +4 -0
- messagefoundry-0.1.0.dist-info/entry_points.txt +2 -0
- messagefoundry-0.1.0.dist-info/licenses/LICENSE +662 -0
- messagefoundry-0.1.0.dist-info/licenses/NOTICE +27 -0
|
@@ -0,0 +1,756 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 MessageFoundry Organization and contributors
|
|
3
|
+
"""The engine: owns the store and supervises the code-first :class:`RegistryRunner`.
|
|
4
|
+
|
|
5
|
+
This is the object the API layer (and tests) drive. It opens the durable store, recovers
|
|
6
|
+
any deliveries left ``inflight`` by a previous crash, and runs the wired Connection/Router/
|
|
7
|
+
Handler graph.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import asyncio
|
|
13
|
+
import logging
|
|
14
|
+
import time
|
|
15
|
+
from collections.abc import Callable, Mapping, Sequence
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from messagefoundry.config.models import (
|
|
20
|
+
AckAfter,
|
|
21
|
+
BuildupThreshold,
|
|
22
|
+
InternalErrorPolicy,
|
|
23
|
+
OrderingMode,
|
|
24
|
+
RetryPolicy,
|
|
25
|
+
)
|
|
26
|
+
from messagefoundry.config.settings import (
|
|
27
|
+
CertMonitorSettings,
|
|
28
|
+
ClusterSettings,
|
|
29
|
+
EgressSettings,
|
|
30
|
+
ReferenceSettings,
|
|
31
|
+
RetentionSettings,
|
|
32
|
+
ShadowSettings,
|
|
33
|
+
)
|
|
34
|
+
from messagefoundry.config.wiring import Registry, WiringError, load_config
|
|
35
|
+
from messagefoundry.pipeline.alerts import AlertSink
|
|
36
|
+
from messagefoundry.pipeline.cert_expiry import CertExpiryRunner, MonitoredCert, certs_from_registry
|
|
37
|
+
from messagefoundry.pipeline.cluster import ClusterCoordinator, NullCoordinator
|
|
38
|
+
from messagefoundry.pipeline.config_convergence import ConfigConvergenceRunner
|
|
39
|
+
from messagefoundry.pipeline.leader_tasks import LeaderMaintenanceRunner
|
|
40
|
+
from messagefoundry.pipeline.reference_sync import ReferenceSyncRunner
|
|
41
|
+
from messagefoundry.pipeline.retention import RetentionRunner
|
|
42
|
+
from messagefoundry.pipeline.state_convergence import StateConvergenceRunner
|
|
43
|
+
from messagefoundry.pipeline.wiring_runner import RegistryRunner
|
|
44
|
+
from messagefoundry.store import MessageStore, Store
|
|
45
|
+
|
|
46
|
+
__all__ = ["Engine", "ConfigReloadDenied"]
|
|
47
|
+
|
|
48
|
+
log = logging.getLogger(__name__)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class ConfigReloadDenied(Exception):
    """Raised when a ``/config/reload`` target lies outside the allowed reload roots.

    This is a remote-code-execution guard. The config loader executes Python found in the
    target directory, so a reload may only load from the server's startup ``--config``
    directory or from an explicitly configured ``config_reload_roots`` entry; an arbitrary
    client-supplied path is never accepted. The API maps this exception to a 403.
    """
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _within(path: Path, root: Path) -> bool:
    """Report whether ``path`` equals ``root`` or sits somewhere beneath it.

    Both arguments are expected to be resolved already; this is a pure path comparison and
    does not touch the filesystem.
    """
    # One membership test covers both cases: the path itself, then each of its ancestors.
    return root in (path, *path.parents)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class Engine:
|
|
65
|
+
def __init__(
|
|
66
|
+
self,
|
|
67
|
+
store: Store,
|
|
68
|
+
*,
|
|
69
|
+
poll_interval: float = 0.25,
|
|
70
|
+
max_correlation_depth: int = 8,
|
|
71
|
+
config_dir: str | Path | None = None,
|
|
72
|
+
config_reload_roots: Sequence[str | Path] = (),
|
|
73
|
+
inbound_bind_host: str = "127.0.0.1",
|
|
74
|
+
allow_insecure_bind: bool = False,
|
|
75
|
+
delivery_defaults: RetryPolicy | None = None,
|
|
76
|
+
ordering_default: OrderingMode | None = None,
|
|
77
|
+
internal_error_default: InternalErrorPolicy | None = None,
|
|
78
|
+
buildup_default: BuildupThreshold | None = None,
|
|
79
|
+
ack_after_default: AckAfter | None = None,
|
|
80
|
+
alert_sink: AlertSink | None = None,
|
|
81
|
+
retention_settings: RetentionSettings | None = None,
|
|
82
|
+
cert_monitor_settings: CertMonitorSettings | None = None,
|
|
83
|
+
api_tls_cert_file: str | None = None,
|
|
84
|
+
reference_settings: ReferenceSettings | None = None,
|
|
85
|
+
egress_settings: EgressSettings | None = None,
|
|
86
|
+
shadow_settings: ShadowSettings | None = None,
|
|
87
|
+
active_environment: str | None = None,
|
|
88
|
+
env_values: Mapping[str, Any] | None = None,
|
|
89
|
+
env_values_provider: Callable[[], Mapping[str, Any]] | None = None,
|
|
90
|
+
coordinator: ClusterCoordinator | None = None,
|
|
91
|
+
cluster_settings: ClusterSettings | None = None,
|
|
92
|
+
) -> None:
|
|
93
|
+
self.store = store
|
|
94
|
+
# Cluster coordination seam (Track B Step 3). None → the no-op NullCoordinator, so single-node
|
|
95
|
+
# (SQLite and single-node Postgres) is byte-identical: is_leader()/owns_lane() are always True
|
|
96
|
+
# and start()/stop() do nothing. A DbCoordinator (built by build_coordinator on an enabled
|
|
97
|
+
# [cluster] Postgres store) registers the node + heartbeats and (Step 4) elects a leader; its
|
|
98
|
+
# owns_lane() still reports True until Step 5. Threaded into every runner this engine builds.
|
|
99
|
+
self._coordinator: ClusterCoordinator = coordinator or NullCoordinator()
|
|
100
|
+
# [cluster] knobs (Track B Step 4). Only reclaim_interval_seconds is read here (the cadence of
|
|
101
|
+
# the leader's lease-reclaim sweep); the rest drive build_coordinator upstream. None → the
|
|
102
|
+
# ClusterSettings() defaults, which is fine because the leader sweep only spawns when the
|
|
103
|
+
# coordinator reclaims inflight rows (i.e. a DbCoordinator), never for the single-node default.
|
|
104
|
+
self._cluster_settings = cluster_settings or ClusterSettings()
|
|
105
|
+
self._leader_maintenance: LeaderMaintenanceRunner | None = None
|
|
106
|
+
# Config-reload convergence (Track B Step 6). Spawned ONLY in clustered mode (is_clustered()),
|
|
107
|
+
# so single-node never pays for it. _applied_config_version is the shared config version this
|
|
108
|
+
# node has applied; seeded at start() to the coordinator's current version (so a fresh node
|
|
109
|
+
# doesn't self-reload) and advanced when this node bumps (operator reload) or converges (follower
|
|
110
|
+
# reload). The node that bumps advances it itself, so its own convergence loop sees no change.
|
|
111
|
+
self._config_convergence: ConfigConvergenceRunner | None = None
|
|
112
|
+
self._applied_config_version: int = 0
|
|
113
|
+
# Transform-state read-through convergence (Track B Step 6b). Spawned ONLY in clustered mode
|
|
114
|
+
# (is_clustered()), so single-node never pays for it. Each tick it read-throughs any namespace a
|
|
115
|
+
# sibling node wrote/purged into this node's local _state_cache (off the hot path, so state_get
|
|
116
|
+
# stays a pure sync dict lookup). Mirrors _config_convergence's lifecycle.
|
|
117
|
+
self._state_convergence: StateConvergenceRunner | None = None
|
|
118
|
+
# The active environment name ([ai].environment / serve --env), passed to every runner this
|
|
119
|
+
# engine builds so a Handler's current_environment() resolves to it (per-face transform logic).
|
|
120
|
+
self._active_environment = active_environment
|
|
121
|
+
self._poll_interval = poll_interval
|
|
122
|
+
# [pipeline] re-ingress loop-prevention cap (ADR 0013 Increment 2); every runner inherits it.
|
|
123
|
+
self._max_correlation_depth = max_correlation_depth
|
|
124
|
+
# Where the runner reports operational alerts; None → the runner's default logging sink.
|
|
125
|
+
self._alert_sink = alert_sink
|
|
126
|
+
# [retention] enforcement. None (embedding/tests) → no retention task; the runner itself is a
|
|
127
|
+
# no-op when nothing is configured, so passing default settings is also safe.
|
|
128
|
+
self._retention_settings = retention_settings
|
|
129
|
+
self._retention_runner: RetentionRunner | None = None
|
|
130
|
+
# [cert_monitor] TLS-cert expiry monitor (Q5c). None (embedding/tests) → no monitor task. The
|
|
131
|
+
# set of certs to watch is derived at scan time from the [api] TLS cert + the wired graph's MLLP
|
|
132
|
+
# certs (read live, so a reload that adds/removes a TLS connection is picked up).
|
|
133
|
+
self._cert_monitor_settings = cert_monitor_settings
|
|
134
|
+
self._api_tls_cert_file = api_tls_cert_file
|
|
135
|
+
self._cert_expiry_runner: CertExpiryRunner | None = None
|
|
136
|
+
# [reference] enforcement (ADR 0006). None (embedding/tests) → default settings; the reference
|
|
137
|
+
# sync runner is a no-op when the graph declares no reference sets.
|
|
138
|
+
self._reference_settings = reference_settings
|
|
139
|
+
self._reference_runner: ReferenceSyncRunner | None = None
|
|
140
|
+
# Fail-closed outbound destination allowlist (WP-11c); passed to every runner this engine builds
|
|
141
|
+
# (and the reload dry-run checker), so a denied destination is refused at start + on reload.
|
|
142
|
+
self._egress_settings = egress_settings
|
|
143
|
+
# [shadow] parallel-run egress suppression (#15); simulate_all_egress is threaded into every
|
|
144
|
+
# runner this engine builds so a shadow instance suppresses all delivery. None → defaults (off).
|
|
145
|
+
self._shadow_settings = shadow_settings or ShadowSettings()
|
|
146
|
+
# The interface inbound listeners bind to; every runner this engine builds inherits it.
|
|
147
|
+
self._inbound_bind_host = inbound_bind_host
|
|
148
|
+
# The serve --allow-insecure-bind dev escape; every runner inherits it for the §0 exposed-gate.
|
|
149
|
+
self._allow_insecure_bind = allow_insecure_bind
|
|
150
|
+
# Global [delivery] defaults (retry + ordering + internal-error action + buildup thresholds);
|
|
151
|
+
# every runner inherits them. A connection's own retry=/ordering=/internal_error=/buildup= wins.
|
|
152
|
+
self._delivery_defaults = delivery_defaults
|
|
153
|
+
self._ordering_default = ordering_default
|
|
154
|
+
self._internal_error_default = internal_error_default
|
|
155
|
+
self._buildup_default = buildup_default
|
|
156
|
+
# Global [inbound] ACK-timing default (ADR 0001); every runner inherits it.
|
|
157
|
+
self._ack_after_default = ack_after_default
|
|
158
|
+
# This instance's environment values (DEV/PROD), shared with every runner the engine builds —
|
|
159
|
+
# so env() references in a reloaded graph resolve against THIS environment (and a missing
|
|
160
|
+
# value is refused here, on this engine, not on the box the graph was authored on). The
|
|
161
|
+
# optional provider is re-invoked on each reload so a promote picks up edited values files
|
|
162
|
+
# without a restart (review M-23); without it the values are static (embedding/tests).
|
|
163
|
+
self._env_values_provider = env_values_provider
|
|
164
|
+
initial = env_values_provider() if env_values_provider is not None else env_values
|
|
165
|
+
self._env_values: dict[str, Any] = dict(initial or {})
|
|
166
|
+
self._registry_runner: RegistryRunner | None = None
|
|
167
|
+
# Active-passive graph supervisor (Workstream A1). In CLUSTERED mode the wired graph (listeners
|
|
168
|
+
# + workers) runs ONLY while this node holds leadership: this task polls leadership and
|
|
169
|
+
# starts/stops the graph on acquire/lose, so a standby stays warm without binding listeners or
|
|
170
|
+
# processing. NEVER spawned single-node (NullCoordinator is always leader, so the graph is
|
|
171
|
+
# brought up directly at start() — byte-identical). The lock serializes reconciles; the event
|
|
172
|
+
# stops the loop. NOTE the hard guarantee against concurrent double-processing of any given row
|
|
173
|
+
# is NOT this gate — it is the store's row/lane leases (a standby's reclaim only takes EXPIRED
|
|
174
|
+
# leases, so it can never claim a row the old leader still holds; Track B Step 2/5). This gate
|
|
175
|
+
# promptly stops a demoted/fenced node from accepting NEW inbound work and initiating NEW
|
|
176
|
+
# processing; the poll interval is bounded (at start()) to keep that stop prompt.
|
|
177
|
+
self._graph_supervisor: asyncio.Task[None] | None = None
|
|
178
|
+
self._graph_stop = asyncio.Event()
|
|
179
|
+
self._graph_lock = asyncio.Lock()
|
|
180
|
+
self._graph_reconcile_interval = 1.0
|
|
181
|
+
# Set when start() runs; the "since" for since-engine-start metric counts.
|
|
182
|
+
self.started_at: float = 0.0
|
|
183
|
+
# The startup config dir is the default reload target and an implicit allowed root.
|
|
184
|
+
self.config_dir: Path | None = Path(config_dir).resolve() if config_dir else None
|
|
185
|
+
roots = [Path(r).resolve() for r in config_reload_roots]
|
|
186
|
+
if self.config_dir is not None:
|
|
187
|
+
roots.append(self.config_dir)
|
|
188
|
+
# Empty => unconstrained (embedding/tests). The served path always sets config_dir.
|
|
189
|
+
self._reload_roots: tuple[Path, ...] = tuple(dict.fromkeys(roots))
|
|
190
|
+
# The directory the most recent reload loaded from (resolved) — for audit by the API.
|
|
191
|
+
self.last_reload_dir: Path | None = None
|
|
192
|
+
|
|
193
|
+
@classmethod
async def create(
    cls,
    db_path: str | Path,
    *,
    poll_interval: float = 0.25,
    max_correlation_depth: int = 8,
    synchronous: str = "NORMAL",
    config_dir: str | Path | None = None,
    config_reload_roots: Sequence[str | Path] = (),
    inbound_bind_host: str = "127.0.0.1",
    allow_insecure_bind: bool = False,
    delivery_defaults: RetryPolicy | None = None,
    ordering_default: OrderingMode | None = None,
    internal_error_default: InternalErrorPolicy | None = None,
    buildup_default: BuildupThreshold | None = None,
    ack_after_default: AckAfter | None = None,
    alert_sink: AlertSink | None = None,
    retention_settings: RetentionSettings | None = None,
    cert_monitor_settings: CertMonitorSettings | None = None,
    api_tls_cert_file: str | None = None,
    reference_settings: ReferenceSettings | None = None,
    egress_settings: EgressSettings | None = None,
    shadow_settings: ShadowSettings | None = None,
    active_environment: str | None = None,
    env_values: Mapping[str, Any] | None = None,
    env_values_provider: Callable[[], Mapping[str, Any]] | None = None,
    coordinator: ClusterCoordinator | None = None,
    cluster_settings: ClusterSettings | None = None,
) -> "Engine":
    """Build an engine on a SQLite store opened from ``db_path`` (tests/embedding helper).

    ``synchronous`` is forwarded to :meth:`MessageStore.open`; every other keyword is handed
    unchanged to the constructor. The service path does not come through here — it goes
    through :func:`~messagefoundry.store.open_store`, which is backend-agnostic. This
    convenience path normally leaves ``coordinator`` unset, which selects the no-op
    :class:`NullCoordinator` (single-node), so it is byte-identical to before the
    coordination seam existed.
    """
    sqlite_store = await MessageStore.open(db_path, synchronous=synchronous)
    return cls(
        sqlite_store,
        # polling / pipeline limits
        poll_interval=poll_interval,
        max_correlation_depth=max_correlation_depth,
        # config location + reload guard
        config_dir=config_dir,
        config_reload_roots=config_reload_roots,
        # inbound binding
        inbound_bind_host=inbound_bind_host,
        allow_insecure_bind=allow_insecure_bind,
        # global delivery / inbound defaults
        delivery_defaults=delivery_defaults,
        ordering_default=ordering_default,
        internal_error_default=internal_error_default,
        buildup_default=buildup_default,
        ack_after_default=ack_after_default,
        # alerting + maintenance settings
        alert_sink=alert_sink,
        retention_settings=retention_settings,
        cert_monitor_settings=cert_monitor_settings,
        api_tls_cert_file=api_tls_cert_file,
        reference_settings=reference_settings,
        # egress policy
        egress_settings=egress_settings,
        shadow_settings=shadow_settings,
        # environment
        active_environment=active_environment,
        env_values=env_values,
        env_values_provider=env_values_provider,
        # clustering
        coordinator=coordinator,
        cluster_settings=cluster_settings,
    )
|
|
254
|
+
|
|
255
|
+
# --- code-first wiring ---------------------------------------------------
|
|
256
|
+
|
|
257
|
+
def add_registry(self, registry: Registry) -> RegistryRunner:
    """Wrap a code-first Connection/Router/Handler graph in a single runner and keep it.

    One :class:`RegistryRunner` serves the whole graph. It is built with this engine's store
    and every engine-wide default captured at construction, stored as the engine's current
    runner, and returned to the caller.
    """
    self._registry_runner = RegistryRunner(
        registry,
        self.store,
        poll_interval=self._poll_interval,
        max_correlation_depth=self._max_correlation_depth,
        inbound_bind_host=self._inbound_bind_host,
        allow_insecure_bind=self._allow_insecure_bind,
        delivery_defaults=self._delivery_defaults,
        ordering_default=self._ordering_default,
        internal_error_default=self._internal_error_default,
        buildup_default=self._buildup_default,
        ack_after_default=self._ack_after_default,
        alert_sink=self._alert_sink,
        egress=self._egress_settings,
        # Only the shadow flag is forwarded, not the whole settings object.
        simulate_all=self._shadow_settings.simulate_all_egress,
        env_values=self._env_values,
        active_environment=self._active_environment,
        coordinator=self._coordinator,
    )
    return self._registry_runner
|
|
280
|
+
|
|
281
|
+
@property
|
|
282
|
+
def registry_runner(self) -> RegistryRunner | None:
|
|
283
|
+
return self._registry_runner
|
|
284
|
+
|
|
285
|
+
def _monitored_certs(self) -> list[MonitoredCert]:
    """List the TLS certs the engine is serving with at the moment of the call.

    That is the ``[api]`` cert plus the wired graph's MLLP ``tls_cert_file`` certs. The
    registry is read live off the current runner, so a config reload is reflected. This
    method is handed to the :class:`CertExpiryRunner` as its cert source so that each scan
    sees the current graph.
    """
    runner = self._registry_runner
    if runner is None:
        # No graph wired yet: only the [api] cert (if any) can be reported.
        live_registry = None
    else:
        live_registry = runner.registry
    return certs_from_registry(live_registry, self._api_tls_cert_file)
|
|
291
|
+
|
|
292
|
+
@property
|
|
293
|
+
def coordinator(self) -> ClusterCoordinator:
|
|
294
|
+
"""The cluster coordinator (NullCoordinator single-node, DbCoordinator clustered) — Track B
|
|
295
|
+
Step 7. A public accessor so the observability API reads membership/leadership through the
|
|
296
|
+
contract instead of reaching the private ``_coordinator`` attribute."""
|
|
297
|
+
return self._coordinator
|
|
298
|
+
|
|
299
|
+
# --- reference sets (ADR 0006) -------------------------------------------
|
|
300
|
+
|
|
301
|
+
def _make_reference_runner(self) -> ReferenceSyncRunner:
    """Construct the reference sync runner for this engine.

    The runner receives a callable rather than a fixed list of specs. The callable reads the
    declarations **live** from whichever registry is current, so declarations swapped in by
    a reload are picked up without rebuilding the runner.
    """

    def current_reference_specs():
        # Evaluated on every call, never cached: follows the registry across reloads.
        runner = self._registry_runner
        if runner is None:
            return []
        return runner.registry.references.values()

    return ReferenceSyncRunner(
        self.store,
        current_reference_specs,
        self._reference_settings or ReferenceSettings(),
        env_values=self._env_values,
        egress=self._egress_settings,
        alert_sink=self._alert_sink,
        # Track B Step 6: materialize-from-source is gated on the leader, while every node
        # still converges its read cache from the shared snapshot. NullCoordinator
        # (single-node) is always leader, so this materializes from source on every pass
        # exactly as before.
        coordinator=self._coordinator,
    )
|
|
320
|
+
|
|
321
|
+
async def _reconcile_reference_sync(self, *, startup: bool) -> None:
|
|
322
|
+
"""Ensure the reference runner exists, materialize the declared sets, and (re-)arm the loop.
|
|
323
|
+
|
|
324
|
+
Called at :meth:`start` and after every successful :meth:`reload`, so: a set added by a reload
|
|
325
|
+
materializes **immediately** (not only on the next refresh tick), a graph that goes from zero
|
|
326
|
+
reference sets to ≥1 across a reload actually starts the loop, and an engine started without a
|
|
327
|
+
graph then loaded via reload still gets a runner. ``start()`` is idempotent (a no-op when the
|
|
328
|
+
loop is already up). The pre-sync runs on a reload unconditionally (so a new set resolves on the
|
|
329
|
+
next message); at startup it honors ``[reference].sync_on_startup``. A sync failure is isolated
|
|
330
|
+
per-set (last-good kept) and never blocks start/reload."""
|
|
331
|
+
if self._reference_runner is None:
|
|
332
|
+
self._reference_runner = self._make_reference_runner()
|
|
333
|
+
if not startup or (self._reference_settings or ReferenceSettings()).sync_on_startup:
|
|
334
|
+
await self._reference_runner.sync_all()
|
|
335
|
+
self._reference_runner.start()
|
|
336
|
+
|
|
337
|
+
# --- lifecycle -----------------------------------------------------------
|
|
338
|
+
|
|
339
|
+
async def start(self) -> None:
    """Start the engine: recover crash residue, join the cluster, bring up the graph and loops.

    Steps run strictly in this order:

    1. Stamp ``started_at``.
    2. When the coordinator does not reclaim in-flight rows itself (single-node), return every
       row a crash left ``inflight`` to ``pending`` via ``store.reset_stale_inflight()``.
    3. Start the coordinator (membership + leader election) before any worker runs.
    4. Clustered only: enable the store's state-version bumping before workers start.
    5. With a wired graph: refuse to start on a store without the staged ingress pipeline, then
       either bring the graph up now (single-node) or only start the reference-sync loop and
       leave graph bring-up to the supervisor (clustered).
    6. Start the retention and cert-expiry runners when their settings are present.
    7. Start the lease-reclaim sweep when the coordinator reclaims in-flight rows and the store
       offers ``reclaim_expired_leases``; clustered only, start the config-convergence and
       state-convergence loops.
    8. Clustered with a wired graph: reconcile the graph once, then spawn the supervisor task.

    Raises:
        RuntimeError: a graph is wired and the store's ``supports_ingest_stage`` is falsy.
    """
    self.started_at = time.time()

    # Crash recovery (staged pipeline, ADR 0001). Single-node: reset this node's own residue
    # unconditionally. Clustered (Track B Step 4): skip it — the reset ignores leases and would
    # steal a live sibling's in-flight rows; the leader's periodic sweep started further down
    # recovers expired-lease rows instead.
    if not self._coordinator.reclaims_inflight():
        await self.store.reset_stale_inflight()

    # Membership + leader election come up before any worker (Track B Step 3/4), so the node's
    # heartbeat is registered the moment it starts processing.
    await self._coordinator.start()

    # Track B Step 6b: state-version bumping must be on before the workers' first state write so
    # a sibling's convergence loop can see it. Never called single-node.
    if self._coordinator.is_clustered():
        self.store.enable_state_convergence()

    if self._registry_runner is not None:
        # Refuse at startup — on every node, leader or standby — rather than wedging every
        # inbound at the first received message on a store that cannot stage ingress.
        staged_ingress_ok = getattr(self.store, "supports_ingest_stage", True)
        if not staged_ingress_ok:
            raise RuntimeError(
                "the configured store backend does not support the staged ingress pipeline "
                "(ADR 0001 Step A is SQLite-only; SQL Server staging is gated on BACKLOG #1) — "
                "use the sqlite backend"
            )
        if self._coordinator.is_clustered():
            # Active-passive (Workstream A1): only the leader runs the graph, so it is NOT
            # brought up here; the supervisor spawned at the end of this method starts it on
            # promotion. The reference-sync loop does start on every node so a follower
            # converges its read cache from the leader's snapshot.
            if self._reference_runner is None:
                self._reference_runner = self._make_reference_runner()
            self._reference_runner.start()
        else:
            # Single-node: bring the graph up immediately. Drift sweeps, reference
            # materialization and listener bring-up all live in _start_graph.
            await self._start_graph()

    # Retention/purge is store-level maintenance, independent of the graph. The coordinator is
    # handed over so the runner can gate its writes on leadership.
    if self._retention_settings is not None:
        self._retention_runner = RetentionRunner(
            self.store,
            self._retention_settings,
            alert_sink=self._alert_sink,
            coordinator=self._coordinator,
        )
        self._retention_runner.start()

    # TLS-cert expiry monitor (Q5c). Deliberately given no coordinator: certs are node-local
    # files, so every node watches its own.
    if self._cert_monitor_settings is not None:
        self._cert_expiry_runner = CertExpiryRunner(
            self._monitored_certs,
            self._cert_monitor_settings,
            alert_sink=self._alert_sink,
        )
        self._cert_expiry_runner.start()

    # Leader lease-reclaim sweep (Track B Step 4). Stores without per-row leases have no
    # `reclaim_expired_leases`; their failover recovery is the on-promotion reset in
    # _start_graph instead.
    if self._coordinator.reclaims_inflight() and hasattr(self.store, "reclaim_expired_leases"):
        self._leader_maintenance = LeaderMaintenanceRunner(
            self.store,  # type: ignore[arg-type]  # reclaim_expired_leases guarded above
            self._coordinator,
            interval_seconds=self._cluster_settings.reclaim_interval_seconds,
        )
        self._leader_maintenance.start()

    if self._coordinator.is_clustered():
        # Config-reload convergence (Track B Step 6). Seed the applied version from the shared
        # one BEFORE the loop starts so a freshly joined node does not immediately self-reload.
        self._applied_config_version = await self._coordinator.config_version()
        self._config_convergence = ConfigConvergenceRunner(
            self._coordinator,
            applied_version=lambda: self._applied_config_version,
            set_applied_version=self._set_applied_config_version,
            reload=self._converge_reload,
            interval_seconds=self._cluster_settings.heartbeat_seconds,
        )
        self._config_convergence.start()

        # Transform-state read-through convergence (Track B Step 6b), on the same heartbeat
        # interval and alert sink.
        self._state_convergence = StateConvergenceRunner(
            converge=self.store.converge_state_cache,
            interval_seconds=self._cluster_settings.heartbeat_seconds,
            alert_sink=self._alert_sink,
        )
        self._state_convergence.start()

    # Graph supervisor (Workstream A1): spawned last, after _leader_maintenance exists, and only
    # for a clustered node with a wired graph.
    if self._coordinator.is_clustered() and self._registry_runner is not None:
        lease_ttl = self._cluster_settings.leader_lease_ttl_seconds
        fence_timeout = self._cluster_settings.leader_fence_timeout_seconds
        # A third of the (ttl - fence) margin, clamped to the range [0.1, 1.0] seconds.
        third_of_margin = (lease_ttl - fence_timeout) / 3.0
        self._graph_reconcile_interval = max(0.1, min(1.0, third_of_margin))
        self._graph_stop.clear()
        # One synchronous reconcile first: a node that already holds leadership gets its graph
        # during start() instead of one poll interval later.
        await self._reconcile_graph()
        self._graph_supervisor = asyncio.create_task(self._graph_supervisor_loop())
|
|
474
|
+
|
|
475
|
+
# --- active-passive graph gating (Workstream A1/A3/A4) -------------------
|
|
476
|
+
|
|
477
|
+
async def _start_graph(self) -> None:
|
|
478
|
+
"""Bring the wired graph up: (A4) recover the prior leader's stranded in-flight rows + lane
|
|
479
|
+
leases on promotion, (A3) dead-letter rows whose outbound/handler left the config, materialize
|
|
480
|
+
reference sets, then start the listeners + workers. In a cluster this runs ONLY on the leader and
|
|
481
|
+
is (re)invoked on each leadership acquire; single-node runs it once at startup. Idempotent
|
|
482
|
+
against the runner's own ``running`` guard."""
|
|
483
|
+
if self._registry_runner is None:
|
|
484
|
+
return
|
|
485
|
+
# A4 — on promotion (clustered Postgres), recover the prior leader's stranded in-flight rows AND
|
|
486
|
+
# take over its lane leases IMMEDIATELY (owner-scoped, lease-blind), instead of waiting out the
|
|
487
|
+
# ~[store].lease_ttl_seconds per-row/lane lease TTL — which was the dominant failover-recovery
|
|
488
|
+
# delay (#293: ~60s on PG vs ~7s on SQL Server). This brings Postgres to parity with the SQL
|
|
489
|
+
# Server reset_stale_inflight path; the periodic, lease-GATED sweep keeps running in the
|
|
490
|
+
# background (clock-skew / future active-active recovery). Single-node has no leader maintenance
|
|
491
|
+
# (_leader_maintenance is None), and its own crash residue was already recovered by the
|
|
492
|
+
# unconditional reset_stale_inflight in start().
|
|
493
|
+
if self._leader_maintenance is not None:
|
|
494
|
+
await self._leader_maintenance.recover_on_promotion()
|
|
495
|
+
elif self._coordinator.is_clustered():
|
|
496
|
+
# Active-passive without per-row leases (SQL Server): on promotion, re-pend the prior
|
|
497
|
+
# leader's in-flight rows. The prior leader self-fenced and its leadership lease EXPIRED
|
|
498
|
+
# before this node could acquire it, so it has stopped processing — and the graph runs ONLY
|
|
499
|
+
# on the leader, so there is no live sibling whose rows an unconditional reset could steal.
|
|
500
|
+
# (Single-node NullCoordinator is_clustered() is False, so this never runs there; its boot
|
|
501
|
+
# residue was already recovered by the unconditional reset_stale_inflight in start().)
|
|
502
|
+
await self.store.reset_stale_inflight()
|
|
503
|
+
# A3 — dead-letter OUTBOUND/ROUTED rows whose destination/handler left the config (no worker
|
|
504
|
+
# would ever drain them). Now part of graph bring-up, so in a cluster ONLY the leader (the one
|
|
505
|
+
# node that runs the graph) sweeps — a restarting standby never dead-letters the primary's
|
|
506
|
+
# in-flight rows (the hazard the old unconditional placement carried). Single-node is unchanged
|
|
507
|
+
# (it always runs the graph). Keyed off THIS node's registry, so clustered nodes must still run
|
|
508
|
+
# identical config (a coordinated, not rolling, restart for config changes).
|
|
509
|
+
await self.store.dead_letter_missing_destinations(
|
|
510
|
+
set(self._registry_runner.registry.outbound)
|
|
511
|
+
)
|
|
512
|
+
await self.store.dead_letter_missing_handlers(set(self._registry_runner.registry.handlers))
|
|
513
|
+
# Reference sets (ADR 0006): materialize declared sets BEFORE listeners accept (a transform's
|
|
514
|
+
# reference(...) resolves on the first message), then keep the periodic loop running (idempotent
|
|
515
|
+
# — already started on every node in start() for clustered followers to converge). Leader-gated
|
|
516
|
+
# materialize inside the runner; a sync failure is isolated per-set and never blocks intake.
|
|
517
|
+
await self._reconcile_reference_sync(startup=True)
|
|
518
|
+
await self._registry_runner.start()
|
|
519
|
+
log.info("engine graph started — this node is processing")
|
|
520
|
+
|
|
521
|
+
async def _stop_graph(self) -> None:
|
|
522
|
+
"""Tear the graph down on loss of leadership: stop the listeners + workers so a demoted node
|
|
523
|
+
stops binding/processing. The reference-sync loop and the self-gated maintenance/convergence
|
|
524
|
+
loops keep running (a follower still converges its caches), so only the runner is stopped."""
|
|
525
|
+
if self._registry_runner is not None:
|
|
526
|
+
await self._registry_runner.stop()
|
|
527
|
+
log.info("engine graph stopped — this node is now standby")
|
|
528
|
+
|
|
529
|
+
async def _reconcile_graph(self) -> None:
|
|
530
|
+
"""Align the running graph with this node's leadership: start it on becoming leader, stop it on
|
|
531
|
+
losing leadership. Serialized by ``_graph_lock`` so overlapping triggers can't double act."""
|
|
532
|
+
if self._registry_runner is None:
|
|
533
|
+
return
|
|
534
|
+
async with self._graph_lock:
|
|
535
|
+
running = self._registry_runner.running
|
|
536
|
+
if self._coordinator.is_leader() and not running:
|
|
537
|
+
await self._start_graph()
|
|
538
|
+
# Leadership can be lost DURING the (potentially slow) bring-up — a fence mid-start. If
|
|
539
|
+
# so, tear straight back down within the same lock so a demoted node never keeps the
|
|
540
|
+
# graph running for a whole extra poll cycle.
|
|
541
|
+
if not self._coordinator.is_leader():
|
|
542
|
+
await self._stop_graph()
|
|
543
|
+
elif not self._coordinator.is_leader() and running:
|
|
544
|
+
await self._stop_graph()
|
|
545
|
+
|
|
546
|
+
async def _graph_supervisor_loop(self) -> None:
|
|
547
|
+
"""Active-passive graph supervisor (Workstream A1): poll leadership and start/stop the graph so
|
|
548
|
+
only the leader binds listeners + runs workers. Polled at ``_graph_reconcile_interval`` (kept
|
|
549
|
+
short so a demotion/fence promptly stops this node accepting + initiating new work; the row/lane
|
|
550
|
+
leases independently prevent concurrent double-processing of a given row). Clustered only;
|
|
551
|
+
cooperatively stopped via ``_graph_stop`` (the loop wakes on it and exits between reconciles)."""
|
|
552
|
+
while not self._graph_stop.is_set():
|
|
553
|
+
try:
|
|
554
|
+
await self._reconcile_graph()
|
|
555
|
+
except asyncio.CancelledError:
|
|
556
|
+
raise
|
|
557
|
+
except Exception:
|
|
558
|
+
log.exception("engine graph supervisor reconcile failed; will retry")
|
|
559
|
+
try:
|
|
560
|
+
await asyncio.wait_for(
|
|
561
|
+
self._graph_stop.wait(), timeout=self._graph_reconcile_interval
|
|
562
|
+
)
|
|
563
|
+
except asyncio.TimeoutError:
|
|
564
|
+
pass
|
|
565
|
+
|
|
566
|
+
def _set_applied_config_version(self, version: int) -> None:
|
|
567
|
+
"""Setter the convergence runner calls after a successful follower reload (Track B Step 6)."""
|
|
568
|
+
self._applied_config_version = version
|
|
569
|
+
|
|
570
|
+
async def _converge_reload(self) -> None:
|
|
571
|
+
"""Re-read THIS node's own startup config dir to converge on a cluster reload (Track B Step 6).
|
|
572
|
+
|
|
573
|
+
Non-propagating (``propagate=False``): this is convergence, not initiation, so it must NOT bump
|
|
574
|
+
the shared version token again (or nodes would chase each other's reloads). Passing ``None``
|
|
575
|
+
reloads the startup ``--config`` dir."""
|
|
576
|
+
await self.reload(propagate=False)
|
|
577
|
+
|
|
578
|
+
async def reload(
    self,
    config_dir: str | Path | None = None,
    *,
    dry_run: bool = False,
    propagate: bool = False,
) -> Registry:
    """Load the code-first graph from ``config_dir`` and apply it to the running engine.

    ``config_dir`` defaults to the server's startup ``--config`` dir. An explicit value must
    resolve within an allowed reload root, otherwise :class:`ConfigReloadDenied` is raised —
    the loader executes Python, so an arbitrary client path must never be honoured. The
    resolved directory is recorded on :attr:`last_reload_dir` for auditing (before it is
    checked for existence).

    Validation comes first: a bad config raises before anything is swapped, so the running
    graph is left untouched. The swap itself is the runner's quiesce-and-swap ``reload``. If
    the engine was started without a graph, this loads and starts one.

    Error messages name the directory as the caller supplied it; when ``config_dir`` is omitted
    they name the resolved startup directory instead of printing ``None``.

    NOTE(review): when no runner exists yet, this path calls ``runner.start()`` directly without
    consulting leadership — confirm that is intended for a clustered standby.

    Args:
        config_dir: directory to load, or ``None`` for the startup ``--config`` dir.
        dry_run: perform the full validation **against this instance's environment** — load the
            graph and build-check every connector, which resolves the graph's ``env()``
            references against this engine's values — then return **without swapping** the live
            graph. This is the promote pre-flight. Never bumps the shared version.
        propagate: (Track B Step 6) on a successful non-dry-run apply in a clustered deployment,
            bump the shared config version so every other node's convergence loop reloads its
            own config dir, and advance this node's applied version to the bumped value so its
            own loop does not re-reload. The operator-initiated path passes ``True``; the
            per-node convergence reload passes ``False``. A no-op on single-node.

    Returns:
        The newly loaded registry (also on a dry run).

    Raises:
        ConfigReloadDenied: the path is outside the allowed roots.
        FileNotFoundError: the resolved directory does not exist.
        WiringError: invalid or empty config, or an unresolved env value.
    """
    path = self._resolve_reload_target(config_dir)
    self.last_reload_dir = path
    # Name used in error messages. An explicit argument is echoed exactly as given; for the
    # default, name the startup directory that was actually used rather than the literal "None".
    shown = str(path) if config_dir is None else config_dir
    if not path.is_dir():
        raise FileNotFoundError(f"config directory not found: {shown}")

    # Re-gather this environment's values so a reload/promote picks up edited
    # environments/<env>.toml (or MEFOR_VALUE_* changes) without a restart — otherwise the
    # WiringError telling the operator to add a missing value would never clear (review M-23).
    if self._env_values_provider is not None:
        self._env_values = dict(self._env_values_provider())
        if self._registry_runner is not None:
            self._registry_runner.set_env_values(self._env_values)

    # Off the event loop: load_config executes user config modules (arbitrary, potentially
    # heavy imports), which would otherwise stall every listener mid-reload (review low-3).
    registry = await asyncio.to_thread(load_config, path)  # raises WiringError on a bad config
    if not registry.inbound and not registry.outbound:
        raise WiringError(
            f"config directory {shown!r} declares no connections — "
            "refusing to reload to an empty graph"
        )

    runner = self._registry_runner
    if dry_run:
        # Validate against THIS environment without swapping. Reuse the live runner if present;
        # else a throwaway one carrying the same bind host + env values, so the check sees
        # exactly what a real reload would.
        checker = runner or RegistryRunner(
            registry,
            self.store,
            poll_interval=self._poll_interval,
            inbound_bind_host=self._inbound_bind_host,
            delivery_defaults=self._delivery_defaults,
            ordering_default=self._ordering_default,
            internal_error_default=self._internal_error_default,
            buildup_default=self._buildup_default,
            ack_after_default=self._ack_after_default,
            alert_sink=self._alert_sink,
            egress=self._egress_settings,
            simulate_all=self._shadow_settings.simulate_all_egress,
            env_values=self._env_values,
            coordinator=self._coordinator,
        )
        checker.build_check(registry)
        return registry

    if runner is None:
        runner = self.add_registry(registry)
        try:
            runner.build_check(registry)  # bad connector → WiringError (422), before any start
            await runner.start()
        except Exception:
            # Don't leave a half-started runner: a later reload would take the "runner exists"
            # path and no-op the start, wedging intake. Clear it so a retry re-enters cleanly.
            self._registry_runner = None
            raise
    else:
        await runner.reload(registry)

    # Reference sets (ADR 0006): re-arm + materialize after the swap, so a reference set added
    # by this reload syncs immediately and a 0->N change actually starts the loop.
    await self._reconcile_reference_sync(startup=False)

    # Config-reload convergence (Track B Step 6): only the operator-initiated path propagates.
    # Taking the bumped value as this node's applied version is the feedback-avoidance step.
    if propagate and self._coordinator.is_clustered():
        self._applied_config_version = await self._coordinator.bump_config_version()
    return registry
|
|
682
|
+
|
|
683
|
+
def _resolve_reload_target(self, config_dir: str | Path | None) -> Path:
|
|
684
|
+
"""Resolve the reload target and enforce the allow-list (see :class:`ConfigReloadDenied`)."""
|
|
685
|
+
if config_dir is None:
|
|
686
|
+
if self.config_dir is None:
|
|
687
|
+
raise WiringError("no config directory configured; pass one to reload")
|
|
688
|
+
return self.config_dir
|
|
689
|
+
path = Path(config_dir).resolve()
|
|
690
|
+
if self._reload_roots and not any(_within(path, root) for root in self._reload_roots):
|
|
691
|
+
# Don't echo the rejected path back to the client (info disclosure); log it server-side.
|
|
692
|
+
log.warning("rejected /config/reload outside allowed roots: %s", path)
|
|
693
|
+
raise ConfigReloadDenied("config directory is not an allowed reload root")
|
|
694
|
+
return path
|
|
695
|
+
|
|
696
|
+
async def replay(self, message_id: str) -> int:
    """Re-queue every delivery for a message and wake the delivery workers.

    The workers are only woken when something was actually re-queued and a registry runner
    exists and is running — the same gating ``replay_dead`` applies — so a replay that matched
    nothing does not cause a spurious wake-up.

    Args:
        message_id: the message whose deliveries should be re-queued.

    Returns:
        The count reported by ``store.replay`` (number of deliveries re-queued).
    """
    requeued = await self.store.replay(message_id)
    if requeued and self._registry_runner is not None and self._registry_runner.running:
        self._registry_runner.notify_work()
    return requeued
|
|
702
|
+
|
|
703
|
+
async def replay_dead(
|
|
704
|
+
self, *, channel_id: str | None = None, destination_name: str | None = None
|
|
705
|
+
) -> int:
|
|
706
|
+
"""Re-queue dead-lettered deliveries (optionally scoped) and wake the delivery workers."""
|
|
707
|
+
requeued = await self.store.replay_dead(
|
|
708
|
+
channel_id=channel_id, destination_name=destination_name
|
|
709
|
+
)
|
|
710
|
+
if requeued and self._registry_runner is not None and self._registry_runner.running:
|
|
711
|
+
self._registry_runner.notify_work()
|
|
712
|
+
return requeued
|
|
713
|
+
|
|
714
|
+
async def stop(self) -> None:
    """Shut the engine down: supervisor, background runners, graph, coordinator, then store.

    Order of teardown:

    1. The graph supervisor (clustered only), so it cannot reconcile and re-start the graph
       while everything else is being torn down.
    2. The retention and cert-expiry runners, the leader sweep, and the config- and
       state-convergence loops — all before the coordinator they consult goes away.
    3. The reference runner, then the registry runner (the graph itself).
    4. The coordinator, which deregisters cluster membership after the runner has quiesced but
       before the store closes.
    5. The store.
    """
    log.info("engine stopping")

    supervisor = self._graph_supervisor
    if supervisor is not None:
        # Cooperative stop: raise the event and let an in-flight reconcile finish under its
        # lock so a half-started graph is never abandoned.
        self._graph_stop.set()
        self._graph_supervisor = None
        try:
            await asyncio.wait_for(supervisor, timeout=10.0)
        except asyncio.TimeoutError:
            # A reconcile hung past the timeout and wait_for has already cancelled the task;
            # absorb that cancellation here.
            await asyncio.gather(supervisor, return_exceptions=True)

    # Runners that must quiesce while the coordinator is still up. Each is absent (None) when
    # it was never spawned.
    for attr in (
        "_retention_runner",
        "_cert_expiry_runner",
        "_leader_maintenance",
        "_config_convergence",
    ):
        background = getattr(self, attr)
        if background is not None:
            await background.stop()

    # Transform-state convergence polls the store, so it stops before the store goes away; it
    # is the one runner whose reference is dropped afterwards (Track B Step 6b).
    if self._state_convergence is not None:
        await self._state_convergence.stop()
        self._state_convergence = None

    for attr in ("_reference_runner", "_registry_runner"):
        graph_part = getattr(self, attr)
        if graph_part is not None:
            await graph_part.stop()

    await self._coordinator.stop()
    await self.store.close()
|