messagefoundry 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. messagefoundry/__init__.py +108 -0
  2. messagefoundry/__main__.py +1155 -0
  3. messagefoundry/api/__init__.py +27 -0
  4. messagefoundry/api/app.py +1581 -0
  5. messagefoundry/api/approvals.py +184 -0
  6. messagefoundry/api/auth_models.py +211 -0
  7. messagefoundry/api/auth_routes.py +655 -0
  8. messagefoundry/api/field_authz.py +96 -0
  9. messagefoundry/api/models.py +374 -0
  10. messagefoundry/api/security.py +247 -0
  11. messagefoundry/api/tls.py +47 -0
  12. messagefoundry/auth/__init__.py +39 -0
  13. messagefoundry/auth/data/common_passwords.NOTICE +13 -0
  14. messagefoundry/auth/data/common_passwords.txt +10000 -0
  15. messagefoundry/auth/identity.py +71 -0
  16. messagefoundry/auth/ldap.py +264 -0
  17. messagefoundry/auth/notifications.py +68 -0
  18. messagefoundry/auth/passwords.py +53 -0
  19. messagefoundry/auth/permissions.py +120 -0
  20. messagefoundry/auth/policy.py +153 -0
  21. messagefoundry/auth/ratelimit.py +55 -0
  22. messagefoundry/auth/service.py +1323 -0
  23. messagefoundry/auth/tokens.py +26 -0
  24. messagefoundry/auth/totp.py +174 -0
  25. messagefoundry/checks.py +174 -0
  26. messagefoundry/config/__init__.py +30 -0
  27. messagefoundry/config/active_environment.py +80 -0
  28. messagefoundry/config/ai_policy.py +140 -0
  29. messagefoundry/config/code_sets.py +260 -0
  30. messagefoundry/config/connections_edit.py +200 -0
  31. messagefoundry/config/connections_file.py +287 -0
  32. messagefoundry/config/db_lookup.py +117 -0
  33. messagefoundry/config/environments.py +116 -0
  34. messagefoundry/config/ingest_time.py +83 -0
  35. messagefoundry/config/models.py +240 -0
  36. messagefoundry/config/reference.py +158 -0
  37. messagefoundry/config/response.py +83 -0
  38. messagefoundry/config/run_context.py +153 -0
  39. messagefoundry/config/settings.py +1311 -0
  40. messagefoundry/config/state.py +99 -0
  41. messagefoundry/config/tls_policy.py +110 -0
  42. messagefoundry/config/wiring.py +1918 -0
  43. messagefoundry/console/__init__.py +20 -0
  44. messagefoundry/console/__main__.py +274 -0
  45. messagefoundry/console/_async.py +107 -0
  46. messagefoundry/console/change_password.py +111 -0
  47. messagefoundry/console/client.py +552 -0
  48. messagefoundry/console/connections.py +324 -0
  49. messagefoundry/console/login.py +107 -0
  50. messagefoundry/console/mfa.py +205 -0
  51. messagefoundry/console/reauth.py +94 -0
  52. messagefoundry/console/search.py +57 -0
  53. messagefoundry/console/service_control.py +137 -0
  54. messagefoundry/console/sessions.py +122 -0
  55. messagefoundry/console/shell.py +410 -0
  56. messagefoundry/console/status.py +377 -0
  57. messagefoundry/console/users_page.py +282 -0
  58. messagefoundry/console/widgets.py +553 -0
  59. messagefoundry/generators/README.md +27 -0
  60. messagefoundry/generators/__init__.py +15 -0
  61. messagefoundry/generators/_core.py +589 -0
  62. messagefoundry/generators/_hl7data.py +428 -0
  63. messagefoundry/generators/adt.py +286 -0
  64. messagefoundry/generators/all_types.py +24 -0
  65. messagefoundry/generators/bar.py +28 -0
  66. messagefoundry/generators/dft.py +20 -0
  67. messagefoundry/generators/mdm.py +39 -0
  68. messagefoundry/generators/mfn.py +46 -0
  69. messagefoundry/generators/oml.py +32 -0
  70. messagefoundry/generators/orl.py +30 -0
  71. messagefoundry/generators/orm.py +23 -0
  72. messagefoundry/generators/oru.py +21 -0
  73. messagefoundry/generators/ras.py +20 -0
  74. messagefoundry/generators/rde.py +54 -0
  75. messagefoundry/generators/siu.py +64 -0
  76. messagefoundry/generators/vxu.py +20 -0
  77. messagefoundry/hl7schema.py +75 -0
  78. messagefoundry/last_resort.py +55 -0
  79. messagefoundry/logging_setup.py +332 -0
  80. messagefoundry/parsing/__init__.py +64 -0
  81. messagefoundry/parsing/consistency.py +166 -0
  82. messagefoundry/parsing/groups.py +228 -0
  83. messagefoundry/parsing/message.py +453 -0
  84. messagefoundry/parsing/peek.py +237 -0
  85. messagefoundry/parsing/split.py +120 -0
  86. messagefoundry/parsing/summary.py +46 -0
  87. messagefoundry/parsing/tree.py +128 -0
  88. messagefoundry/parsing/validate.py +95 -0
  89. messagefoundry/parsing/x12/__init__.py +46 -0
  90. messagefoundry/parsing/x12/delimiters.py +140 -0
  91. messagefoundry/parsing/x12/errors.py +30 -0
  92. messagefoundry/parsing/x12/interchange.py +232 -0
  93. messagefoundry/parsing/x12/message.py +200 -0
  94. messagefoundry/parsing/x12/peek.py +207 -0
  95. messagefoundry/pipeline/__init__.py +21 -0
  96. messagefoundry/pipeline/alert_sinks.py +486 -0
  97. messagefoundry/pipeline/alerts.py +100 -0
  98. messagefoundry/pipeline/cert_expiry.py +219 -0
  99. messagefoundry/pipeline/cluster.py +955 -0
  100. messagefoundry/pipeline/cluster_sqlserver.py +444 -0
  101. messagefoundry/pipeline/config_convergence.py +137 -0
  102. messagefoundry/pipeline/dryrun.py +450 -0
  103. messagefoundry/pipeline/engine.py +756 -0
  104. messagefoundry/pipeline/leader_tasks.py +158 -0
  105. messagefoundry/pipeline/reference_sync.py +369 -0
  106. messagefoundry/pipeline/retention.py +289 -0
  107. messagefoundry/pipeline/security_notify.py +168 -0
  108. messagefoundry/pipeline/state_convergence.py +143 -0
  109. messagefoundry/pipeline/wiring_runner.py +1722 -0
  110. messagefoundry/py.typed +0 -0
  111. messagefoundry/redaction.py +71 -0
  112. messagefoundry/scaffold.py +321 -0
  113. messagefoundry/secrets_dpapi.py +129 -0
  114. messagefoundry/store/__init__.py +46 -0
  115. messagefoundry/store/audit_tee.py +67 -0
  116. messagefoundry/store/base.py +758 -0
  117. messagefoundry/store/crypto.py +166 -0
  118. messagefoundry/store/keyprovider.py +192 -0
  119. messagefoundry/store/postgres.py +3447 -0
  120. messagefoundry/store/sqlserver.py +3014 -0
  121. messagefoundry/store/store.py +3790 -0
  122. messagefoundry/timezone.py +207 -0
  123. messagefoundry/transports/__init__.py +50 -0
  124. messagefoundry/transports/base.py +269 -0
  125. messagefoundry/transports/database.py +693 -0
  126. messagefoundry/transports/file.py +551 -0
  127. messagefoundry/transports/framing.py +164 -0
  128. messagefoundry/transports/loopback.py +53 -0
  129. messagefoundry/transports/mllp.py +644 -0
  130. messagefoundry/transports/remotefile.py +664 -0
  131. messagefoundry/transports/rest.py +281 -0
  132. messagefoundry/transports/signing.py +321 -0
  133. messagefoundry/transports/soap.py +507 -0
  134. messagefoundry/transports/tcp.py +307 -0
  135. messagefoundry/transports/timer.py +146 -0
  136. messagefoundry/transports/x12.py +323 -0
  137. messagefoundry-0.1.0.dist-info/METADATA +212 -0
  138. messagefoundry-0.1.0.dist-info/RECORD +142 -0
  139. messagefoundry-0.1.0.dist-info/WHEEL +4 -0
  140. messagefoundry-0.1.0.dist-info/entry_points.txt +2 -0
  141. messagefoundry-0.1.0.dist-info/licenses/LICENSE +662 -0
  142. messagefoundry-0.1.0.dist-info/licenses/NOTICE +27 -0
@@ -0,0 +1,1722 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 MessageFoundry Organization and contributors
3
+ """Run a code-first wiring :class:`~messagefoundry.config.wiring.Registry` as a **staged pipeline**.
4
+
5
+ Staged pipeline (ADR 0001, Steps A–B): for each **inbound connection** a listener decodes/parses/
6
+ (strict-)validates each message **synchronously** (still NAKing those failures), then commits the
7
+ raw to the **ingress** stage and ACKs (**ACK-on-receipt**). Per inbound, a **router worker** then
8
+ runs the **Router** (handler names → routed-stage rows) and a **transform worker** runs the named
9
+ **Handlers** (filter → transform → ``Send``), **handing off** deliveries to the **outbound** stage in one
10
+ transaction. One delivery worker per **outbound connection** drains its rows (across all inbounds)
11
+ independently, with retries. Router/Handlers are pure; a re-run after a crash re-derives the same
12
+ output (at-least-once).
13
+
14
+ Every received message is persisted before the ACK (``RECEIVED``); its disposition is then recorded
15
+ as it flows (the count-and-log invariant): ``ROUTED`` (≥1 delivery → ``PROCESSED`` once drained),
16
+ ``UNROUTED`` (router routed nowhere), ``FILTERED`` (handlers dropped it), or ``ERROR``/dead-letter at
17
+ the failing stage. Decode/parse/validate failures still NAK + record ``ERROR`` synchronously;
18
+ routing/transform failures are post-ACK (no NAK — a logged ``ERROR``/dead-letter + alert).
19
+
20
+ Reuses the store, the connector registry, and the ACK builder.
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import asyncio
26
+ import functools
27
+ import json
28
+ import logging
29
+ import time
30
+ import urllib.parse
31
+ from collections.abc import Mapping
32
+ from pathlib import Path
33
+ from typing import Any
34
+
35
+ from messagefoundry.config.models import (
36
+ AckAfter,
37
+ AckMode,
38
+ BuildupThreshold,
39
+ ConnectorType,
40
+ ContentType,
41
+ Destination,
42
+ InternalErrorPolicy,
43
+ OrderingMode,
44
+ OutboundSigning,
45
+ RetryPolicy,
46
+ Source,
47
+ )
48
+ from messagefoundry.config.db_lookup import DbLookupError, activated as db_lookup_activated
49
+ from messagefoundry.config.run_context import RunContext, run_contexts
50
+ from messagefoundry.config.settings import EgressSettings
51
+ from messagefoundry.config.wiring import (
52
+ InboundConnection,
53
+ OutboundConnection,
54
+ Registry,
55
+ WiringError,
56
+ resolve_env_settings,
57
+ )
58
+ from messagefoundry.parsing import HL7PeekError, Peek, normalize, summarize, validate
59
+ from messagefoundry.pipeline.alerts import AlertSink, LoggingAlertSink
60
+ from messagefoundry.pipeline.cluster import ClusterCoordinator, NullCoordinator
61
+ from messagefoundry.redaction import safe_exc, safe_text
62
+ from messagefoundry.pipeline.dryrun import route_only, transform_one
63
+ from messagefoundry.store import MessageStatus, QueueStore, Stage
64
+ from messagefoundry.transports import (
65
+ DeliveryError,
66
+ DestinationConnector,
67
+ NegativeAckError,
68
+ SourceConnector,
69
+ build_destination,
70
+ build_source,
71
+ )
72
+ from messagefoundry.transports.database import DatabaseLookupExecutor
73
+ from messagefoundry.transports.mllp import build_ack
74
+
75
+ __all__ = ["RegistryRunner"]
76
+
77
+ log = logging.getLogger(__name__)
78
+
79
# Seconds a delivery worker backs off after an *unexpected* error (e.g. the store being briefly
# unavailable) before retrying, so a transient failure logs once and recovers instead of hot-looping.
_WORKER_ERROR_BACKOFF_SECONDS = 1.0

# Minimum seconds between queue_buildup alerts for one connection while its lane stays over
# threshold, so an ongoing stall reminds the operator without spamming on every backed-off retry.
_BUILDUP_REALERT_SECONDS = 300.0

# The ingress worker has no per-message "failure" to hang a buildup check on (a slow-but-working
# router just falls behind), so it polls the lane depth at most this often — bounding the extra
# COUNT+MIN query rate on the ingress hot path regardless of throughput.
# NOTE(review): presumably seconds, like the sibling constants — confirm at the use site.
_BUILDUP_CHECK_INTERVAL = 1.0

# How long the handler's worker thread blocks on a single db_lookup() before giving up (ADR 0010).
# A live lookup that exceeds this raises (→ the message's transform fails and dead-letters) rather than
# pinning a worker thread forever; the orphaned query still completes on the loop and releases its conn.
_LOOKUP_RESULT_TIMEOUT_SECONDS = 30.0
96
+
97
+
98
def _peek_for_loopback(
    ic: InboundConnection, body: str
) -> tuple[str | None, str | None, str | None, bool]:
    """Compute the ingress metadata for a loopback body that is being re-ingressed.

    This is the re-ingress worker's parsing step (ADR 0013 Increment 2, Q5). It lives in
    ``pipeline/`` rather than the store so the store stays parsing-free, mirroring how
    ``_handle_inbound`` peeks before ``enqueue_ingress``.

    Returns:
        ``(control_id, message_type, summary, peek_failed)``.

        * HL7V2 inbound: the body goes through ``Peek.parse``. On ``HL7PeekError`` the result is
          ``(None, None, None, True)`` so the child is recorded RECEIVED→ERROR instead of being
          dropped.
        * Any other ``content_type`` (x12/text/json): the body is relayed verbatim as a
          ``RawMessage`` — nothing is parsed and ``message_type`` is the content_type value.
    """
    # Non-HL7 payloads: no parse, only the content type's value is reported.
    if ic.content_type is not ContentType.HL7V2:
        return None, ic.content_type.value, None, False

    try:
        parsed = Peek.parse(body)
    except HL7PeekError:
        return None, None, None, True

    control_id = parsed.control_id
    message_type = parsed.message_type
    # An empty summary is normalised to None.
    return control_id, message_type, (summarize(parsed) or None), False
114
+
115
+
116
+ class RegistryRunner:
117
+ """Runs every inbound connection in a Registry + one delivery worker per outbound."""
118
+
119
def __init__(
    self,
    registry: Registry,
    store: QueueStore,
    *,
    poll_interval: float = 0.25,
    claim_limit: int = 20,
    inbound_bind_host: str = "127.0.0.1",
    allow_insecure_bind: bool = False,
    delivery_defaults: RetryPolicy | None = None,
    ordering_default: OrderingMode | None = None,
    internal_error_default: InternalErrorPolicy | None = None,
    buildup_default: BuildupThreshold | None = None,
    ack_after_default: AckAfter | None = None,
    alert_sink: AlertSink | None = None,
    egress: EgressSettings | None = None,
    simulate_all: bool = False,
    env_values: Mapping[str, Any] | None = None,
    active_environment: str | None = None,
    coordinator: ClusterCoordinator | None = None,
    max_correlation_depth: int = 8,
) -> None:
    """Record the graph, the store and the deployment-wide defaults; nothing is started here.

    Every optional argument left falsy falls back to its built-in default (a fresh
    ``RetryPolicy()``, ``OrderingMode.FIFO``, ``InternalErrorPolicy.CONTINUE``, a fresh
    ``BuildupThreshold()``, ``AckAfter.INGEST``, ``LoggingAlertSink()``, ``EgressSettings()``,
    ``NullCoordinator()``). Connectors and workers are created later, by ``start()``.
    """
    self.registry = registry
    self.store = store

    # Loop-prevention cap for re-ingress (ADR 0013 Increment 2), from [pipeline]
    # max_correlation_depth. A re-ingressed message AT this correlation depth still routes; the
    # next hop (depth+1) dead-letters its work-row and ERRORs the origin. Deliberately coarse: it
    # bounds total work, not topology.
    self._max_correlation_depth = max_correlation_depth

    # Cluster coordination seam (Track B Step 3). Held so Steps 4/5 can consult the cheap,
    # synchronous gates (is_leader / owns_lane) on the hot path — this step adds NO call sites, it
    # only stores + exposes the object. Without one we use the no-op NullCoordinator (every gate
    # True), so single-node operation is byte-identical to before the seam existed.
    self._coordinator: ClusterCoordinator = coordinator if coordinator else NullCoordinator()

    # Active environment name ([ai].environment / serve --env). Published around each
    # router/transform run so a Handler's current_environment() resolves (ADR 0006-style per-face
    # logic). It is a deployment constant, so reading it is pure / re-run-safe.
    self._active_environment = active_environment

    self.poll_interval = poll_interval
    self.claim_limit = claim_limit

    # Global outbound defaults (from [delivery]). Precedence for an outbound is:
    # per-connection override > global default > built-in.
    self._delivery_defaults = delivery_defaults if delivery_defaults else RetryPolicy()
    self._ordering_default = ordering_default if ordering_default else OrderingMode.FIFO
    self._internal_error_default = (
        internal_error_default if internal_error_default else InternalErrorPolicy.CONTINUE
    )
    self._buildup_default = buildup_default if buildup_default else BuildupThreshold()

    # Global inbound ACK-timing default (from [inbound]); a connection's own ack_after wins.
    # Step A only supports INGEST (ACK-on-receipt) — a resolved DELIVERED fails loud at start.
    self._ack_after_default = ack_after_default if ack_after_default else AckAfter.INGEST

    # Sink for operational stalls reported by the delivery workers (a stopped connection, a
    # building backlog). Logging sink until a real notifier is wired (docs/BACKLOG.md item 5).
    self._alert_sink: AlertSink = alert_sink if alert_sink else LoggingAlertSink()

    # Fail-closed outbound destination allowlist (WP-11c); empty = unrestricted. Enforced at
    # build_check (config load/reload) and at start, so a non-allowed destination is refused.
    self._egress = egress if egress else EgressSettings()

    # Deployment-wide shadow override ([shadow].simulate_all_egress, #15): when True, EVERY
    # outbound runs egress-suppressed regardless of its own simulate= flag. Folded into
    # self._simulate per connection at reconcile (per-connection simulate OR this).
    self._simulate_all = simulate_all

    # Interface the inbound listeners bind to (service-level; authors never set a host).
    # Loopback by default — see config.settings.InboundSettings.bind_host.
    self._inbound_bind_host = inbound_bind_host

    # True when `serve --allow-insecure-bind` was passed: the dev escape that downgrades the MLLP
    # exposed-gate (a non-loopback plaintext bind) from refuse to a loud warning (ADR 0002 §0).
    self._allow_insecure_bind = allow_insecure_bind

    # This instance's environment values (DEV/PROD). env() references in connection specs are
    # resolved against a private copy of this map whenever a connector is built (a missing key
    # fails loud — see resolve_env_settings).
    self._env_values: dict[str, Any] = dict(env_values if env_values else {})

    # Live connectors, keyed by connection name.
    self._sources: dict[str, SourceConnector] = {}
    self._destinations: dict[str, DestinationConnector] = {}

    # One delivery worker per outbound connection, keyed by name so a reload can gracefully
    # stop/swap a single connection's worker without touching its siblings.
    self._workers: dict[str, asyncio.Task[None]] = {}

    # Two workers per inbound connection (staged pipeline, ADR 0001 Step B): a ROUTER worker
    # drains the ingress stage (Router → routed-stage rows) and a TRANSFORM worker drains the
    # routed stage (handler transform → outbound rows). Both run whether or not the source is
    # listening, so messages already ACKed at ingress are always carried through (even while the
    # source is stopped). Keyed by inbound name so a reload/restart can re-arm one in place.
    self._router_workers: dict[str, asyncio.Task[None]] = {}
    self._transform_workers: dict[str, asyncio.Task[None]] = {}

    # ADR 0013 Increment 2: a RESPONSE worker per LOOPBACK inbound, draining its Stage.RESPONSE
    # tokens (a captured reply owes a re-ingress) via ingress_handoff. Other inbounds have none.
    self._response_workers: dict[str, asyncio.Task[None]] = {}

    # Per-outbound settings. Connector + retry are re-resolved per item from these maps, so a
    # reload can swap an outbound's settings under a running worker without tearing it down.
    self._retry: dict[str, RetryPolicy] = {}
    self._ordering: dict[str, OrderingMode] = {}
    self._internal_error: dict[str, InternalErrorPolicy] = {}
    self._buildup: dict[str, BuildupThreshold] = {}
    # Effective egress-suppression per connection (#15): its own simulate= OR simulate_all.
    self._simulate: dict[str, bool] = {}
    # Re-alert throttle: the earliest time a queue_buildup alert may fire again, per connection.
    self._next_buildup_alert: dict[str, float] = {}

    # Live-lookup executor (db_lookup, ADR 0010): built from registry.lookups at start/reload and
    # left None when the graph declares no DatabaseLookup — then the transform path stays
    # byte-identical (inline call, no thread hop, no runner). The engine loop is captured at
    # start so a handler's worker thread can bridge a db_lookup back onto it
    # (run_coroutine_threadsafe).
    self._lookup_executor: DatabaseLookupExecutor | None = None
    self._loop: asyncio.AbstractEventLoop | None = None

    self._stop = asyncio.Event()

    # One wake event per stage, so a producer wakes only its own downstream consumer class. With
    # a single shared auto-clearing event an idle worker of one class could swallow another
    # class's wakeup (lost wakeup) — masked by poll_interval, but defeating the prompt set().
    # Listener → router (_ingress_work); router → transform (_routed_work); transform / replay →
    # delivery (_work). Each worker class waits on (and clears) only its own event.
    self._ingress_work = asyncio.Event()
    self._routed_work = asyncio.Event()
    # ADR 0013 Increment 2: wakes the per-loopback re-ingress worker when a Stage.RESPONSE
    # work-row is produced — a sibling of _ingress_work/_routed_work.
    self._response_work = asyncio.Event()
    self._work = asyncio.Event()

    self._running = False
    # Serializes concurrent reloads (and the other lifecycle entrypoints that take it).
    self._reload_lock = asyncio.Lock()
231
+
232
+ @property
233
+ def running(self) -> bool:
234
+ return self._running
235
+
236
+ @property
237
+ def coordinator(self) -> ClusterCoordinator:
238
+ """The cluster coordinator threaded in by the engine (Track B Step 3). Steps 4/5 consume its
239
+ cheap, synchronous gates (``is_leader`` / ``owns_lane``); this step only exposes the object."""
240
+ return self._coordinator
241
+
242
+ def notify_work(self) -> None:
243
+ """Wake every stage worker now (e.g. after a replay re-queues rows at an unknown stage)."""
244
+ self._ingress_work.set()
245
+ self._routed_work.set()
246
+ self._response_work.set()
247
+ self._work.set()
248
+
249
+ def set_env_values(self, values: Mapping[str, Any]) -> None:
250
+ """Replace the environment values used to resolve ``env()`` refs when (re)building connectors.
251
+ The engine calls this on reload so a promote picks up edited values without a restart (M-23)."""
252
+ self._env_values = dict(values)
253
+
254
+ def _build_lookup_executor(self) -> DatabaseLookupExecutor | None:
255
+ """Build the pooled live-lookup executor from the current graph's ``DatabaseLookup`` specs, or
256
+ ``None`` if the graph declares none (so the transform path stays byte-identical — inline call,
257
+ no thread hop, no runner). Resolves ``env()`` in each spec and fail-closed egress-checks the
258
+ server, exactly like a DATABASE source. ``build_check`` already validated these on a reload, so
259
+ this won't raise there; at start a bad spec surfaces here and unwinds the partial start."""
260
+ if not self.registry.lookups:
261
+ return None
262
+ resolved: dict[str, dict[str, Any]] = {}
263
+ for name, spec in self.registry.lookups.items():
264
+ settings = resolve_env_settings(spec.settings, self._env_values)
265
+ check_lookup_allowed(name, settings, self._egress)
266
+ resolved[name] = settings
267
+ return DatabaseLookupExecutor(resolved)
268
+
269
def _run_lookup(
    self, connection: str, statement: str, params: Mapping[str, Any] | None
) -> list[dict[str, Any]]:
    """Run one ``db_lookup`` on behalf of a Handler and return its rows.

    This is the lookup runner published to Handlers. It is called FROM the handler's worker
    thread (``transform_one`` runs off the loop when lookups are declared): the async query is
    scheduled onto the engine loop with ``run_coroutine_threadsafe`` and the WORKER THREAD —
    never the loop — blocks for the result, for at most ``_LOOKUP_RESULT_TIMEOUT_SECONDS``.

    Raises:
        DbLookupError: no lookup executor or no engine loop is available.
    """
    # Snapshot both references once so the check and the use see the same objects.
    lookup_executor = self._lookup_executor
    engine_loop = self._loop
    # Only published when both exist; guarded defensively anyway.
    if lookup_executor is None or engine_loop is None:
        raise DbLookupError("db_lookup is unavailable — no lookup connections are configured")

    query = lookup_executor.query(connection, statement, params)
    pending = asyncio.run_coroutine_threadsafe(query, engine_loop)
    return pending.result(_LOOKUP_RESULT_TIMEOUT_SECONDS)
284
+
285
+ # --- per-connection control (console operations) -------------------------
286
+
287
+ def inbound_running(self, name: str) -> bool:
288
+ return name in self._sources
289
+
290
+ def outbound_simulated(self, name: str) -> bool:
291
+ """Whether the named outbound is in **simulate** mode — egress suppressed (#15). The *effective*
292
+ value (per-connection ``simulate=`` OR ``[shadow].simulate_all_egress``), for the ``/connections``
293
+ API + console so a simulated lane is unmissable.
294
+
295
+ Prefers the value resolved at reconcile (what the delivery worker actually uses, and the only
296
+ source for a *draining* outbound the registry no longer declares); falls back to resolving from
297
+ the registry for a connection that is declared but not yet reconciled (e.g. the metadata endpoint
298
+ on a not-yet-started engine)."""
299
+ if name in self._simulate:
300
+ return self._simulate[name]
301
+ oc = self.registry.outbound.get(name)
302
+ return (bool(oc.simulate) or self._simulate_all) if oc is not None else False
303
+
304
+ def _resolve_simulate(self, name: str, oc: OutboundConnection) -> bool:
305
+ """Resolve a connection's effective simulate flag and log **once** when a lane (newly) enters
306
+ simulate mode (so it's loud in the operator log, not just the API)."""
307
+ simulate = bool(oc.simulate) or self._simulate_all
308
+ if simulate and not self._simulate.get(name, False):
309
+ log.warning(
310
+ "outbound %r is in SIMULATE mode — real egress SUPPRESSED (no delivery to the live "
311
+ "peer); messages still finalize PROCESSED for shadow/parallel-run comparison (#15)",
312
+ name,
313
+ )
314
+ return simulate
315
+
316
+ def build_test_connector(self, name: str) -> tuple[str, SourceConnector | DestinationConnector]:
317
+ """Build a **fresh** connector for the named connection so it can be reachability-tested —
318
+ never the live one in ``_sources``/``_destinations`` (probing the live connector would disturb
319
+ running traffic). Resolves ``env()`` and enforces the ``[egress]`` allowlist fail-closed, the
320
+ same as a real build. Returns ``("in", source)`` or ``("out", destination)``. Raises
321
+ :class:`KeyError` if ``name`` isn't a connection, :class:`WiringError` on a bad ``env()`` /
322
+ egress. The caller closes the connector (``stop()`` / ``aclose()``) after testing."""
323
+ ic = self.registry.inbound.get(name)
324
+ if ic is not None:
325
+ source_cfg = _source_config(ic, self._inbound_bind_host, self._env_values)
326
+ check_source_allowed(source_cfg, name, self._egress)
327
+ return "in", build_source(source_cfg)
328
+ oc = self.registry.outbound.get(name)
329
+ if oc is not None:
330
+ dest_cfg = _dest_config(oc, self._env_values)
331
+ check_egress_allowed(dest_cfg, self._egress)
332
+ return "out", build_destination(dest_cfg)
333
+ raise KeyError(name)
334
+
335
+ async def start_inbound(self, name: str) -> None:
336
+ """Start receiving on one inbound connection (no-op if already listening).
337
+
338
+ Public console/API entrypoint — takes the reload lock so it can't race a concurrent
339
+ reload()/stop() mutating _sources/_workers (review M-10). Internal callers that already hold
340
+ the lock (start, reload) use :meth:`_start_inbound_unsafe`."""
341
+ async with self._reload_lock:
342
+ await self._start_inbound_unsafe(name)
343
+
344
+ async def stop_inbound(self, name: str) -> None:
345
+ """Stop receiving on one inbound connection (its delivery workers keep draining)."""
346
+ async with self._reload_lock:
347
+ await self._stop_inbound_unsafe(name)
348
+
349
async def restart_inbound(self, name: str) -> None:
    """Stop and then start one inbound connection as a single locked operation.

    Both halves run inside one reload-lock span, so the stop+start pair is atomic with respect to
    a concurrent reload (review M-10).
    """
    reload_lock = self._reload_lock
    async with reload_lock:
        await self._stop_inbound_unsafe(name)
        await self._start_inbound_unsafe(name)
354
+
355
+ async def _start_inbound_unsafe(self, name: str) -> None:
356
+ """start_inbound body without the reload lock — for callers that already hold it (start,
357
+ reload). asyncio.Lock isn't reentrant, so the public wrappers must not call each other."""
358
+ if name in self._sources:
359
+ return
360
+ ic = self.registry.inbound[name]
361
+ # Resolve + guard the ACK-timing setting (per-connection override > global default). Step A
362
+ # only ships ACK-on-receipt; reject a resolved 'delivered' loud at start/reload rather than
363
+ # silently downgrade (covers a global [inbound] ack_after='delivered' inherited by a
364
+ # connection — the per-connection case is already rejected in inbound()). Compare by VALUE,
365
+ # not identity: AckAfter is a str-Enum, so a stray raw-string 'delivered' must still be caught.
366
+ if (ic.ack_after or self._ack_after_default) == AckAfter.DELIVERED:
367
+ raise WiringError(
368
+ f"inbound connection {name!r}: ack_after='delivered' is not yet implemented "
369
+ "(Step A ships ACK-on-receipt only — use ack_after='ingest', the default)"
370
+ )
371
+ source_cfg = _source_config(ic, self._inbound_bind_host, self._env_values)
372
+ check_source_allowed(source_cfg, ic.name, self._egress) # fail-closed connect allowlist
373
+ # Exposed-gate (ADR 0002 §0): refuse a non-loopback MLLP listener without TLS at start.
374
+ check_mllp_tls_exposure(source_cfg, ic.name, allow_insecure_bind=self._allow_insecure_bind)
375
+ source = build_source(source_cfg)
376
+ # Leader-gate the source's intake (Track B Step 4b). is_leader is a cheap, synchronous bound
377
+ # method = Callable[[], bool]; passing the bound METHOD (not the coordinator) keeps transports/
378
+ # free of any pipeline/cluster import. Only POLL sources act on it — they skip a scan when it
379
+ # returns False so exactly one node ingests a shared external resource (a dir / DB table /
380
+ # remote dir); LISTEN sources (MLLP/TCP) accept-and-ignore it (each binds its own endpoint). For
381
+ # single-node (NullCoordinator) is_leader is always True, so every poll source scans as before.
382
+ # Bind BEFORE registering: a failed bind (e.g. port in use) must not leave a dead source in
383
+ # _sources, where inbound_running() would report True and a retry would no-op (review M-9).
384
+ await source.start(self._make_handler(ic), leader_gate=self._coordinator.is_leader)
385
+ self._sources[name] = source
386
+ # Once the source is live, note (start-time only, never per-tick) that a poll source's intake
387
+ # is leader-gated, so an operator reading the log knows only the leader polls this resource.
388
+ if getattr(source, "polls_shared_resource", False):
389
+ log.info(
390
+ "inbound %r polls a shared external resource; intake is leader-gated (only the "
391
+ "cluster leader polls it — single-node always does)",
392
+ name,
393
+ )
394
+ # Ensure this inbound's router + transform workers are running. They are registry-tied, not
395
+ # source-tied — so a per-connection start/restart, or a reload, re-arms a worker that exited
396
+ # (e.g. halted by the STOP internal-error policy), otherwise the restarted source would resume
397
+ # ACK-on-receipt into an ingress/routed backlog with nothing draining it. Idempotent (same guard
398
+ # reload() uses); only runs once the runner is up so start()'s own spawn loop owns first boot.
399
+ if self._running:
400
+ self._ensure_inbound_workers(name)
401
+
402
+ async def _stop_inbound_unsafe(self, name: str) -> None:
403
+ """stop_inbound body without the reload lock — for callers that already hold it."""
404
+ source = self._sources.pop(name, None)
405
+ if source is not None:
406
+ await source.stop()
407
+
408
async def start(self) -> None:
    """Bring the wiring graph up under the reload lock; a no-op when it is already running.

    Order: build each outbound connector (egress-checked first) and spawn its delivery worker, build
    the live-lookup executor, start every inbound source, then ensure the inbound workers. Any
    ``Exception`` on the way unwinds the partial start through ``_teardown_unsafe`` and is
    re-raised, leaving ``_running`` False.
    """
    async with self._reload_lock:
        if self._running:
            return
        self._stop.clear()
        # Remember the engine loop so a handler's worker thread can bridge a db_lookup back onto it.
        self._loop = asyncio.get_running_loop()
        try:
            for out_name, out_conn in self.registry.outbound.items():
                dest_cfg = _dest_config(out_conn, self._env_values)
                # Fail-closed egress allowlist (WP-11c): checked before the connector is built.
                check_egress_allowed(dest_cfg, self._egress)
                connector = build_destination(dest_cfg)
                # Registered before the capture check below, so the unwind also closes it.
                self._destinations[out_name] = connector
                # ADR 0013: fail closed at start if a capturing outbound is wired on a backend that
                # can't persist captures (the SQL Server preview) — never silently drop replies.
                wants_capture = getattr(connector, "capture_response", False)
                if wants_capture and not getattr(self.store, "supports_response_capture", True):
                    raise RuntimeError(
                        f"outbound {out_name!r} sets capture_response=True but the store backend does "
                        "not support request/response capture (ADR 0013); use the SQLite or "
                        "Postgres backend"
                    )
                # Per-connection policies fall back to the runner-wide defaults.
                self._retry[out_name] = out_conn.retry or self._delivery_defaults
                self._ordering[out_name] = out_conn.ordering or self._ordering_default
                self._internal_error[out_name] = (
                    out_conn.internal_error or self._internal_error_default
                )
                self._buildup[out_name] = out_conn.buildup or self._buildup_default
                self._simulate[out_name] = self._resolve_simulate(out_name, out_conn)
                self._spawn_worker(out_name)
            # Build the live-lookup executor from the graph (env-resolved + egress-checked here);
            # None when no DatabaseLookup is declared, keeping the transform path byte-identical.
            self._lookup_executor = self._build_lookup_executor()
            for inbound in self.registry.inbound.values():
                await self._start_inbound_unsafe(inbound.name)
            # A router + transform worker per inbound — spawned after the sources bind, so a bind
            # failure above unwinds before any inbound worker exists. They drain ingress→routed→
            # outbound, independently of the source's listen state.
            for in_name in self.registry.inbound:
                self._ensure_inbound_workers(in_name)
        except Exception:
            # A partial start (typically an inbound bind failure) must not leave half the graph
            # wired with _running still False — unwind everything we started so the listeners are
            # released and a retry can rebind the same ports (review M-8).
            log.exception("wiring start failed; unwinding the partial start")
            await self._teardown_unsafe()
            raise
        self._running = True
        inbound_count = len(self.registry.inbound)
        outbound_count = len(self.registry.outbound)
        log.info(
            "wiring started: %d inbound, %d outbound connection(s)",
            inbound_count,
            outbound_count,
        )
461
+
462
async def stop(self) -> None:
    """Tear the wiring down under the reload lock.

    Holding the lock serializes this against an in-flight reload (no torn-down state). The
    "wiring stopped" line is logged only when there was something to stop: the runner was running,
    or sources/workers/destinations were still registered.
    """
    async with self._reload_lock:
        anything_registered = bool(self._sources or self._workers or self._destinations)
        was_live = self._running or anything_registered
        await self._teardown_unsafe()
        if was_live:
            log.info("wiring stopped")
468
+
469
+ async def _teardown_unsafe(self) -> None:
470
+ """Tear down all sources/workers/destinations and mark stopped. Lock-free (callers hold
471
+ _reload_lock) and idempotent — cleans up whatever is registered even if the runner never
472
+ reached _running, so a half-started runner (review M-8) and a double stop() are both safe."""
473
+ self._stop.set()
474
+ self._ingress_work.set()
475
+ self._routed_work.set()
476
+ self._response_work.set()
477
+ self._work.set()
478
+ for source in self._sources.values():
479
+ await source.stop()
480
+ inbound_tasks = (
481
+ *self._router_workers.values(),
482
+ *self._transform_workers.values(),
483
+ *self._response_workers.values(),
484
+ )
485
+ for task in (*self._workers.values(), *inbound_tasks):
486
+ task.cancel()
487
+ await asyncio.gather(*self._workers.values(), *inbound_tasks, return_exceptions=True)
488
+ for connector in self._destinations.values():
489
+ await connector.aclose()
490
+ if self._lookup_executor is not None:
491
+ await self._lookup_executor.aclose()
492
+ self._lookup_executor = None
493
+ self._workers.clear()
494
+ self._router_workers.clear()
495
+ self._transform_workers.clear()
496
+ self._response_workers.clear()
497
+ self._destinations.clear()
498
+ self._retry.clear()
499
+ self._internal_error.clear()
500
+ self._buildup.clear()
501
+ self._simulate.clear()
502
+ self._next_buildup_alert.clear()
503
+ self._sources.clear()
504
+ self._running = False
505
+
506
+ # --- outbound worker management ------------------------------------------
507
+
508
+ def _spawn_worker(self, name: str) -> None:
509
+ """Start a delivery worker for one outbound connection (drains its outbox rows)."""
510
+ task = asyncio.create_task(self._delivery_worker(name))
511
+ task.add_done_callback(functools.partial(self._on_worker_done, name))
512
+ self._workers[name] = task
513
+
514
+ def _on_worker_done(self, name: str, task: asyncio.Task[None]) -> None:
515
+ """A delivery worker should only finish on shutdown — its loop swallows + backs off on
516
+ errors. If one somehow dies while the engine is running, log and respawn so the destination
517
+ keeps draining rather than silently stalling (review H-1)."""
518
+ if self._stop.is_set() or not self._running or task.cancelled():
519
+ return # expected shutdown / cancellation
520
+ if task.exception() is None:
521
+ return
522
+ if self._workers.get(name) is task: # still the registered worker (not mid-reconcile/stop)
523
+ log.error(
524
+ "delivery worker %r exited unexpectedly; respawning",
525
+ name,
526
+ exc_info=task.exception(),
527
+ )
528
+ self._spawn_worker(name)
529
+
530
+ def _inbound_worker_coro(self, kind: str): # type: ignore[no-untyped-def]
531
+ """The coroutine factory for an inbound worker ``kind`` (``router`` | ``transform`` |
532
+ ``response``). The ``response`` worker (ADR 0013) runs only for loopback inbounds."""
533
+ return {
534
+ "router": self._router_worker,
535
+ "transform": self._transform_worker,
536
+ "response": self._response_worker,
537
+ }[kind]
538
+
539
+ def _inbound_worker_dict(self, kind: str) -> dict[str, asyncio.Task[None]]:
540
+ return {
541
+ "router": self._router_workers,
542
+ "transform": self._transform_workers,
543
+ "response": self._response_workers,
544
+ }[kind]
545
+
546
+ def _ensure_inbound_workers(self, name: str) -> None:
547
+ """Ensure the router + transform (+ for a loopback inbound, the response) workers for one inbound
548
+ are running, spawning any that exited (a STOP-policy halt, a reload adding the inbound, or a
549
+ crash). Idempotent — the shared re-arm used by start(), start_inbound(), and reload()."""
550
+ kinds = ["router", "transform"]
551
+ ic = self.registry.inbound.get(name)
552
+ if ic is not None and ic.spec.type is ConnectorType.LOOPBACK:
553
+ # ADR 0013: a loopback inbound also gets a RESPONSE worker draining its Stage.RESPONSE tokens.
554
+ kinds.append("response")
555
+ for kind in kinds:
556
+ task = self._inbound_worker_dict(kind).get(name)
557
+ if task is None or task.done():
558
+ self._spawn_inbound_worker(kind, name)
559
+
560
+ def _spawn_inbound_worker(self, kind: str, name: str) -> None:
561
+ """Start the ``kind`` (router/transform) worker for one inbound connection."""
562
+ workers = self._inbound_worker_dict(kind)
563
+ task = asyncio.create_task(self._inbound_worker_coro(kind)(name))
564
+ task.add_done_callback(functools.partial(self._on_inbound_worker_done, kind, name))
565
+ workers[name] = task
566
+
567
+ def _on_inbound_worker_done(self, kind: str, name: str, task: asyncio.Task[None]) -> None:
568
+ """A router/transform worker should only finish on shutdown or a STOP-policy halt. If it dies
569
+ on an unexpected error while running, respawn it so the inbound keeps processing (mirrors the
570
+ delivery worker's supervisor). A STOP-policy halt returns normally (no exception) and is left
571
+ down until a reload re-arms it."""
572
+ if self._stop.is_set() or not self._running or task.cancelled():
573
+ return # expected shutdown / cancellation
574
+ if task.exception() is None:
575
+ return # normal return (e.g. STOP policy halted the lane) — not respawned
576
+ if self._inbound_worker_dict(kind).get(name) is task:
577
+ log.error(
578
+ "%s worker %r exited unexpectedly; respawning",
579
+ kind,
580
+ name,
581
+ exc_info=task.exception(),
582
+ )
583
+ self._spawn_inbound_worker(kind, name)
584
+
585
def build_check(self, registry: Registry) -> None:
    """Validate ``registry`` by constructing (and discarding) every connector in it.

    Delegates to ``build_check_registry`` with this runner's inbound bind host, environment values
    and egress policy. Running it first means a bad connector spec fails BEFORE a reload quiesces
    anything — the running graph is left untouched. Construction is side-effect-free (no socket
    bind / file I/O — binding happens later in ``start_inbound``).

    Raises:
        WiringError: on an invalid connector, so the API maps it to 422 like other invalid-config
            errors.
    """
    build_check_registry(
        registry,
        egress=self._egress,
        env_values=self._env_values,
        inbound_bind_host=self._inbound_bind_host,
    )
596
+
597
+ async def _reconcile_outbounds(self, old: Registry, new: Registry) -> None:
598
+ """Bring the outbound connectors/workers in line with ``new`` without tearing down a live
599
+ worker (so its in-flight outbox batch keeps draining). A worker re-resolves its connector
600
+ per item, so a changed connector is swapped in place; the old one is closed (a single racing
601
+ send at most fails and retries — outbounds are idempotent). An outbound dropped by ``new`` is
602
+ left running so rows already queued to it still drain. Connector builds here cannot fail —
603
+ :meth:`_build_check` already validated them before any quiesce."""
604
+ for name, oc in new.outbound.items():
605
+ # workers read retry + ordering + internal-error policy live each item, so a reload
606
+ # retunes (incl. re-arming a previously stopped connection) without a restart
607
+ self._retry[name] = oc.retry or self._delivery_defaults
608
+ self._ordering[name] = oc.ordering or self._ordering_default
609
+ self._internal_error[name] = oc.internal_error or self._internal_error_default
610
+ self._buildup[name] = oc.buildup or self._buildup_default
611
+ self._simulate[name] = self._resolve_simulate(name, oc)
612
+ worker = self._workers.get(name)
613
+ if worker is None or worker.done():
614
+ # added (or replacing a crashed worker): close any stale connector, build + spawn.
615
+ stale = self._destinations.pop(name, None)
616
+ if stale is not None:
617
+ await stale.aclose()
618
+ self._destinations[name] = build_destination(_dest_config(oc, self._env_values))
619
+ self._spawn_worker(name)
620
+ elif old.outbound.get(name) is None or old.outbound[name].spec != oc.spec:
621
+ # live worker, connector type/settings changed → swap in place, close the old one.
622
+ old_conn = self._destinations.get(name)
623
+ self._destinations[name] = build_destination(_dest_config(oc, self._env_values))
624
+ if old_conn is not None:
625
+ await old_conn.aclose()
626
+ # else: unchanged & live → leave the worker/connector as-is.
627
+ # Outbounds removed by ``new`` keep their worker so already-queued rows finish draining.
628
+
629
+ # --- atomic reload (quiesce-and-swap) ------------------------------------
630
+
631
async def reload(self, new_registry: Registry) -> None:
    """Atomically swap to ``new_registry`` on the running graph (whole-config swap).

    Quiesce-and-swap, in this order: (0) build-check every new connector — a bad spec raises
    here, before anything is touched, so the running graph is left intact; (1) stop accepting new
    inbound messages; (2) swap the registry, rebuild the live-lookup executor and restart the
    inbound listeners from it (Router/Handler changes take effect immediately — the inbound path
    reads ``self.registry`` live); (3) reconcile the outbound connectors/workers *without* tearing
    them down, so in-flight outbox rows keep draining (at-least-once preserved). If a step fails
    the previous graph's intake is restored before the error propagates. Restarting inbounds
    before reconciling outbounds means a slow/hung outbound never blocks the engine's intake.

    When the runner is not running, only the registry reference is replaced. On success every
    stage's wake event is set — ingress, routed, response and outbound.

    Raises:
        Exception: whatever ``build_check`` or a reload step raised; step failures are re-raised
            after the rollback.
    """
    async with self._reload_lock:
        self.build_check(new_registry)  # raises before any change on a bad connector
        if not self._running:
            self.registry = new_registry
            return

        old = self.registry
        old_inbound_names = list(self._sources)

        # 1. Quiesce intake: stop every inbound source so no NEW messages are accepted. A message
        #    already accepted is committed to the ingress stage by its handler and routed later by
        #    the router worker against the live registry, so stopping the source does not lose it.
        for name in old_inbound_names:
            # We hold _reload_lock — use the unsafe variant.
            await self._stop_inbound_unsafe(name)

        try:
            # 2. Swap the registry and restart inbound listeners from it (intake back up first).
            self.registry = new_registry
            # Rebuild the live-lookup executor from the new graph, closing the old pools.
            # build_check already validated the new specs, so a bad spec is not expected here.
            old_lookup_executor = self._lookup_executor
            self._lookup_executor = self._build_lookup_executor()
            if old_lookup_executor is not None:
                await old_lookup_executor.aclose()
            for ic in new_registry.inbound.values():
                await self._start_inbound_unsafe(ic.name)
            # 2b. Ensure the inbound workers run for every inbound in the new graph. Workers read
            #     self.registry live, so a Router/Handler change applies to rows processed after
            #     the swap; a removed inbound keeps its workers so residual ingress/routed rows
            #     still drain.
            for name in new_registry.inbound:
                self._ensure_inbound_workers(name)
            # 3. Reconcile outbound connectors/workers (intake already live).
            await self._reconcile_outbounds(old, new_registry)
        except Exception:
            # Roll back to the previous graph's intake so a failed reload leaves the engine
            # accepting exactly what it did before (the realistic failure is an inbound bind).
            # NOTE(review): the lookup executor built from the new graph is not rolled back here
            # (the old one is already closed) — confirm that is acceptable after a failed reload.
            log.exception("reload failed; rolling back inbound intake to the previous graph")
            self.registry = old
            for name in list(self._sources):
                await self._stop_inbound_unsafe(name)
            for name in old_inbound_names:
                try:
                    await self._start_inbound_unsafe(name)
                except Exception:
                    # Best effort: keep restoring the remaining inbounds.
                    log.exception("rollback: could not restart inbound %r", name)
            raise

        # Wake every stage (new connections / freshly enqueued rows may sit at any stage) — the
        # response stage included, matching the set of events the teardown wakes.
        self._ingress_work.set()
        self._routed_work.set()
        self._response_work.set()
        self._work.set()
        log.info(
            "wiring reloaded: %d inbound, %d outbound connection(s)",
            len(new_registry.inbound),
            len(new_registry.outbound),
        )
702
+
703
+ # --- inbound path --------------------------------------------------------
704
+
705
+ def _make_handler(self, ic: InboundConnection): # type: ignore[no-untyped-def]
706
+ # The listener only decodes/parses/validates and commits the raw message to the ingress stage
707
+ # before ACKing (ACK-on-receipt) — it no longer routes, so it needs no registry snapshot.
708
+ # Routing happens later in the router worker against the LIVE registry, so a message ingested
709
+ # before a reload is routed under the new graph (the staged model decouples intake from
710
+ # routing). The inbound name is fixed for this source.
711
+ async def on_message(raw: bytes) -> str | None:
712
+ return await self._handle_inbound(ic, raw)
713
+
714
+ return on_message
715
+
716
async def _handle_inbound(self, ic: InboundConnection, raw: bytes) -> str | None:
    """Receive-time handling of one raw message on inbound ``ic``.

    Decodes the bytes with the connection's charset, then either commits a non-HL7 body verbatim
    to the ingress stage, or peeks (and, for a strict connection, validates) an HL7 message before
    committing it. Decode, parse and strict-validation failures are recorded as ERROR and NAKed.

    Returns:
        The ACK/NAK built by ``build_ack``, or ``None`` when the connection's ack mode is NONE or
        the inbound is not HL7 v2.
    """
    ack_mode = ic.ack_mode
    wants_reply = ack_mode is not AckMode.NONE
    source_type = ic.spec.type.value
    is_hl7 = ic.content_type is ContentType.HL7V2

    # Decode with the connection's configured charset. A genuine decode failure means the bytes
    # aren't valid in the declared encoding — record ERROR (preserving the exact bytes via a
    # lossless latin-1 view) and NAK, rather than silently substituting U+FFFD into the stored
    # raw and the delivered copy (review H-3). HL7 also normalizes line endings to \r; a non-HL7
    # body (JSON/XML/text) is decoded verbatim — \r-normalizing it would corrupt it (ADR 0004).
    encoding = ic.spec.settings.get("encoding", "utf-8")
    try:
        if is_hl7:
            text = normalize(raw, encoding=encoding, errors="strict")
        else:
            text = raw.decode(encoding)
    except UnicodeDecodeError as exc:
        failed_type = None if is_hl7 else ic.content_type.value
        await self.store.record_received(
            channel_id=ic.name,
            raw=raw.decode("latin-1"),  # lossless byte view — the declared encoding rejected it
            status=MessageStatus.ERROR,
            error=f"decode error ({encoding}): {safe_exc(exc)}",
            source_type=source_type,
            message_type=failed_type,
        )
        if is_hl7 and wants_reply:
            return build_ack(raw, code="AR", text="decode error", ack_mode=ack_mode)
        return None

    if not is_hl7:
        # Payload-agnostic ingress (ADR 0004): a non-HL7 inbound skips HL7 peek/validate and the
        # HL7 ACK. The decoded body is committed verbatim and the router/transform workers route it
        # as a RawMessage; the source connector owns its own receive-time response (no MLLP ACK).
        await self.store.enqueue_ingress(
            channel_id=ic.name,
            raw=text,
            control_id=None,
            message_type=ic.content_type.value,
            source_type=source_type,
            summary=None,
        )
        self._ingress_work.set()
        return None

    try:
        peek = Peek.parse(text)
    except HL7PeekError as exc:
        await self.store.record_received(
            channel_id=ic.name,
            raw=text,
            status=MessageStatus.ERROR,
            error=f"parse error: {safe_exc(exc)}",
            source_type=source_type,
        )
        if not wants_reply:
            return None
        return build_ack(text, code="AR", text=str(exc), ack_mode=ack_mode)

    if ic.validation.strict:
        # hl7apy validation is CPU-bound (full structure/cardinality parse) — run it off the event
        # loop so a strict feed can't stall every other listener, worker, and API call (review M-11).
        verdict = await asyncio.to_thread(
            validate, text, expected_version=ic.validation.hl7_version
        )
        if not verdict.ok:
            joined = "; ".join(verdict.errors)
            # Persist a PHI-scrubbed form: hl7apy error strings quote the offending field VALUE
            # (PHI), so this is a persisted-disposition write that must go through the scrub like
            # every other one — it keeps the field NAME / segment ID (the diagnostic an operator
            # needs) but cuts the value (review #120). The scrubbed text is gated behind
            # messages:view_summary on read, like every other stored error.
            persisted = f"strict-validation failed: {safe_text(joined)}"
            await self._record(ic, peek, text, MessageStatus.ERROR, error=persisted)
            # The AE ACK goes back to the partner that SENT this message (their own data) and is
            # transient (never persisted), so it may carry the fuller, bounded validation text.
            if not wants_reply:
                return None
            return build_ack(peek, code="AE", text=joined[:200], ack_mode=ack_mode)

    # ACK-on-receipt (staged pipeline, ADR 0001 Step A): persist the raw message durably to the
    # ingress stage, then ACK. Routing/transform/delivery run AFTER the ACK in the ingress worker,
    # so a slow/hung router or outbound never stalls intake — and a router/handler failure no
    # longer NAKs the sender (it becomes a logged ERROR/dead-letter at the ingress stage). Decode,
    # parse, and strict validation above stay synchronous and still NAK, preserving the partner
    # contract for a malformed message. ack_after='delivered' (defer the ACK) is rejected at
    # wiring in Step A, so this is always ACK-on-ingest.
    await self.store.enqueue_ingress(
        channel_id=ic.name,
        raw=text,
        control_id=peek.control_id,
        message_type=peek.message_type,
        source_type=source_type,
        summary=summarize(peek) or None,
    )
    self._ingress_work.set()  # wake the router worker to route the freshly-committed message
    if not wants_reply:
        return None
    return build_ack(peek, code="AA", ack_mode=ack_mode)
816
+
817
async def _record(
    self,
    ic: InboundConnection,
    peek: Peek,
    raw: str,  # already the decoded, \r-normalized text (see _handle_inbound)
    status: MessageStatus,
    *,
    error: str | None = None,
) -> None:
    """Persist a receive-time disposition for a peeked message through ``store.record_received``.

    The channel id and source type come from ``ic``; the control id, message type and summary come
    from ``peek`` (an empty summary is stored as ``None``).
    """
    digest = summarize(peek) or None
    fields = {
        "channel_id": ic.name,
        "raw": raw,
        "status": status,
        "error": error,
        "control_id": peek.control_id,
        "message_type": peek.message_type,
        "source_type": ic.spec.type.value,
        "summary": digest,
    }
    await self.store.record_received(**fields)
836
+
837
+ # --- delivery path -------------------------------------------------------
838
+
839
+ async def _delivery_worker(self, name: str) -> None:
+ """Drain the outbox for outbound ``name`` until the stop flag is set.
+
+ Each pass claims work (the FIFO head, or a batch for an UNORDERED lane), sends every claimed
+ row through the connector currently registered for ``name`` (or skips the send in simulate
+ mode) and records the outcome in the store: done / done-with-response, rescheduled via
+ ``mark_failed``, or dead-lettered. An internal error under the STOP policy reschedules the row,
+ raises a ``connection_stopped`` alert and returns. Any other exception escaping a pass is
+ logged and followed by a backoff; cancellation is re-raised.
+ """
840
+ while not self._stop.is_set():
841
+ try:
842
+ # FIFO (default): claim only the due head — a backing-off head blocks the lane
843
+ # (head-of-line), so order is preserved. UNORDERED: claim a batch and rotate past a
844
+ # backing-off row to drain others. Resolved live so a reload can retune it.
845
+ if self._ordering.get(name, self._ordering_default) is OrderingMode.FIFO:
846
+ # lane_owner() gates the claim to a single owner per lane (Track B Step 5) so strict
847
+ # FIFO holds ACROSS nodes; it's None single-node (byte-identical no-owner claim).
848
+ head = await self.store.claim_next_fifo(
849
+ name, owner=self._coordinator.lane_owner()
850
+ )
851
+ items = [head] if head is not None else []
852
+ else:
853
+ # UNORDERED lanes are intentionally NOT lane-owned — concurrent draining across
854
+ # nodes is fine, so claim_ready stays unchanged.
855
+ items = await self.store.claim_ready(
856
+ limit=self.claim_limit, destination_name=name
857
+ )
858
+ if not items:
859
+ await self._wait_for_work(self._work)
860
+ continue
861
+ for item in items:
862
+ # Connector + retry re-resolved per item so a reload can swap an outbound's
863
+ # settings under us with at most one racing send (which fails + retries —
864
+ # outbounds are idempotent).
865
+ retry = self._retry.get(name) or RetryPolicy()
866
+ connector = self._destinations.get(name)
867
+ if connector is None:
868
+ # No connector for a claimed row (extremely unlikely mid-reconcile).
869
+ # Reschedule it rather than strand the claimed row, then move on.
870
+ await self.store.mark_failed(item.id, "outbound reloading", retry)
871
+ continue
872
+ try:
873
+ if self._simulate.get(name, False):
874
+ # Shadow / parallel-run (#15): suppress the real egress entirely — no bytes/
875
+ # SQL leave the box. With egress suppressed there is no real partner reply to
876
+ # capture or re-ingress, so treat it as a completed ONE-WAY delivery: response
877
+ # = None → mark_done → the message finalizes PROCESSED, and the would-send
878
+ # outbound payload is retained on the done row for parity comparison. (A
879
+ # capturing/reingress_to outbound therefore captures nothing in simulate.)
880
+ response = None
881
+ else:
882
+ response = await connector.send(item.payload)
883
+ except NegativeAckError as exc:
884
+ # Partner rejection. AR/CR (permanent) → fail-fast: the partner will never
885
+ # accept this message, so dead-letter it now rather than block the FIFO lane
886
+ # forever (still replayable from the DLQ). AE/CE (transient) → retry per
887
+ # policy, like a transport failure.
888
+ if exc.permanent:
889
+ await self.store.dead_letter_now(item.id, safe_exc(exc))
890
+ else:
891
+ await self.store.mark_failed(item.id, safe_exc(exc), retry)
892
+ await self._maybe_alert_buildup(name)
893
+ except DeliveryError as exc:
894
+ # Transport failure (connect/IO/timeout/unparseable ACK) — transient; retry
895
+ # per policy (retry-forever by default, so nothing is silently lost).
896
+ await self.store.mark_failed(item.id, safe_exc(exc), retry)
897
+ await self._maybe_alert_buildup(name)
898
+ except Exception as exc:
899
+ # Internal/code error (our bug, not the partner). The per-connection policy
900
+ # decides: STOP halts the lane (preserve the message, alert an operator) while
901
+ # CONTINUE (default) dead-letters this row and advances so a code bug can't
902
+ # wedge the lane forever. Log the exception TYPE only — the full detail goes to
903
+ # the secured store's last_error, never the general log (PHI).
904
+ if (
905
+ self._internal_error.get(name, self._internal_error_default)
906
+ is InternalErrorPolicy.STOP
907
+ ):
908
+ log.error(
909
+ "delivery worker %r: internal error delivering %s (%s); STOPPING "
910
+ "connection (operator must fix + reload/restart to resume)",
911
+ name,
912
+ item.id,
913
+ type(exc).__name__,
914
+ )
915
+ # Preserve the message for replay (reschedule, don't dead-letter) and halt
916
+ # this worker. A normal return is not respawned (_on_worker_done); a later
917
+ # reload re-spawns the worker, re-arming the lane.
918
+ await self.store.mark_failed(
919
+ item.id,
920
+ f"internal error (connection stopped): {safe_exc(exc)}",
921
+ retry,
922
+ )
923
+ self._alert_sink.connection_stopped(
924
+ name, detail=f"{type(exc).__name__} delivering {item.id}"
925
+ )
+ # NOTE(review): on an UNORDERED lane `items` may be a claimed batch; returning here
+ # leaves the rows after this one claimed but unhandled — confirm the store reclaims them.
926
+ return
927
+ log.warning(
928
+ "delivery worker %r: internal error delivering %s (%s); dead-lettering",
929
+ name,
930
+ item.id,
931
+ type(exc).__name__,
932
+ )
933
+ await self.store.dead_letter_now(
934
+ item.id, f"internal error: {safe_exc(exc)}"
935
+ )
936
+ else:
937
+ # ADR 0013: a capturing outbound returns a DeliveryResponse; persist the reply
938
+ # AND mark the row done in ONE transaction (exactly-once capture). A non-capturing
939
+ # outbound returns None → plain mark_done, byte-identical. The XOR (never both)
940
+ # is the single-writer discipline that yields exactly one captured reply per row.
941
+ if response is not None:
942
+ # ADR 0013 Increment 2: if this outbound declares reingress_to, the same
943
+ # capture transaction also produces a Stage.RESPONSE work-row; wake the
944
+ # re-ingress worker. Read live from the registry (a reload swaps it).
945
+ oc = self.registry.outbound.get(name)
946
+ reingress_to = (
947
+ oc.spec.settings.get("reingress_to") if oc is not None else None
948
+ )
949
+ await self.store.complete_with_response(
950
+ item.id,
951
+ body=response.body,
952
+ outcome=response.outcome,
953
+ detail=response.detail,
954
+ reingress_to=reingress_to,
955
+ )
956
+ if reingress_to is not None:
957
+ self._response_work.set() # wake the re-ingress worker for the new token
958
+ else:
959
+ await self.store.mark_done(item.id)
960
+ except asyncio.CancelledError:
961
+ raise
962
+ except Exception:
963
+ # A store error in the loop itself (claim_ready / mark_* failing — DB locked, disk
964
+ # full) must never kill the worker: that would silently stop THIS destination from
965
+ # draining while inbound keeps ACKing (review H-1). Log, back off, and keep going.
966
+ log.exception(
967
+ "delivery worker %r: unexpected error; backing off and retrying", name
968
+ )
969
+ if await self._stop_or_sleep(_WORKER_ERROR_BACKOFF_SECONDS):
970
+ return
971
+
972
async def _router_worker(self, name: str) -> None:
    """Drain the **ingress** stage for inbound ``name`` — the router half of the split pipeline
    (ADR 0001 Step B).

    Strict FIFO per inbound (preserving arrival order into routing): claim the oldest ingress row,
    run its Router (``route_only``), and hand the selected handlers to the **routed** stage
    (``route_handoff``) — one routed row per handler. No transform runs here. A Router failure does
    not NAK the sender (already ACKed at ingress): under the global ``internal_error`` policy it
    dead-letters the ingress row (``CONTINUE`` → advance) or halts this lane preserving the row
    (``STOP`` → ``connection_stopped`` alert, return). Shares the delivery worker's wait/backoff
    supervision.
    """
    buildup_checked_at = 0.0
    while not self._stop.is_set():
        try:
            # FIFO per inbound: claim only the due head (ingress rows never back off, so this is
            # effectively the oldest pending row for this inbound). lane_owner() gates the claim to
            # a single owner per lane (Track B Step 5) so strict FIFO holds across nodes; None
            # single-node (byte-identical).
            row = await self.store.claim_next_fifo(
                name, stage=Stage.INGRESS.value, owner=self._coordinator.lane_owner()
            )
            if row is None:
                await self._wait_for_work(self._ingress_work)
                continue

            inbound = self.registry.inbound.get(name)
            if inbound is None:
                # The inbound was removed from the registry but residual ingress rows remain.
                # Revert this just-claimed row to pending and EXIT the worker — there is nothing
                # to route it with until a reload restores the inbound (which re-arms this worker
                # and drains the backlog). Reschedule with a retry-FOREVER policy (NOT the outbound
                # delivery defaults, whose finite max_attempts would dead-letter an ACKed-but-
                # never-attempted message purely for being removed) so the message is never dropped.
                await self.store.mark_failed(row.id, "inbound not in registry", RetryPolicy())
                return

            try:
                # Publish the live graph's run-scoped views (code sets / reference snapshots /
                # active environment) so a call-time code_set(...)/reference(...)/
                # current_environment() inside the Router resolves (the loader only had them
                # active during import). Views are read from self.registry/self.store live, so a
                # reload's swapped tables apply to the next routed row; run_contexts restores
                # cleanly after each run (no leak). The set of providers is the run_context
                # registry (router phase) — features add one provider there, never edit this call
                # site.
                context = RunContext(
                    code_sets=self.registry.code_sets,
                    reference_view=self.store.reference_view(),
                    active_environment=self._active_environment,
                    ingest_time=row.created_at,
                )
                with run_contexts(context, phase="router"):
                    handler_names = route_only(self.registry, inbound, row.payload)
            except Exception as exc:
                # Router code error (incl. an unknown handler name). Post-ACK, so no NAK — the
                # global internal_error policy decides. Log the exception TYPE only; full detail
                # goes to the secured store's last_error, never the general log (PHI).
                error_type = type(exc).__name__
                if self._internal_error_default is InternalErrorPolicy.STOP:
                    log.error(
                        "router worker %r: router error on %s (%s); STOPPING ingest processing "
                        "(operator must fix + reload to resume)",
                        name,
                        row.id,
                        error_type,
                    )
                    await self.store.mark_failed(
                        row.id,
                        f"router error (ingest stopped): {safe_exc(exc)}",
                        self._delivery_defaults,
                    )
                    self._alert_sink.connection_stopped(
                        name, detail=f"router {error_type} on {row.id}"
                    )
                    return
                log.warning(
                    "router worker %r: router error on %s (%s); dead-lettering",
                    name,
                    row.id,
                    error_type,
                )
                await self.store.dead_letter_now(row.id, f"router error: {safe_exc(exc)}")
                continue

            if handler_names:
                disposition = MessageStatus.ROUTED
            else:
                disposition = MessageStatus.UNROUTED
            await self.store.route_handoff(
                ingress_id=row.id,
                message_id=row.message_id,
                channel_id=name,
                handlers=[(handler, row.payload) for handler in handler_names],
                disposition=disposition,
            )
            if handler_names:
                self._routed_work.set()  # wake the transform worker for the new routed rows

            # Off the hot path (rate-limited): alert if this inbound's ingress backlog is building
            # (a slow/hung router). Uses the global buildup threshold (no per-inbound override yet).
            now = time.time()
            if now - buildup_checked_at >= _BUILDUP_CHECK_INTERVAL:
                buildup_checked_at = now
                await self._maybe_alert_buildup(
                    name, stage=Stage.INGRESS.value, threshold=self._buildup_default
                )
        except asyncio.CancelledError:
            raise
        except Exception:
            # A store error in the loop itself (claim/handoff failing — DB locked, disk full) must
            # never kill the worker: that would stall routing while the listener keeps ACKing. Log,
            # back off, and keep going (mirrors the delivery worker).
            log.exception("router worker %r: unexpected error; backing off and retrying", name)
            if await self._stop_or_sleep(_WORKER_ERROR_BACKOFF_SECONDS):
                return
1081
+
1082
async def _response_worker(self, name: str) -> None:
    """Drain the **response** stage for the LOOPBACK inbound ``name`` (ADR 0013 Increment 2).

    Each pass claims the oldest ``Stage.RESPONSE`` token for the lane (strict FIFO, gated by the
    coordinator's lane owner), peeks the captured reply body via ``_peek_for_loopback``, and hands
    the token to ``store.ingress_handoff`` together with the peeked control id / message type /
    summary / peek-failed flag and the correlation depth cap. When the handoff reports that it
    produced something, the ingress wake event is set so the router worker picks it up.

    Exits when the stop event is set, when the inbound is no longer in the registry (the claim is
    reverted first with a default ``RetryPolicy``), or when a stop is requested during an error
    back-off. Any other exception is logged and retried after the back-off; cancellation is
    re-raised untouched."""
    while True:
        if self._stop.is_set():
            return
        try:
            token = await self.store.claim_next_fifo(
                name,
                stage=Stage.RESPONSE.value,
                owner=self._coordinator.lane_owner(),
            )
            if token is None:
                # Nothing claimable right now: park on this worker class's wake event.
                await self._wait_for_work(self._response_work)
                continue

            loopback = self.registry.inbound.get(name)
            if loopback is None:
                # The loopback vanished (reload) while tokens remain: give the claim back rather
                # than dropping it, then stop this worker — mirrors the router worker's exit.
                await self.store.mark_failed(token.id, "inbound not in registry", RetryPolicy())
                return

            # The peek happens here (pipeline side); the store then performs the handoff in a
            # single call using the peeked values.
            reply_body = await self.store.response_body_for_work_row(token.id)
            peeked = _peek_for_loopback(loopback, reply_body or "")
            control_id, message_type, summary, peek_failed = peeked
            handed_off = await self.store.ingress_handoff(
                response_row_id=token.id,
                loopback_channel_id=name,
                correlation_depth_cap=self._max_correlation_depth,
                control_id=control_id,
                message_type=message_type,
                summary=summary,
                peek_failed=peek_failed,
            )
            if handed_off:
                # Wake the loopback's router worker for the freshly-ingressed message.
                self._ingress_work.set()
        except asyncio.CancelledError:
            raise
        except Exception:
            # A failure in the loop itself (claim / handoff) must not kill the worker: log,
            # back off, and go around again unless a stop arrives during the back-off.
            log.exception(
                "response worker %r: unexpected error; backing off and retrying", name
            )
            if await self._stop_or_sleep(_WORKER_ERROR_BACKOFF_SECONDS):
                return
1134
+
1135
async def _transform_worker(self, name: str) -> None:
    """Drain the **routed** stage for inbound ``name`` — the transform half of the split pipeline
    (ADR 0001 Step B).

    Per pass: claim the oldest routed row (strict FIFO, gated by the coordinator's lane owner), run
    that row's single handler through ``transform_one`` inside the transform-phase run context, and
    pass the resulting deliveries and state writes to ``store.transform_handoff``. The delivery
    wake event is set when at least one delivery was produced, and a rate-limited buildup check is
    made for the routed lane.

    Exits: the stop event is set; the inbound is missing from the registry (row reverted with a
    default ``RetryPolicy``); a handler error occurs under ``InternalErrorPolicy.STOP`` (row marked
    failed, ``connection_stopped`` alert raised); or a stop arrives during an error back-off. A
    missing handler, or a handler error under any other policy, dead-letters the row and moves on.
    """
    buildup_checked_at = 0.0
    while not self._stop.is_set():
        try:
            # The lane owner restricts the claim to a single owner per lane so FIFO order holds.
            row = await self.store.claim_next_fifo(
                name, stage=Stage.ROUTED.value, owner=self._coordinator.lane_owner()
            )
            if row is None:
                await self._wait_for_work(self._routed_work)
                continue

            inbound = self.registry.inbound.get(name)
            if inbound is None:
                # Inbound removed: revert the row so it is not lost, and stop this worker.
                await self.store.mark_failed(row.id, "inbound not in registry", RetryPolicy())
                return

            handler_name = row.handler_name
            handler_known = handler_name is not None and handler_name in self.registry.handlers
            if not handler_known:
                # Handler removed/renamed since routing: dead-letter instead of reverting, so a
                # permanently missing handler cannot spin this loop.
                log.warning(
                    "transform worker %r: handler %r for %s is missing; dead-lettering",
                    name,
                    handler_name,
                    row.id,
                )
                await self.store.dead_letter_now(
                    row.id, f"handler {handler_name!r} removed from registry"
                )
                continue

            # Only a LOOPBACK inbound reads message metadata: when the message carries a
            # correlation id, collect the correlated captured replies keyed by destination name
            # (a later entry for the same destination replaces an earlier one).
            captured_replies: dict[str, Any] | None = None
            if inbound.spec.type is ConnectorType.LOOPBACK:
                stored_message = await self.store.get_message(row.message_id)
                raw_metadata = stored_message.get("metadata") if stored_message else None
                metadata = json.loads(raw_metadata) if raw_metadata else {}
                correlation_id = (
                    metadata.get("correlation_id") if isinstance(metadata, dict) else None
                )
                if correlation_id:
                    captured_replies = {}
                    for reply in await self.store.correlate_response(correlation_id):
                        captured_replies[reply.destination_name] = reply

            try:
                # Publish the run-scoped views for the duration of the handler call.
                context = RunContext(
                    code_sets=self.registry.code_sets,
                    reference_view=self.store.reference_view(),
                    state_view=self.store.state_view(),
                    response_view=captured_replies,
                    active_environment=self._active_environment,
                    ingest_time=row.created_at,
                )
                with run_contexts(context, phase="transform"):
                    if self._lookup_executor is None:
                        # No lookup executor configured: run the handler inline on the loop.
                        outcome = transform_one(
                            self.registry,
                            handler_name,
                            row.payload,
                            self.registry.inbound[name].content_type.value,
                        )
                    else:
                        # A handler may perform a synchronous db_lookup(); run it in a worker
                        # thread (asyncio.to_thread propagates the current context) so the
                        # event loop is not blocked.
                        with db_lookup_activated(self._run_lookup):
                            outcome = await asyncio.to_thread(
                                transform_one,
                                self.registry,
                                handler_name,
                                row.payload,
                                self.registry.inbound[name].content_type.value,
                            )
                    deliveries_preview, state_preview = outcome
            except Exception as exc:
                # Handler/transform failure: the internal-error policy decides. Only the
                # exception TYPE goes to the log (PHI).
                error_type = type(exc).__name__
                if self._internal_error_default is not InternalErrorPolicy.STOP:
                    log.warning(
                        "transform worker %r: handler error on %s (%s); dead-lettering",
                        name,
                        row.id,
                        error_type,
                    )
                    await self.store.dead_letter_now(row.id, f"handler error: {safe_exc(exc)}")
                    continue
                log.error(
                    "transform worker %r: handler error on %s (%s); STOPPING transform "
                    "processing (operator must fix + reload to resume)",
                    name,
                    row.id,
                    error_type,
                )
                await self.store.mark_failed(
                    row.id,
                    f"handler error (transform stopped): {safe_exc(exc)}",
                    self._delivery_defaults,
                )
                self._alert_sink.connection_stopped(
                    name, detail=f"handler {error_type} on {row.id}"
                )
                return

            outbound_rows = [(delivery.to, delivery.payload) for delivery in deliveries_preview]
            state_writes = [(op.namespace, op.key, op.value) for op in state_preview]
            await self.store.transform_handoff(
                routed_id=row.id,
                message_id=row.message_id,
                channel_id=name,
                deliveries=outbound_rows,
                state_ops=state_writes,
            )
            if outbound_rows:
                self._work.set()  # wake the outbound delivery workers

            # Rate-limited: check whether the routed (transform) backlog is building up.
            current = time.time()
            if current - buildup_checked_at >= _BUILDUP_CHECK_INTERVAL:
                buildup_checked_at = current
                await self._maybe_alert_buildup(
                    name, stage=Stage.ROUTED.value, threshold=self._buildup_default
                )
        except asyncio.CancelledError:
            raise
        except Exception:
            # An error in the loop itself must never kill the worker: log, back off, retry.
            log.exception(
                "transform worker %r: unexpected error; backing off and retrying", name
            )
            if await self._stop_or_sleep(_WORKER_ERROR_BACKOFF_SECONDS):
                return
1299
+
1300
async def _maybe_alert_buildup(
    self,
    name: str,
    *,
    stage: str = Stage.OUTBOUND.value,
    threshold: BuildupThreshold | None = None,
) -> None:
    """Raise a ``queue_buildup`` alert when lane ``name`` at ``stage`` has crossed its threshold.

    The effective threshold is the ``threshold`` argument, else the per-connection entry in
    ``self._buildup``, else ``self._buildup_default``. A threshold with neither ``max_depth`` nor
    ``max_oldest_seconds`` set disables the check. The lane is measured with one
    ``store.pending_depth`` call; an alert fires when the depth reaches ``max_depth`` or the oldest
    row's age reaches ``max_oldest_seconds``. After an alert, further alerts for the same
    ``(stage, name)`` are suppressed for ``_BUILDUP_REALERT_SECONDS``. An exception raised by the
    alert sink is logged rather than propagated."""
    limits = threshold or self._buildup.get(name) or self._buildup_default
    depth_limit = limits.max_depth
    age_limit = limits.max_oldest_seconds
    if depth_limit is None and age_limit is None:
        return  # buildup alerting disabled for this lane

    throttle_key = f"{stage}:{name}"
    now = time.time()
    if now < self._next_buildup_alert.get(throttle_key, 0.0):
        return  # re-alert throttled

    depth, oldest_created = await self.store.pending_depth(name, stage=stage)
    if depth == 0:
        return

    oldest_age = None if oldest_created is None else now - oldest_created
    too_deep = depth_limit is not None and depth >= depth_limit
    too_old = age_limit is not None and oldest_age is not None and oldest_age >= age_limit
    if not (too_deep or too_old):
        return

    # Arm the throttle before calling the sink so a raising sink cannot cause an alert storm.
    self._next_buildup_alert[throttle_key] = now + _BUILDUP_REALERT_SECONDS
    try:
        self._alert_sink.queue_buildup(name, depth=depth, oldest_age_seconds=oldest_age or 0.0)
    except Exception:
        log.exception("alert sink raised on queue_buildup for %r", name)
1338
+
1339
+ async def _wait_for_work(self, event: asyncio.Event) -> None:
1340
+ """Wait up to ``poll_interval`` for ``event`` (this worker class's wake event), then clear it.
1341
+ Per-class events mean a worker only clears its own signal, so one class can't swallow another's
1342
+ wakeup; ``poll_interval`` still backstops any missed set()."""
1343
+ try:
1344
+ await asyncio.wait_for(event.wait(), self.poll_interval)
1345
+ except asyncio.TimeoutError:
1346
+ pass
1347
+ finally:
1348
+ event.clear()
1349
+
1350
+ async def _stop_or_sleep(self, delay: float) -> bool:
1351
+ """Sleep up to ``delay`` seconds; return True if a stop was requested meanwhile (so a
1352
+ backing-off worker exits promptly on shutdown instead of sleeping out the full delay)."""
1353
+ try:
1354
+ await asyncio.wait_for(self._stop.wait(), delay)
1355
+ return True
1356
+ except asyncio.TimeoutError:
1357
+ return False
1358
+
1359
+
1360
def _source_config(ic: InboundConnection, bind_host: str, env_values: Mapping[str, Any]) -> Source:
    """Build the connector-level ``Source`` for inbound connection ``ic``.

    ``env()`` references in the spec settings are resolved first. For the MLLP / TCP / X12 types
    the ``host`` setting is then set to the connection's ``bind_address`` when present, otherwise
    to ``bind_host``, and a non-empty ``source_ip_allowlist`` is copied into the settings as a
    list. Other connector types receive the resolved settings unchanged."""
    resolved = resolve_env_settings(ic.spec.settings, env_values)
    listener_types = (ConnectorType.MLLP, ConnectorType.TCP, ConnectorType.X12)
    if ic.spec.type in listener_types:
        # Per-connection bind address wins over the service-level bind host.
        resolved["host"] = ic.bind_address or bind_host
        peer_allowlist = ic.source_ip_allowlist
        if peer_allowlist:
            resolved["source_ip_allowlist"] = list(peer_allowlist)
    return Source(type=ic.spec.type, settings=resolved, ack_mode=ic.ack_mode)
1373
+
1374
+
1375
def _dest_config(oc: OutboundConnection, env_values: Mapping[str, Any]) -> Destination:
    """Build the connector-level ``Destination`` for outbound connection ``oc``.

    ``env()`` references are resolved first, so the signing configuration is derived (via
    ``OutboundSigning.from_settings``) from the resolved values. A connection without its own
    retry policy gets a default ``RetryPolicy()``."""
    resolved = resolve_env_settings(oc.spec.settings, env_values)
    retry_policy = oc.retry or RetryPolicy()
    signing = OutboundSigning.from_settings(resolved)
    return Destination(
        name=oc.name,
        type=oc.spec.type,
        settings=resolved,
        retry=retry_policy,
        sign=signing,
    )
1388
+
1389
+
1390
def build_check_registry(
    registry: Registry,
    *,
    inbound_bind_host: str,
    env_values: Mapping[str, Any],
    egress: EgressSettings,
) -> None:
    """Validate every connector in ``registry`` by constructing (and discarding) it.

    For each inbound: build its source config, run the source connect-allowlist, construct the
    source. For each outbound: build its destination config, run the egress allowlist, construct
    the destination, and verify that a ``reingress_to`` setting names an existing LOOPBACK
    inbound. A LOOPBACK inbound that no outbound re-ingresses into only produces a warning. Each
    declared lookup has its settings resolved and allowlist-checked, and a
    ``DatabaseLookupExecutor`` is constructed from them when there is at least one.

    Raises:
        WiringError: on any validation failure; any other exception is wrapped in a
            ``WiringError`` ("connector build failed: ...") with the original as its cause.
    """
    try:
        for inbound in registry.inbound.values():
            source_cfg = _source_config(inbound, inbound_bind_host, env_values)
            check_source_allowed(source_cfg, inbound.name, egress)
            build_source(source_cfg)

        fed_loopbacks: set[str] = set()
        for oc in registry.outbound.values():
            dest_cfg = _dest_config(oc, env_values)
            check_egress_allowed(dest_cfg, egress)  # fail-closed egress allowlist (WP-11c)
            build_destination(dest_cfg)

            # ADR 0013 Increment 2: reingress_to is a cross-registry fact, so it is checked here
            # where both the outbound and the inbound side are visible.
            target = oc.spec.settings.get("reingress_to")
            if target is None:
                continue
            target_name = str(target)
            target_inbound = registry.inbound.get(target_name)
            if target_inbound is None or target_inbound.spec.type is not ConnectorType.LOOPBACK:
                raise WiringError(
                    f"outbound connection {oc.name!r}: reingress_to names unknown/non-loopback "
                    f"inbound {target!r} — declare it as inbound(..., Loopback(), ...) (ADR 0013)."
                )
            fed_loopbacks.add(target_name)

        # A loopback nobody feeds is legal but inert: surface it without failing the check.
        for inbound_name, inbound in registry.inbound.items():
            if inbound.spec.type is not ConnectorType.LOOPBACK:
                continue
            if inbound_name in fed_loopbacks:
                continue
            log.warning(
                "loopback inbound %r has no reingress_to source; it will never receive a message",
                inbound_name,
            )

        lookup_settings: dict[str, dict[str, Any]] = {}
        for lookup_name, lookup_spec in registry.lookups.items():
            resolved = resolve_env_settings(lookup_spec.settings, env_values)
            check_lookup_allowed(lookup_name, resolved, egress)  # fail-closed connect allowlist
            lookup_settings[lookup_name] = resolved
        if lookup_settings:
            # Constructed only for its validation side effect; the instance is discarded.
            DatabaseLookupExecutor(lookup_settings)
    except WiringError:
        raise
    except Exception as exc:
        raise WiringError(f"connector build failed: {exc}") from exc
1444
+
1445
+
1446
def _allowlist_for(conn_type: ConnectorType, egress: EgressSettings) -> list[str]:
    """Return the ``[egress]`` allowlist that governs ``conn_type``.

    MLLP has its own list; TCP and X12 share ``allowed_tcp``; REST and SOAP share ``allowed_http``;
    FILE, DATABASE and REMOTEFILE each map to their own list. Any other type yields ``[]``, which
    callers enforcing ``deny_by_default`` treat as "nothing permits it"."""
    if conn_type is ConnectorType.MLLP:
        allowlist = egress.allowed_mllp
    elif conn_type in (ConnectorType.TCP, ConnectorType.X12):
        allowlist = egress.allowed_tcp
    elif conn_type is ConnectorType.FILE:
        allowlist = egress.allowed_file_dirs
    elif conn_type in (ConnectorType.REST, ConnectorType.SOAP):
        allowlist = egress.allowed_http
    elif conn_type is ConnectorType.DATABASE:
        allowlist = egress.allowed_db
    elif conn_type is ConnectorType.REMOTEFILE:
        allowlist = egress.allowed_remote
    else:
        allowlist = []  # no egress list exists for this connector type
    return allowlist
1463
+
1464
+
1465
def check_source_allowed(source: Source, name: str, egress: EgressSettings) -> None:
    """Connect-allowlist gate for an inbound connector that dials out to a server.

    Two source types are gated: DATABASE (``server`` / ``port`` settings, port defaulting to 1433,
    against ``[egress].allowed_db``) and REMOTEFILE (``host`` / ``port`` settings against
    ``[egress].allowed_remote``). Every other source type passes untouched.

    For a gated type, an empty allowlist means "unrestricted" unless ``[egress].deny_by_default``
    is set, in which case the source is refused outright. With a non-empty allowlist the target
    must match an entry (``host`` or ``host:port``).

    Raises:
        WiringError: when the source is refused.
    """
    kind = source.type

    if kind is ConnectorType.DATABASE:
        permitted = egress.allowed_db
        if not permitted:
            if egress.deny_by_default:
                raise WiringError(
                    f"inbound {name!r}: [egress].deny_by_default is set and [egress].allowed_db is empty "
                    "— list the DATABASE server to permit it"
                )
            return
        server = str(source.settings.get("server", ""))
        port = source.settings.get("port", 1433)
        if _mllp_egress_allowed(server, port, permitted):  # same host[:port] matching
            return
        log.warning(
            "connect denied: inbound %r DATABASE server %r not in [egress].allowed_db",
            name,
            server,
        )
        raise WiringError(
            f"inbound {name!r}: DATABASE server {server!r} is not in the "
            "[egress].allowed_db allowlist"
        )

    if kind is ConnectorType.REMOTEFILE:
        permitted = egress.allowed_remote
        if not permitted:
            if egress.deny_by_default:
                raise WiringError(
                    f"inbound {name!r}: [egress].deny_by_default is set and [egress].allowed_remote is "
                    "empty — list the REMOTEFILE host to permit it"
                )
            return
        remote_host = str(source.settings.get("host", ""))
        port = source.settings.get("port")
        if _mllp_egress_allowed(remote_host, port, permitted):  # same host[:port] matching
            return
        log.warning(
            "connect denied: inbound %r REMOTEFILE host %r not in [egress].allowed_remote",
            name,
            remote_host,
        )
        raise WiringError(
            f"inbound {name!r}: REMOTEFILE host {remote_host!r} is not in the "
            "[egress].allowed_remote allowlist"
        )
1515
+
1516
+
1517
def check_lookup_allowed(name: str, settings: Mapping[str, Any], egress: EgressSettings) -> None:
    """Connect-allowlist gate for the ``DatabaseLookup`` called ``name``.

    ``settings`` are the already ``env()``-resolved connection settings; the ``server`` and
    ``port`` (default 1433) entries are matched against ``[egress].allowed_db``. An empty list
    means "unrestricted" unless ``[egress].deny_by_default`` is set, in which case the lookup is
    refused outright.

    Raises:
        WiringError: when the lookup is refused.
    """
    permitted = egress.allowed_db
    if not permitted:
        if egress.deny_by_default:
            raise WiringError(
                f"DatabaseLookup {name!r}: [egress].deny_by_default is set and [egress].allowed_db is "
                "empty — list the lookup server to permit it"
            )
        return

    server = str(settings.get("server", ""))
    port = settings.get("port", 1433)
    if _mllp_egress_allowed(server, port, permitted):  # same host[:port] matching
        return
    log.warning(
        "connect denied: DatabaseLookup %r server %r not in [egress].allowed_db", name, server
    )
    raise WiringError(
        f"DatabaseLookup {name!r}: server {server!r} is not in the [egress].allowed_db allowlist"
    )
1538
+
1539
+
1540
# Host strings treated as loopback binds (exact string match).
_LOOPBACK_HOSTS = frozenset(("127.0.0.1", "localhost", "::1", "::ffff:127.0.0.1"))


def check_mllp_tls_exposure(source: Source, name: str, *, allow_insecure_bind: bool) -> None:
    """Exposed-gate (ADR 0002 §0, MLLP side): refuse a non-loopback MLLP listener without TLS.

    Non-MLLP sources, binds whose ``host`` setting (default ``127.0.0.1``) is in
    ``_LOOPBACK_HOSTS``, and sources with a truthy ``tls`` setting all pass. Otherwise the bind is
    accepted with a warning when ``allow_insecure_bind`` is true and refused when it is false.

    Raises:
        WiringError: for a non-loopback, non-TLS MLLP bind without ``allow_insecure_bind``.
    """
    if source.type is not ConnectorType.MLLP:
        return

    settings = source.settings
    host = str(settings.get("host", "127.0.0.1"))
    binds_loopback = host in _LOOPBACK_HOSTS
    if binds_loopback or settings.get("tls"):
        return

    if not allow_insecure_bind:
        raise WiringError(
            f"inbound connection {name!r} binds non-loopback host {host!r} without TLS; HL7 bodies would "
            "cross the network in cleartext. Set tls=true (+ tls_cert_file/tls_key_file) on the MLLP "
            "connection, or pass `serve --allow-insecure-bind` to accept the cleartext risk on a trusted, "
            "firewalled network."
        )

    # The operator explicitly accepted the risk: allow the bind but leave a trace in the log.
    log.warning(
        "inbound %r binds non-loopback host %r without TLS (--allow-insecure-bind); HL7 bodies "
        "cross the network in cleartext — set tls=true (+ tls_cert_file/tls_key_file) on it.",
        name,
        host,
    )
1567
+
1568
+
1569
def check_egress_allowed(dest: Destination, egress: EgressSettings) -> None:
    """Fail-closed egress gate (WP-11c): refuse an outbound destination not on its ``[egress]``
    allowlist.

    Under ``[egress].deny_by_default`` a destination whose connector type has no (or an empty)
    allowlist is refused immediately. Otherwise each transport is opt-in: an empty list leaves that
    transport unrestricted, a non-empty list must contain the destination. MLLP, TCP, X12 (sharing
    the TCP list), DATABASE and REMOTEFILE match on ``host`` / ``host:port``; FILE matches on
    directory containment; REST and SOAP match on the URL's host (and port when pinned).

    Raises:
        WiringError: when the destination is refused (a warning is logged first).
    """
    kind = dest.type
    settings = dest.settings

    if egress.deny_by_default and not _allowlist_for(kind, egress):
        log.warning(
            "egress denied: outbound %r %s has no [egress] allowlist under deny_by_default",
            dest.name,
            kind.value,
        )
        raise WiringError(
            f"outbound {dest.name!r}: [egress].deny_by_default is set and no allowlist permits a "
            f"{kind.value} destination — add it to the matching [egress].allowed_* list"
        )

    # Each branch either leaves ``denial`` as None (allowed / unrestricted) or records
    # (log format, log args, error message); logging and raising happen once, at the end.
    denial = None

    if kind is ConnectorType.MLLP and egress.allowed_mllp:
        host = str(settings.get("host", "127.0.0.1"))
        port = settings.get("port")
        if not _mllp_egress_allowed(host, port, egress.allowed_mllp):
            denial = (
                "egress denied: outbound %r MLLP %s:%s not in [egress].allowed_mllp",
                (dest.name, host, port),
                f"outbound {dest.name!r}: MLLP destination {host}:{port} is not in the "
                "[egress].allowed_mllp allowlist",
            )
    elif kind is ConnectorType.TCP and egress.allowed_tcp:
        host = str(settings.get("host", "127.0.0.1"))
        port = settings.get("port")
        if not _mllp_egress_allowed(host, port, egress.allowed_tcp):  # same host[:port] matching
            denial = (
                "egress denied: outbound %r TCP %s:%s not in [egress].allowed_tcp",
                (dest.name, host, port),
                f"outbound {dest.name!r}: TCP destination {host}:{port} is not in the "
                "[egress].allowed_tcp allowlist",
            )
    elif kind is ConnectorType.X12 and egress.allowed_tcp:
        # X12 is raw TCP, so it shares the [egress].allowed_tcp allowlist.
        host = str(settings.get("host", "127.0.0.1"))
        port = settings.get("port")
        if not _mllp_egress_allowed(host, port, egress.allowed_tcp):
            denial = (
                "egress denied: outbound %r X12 %s:%s not in [egress].allowed_tcp",
                (dest.name, host, port),
                f"outbound {dest.name!r}: X12 destination {host}:{port} is not in the "
                "[egress].allowed_tcp allowlist",
            )
    elif kind is ConnectorType.FILE and egress.allowed_file_dirs:
        directory = settings.get("directory")
        # A FILE destination without a directory cannot be vetted, so it is refused.
        if directory is None or not _dir_egress_allowed(str(directory), egress.allowed_file_dirs):
            denial = (
                "egress denied: outbound %r File dir %r not under [egress].allowed_file_dirs",
                (dest.name, directory),
                f"outbound {dest.name!r}: File directory {directory!r} is not under any "
                "[egress].allowed_file_dirs entry",
            )
    elif kind in (ConnectorType.REST, ConnectorType.SOAP) and egress.allowed_http:
        url = str(settings.get("url", ""))
        if not _http_egress_allowed(url, egress.allowed_http):
            url_host = urllib.parse.urlsplit(url).hostname or ""
            denial = (
                "egress denied: outbound %r %s host %r not in [egress].allowed_http",
                (dest.name, kind.value, url_host),
                f"outbound {dest.name!r}: {kind.value} host {url_host!r} is not in the "
                "[egress].allowed_http allowlist",
            )
    elif kind is ConnectorType.DATABASE and egress.allowed_db:
        host = str(settings.get("server", ""))
        port = settings.get("port", 1433)
        if not _mllp_egress_allowed(host, port, egress.allowed_db):  # same host[:port] matching
            denial = (
                "egress denied: outbound %r DATABASE server %r not in [egress].allowed_db",
                (dest.name, host),
                f"outbound {dest.name!r}: DATABASE server {host!r} is not in the "
                "[egress].allowed_db allowlist",
            )
    elif kind is ConnectorType.REMOTEFILE and egress.allowed_remote:
        host = str(settings.get("host", ""))
        port = settings.get("port")
        if not _mllp_egress_allowed(host, port, egress.allowed_remote):  # same host[:port] matching
            denial = (
                "egress denied: outbound %r REMOTEFILE host %r not in [egress].allowed_remote",
                (dest.name, host),
                f"outbound {dest.name!r}: REMOTEFILE host {host!r} is not in the "
                "[egress].allowed_remote allowlist",
            )

    if denial is None:
        return
    log_format, log_args, message = denial
    log.warning(log_format, *log_args)
    raise WiringError(message)
1683
+
1684
+
1685
+ def _mllp_egress_allowed(host: str, port: object, allowed: list[str]) -> bool:
1686
+ host = host.lower()
1687
+ for entry in allowed:
1688
+ allow_host, _, allow_port = entry.partition(":")
1689
+ if allow_host.strip().lower() == host and (
1690
+ not allow_port or str(port) == allow_port.strip()
1691
+ ):
1692
+ return True
1693
+ return False
1694
+
1695
+
1696
+ def _dir_egress_allowed(directory: str, allowed: list[str]) -> bool:
1697
+ try:
1698
+ target = Path(directory).resolve()
1699
+ except (OSError, ValueError, RuntimeError):
1700
+ return False
1701
+ for entry in allowed:
1702
+ try:
1703
+ base = Path(entry).resolve()
1704
+ except (OSError, ValueError, RuntimeError):
1705
+ continue
1706
+ if target == base or base in target.parents:
1707
+ return True
1708
+ return False
1709
+
1710
+
1711
+ def _http_egress_allowed(url: str, allowed: list[str]) -> bool:
1712
+ """True if ``url``'s host (and port, when an allow entry pins one) is on the allowlist — the same
1713
+ ``host`` / ``host:port`` matching as MLLP."""
1714
+ parts = urllib.parse.urlsplit(url)
1715
+ host = (parts.hostname or "").lower()
1716
+ for entry in allowed:
1717
+ allow_host, _, allow_port = entry.partition(":")
1718
+ if allow_host.strip().lower() == host and (
1719
+ not allow_port or str(parts.port) == allow_port.strip()
1720
+ ):
1721
+ return True
1722
+ return False