messagefoundry 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- messagefoundry/__init__.py +108 -0
- messagefoundry/__main__.py +1155 -0
- messagefoundry/api/__init__.py +27 -0
- messagefoundry/api/app.py +1581 -0
- messagefoundry/api/approvals.py +184 -0
- messagefoundry/api/auth_models.py +211 -0
- messagefoundry/api/auth_routes.py +655 -0
- messagefoundry/api/field_authz.py +96 -0
- messagefoundry/api/models.py +374 -0
- messagefoundry/api/security.py +247 -0
- messagefoundry/api/tls.py +47 -0
- messagefoundry/auth/__init__.py +39 -0
- messagefoundry/auth/data/common_passwords.NOTICE +13 -0
- messagefoundry/auth/data/common_passwords.txt +10000 -0
- messagefoundry/auth/identity.py +71 -0
- messagefoundry/auth/ldap.py +264 -0
- messagefoundry/auth/notifications.py +68 -0
- messagefoundry/auth/passwords.py +53 -0
- messagefoundry/auth/permissions.py +120 -0
- messagefoundry/auth/policy.py +153 -0
- messagefoundry/auth/ratelimit.py +55 -0
- messagefoundry/auth/service.py +1323 -0
- messagefoundry/auth/tokens.py +26 -0
- messagefoundry/auth/totp.py +174 -0
- messagefoundry/checks.py +174 -0
- messagefoundry/config/__init__.py +30 -0
- messagefoundry/config/active_environment.py +80 -0
- messagefoundry/config/ai_policy.py +140 -0
- messagefoundry/config/code_sets.py +260 -0
- messagefoundry/config/connections_edit.py +200 -0
- messagefoundry/config/connections_file.py +287 -0
- messagefoundry/config/db_lookup.py +117 -0
- messagefoundry/config/environments.py +116 -0
- messagefoundry/config/ingest_time.py +83 -0
- messagefoundry/config/models.py +240 -0
- messagefoundry/config/reference.py +158 -0
- messagefoundry/config/response.py +83 -0
- messagefoundry/config/run_context.py +153 -0
- messagefoundry/config/settings.py +1311 -0
- messagefoundry/config/state.py +99 -0
- messagefoundry/config/tls_policy.py +110 -0
- messagefoundry/config/wiring.py +1918 -0
- messagefoundry/console/__init__.py +20 -0
- messagefoundry/console/__main__.py +274 -0
- messagefoundry/console/_async.py +107 -0
- messagefoundry/console/change_password.py +111 -0
- messagefoundry/console/client.py +552 -0
- messagefoundry/console/connections.py +324 -0
- messagefoundry/console/login.py +107 -0
- messagefoundry/console/mfa.py +205 -0
- messagefoundry/console/reauth.py +94 -0
- messagefoundry/console/search.py +57 -0
- messagefoundry/console/service_control.py +137 -0
- messagefoundry/console/sessions.py +122 -0
- messagefoundry/console/shell.py +410 -0
- messagefoundry/console/status.py +377 -0
- messagefoundry/console/users_page.py +282 -0
- messagefoundry/console/widgets.py +553 -0
- messagefoundry/generators/README.md +27 -0
- messagefoundry/generators/__init__.py +15 -0
- messagefoundry/generators/_core.py +589 -0
- messagefoundry/generators/_hl7data.py +428 -0
- messagefoundry/generators/adt.py +286 -0
- messagefoundry/generators/all_types.py +24 -0
- messagefoundry/generators/bar.py +28 -0
- messagefoundry/generators/dft.py +20 -0
- messagefoundry/generators/mdm.py +39 -0
- messagefoundry/generators/mfn.py +46 -0
- messagefoundry/generators/oml.py +32 -0
- messagefoundry/generators/orl.py +30 -0
- messagefoundry/generators/orm.py +23 -0
- messagefoundry/generators/oru.py +21 -0
- messagefoundry/generators/ras.py +20 -0
- messagefoundry/generators/rde.py +54 -0
- messagefoundry/generators/siu.py +64 -0
- messagefoundry/generators/vxu.py +20 -0
- messagefoundry/hl7schema.py +75 -0
- messagefoundry/last_resort.py +55 -0
- messagefoundry/logging_setup.py +332 -0
- messagefoundry/parsing/__init__.py +64 -0
- messagefoundry/parsing/consistency.py +166 -0
- messagefoundry/parsing/groups.py +228 -0
- messagefoundry/parsing/message.py +453 -0
- messagefoundry/parsing/peek.py +237 -0
- messagefoundry/parsing/split.py +120 -0
- messagefoundry/parsing/summary.py +46 -0
- messagefoundry/parsing/tree.py +128 -0
- messagefoundry/parsing/validate.py +95 -0
- messagefoundry/parsing/x12/__init__.py +46 -0
- messagefoundry/parsing/x12/delimiters.py +140 -0
- messagefoundry/parsing/x12/errors.py +30 -0
- messagefoundry/parsing/x12/interchange.py +232 -0
- messagefoundry/parsing/x12/message.py +200 -0
- messagefoundry/parsing/x12/peek.py +207 -0
- messagefoundry/pipeline/__init__.py +21 -0
- messagefoundry/pipeline/alert_sinks.py +486 -0
- messagefoundry/pipeline/alerts.py +100 -0
- messagefoundry/pipeline/cert_expiry.py +219 -0
- messagefoundry/pipeline/cluster.py +955 -0
- messagefoundry/pipeline/cluster_sqlserver.py +444 -0
- messagefoundry/pipeline/config_convergence.py +137 -0
- messagefoundry/pipeline/dryrun.py +450 -0
- messagefoundry/pipeline/engine.py +756 -0
- messagefoundry/pipeline/leader_tasks.py +158 -0
- messagefoundry/pipeline/reference_sync.py +369 -0
- messagefoundry/pipeline/retention.py +289 -0
- messagefoundry/pipeline/security_notify.py +168 -0
- messagefoundry/pipeline/state_convergence.py +143 -0
- messagefoundry/pipeline/wiring_runner.py +1722 -0
- messagefoundry/py.typed +0 -0
- messagefoundry/redaction.py +71 -0
- messagefoundry/scaffold.py +321 -0
- messagefoundry/secrets_dpapi.py +129 -0
- messagefoundry/store/__init__.py +46 -0
- messagefoundry/store/audit_tee.py +67 -0
- messagefoundry/store/base.py +758 -0
- messagefoundry/store/crypto.py +166 -0
- messagefoundry/store/keyprovider.py +192 -0
- messagefoundry/store/postgres.py +3447 -0
- messagefoundry/store/sqlserver.py +3014 -0
- messagefoundry/store/store.py +3790 -0
- messagefoundry/timezone.py +207 -0
- messagefoundry/transports/__init__.py +50 -0
- messagefoundry/transports/base.py +269 -0
- messagefoundry/transports/database.py +693 -0
- messagefoundry/transports/file.py +551 -0
- messagefoundry/transports/framing.py +164 -0
- messagefoundry/transports/loopback.py +53 -0
- messagefoundry/transports/mllp.py +644 -0
- messagefoundry/transports/remotefile.py +664 -0
- messagefoundry/transports/rest.py +281 -0
- messagefoundry/transports/signing.py +321 -0
- messagefoundry/transports/soap.py +507 -0
- messagefoundry/transports/tcp.py +307 -0
- messagefoundry/transports/timer.py +146 -0
- messagefoundry/transports/x12.py +323 -0
- messagefoundry-0.1.0.dist-info/METADATA +212 -0
- messagefoundry-0.1.0.dist-info/RECORD +142 -0
- messagefoundry-0.1.0.dist-info/WHEEL +4 -0
- messagefoundry-0.1.0.dist-info/entry_points.txt +2 -0
- messagefoundry-0.1.0.dist-info/licenses/LICENSE +662 -0
- messagefoundry-0.1.0.dist-info/licenses/NOTICE +27 -0
|
@@ -0,0 +1,1722 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 MessageFoundry Organization and contributors
|
|
3
|
+
"""Run a code-first wiring :class:`~messagefoundry.config.wiring.Registry` as a **staged pipeline**.
|
|
4
|
+
|
|
5
|
+
Staged pipeline (ADR 0001, Step A): for each **inbound connection** a listener decodes/parses/
|
|
6
|
+
(strict-)validates each message **synchronously** (still NAKing those failures), then commits the
|
|
7
|
+
raw to the **ingress** stage and ACKs (**ACK-on-receipt**). A per-inbound **ingress worker** then
|
|
8
|
+
runs the **Router** (returns handler names) + named **Handlers** (filter → transform → ``Send``,
|
|
9
|
+
combined — not split) and **hands off** the resulting deliveries to the **outbound** stage in one
|
|
10
|
+
transaction. One delivery worker per **outbound connection** drains its rows (across all inbounds)
|
|
11
|
+
independently, with retries. Router/Handlers are pure; a re-run after a crash re-derives the same
|
|
12
|
+
output (at-least-once).
|
|
13
|
+
|
|
14
|
+
Every received message is persisted before the ACK (``RECEIVED``); its disposition is then recorded
|
|
15
|
+
as it flows (the count-and-log invariant): ``ROUTED`` (≥1 delivery → ``PROCESSED`` once drained),
|
|
16
|
+
``UNROUTED`` (router routed nowhere), ``FILTERED`` (handlers dropped it), or ``ERROR``/dead-letter at
|
|
17
|
+
the failing stage. Decode/parse/validate failures still NAK + record ``ERROR`` synchronously;
|
|
18
|
+
routing/transform failures are post-ACK (no NAK — a logged ``ERROR``/dead-letter + alert).
|
|
19
|
+
|
|
20
|
+
Reuses the store, the connector registry, and the ACK builder.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from __future__ import annotations
|
|
24
|
+
|
|
25
|
+
import asyncio
|
|
26
|
+
import functools
|
|
27
|
+
import json
|
|
28
|
+
import logging
|
|
29
|
+
import time
|
|
30
|
+
import urllib.parse
|
|
31
|
+
from collections.abc import Mapping
|
|
32
|
+
from pathlib import Path
|
|
33
|
+
from typing import Any
|
|
34
|
+
|
|
35
|
+
from messagefoundry.config.models import (
|
|
36
|
+
AckAfter,
|
|
37
|
+
AckMode,
|
|
38
|
+
BuildupThreshold,
|
|
39
|
+
ConnectorType,
|
|
40
|
+
ContentType,
|
|
41
|
+
Destination,
|
|
42
|
+
InternalErrorPolicy,
|
|
43
|
+
OrderingMode,
|
|
44
|
+
OutboundSigning,
|
|
45
|
+
RetryPolicy,
|
|
46
|
+
Source,
|
|
47
|
+
)
|
|
48
|
+
from messagefoundry.config.db_lookup import DbLookupError, activated as db_lookup_activated
|
|
49
|
+
from messagefoundry.config.run_context import RunContext, run_contexts
|
|
50
|
+
from messagefoundry.config.settings import EgressSettings
|
|
51
|
+
from messagefoundry.config.wiring import (
|
|
52
|
+
InboundConnection,
|
|
53
|
+
OutboundConnection,
|
|
54
|
+
Registry,
|
|
55
|
+
WiringError,
|
|
56
|
+
resolve_env_settings,
|
|
57
|
+
)
|
|
58
|
+
from messagefoundry.parsing import HL7PeekError, Peek, normalize, summarize, validate
|
|
59
|
+
from messagefoundry.pipeline.alerts import AlertSink, LoggingAlertSink
|
|
60
|
+
from messagefoundry.pipeline.cluster import ClusterCoordinator, NullCoordinator
|
|
61
|
+
from messagefoundry.redaction import safe_exc, safe_text
|
|
62
|
+
from messagefoundry.pipeline.dryrun import route_only, transform_one
|
|
63
|
+
from messagefoundry.store import MessageStatus, QueueStore, Stage
|
|
64
|
+
from messagefoundry.transports import (
|
|
65
|
+
DeliveryError,
|
|
66
|
+
DestinationConnector,
|
|
67
|
+
NegativeAckError,
|
|
68
|
+
SourceConnector,
|
|
69
|
+
build_destination,
|
|
70
|
+
build_source,
|
|
71
|
+
)
|
|
72
|
+
from messagefoundry.transports.database import DatabaseLookupExecutor
|
|
73
|
+
from messagefoundry.transports.mllp import build_ack
|
|
74
|
+
|
|
75
|
+
__all__ = ["RegistryRunner"]
|
|
76
|
+
|
|
77
|
+
log = logging.getLogger(__name__)
|
|
78
|
+
|
|
79
|
+
# Pause a delivery worker takes after an *unexpected* error (for example the store being briefly
# unavailable) before it tries again: a transient failure is logged once and recovered from
# instead of being hot-looped.
_WORKER_ERROR_BACKOFF_SECONDS = 1.0

# Minimum spacing, per connection, between repeated queue_buildup alerts while a lane stays over
# its threshold — an ongoing stall keeps reminding the operator without an alert on every
# backed-off retry.
_BUILDUP_REALERT_SECONDS = 300.0

# The ingress worker has no per-message "failure" on which to hang a buildup check (a slow but
# working router simply falls behind), so it samples the lane depth at most this often. That caps
# the additional COUNT+MIN query rate on the ingress hot path independently of throughput.
_BUILDUP_CHECK_INTERVAL = 1.0

# Upper bound on how long a handler's worker thread waits for one db_lookup() result (ADR 0010).
# A live lookup that runs longer raises (the message's transform then fails and dead-letters)
# rather than pinning the worker thread forever; the orphaned query still completes on the loop
# and releases its connection.
_LOOKUP_RESULT_TIMEOUT_SECONDS = 30.0
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _peek_for_loopback(
    ic: InboundConnection, body: str
) -> tuple[str | None, str | None, str | None, bool]:
    """Peek a re-ingressed loopback body (ADR 0013 Increment 2, Q5).

    Returns ``(control_id, message_type, summary, peek_failed)``.

    * HL7V2 inbound: the body is run through ``Peek.parse``. On success the peeked control id and
      message type are returned together with ``summarize(peek)`` (an empty summary becomes
      ``None``). An ``HL7PeekError`` is not propagated; it is reported as
      ``(None, None, None, True)`` so the caller can record the failure instead of dropping it.
    * Any other ``content_type``: the body is not parsed at all; ``message_type`` is the content
      type's ``value`` and ``peek_failed`` is ``False``.

    The parsing step lives here in ``pipeline/`` rather than in the store, keeping the store
    parsing-free.
    """
    if ic.content_type is not ContentType.HL7V2:
        # Relayed verbatim: no control id, no summary, the content type stands in for the type.
        return None, ic.content_type.value, None, False
    try:
        parsed = Peek.parse(body)
    except HL7PeekError:
        return None, None, None, True
    else:
        return parsed.control_id, parsed.message_type, (summarize(parsed) or None), False
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class RegistryRunner:
|
|
117
|
+
"""Runs every inbound connection in a Registry + one delivery worker per outbound."""
|
|
118
|
+
|
|
119
|
+
def __init__(
    self,
    registry: Registry,
    store: QueueStore,
    *,
    poll_interval: float = 0.25,
    claim_limit: int = 20,
    inbound_bind_host: str = "127.0.0.1",
    allow_insecure_bind: bool = False,
    delivery_defaults: RetryPolicy | None = None,
    ordering_default: OrderingMode | None = None,
    internal_error_default: InternalErrorPolicy | None = None,
    buildup_default: BuildupThreshold | None = None,
    ack_after_default: AckAfter | None = None,
    alert_sink: AlertSink | None = None,
    egress: EgressSettings | None = None,
    simulate_all: bool = False,
    env_values: Mapping[str, Any] | None = None,
    active_environment: str | None = None,
    coordinator: ClusterCoordinator | None = None,
    max_correlation_depth: int = 8,
) -> None:
    """Record the wiring graph, the store and every service-level default; nothing is started.

    Each optional collaborator/default that is ``None`` (or otherwise falsy) falls back to a
    built-in: ``NullCoordinator()``, ``RetryPolicy()``, ``OrderingMode.FIFO``,
    ``InternalErrorPolicy.CONTINUE``, ``BuildupThreshold()``, ``AckAfter.INGEST``,
    ``LoggingAlertSink()`` and ``EgressSettings()``. ``env_values`` is copied into a private dict.
    All per-connection maps start empty and the wake/stop events start unset.
    """
    self.registry = registry
    self.store = store
    # Loop-prevention cap for re-ingress (ADR 0013 Increment 2), from [pipeline]
    # max_correlation_depth. A re-ingressed message at this correlation depth still routes; the
    # next hop (depth+1) dead-letters its work-row and ERRORs the origin. Deliberately coarse: it
    # bounds total work, not topology.
    self._max_correlation_depth = max_correlation_depth
    # Cluster coordination seam (Track B). With no coordinator supplied the no-op
    # NullCoordinator is used, so single-node operation needs no special casing. Its cheap,
    # synchronous gates are consulted elsewhere in this class — e.g. `is_leader` is handed to
    # each source as its leader gate when an inbound is started.
    self._coordinator: ClusterCoordinator = coordinator if coordinator else NullCoordinator()
    # Name of the active environment ([ai].environment / serve --env). Published around each
    # router/transform run so a Handler's current_environment() resolves; it is a deployment
    # constant, which keeps that read pure and safe to re-run.
    self._active_environment = active_environment
    self.poll_interval = poll_interval
    self.claim_limit = claim_limit
    # Global outbound defaults (from [delivery]). Precedence for every one of them is:
    # per-connection override > global default > built-in.
    self._delivery_defaults = delivery_defaults or RetryPolicy()
    self._ordering_default = ordering_default or OrderingMode.FIFO
    self._internal_error_default = internal_error_default or InternalErrorPolicy.CONTINUE
    self._buildup_default = buildup_default or BuildupThreshold()
    # Global inbound ACK-timing default (from [inbound]); a connection's own ack_after wins.
    # Only INGEST (ACK-on-receipt) is supported — a resolved DELIVERED is rejected loudly when the
    # inbound is started.
    self._ack_after_default = ack_after_default or AckAfter.INGEST
    # Destination for operational-stall reports from the delivery workers (a stopped connection,
    # a growing backlog). The logging sink is the fallback until a real notifier is wired
    # (docs/BACKLOG.md item 5).
    self._alert_sink: AlertSink = alert_sink if alert_sink else LoggingAlertSink()
    # Fail-closed outbound destination allowlist (WP-11c); empty means unrestricted. It is
    # enforced at build_check (config load/reload) and at start, so a non-allowed destination is
    # refused.
    self._egress = egress or EgressSettings()
    # Deployment-wide shadow override ([shadow].simulate_all_egress, #15): when True, EVERY
    # outbound runs egress-suppressed whatever its own simulate= flag says. The per-connection
    # result (own flag OR this) is stored in self._simulate.
    self._simulate_all = simulate_all
    # Interface the inbound listeners bind to. Service-level — authors never set a host — and
    # loopback by default (see config.settings.InboundSettings.bind_host).
    self._inbound_bind_host = inbound_bind_host
    # True when `serve --allow-insecure-bind` was given: the dev escape hatch that turns the MLLP
    # exposed-gate (a non-loopback plaintext bind) from a refusal into a loud warning
    # (ADR 0002 §0).
    self._allow_insecure_bind = allow_insecure_bind
    # This instance's environment values (DEV/PROD). env() references in connection specs are
    # resolved against this map whenever a connector is built; a missing key fails loud (see
    # resolve_env_settings).
    self._env_values: dict[str, Any] = dict(env_values) if env_values else {}
    # Live connectors, keyed by connection name.
    self._sources: dict[str, SourceConnector] = {}
    self._destinations: dict[str, DestinationConnector] = {}
    # One delivery worker per outbound connection. Keyed by name so a reload can gracefully
    # stop/swap one connection's worker and leave its siblings untouched.
    self._workers: dict[str, asyncio.Task[None]] = {}
    # Two workers per inbound connection (staged pipeline, ADR 0001 Step B): the ROUTER worker
    # drains the ingress stage (Router → routed-stage rows); the TRANSFORM worker drains the
    # routed stage (handler transform → outbound rows). They run whether or not the source is
    # listening, so anything already ACKed at ingress is always carried through, even while the
    # source is stopped. Keyed by inbound name so a reload/restart can re-arm one in place.
    self._router_workers: dict[str, asyncio.Task[None]] = {}
    self._transform_workers: dict[str, asyncio.Task[None]] = {}
    # ADR 0013 Increment 2: one RESPONSE worker per LOOPBACK inbound, draining that inbound's
    # Stage.RESPONSE tokens (a captured reply owes a re-ingress) via ingress_handoff. Other
    # inbounds have no entry here.
    self._response_workers: dict[str, asyncio.Task[None]] = {}
    # Per-outbound settings. Connector + retry are looked up again for every item, which lets a
    # reload swap an outbound's settings beneath a running worker without tearing it down.
    self._retry: dict[str, RetryPolicy] = {}
    self._ordering: dict[str, OrderingMode] = {}
    self._internal_error: dict[str, InternalErrorPolicy] = {}
    self._buildup: dict[str, BuildupThreshold] = {}
    # Effective egress suppression per connection (#15): its own simulate= OR simulate_all.
    self._simulate: dict[str, bool] = {}
    # Re-alert throttle per connection: earliest time the next queue_buildup alert may fire.
    self._next_buildup_alert: dict[str, float] = {}
    # Live-lookup executor (db_lookup, ADR 0010). Built from registry.lookups at start/reload and
    # left as None when the graph declares no DatabaseLookup, in which case the transform path is
    # an inline call (no thread hop, no runner). The engine loop is captured at start so that a
    # handler's worker thread can bridge a db_lookup back onto it (run_coroutine_threadsafe).
    self._lookup_executor: DatabaseLookupExecutor | None = None
    self._loop: asyncio.AbstractEventLoop | None = None
    self._stop = asyncio.Event()
    # One wake event per stage, so a producer wakes only its own downstream consumer class. With
    # a single shared auto-clearing event an idle worker of one class could swallow another
    # class's wake-up (a lost wake-up) — hidden by poll_interval, but defeating the prompt set().
    # Listener → router uses _ingress_work; router → transform uses _routed_work; transform /
    # replay → delivery uses _work. Every worker class waits on, and clears, only its own event.
    self._ingress_work = asyncio.Event()
    self._routed_work = asyncio.Event()
    # ADR 0013 Increment 2: wakes the per-loopback re-ingress worker once a Stage.RESPONSE
    # work-row exists (a captured reply owes a re-ingress); sibling of the two events above.
    self._response_work = asyncio.Event()
    self._work = asyncio.Event()
    self._running = False
    # Serializes concurrent reloads and the per-connection start/stop/restart entry points.
    self._reload_lock = asyncio.Lock()
|
|
231
|
+
|
|
232
|
+
@property
def running(self) -> bool:
    """Read-only view of the ``_running`` flag (``False`` on a freshly constructed runner)."""
    return self._running
|
|
235
|
+
|
|
236
|
+
@property
def coordinator(self) -> ClusterCoordinator:
    """The cluster coordinator held by this runner (Track B Step 3).

    This is whatever was passed to the constructor, or the ``NullCoordinator`` fallback. It is
    exposed so other code can consult its cheap, synchronous gates (``is_leader`` /
    ``owns_lane``).
    """
    return self._coordinator
|
|
241
|
+
|
|
242
|
+
def notify_work(self) -> None:
    """Set all four stage wake events at once.

    Intended for producers that cannot tell which stage they fed (e.g. a replay that re-queues
    rows at an unknown stage): ingress, routed, response and delivery workers are all woken.
    """
    stage_events = (self._ingress_work, self._routed_work, self._response_work, self._work)
    for wake in stage_events:
        wake.set()
|
|
248
|
+
|
|
249
|
+
def set_env_values(self, values: Mapping[str, Any]) -> None:
    """Swap in a new set of environment values for resolving ``env()`` references.

    The mapping is copied, so later changes to ``values`` by the caller are not observed. The
    new values apply to connectors (re)built afterwards. The engine calls this on reload so a
    promote picks up edited values without a restart (M-23).
    """
    snapshot = dict(values)
    self._env_values = snapshot
|
|
253
|
+
|
|
254
|
+
def _build_lookup_executor(self) -> DatabaseLookupExecutor | None:
    """Create the live-lookup executor for the current graph's ``DatabaseLookup`` specs.

    Returns ``None`` when ``registry.lookups`` is empty, so a graph without lookups gets no
    executor at all. Otherwise every spec has its ``env()`` references resolved against the
    current environment values and is then checked against the egress settings
    (``check_lookup_allowed``) before the executor is built from the resolved settings. Any
    error raised by the resolution or the check propagates to the caller.
    """
    lookup_specs = self.registry.lookups
    if not lookup_specs:
        return None
    settings_by_name: dict[str, dict[str, Any]] = {}
    for lookup_name, lookup_spec in lookup_specs.items():
        lookup_settings = resolve_env_settings(lookup_spec.settings, self._env_values)
        # Fail closed: a lookup server outside the allowlist stops the build right here.
        check_lookup_allowed(lookup_name, lookup_settings, self._egress)
        settings_by_name[lookup_name] = lookup_settings
    return DatabaseLookupExecutor(settings_by_name)
|
|
268
|
+
|
|
269
|
+
def _run_lookup(
    self, connection: str, statement: str, params: Mapping[str, Any] | None
) -> list[dict[str, Any]]:
    """Run one live lookup on behalf of a Handler (``db_lookup`` resolves to this method).

    Meant to be called from a thread other than the engine loop's: the executor's async
    ``query`` is scheduled onto the captured loop with ``run_coroutine_threadsafe`` and the
    calling thread then blocks on the returned future for at most
    ``_LOOKUP_RESULT_TIMEOUT_SECONDS``. A wait that exceeds the bound raises from
    ``Future.result`` rather than blocking forever.

    Raises ``DbLookupError`` when no executor or no loop is available.
    """
    lookup_executor = self._lookup_executor
    engine_loop = self._loop
    # Defensive: the runner is only meant to be published when both of these exist.
    if lookup_executor is None or engine_loop is None:
        raise DbLookupError("db_lookup is unavailable — no lookup connections are configured")
    pending = asyncio.run_coroutine_threadsafe(
        lookup_executor.query(connection, statement, params), engine_loop
    )
    return pending.result(_LOOKUP_RESULT_TIMEOUT_SECONDS)
|
|
284
|
+
|
|
285
|
+
# --- per-connection control (console operations) -------------------------
|
|
286
|
+
|
|
287
|
+
def inbound_running(self, name: str) -> bool:
    """Whether a live source is currently registered for the inbound connection ``name``."""
    live_sources = self._sources
    return name in live_sources
|
|
289
|
+
|
|
290
|
+
def outbound_simulated(self, name: str) -> bool:
    """Report whether the outbound ``name`` is in **simulate** mode — egress suppressed (#15).

    The answer is the *effective* flag (per-connection ``simulate=`` OR
    ``[shadow].simulate_all_egress``), as shown by the ``/connections`` API and the console.

    Lookup order:

    1. the value already stored in ``_simulate`` for this name — this is also the only source
       for an outbound the registry no longer declares;
    2. otherwise the flag is derived from the registry's declaration of the connection;
    3. a name known to neither yields ``False``.
    """
    try:
        return self._simulate[name]
    except KeyError:
        pass
    declared = self.registry.outbound.get(name)
    if declared is None:
        return False
    return bool(declared.simulate) or self._simulate_all
|
|
303
|
+
|
|
304
|
+
def _resolve_simulate(self, name: str, oc: OutboundConnection) -> bool:
    """Compute a connection's effective simulate flag (its own ``simulate`` OR ``simulate_all``).

    A warning is logged only on the transition into simulate mode, i.e. when the flag is on and
    ``_simulate`` does not already record the lane as simulated. This method does not write to
    ``_simulate``; the caller stores the returned value.
    """
    effective = bool(oc.simulate) or self._simulate_all
    previously_simulated = self._simulate.get(name, False)
    if effective and not previously_simulated:
        log.warning(
            "outbound %r is in SIMULATE mode — real egress SUPPRESSED (no delivery to the live "
            "peer); messages still finalize PROCESSED for shadow/parallel-run comparison (#15)",
            name,
        )
    return effective
|
|
315
|
+
|
|
316
|
+
def build_test_connector(self, name: str) -> tuple[str, SourceConnector | DestinationConnector]:
    """Build a **fresh** connector for ``name`` so it can be reachability-tested.

    The live connector in ``_sources``/``_destinations`` is never returned, because probing it
    would disturb running traffic. The config goes through the same ``env()`` resolution and
    fail-closed ``[egress]`` allowlist check as a real build. Inbound connections are looked up
    first, then outbound ones.

    Returns ``("in", source)`` for an inbound or ``("out", destination)`` for an outbound.

    Raises :class:`KeyError` if ``name`` is neither, and :class:`WiringError` on a bad
    ``env()`` / egress. The caller must close the connector (``stop()`` / ``aclose()``) after
    testing.
    """
    inbound = self.registry.inbound.get(name)
    if inbound is not None:
        inbound_cfg = _source_config(inbound, self._inbound_bind_host, self._env_values)
        check_source_allowed(inbound_cfg, name, self._egress)
        return "in", build_source(inbound_cfg)
    outbound = self.registry.outbound.get(name)
    if outbound is None:
        raise KeyError(name)
    outbound_cfg = _dest_config(outbound, self._env_values)
    check_egress_allowed(outbound_cfg, self._egress)
    return "out", build_destination(outbound_cfg)
|
|
334
|
+
|
|
335
|
+
async def start_inbound(self, name: str) -> None:
    """Begin receiving on the inbound connection ``name`` (no-op if it is already listening).

    This is the public console/API entry point. It holds the reload lock for the whole
    operation so it cannot interleave with a concurrent reload()/stop() that mutates
    _sources/_workers (review M-10). Code that already holds the lock (start, reload) must call
    :meth:`_start_inbound_unsafe` directly instead.
    """
    reload_lock = self._reload_lock
    async with reload_lock:
        await self._start_inbound_unsafe(name)
|
|
343
|
+
|
|
344
|
+
async def stop_inbound(self, name: str) -> None:
    """Stop receiving on the inbound connection ``name`` while holding the reload lock.

    Only the source is stopped; worker tasks are not touched here, so already-accepted messages
    keep draining.
    """
    reload_lock = self._reload_lock
    async with reload_lock:
        await self._stop_inbound_unsafe(name)
|
|
348
|
+
|
|
349
|
+
async def restart_inbound(self, name: str) -> None:
    """Stop and then start the inbound connection ``name`` under a single lock span.

    Both steps run while the reload lock is held, which makes stop+start atomic with respect to
    a concurrent reload (review M-10).
    """
    reload_lock = self._reload_lock
    async with reload_lock:
        await self._stop_inbound_unsafe(name)
        await self._start_inbound_unsafe(name)
|
|
354
|
+
|
|
355
|
+
async def _start_inbound_unsafe(self, name: str) -> None:
    """Lock-free body of :meth:`start_inbound`, for callers that already hold the reload lock.

    ``asyncio.Lock`` is not reentrant, which is why the public wrappers must not call each
    other and the locked callers (start, reload) come here directly.

    Steps, in order: return early if ``name`` is already listening; reject a resolved
    ``ack_after='delivered'``; build the source config and run the connect-allowlist and MLLP
    TLS-exposure checks; build and start the source; register it in ``_sources``; finally, if
    the runner is up, make sure this inbound's workers exist.

    Raises :class:`KeyError` for an unknown ``name`` and :class:`WiringError` for an unsupported
    ``ack_after``; whatever the checks or ``source.start`` raise propagates as well.
    """
    if name in self._sources:
        return
    inbound = self.registry.inbound[name]
    # ACK timing: per-connection override > global default. Only ACK-on-receipt is shipped, so a
    # resolved 'delivered' is refused loudly here instead of being silently downgraded (this
    # covers a global [inbound] ack_after='delivered' inherited by a connection). The comparison
    # is by VALUE, not identity — AckAfter is a str-Enum and a stray raw 'delivered' string must
    # be caught too.
    resolved_ack_after = inbound.ack_after or self._ack_after_default
    if resolved_ack_after == AckAfter.DELIVERED:
        raise WiringError(
            f"inbound connection {name!r}: ack_after='delivered' is not yet implemented "
            "(Step A ships ACK-on-receipt only — use ack_after='ingest', the default)"
        )
    cfg = _source_config(inbound, self._inbound_bind_host, self._env_values)
    # Fail-closed connect allowlist.
    check_source_allowed(cfg, inbound.name, self._egress)
    # Exposed-gate (ADR 0002 §0): a non-loopback MLLP listener without TLS is refused at start.
    check_mllp_tls_exposure(cfg, inbound.name, allow_insecure_bind=self._allow_insecure_bind)
    listener = build_source(cfg)
    # Leader gate (Track B Step 4b): the bound method `is_leader` — a cheap synchronous
    # Callable[[], bool] — is passed rather than the coordinator object, so transports/ needs no
    # pipeline/cluster import. Only POLL sources act on it (skipping a scan while it returns
    # False, so exactly one node ingests a shared dir / DB table / remote dir); LISTEN sources
    # (MLLP/TCP) accept and ignore it. Under NullCoordinator it is always True.
    #
    # The source is started BEFORE it is registered: a failed bind (e.g. port in use) must not
    # leave a dead entry in _sources, where inbound_running() would report True and a retry would
    # no-op (review M-9).
    await listener.start(self._make_handler(inbound), leader_gate=self._coordinator.is_leader)
    self._sources[name] = listener
    # Logged once per start, never per tick, so an operator can see that only the leader polls.
    if getattr(listener, "polls_shared_resource", False):
        log.info(
            "inbound %r polls a shared external resource; intake is leader-gated (only the "
            "cluster leader polls it — single-node always does)",
            name,
        )
    # Router + transform workers belong to the registry entry, not to the source. Re-arming them
    # here means a per-connection start/restart (or a reload) revives a worker that has exited
    # (e.g. halted by the STOP internal-error policy); without that, the restarted source would
    # ACK-on-receipt into a backlog nobody drains. Skipped until the runner is up, because
    # start()'s own spawn loop owns first boot.
    if self._running:
        self._ensure_inbound_workers(name)
|
|
401
|
+
|
|
402
|
+
async def _stop_inbound_unsafe(self, name: str) -> None:
|
|
403
|
+
"""stop_inbound body without the reload lock — for callers that already hold it."""
|
|
404
|
+
source = self._sources.pop(name, None)
|
|
405
|
+
if source is not None:
|
|
406
|
+
await source.stop()
|
|
407
|
+
|
|
408
|
+
async def start(self) -> None:
    """Wire and start the whole graph, unwinding everything if any step fails.

    Under ``_reload_lock``, and only when not already running: build each outbound connector
    (egress-checked) and spawn its delivery worker, build the lookup executor, start every
    inbound source, then ensure the per-inbound workers. ``_running`` becomes True only after
    all of that succeeded.

    Raises:
        RuntimeError: an outbound's connector sets ``capture_response`` while the store reports
            ``supports_response_capture`` as false (ADR 0013).
        Exception: any other wiring failure, re-raised after ``_teardown_unsafe`` has run.
    """
    async with self._reload_lock:
        if self._running:
            return
        self._stop.clear()
        # Remember the engine loop so a handler's worker thread can bridge a db_lookup onto it.
        self._loop = asyncio.get_running_loop()
        try:
            for name, oc in self.registry.outbound.items():
                dest_cfg = _dest_config(oc, self._env_values)
                check_egress_allowed(dest_cfg, self._egress)  # fail-closed egress allowlist (WP-11c)
                connector = build_destination(dest_cfg)
                # Registered BEFORE the capture check so the unwind below still closes it.
                self._destinations[name] = connector
                # ADR 0013: fail closed at start if a capturing outbound is wired on a backend
                # that can't persist captures (the SQL Server preview) — never drop replies.
                if getattr(connector, "capture_response", False) and not getattr(
                    self.store, "supports_response_capture", True
                ):
                    raise RuntimeError(
                        f"outbound {name!r} sets capture_response=True but the store backend does "
                        "not support request/response capture (ADR 0013); use the SQLite or "
                        "Postgres backend"
                    )
                # Per-outbound policies fall back to the engine-wide defaults.
                self._retry[name] = oc.retry or self._delivery_defaults
                self._ordering[name] = oc.ordering or self._ordering_default
                self._internal_error[name] = oc.internal_error or self._internal_error_default
                self._buildup[name] = oc.buildup or self._buildup_default
                self._simulate[name] = self._resolve_simulate(name, oc)
                self._spawn_worker(name)

            # Live-lookup executor built from the graph (env-resolved + egress-checked there);
            # None when no DatabaseLookup is declared.
            self._lookup_executor = self._build_lookup_executor()

            for ic in self.registry.inbound.values():
                await self._start_inbound_unsafe(ic.name)

            # Router + transform workers per inbound are spawned only after every source has
            # bound, so a bind failure above unwinds before any inbound worker exists.
            for inbound_name in self.registry.inbound:
                self._ensure_inbound_workers(inbound_name)
        except Exception:
            # A partial start (typically an inbound bind failure) must not leave half the graph
            # wired with _running still False — release everything so a retry can rebind the
            # same ports (review M-8).
            log.exception("wiring start failed; unwinding the partial start")
            await self._teardown_unsafe()
            raise

        self._running = True
        inbound_count = len(self.registry.inbound)
        outbound_count = len(self.registry.outbound)
        log.info(
            "wiring started: %d inbound, %d outbound connection(s)",
            inbound_count,
            outbound_count,
        )
|
|
461
|
+
|
|
462
|
+
async def stop(self) -> None:
    """Tear the graph down, serialized against an in-flight reload via ``_reload_lock``.

    Teardown always runs; "wiring stopped" is logged only when there was something to stop
    (the runner was running, or sources/workers/destinations were still registered).
    """
    async with self._reload_lock:
        # Sample BEFORE teardown, which clears all of these.
        was_active = self._running or bool(self._sources or self._workers or self._destinations)
        await self._teardown_unsafe()
        if not was_active:
            return
        log.info("wiring stopped")
|
|
468
|
+
|
|
469
|
+
async def _teardown_unsafe(self) -> None:
|
|
470
|
+
"""Tear down all sources/workers/destinations and mark stopped. Lock-free (callers hold
|
|
471
|
+
_reload_lock) and idempotent — cleans up whatever is registered even if the runner never
|
|
472
|
+
reached _running, so a half-started runner (review M-8) and a double stop() are both safe."""
|
|
473
|
+
self._stop.set()
|
|
474
|
+
self._ingress_work.set()
|
|
475
|
+
self._routed_work.set()
|
|
476
|
+
self._response_work.set()
|
|
477
|
+
self._work.set()
|
|
478
|
+
for source in self._sources.values():
|
|
479
|
+
await source.stop()
|
|
480
|
+
inbound_tasks = (
|
|
481
|
+
*self._router_workers.values(),
|
|
482
|
+
*self._transform_workers.values(),
|
|
483
|
+
*self._response_workers.values(),
|
|
484
|
+
)
|
|
485
|
+
for task in (*self._workers.values(), *inbound_tasks):
|
|
486
|
+
task.cancel()
|
|
487
|
+
await asyncio.gather(*self._workers.values(), *inbound_tasks, return_exceptions=True)
|
|
488
|
+
for connector in self._destinations.values():
|
|
489
|
+
await connector.aclose()
|
|
490
|
+
if self._lookup_executor is not None:
|
|
491
|
+
await self._lookup_executor.aclose()
|
|
492
|
+
self._lookup_executor = None
|
|
493
|
+
self._workers.clear()
|
|
494
|
+
self._router_workers.clear()
|
|
495
|
+
self._transform_workers.clear()
|
|
496
|
+
self._response_workers.clear()
|
|
497
|
+
self._destinations.clear()
|
|
498
|
+
self._retry.clear()
|
|
499
|
+
self._internal_error.clear()
|
|
500
|
+
self._buildup.clear()
|
|
501
|
+
self._simulate.clear()
|
|
502
|
+
self._next_buildup_alert.clear()
|
|
503
|
+
self._sources.clear()
|
|
504
|
+
self._running = False
|
|
505
|
+
|
|
506
|
+
# --- outbound worker management ------------------------------------------
|
|
507
|
+
|
|
508
|
+
def _spawn_worker(self, name: str) -> None:
|
|
509
|
+
"""Start a delivery worker for one outbound connection (drains its outbox rows)."""
|
|
510
|
+
task = asyncio.create_task(self._delivery_worker(name))
|
|
511
|
+
task.add_done_callback(functools.partial(self._on_worker_done, name))
|
|
512
|
+
self._workers[name] = task
|
|
513
|
+
|
|
514
|
+
def _on_worker_done(self, name: str, task: asyncio.Task[None]) -> None:
|
|
515
|
+
"""A delivery worker should only finish on shutdown — its loop swallows + backs off on
|
|
516
|
+
errors. If one somehow dies while the engine is running, log and respawn so the destination
|
|
517
|
+
keeps draining rather than silently stalling (review H-1)."""
|
|
518
|
+
if self._stop.is_set() or not self._running or task.cancelled():
|
|
519
|
+
return # expected shutdown / cancellation
|
|
520
|
+
if task.exception() is None:
|
|
521
|
+
return
|
|
522
|
+
if self._workers.get(name) is task: # still the registered worker (not mid-reconcile/stop)
|
|
523
|
+
log.error(
|
|
524
|
+
"delivery worker %r exited unexpectedly; respawning",
|
|
525
|
+
name,
|
|
526
|
+
exc_info=task.exception(),
|
|
527
|
+
)
|
|
528
|
+
self._spawn_worker(name)
|
|
529
|
+
|
|
530
|
+
def _inbound_worker_coro(self, kind: str): # type: ignore[no-untyped-def]
|
|
531
|
+
"""The coroutine factory for an inbound worker ``kind`` (``router`` | ``transform`` |
|
|
532
|
+
``response``). The ``response`` worker (ADR 0013) runs only for loopback inbounds."""
|
|
533
|
+
return {
|
|
534
|
+
"router": self._router_worker,
|
|
535
|
+
"transform": self._transform_worker,
|
|
536
|
+
"response": self._response_worker,
|
|
537
|
+
}[kind]
|
|
538
|
+
|
|
539
|
+
def _inbound_worker_dict(self, kind: str) -> dict[str, asyncio.Task[None]]:
|
|
540
|
+
return {
|
|
541
|
+
"router": self._router_workers,
|
|
542
|
+
"transform": self._transform_workers,
|
|
543
|
+
"response": self._response_workers,
|
|
544
|
+
}[kind]
|
|
545
|
+
|
|
546
|
+
def _ensure_inbound_workers(self, name: str) -> None:
|
|
547
|
+
"""Ensure the router + transform (+ for a loopback inbound, the response) workers for one inbound
|
|
548
|
+
are running, spawning any that exited (a STOP-policy halt, a reload adding the inbound, or a
|
|
549
|
+
crash). Idempotent — the shared re-arm used by start(), start_inbound(), and reload()."""
|
|
550
|
+
kinds = ["router", "transform"]
|
|
551
|
+
ic = self.registry.inbound.get(name)
|
|
552
|
+
if ic is not None and ic.spec.type is ConnectorType.LOOPBACK:
|
|
553
|
+
# ADR 0013: a loopback inbound also gets a RESPONSE worker draining its Stage.RESPONSE tokens.
|
|
554
|
+
kinds.append("response")
|
|
555
|
+
for kind in kinds:
|
|
556
|
+
task = self._inbound_worker_dict(kind).get(name)
|
|
557
|
+
if task is None or task.done():
|
|
558
|
+
self._spawn_inbound_worker(kind, name)
|
|
559
|
+
|
|
560
|
+
def _spawn_inbound_worker(self, kind: str, name: str) -> None:
|
|
561
|
+
"""Start the ``kind`` (router/transform) worker for one inbound connection."""
|
|
562
|
+
workers = self._inbound_worker_dict(kind)
|
|
563
|
+
task = asyncio.create_task(self._inbound_worker_coro(kind)(name))
|
|
564
|
+
task.add_done_callback(functools.partial(self._on_inbound_worker_done, kind, name))
|
|
565
|
+
workers[name] = task
|
|
566
|
+
|
|
567
|
+
def _on_inbound_worker_done(self, kind: str, name: str, task: asyncio.Task[None]) -> None:
|
|
568
|
+
"""A router/transform worker should only finish on shutdown or a STOP-policy halt. If it dies
|
|
569
|
+
on an unexpected error while running, respawn it so the inbound keeps processing (mirrors the
|
|
570
|
+
delivery worker's supervisor). A STOP-policy halt returns normally (no exception) and is left
|
|
571
|
+
down until a reload re-arms it."""
|
|
572
|
+
if self._stop.is_set() or not self._running or task.cancelled():
|
|
573
|
+
return # expected shutdown / cancellation
|
|
574
|
+
if task.exception() is None:
|
|
575
|
+
return # normal return (e.g. STOP policy halted the lane) — not respawned
|
|
576
|
+
if self._inbound_worker_dict(kind).get(name) is task:
|
|
577
|
+
log.error(
|
|
578
|
+
"%s worker %r exited unexpectedly; respawning",
|
|
579
|
+
kind,
|
|
580
|
+
name,
|
|
581
|
+
exc_info=task.exception(),
|
|
582
|
+
)
|
|
583
|
+
self._spawn_inbound_worker(kind, name)
|
|
584
|
+
|
|
585
|
+
def build_check(self, registry: Registry) -> None:
    """Validate ``registry`` by constructing (and discarding) every connector in it.

    Delegates to ``build_check_registry`` with this runner's inbound bind host, environment
    values and egress policy, so a bad connector spec fails BEFORE a reload quiesces anything
    and the running graph is left untouched. Per this method's contract, construction is
    side-effect-free (no socket bind / file I/O — binding happens later in ``start_inbound``).

    Raises :class:`WiringError` so the API maps it to 422 like other invalid-config errors.
    """
    runner_context = {
        "inbound_bind_host": self._inbound_bind_host,
        "env_values": self._env_values,
        "egress": self._egress,
    }
    build_check_registry(registry, **runner_context)
|
|
596
|
+
|
|
597
|
+
async def _reconcile_outbounds(self, old: Registry, new: Registry) -> None:
    """Bring the outbound connectors/workers in line with ``new`` without stopping live workers.

    For every outbound in ``new`` the per-outbound policies are refreshed, then:

    * no worker, or its task is done → close any stale connector, build a fresh one and spawn
      a worker;
    * live worker whose spec is new or differs from ``old`` → build the new connector, swap it
      in place, then close the previous one (a worker re-resolves its connector per item, so a
      single racing send at most fails and retries — outbounds are idempotent);
    * live worker, unchanged spec → left as-is.

    An outbound dropped by ``new`` is not touched, so rows already queued to it still drain.
    Connector builds here are expected not to fail — :meth:`build_check` validated them before
    any quiesce.
    """
    for name, oc in new.outbound.items():
        # Workers read retry + ordering + internal-error policy live each item, so a reload
        # retunes (incl. re-arming a previously stopped connection) without a restart.
        self._retry[name] = oc.retry or self._delivery_defaults
        self._ordering[name] = oc.ordering or self._ordering_default
        self._internal_error[name] = oc.internal_error or self._internal_error_default
        self._buildup[name] = oc.buildup or self._buildup_default
        self._simulate[name] = self._resolve_simulate(name, oc)

        task = self._workers.get(name)
        if task is None or task.done():
            # Added (or replacing a crashed worker): drop any stale connector, build + spawn.
            stale = self._destinations.pop(name, None)
            if stale is not None:
                await stale.aclose()
            self._destinations[name] = build_destination(_dest_config(oc, self._env_values))
            self._spawn_worker(name)
            continue

        previous = old.outbound.get(name)
        if previous is None or previous.spec != oc.spec:
            # Live worker, connector type/settings changed → swap in place, then close the old.
            replaced = self._destinations.get(name)
            self._destinations[name] = build_destination(_dest_config(oc, self._env_values))
            if replaced is not None:
                await replaced.aclose()
        # Otherwise: unchanged & live → leave the worker/connector as-is.
    # Outbounds removed by ``new`` keep their worker so already-queued rows finish draining.
|
|
628
|
+
|
|
629
|
+
# --- atomic reload (quiesce-and-swap) ------------------------------------
|
|
630
|
+
|
|
631
|
+
async def reload(self, new_registry: Registry) -> None:
    """Atomically swap to ``new_registry`` on the running graph (whole-config swap).

    Quiesce-and-swap, in this order: (0) build-check every new connector — a bad spec raises
    here, before anything is touched, so the running graph is left intact; (1) stop accepting
    new inbound messages; (2) swap the registry + lookup executor and restart the inbound
    listeners from it; (3) reconcile the outbound connectors/workers *without* tearing them
    down, so in-flight outbox rows keep draining (at-least-once preserved). Restarting inbounds
    before reconciling outbounds means a slow/hung outbound never blocks the engine's intake.

    If step 2 or 3 fails, the previous registry, the previous lookup executor and the previous
    inbound listeners are restored before the error propagates. The previous lookup executor is
    therefore only closed once the swap has committed (both executors are briefly open).

    When the runner is not running, the registry is simply replaced.

    Raises:
        Exception: whatever ``build_check`` raised, or the step-2/3 failure after rollback.
    """
    async with self._reload_lock:
        self.build_check(new_registry)  # raises before any change on a bad connector
        if not self._running:
            self.registry = new_registry
            return

        old = self.registry
        old_inbound_names = list(self._sources)
        old_lookup_executor = self._lookup_executor

        # 1. Quiesce intake: stop every inbound source so no NEW messages are accepted. Rows
        #    already committed to the ingress stage are routed later by the router worker
        #    against whichever registry is live at that point.
        for name in old_inbound_names:
            # We hold _reload_lock — use the unsafe variant.
            await self._stop_inbound_unsafe(name)

        try:
            # 2. Swap the registry and restart inbound listeners from it (intake back up first).
            self.registry = new_registry
            # Build the live-lookup executor for the new graph. The old one is kept open until
            # the reload commits so a rollback can put it back.
            self._lookup_executor = self._build_lookup_executor()
            for ic in new_registry.inbound.values():
                await self._start_inbound_unsafe(ic.name)
            # 2b. Ensure the per-inbound workers run for every inbound in the new graph. A
            # removed inbound keeps its workers so residual ingress/routed rows still drain.
            for name in new_registry.inbound:
                self._ensure_inbound_workers(name)
            # 3. Reconcile outbound connectors/workers (intake already live).
            await self._reconcile_outbounds(old, new_registry)
        except Exception:
            # Roll back to the previous graph's intake so a failed reload leaves the engine
            # accepting exactly what it did before (the realistic failure is an inbound bind).
            log.exception("reload failed; rolling back inbound intake to the previous graph")
            self.registry = old
            # Restore the executor that matches the restored registry; discard the new one.
            failed_executor = self._lookup_executor
            if failed_executor is not old_lookup_executor:
                self._lookup_executor = old_lookup_executor
                if failed_executor is not None:
                    try:
                        await failed_executor.aclose()
                    except Exception:
                        log.exception("rollback: could not close the new lookup executor")
            for name in list(self._sources):
                await self._stop_inbound_unsafe(name)
            for name in old_inbound_names:
                try:
                    await self._start_inbound_unsafe(name)
                except Exception:
                    # Keep restoring the remaining inbounds; the original error still propagates.
                    log.exception("rollback: could not restart inbound %r", name)
            raise

        # Committed: the previous executor's pools are no longer needed.
        if old_lookup_executor is not None and old_lookup_executor is not self._lookup_executor:
            try:
                await old_lookup_executor.aclose()
            except Exception:
                # The new graph is already live; a failed close must not fail the reload.
                log.exception("reload: could not close the previous lookup executor")

        # Wake every stage (new connections / freshly enqueued rows may sit at any stage).
        self._ingress_work.set()
        self._routed_work.set()
        self._response_work.set()
        self._work.set()
        log.info(
            "wiring reloaded: %d inbound, %d outbound connection(s)",
            len(new_registry.inbound),
            len(new_registry.outbound),
        )
|
|
702
|
+
|
|
703
|
+
# --- inbound path --------------------------------------------------------
|
|
704
|
+
|
|
705
|
+
def _make_handler(self, ic: InboundConnection): # type: ignore[no-untyped-def]
|
|
706
|
+
# The listener only decodes/parses/validates and commits the raw message to the ingress stage
|
|
707
|
+
# before ACKing (ACK-on-receipt) — it no longer routes, so it needs no registry snapshot.
|
|
708
|
+
# Routing happens later in the router worker against the LIVE registry, so a message ingested
|
|
709
|
+
# before a reload is routed under the new graph (the staged model decouples intake from
|
|
710
|
+
# routing). The inbound name is fixed for this source.
|
|
711
|
+
async def on_message(raw: bytes) -> str | None:
|
|
712
|
+
return await self._handle_inbound(ic, raw)
|
|
713
|
+
|
|
714
|
+
return on_message
|
|
715
|
+
|
|
716
|
+
async def _handle_inbound(self, ic: InboundConnection, raw: bytes) -> str | None:
    """Receive one raw inbound payload: decode, (for HL7) parse/validate, persist, and ACK.

    Flow:
      1. Decode with the connection's ``encoding`` setting (default ``utf-8``). A decode
         failure is recorded as ERROR and, for an HL7 inbound that replies, answered with AR.
      2. A non-HL7 payload is committed verbatim to the ingress stage; nothing is returned.
      3. An HL7 payload is peeked; a peek failure is recorded as ERROR and answered with AR.
      4. With strict validation enabled, a failed validation is recorded as ERROR (scrubbed
         text) and answered with AE.
      5. Otherwise the message is committed to the ingress stage, the router is woken, and AA
         is returned.

    Returns:
        The ACK/NAK to send back, or ``None`` when ``ack_mode`` is NONE or the inbound is not
        HL7.
    """
    ack_mode = ic.ack_mode
    wants_reply = ack_mode is not AckMode.NONE
    source_type = ic.spec.type.value
    is_hl7 = ic.content_type is ContentType.HL7V2

    # Decode with the configured charset. A genuine decode failure means the bytes aren't valid
    # in the declared encoding — record ERROR (keeping the exact bytes via a lossless latin-1
    # view) and NAK, rather than silently substituting U+FFFD (review H-3). HL7 is also
    # line-ending-normalized by normalize(); a non-HL7 body (JSON/XML/text) is decoded verbatim
    # because \r-normalizing it would corrupt it (ADR 0004).
    encoding = ic.spec.settings.get("encoding", "utf-8")
    try:
        if is_hl7:
            text = normalize(raw, encoding=encoding, errors="strict")
        else:
            text = raw.decode(encoding)
    except UnicodeDecodeError as exc:
        await self.store.record_received(
            channel_id=ic.name,
            raw=raw.decode("latin-1"),  # lossless byte view — the declared encoding rejected it
            status=MessageStatus.ERROR,
            error=f"decode error ({encoding}): {safe_exc(exc)}",
            source_type=source_type,
            message_type=None if is_hl7 else ic.content_type.value,
        )
        if is_hl7 and wants_reply:
            return build_ack(raw, code="AR", text="decode error", ack_mode=ack_mode)
        return None

    if not is_hl7:
        # Payload-agnostic ingress (ADR 0004): skip the HL7 peek/validate and the HL7 ACK. The
        # decoded body is committed verbatim; the source connector owns its own receive-time
        # response (no MLLP ACK).
        await self.store.enqueue_ingress(
            channel_id=ic.name,
            raw=text,
            control_id=None,
            message_type=ic.content_type.value,
            source_type=source_type,
            summary=None,
        )
        self._ingress_work.set()
        return None

    try:
        peek = Peek.parse(text)
    except HL7PeekError as exc:
        await self.store.record_received(
            channel_id=ic.name,
            raw=text,
            status=MessageStatus.ERROR,
            error=f"parse error: {safe_exc(exc)}",
            source_type=source_type,
        )
        if not wants_reply:
            return None
        return build_ack(text, code="AR", text=str(exc), ack_mode=ack_mode)

    if ic.validation.strict:
        # hl7apy validation is CPU-bound (full structure/cardinality parse) — run it off the
        # event loop so a strict feed can't stall every other listener, worker, and API call
        # (review M-11).
        result = await asyncio.to_thread(
            validate, text, expected_version=ic.validation.hl7_version
        )
        if not result.ok:
            joined = "; ".join(result.errors)
            # Persist a PHI-scrubbed form: hl7apy error strings quote the offending field VALUE
            # (PHI), so the stored text goes through the scrub — it keeps the field NAME /
            # segment ID but cuts the value (review #120).
            persisted = f"strict-validation failed: {safe_text(joined)}"
            await self._record(ic, peek, text, MessageStatus.ERROR, error=persisted)
            if not wants_reply:
                return None
            # The AE ACK goes back to the partner that SENT this message and is transient
            # (never persisted), so it may carry the fuller, bounded validation text.
            return build_ack(peek, code="AE", text=joined[:200], ack_mode=ack_mode)

    # ACK-on-receipt (staged pipeline, ADR 0001 Step A): persist the raw message durably to the
    # ingress stage, then ACK. Routing/transform/delivery run AFTER the ACK, so a slow/hung
    # router or outbound never stalls intake. Decode, parse, and strict validation above stay
    # synchronous and still NAK, preserving the partner contract for a malformed message.
    await self.store.enqueue_ingress(
        channel_id=ic.name,
        raw=text,
        control_id=peek.control_id,
        message_type=peek.message_type,
        source_type=source_type,
        summary=summarize(peek) or None,
    )
    self._ingress_work.set()  # wake the router worker to route the freshly-committed message
    if not wants_reply:
        return None
    return build_ack(peek, code="AA", ack_mode=ack_mode)
|
|
816
|
+
|
|
817
|
+
async def _record(
    self,
    ic: InboundConnection,
    peek: Peek,
    raw: str,  # already the decoded, \r-normalized text (see _handle_inbound)
    status: MessageStatus,
    *,
    error: str | None = None,
) -> None:
    """Persist a received message's disposition via ``store.record_received``.

    The channel is the inbound's name, the source type comes from its connector spec, and the
    control id / message type / summary are taken from ``peek`` (an empty summary is stored as
    ``None``).
    """
    summary = summarize(peek) or None
    source_type = ic.spec.type.value
    await self.store.record_received(
        channel_id=ic.name,
        raw=raw,
        status=status,
        error=error,
        control_id=peek.control_id,
        message_type=peek.message_type,
        source_type=source_type,
        summary=summary,
    )
|
|
836
|
+
|
|
837
|
+
# --- delivery path -------------------------------------------------------
|
|
838
|
+
|
|
839
|
+
async def _delivery_worker(self, name: str) -> None:
    """Drain the outbox rows of outbound ``name`` until ``_stop`` is set.

    Each pass claims work (the single due head in FIFO mode, a batch of up to ``claim_limit``
    rows otherwise), waits on ``_work`` when nothing is claimable, and delivers each claimed
    row through the connector currently registered for ``name``. Per-row outcomes:

    * sent (or simulated) → ``complete_with_response`` when the connector returned a response,
      else ``mark_done``;
    * ``NegativeAckError`` → dead-letter when ``exc.permanent``, else ``mark_failed``;
    * ``DeliveryError`` → ``mark_failed``;
    * any other exception → per the internal-error policy: STOP reschedules the row, raises a
      ``connection_stopped`` alert and returns (ending this worker); otherwise the row is
      dead-lettered and the loop advances.

    An exception escaping the per-row handling (e.g. a failing store call) is logged and
    followed by a backoff; cancellation propagates.
    """
    # NOTE(review): nesting reconstructed from a listing that lost its indentation — confirm
    # against the real source, in particular the NegativeAckError branch below.
    while not self._stop.is_set():
        try:
            # FIFO (default): claim only the due head — a backing-off head blocks the lane
            # (head-of-line), so order is preserved. UNORDERED: claim a batch and rotate past a
            # backing-off row to drain others. Resolved live so a reload can retune it.
            if self._ordering.get(name, self._ordering_default) is OrderingMode.FIFO:
                # lane_owner() gates the claim to a single owner per lane (Track B Step 5) so strict
                # FIFO holds ACROSS nodes; it's None single-node (byte-identical no-owner claim).
                head = await self.store.claim_next_fifo(
                    name, owner=self._coordinator.lane_owner()
                )
                items = [head] if head is not None else []
            else:
                # UNORDERED lanes are intentionally NOT lane-owned — concurrent draining across
                # nodes is fine, so claim_ready stays unchanged.
                items = await self.store.claim_ready(
                    limit=self.claim_limit, destination_name=name
                )
            if not items:
                await self._wait_for_work(self._work)
                continue
            for item in items:
                # Connector + retry re-resolved per item so a reload can swap an outbound's
                # settings under us with at most one racing send (which fails + retries —
                # outbounds are idempotent).
                retry = self._retry.get(name) or RetryPolicy()
                connector = self._destinations.get(name)
                if connector is None:
                    # No connector for a claimed row (extremely unlikely mid-reconcile).
                    # Reschedule it rather than strand the claimed row, then move on.
                    await self.store.mark_failed(item.id, "outbound reloading", retry)
                    continue
                try:
                    if self._simulate.get(name, False):
                        # Shadow / parallel-run (#15): suppress the real egress entirely — no bytes/
                        # SQL leave the box. With egress suppressed there is no real partner reply to
                        # capture or re-ingress, so treat it as a completed ONE-WAY delivery: response
                        # = None → mark_done → the message finalizes PROCESSED, and the would-send
                        # outbound payload is retained on the done row for parity comparison. (A
                        # capturing/reingress_to outbound therefore captures nothing in simulate.)
                        response = None
                    else:
                        response = await connector.send(item.payload)
                except NegativeAckError as exc:
                    # Partner rejection. AR/CR (permanent) → fail-fast: the partner will never
                    # accept this message, so dead-letter it now rather than block the FIFO lane
                    # forever (still replayable from the DLQ). AE/CE (transient) → retry per
                    # policy, like a transport failure.
                    if exc.permanent:
                        await self.store.dead_letter_now(item.id, safe_exc(exc))
                    else:
                        await self.store.mark_failed(item.id, safe_exc(exc), retry)
                        # NOTE(review): placed with the transient retry (mirroring the
                        # DeliveryError branch); verify it is not meant to run for permanent
                        # NAKs as well.
                        await self._maybe_alert_buildup(name)
                except DeliveryError as exc:
                    # Transport failure (connect/IO/timeout/unparseable ACK) — transient; retry
                    # per policy (retry-forever by default, so nothing is silently lost).
                    await self.store.mark_failed(item.id, safe_exc(exc), retry)
                    await self._maybe_alert_buildup(name)
                except Exception as exc:
                    # Internal/code error (our bug, not the partner). The per-connection policy
                    # decides: STOP halts the lane (preserve the message, alert an operator) while
                    # CONTINUE (default) dead-letters this row and advances so a code bug can't
                    # wedge the lane forever. Log the exception TYPE only — the full detail goes to
                    # the secured store's last_error, never the general log (PHI).
                    if (
                        self._internal_error.get(name, self._internal_error_default)
                        is InternalErrorPolicy.STOP
                    ):
                        log.error(
                            "delivery worker %r: internal error delivering %s (%s); STOPPING "
                            "connection (operator must fix + reload/restart to resume)",
                            name,
                            item.id,
                            type(exc).__name__,
                        )
                        # Preserve the message for replay (reschedule, don't dead-letter) and halt
                        # this worker. A normal return is not respawned (_on_worker_done); a later
                        # reload re-spawns the worker, re-arming the lane.
                        # NOTE(review): any later rows of an UNORDERED batch stay claimed here —
                        # presumably the store reclaims them; confirm.
                        await self.store.mark_failed(
                            item.id,
                            f"internal error (connection stopped): {safe_exc(exc)}",
                            retry,
                        )
                        self._alert_sink.connection_stopped(
                            name, detail=f"{type(exc).__name__} delivering {item.id}"
                        )
                        return
                    log.warning(
                        "delivery worker %r: internal error delivering %s (%s); dead-lettering",
                        name,
                        item.id,
                        type(exc).__name__,
                    )
                    await self.store.dead_letter_now(
                        item.id, f"internal error: {safe_exc(exc)}"
                    )
                else:
                    # ADR 0013: a capturing outbound returns a DeliveryResponse; persist the reply
                    # AND mark the row done in ONE transaction (exactly-once capture). A non-capturing
                    # outbound returns None → plain mark_done, byte-identical. The XOR (never both)
                    # is the single-writer discipline that yields exactly one captured reply per row.
                    if response is not None:
                        # ADR 0013 Increment 2: if this outbound declares reingress_to, the same
                        # capture transaction also produces a Stage.RESPONSE work-row; wake the
                        # re-ingress worker. Read live from the registry (a reload swaps it).
                        oc = self.registry.outbound.get(name)
                        reingress_to = (
                            oc.spec.settings.get("reingress_to") if oc is not None else None
                        )
                        await self.store.complete_with_response(
                            item.id,
                            body=response.body,
                            outcome=response.outcome,
                            detail=response.detail,
                            reingress_to=reingress_to,
                        )
                        if reingress_to is not None:
                            self._response_work.set()  # wake the re-ingress worker for the new token
                    else:
                        await self.store.mark_done(item.id)
        except asyncio.CancelledError:
            # Cancellation (teardown) must propagate, never be absorbed by the handler below.
            raise
        except Exception:
            # A store error in the loop itself (claim_ready / mark_* failing — DB locked, disk
            # full) must never kill the worker: that would silently stop THIS destination from
            # draining while inbound keeps ACKing (review H-1). Log, back off, and keep going.
            log.exception(
                "delivery worker %r: unexpected error; backing off and retrying", name
            )
            # A truthy result from _stop_or_sleep ends the worker instead of retrying.
            if await self._stop_or_sleep(_WORKER_ERROR_BACKOFF_SECONDS):
                return
|
|
971
|
+
|
|
972
|
+
async def _router_worker(self, name: str) -> None:
    """Drain the **ingress** stage for one inbound — the router half of the split pipeline (ADR
    0001 Step B).

    Strict FIFO per inbound (preserving arrival order into routing): claim the oldest ingress row,
    run its Router (``route_only``), and hand the selected handlers to the **routed** stage
    (``route_handoff``) — one routed row per handler. It runs no transform. A Router failure no
    longer NAKs the sender (already ACKed at ingress) — under the global ``internal_error`` policy
    it dead-letters the ingress row (``CONTINUE`` → message ``ERROR``, advance) or halts this lane
    preserving the row (``STOP`` → ``connection_stopped`` alert, return). Shares the delivery
    worker's wait/backoff supervision.

    The ``connection_stopped`` alert is guarded: a sink that raises is logged and the lane still
    halts, instead of the exception reaching the outer handler and turning STOP into a retry loop.

    Args:
        name: Registry name of the inbound whose ingress lane this worker drains.

    Returns:
        None. Exits when a stop is requested, when the inbound is missing from the registry, or
        when the STOP policy halts the lane after a Router error.
    """
    last_buildup_check = 0.0
    while not self._stop.is_set():
        try:
            # FIFO per inbound: claim only the due head (ingress rows never back off, so this is
            # effectively the oldest pending row for this inbound). lane_owner() gates the claim to a
            # single owner per lane (Track B Step 5) so strict FIFO holds across nodes; None
            # single-node (byte-identical).
            item = await self.store.claim_next_fifo(
                name, stage=Stage.INGRESS.value, owner=self._coordinator.lane_owner()
            )
            if item is None:
                await self._wait_for_work(self._ingress_work)
                continue
            ic = self.registry.inbound.get(name)
            if ic is None:
                # The inbound was removed from the registry but residual ingress rows remain.
                # Revert this just-claimed row to pending and EXIT the worker — there is nothing to
                # route it with until a reload restores the inbound (which re-arms this worker and
                # drains the backlog). Reschedule with a retry-FOREVER policy (NOT the outbound
                # delivery defaults, whose finite max_attempts would dead-letter an ACKed-but-
                # never-attempted message purely for being removed) so the message is never dropped.
                await self.store.mark_failed(item.id, "inbound not in registry", RetryPolicy())
                return
            try:
                # Publish the live graph's run-scoped views (code sets / reference snapshots /
                # active environment) so a call-time code_set(...)/reference(...)/current_environment()
                # inside the Router resolves (the loader only had them active during import). Views
                # are read from self.registry/self.store live, so a reload's swapped tables apply to
                # the next routed row; run_contexts restores cleanly after each run (no leak). The
                # set of providers is the run_context registry (router phase) — features add one
                # provider there, never edit this call site.
                with run_contexts(
                    RunContext(
                        code_sets=self.registry.code_sets,
                        reference_view=self.store.reference_view(),
                        active_environment=self._active_environment,
                        ingest_time=item.created_at,
                    ),
                    phase="router",
                ):
                    names = route_only(self.registry, ic, item.payload)
            except Exception as exc:
                # Router code error (incl. an unknown handler name). Post-ACK, so no NAK — the
                # global internal_error policy decides. Log the exception TYPE only; full detail
                # goes to the secured store's last_error, never the general log (PHI).
                if self._internal_error_default is InternalErrorPolicy.STOP:
                    log.error(
                        "router worker %r: router error on %s (%s); STOPPING ingest processing "
                        "(operator must fix + reload to resume)",
                        name,
                        item.id,
                        type(exc).__name__,
                    )
                    await self.store.mark_failed(
                        item.id,
                        f"router error (ingest stopped): {safe_exc(exc)}",
                        self._delivery_defaults,
                    )
                    # A sink must never raise (contract), but guard anyway: an escaping exception
                    # would be caught by the outer handler below, which backs off and keeps
                    # looping — the lane would NOT stop as the STOP policy requires.
                    try:
                        self._alert_sink.connection_stopped(
                            name, detail=f"router {type(exc).__name__} on {item.id}"
                        )
                    except Exception:
                        log.exception("alert sink raised on connection_stopped for %r", name)
                    return
                log.warning(
                    "router worker %r: router error on %s (%s); dead-lettering",
                    name,
                    item.id,
                    type(exc).__name__,
                )
                await self.store.dead_letter_now(item.id, f"router error: {safe_exc(exc)}")
                continue
            disposition = MessageStatus.ROUTED if names else MessageStatus.UNROUTED
            await self.store.route_handoff(
                ingress_id=item.id,
                message_id=item.message_id,
                channel_id=name,
                handlers=[(h, item.payload) for h in names],
                disposition=disposition,
            )
            if names:
                self._routed_work.set()  # wake the transform worker for the new routed rows
            # Off the hot path (rate-limited): alert if this inbound's ingress backlog is building
            # (a slow/hung router). Uses the global buildup threshold (no per-inbound override yet).
            now = time.time()
            if now - last_buildup_check >= _BUILDUP_CHECK_INTERVAL:
                last_buildup_check = now
                await self._maybe_alert_buildup(
                    name, stage=Stage.INGRESS.value, threshold=self._buildup_default
                )
        except asyncio.CancelledError:
            raise
        except Exception:
            # A store error in the loop itself (claim/handoff failing — DB locked, disk full) must
            # never kill the worker: that would stall routing while the listener keeps ACKing. Log,
            # back off, and keep going (mirrors the delivery worker).
            log.exception("router worker %r: unexpected error; backing off and retrying", name)
            if await self._stop_or_sleep(_WORKER_ERROR_BACKOFF_SECONDS):
                return
|
|
1081
|
+
|
|
1082
|
+
async def _response_worker(self, name: str) -> None:
    """Re-ingress captured replies for one LOOPBACK inbound (ADR 0013 Increment 2).

    Drains the **response** stage in strict FIFO order for the lane ``name``: claim the oldest
    ``Stage.RESPONSE`` token, peek the reply body for the loopback's ``content_type``, then hand
    it off **atomically** through :meth:`~messagefoundry.store.base.QueueStore.ingress_handoff`
    (which produces the re-ingressed message + ingress row, depth-caps it, or errors a
    non-peekable body). Claim, missing-inbound exit and error backoff follow
    :meth:`_router_worker`. Re-ingress is a single-owner internal stage: the per-lane claim owner
    is the only leader gate (``LoopbackSource`` is inert, so there is no source-level gate).
    """
    while not self._stop.is_set():
        try:
            token = await self.store.claim_next_fifo(
                name, stage=Stage.RESPONSE.value, owner=self._coordinator.lane_owner()
            )
            if token is None:
                # Nothing due on this lane: park until woken or the poll interval elapses.
                await self._wait_for_work(self._response_work)
                continue

            loopback = self.registry.inbound.get(name)
            if loopback is None:
                # A reload removed the loopback while tokens were still queued. Put the claim
                # back (retry-FOREVER, never dropped) and EXIT; restoring the loopback via a
                # reload re-arms this worker and drains the backlog — same exit as the router
                # worker's missing-inbound case.
                await self.store.mark_failed(token.id, "inbound not in registry", RetryPolicy())
                return

            # The peek happens here (pipeline/, not the store) and the hand-off is one atomic
            # transaction. response_body_for_work_row reads the same immutable artifact that
            # ingress_handoff re-reads for the message raw, so peek and raw always agree.
            reply_body = await self.store.response_body_for_work_row(token.id)
            peeked = _peek_for_loopback(loopback, reply_body or "")
            control_id, message_type, summary, peek_failed = peeked
            produced = await self.store.ingress_handoff(
                response_row_id=token.id,
                loopback_channel_id=name,
                correlation_depth_cap=self._max_correlation_depth,
                control_id=control_id,
                message_type=message_type,
                summary=summary,
                peek_failed=peek_failed,
            )
            if produced:
                # Wake the loopback's router worker to route the freshly-ingressed answer (a no-op
                # wake for a depth-capped / peek-failed token that produced no ingress row).
                self._ingress_work.set()
        except asyncio.CancelledError:
            raise
        except Exception:
            # A store failure inside the loop (claim/handoff) must not kill the worker — log,
            # back off, keep going, as the router/delivery workers do.
            log.exception(
                "response worker %r: unexpected error; backing off and retrying", name
            )
            stop_requested = await self._stop_or_sleep(_WORKER_ERROR_BACKOFF_SECONDS)
            if stop_requested:
                return
|
|
1134
|
+
|
|
1135
|
+
async def _transform_worker(self, name: str) -> None:
    """Drain the **routed** stage for one inbound — the transform half of the split pipeline (ADR
    0001 Step B).

    Strict FIFO per inbound (preserving order into transform): claim the oldest routed row, run its
    **single** handler's transform (``transform_one``), and hand the resulting deliveries to the
    **outbound** stage (``transform_handoff``). A slow/failing transform here can no longer block
    routing — the router worker keeps producing routed rows independently. A transform failure is
    post-ACK (no NAK): under the global ``internal_error`` policy it dead-letters the routed row
    (``CONTINUE`` → message ``ERROR``, advance) or halts this lane (``STOP`` → ``connection_stopped``
    alert, return). A handler removed since routing (a racing reload) is dead-lettered too —
    recoverable via per-message replay once restored, matching the missing-outbound path.

    The ``connection_stopped`` alert is guarded: a sink that raises is logged and the lane still
    halts, instead of the exception reaching the outer handler and turning STOP into a retry loop.

    Args:
        name: Registry name of the inbound whose routed lane this worker drains.

    Returns:
        None. Exits when a stop is requested, when the inbound is missing from the registry, or
        when the STOP policy halts the lane after a handler error.
    """
    last_buildup_check = 0.0
    while not self._stop.is_set():
        try:
            # lane_owner() gates the claim to a single owner per lane (Track B Step 5) so strict
            # FIFO holds across nodes; None single-node (byte-identical no-owner claim).
            item = await self.store.claim_next_fifo(
                name, stage=Stage.ROUTED.value, owner=self._coordinator.lane_owner()
            )
            if item is None:
                await self._wait_for_work(self._routed_work)
                continue
            ic = self.registry.inbound.get(name)
            if ic is None:
                # Inbound removed; nothing to transform with until a reload restores it (which
                # re-arms this worker). Revert the row (retry-forever) and exit (mirrors the router
                # worker), so the ACKed-but-unprocessed message is never dropped.
                await self.store.mark_failed(item.id, "inbound not in registry", RetryPolicy())
                return
            hname = item.handler_name
            if hname is None or hname not in self.registry.handlers:
                # Handler gone (removed/renamed since routing). Can't transform this row; dead-letter
                # it (message ERROR, replayable once restored) — the per-row analogue of the startup
                # dead_letter_missing_handlers sweep. Dead-lettering (vs reverting) avoids a hot-loop
                # on a permanently-missing handler and gives the operator visibility.
                log.warning(
                    "transform worker %r: handler %r for %s is missing; dead-lettering",
                    name,
                    hname,
                    item.id,
                )
                await self.store.dead_letter_now(
                    item.id, f"handler {hname!r} removed from registry"
                )
                continue
            # ADR 0013 Increment 2: for a RE-INGRESSED message (only ever on a loopback inbound),
            # feed the run-context `response` provider the ORIGIN request's captured replies so its
            # Handler can read them via response_get(dest). A normal message → None (byte-identical,
            # and the metadata read is skipped entirely for non-loopback inbounds).
            response_view: dict[str, Any] | None = None
            if ic.spec.type is ConnectorType.LOOPBACK:
                msg = await self.store.get_message(item.message_id)
                raw_meta = msg.get("metadata") if msg else None
                meta = json.loads(raw_meta) if raw_meta else {}
                corr = meta.get("correlation_id") if isinstance(meta, dict) else None
                if corr:
                    # {destination_name: latest CapturedResponse}: correlate_response orders by
                    # (dest, response_seq), so the last per destination wins (the authoritative
                    # reply). Immutable committed rows → re-run-stable (ADR 0009).
                    response_view = {
                        c.destination_name: c for c in await self.store.correlate_response(corr)
                    }
            try:
                # Same as the router worker, plus the transform-only providers: publish the run-scoped
                # views so call-time code_set(...)/reference(...)/state_get(...)/current_environment()
                # inside the Handler resolve; restored cleanly after the run. The transform phase adds
                # the store's transform-state read-through cache view (ADR 0005) so state_get(...)
                # resolves against committed writes. Providers come from the run_context registry
                # (transform phase) — features add one provider, never edit this call site.
                with run_contexts(
                    RunContext(
                        code_sets=self.registry.code_sets,
                        reference_view=self.store.reference_view(),
                        state_view=self.store.state_view(),
                        response_view=response_view,
                        active_environment=self._active_environment,
                        ingest_time=item.created_at,
                    ),
                    phase="transform",
                ):
                    if self._lookup_executor is not None:
                        # The graph declares ≥1 DatabaseLookup, so a Handler may call db_lookup() — a
                        # LIVE, synchronous DB read (ADR 0010). A handler is synchronous and must not
                        # block the event loop, so run the transform OFF the loop in a worker thread.
                        # asyncio.to_thread copies THIS context into the thread — the run_contexts
                        # views AND the active lookup runner — so db_lookup()/code_set()/reference()/
                        # state_get()/current_environment() all resolve there, while the loop stays
                        # free to service the lookup's async query and every other connection. The
                        # runner bridges back onto the loop (run_coroutine_threadsafe). db_lookup is
                        # the deliberate re-run-stability exception (ADR 0009) and raises in dry-run.
                        with db_lookup_activated(self._run_lookup):
                            deliveries_preview, state_preview = await asyncio.to_thread(
                                transform_one,
                                self.registry,
                                hname,
                                item.payload,
                                self.registry.inbound[name].content_type.value,
                            )
                    else:
                        # No DatabaseLookup declared → byte-identical to before: run inline on the loop.
                        deliveries_preview, state_preview = transform_one(
                            self.registry,
                            hname,
                            item.payload,
                            self.registry.inbound[name].content_type.value,
                        )
            except Exception as exc:
                # Handler/transform code error (incl. an unknown outbound name). Post-ACK, so no
                # NAK — the global internal_error policy decides. Log the exception TYPE only (PHI).
                if self._internal_error_default is InternalErrorPolicy.STOP:
                    log.error(
                        "transform worker %r: handler error on %s (%s); STOPPING transform "
                        "processing (operator must fix + reload to resume)",
                        name,
                        item.id,
                        type(exc).__name__,
                    )
                    await self.store.mark_failed(
                        item.id,
                        f"handler error (transform stopped): {safe_exc(exc)}",
                        self._delivery_defaults,
                    )
                    # A sink must never raise (contract), but guard anyway: an escaping exception
                    # would be caught by the outer handler below, which backs off and keeps
                    # looping — the lane would NOT stop as the STOP policy requires.
                    try:
                        self._alert_sink.connection_stopped(
                            name, detail=f"handler {type(exc).__name__} on {item.id}"
                        )
                    except Exception:
                        log.exception("alert sink raised on connection_stopped for %r", name)
                    return
                log.warning(
                    "transform worker %r: handler error on %s (%s); dead-lettering",
                    name,
                    item.id,
                    type(exc).__name__,
                )
                await self.store.dead_letter_now(item.id, f"handler error: {safe_exc(exc)}")
                continue
            deliveries = [(d.to, d.payload) for d in deliveries_preview]
            state_ops = [(s.namespace, s.key, s.value) for s in state_preview]
            await self.store.transform_handoff(
                routed_id=item.id,
                message_id=item.message_id,
                channel_id=name,
                deliveries=deliveries,
                state_ops=state_ops,
            )
            if deliveries:
                self._work.set()  # wake the outbound delivery workers for the freshly-queued rows
            # Off the hot path (rate-limited): alert if this inbound's routed (transform) backlog is
            # building behind a slow/hung handler — reported separately from the ingress lane.
            now = time.time()
            if now - last_buildup_check >= _BUILDUP_CHECK_INTERVAL:
                last_buildup_check = now
                await self._maybe_alert_buildup(
                    name, stage=Stage.ROUTED.value, threshold=self._buildup_default
                )
        except asyncio.CancelledError:
            raise
        except Exception:
            # A store error in the loop itself must never kill the worker (mirrors the others).
            log.exception(
                "transform worker %r: unexpected error; backing off and retrying", name
            )
            if await self._stop_or_sleep(_WORKER_ERROR_BACKOFF_SECONDS):
                return
|
|
1299
|
+
|
|
1300
|
+
async def _maybe_alert_buildup(
    self,
    name: str,
    *,
    stage: str = Stage.OUTBOUND.value,
    threshold: BuildupThreshold | None = None,
) -> None:
    """Emit a ``queue_buildup`` alert when lane ``name`` has crossed its depth or age limit.

    Serves every lane that can back up: an outbound lane that is not draining (``threshold``
    omitted, so the connection's own entry in ``self._buildup`` applies, else the default) and
    the ingress/routed lanes, whose callers pass ``stage`` plus the global threshold. The
    depth/age lookup is a single ``pending_depth`` call and callers rate-pace it. Re-alerts are
    throttled per (stage, connection) by ``_BUILDUP_REALERT_SECONDS`` so an ongoing stall keeps
    reminding the operator without spamming. A sink must never raise (contract); the call is
    guarded anyway so an alerting bug cannot kill the worker.

    Args:
        name: Connection (lane) name to inspect.
        stage: Stage whose pending rows are counted; defaults to the outbound stage.
        threshold: Explicit limits; when falsy, the per-connection or default limits are used.
    """
    limits = threshold or self._buildup.get(name) or self._buildup_default
    depth_limit = limits.max_depth
    age_limit = limits.max_oldest_seconds
    if depth_limit is None and age_limit is None:
        return  # buildup alerting disabled for this lane

    throttle_key = f"{stage}:{name}"
    now = time.time()
    if now < self._next_buildup_alert.get(throttle_key, 0.0):
        return  # re-alert throttled

    depth, oldest_created = await self.store.pending_depth(name, stage=stage)
    if depth == 0:
        return

    oldest_age = None if oldest_created is None else now - oldest_created
    too_deep = depth_limit is not None and depth >= depth_limit
    too_old = age_limit is not None and oldest_age is not None and oldest_age >= age_limit
    if not (too_deep or too_old):
        return

    # Arm the throttle before calling the sink so a raising sink is still rate-limited.
    self._next_buildup_alert[throttle_key] = now + _BUILDUP_REALERT_SECONDS
    try:
        self._alert_sink.queue_buildup(name, depth=depth, oldest_age_seconds=oldest_age or 0.0)
    except Exception:
        log.exception("alert sink raised on queue_buildup for %r", name)
|
|
1338
|
+
|
|
1339
|
+
async def _wait_for_work(self, event: asyncio.Event) -> None:
    """Block until ``event`` is set or ``poll_interval`` elapses, then reset the event.

    ``event`` is the wake signal of the calling worker class. Because each class has its own
    event, a worker only ever clears its own signal and cannot swallow another class's wakeup;
    the ``poll_interval`` timeout remains the backstop for any missed ``set()``.

    Args:
        event: The wake event to wait on; it is always cleared before returning.
    """
    try:
        wake = event.wait()
        await asyncio.wait_for(wake, self.poll_interval)
    except asyncio.TimeoutError:
        # Timed out with no wake: fall through to the poll-driven re-check.
        return
    finally:
        # Runs on wake, timeout and cancellation alike.
        event.clear()
|
|
1349
|
+
|
|
1350
|
+
async def _stop_or_sleep(self, delay: float) -> bool:
    """Wait on the stop event for at most ``delay`` seconds.

    Lets a backing-off worker exit promptly on shutdown instead of sleeping out the full delay.

    Args:
        delay: Maximum number of seconds to wait.

    Returns:
        True if a stop was requested before the delay ran out, otherwise False.
    """
    try:
        await asyncio.wait_for(self._stop.wait(), delay)
    except asyncio.TimeoutError:
        return False
    else:
        return True
|
|
1358
|
+
|
|
1359
|
+
|
|
1360
|
+
def _source_config(ic: InboundConnection, bind_host: str, env_values: Mapping[str, Any]) -> Source:
    """Build the ``Source`` config for inbound ``ic`` from its env-resolved settings.

    ``env()`` references are resolved first, so a missing value raises ``WiringError`` here,
    before any bind. MLLP/TCP/X12 listeners never carry an author-supplied host (wiring rejects
    one): they bind to the per-connection ``bind_address`` if set, else the service-level
    ``[inbound].bind_host``. File and other inbounds have no host and ignore this. A peer-IP
    allowlist rides into the connector's settings so the listener can reject a non-allowlisted
    peer at accept time. (``bind_address`` and the allowlist are MLLP/TCP-only at wiring, so for
    X12 both fields are None here = unchanged behaviour.)
    """
    resolved = resolve_env_settings(ic.spec.settings, env_values)
    connector_type = ic.spec.type
    listener_types = (ConnectorType.MLLP, ConnectorType.TCP, ConnectorType.X12)
    if connector_type in listener_types:
        resolved["host"] = ic.bind_address or bind_host
        peers = ic.source_ip_allowlist
        if peers:
            resolved["source_ip_allowlist"] = list(peers)
    return Source(type=connector_type, settings=resolved, ack_mode=ic.ack_mode)
|
|
1373
|
+
|
|
1374
|
+
|
|
1375
|
+
def _dest_config(oc: OutboundConnection, env_values: Mapping[str, Any]) -> Destination:
    """Build the ``Destination`` config for outbound ``oc`` from its env-resolved settings.

    ``env()`` is resolved first so any signing key/password ref is materialized here; the typed
    signing config (ASVS 4.1.5, ADR 0018) is then assembled from the resolved ``sign_*``
    settings. None = signing off (every existing outbound unchanged). The connector loads the
    key and mints the signature; this is the single choke point feeding start/check/dry-run, so
    a bad key fails loud at all three. An outbound with no retry policy gets a default
    ``RetryPolicy()``.
    """
    resolved = resolve_env_settings(oc.spec.settings, env_values)
    retry_policy = oc.retry or RetryPolicy()
    signing = OutboundSigning.from_settings(resolved)
    return Destination(
        name=oc.name,
        type=oc.spec.type,
        settings=resolved,
        retry=retry_policy,
        sign=signing,
    )
|
|
1388
|
+
|
|
1389
|
+
|
|
1390
|
+
def build_check_registry(
    registry: Registry,
    *,
    inbound_bind_host: str,
    env_values: Mapping[str, Any],
    egress: EgressSettings,
) -> None:
    """Validate every connector in ``registry`` by constructing and discarding it.

    Also runs the fail-closed connect/egress allowlists, so a bad connector spec or a
    non-allowlisted host surfaces as a :class:`WiringError` BEFORE anything is applied. This is
    the standalone core of :meth:`RegistryRunner.build_check`, callable offline — e.g. the
    ``connection`` CLI validating an edit before it persists (ADR 0007). Nothing live is built
    (no socket bind / file I/O — binding happens later in ``start_inbound``).

    Raises:
        WiringError: For any validation failure; other exceptions raised while building are
            wrapped in a ``WiringError`` with the original as its cause.
    """
    try:
        # Inbound connectors: resolve, connect-gate, then build and discard.
        for inbound in registry.inbound.values():
            src = _source_config(inbound, inbound_bind_host, env_values)
            check_source_allowed(src, inbound.name, egress)
            build_source(src)

        # Outbound connectors, collecting every loopback named by a reingress_to.
        fed_loopbacks: set[str] = set()
        for outbound in registry.outbound.values():
            destination = _dest_config(outbound, env_values)
            check_egress_allowed(destination, egress)  # fail-closed egress allowlist (WP-11c)
            build_destination(destination)
            # ADR 0013 Increment 2: reingress_to must name an existing Loopback() inbound. This is
            # a CROSS-registry fact (build_outbound_connection is registry-blind), enforced here so
            # it fails at `check`/dry-run with no store, like every other connector validation.
            target = outbound.spec.settings.get("reingress_to")
            if target is None:
                continue
            target_name = str(target)
            target_inbound = registry.inbound.get(target_name)
            if target_inbound is None or target_inbound.spec.type is not ConnectorType.LOOPBACK:
                raise WiringError(
                    f"outbound connection {outbound.name!r}: reingress_to names unknown/non-loopback "
                    f"inbound {target!r} — declare it as inbound(..., Loopback(), ...) (ADR 0013)."
                )
            fed_loopbacks.add(target_name)

        # A loopback inbound that no capturing outbound points at is legal but inert (never fed);
        # surface it (it may be a staging artifact) without failing the check.
        for inbound_name, inbound in registry.inbound.items():
            if inbound.spec.type is ConnectorType.LOOPBACK and inbound_name not in fed_loopbacks:
                log.warning(
                    "loopback inbound %r has no reingress_to source; it will never receive a message",
                    inbound_name,
                )

        # Database lookups: resolve and connect-gate each one, then validate them together.
        lookup_settings: dict[str, dict[str, Any]] = {}
        for lookup_name, lookup_spec in registry.lookups.items():
            resolved = resolve_env_settings(lookup_spec.settings, env_values)
            check_lookup_allowed(lookup_name, resolved, egress)  # fail-closed connect allowlist
            lookup_settings[lookup_name] = resolved
        if lookup_settings:
            # Construct (and discard) the executor: validates each DSN (TLS/auth) without opening
            # a pool.
            DatabaseLookupExecutor(lookup_settings)
    except WiringError:
        raise
    except Exception as exc:
        raise WiringError(f"connector build failed: {exc}") from exc
|
|
1444
|
+
|
|
1445
|
+
|
|
1446
|
+
def _allowlist_for(conn_type: ConnectorType, egress: EgressSettings) -> list[str]:
    """Return the ``[egress]`` allowlist that governs connector type ``conn_type``.

    X12 shares TCP's list and REST/SOAP share the HTTP list. A type with no egress list yields
    ``[]`` — which under ``deny_by_default`` means 'nothing is configured to permit it', so the
    destination is refused.
    """
    if conn_type is ConnectorType.MLLP:
        allowed = egress.allowed_mllp
    elif conn_type in (ConnectorType.TCP, ConnectorType.X12):
        allowed = egress.allowed_tcp
    elif conn_type is ConnectorType.FILE:
        allowed = egress.allowed_file_dirs
    elif conn_type in (ConnectorType.REST, ConnectorType.SOAP):
        allowed = egress.allowed_http
    elif conn_type is ConnectorType.DATABASE:
        allowed = egress.allowed_db
    elif conn_type is ConnectorType.REMOTEFILE:
        allowed = egress.allowed_remote
    else:
        allowed = []
    return allowed
|
|
1463
|
+
|
|
1464
|
+
|
|
1465
|
+
def check_source_allowed(source: Source, name: str, egress: EgressSettings) -> None:
    """Fail-closed connect allowlist for an inbound connector that **dials out** to receive.

    Covers the DATABASE source (checked against ``[egress].allowed_db`` using its ``server`` and
    ``port`` settings, port defaulting to 1433) and the REMOTEFILE source (checked against
    ``[egress].allowed_remote`` using ``host`` and ``port``). Although such a source pulls data
    *in* rather than exfiltrating it, it still opens an outbound connection to an operator-named
    host, so the allowlist guards against pointing the engine at an arbitrary server. Opt-in (an
    empty list = unrestricted), matching destinations; checked at load/reload/start.

    A TCP/MLLP/File *source* is a local **listener** (it binds ``[inbound].bind_host`` and waits
    for peers, never dialing out), so there is nothing to connect-gate here —
    ``[egress].allowed_tcp`` governs only the TCP *destination* (see
    :func:`check_egress_allowed`).

    Under ``[egress].deny_by_default`` a DATABASE/REMOTEFILE source whose allowlist is empty is
    refused outright; a listener source is unaffected.

    Raises:
        WiringError: If the source is refused by deny-by-default or its host is not allowlisted.
    """
    kind = source.type

    if kind is ConnectorType.DATABASE:
        allowed = egress.allowed_db
        if egress.deny_by_default and not allowed:
            raise WiringError(
                f"inbound {name!r}: [egress].deny_by_default is set and [egress].allowed_db is empty "
                "— list the DATABASE server to permit it"
            )
        if not allowed:
            return  # opt-in allowlist not configured: unrestricted
        host = str(source.settings.get("server", ""))
        port = source.settings.get("port", 1433)
        if _mllp_egress_allowed(host, port, allowed):  # same host[:port] matching
            return
        log.warning(
            "connect denied: inbound %r DATABASE server %r not in [egress].allowed_db",
            name,
            host,
        )
        raise WiringError(
            f"inbound {name!r}: DATABASE server {host!r} is not in the "
            "[egress].allowed_db allowlist"
        )

    if kind is ConnectorType.REMOTEFILE:
        allowed = egress.allowed_remote
        if egress.deny_by_default and not allowed:
            raise WiringError(
                f"inbound {name!r}: [egress].deny_by_default is set and [egress].allowed_remote is "
                "empty — list the REMOTEFILE host to permit it"
            )
        if not allowed:
            return  # opt-in allowlist not configured: unrestricted
        host = str(source.settings.get("host", ""))
        port = source.settings.get("port")
        if _mllp_egress_allowed(host, port, allowed):  # same host[:port] matching
            return
        log.warning(
            "connect denied: inbound %r REMOTEFILE host %r not in [egress].allowed_remote",
            name,
            host,
        )
        raise WiringError(
            f"inbound {name!r}: REMOTEFILE host {host!r} is not in the "
            "[egress].allowed_remote allowlist"
        )
|
|
1515
|
+
|
|
1516
|
+
|
|
1517
|
+
def check_lookup_allowed(name: str, settings: Mapping[str, Any], egress: EgressSettings) -> None:
    """Connect-allowlist gate for a ``DatabaseLookup`` named ``name``.

    ``settings`` supplies the lookup's ``server`` and ``port`` (``port`` defaults to 1433 when
    absent); the pair is matched against ``[egress].allowed_db`` with the shared ``host[:port]``
    matcher. An empty ``allowed_db`` means "unrestricted" unless ``[egress].deny_by_default`` is set,
    in which case the lookup is refused outright.

    Raises :class:`WiringError` when the lookup is refused; returns ``None`` otherwise."""
    db_allowlist = egress.allowed_db
    if not db_allowlist:
        # Nothing to match against: refuse only when the operator asked for fail-closed egress.
        if egress.deny_by_default:
            raise WiringError(
                f"DatabaseLookup {name!r}: [egress].deny_by_default is set and [egress].allowed_db is "
                "empty — list the lookup server to permit it"
            )
        return

    host = str(settings.get("server", ""))
    port = settings.get("port", 1433)
    if _mllp_egress_allowed(host, port, db_allowlist):  # same host[:port] matching
        return

    log.warning("connect denied: DatabaseLookup %r server %r not in [egress].allowed_db", name, host)
    raise WiringError(
        f"DatabaseLookup {name!r}: server {host!r} is not in the [egress].allowed_db allowlist"
    )
|
|
1538
|
+
|
|
1539
|
+
|
|
1540
|
+
# Bind hosts that check_mllp_tls_exposure treats as loopback. Membership is an exact string
# comparison, so any other spelling of a loopback address is handled as non-loopback.
_LOOPBACK_HOSTS = frozenset({"127.0.0.1", "localhost", "::1", "::ffff:127.0.0.1"})
|
|
1541
|
+
|
|
1542
|
+
|
|
1543
|
+
def check_mllp_tls_exposure(source: Source, name: str, *, allow_insecure_bind: bool) -> None:
    """Exposed-gate (ADR 0002 §0, MLLP side): reject an MLLP listener that binds a non-loopback host
    with TLS off, because HL7 bodies would then travel in cleartext.

    Non-MLLP sources are ignored. The bind host comes from ``source.settings["host"]`` (default
    ``127.0.0.1``); a host listed in ``_LOOPBACK_HOSTS`` or a truthy ``tls`` setting passes with no
    further checks. Otherwise ``allow_insecure_bind`` downgrades the refusal to a logged warning.

    Raises :class:`WiringError` when the bind is exposed and ``allow_insecure_bind`` is false."""
    if source.type is not ConnectorType.MLLP:
        return

    listener_cfg = source.settings
    host = str(listener_cfg.get("host", "127.0.0.1"))
    cleartext_exposed = host not in _LOOPBACK_HOSTS and not listener_cfg.get("tls")
    if not cleartext_exposed:
        return

    if not allow_insecure_bind:
        raise WiringError(
            f"inbound connection {name!r} binds non-loopback host {host!r} without TLS; HL7 bodies would "
            "cross the network in cleartext. Set tls=true (+ tls_cert_file/tls_key_file) on the MLLP "
            "connection, or pass `serve --allow-insecure-bind` to accept the cleartext risk on a trusted, "
            "firewalled network."
        )

    # The operator explicitly accepted the risk: let the bind through, but leave a trace.
    log.warning(
        "inbound %r binds non-loopback host %r without TLS (--allow-insecure-bind); HL7 bodies "
        "cross the network in cleartext — set tls=true (+ tls_cert_file/tls_key_file) on it.",
        name,
        host,
    )
|
|
1567
|
+
|
|
1568
|
+
|
|
1569
|
+
def check_egress_allowed(dest: Destination, egress: EgressSettings) -> None:
    """Fail-closed egress gate (WP-11c — ASVS 13.2.4/13.2.5/14.2.3): raise :class:`WiringError` for
    an outbound destination that is not on the ``[egress]`` allowlist for its transport.

    Each transport is opt-in: when its allowlist is empty the destination passes, unless
    ``[egress].deny_by_default`` is set and ``_allowlist_for`` yields nothing for the destination's
    type, in which case it is refused before any per-list matching.

    Per-transport matching:

    * MLLP — ``host`` (default ``127.0.0.1``) / ``port`` against ``allowed_mllp``.
    * TCP and X12 — ``host`` (default ``127.0.0.1``) / ``port`` against ``allowed_tcp``.
    * FILE — ``directory`` must be set and lie under an ``allowed_file_dirs`` entry.
    * REST and SOAP — the ``url`` host (and port, when pinned) against ``allowed_http``.
    * DATABASE — ``server`` / ``port`` (default 1433) against ``allowed_db``.
    * REMOTEFILE — ``host`` / ``port`` against ``allowed_remote``.

    Every refusal is logged at warning level before the exception is raised."""
    kind = dest.type
    opts = dest.settings

    if egress.deny_by_default and not _allowlist_for(kind, egress):
        log.warning(
            "egress denied: outbound %r %s has no [egress] allowlist under deny_by_default",
            dest.name,
            kind.value,
        )
        raise WiringError(
            f"outbound {dest.name!r}: [egress].deny_by_default is set and no allowlist permits a "
            f"{dest.type.value} destination — add it to the matching [egress].allowed_* list"
        )

    if kind is ConnectorType.MLLP:
        if not egress.allowed_mllp:
            return
        host = str(opts.get("host", "127.0.0.1"))
        port = opts.get("port")
        if _mllp_egress_allowed(host, port, egress.allowed_mllp):
            return
        log.warning(
            "egress denied: outbound %r MLLP %s:%s not in [egress].allowed_mllp",
            dest.name,
            host,
            port,
        )
        raise WiringError(
            f"outbound {dest.name!r}: MLLP destination {host}:{port} is not in the "
            "[egress].allowed_mllp allowlist"
        )

    if kind is ConnectorType.TCP:
        if not egress.allowed_tcp:
            return
        host = str(opts.get("host", "127.0.0.1"))
        port = opts.get("port")
        if _mllp_egress_allowed(host, port, egress.allowed_tcp):  # same host[:port] matching
            return
        log.warning(
            "egress denied: outbound %r TCP %s:%s not in [egress].allowed_tcp",
            dest.name,
            host,
            port,
        )
        raise WiringError(
            f"outbound {dest.name!r}: TCP destination {host}:{port} is not in the "
            "[egress].allowed_tcp allowlist"
        )

    if kind is ConnectorType.X12:
        # X12 is raw TCP, so it shares the [egress].allowed_tcp allowlist (same host[:port] matching).
        if not egress.allowed_tcp:
            return
        host = str(opts.get("host", "127.0.0.1"))
        port = opts.get("port")
        if _mllp_egress_allowed(host, port, egress.allowed_tcp):
            return
        log.warning(
            "egress denied: outbound %r X12 %s:%s not in [egress].allowed_tcp",
            dest.name,
            host,
            port,
        )
        raise WiringError(
            f"outbound {dest.name!r}: X12 destination {host}:{port} is not in the "
            "[egress].allowed_tcp allowlist"
        )

    if kind is ConnectorType.FILE:
        if not egress.allowed_file_dirs:
            return
        directory = opts.get("directory")
        # A missing directory cannot be shown to lie under an allowed root, so it is refused.
        if directory is not None and _dir_egress_allowed(str(directory), egress.allowed_file_dirs):
            return
        log.warning(
            "egress denied: outbound %r File dir %r not under [egress].allowed_file_dirs",
            dest.name,
            directory,
        )
        raise WiringError(
            f"outbound {dest.name!r}: File directory {directory!r} is not under any "
            "[egress].allowed_file_dirs entry"
        )

    if kind in (ConnectorType.REST, ConnectorType.SOAP):
        if not egress.allowed_http:
            return
        url = str(opts.get("url", ""))
        if _http_egress_allowed(url, egress.allowed_http):
            return
        # The host is only needed for the diagnostics, so it is extracted on the refusal path.
        host = urllib.parse.urlsplit(url).hostname or ""
        log.warning(
            "egress denied: outbound %r %s host %r not in [egress].allowed_http",
            dest.name,
            kind.value,
            host,
        )
        raise WiringError(
            f"outbound {dest.name!r}: {dest.type.value} host {host!r} is not in the "
            "[egress].allowed_http allowlist"
        )

    if kind is ConnectorType.DATABASE:
        if not egress.allowed_db:
            return
        host = str(opts.get("server", ""))
        port = opts.get("port", 1433)
        if _mllp_egress_allowed(host, port, egress.allowed_db):  # same host[:port] matching
            return
        log.warning(
            "egress denied: outbound %r DATABASE server %r not in [egress].allowed_db",
            dest.name,
            host,
        )
        raise WiringError(
            f"outbound {dest.name!r}: DATABASE server {host!r} is not in the "
            "[egress].allowed_db allowlist"
        )

    if kind is ConnectorType.REMOTEFILE:
        if not egress.allowed_remote:
            return
        host = str(opts.get("host", ""))
        port = opts.get("port")
        if _mllp_egress_allowed(host, port, egress.allowed_remote):  # same host[:port] matching
            return
        log.warning(
            "egress denied: outbound %r REMOTEFILE host %r not in [egress].allowed_remote",
            dest.name,
            host,
        )
        raise WiringError(
            f"outbound {dest.name!r}: REMOTEFILE host {host!r} is not in the "
            "[egress].allowed_remote allowlist"
        )
|
|
1683
|
+
|
|
1684
|
+
|
|
1685
|
+
def _mllp_egress_allowed(host: str, port: object, allowed: list[str]) -> bool:
|
|
1686
|
+
host = host.lower()
|
|
1687
|
+
for entry in allowed:
|
|
1688
|
+
allow_host, _, allow_port = entry.partition(":")
|
|
1689
|
+
if allow_host.strip().lower() == host and (
|
|
1690
|
+
not allow_port or str(port) == allow_port.strip()
|
|
1691
|
+
):
|
|
1692
|
+
return True
|
|
1693
|
+
return False
|
|
1694
|
+
|
|
1695
|
+
|
|
1696
|
+
def _dir_egress_allowed(directory: str, allowed: list[str]) -> bool:
|
|
1697
|
+
try:
|
|
1698
|
+
target = Path(directory).resolve()
|
|
1699
|
+
except (OSError, ValueError, RuntimeError):
|
|
1700
|
+
return False
|
|
1701
|
+
for entry in allowed:
|
|
1702
|
+
try:
|
|
1703
|
+
base = Path(entry).resolve()
|
|
1704
|
+
except (OSError, ValueError, RuntimeError):
|
|
1705
|
+
continue
|
|
1706
|
+
if target == base or base in target.parents:
|
|
1707
|
+
return True
|
|
1708
|
+
return False
|
|
1709
|
+
|
|
1710
|
+
|
|
1711
|
+
def _http_egress_allowed(url: str, allowed: list[str]) -> bool:
|
|
1712
|
+
"""True if ``url``'s host (and port, when an allow entry pins one) is on the allowlist — the same
|
|
1713
|
+
``host`` / ``host:port`` matching as MLLP."""
|
|
1714
|
+
parts = urllib.parse.urlsplit(url)
|
|
1715
|
+
host = (parts.hostname or "").lower()
|
|
1716
|
+
for entry in allowed:
|
|
1717
|
+
allow_host, _, allow_port = entry.partition(":")
|
|
1718
|
+
if allow_host.strip().lower() == host and (
|
|
1719
|
+
not allow_port or str(parts.port) == allow_port.strip()
|
|
1720
|
+
):
|
|
1721
|
+
return True
|
|
1722
|
+
return False
|