messagefoundry 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. messagefoundry/__init__.py +108 -0
  2. messagefoundry/__main__.py +1155 -0
  3. messagefoundry/api/__init__.py +27 -0
  4. messagefoundry/api/app.py +1581 -0
  5. messagefoundry/api/approvals.py +184 -0
  6. messagefoundry/api/auth_models.py +211 -0
  7. messagefoundry/api/auth_routes.py +655 -0
  8. messagefoundry/api/field_authz.py +96 -0
  9. messagefoundry/api/models.py +374 -0
  10. messagefoundry/api/security.py +247 -0
  11. messagefoundry/api/tls.py +47 -0
  12. messagefoundry/auth/__init__.py +39 -0
  13. messagefoundry/auth/data/common_passwords.NOTICE +13 -0
  14. messagefoundry/auth/data/common_passwords.txt +10000 -0
  15. messagefoundry/auth/identity.py +71 -0
  16. messagefoundry/auth/ldap.py +264 -0
  17. messagefoundry/auth/notifications.py +68 -0
  18. messagefoundry/auth/passwords.py +53 -0
  19. messagefoundry/auth/permissions.py +120 -0
  20. messagefoundry/auth/policy.py +153 -0
  21. messagefoundry/auth/ratelimit.py +55 -0
  22. messagefoundry/auth/service.py +1323 -0
  23. messagefoundry/auth/tokens.py +26 -0
  24. messagefoundry/auth/totp.py +174 -0
  25. messagefoundry/checks.py +174 -0
  26. messagefoundry/config/__init__.py +30 -0
  27. messagefoundry/config/active_environment.py +80 -0
  28. messagefoundry/config/ai_policy.py +140 -0
  29. messagefoundry/config/code_sets.py +260 -0
  30. messagefoundry/config/connections_edit.py +200 -0
  31. messagefoundry/config/connections_file.py +287 -0
  32. messagefoundry/config/db_lookup.py +117 -0
  33. messagefoundry/config/environments.py +116 -0
  34. messagefoundry/config/ingest_time.py +83 -0
  35. messagefoundry/config/models.py +240 -0
  36. messagefoundry/config/reference.py +158 -0
  37. messagefoundry/config/response.py +83 -0
  38. messagefoundry/config/run_context.py +153 -0
  39. messagefoundry/config/settings.py +1311 -0
  40. messagefoundry/config/state.py +99 -0
  41. messagefoundry/config/tls_policy.py +110 -0
  42. messagefoundry/config/wiring.py +1918 -0
  43. messagefoundry/console/__init__.py +20 -0
  44. messagefoundry/console/__main__.py +274 -0
  45. messagefoundry/console/_async.py +107 -0
  46. messagefoundry/console/change_password.py +111 -0
  47. messagefoundry/console/client.py +552 -0
  48. messagefoundry/console/connections.py +324 -0
  49. messagefoundry/console/login.py +107 -0
  50. messagefoundry/console/mfa.py +205 -0
  51. messagefoundry/console/reauth.py +94 -0
  52. messagefoundry/console/search.py +57 -0
  53. messagefoundry/console/service_control.py +137 -0
  54. messagefoundry/console/sessions.py +122 -0
  55. messagefoundry/console/shell.py +410 -0
  56. messagefoundry/console/status.py +377 -0
  57. messagefoundry/console/users_page.py +282 -0
  58. messagefoundry/console/widgets.py +553 -0
  59. messagefoundry/generators/README.md +27 -0
  60. messagefoundry/generators/__init__.py +15 -0
  61. messagefoundry/generators/_core.py +589 -0
  62. messagefoundry/generators/_hl7data.py +428 -0
  63. messagefoundry/generators/adt.py +286 -0
  64. messagefoundry/generators/all_types.py +24 -0
  65. messagefoundry/generators/bar.py +28 -0
  66. messagefoundry/generators/dft.py +20 -0
  67. messagefoundry/generators/mdm.py +39 -0
  68. messagefoundry/generators/mfn.py +46 -0
  69. messagefoundry/generators/oml.py +32 -0
  70. messagefoundry/generators/orl.py +30 -0
  71. messagefoundry/generators/orm.py +23 -0
  72. messagefoundry/generators/oru.py +21 -0
  73. messagefoundry/generators/ras.py +20 -0
  74. messagefoundry/generators/rde.py +54 -0
  75. messagefoundry/generators/siu.py +64 -0
  76. messagefoundry/generators/vxu.py +20 -0
  77. messagefoundry/hl7schema.py +75 -0
  78. messagefoundry/last_resort.py +55 -0
  79. messagefoundry/logging_setup.py +332 -0
  80. messagefoundry/parsing/__init__.py +64 -0
  81. messagefoundry/parsing/consistency.py +166 -0
  82. messagefoundry/parsing/groups.py +228 -0
  83. messagefoundry/parsing/message.py +453 -0
  84. messagefoundry/parsing/peek.py +237 -0
  85. messagefoundry/parsing/split.py +120 -0
  86. messagefoundry/parsing/summary.py +46 -0
  87. messagefoundry/parsing/tree.py +128 -0
  88. messagefoundry/parsing/validate.py +95 -0
  89. messagefoundry/parsing/x12/__init__.py +46 -0
  90. messagefoundry/parsing/x12/delimiters.py +140 -0
  91. messagefoundry/parsing/x12/errors.py +30 -0
  92. messagefoundry/parsing/x12/interchange.py +232 -0
  93. messagefoundry/parsing/x12/message.py +200 -0
  94. messagefoundry/parsing/x12/peek.py +207 -0
  95. messagefoundry/pipeline/__init__.py +21 -0
  96. messagefoundry/pipeline/alert_sinks.py +486 -0
  97. messagefoundry/pipeline/alerts.py +100 -0
  98. messagefoundry/pipeline/cert_expiry.py +219 -0
  99. messagefoundry/pipeline/cluster.py +955 -0
  100. messagefoundry/pipeline/cluster_sqlserver.py +444 -0
  101. messagefoundry/pipeline/config_convergence.py +137 -0
  102. messagefoundry/pipeline/dryrun.py +450 -0
  103. messagefoundry/pipeline/engine.py +756 -0
  104. messagefoundry/pipeline/leader_tasks.py +158 -0
  105. messagefoundry/pipeline/reference_sync.py +369 -0
  106. messagefoundry/pipeline/retention.py +289 -0
  107. messagefoundry/pipeline/security_notify.py +168 -0
  108. messagefoundry/pipeline/state_convergence.py +143 -0
  109. messagefoundry/pipeline/wiring_runner.py +1722 -0
  110. messagefoundry/py.typed +0 -0
  111. messagefoundry/redaction.py +71 -0
  112. messagefoundry/scaffold.py +321 -0
  113. messagefoundry/secrets_dpapi.py +129 -0
  114. messagefoundry/store/__init__.py +46 -0
  115. messagefoundry/store/audit_tee.py +67 -0
  116. messagefoundry/store/base.py +758 -0
  117. messagefoundry/store/crypto.py +166 -0
  118. messagefoundry/store/keyprovider.py +192 -0
  119. messagefoundry/store/postgres.py +3447 -0
  120. messagefoundry/store/sqlserver.py +3014 -0
  121. messagefoundry/store/store.py +3790 -0
  122. messagefoundry/timezone.py +207 -0
  123. messagefoundry/transports/__init__.py +50 -0
  124. messagefoundry/transports/base.py +269 -0
  125. messagefoundry/transports/database.py +693 -0
  126. messagefoundry/transports/file.py +551 -0
  127. messagefoundry/transports/framing.py +164 -0
  128. messagefoundry/transports/loopback.py +53 -0
  129. messagefoundry/transports/mllp.py +644 -0
  130. messagefoundry/transports/remotefile.py +664 -0
  131. messagefoundry/transports/rest.py +281 -0
  132. messagefoundry/transports/signing.py +321 -0
  133. messagefoundry/transports/soap.py +507 -0
  134. messagefoundry/transports/tcp.py +307 -0
  135. messagefoundry/transports/timer.py +146 -0
  136. messagefoundry/transports/x12.py +323 -0
  137. messagefoundry-0.1.0.dist-info/METADATA +212 -0
  138. messagefoundry-0.1.0.dist-info/RECORD +142 -0
  139. messagefoundry-0.1.0.dist-info/WHEEL +4 -0
  140. messagefoundry-0.1.0.dist-info/entry_points.txt +2 -0
  141. messagefoundry-0.1.0.dist-info/licenses/LICENSE +662 -0
  142. messagefoundry-0.1.0.dist-info/licenses/NOTICE +27 -0
@@ -0,0 +1,1581 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 MessageFoundry Organization and contributors
3
+ """Localhost FastAPI surface for the console.
4
+
5
+ This is the *only* boundary a client uses, so in-process / local-daemon / remote
6
+ deployments are indistinguishable to the UI. Routes resolve the live :class:`Engine`
7
+ from ``app.state`` at request time (not at construction), which lets the same app object
8
+ be driven two ways:
9
+
10
+ * :func:`create_app(engine)` — bind an engine the caller already manages (embedding, and
11
+ the async test client).
12
+ * :func:`create_managed_app(...)` — own the engine via an ASGI lifespan (the CLI server,
13
+ and anything driven by a synchronous test client).
14
+
15
+ Authentication + RBAC are enforced whenever an enabled :class:`AuthService` is attached (the
16
+ ``serve`` path always attaches one). With **no** auth attached the routes are **fail-closed** (403)
17
+ unless the app explicitly opts out via ``allow_no_auth=True`` (embedding / dev), in which case
18
+ requests run as the full-access system identity (SYS-1). The API still binds localhost by default;
19
+ remote exposure (TLS) is later.
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ import asyncio
25
+ import json
26
+ import logging
27
+ import os
28
+ import time
29
+ from collections.abc import Callable, Mapping, Sequence
30
+ from contextlib import asynccontextmanager, suppress
31
+ from pathlib import Path
32
+ from typing import Any, AsyncIterator
33
+
34
+ from fastapi import (
35
+ Depends,
36
+ FastAPI,
37
+ HTTPException,
38
+ Query,
39
+ Request,
40
+ Response,
41
+ WebSocket,
42
+ WebSocketDisconnect,
43
+ )
44
+ from fastapi.responses import JSONResponse
45
+
46
+ from messagefoundry import __version__
47
+ from messagefoundry.api.approvals import ApprovalError, ApprovalGate
48
+ from messagefoundry.api.models import (
49
+ AiPolicy,
50
+ ApprovalDecisionResult,
51
+ ApprovalList,
52
+ CapturedResponseInfo,
53
+ ChannelInfo,
54
+ ClusterNode,
55
+ ClusterNodeList,
56
+ ClusterStatus,
57
+ ConnectionMetadata,
58
+ ConnectionRow,
59
+ ConnectionTestResult,
60
+ DbInfo,
61
+ DeadLetterList,
62
+ DeadLetterReplayRequest,
63
+ DeadLetterReplayResult,
64
+ DeadLetterRow,
65
+ EngineInfo,
66
+ EventInfo,
67
+ Health,
68
+ IntegrityResult,
69
+ MessageDetail,
70
+ MessageList,
71
+ MessageResponses,
72
+ MessageSummary,
73
+ OutboundPayloadInfo,
74
+ OutboundPayloads,
75
+ OutboxInfo,
76
+ PendingApprovalInfo,
77
+ PendingApprovalResponse,
78
+ PurgeResult,
79
+ ReloadRequest,
80
+ ReloadResult,
81
+ ReplayResult,
82
+ StatsResponse,
83
+ SystemStatus,
84
+ )
85
+ from messagefoundry.api.auth_routes import add_auth_routes
86
+ from messagefoundry.api.field_authz import count_exposed, redact_unauthorized
87
+ from messagefoundry.api.security import (
88
+ authorize_ws,
89
+ optional_identity,
90
+ require,
91
+ require_phi_read,
92
+ require_step_up,
93
+ ws_token,
94
+ )
95
+ from messagefoundry.auth import Identity, Permission
96
+ from messagefoundry.auth.service import AuthService, BootstrapAdmin
97
+ from messagefoundry.config.ai_policy import resolve_effective_policy
98
+ from messagefoundry.config.models import (
99
+ AckAfter,
100
+ BuildupThreshold,
101
+ InternalErrorPolicy,
102
+ OrderingMode,
103
+ RetryPolicy,
104
+ )
105
+ from messagefoundry.config.settings import (
106
+ AiSettings,
107
+ AlertsSettings,
108
+ ApprovalsSettings,
109
+ AuthSettings,
110
+ CertMonitorSettings,
111
+ ClusterSettings,
112
+ EgressSettings,
113
+ ReferenceSettings,
114
+ RetentionSettings,
115
+ ShadowSettings,
116
+ StoreSettings,
117
+ )
118
+ from messagefoundry.config.wiring import EnvRef, WiringError, load_config, redacted_settings
119
+ from messagefoundry.last_resort import install_loop_exception_handler
120
+ from messagefoundry.pipeline import ConfigReloadDenied, Engine
121
+ from messagefoundry.pipeline.alert_sinks import notifier_from_settings
122
+ from messagefoundry.pipeline.security_notify import security_notifier_from_settings
123
+ from messagefoundry.pipeline.cluster import build_coordinator
124
+ from messagefoundry.pipeline.wiring_runner import RegistryRunner
125
+ from messagefoundry.transports.base import (
126
+ DeliveryError,
127
+ DestinationConnector,
128
+ TestNotSupportedError,
129
+ )
130
+ from messagefoundry.store import Row, open_store, sqlite_settings
131
+ from messagefoundry.store.base import Store
132
+ from messagefoundry.store.store import _secure_file
133
+
134
+ __all__ = ["create_app", "create_managed_app"]
135
+
136
+ _RATE_WINDOW = 60.0 # seconds; window for the backlog throughput estimate
137
+ _MAX_REQUEST_BODY_BYTES = 1 * 1024 * 1024 # 1 MiB cap on HTTP request bodies (API-INPUT)
138
+ _CONNECTION_TEST_TIMEOUT = 35.0 # overall cap for a POST /connections/{name}/test probe (seconds)
139
+ _MAX_WS_CONNECTIONS = 64 # cap concurrent /ws/stats sockets (API-WS)
140
+ _WS_REVALIDATE_SECONDS = 30.0 # re-check the session on an open /ws/stats this often (API-WS)
141
+ _log = logging.getLogger(__name__)
142
+
143
+
144
+ def _peer_display(value: Any) -> str | None:
145
+ """Render a connector address field for the dashboard: a literal, or an ``env()`` reference shown
146
+ symbolically (``env:<key>``). The live value is resolved per-instance; the spec only holds the ref."""
147
+ if value is None:
148
+ return None
149
+ if isinstance(value, EnvRef):
150
+ return f"env:{value.key}"
151
+ return str(value)
152
+
153
+
154
+ def _peer_port(type_value: str, settings: dict[str, Any]) -> tuple[str | None, int | None]:
155
+ """Best-effort (peer, port) for a connector: MLLP host+port, or a file directory."""
156
+ if type_value == "mllp":
157
+ port = settings.get("port")
158
+ port_int = None if port is None or isinstance(port, EnvRef) else int(port)
159
+ return (_peer_display(settings.get("host")), port_int)
160
+ if type_value == "file":
161
+ return (_peer_display(settings.get("directory")), None)
162
+ return (None, None)
163
+
164
+
165
+ # Display labels for the connection method/protocol. Includes types not yet built so the
166
+ # column reads well the moment a connector lands; unknown types fall back to upper-case.
167
+ _METHOD_LABELS = {
168
+ "mllp": "MLLP",
169
+ "file": "File",
170
+ "tcp": "TCP",
171
+ "soap": "SOAP",
172
+ "rest": "REST",
173
+ "http": "HTTP",
174
+ "sftp": "SFTP",
175
+ "db": "Database",
176
+ }
177
+
178
+
179
+ def _method_label(type_value: str) -> str:
180
+ return _METHOD_LABELS.get(type_value, type_value.upper())
181
+
182
+
183
+ def _backlog(depth: int, recent: int) -> float | None:
184
+ """Estimated seconds to clear the queue: 0 if empty, None if queued but nothing draining."""
185
+ if depth == 0:
186
+ return 0.0
187
+ return depth * _RATE_WINDOW / recent if recent > 0 else None
188
+
189
+
190
+ def _get_engine(request: Request) -> Engine:
191
+ engine: Engine | None = getattr(request.app.state, "engine", None)
192
+ if engine is None:
193
+ raise HTTPException(status_code=503, detail="engine not started")
194
+ return engine
195
+
196
+
197
+ def _get_gate(request: Request) -> ApprovalGate | None:
198
+ """The dual-control approval gate (ASVS 2.3.5), or ``None`` when no engine is bound — then gated
199
+ endpoints execute inline and the ``/approvals`` routes report 503."""
200
+ return getattr(request.app.state, "approval_gate", None)
201
+
202
+
203
def _build_approval_gate(engine: Engine, settings: ApprovalsSettings) -> ApprovalGate:
    """Create the approval gate and register the high-value operations that dual-control can hold.

    Each executor re-runs its captured operation once approved; its parameters are JSON that was
    persisted at request time.
    """
    gate = ApprovalGate(engine.store, settings)

    async def _replay(params: Mapping[str, Any]) -> dict[str, Any]:
        # Re-queue dead-lettered deliveries, optionally narrowed by channel and/or destination.
        channel = params.get("channel_id")
        destination = params.get("destination_name")
        count = await engine.replay_dead(channel_id=channel, destination_name=destination)
        return {"requeued": count}

    async def _purge(params: Mapping[str, Any]) -> dict[str, Any]:
        # Cancel queued deliveries for the named outbound; scope "top" restricts it to top_only.
        target = str(params["name"])
        only_top = params.get("scope") == "top"
        count = await engine.store.cancel_queued(None, target, top_only=only_top)
        return {"cancelled": count}

    operations = (
        ("dead_letter_replay", "Replay dead-lettered deliveries", _replay),
        ("connection_purge", "Purge queued deliveries to an outbound connection", _purge),
    )
    for kind, label, executor in operations:
        gate.register(kind, label, executor)
    return gate
223
+
224
+
225
def _summary(row: Row) -> MessageSummary:
    """Build a :class:`MessageSummary` from a store row.

    The row is copied into a plain dict so optional columns (``last_event`` on list rows,
    ``summary``/``metadata``) can be read with ``.get``; the same builder therefore serves both
    list rows and ``SELECT *`` detail rows.
    """
    record = dict(row)
    # Columns every row must carry; a missing one raises KeyError.
    required = {key: record[key] for key in ("id", "channel_id", "received_at", "status")}
    # Columns that may be absent; they default to None.
    optional = {
        key: record.get(key)
        for key in ("source_type", "control_id", "message_type", "error", "summary", "metadata")
    }
    return MessageSummary(**required, **optional, event=record.get("last_event"))
242
+
243
+
244
def _dead_row(row: Row) -> DeadLetterRow:
    """Build a :class:`DeadLetterRow` from a store row.

    The row's ``updated_at`` column is reported as ``failed_at``. ``last_error``, ``control_id``,
    ``message_type`` and ``summary`` are optional and default to ``None``.
    """
    record = dict(row)
    identity_keys = ("outbox_id", "message_id", "channel_id", "destination_name", "attempts")
    optional_keys = ("last_error", "control_id", "message_type", "summary")
    return DeadLetterRow(
        **{key: record[key] for key in identity_keys},
        failed_at=record["updated_at"],
        received_at=record["received_at"],
        **{key: record.get(key) for key in optional_keys},
    )
259
+
260
+
261
+ def _scope(identity: Identity) -> list[str] | None:
262
+ """The caller's per-channel allow-list for store filters (None = all channels)."""
263
+ return None if identity.allowed_channels is None else sorted(identity.allowed_channels)
264
+
265
+
266
+ async def _audit_channel_denied(engine: Engine, identity: Identity, channel: str | None) -> None:
267
+ """Audit a per-channel RBAC denial (mirrors auth.permission_denied)."""
268
+ await engine.store.record_audit(
269
+ "auth.channel_denied",
270
+ actor=identity.username,
271
+ channel_id=channel,
272
+ detail=json.dumps({"channel": channel}),
273
+ )
274
+
275
+
276
async def _run_connection_test(
    rr: RegistryRunner, name: str, direction: str
) -> ConnectionTestResult:
    """Probe the reachability of connection ``name`` using a freshly built connector.

    The live connector is never touched. A configuration problem (bad ``env()``/egress, surfaced
    as :class:`WiringError`) or a connectivity failure is reported inside the returned result
    rather than raised, so only an unexpected bug would produce a 500. The test connector is
    always closed afterwards.
    """

    def _report(
        supported: bool, success: bool, elapsed_ms: float, detail: str | None
    ) -> ConnectionTestResult:
        # Single place that stamps the connection name/direction onto the outcome.
        return ConnectionTestResult(
            name=name,
            direction=direction,
            supported=supported,
            success=success,
            duration_ms=round(elapsed_ms, 1),
            detail=detail,
        )

    try:
        _, connector = rr.build_test_connector(name)
    except WiringError as exc:
        # Nothing was probed, so the duration is reported as zero.
        return _report(True, False, 0.0, str(exc))

    began = time.monotonic()
    supported = True
    success = False
    detail: str | None = None
    try:
        await asyncio.wait_for(connector.test_connection(), _CONNECTION_TEST_TIMEOUT)
    except TestNotSupportedError as exc:
        supported = False
        detail = str(exc)
    except asyncio.TimeoutError:
        detail = f"timed out after {_CONNECTION_TEST_TIMEOUT:.0f}s"
    except DeliveryError as exc:
        detail = str(exc)
    except Exception as exc:  # noqa: BLE001 - any probe failure is reported in the result, never a 500
        detail = f"{type(exc).__name__}: {exc}"
    else:
        success = True
    finally:
        # Closing a test connector must never mask the result.
        with suppress(Exception):
            if isinstance(connector, DestinationConnector):
                closer = connector.aclose
            else:
                closer = connector.stop
            await closer()

    elapsed_ms = (time.monotonic() - began) * 1000.0
    return _report(supported, success, elapsed_ms, detail)
321
+
322
+
323
+ class _SummaryAuditCoalescer:
324
+ """Coalesces PHI-summary access auditing into ONE ``summary_access`` audit row per
325
+ ``(actor, channel-scope, hour)`` window, carrying the running count of summaries exposed in that
326
+ window (review M-5).
327
+
328
+ Auditing is **server-enforced**: every list response that returns non-redacted summaries is
329
+ counted, regardless of any client flag — so a scripted bulk fetch can't harvest the patient census
330
+ unaudited. Coalescing keeps routine console polling to one row/hour while a bulk harvest shows a
331
+ large count. A window's total is flushed when a later summary access rolls into a new hour (the
332
+ keyed window, plus a sweep so a *different* actor's later access also flushes stragglers); the
333
+ active window is also flushed on :meth:`flush` (engine shutdown). The in-process dict is safe
334
+ because the engine is a single uvicorn worker (single-connection store + ``asyncio.Lock``)."""
335
+
336
+ def __init__(self) -> None:
337
+ # (actor, scope) -> {"hour": int, "count": int}; scope is the channel filter ("" = all channels)
338
+ self._windows: dict[tuple[str | None, str], dict[str, int]] = {}
339
+
340
+ def _roll(
341
+ self, actor: str | None, scope: str, count: int, hour: int
342
+ ) -> list[tuple[str | None, str, int, int]]:
343
+ """Accumulate ``count`` into the ``(actor, scope)`` window for ``hour`` and return any windows
344
+ to flush now — every window whose hour has passed. Synchronous (no ``await``), so the dict is
345
+ mutated atomically w.r.t. the event loop and a window can't be double-emitted."""
346
+ emit: list[tuple[str | None, str, int, int]] = []
347
+ for (a, sc), win in list(self._windows.items()):
348
+ if win["hour"] != hour:
349
+ emit.append((a, sc, win["hour"], win["count"]))
350
+ del self._windows[(a, sc)]
351
+ self._windows.setdefault((actor, scope), {"hour": hour, "count": 0})["count"] += count
352
+ return emit
353
+
354
+ async def note(
355
+ self, store: Store, actor: str | None, scope: str | None, count: int, now: float
356
+ ) -> None:
357
+ """Count ``count`` exposed summaries for ``actor``; emit a coalesced audit row for any window
358
+ that just rolled over. No-op when nothing was exposed."""
359
+ if count <= 0:
360
+ return
361
+ for a, sc, win_hour, win_count in self._roll(actor, scope or "", count, int(now // 3600)):
362
+ await self._emit(store, a, sc, win_hour, win_count)
363
+
364
+ async def flush(self, store: Store) -> None:
365
+ """Emit every pending window (e.g. on engine shutdown) so an active window isn't lost."""
366
+ windows = list(self._windows.items())
367
+ self._windows.clear()
368
+ for (a, sc), win in windows:
369
+ await self._emit(store, a, sc, win["hour"], win["count"])
370
+
371
+ @staticmethod
372
+ async def _emit(store: Store, actor: str | None, scope: str, hour: int, count: int) -> None:
373
+ await store.record_audit(
374
+ "summary_access",
375
+ actor=actor,
376
+ channel_id=(scope or None),
377
+ detail=json.dumps({"count": count, "window_start": hour * 3600}),
378
+ )
379
+
380
+
381
+ def create_app(
382
+ engine: Engine | None = None,
383
+ *,
384
+ lifespan: object | None = None,
385
+ auth: AuthService | None = None,
386
+ ai_settings: AiSettings | None = None,
387
+ approvals: ApprovalsSettings | None = None,
388
+ expose_docs: bool = False,
389
+ allow_no_auth: bool = False,
390
+ ws_allowed_origins: Sequence[str] = (),
391
+ ) -> FastAPI:
392
+ # The interactive docs (/docs, /redoc) and the OpenAPI schema (/openapi.json) are off by
393
+ # default: they widen the attack surface and disclose the schema, which matters the moment the
394
+ # API binds off-loopback. Opt in with [api] expose_docs = true. See docs/PHI.md §10.
395
+ app = FastAPI(
396
+ title="MessageFoundry",
397
+ version=__version__,
398
+ lifespan=lifespan, # type: ignore[arg-type]
399
+ docs_url="/docs" if expose_docs else None,
400
+ redoc_url="/redoc" if expose_docs else None,
401
+ openapi_url="/openapi.json" if expose_docs else None,
402
+ )
403
+ if engine is not None:
404
+ app.state.engine = engine
405
+ app.state.approval_gate = _build_approval_gate(engine, approvals or ApprovalsSettings())
406
+ if auth is not None:
407
+ app.state.auth = auth
408
+ if ai_settings is not None:
409
+ app.state.ai = ai_settings
410
+ # Fail-closed when no auth is attached unless explicitly opted out (embedding/dev) — SYS-1.
411
+ app.state.allow_no_auth = allow_no_auth
412
+ app.state.ws_count = 0 # live /ws/stats connection count (API-WS cap)
413
+ app.state.ws_allowed_origins = tuple(
414
+ ws_allowed_origins
415
+ ) # browser Origins for /ws/stats (4.4.2)
416
+ app.state.summary_auditor = _SummaryAuditCoalescer() # coalesced PHI-summary access audit (M-5)
417
+ add_auth_routes(app)
418
+
419
+ @app.exception_handler(Exception)
420
+ async def _unhandled_exception(request: Request, exc: Exception) -> JSONResponse:
421
+ # Catch-all so an unexpected error returns a generic 500 — never a stack trace or internal
422
+ # detail to the client (ASVS 16.5.1). The real cause is logged server-side only; we log the
423
+ # exception TYPE + route, not str(exc), to avoid a stray PHI fragment reaching the general
424
+ # log (the "never log bodies" rule; centralized redaction is the WP-6c follow-up).
425
+ _log.error(
426
+ "unhandled error on %s %s: %s", request.method, request.url.path, type(exc).__name__
427
+ )
428
+ return JSONResponse({"detail": "internal error"}, status_code=500)
429
+
430
+ @app.middleware("http")
431
+ async def _security_headers(request: Request, call_next: Any) -> Any:
432
+ # Defense-in-depth response headers (ASVS 3.4.4 / 3.4.5 / 3.2.1). The shipped client is a
433
+ # desktop app, but these are mandatory the moment a browser/off-loopback client appears and
434
+ # cost nothing on a JSON API. HSTS is only meaningful over TLS, so it is emitted only when the
435
+ # request actually arrived over https (wired when API TLS lands — WP-13a).
436
+ response = await call_next(request)
437
+ response.headers.setdefault("X-Content-Type-Options", "nosniff")
438
+ response.headers.setdefault("Referrer-Policy", "no-referrer")
439
+ response.headers.setdefault("X-Frame-Options", "DENY")
440
+ if request.url.scheme == "https":
441
+ response.headers.setdefault(
442
+ "Strict-Transport-Security", "max-age=31536000; includeSubDomains"
443
+ )
444
+ return response
445
+
446
+ @app.middleware("http")
447
+ async def _limit_request_body(request: Request, call_next: Any) -> Any:
448
+ # The HTTP API carries only small JSON (HL7 payloads arrive via MLLP/file, not here), so a
449
+ # generous cap rejects oversized/abusive bodies early (API-INPUT).
450
+ # Rejections are logged (ASVS 16.3.3) — these are control-bypass attempts (a pre-auth memory
451
+ # DoS probe) and were previously dropped silently. We log to the rotating general log rather
452
+ # than the audit_log: it's pre-auth (no actor) and a flood must not grow the audit DB.
453
+ client = request.client.host if request.client else None
454
+ length = request.headers.get("content-length")
455
+ transfer_encoding = request.headers.get("transfer-encoding", "").lower()
456
+ # A request carrying BOTH Content-Length and Transfer-Encoding is ambiguously framed (RFC 9112
457
+ # §6.1 — TE overrides CL) and is the classic CL.TE request-smuggling vector. Our single h11
458
+ # parser doesn't desync on the default loopback bind, but reject it outright so a future front
459
+ # proxy can never disagree with us about where the message ends (ASVS 4.2.1).
460
+ if length is not None and "chunked" in transfer_encoding:
461
+ _log.warning(
462
+ "rejected request with both Content-Length and Transfer-Encoding on %s from %s",
463
+ request.url.path,
464
+ client,
465
+ )
466
+ return JSONResponse(
467
+ {
468
+ "detail": "ambiguous framing: Content-Length with Transfer-Encoding is not accepted"
469
+ },
470
+ status_code=400,
471
+ )
472
+ if length is None:
473
+ # No Content-Length means a chunked body (HTTP/1.1 requires one or the other), which the
474
+ # Content-Length cap can't bound up front — Starlette would buffer it unbounded, a pre-auth
475
+ # memory DoS. We only accept small JSON, so require a Content-Length (review M-19).
476
+ if "chunked" in transfer_encoding:
477
+ _log.warning(
478
+ "rejected chunked request body on %s from %s", request.url.path, client
479
+ )
480
+ return JSONResponse(
481
+ {"detail": "chunked request bodies are not accepted; send a Content-Length"},
482
+ status_code=411,
483
+ )
484
+ return await call_next(request)
485
+ try:
486
+ too_big = int(length) > _MAX_REQUEST_BODY_BYTES
487
+ except ValueError:
488
+ _log.warning("rejected invalid Content-Length on %s from %s", request.url.path, client)
489
+ return JSONResponse({"detail": "invalid Content-Length"}, status_code=400)
490
+ if too_big:
491
+ _log.warning("rejected oversized request body on %s from %s", request.url.path, client)
492
+ return JSONResponse({"detail": "request body too large"}, status_code=413)
493
+ return await call_next(request)
494
+
495
+ @app.get("/health", response_model=Health)
496
+ async def health(identity: Identity | None = Depends(optional_identity)) -> Health:
497
+ # Liveness is always answerable (tokenless), but the build version is fingerprinting info, so
498
+ # it is disclosed only to an authenticated caller (WP-L3-07 / ASVS 13.4.6). When auth is
499
+ # disabled-with-allow_no_auth, optional_identity returns the system identity → version shown.
500
+ return Health(version=__version__ if identity is not None else None)
501
+
502
+ @app.get("/ai/policy", response_model=AiPolicy)
503
+ async def ai_policy(
504
+ request: Request, identity: Identity | None = Depends(optional_identity)
505
+ ) -> AiPolicy:
506
+ """The central AI-assistance policy (mode/scope/environment) plus the caller's
507
+ ``assist_permitted`` bit, for the IDE gate.
508
+
509
+ Intentionally NOT behind ``require()``: the install policy is non-sensitive operational
510
+ config and must be readable even by a tokenless client, so a central ``off`` is honored.
511
+ ``assist_permitted`` carries the identity-dependent bit (``None`` = RBAC not evaluable, i.e.
512
+ no/invalid token under enabled auth). Policy reads are not audited in this MVP."""
513
+ ai = getattr(request.app.state, "ai", None) or AiSettings()
514
+ data_class, prod = ai.derived_posture()
515
+ production = True if prod is None else prod # unresolved posture -> strictest ceiling
516
+ eff = resolve_effective_policy(
517
+ mode=ai.mode, data_scope=ai.data_scope, production=production
518
+ )
519
+ permitted = None if identity is None else identity.has(Permission.AI_ASSIST)
520
+ return AiPolicy(
521
+ mode=eff.mode,
522
+ data_scope=eff.data_scope,
523
+ environment=ai.environment,
524
+ data_class=data_class,
525
+ production=production,
526
+ assist_permitted=permitted,
527
+ reason=eff.reason,
528
+ )
529
+
530
+ # --- connections list (inbound connections, for the Log Search filter) ---
531
+
532
+ @app.get("/channels", response_model=list[ChannelInfo])
533
+ async def list_channels(
534
+ engine: Engine = Depends(_get_engine),
535
+ _user: Identity = Depends(require(Permission.MONITORING_READ)),
536
+ ) -> list[ChannelInfo]:
537
+ """Inbound connections as ChannelInfo (id = connection name) for the Log Search filter."""
538
+ runner = engine.registry_runner
539
+ if runner is None:
540
+ return []
541
+ return [
542
+ ChannelInfo(
543
+ id=name,
544
+ name=name,
545
+ enabled=True,
546
+ running=runner.inbound_running(name),
547
+ source_type=ic.spec.type.value,
548
+ destinations=[],
549
+ )
550
+ for name, ic in runner.registry.inbound.items()
551
+ ]
552
+
553
+ # --- connections (per-endpoint dashboard) --------------------------------
554
+
555
@app.get("/connections", response_model=list[ConnectionRow])
async def list_connections(
    engine: Engine = Depends(_get_engine),
    _user: Identity = Depends(require(Permission.MONITORING_READ)),
) -> list[ConnectionRow]:
    # Dashboard rows: one "source" row per inbound connection, then one "destination" row per
    # (inbound, outbound) pair present in the store's destination metrics.
    now = time.time()
    metrics = await engine.store.connection_metrics(
        since=engine.started_at, now=now, rate_window=_RATE_WINDOW
    )
    rows: list[ConnectionRow] = []

    runner = engine.registry_runner
    if runner is None:
        # No registry runner: nothing to report (the metrics query above still ran).
        return rows

    registry = runner.registry
    graph_status = "running" if runner.running else "stopped"

    # --- source rows -------------------------------------------------------------------
    for inbound_name, inbound in registry.inbound.items():
        inbound_stats = metrics.inbound.get(inbound_name)
        source_peer, source_port = _peer_port(inbound.spec.type.value, inbound.spec.settings)
        source_status = "running" if runner.inbound_running(inbound_name) else "stopped"
        source_method = _method_label(inbound.spec.type.value)
        if inbound_stats:
            source_idle = (now - inbound_stats.last_at) if inbound_stats.last_at else None
            source_errored = inbound_stats.errored
            source_read = inbound_stats.read
        else:
            # No metrics recorded for this inbound: zero counters, unknown idle time.
            source_idle, source_errored, source_read = None, 0, 0
        rows.append(
            ConnectionRow(
                role="source",
                channel_id=inbound_name,
                channel_name=inbound_name,
                destination=None,
                name=f"{inbound_name} ▸ in",
                status=source_status,
                direction="in",
                method=source_method,
                peer=source_peer,
                port=source_port,
                queue_depth=None,
                idle_seconds=source_idle,
                alerts_active=0,
                errored=source_errored,
                read=source_read,
                written=None,
                backlog_seconds=None,
                delivered_age_seconds=None,
            )
        )

    # --- destination rows --------------------------------------------------------------
    for (source_name, dest_name), dest_stats in metrics.destinations.items():
        if source_name not in registry.inbound:
            # Only edges whose source is an inbound of the live registry are listed here.
            continue
        outbound = registry.outbound.get(dest_name)
        if outbound is None:
            # The live graph no longer declares this outbound (e.g. removed by a reload) but
            # it still has metrics; report it as "draining" with an unknown method instead of
            # guessing at a connector type.
            dest_method, dest_peer, dest_port, dest_status = "—", None, None, "draining"
        else:
            dest_method = _method_label(outbound.spec.type.value)
            dest_peer, dest_port = _peer_port(outbound.spec.type.value, outbound.spec.settings)
            dest_status = graph_status
        dest_idle = (now - dest_stats.last_done_at) if dest_stats.last_done_at else None
        dest_backlog = _backlog(dest_stats.queue_depth, dest_stats.recent_done)
        if dest_stats.oldest_pending_at:
            oldest_pending_age = now - dest_stats.oldest_pending_at
        else:
            oldest_pending_age = None
        rows.append(
            ConnectionRow(
                role="destination",
                channel_id=source_name,
                channel_name=source_name,
                destination=dest_name,
                name=f"{source_name} ▸ {dest_name}",
                status=dest_status,
                direction="out",
                method=dest_method,
                peer=dest_peer,
                port=dest_port,
                queue_depth=dest_stats.queue_depth,
                idle_seconds=dest_idle,
                alerts_active=0,
                errored=dest_stats.dead,
                read=None,
                written=dest_stats.written,
                backlog_seconds=dest_backlog,
                delivered_age_seconds=oldest_pending_age,
                # The simulate flag is asked of the runner even for a draining outbound.
                simulated=runner.outbound_simulated(dest_name),
            )
        )
    return rows
638
+
639
+ # --- code-first connection operations ------------------------------------
640
+
641
def _inbound(engine: Engine, name: str) -> RegistryRunner:
    """Return the engine's registry runner when ``name`` is one of its inbound connections.

    Raises ``HTTPException(404)`` when there is no runner or the name is not a registered
    inbound connection.
    """
    runner = engine.registry_runner
    if runner is not None and name in runner.registry.inbound:
        return runner
    raise HTTPException(404, f"no such inbound connection: {name}")
646
+
647
async def _control_guard(engine: Engine, identity: Identity, name: str) -> None:
    """Enforce per-channel scope for an inbound connection (the connection is the channel).

    Returns silently when the identity may access ``name``; otherwise the denial is audited
    and ``HTTPException(403)`` is raised.
    """
    if identity.can_access_channel(name):
        return
    await _audit_channel_denied(engine, identity, name)
    raise HTTPException(403, "not authorized for this connection")
652
+
653
@app.post("/connections/{name}/start")
async def start_connection(
    name: str,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require(Permission.CONNECTIONS_CONTROL)),
) -> dict[str, object]:
    # Scope check first (403), then existence (404), then start and report the live state.
    await _control_guard(engine, identity, name)
    runner = _inbound(engine, name)
    await runner.start_inbound(name)
    now_running = runner.inbound_running(name)
    return {"name": name, "running": now_running}
663
+
664
@app.post("/connections/{name}/stop")
async def stop_connection(
    name: str,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require(Permission.CONNECTIONS_CONTROL)),
) -> dict[str, object]:
    # Scope check first (403), then existence (404), then stop and report the live state.
    await _control_guard(engine, identity, name)
    runner = _inbound(engine, name)
    await runner.stop_inbound(name)
    now_running = runner.inbound_running(name)
    return {"name": name, "running": now_running}
674
+
675
@app.post("/connections/{name}/restart")
async def restart_connection(
    name: str,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require(Permission.CONNECTIONS_CONTROL)),
) -> dict[str, object]:
    # Scope check first (403), then existence (404), then restart and report the live state.
    await _control_guard(engine, identity, name)
    runner = _inbound(engine, name)
    await runner.restart_inbound(name)
    now_running = runner.inbound_running(name)
    return {"name": name, "running": now_running}
685
+
686
@app.get("/connections/{name}/metadata", response_model=ConnectionMetadata)
async def connection_metadata(
    name: str,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require(Permission.MONITORING_READ)),
) -> ConnectionMetadata:
    """Static metadata for one connection (operability Tier 4): operator labels + a secret-scrubbed
    settings view. No live probe — see ``POST /connections/{name}/test``."""
    runner = engine.registry_runner
    if runner is None:
        raise HTTPException(503, "engine not started")

    # Inbound names are checked first; an inbound's config is gated per-channel.
    inbound = runner.registry.inbound.get(name)
    if inbound is not None:
        await _control_guard(engine, identity, name)
        inbound_labels = dict(inbound.metadata) if inbound.metadata else None
        return ConnectionMetadata(
            name=name,
            direction="in",
            method=inbound.spec.type.value,
            running=runner.inbound_running(name),
            router=inbound.router,
            metadata=inbound_labels,
            settings=redacted_settings(inbound.spec.settings),
        )

    outbound = runner.registry.outbound.get(name)
    if outbound is None:
        raise HTTPException(404, f"no such connection: {name}")

    if identity.allowed_channels is not None:
        # An outbound is shared across channels, so a channel-scoped caller is refused (and
        # the refusal audited) rather than shown shared-outbound topology.
        await _audit_channel_denied(engine, identity, name)
        raise HTTPException(
            403, "channel-scoped users cannot read a shared outbound connection"
        )
    outbound_labels = dict(outbound.metadata) if outbound.metadata else None
    return ConnectionMetadata(
        name=name,
        direction="out",
        method=outbound.spec.type.value,
        running=runner.running,
        metadata=outbound_labels,
        settings=redacted_settings(outbound.spec.settings),
        simulated=runner.outbound_simulated(name),
    )
728
+
729
@app.post("/connections/{name}/test", response_model=ConnectionTestResult)
async def connection_test(
    name: str,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require(Permission.CONNECTIONS_TEST)),
) -> ConnectionTestResult:
    """Probe a connection's reachability (operability Tier 4) — builds a **fresh** connector
    (never the live one), honors the ``[egress]`` allowlist, and sends NO real data. Audited."""
    runner = engine.registry_runner
    if runner is None:
        raise HTTPException(503, "engine not started")

    # Classify the name (inbound wins), then apply the matching authorization rule.
    if name in runner.registry.inbound:
        direction = "in"
        await _control_guard(engine, identity, name)  # an inbound test is scoped per-channel
    elif name in runner.registry.outbound:
        direction = "out"
        if identity.allowed_channels is not None:
            # A shared outbound spans channels; a channel-scoped caller may not probe it.
            await _audit_channel_denied(engine, identity, name)
            raise HTTPException(
                403, "channel-scoped users cannot test a shared outbound connection"
            )
    else:
        raise HTTPException(404, f"no such connection: {name}")

    result = await _run_connection_test(runner, name, direction)
    audit_detail = json.dumps(
        {
            "connection": name,
            "direction": direction,
            "supported": result.supported,
            "success": result.success,
            "detail": result.detail,
        }
    )
    # Only an inbound test is attributed to a channel in the audit record.
    audited_channel = name if direction == "in" else None
    await engine.store.record_audit(
        "connection_test",
        actor=identity.username,
        channel_id=audited_channel,
        detail=audit_detail,
    )
    return result
769
+
770
@app.post("/connections/{name}/purge", response_model=PurgeResult | PendingApprovalResponse)
async def purge_connection(
    name: str,
    response: Response,
    engine: Engine = Depends(_get_engine),
    scope: str = Query("all", pattern="^(top|all)$"),
    identity: Identity = Depends(require_step_up(Permission.MESSAGES_PURGE)),
    gate: ApprovalGate | None = Depends(_get_gate),
) -> PurgeResult | PendingApprovalResponse:
    """Soft-cancel queued deliveries to an outbound connection (across all inbounds)."""
    # A purge hits an outbound regardless of which inbound fed it, so it cannot be limited to
    # one channel: any channel-scoped caller is refused before the name is even looked up.
    if identity.allowed_channels is not None:
        await _audit_channel_denied(engine, identity, name)
        raise HTTPException(
            403, "channel-scoped users cannot purge a shared outbound connection"
        )

    runner = engine.registry_runner
    known_outbound = runner is not None and name in runner.registry.outbound
    if not known_outbound:
        raise HTTPException(404, f"no such outbound connection: {name}")

    if gate is not None:
        # Dual-control: when the gate returns an approval id the purge is held, not executed.
        approval_id = await gate.guard(
            "connection_purge", {"name": name, "scope": scope}, requester=identity.username
        )
        if approval_id is not None:
            response.status_code = 202
            return PendingApprovalResponse(
                approval_id=approval_id,
                operation="connection_purge",
                detail="held for a second approver (dual-control)",
            )

    only_top = scope == "top"
    cancelled = await engine.store.cancel_queued(None, name, top_only=only_top)
    return PurgeResult(cancelled=cancelled)
805
+
806
+ # --- dead letters (verify + recover) -------------------------------------
807
+
808
@app.get("/dead-letters", response_model=DeadLetterList)
async def list_dead_letters(
    request: Request,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require_phi_read(Permission.MESSAGES_READ)),
    channel_id: str | None = Query(None, max_length=256),
    destination_name: str | None = Query(None, max_length=256),
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
) -> DeadLetterList:
    """Dead-lettered deliveries (newest first), optionally scoped to an inbound/outbound."""
    # Per-channel RBAC: the store is handed the caller's channel scope for both the page and
    # the total, so the two always agree.
    selection = {
        "channel_id": channel_id,
        "destination_name": destination_name,
        "allowed_channels": _scope(identity),
    }
    page = await engine.store.list_dead(limit=limit, offset=offset, **selection)
    total = await engine.store.count_dead(**selection)

    entries = [_dead_row(record) for record in page]
    # Per-property PHI gate: fields the caller is not authorized for are redacted before return.
    entries = [redact_unauthorized(entry, identity) for entry in entries]

    # Whatever is still exposed after redaction is what the caller receives — audit that count.
    exposed = count_exposed(entries)
    if exposed:
        auditor = request.app.state.summary_auditor
        await auditor.note(engine.store, identity.username, channel_id, exposed, time.time())
    return DeadLetterList(total=total, limit=limit, offset=offset, dead_letters=entries)
843
+
844
@app.post(
    "/dead-letters/replay", response_model=DeadLetterReplayResult | PendingApprovalResponse
)
async def replay_dead_letters(
    req: DeadLetterReplayRequest,
    response: Response,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require_step_up(Permission.MESSAGES_REPLAY)),
    gate: ApprovalGate | None = Depends(_get_gate),
) -> DeadLetterReplayResult | PendingApprovalResponse:
    """Re-queue dead-lettered deliveries (optionally scoped). Already-delivered rows are left
    alone; each affected message reverts from ``error`` to ``received`` and re-drains."""
    target_channel = req.channel_id
    target_destination = req.destination_name

    # A channel-scoped caller has to name a channel inside their scope; otherwise the replay
    # would reach across channels. The refusal is audited.
    caller_is_scoped = identity.allowed_channels is not None
    if caller_is_scoped and not identity.can_access_channel(target_channel):
        await _audit_channel_denied(engine, identity, target_channel)
        raise HTTPException(403, "specify a channel within your scope to replay")

    if gate is not None:
        # Dual-control: an approval id from the gate means the replay is parked for a second
        # approver and nothing is re-queued now.
        approval_id = await gate.guard(
            "dead_letter_replay",
            {"channel_id": target_channel, "destination_name": target_destination},
            requester=identity.username,
        )
        if approval_id is not None:
            response.status_code = 202
            return PendingApprovalResponse(
                approval_id=approval_id,
                operation="dead_letter_replay",
                detail="held for a second approver (dual-control)",
            )

    requeued = await engine.replay_dead(
        channel_id=target_channel, destination_name=target_destination
    )
    if requeued:
        # Audit only when something was actually re-queued.
        audit_detail = json.dumps({"destination_name": target_destination, "requeued": requeued})
        await engine.store.record_audit(
            "dead_letter_replay",
            actor=identity.username,
            channel_id=target_channel,
            detail=audit_detail,
        )
    return DeadLetterReplayResult(requeued=requeued)
889
+
890
+ # --- dual-control approvals (ASVS 2.3.5) ---------------------------------
891
+
892
@app.get("/approvals", response_model=ApprovalList)
async def list_approvals(
    _: Identity = Depends(require(Permission.APPROVALS_APPROVE)),
    gate: ApprovalGate | None = Depends(_get_gate),
) -> ApprovalList:
    """Open (still-pending, unexpired) high-value actions awaiting a second approver."""
    if gate is None:
        raise HTTPException(503, "approval workflow is not available")
    pending = await gate.list_pending()
    # Each pending entry is a mapping whose keys become PendingApprovalInfo fields.
    infos = [PendingApprovalInfo(**entry) for entry in pending]
    return ApprovalList(approvals=infos)
901
+
902
@app.post("/approvals/{approval_id}/approve", response_model=ApprovalDecisionResult)
async def approve_action(
    approval_id: str,
    identity: Identity = Depends(require(Permission.APPROVALS_APPROVE)),
    gate: ApprovalGate | None = Depends(_get_gate),
) -> ApprovalDecisionResult:
    """Release a pending action: re-executes the captured operation and audits both identities. A
    requester can never approve their own request (dual-control, 2.3.5)."""
    if gate is None:
        raise HTTPException(503, "approval workflow is not available")
    try:
        outcome = await gate.approve(approval_id, approver=identity.username)
    except ApprovalError as exc:
        # The gate's error carries its own HTTP status and detail; surface them unchanged.
        raise HTTPException(exc.status, exc.detail) from exc
    else:
        return ApprovalDecisionResult(**outcome)
917
+
918
@app.post("/approvals/{approval_id}/reject", response_model=ApprovalDecisionResult)
async def reject_action(
    approval_id: str,
    identity: Identity = Depends(require(Permission.APPROVALS_APPROVE)),
    gate: ApprovalGate | None = Depends(_get_gate),
) -> ApprovalDecisionResult:
    """Decline a pending action without executing it (audited)."""
    if gate is None:
        raise HTTPException(503, "approval workflow is not available")
    try:
        outcome = await gate.reject(approval_id, approver=identity.username)
    except ApprovalError as exc:
        # The gate's error carries its own HTTP status and detail; surface them unchanged.
        raise HTTPException(exc.status, exc.detail) from exc
    else:
        return ApprovalDecisionResult(**outcome)
932
+
933
+ # --- config promote / reload ---------------------------------------------
934
+
935
@app.post("/config/reload", response_model=ReloadResult)
async def reload_config(
    req: ReloadRequest,
    engine: Engine = Depends(_get_engine),
    user: Identity = Depends(require_step_up(Permission.CONFIG_DEPLOY)),
) -> ReloadResult:
    """Load the code-first graph and atomically apply it to the running engine (quiesce-and-swap;
    in-flight outbox deliveries keep draining). ``config_dir`` defaults to the server's startup
    --config dir and must resolve within an allowed reload root — the loader executes Python, so
    an arbitrary path is refused (403). A bad/empty config is rejected and the running graph is
    left untouched. Every reload (and dry-run) is audited. Requires ``config:deploy``.

    ``dry_run=true`` is the promote pre-flight: it validates the graph against THIS environment's
    values (a missing ``env()`` value → 422) and reports the would-be graph **without** swapping.

    Error responses are intentionally generic (the detail is logged server-side, not returned)
    so a config:deploy holder can't probe the filesystem via reload error text."""
    dry_run = req.dry_run
    # Shared prefix of every failure audit record; a "reason" key is appended where applicable.
    attempt = {"requested": req.config_dir, "dry_run": dry_run}
    try:
        # A real apply propagates (propagate=True); a dry run never does, since it applies nothing.
        registry = await engine.reload(req.config_dir, dry_run=dry_run, propagate=not dry_run)
    except ConfigReloadDenied as exc:
        await engine.store.record_audit(
            "config_reload_denied",
            actor=user.username,
            detail=json.dumps(attempt),
        )
        raise HTTPException(403, "config directory is not an allowed reload root") from exc
    except FileNotFoundError as exc:
        _log.warning("config reload failed (missing dir): %s", exc)
        await engine.store.record_audit(
            "config_reload_failed",
            actor=user.username,
            detail=json.dumps({**attempt, "reason": "not_found"}),
        )
        raise HTTPException(404, "config directory not found") from exc
    except WiringError as exc:
        _log.warning("config reload failed (invalid config): %s", exc)
        await engine.store.record_audit(
            "config_reload_failed",
            actor=user.username,
            detail=json.dumps({**attempt, "reason": "invalid_config"}),
        )
        raise HTTPException(422, "invalid configuration") from exc

    inbound_count = len(registry.inbound)
    outbound_count = len(registry.outbound)
    reload_dir = engine.last_reload_dir
    # Successful dry runs and real reloads are audited under different action names.
    action = "config_reload_check" if dry_run else "config_reload"
    await engine.store.record_audit(
        action,
        actor=user.username,
        detail=json.dumps(
            {
                "dir": str(reload_dir) if reload_dir else None,
                "inbound": inbound_count,
                "outbound": outbound_count,
                "dry_run": dry_run,
            }
        ),
    )
    runner = engine.registry_runner
    return ReloadResult(
        inbound=inbound_count,
        outbound=outbound_count,
        routers=len(registry.routers),
        handlers=len(registry.handlers),
        running=bool(runner and runner.running),
        dry_run=dry_run,
    )
1011
+
1012
+ # --- messages ------------------------------------------------------------
1013
+
1014
@app.get("/messages", response_model=MessageList)
async def list_messages(
    request: Request,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require_phi_read(Permission.MESSAGES_READ)),
    channel_id: str | None = Query(None, max_length=256),
    status: str | None = Query(None, max_length=64),
    message_type: str | None = Query(None, max_length=64),
    control_id: str | None = Query(None, max_length=256),
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0),
) -> MessageList:
    # Paged message search. The same filters and channel scope drive both the page query and
    # the total so the two stay consistent.
    filters = {
        "channel_id": channel_id,
        "status": status,
        "message_type": message_type,
        "control_id": control_id,
    }
    visible_channels = _scope(identity)  # per-channel RBAC scope handed to the store
    page = await engine.store.list_messages(
        limit=limit, offset=offset, allowed_channels=visible_channels, **filters
    )
    total = await engine.store.count_messages(allowed_channels=visible_channels, **filters)

    summaries = [_summary(record) for record in page]
    # Per-property PHI gate: fields the caller lacks permission for are redacted here.
    summaries = [redact_unauthorized(item, identity) for item in summaries]

    # Exposure is counted after redaction (i.e. exactly what is returned) and audited
    # server-side, independent of anything the client sends.
    exposed = count_exposed(summaries)
    if exposed:
        auditor = request.app.state.summary_auditor
        await auditor.note(engine.store, identity.username, channel_id, exposed, time.time())
    return MessageList(total=total, limit=limit, offset=offset, messages=summaries)
1052
+
1053
@app.get("/messages/{message_id}", response_model=MessageDetail)
async def get_message(
    message_id: str,
    request: Request,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require_phi_read(Permission.MESSAGES_VIEW_RAW)),
) -> MessageDetail:
    # Full message detail: summary fields, raw body, outbox rows and the event timeline.
    row = await engine.store.get_message(message_id)
    # Both "missing" and "outside the caller's channel scope" answer 404 so the existence of a
    # message in another channel is not revealed; only the out-of-scope case is audited.
    if row is None:
        raise HTTPException(404, f"no such message: {message_id}")
    if not identity.can_access_channel(row["channel_id"]):
        await _audit_channel_denied(engine, identity, row["channel_id"])
        raise HTTPException(404, f"no such message: {message_id}")

    # Opening the body is PHI access: write the per-message view record and the audit entry
    # before anything is assembled for return.
    await engine.store.record_view(message_id, actor=identity.username)
    await engine.store.record_audit(
        "message_view",
        actor=identity.username,
        channel_id=row["channel_id"],
        detail=json.dumps({"message_id": message_id}),
    )
    outbox_rows = await engine.store.outbox_for(message_id)
    event_rows = await engine.store.events_for(message_id)

    summary_fields = _summary(row).model_dump()
    outbox_infos = []
    for delivery in outbox_rows:
        outbox_infos.append(
            OutboxInfo(
                id=delivery["id"],
                destination_name=delivery["destination_name"],
                status=delivery["status"],
                attempts=delivery["attempts"],
                next_attempt_at=delivery["next_attempt_at"],
                last_error=delivery["last_error"],
            )
        )
    event_infos = []
    for entry in event_rows:
        event_infos.append(
            EventInfo(
                ts=entry["ts"],
                event=entry["event"],
                destination=entry["destination"],
                detail=entry["detail"],
            )
        )
    detail = MessageDetail(
        **summary_fields,
        raw=row["raw"],
        outbox=outbox_infos,
        events=event_infos,
    )

    # Per-property PHI gate: the wrapper and every nested outbox/event entry are redacted
    # one by one, then the redacted children are put back on the redacted wrapper.
    outbox = [redact_unauthorized(item, identity) for item in detail.outbox]
    events = [redact_unauthorized(item, identity) for item in detail.events]
    redacted_detail = redact_unauthorized(detail, identity)
    detail = redacted_detail.model_copy(update={"outbox": outbox, "events": events})

    # Audit what is actually returned (counted after redaction).
    exposed = count_exposed([detail, *outbox, *events])
    if exposed:
        auditor = request.app.state.summary_auditor
        await auditor.note(
            engine.store, identity.username, row["channel_id"], exposed, time.time()
        )
    return detail
1119
+
1120
@app.get("/messages/{message_id}/responses", response_model=MessageResponses)
async def get_message_responses(
    message_id: str,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require_phi_read(Permission.MESSAGES_READ)),
) -> MessageResponses:
    """The captured request/response replies for a message (ADR 0013). ``outcome``/``detail`` need
    the message-read permission; the PHI ``body`` is included only for a caller that also holds the
    raw-body permission (``MESSAGES_VIEW_RAW``). Every access is audited (``response.read``)."""
    row = await engine.store.get_message(message_id)
    # Missing and out-of-scope both answer 404 (never 403) so a message in another channel is
    # not revealed; the out-of-scope attempt is audited.
    if row is None:
        raise HTTPException(404, f"no such message: {message_id}")
    if not identity.can_access_channel(row["channel_id"]):
        await _audit_channel_denied(engine, identity, row["channel_id"])
        raise HTTPException(404, f"no such message: {message_id}")

    captured = await engine.store.correlate_response(message_id)
    include_body = identity.has(Permission.MESSAGES_VIEW_RAW)

    # Every read is audited; the per-message view record is added only when bodies are
    # actually going out (caller holds view_raw and there is at least one reply).
    audit_detail = json.dumps(
        {"message_id": message_id, "count": len(captured), "body": include_body}
    )
    await engine.store.record_audit(
        "response.read",
        actor=identity.username,
        channel_id=row["channel_id"],
        detail=audit_detail,
    )
    if include_body and captured:
        await engine.store.record_view(message_id, actor=identity.username)

    # Each reply passes through the per-property gate; the body is withheld up front unless
    # the caller has the raw-body permission.
    responses = []
    for reply in captured:
        info = CapturedResponseInfo(
            destination_name=reply.destination_name,
            response_seq=reply.response_seq,
            outcome=reply.outcome,
            detail=reply.detail,
            captured_at=reply.captured_at,
            body=reply.body if include_body else None,
        )
        responses.append(redact_unauthorized(info, identity))
    return MessageResponses(message_id=message_id, responses=responses)
1170
+
1171
@app.get("/messages/{message_id}/outbound", response_model=OutboundPayloads)
async def get_message_outbound(
    message_id: str,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require_phi_read(Permission.MESSAGES_VIEW_RAW)),
) -> OutboundPayloads:
    """The **transformed outbound payloads** MEFOR routed for a message — one entry per
    destination (#14 parity tool). The PHI bodies are returned in full, so the route requires
    ``MESSAGES_VIEW_RAW`` outright (unlike ``/responses``, where the body is conditional). Works on
    both simulate/shadow and live runs — the transformed payload is retained on the done outbound
    row in either mode. Every access is audited (``outbound.read`` + a per-message ``viewed``
    event when bodies are returned)."""
    row = await engine.store.get_message(message_id)
    # Missing and out-of-scope both answer 404 (never 403) so a message in another channel is
    # not revealed; the out-of-scope attempt is audited.
    if row is None:
        raise HTTPException(404, f"no such message: {message_id}")
    if not identity.can_access_channel(row["channel_id"]):
        await _audit_channel_denied(engine, identity, row["channel_id"])
        raise HTTPException(404, f"no such message: {message_id}")

    payload_rows = await engine.store.outbox_payloads_for(message_id)

    # The read itself is always audited; the per-message view record is written only when
    # there is at least one payload to return.
    audit_detail = json.dumps({"message_id": message_id, "count": len(payload_rows)})
    await engine.store.record_audit(
        "outbound.read",
        actor=identity.username,
        channel_id=row["channel_id"],
        detail=audit_detail,
    )
    if payload_rows:
        await engine.store.record_view(message_id, actor=identity.username)

    payloads = []
    for delivery in payload_rows:
        payloads.append(
            OutboundPayloadInfo(
                destination_name=delivery["destination_name"],
                status=delivery["status"],
                payload=delivery["payload"],
            )
        )
    return OutboundPayloads(message_id=message_id, payloads=payloads)
1212
+
1213
@app.post("/messages/{message_id}/replay", response_model=ReplayResult)
async def replay_message(
    message_id: str,
    engine: Engine = Depends(_get_engine),
    identity: Identity = Depends(require_step_up(Permission.MESSAGES_REPLAY)),
) -> ReplayResult:
    # Re-queue one message's deliveries. Missing and out-of-scope both answer 404; only the
    # out-of-scope attempt is audited as a channel denial.
    row = await engine.store.get_message(message_id)
    if row is None:
        raise HTTPException(404, f"no such message: {message_id}")
    if not identity.can_access_channel(row["channel_id"]):
        await _audit_channel_denied(engine, identity, row["channel_id"])
        raise HTTPException(404, f"no such message: {message_id}")

    requeued = await engine.replay(message_id)
    if requeued == 0:
        # The message exists but nothing could be re-queued; answer 409 rather than a
        # misleading success with requeued=0.
        raise HTTPException(
            409,
            f"message {message_id} has no deliveries to replay "
            "(it errored, was filtered, or routed nowhere)",
        )

    # Something was actually re-queued: record who triggered it.
    audit_detail = json.dumps({"message_id": message_id, "requeued": requeued})
    await engine.store.record_audit(
        "message_replay",
        actor=identity.username,
        channel_id=row["channel_id"],
        detail=audit_detail,
    )
    return ReplayResult(message_id=message_id, requeued=requeued)
1242
+
1243
+ # --- stats ---------------------------------------------------------------
1244
+
1245
@app.get("/stats", response_model=StatsResponse)
async def stats(
    engine: Engine = Depends(_get_engine),
    _user: Identity = Depends(require(Permission.MONITORING_READ)),
) -> StatsResponse:
    # Two store reads, in this order: outbox counts by status, then in-pipeline depth.
    by_status = await engine.store.stats()
    pipeline_depth = await engine.store.in_pipeline_depth()
    return StatsResponse(outbox_by_status=by_status, in_pipeline=pipeline_depth)
1254
+
1255
+ # --- engine + DB status --------------------------------------------------
1256
+
1257
@app.get("/status", response_model=SystemStatus)
async def system_status(
    engine: Engine = Depends(_get_engine),
    _user: Identity = Depends(require(Permission.MONITORING_READ)),
) -> SystemStatus:
    # Engine + database status snapshot. Each inbound connection counts as one "channel".
    total = 0
    running = 0
    runner = engine.registry_runner
    if runner is not None:
        total = len(runner.registry.inbound)
        for conn_name in runner.registry.inbound:
            if runner.inbound_running(conn_name):
                running += 1

    db = await engine.store.db_status()

    # Uptime is 0.0 when the engine has no start time, and never negative otherwise.
    if engine.started_at:
        uptime = max(0.0, time.time() - engine.started_at)
    else:
        uptime = 0.0
    pid = os.getpid()
    outbox_counts = await engine.store.stats()

    engine_info = EngineInfo(
        version=__version__,
        uptime_seconds=uptime,
        pid=pid,
        channels_total=total,
        channels_running=running,
        channels_stopped=total - running,
        outbox_by_status=outbox_counts,
    )
    db_info = DbInfo(
        path=db.path,
        size_bytes=db.size_bytes,
        disk_free_bytes=db.disk_free_bytes,
        journal_mode=db.journal_mode,
        messages=db.messages,
        events=db.events,
        audit=db.audit,
    )
    return SystemStatus(engine=engine_info, db=db_info)
1290
+
1291
+ # --- cluster observability (Track B Step 7) ------------------------------
1292
+
1293
@app.get("/cluster/status", response_model=ClusterStatus)
async def cluster_status(
    engine: Engine = Depends(_get_engine),
    _user: Identity = Depends(require(Permission.MONITORING_READ)),
) -> ClusterStatus:
    """Describe this node's cluster posture.

    Reports the node id, whether the node is clustered, whether it is the leader, its
    active-passive role, and the cached config version. Everything comes from cheap in-memory
    coordinator gates — no DB round-trip. Single-node (NullCoordinator) reports clustered=false,
    is_leader=true, role="single-node", config_version=0.
    """
    coordinator = engine.coordinator
    clustered = coordinator.is_clustered()
    is_leader = coordinator.is_leader()

    # Role is derived: an unclustered node is "single-node"; otherwise leadership decides.
    if not clustered:
        role = "single-node"
    elif is_leader:
        role = "primary"
    else:
        role = "standby"

    return ClusterStatus(
        node_id=coordinator.node_id,
        clustered=clustered,
        is_leader=is_leader,
        role=role,
        config_version=coordinator.config_version_cached(),
    )
1313
+
1314
@app.get("/cluster/nodes", response_model=ClusterNodeList)
async def cluster_nodes(
    engine: Engine = Depends(_get_engine),
    _user: Identity = Depends(require(Permission.MONITORING_READ)),
) -> ClusterNodeList:
    """List cluster membership and the leadership-lease state.

    One row per known node with liveness + derived leadership, plus the single leader's node_id
    and the authoritative leadership lease (owner + expiry). One-to-two DB reads on a real
    cluster (the shared ``nodes`` table + the ``leader_lease`` row); single-node synthesizes one
    self-entry with no DB.
    """
    coordinator = engine.coordinator

    nodes = []
    for member in await coordinator.cluster_members():
        nodes.append(
            ClusterNode(
                node_id=member.node_id,
                host=member.host,
                pid=member.pid,
                status=member.status,
                started_at=member.started_at,
                last_seen=member.last_seen,
                is_leader=member.is_leader,
            )
        )

    # The first node flagged as leader wins; None when no node is flagged.
    leader_id = None
    for node in nodes:
        if node.is_leader:
            leader_id = node.node_id
            break

    owner, expires_at = await coordinator.leadership_lease()
    return ClusterNodeList(
        nodes=nodes,
        leader_node_id=leader_id,
        lease_owner=owner,
        lease_expires_at=expires_at,
    )
1345
+
1346
@app.post("/status/integrity-check", response_model=IntegrityResult)
async def integrity_check(
    engine: Engine = Depends(_get_engine),
    _user: Identity = Depends(require(Permission.MONITORING_DIAGNOSE)),
) -> IntegrityResult:
    """Run a database integrity check on demand (PRAGMA quick_check).

    Requires the MONITORING_DIAGNOSE permission. The store's verdict and its detail text are
    passed through unchanged.
    """
    passed, report = await engine.store.integrity_check()
    return IntegrityResult(ok=passed, detail=report)
1354
+
1355
@app.websocket("/ws/stats")
async def ws_stats(websocket: WebSocket) -> None:
    """Push queue-depth stats to the console roughly once a second until it disconnects — the
    live monitor feed.

    The session is re-validated periodically so a revoked/expired/downgraded token can't keep
    streaming forever, and concurrent sockets are capped (API-WS). The connection slot is
    reserved *before* the handshake is awaited, so simultaneous connects cannot all pass the cap
    check and then overshoot it.

    Close codes sent by this handler: 1008 when the caller is not authorized (initially or on
    re-validation), 1011 when no engine is attached to the app, 1013 when the socket cap is hit.
    """
    identity = await authorize_ws(websocket, Permission.MONITORING_READ)
    if identity is None:
        await websocket.close(code=1008)  # policy violation (unauthenticated/forbidden)
        return
    state = websocket.app.state
    engine_obj: Engine | None = getattr(state, "engine", None)
    if engine_obj is None:
        await websocket.close(code=1011)
        return
    if getattr(state, "ws_count", 0) >= _MAX_WS_CONNECTIONS:
        await websocket.close(code=1013)  # try again later — too many live monitor sockets
        return
    # Reserve the slot with no await between the check above and this increment: on a single
    # event loop that makes check-and-reserve atomic. Incrementing only after accept() let
    # concurrent handshakes all pass the check first and exceed the cap.
    state.ws_count = getattr(state, "ws_count", 0) + 1
    try:
        auth = getattr(state, "auth", None)
        token = ws_token(websocket)
        await websocket.accept()
        elapsed = 0.0
        while True:
            await websocket.send_json({"outbox_by_status": await engine_obj.store.stats()})
            await asyncio.sleep(1.0)
            elapsed += 1.0
            if auth is not None and auth.enabled and elapsed >= _WS_REVALIDATE_SECONDS:
                elapsed = 0.0
                # activity=False: this keepalive must not reset the session's idle clock.
                current = await auth.identity_for_token(token, activity=False)
                if current is None or not current.has(Permission.MONITORING_READ):
                    await websocket.close(code=1008)
                    return
    except WebSocketDisconnect:
        return
    finally:
        # Always release the reserved slot, including when the handshake itself fails.
        state.ws_count = max(0, getattr(state, "ws_count", 1) - 1)
1393
+
1394
+ return app
1395
+
1396
+
1397
def _emit_bootstrap_admin(bootstrap: BootstrapAdmin, store_settings: StoreSettings) -> None:
    """Persist the one-time bootstrap password to a restricted file — never the rotating log.

    Until rotated it is a standing Administrator credential, so it must not land in NSSM's broadly
    readable stdout capture. Write it to an owner-only file the operator consumes and deletes; log
    only the location. Paired with server-side must_change_password enforcement, it dies at first login.

    The file is created empty and locked down *before* the secret is written, so the password is
    never on disk under default (umask / inherited-ACL) permissions, not even briefly.

    Args:
        bootstrap: The freshly created admin; its ``username`` and ``password`` are written out.
        store_settings: Store configuration; its ``path`` decides where the secret file goes.
    """
    # NOTE(review): when store_settings.path is empty this resolves "." and then takes .parent,
    # i.e. the *parent* of the working directory — confirm that location is intended.
    base = Path(store_settings.path or ".").resolve()
    secret_file = base.parent / "bootstrap-admin.txt"
    # Create (or reuse) the file with no content yet. mode applies only on creation and is
    # filtered by the umask; the real restriction is applied by _secure_file below.
    secret_file.touch(mode=0o600, exist_ok=True)
    # Reuse the store's platform-correct primitive: os.chmod(0o600) is a no-op on Windows (the NSSM
    # deployment target), so _secure_file sets an owner-only DACL via icacls there, chmod on POSIX.
    _secure_file(secret_file)
    # Only now write the credential, into the already-restricted file.
    secret_file.write_text(
        f"username: {bootstrap.username}\npassword: {bootstrap.password}\n", encoding="utf-8"
    )
    _log.warning(
        "Created bootstrap admin %r; one-time password written to %s — sign in, change it, then "
        "delete that file.",
        bootstrap.username,
        secret_file,
    )
1418
+
1419
+
1420
+ _SESSION_REAP_INTERVAL = 3600.0 # purge expired/idle sessions hourly to bound the sessions table
1421
+
1422
+
1423
async def _session_reaper(store: Store) -> None:
    """Purge expired session rows right away and then once per interval, until cancelled.

    A transient store error must not kill the reaper for the process lifetime (it would let the
    sessions table grow unbounded, and its stored exception could later abort lifespan shutdown) —
    so failures are logged and the purge is retried on the next interval (review M-33).
    """

    async def _purge_once() -> None:
        # One purge attempt: cancellation propagates, any other failure is logged and absorbed.
        try:
            await store.purge_expired_sessions()
        except asyncio.CancelledError:
            raise
        except Exception:
            _log.exception("session reaper: purge failed; will retry next interval")

    while True:
        await _purge_once()
        await asyncio.sleep(_SESSION_REAP_INTERVAL)
1437
+
1438
+
1439
def create_managed_app(
    *,
    db_path: str | Path | None = None,
    store_settings: StoreSettings | None = None,
    config_dir: str | Path | None = None,
    config_reload_roots: Sequence[str] = (),
    poll_interval: float = 0.25,
    synchronous: str = "NORMAL",
    inbound_bind_host: str = "127.0.0.1",
    allow_insecure_bind: bool = False,
    delivery_defaults: RetryPolicy | None = None,
    ordering_default: OrderingMode | None = None,
    internal_error_default: InternalErrorPolicy | None = None,
    buildup_default: BuildupThreshold | None = None,
    ack_after_default: AckAfter | None = None,
    max_correlation_depth: int = 8,
    env_values: Mapping[str, Any] | None = None,
    env_values_provider: Callable[[], Mapping[str, Any]] | None = None,
    auth_settings: AuthSettings | None = None,
    ai_settings: AiSettings | None = None,
    alerts_settings: AlertsSettings | None = None,
    retention_settings: RetentionSettings | None = None,
    cert_monitor_settings: CertMonitorSettings | None = None,
    api_tls_cert_file: str | None = None,
    reference_settings: ReferenceSettings | None = None,
    egress_settings: EgressSettings | None = None,
    shadow_settings: ShadowSettings | None = None,
    cluster_settings: ClusterSettings | None = None,
    approvals_settings: ApprovalsSettings | None = None,
    expose_docs: bool = False,
    ws_allowed_origins: Sequence[str] = (),
) -> FastAPI:
    """Build an app that owns its engine for its whole lifespan (CLI server / sync tests).

    Pass ``store_settings`` for full backend selection (the service path), or ``db_path`` (+optional
    ``synchronous``) as a SQLite shortcut. ``config_dir`` loads the code-first Connection/Router/
    Handler graph. ``auth_settings`` (when enabled) attaches an :class:`AuthService`, seeds the
    built-in roles, and creates a bootstrap admin on first run. The store is opened via the
    backend-agnostic :func:`~messagefoundry.store.open_store`.

    Startup is unwound on failure: once the alert notifier or the engine has been started, it is
    closed/stopped even if a later startup step raises, not only on a normal shutdown.

    Raises:
        ValueError: If neither ``store_settings`` nor ``db_path`` is given.
    """
    if store_settings is None:
        if db_path is None:
            raise ValueError("create_managed_app requires either store_settings or db_path")
        store_settings = sqlite_settings(db_path, synchronous=synchronous)
    resolved = store_settings

    @asynccontextmanager
    async def lifespan(app: FastAPI) -> AsyncIterator[None]:
        # Process-level last-resort: route any otherwise-unhandled asyncio task/callback exception
        # through safe_exc → the log, so it can't escape as a raw traceback (possible PHI) or die
        # silently (ASVS 16.5.4). Here because set_exception_handler needs the running loop.
        install_loop_exception_handler()
        # NOTE(review): the store is never closed explicitly in this function; presumably
        # Engine.stop() owns that — confirm, since a failure before engine.start() would leak it.
        store = await open_store(resolved)
        # Operational alert notifier (webhook/email). None when no transport is configured → the
        # engine falls back to the logging sink. Its background dispatch task is owned by this
        # lifespan: started here, drained + stopped after the engine in the outer finally below.
        notifier = notifier_from_settings(alerts_settings) if alerts_settings is not None else None
        if notifier is not None:
            notifier.start()
        try:
            # Cluster coordinator (Track B Step 3) — built from the opened store so a
            # Postgres-backed store can reach its pool. Returns the no-op NullCoordinator unless
            # [cluster].enabled on a Postgres store, so single-node is byte-identical. The Engine
            # owns its lifecycle (start/stop in engine.start()/stop()), so the lifespan only
            # constructs + passes it here.
            coordinator = build_coordinator(store, cluster_settings)
            engine = Engine(
                store,
                poll_interval=poll_interval,
                max_correlation_depth=max_correlation_depth,
                config_dir=config_dir,
                config_reload_roots=config_reload_roots,
                inbound_bind_host=inbound_bind_host,
                allow_insecure_bind=allow_insecure_bind,
                delivery_defaults=delivery_defaults,
                ordering_default=ordering_default,
                internal_error_default=internal_error_default,
                buildup_default=buildup_default,
                ack_after_default=ack_after_default,
                alert_sink=notifier,
                retention_settings=retention_settings,
                cert_monitor_settings=cert_monitor_settings,
                api_tls_cert_file=api_tls_cert_file,
                reference_settings=reference_settings,
                egress_settings=egress_settings,
                shadow_settings=shadow_settings,
                active_environment=ai_settings.environment if ai_settings else None,
                env_values=env_values,
                env_values_provider=env_values_provider,
                coordinator=coordinator,
                cluster_settings=cluster_settings,
            )
            if config_dir is not None:
                engine.add_registry(load_config(config_dir))
            await engine.start()
            reaper: asyncio.Task[None] | None = None
            security_notifier = None
            # From here the engine is running, so everything below — including the remaining
            # startup steps — sits inside the try whose finally stops it. Previously only the
            # bare ``yield`` was protected, so a failing auth bootstrap left the engine running.
            try:
                app.state.engine = engine
                app.state.approval_gate = _build_approval_gate(
                    engine, approvals_settings or ApprovalsSettings()
                )
                if auth_settings is not None and auth_settings.enabled:
                    # Out-of-band security-event email (ASVS 6.3.5/6.3.7) — reuses the [alerts]
                    # SMTP transport, sent to each affected user's own address. None when disabled
                    # or no SMTP configured; the /me/security-events feed still records events.
                    # Its background task is owned by this lifespan (started here, drained +
                    # closed after the engine in the finally below).
                    if auth_settings.notify_security_events and alerts_settings is not None:
                        security_notifier = security_notifier_from_settings(alerts_settings)
                        if security_notifier is not None:
                            security_notifier.start()
                    auth = AuthService(store, auth_settings, security_notifier=security_notifier)
                    bootstrap = await auth.initialize()
                    app.state.auth = auth
                    if bootstrap is not None:
                        _emit_bootstrap_admin(bootstrap, resolved)
                    reaper = asyncio.create_task(_session_reaper(store))
                yield
            finally:
                if reaper is not None:
                    reaper.cancel()
                    # gather(return_exceptions): absorbs both our cancellation AND any exception a
                    # previously-died reaper stored, so it can't propagate here and skip
                    # engine.stop() (review M-33).
                    await asyncio.gather(reaper, return_exceptions=True)
                await engine.stop()
                if security_notifier is not None:
                    # Drain queued user emails, bounded by SMTP timeout.
                    await security_notifier.aclose()
        finally:
            if notifier is not None:
                # Stop accepting alerts last (after the engine quiesces) so any final
                # connection_stopped/queue_buildup still drains; bounded by the transport timeouts.
                await notifier.aclose()

    # Auth disabled (or unset) → explicitly run open (dev/loopback; __main__ refuses a non-loopback
    # serve when auth is off). Auth enabled → fail-closed until the lifespan attaches the service.
    allow_no_auth = auth_settings is None or not auth_settings.enabled
    return create_app(
        lifespan=lifespan,
        ai_settings=ai_settings,
        expose_docs=expose_docs,
        allow_no_auth=allow_no_auth,
        ws_allowed_origins=ws_allowed_origins,
    )