messagefoundry 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- messagefoundry/__init__.py +108 -0
- messagefoundry/__main__.py +1155 -0
- messagefoundry/api/__init__.py +27 -0
- messagefoundry/api/app.py +1581 -0
- messagefoundry/api/approvals.py +184 -0
- messagefoundry/api/auth_models.py +211 -0
- messagefoundry/api/auth_routes.py +655 -0
- messagefoundry/api/field_authz.py +96 -0
- messagefoundry/api/models.py +374 -0
- messagefoundry/api/security.py +247 -0
- messagefoundry/api/tls.py +47 -0
- messagefoundry/auth/__init__.py +39 -0
- messagefoundry/auth/data/common_passwords.NOTICE +13 -0
- messagefoundry/auth/data/common_passwords.txt +10000 -0
- messagefoundry/auth/identity.py +71 -0
- messagefoundry/auth/ldap.py +264 -0
- messagefoundry/auth/notifications.py +68 -0
- messagefoundry/auth/passwords.py +53 -0
- messagefoundry/auth/permissions.py +120 -0
- messagefoundry/auth/policy.py +153 -0
- messagefoundry/auth/ratelimit.py +55 -0
- messagefoundry/auth/service.py +1323 -0
- messagefoundry/auth/tokens.py +26 -0
- messagefoundry/auth/totp.py +174 -0
- messagefoundry/checks.py +174 -0
- messagefoundry/config/__init__.py +30 -0
- messagefoundry/config/active_environment.py +80 -0
- messagefoundry/config/ai_policy.py +140 -0
- messagefoundry/config/code_sets.py +260 -0
- messagefoundry/config/connections_edit.py +200 -0
- messagefoundry/config/connections_file.py +287 -0
- messagefoundry/config/db_lookup.py +117 -0
- messagefoundry/config/environments.py +116 -0
- messagefoundry/config/ingest_time.py +83 -0
- messagefoundry/config/models.py +240 -0
- messagefoundry/config/reference.py +158 -0
- messagefoundry/config/response.py +83 -0
- messagefoundry/config/run_context.py +153 -0
- messagefoundry/config/settings.py +1311 -0
- messagefoundry/config/state.py +99 -0
- messagefoundry/config/tls_policy.py +110 -0
- messagefoundry/config/wiring.py +1918 -0
- messagefoundry/console/__init__.py +20 -0
- messagefoundry/console/__main__.py +274 -0
- messagefoundry/console/_async.py +107 -0
- messagefoundry/console/change_password.py +111 -0
- messagefoundry/console/client.py +552 -0
- messagefoundry/console/connections.py +324 -0
- messagefoundry/console/login.py +107 -0
- messagefoundry/console/mfa.py +205 -0
- messagefoundry/console/reauth.py +94 -0
- messagefoundry/console/search.py +57 -0
- messagefoundry/console/service_control.py +137 -0
- messagefoundry/console/sessions.py +122 -0
- messagefoundry/console/shell.py +410 -0
- messagefoundry/console/status.py +377 -0
- messagefoundry/console/users_page.py +282 -0
- messagefoundry/console/widgets.py +553 -0
- messagefoundry/generators/README.md +27 -0
- messagefoundry/generators/__init__.py +15 -0
- messagefoundry/generators/_core.py +589 -0
- messagefoundry/generators/_hl7data.py +428 -0
- messagefoundry/generators/adt.py +286 -0
- messagefoundry/generators/all_types.py +24 -0
- messagefoundry/generators/bar.py +28 -0
- messagefoundry/generators/dft.py +20 -0
- messagefoundry/generators/mdm.py +39 -0
- messagefoundry/generators/mfn.py +46 -0
- messagefoundry/generators/oml.py +32 -0
- messagefoundry/generators/orl.py +30 -0
- messagefoundry/generators/orm.py +23 -0
- messagefoundry/generators/oru.py +21 -0
- messagefoundry/generators/ras.py +20 -0
- messagefoundry/generators/rde.py +54 -0
- messagefoundry/generators/siu.py +64 -0
- messagefoundry/generators/vxu.py +20 -0
- messagefoundry/hl7schema.py +75 -0
- messagefoundry/last_resort.py +55 -0
- messagefoundry/logging_setup.py +332 -0
- messagefoundry/parsing/__init__.py +64 -0
- messagefoundry/parsing/consistency.py +166 -0
- messagefoundry/parsing/groups.py +228 -0
- messagefoundry/parsing/message.py +453 -0
- messagefoundry/parsing/peek.py +237 -0
- messagefoundry/parsing/split.py +120 -0
- messagefoundry/parsing/summary.py +46 -0
- messagefoundry/parsing/tree.py +128 -0
- messagefoundry/parsing/validate.py +95 -0
- messagefoundry/parsing/x12/__init__.py +46 -0
- messagefoundry/parsing/x12/delimiters.py +140 -0
- messagefoundry/parsing/x12/errors.py +30 -0
- messagefoundry/parsing/x12/interchange.py +232 -0
- messagefoundry/parsing/x12/message.py +200 -0
- messagefoundry/parsing/x12/peek.py +207 -0
- messagefoundry/pipeline/__init__.py +21 -0
- messagefoundry/pipeline/alert_sinks.py +486 -0
- messagefoundry/pipeline/alerts.py +100 -0
- messagefoundry/pipeline/cert_expiry.py +219 -0
- messagefoundry/pipeline/cluster.py +955 -0
- messagefoundry/pipeline/cluster_sqlserver.py +444 -0
- messagefoundry/pipeline/config_convergence.py +137 -0
- messagefoundry/pipeline/dryrun.py +450 -0
- messagefoundry/pipeline/engine.py +756 -0
- messagefoundry/pipeline/leader_tasks.py +158 -0
- messagefoundry/pipeline/reference_sync.py +369 -0
- messagefoundry/pipeline/retention.py +289 -0
- messagefoundry/pipeline/security_notify.py +168 -0
- messagefoundry/pipeline/state_convergence.py +143 -0
- messagefoundry/pipeline/wiring_runner.py +1722 -0
- messagefoundry/py.typed +0 -0
- messagefoundry/redaction.py +71 -0
- messagefoundry/scaffold.py +321 -0
- messagefoundry/secrets_dpapi.py +129 -0
- messagefoundry/store/__init__.py +46 -0
- messagefoundry/store/audit_tee.py +67 -0
- messagefoundry/store/base.py +758 -0
- messagefoundry/store/crypto.py +166 -0
- messagefoundry/store/keyprovider.py +192 -0
- messagefoundry/store/postgres.py +3447 -0
- messagefoundry/store/sqlserver.py +3014 -0
- messagefoundry/store/store.py +3790 -0
- messagefoundry/timezone.py +207 -0
- messagefoundry/transports/__init__.py +50 -0
- messagefoundry/transports/base.py +269 -0
- messagefoundry/transports/database.py +693 -0
- messagefoundry/transports/file.py +551 -0
- messagefoundry/transports/framing.py +164 -0
- messagefoundry/transports/loopback.py +53 -0
- messagefoundry/transports/mllp.py +644 -0
- messagefoundry/transports/remotefile.py +664 -0
- messagefoundry/transports/rest.py +281 -0
- messagefoundry/transports/signing.py +321 -0
- messagefoundry/transports/soap.py +507 -0
- messagefoundry/transports/tcp.py +307 -0
- messagefoundry/transports/timer.py +146 -0
- messagefoundry/transports/x12.py +323 -0
- messagefoundry-0.1.0.dist-info/METADATA +212 -0
- messagefoundry-0.1.0.dist-info/RECORD +142 -0
- messagefoundry-0.1.0.dist-info/WHEEL +4 -0
- messagefoundry-0.1.0.dist-info/entry_points.txt +2 -0
- messagefoundry-0.1.0.dist-info/licenses/LICENSE +662 -0
- messagefoundry-0.1.0.dist-info/licenses/NOTICE +27 -0
|
@@ -0,0 +1,551 @@
|
|
|
1
|
+
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
2
|
+
# Copyright (C) 2026 MessageFoundry Organization and contributors
|
|
3
|
+
"""File transport: directory destination + directory-polling source.
|
|
4
|
+
|
|
5
|
+
**Destination** writes each payload to a file in a directory. The filename may contain
|
|
6
|
+
``{HL7-path}`` placeholders (e.g. ``{MSH-10}.hl7``) resolved by peeking the payload, so
|
|
7
|
+
archived files are named by control id / message type. Writes are atomic (write to a
|
|
8
|
+
temp name, then ``rename``) so a reader watching the directory never sees a partial file.
|
|
9
|
+
|
|
10
|
+
**Source** polls a directory for files, hands each to the pipeline handler, then moves the
|
|
11
|
+
file into a ``.processed`` subdirectory (or ``.error`` if the handler raised). Files have
|
|
12
|
+
no reply channel, so the handler's return value is ignored.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import asyncio
|
|
18
|
+
import logging
|
|
19
|
+
import os
|
|
20
|
+
import re
|
|
21
|
+
import tempfile
|
|
22
|
+
import time
|
|
23
|
+
from collections.abc import Callable
|
|
24
|
+
from contextlib import suppress
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
|
|
27
|
+
from messagefoundry.config.models import ConnectorType, Destination, Source
|
|
28
|
+
from messagefoundry.parsing.peek import HL7PeekError, Peek
|
|
29
|
+
from messagefoundry.parsing.split import split_batch
|
|
30
|
+
from messagefoundry.transports.base import (
|
|
31
|
+
DeliveryError,
|
|
32
|
+
DestinationConnector,
|
|
33
|
+
InboundHandler,
|
|
34
|
+
SourceConnector,
|
|
35
|
+
register_destination,
|
|
36
|
+
register_source,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# Names exported by a star-import of this module.
# NOTE(review): ScanRejected, ScanHook, set_scan_hook and scan_inbound_file are defined further
# down in this module but are not listed here — confirm whether that omission is intentional.
__all__ = ["FileDestination", "FileSource", "render_filename", "DEFAULT_MAX_FILE_BYTES"]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
42
|
+
|
|
43
|
+
# DoS guard: upper bound on the size of a single inbound file, so a multi-GB drop can't OOM the
# engine. A falsy ``max_file_bytes`` setting (None/0) turns the cap off; see docs/CONNECTIONS.md.
DEFAULT_MAX_FILE_BYTES = 16 << 20  # 16 MiB — matches the MLLP frame cap

# A ``{SEG-n}``, ``{SEG-n.m}`` or ``{SEG-n.m.k}`` placeholder: an upper-case letter followed by two
# upper-case letters/digits, a dash, a field number and up to two dotted sub-indexes.
_PLACEHOLDER = re.compile(r"\{([A-Z][A-Z0-9]{2}-\d+(?:\.\d+){0,2})\}")

# Characters that are unsafe in a filename on Windows and POSIX alike. Both path separators are
# in the class, so a resolved value can never introduce a directory component.
_UNSAFE = re.compile(r'[<>:"/\\|?*\x00-\x1f]')

# Windows reserved device names (compared upper-cased, with any extension removed) — never usable.
_RESERVED = {"CON", "PRN", "AUX", "NUL"} | {
    f"{port}{digit}" for port in ("COM", "LPT") for digit in range(1, 10)
}
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def render_filename(template: str, payload: str, *, fallback: str) -> str:
    """Build one safe filename (never a path) from ``template`` and the HL7 ``payload``.

    Every ``{HL7-path}`` placeholder in ``template`` is replaced by the sanitized value of that
    field in ``payload``. A placeholder that cannot be resolved — the field is empty/missing, or
    the payload does not parse — is replaced by ``fallback`` instead, so building a name never
    fails a delivery.

    The finished name is sanitized once more (unsafe characters replaced, leading dots removed).
    If what remains is empty, is ``.``/``..``, or its part before the first dot is a reserved
    device name, ``fallback`` is returned as the whole name. This keeps an attacker-controlled
    field from writing outside the target directory or shadowing ``.processed``/``.error``
    (FILE-1).
    """
    parsed: Peek | None
    try:
        parsed = Peek.parse(payload)
    except HL7PeekError:
        parsed = None

    def substitute(found: re.Match[str]) -> str:
        # Truthiness (not an ``is None`` test) decides whether the parsed payload is consulted.
        resolved = parsed.field(found.group(1)) if parsed else None
        if not resolved:
            return fallback
        return _sanitize(resolved)

    candidate = _sanitize(_PLACEHOLDER.sub(substitute, template))
    if not candidate or candidate in (".", ".."):
        return fallback
    # Device names are reserved with or without an extension, so compare only the leading part.
    device = candidate.split(".", 1)[0].upper()
    return fallback if device in _RESERVED else candidate
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _sanitize(value: str) -> str:
    """Return ``value`` reduced to a safe single-component filename.

    Each character matched by ``_UNSAFE`` becomes ``_``; leading dots are then stripped, since
    they would produce hidden files or a ``.``/``..`` traversal component.
    """
    replaced = _UNSAFE.sub("_", value)
    return replaced.lstrip(".")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _probe_dir_writable(directory: Path) -> None:
    """Reachability probe shared by the FILE connectors.

    Ensures ``directory`` exists (creating it, with parents, when needed) and accepts a write by
    creating and then removing a ``*.probe`` temp file in it. A destination writes messages there
    and a source moves processed files into its subdirectories, so writability is the meaningful
    check for both.

    Raises:
        OSError: the directory cannot be created, or the probe file cannot be created, closed
            or removed.
    """
    directory.mkdir(parents=True, exist_ok=True)
    fd, probe = tempfile.mkstemp(dir=directory, suffix=".probe")
    try:
        os.close(fd)
    finally:
        # Always remove the probe, even when closing the descriptor failed: a stray ``*.probe``
        # file would otherwise be left behind in a directory a file source may be polling.
        os.unlink(probe)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
class FileDestination(DestinationConnector):
    """Deliver each payload as one file inside a configured directory.

    Settings read from ``config.settings``: ``directory`` (required), ``filename`` (name template,
    default ``{MSH-10}.hl7``), ``overwrite`` (default ``False``) and ``encoding`` (default
    ``utf-8``).
    """

    def __init__(self, config: Destination) -> None:
        """Read the connector settings; raise ``ValueError`` when ``directory`` is missing."""
        settings = config.settings
        if "directory" not in settings:
            raise ValueError("file destination requires a 'directory' setting")
        self.directory = Path(settings["directory"])
        self.filename_template: str = settings.get("filename", "{MSH-10}.hl7")
        self.encoding: str = settings.get("encoding", "utf-8")
        # With overwrite off, two messages resolving to the same name must not clobber each
        # other; the second one is published under a different free name instead.
        self._overwrite: bool = bool(settings.get("overwrite", False))

    async def send(self, payload: str) -> None:
        """Write ``payload`` to a file on a worker thread.

        An ``OSError`` from the write is re-raised as ``DeliveryError``.
        """
        try:
            await asyncio.to_thread(self._write, payload)
        except OSError as err:
            raise DeliveryError(f"file write failed: {err}") from err

    async def test_connection(self) -> None:
        """Probe that the directory exists and is writable; raise ``DeliveryError`` if not."""
        try:
            await asyncio.to_thread(_probe_dir_writable, self.directory)
        except OSError as err:
            raise DeliveryError(f"file directory {self.directory} not writable: {err}") from err

    def _write(self, payload: str) -> None:
        """Blocking write: render the name, stage the bytes in a temp file, then publish it."""
        self.directory.mkdir(parents=True, exist_ok=True)
        filename = render_filename(self.filename_template, payload, fallback="message.hl7")
        target = self.directory / filename
        # Defence in depth on top of the filename sanitization (FILE-1): refuse any target that
        # does not resolve to somewhere below the configured directory.
        root = self.directory.resolve()
        if root not in target.resolve().parents:
            raise DeliveryError(
                f"refusing to write outside the destination directory: {filename!r}"
            )
        encoded = payload.encode(self.encoding)
        # Stage into a uniquely named temp file (mkstemp — no shared counter, no name race) and
        # only then publish. Without overwrite the final name is claimed by _claim_unique so two
        # concurrent deliveries can't clobber each other (FILE-5: replaces the TOCTOU
        # exists()-then-rename).
        fd, scratch_name = tempfile.mkstemp(dir=self.directory, suffix=".part")
        scratch = Path(scratch_name)
        try:
            with os.fdopen(fd, "wb") as out:
                out.write(encoded)
            # os.replace overwrites atomically and consumes the temp file; _claim_unique links
            # the temp file to a free name instead.
            publish = os.replace if self._overwrite else _claim_unique
            publish(scratch, target)
        finally:
            # Drop the temp file; after a successful os.replace it no longer exists, which is
            # why the error is suppressed.
            with suppress(OSError):
                os.unlink(scratch)
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class FileSource(SourceConnector):
    """Poll a directory for files and feed each to the pipeline handler.

    Settings read from ``config.settings``:

    * ``directory`` (required) — the directory to watch.
    * ``pattern`` (default ``"*"``) — glob pattern for candidate files.
    * ``poll_seconds`` (default ``1.0``) — wait between scans.
    * ``min_age_seconds`` (default ``0.0``) — when positive, skip files modified more recently.
    * ``after_read`` (default ``"move"``) — ``"delete"`` unlinks a processed file; any other value
      moves it into the processed directory.
    * ``sort`` (default ``"name"``) — ``"mtime"`` orders by modification time; any other value
      orders by file name.
    * ``recursive`` (default ``False``) — use ``rglob`` instead of ``glob``.
    * ``encoding`` (default ``"utf-8"``) — used to decode a file for batch splitting and to
      re-encode the split messages.
    * ``max_file_bytes`` (default ``DEFAULT_MAX_FILE_BYTES``) — size cap; a falsy value disables it.
    * ``processed_subdir`` / ``error_subdir`` (defaults ``".processed"`` / ``".error"``) —
      subdirectories of ``directory`` that receive processed and rejected files.
    """

    polls_shared_resource = True  # a directory is a shared external resource — leader-gate it

    def __init__(self, config: Source) -> None:
        """Read the connector settings; raise ``ValueError`` when ``directory`` is missing."""
        s = config.settings
        if "directory" not in s:
            raise ValueError("file source requires a 'directory' setting")
        self.directory = Path(s["directory"])
        # Resolved watch root for path-confinement: a recursive scan must not be walked out of the
        # configured directory via a symlinked file/subdir (see _within_root). resolve() is
        # non-strict, so it's fine that the directory is created later in start().
        self._root_real = self.directory.resolve()
        self.pattern: str = s.get("pattern", "*")
        self.poll_seconds: float = float(s.get("poll_seconds", 1.0))
        self.min_age_seconds: float = float(s.get("min_age_seconds", 0.0))
        self.after_read: str = s.get("after_read", "move")  # "move" | "delete"
        self.sort: str = s.get("sort", "name")  # "name" | "mtime"
        self.recursive: bool = bool(s.get("recursive", False))
        # Encoding used to re-encode split batch messages back to bytes for the handler. A single
        # (non-batch) message is handed off verbatim, so its bytes never round-trip through this.
        self.encoding: str = s.get("encoding", "utf-8")
        mfb = s.get("max_file_bytes", DEFAULT_MAX_FILE_BYTES)
        # Any falsy setting (None/0) means "no cap" and is stored as None.
        self.max_file_bytes: int | None = int(mfb) if mfb else None
        self.processed_dir = self.directory / s.get("processed_subdir", ".processed")
        self.error_dir = self.directory / s.get("error_subdir", ".error")
        self._handler: InboundHandler | None = None
        # Leader-gate (Track B Step 4b): when set, this directory (a shared external resource) is
        # polled only while the gate returns True, so in a cluster exactly one node ingests its
        # files. None = always poll (single-node / direct callers / tests) — byte-identical.
        self._leader_gate: Callable[[], bool] | None = None
        self._skipping = False  # whether the last tick was gated out (for a single transition log)
        self._stop = asyncio.Event()
        self._task: asyncio.Task[None] | None = None

    async def start(
        self, handler: InboundHandler, *, leader_gate: Callable[[], bool] | None = None
    ) -> None:
        """Begin polling in the background. Returns once the source is set up so the
        caller can rely on it being live (consistent with the TCP sources).

        Stores ``handler`` and ``leader_gate``, clears the stop event, creates the processed and
        error directories, and spawns the poll loop as a task."""
        self._handler = handler
        self._leader_gate = leader_gate
        self._stop.clear()
        self.processed_dir.mkdir(parents=True, exist_ok=True)
        self.error_dir.mkdir(parents=True, exist_ok=True)
        self._task = asyncio.create_task(self._run())

    async def test_connection(self) -> None:
        """Probe that the watch directory exists and is writable; raise ``DeliveryError`` if the
        probe fails with ``OSError``."""
        try:
            await asyncio.to_thread(_probe_dir_writable, self.directory)
        except OSError as exc:
            raise DeliveryError(f"file directory {self.directory} not writable: {exc}") from exc

    async def _run(self) -> None:
        """Poll loop: scan once when polling is allowed, then wait up to ``poll_seconds`` (or
        until the stop event is set) before the next tick."""
        while not self._stop.is_set():
            try:
                if self._may_poll():
                    await self._scan_once()
            except asyncio.CancelledError:
                raise
            except Exception:
                # A scan error (watch dir vanished/unreadable, a bad glob, a move/read failure) must
                # NOT kill the poller — that would silently stop the connection from receiving while
                # it still reports running, and re-raise inside stop()/reload (review H-4). Log and
                # retry on the next interval.
                logger.exception(
                    "file source scan failed for %s; retrying next poll", self.directory
                )
            try:
                # Doubles as the sleep: returns early as soon as stop() sets the event.
                await asyncio.wait_for(self._stop.wait(), self.poll_seconds)
            except asyncio.TimeoutError:
                pass  # poll interval elapsed; scan again

    def _may_poll(self) -> bool:
        """Whether this tick may scan the directory. False on a follower (leader-gated, Step 4b):
        a non-leader must NOT read or move/delete files, since the directory is shared and two
        nodes ingesting it would duplicate intake. The loop still ticks, so a node that becomes
        leader scans on its next tick (reactive-by-polling, no restart). When the gate is None or
        True, behaves exactly as before. Logged once on each transition (never per skipped tick —
        that would spam a follower's log every poll interval)."""
        if self._leader_gate is None or self._leader_gate():
            if self._skipping:
                self._skipping = False
                logger.debug("file source resuming polling of %s (now leader)", self.directory)
            return True
        if not self._skipping:
            self._skipping = True
            logger.debug(
                "file source skipping polling of %s (not leader; another node ingests it)",
                self.directory,
            )
        return False

    async def stop(self) -> None:
        """Set the stop event and wait for the poll task (if any) to finish; exceptions from the
        task are collected rather than re-raised."""
        self._stop.set()
        if self._task is not None:
            # return_exceptions: a faulted poll task must not re-raise here — stop() runs during
            # reload quiesce, outside its rollback (review H-4). _run already guards scans; this is
            # the belt-and-suspenders.
            await asyncio.gather(self._task, return_exceptions=True)
            self._task = None

    async def _scan_once(self) -> None:
        """Process every current candidate file once, in the order ``_candidates`` returns them.

        Per file: an over-cap file, a file that does not look like HL7, and a file rejected by the
        pre-ingest scan hook are each moved to the error directory without being emitted. A file
        that cannot be read, or whose hand-off raises, is left in place for the next scan. A file
        that was emitted successfully is deleted or moved via ``_after_processing``."""
        assert self._handler is not None
        for path in await asyncio.to_thread(self._candidates):
            if await asyncio.to_thread(self._oversize, path):
                # Transport-level reject *before* any message is read — parallels MLLP dropping an
                # over-cap frame. It never became a "received message", so (like MLLP) there's no
                # store disposition to record; preserve the file in .error for the operator and log it.
                logger.warning(
                    "file %s exceeds max_file_bytes (%s); routing to error dir",
                    path.name,
                    self.max_file_bytes,
                )
                await asyncio.to_thread(self._move, path, self.error_dir)
                continue
            try:
                raw = await asyncio.to_thread(path.read_bytes)
            except OSError as exc:
                # Transient (file locked / vanished mid-scan): leave it in place to retry next scan
                # rather than quarantining a healthy file. Logged, never silently swallowed.
                logger.warning("could not read %s (will retry next scan): %s", path.name, exc)
                continue
            if not _looks_like_hl7(raw):
                # Content doesn't match the declared .hl7 type (binary / non-HL7 text) — quarantine
                # before its bytes reach the pipeline (ASVS 5.2.2). Like the oversize reject above, it
                # never became a "received message", so there's no store disposition; preserve it in
                # .error and log it (never a silent drop).
                logger.warning(
                    "file %s is not HL7 (no MSH/FHS/BHS header); routing to error dir", path.name
                )
                await asyncio.to_thread(self._move, path, self.error_dir)
                continue
            try:
                await asyncio.to_thread(scan_inbound_file, raw, path.name)
            except ScanRejected as exc:
                # A configured pre-ingest scanner (AV/ICAP/plugin) rejected the content before it
                # entered the pipeline (ASVS 5.4.3). Like the oversize / non-HL7 rejects above, it
                # never became a "received message", so there's no store disposition; quarantine + log.
                logger.warning(
                    "file %s rejected by the pre-ingest scan hook (%s); routing to error dir",
                    path.name,
                    exc,
                )
                await asyncio.to_thread(self._move, path, self.error_dir)
                continue
            try:
                await self._emit(raw)
            except Exception as exc:
                # The handler records every message-level outcome (parse/validation/routing → ERROR)
                # itself and returns, so an exception escaping here is an infrastructure failure: the
                # durable store write failed (DB locked, disk full). Leave the file in place so the
                # next scan retries once the store recovers (at-least-once) — moving it to .error would
                # drop a *received* message that was never recorded, an accept-and-drop (review M-15).
                #
                # CRITICAL (Tier 2.2 batch split): a batch is split into N hand-offs (_emit), and the
                # file is moved/deleted ONLY after ALL of them succeed (below). If hand-off K fails,
                # we `continue` WITHOUT moving the file, so the next scan re-reads the WHOLE file and
                # re-emits every message 1..N. That is at-least-once: messages 1..K-1 may be re-emitted
                # (duplicates, acceptable — handlers are idempotent), but the file is NEVER moved with
                # only some of its messages emitted (no accept-and-drop of the tail).
                logger.warning("handler failed for %s (will retry next scan): %s", path.name, exc)
                continue
            await asyncio.to_thread(self._after_processing, path)

    async def _emit(self, raw: bytes) -> None:
        """Hand every HL7 message in ``raw`` to the pipeline handler, in file order (FIFO).

        Corepoint-style **batch split** (Tier 2.2-A): a dropped file may hold several MSH-delimited
        messages (a batch, or an FHS/BHS envelope). Each becomes one pipeline hand-off — the same
        per-message split a dry-run / ``messagefoundry check`` sees, via the shared
        :func:`~messagefoundry.parsing.split.split_batch`.

        Splitting must decode the bytes to find the MSH boundaries, so we decode with the
        connection's **declared encoding** (``errors="strict"``) — never UTF-8 by accident — so a
        non-UTF-8 batch (e.g. latin-1) splits without mojibake. If the file isn't decodable in that
        encoding, or it holds a single message, the **original bytes are handed off verbatim** (one
        hand-off): a single-message file is then byte-for-byte identical to before the split existed,
        and an undecodable file flows to the pipeline unchanged so its ``normalize(errors="strict")``
        records the proper ``ERROR`` disposition exactly as today (we don't pre-empt that here). A
        true batch is split and each message **re-encoded with the same declared encoding**, so the
        handler still receives ``bytes`` exactly as in the un-split path.

        Any exception (a durable-store failure on hand-off K) propagates to the caller, which then
        leaves the whole file in place for the next scan — preserving at-least-once with no partial
        move (see :meth:`_scan_once`)."""
        assert self._handler is not None
        try:
            text = raw.decode(self.encoding)
        except (UnicodeDecodeError, LookupError):
            # Not decodable in the declared encoding (or an unknown codec name): can't safely find MSH
            # boundaries, so hand the raw bytes off unchanged — the pipeline's strict-decode then
            # records ERROR for it, exactly as in the pre-split single-hand-off path. Never a drop.
            await self._handler(raw)
            return
        messages = split_batch(
            text
        )  # str in → no UTF-8 re-decode (normalize only fixes line endings)
        if len(messages) == 1:
            # Fast path / strict back-compat: a lone message is handed off verbatim (its original
            # bytes), so a non-batch file behaves byte-for-byte as before the split was introduced.
            await self._handler(raw)
            return
        for message in messages:
            # FIFO per connection: emit in file order, awaiting each so a slow/failing hand-off
            # back-pressures the rest (and a failure stops the file from being moved — see above).
            await self._handler(message.encode(self.encoding))

    def _oversize(self, path: Path) -> bool:
        """True if ``path`` is larger than the configured cap (checked before reading it).

        Always False when the cap is disabled or the file cannot be stat'ed."""
        if self.max_file_bytes is None:
            return False
        try:
            return path.stat().st_size > self.max_file_bytes
        except OSError:
            return False  # vanished/locked — let the read path handle it

    def _candidates(self) -> list[Path]:
        """Files ready to process, honoring recursion, min-age, and sort order.

        Excludes anything below the processed/error directories and anything that resolves outside
        the watch root. Returns an empty list when the directory cannot be listed."""
        globber = self.directory.rglob if self.recursive else self.directory.glob
        try:
            matched = list(globber(self.pattern))
        except (OSError, ValueError) as exc:
            # Watch dir vanished/unreadable, or an invalid glob pattern: treat as "nothing this
            # scan" (logged) rather than letting it propagate and kill the poller (review H-4).
            logger.warning(
                "file source could not list %s (pattern %r): %s", self.directory, self.pattern, exc
            )
            return []
        files = [
            p
            for p in matched
            if p.is_file()
            and self.processed_dir not in p.parents
            and self.error_dir not in p.parents
            and self._within_root(p)
        ]
        if self.min_age_seconds > 0:
            cutoff = time.time() - self.min_age_seconds
            # _mtime is a module-level helper that is not shown in this part of the file.
            files = [p for p in files if _mtime(p) <= cutoff]  # skip files still being written
        if self.sort == "mtime":
            files.sort(key=_mtime)
        else:
            # Orders by the final path component only, so in recursive mode files from different
            # subdirectories are interleaved by base name.
            files.sort(key=lambda p: p.name)
        return files

    def _within_root(self, path: Path) -> bool:
        """True if ``path`` resolves inside the configured watch root.

        A symlinked file or subdirectory that points outside the root (e.g. ``in/link -> /etc``)
        resolves elsewhere and is skipped, so a recursive scan can't be walked out of its directory
        to read arbitrary files (path-confinement / symlink-escape guard)."""
        try:
            resolved = path.resolve()
        except OSError:
            return False
        if resolved == self._root_real or self._root_real in resolved.parents:
            return True
        logger.warning(
            "file source: skipping %s — it resolves outside the watch root (symlink escape?)",
            path.name,
        )
        return False

    def _after_processing(self, path: Path) -> None:
        """Dispose of a successfully emitted file: unlink it when ``after_read`` is ``"delete"``,
        otherwise move it into the processed directory. A failed delete is logged, not raised."""
        if self.after_read == "delete":
            try:
                path.unlink()
            except OSError as exc:
                # A processed file we can't delete will be re-read (duplicate); surface it (FILE-4).
                logger.warning("could not delete processed file %s: %s", path.name, exc)
        else:
            self._move(path, self.processed_dir)

    @staticmethod
    def _move(path: Path, dest_dir: Path) -> None:
        """Move ``path`` into ``dest_dir`` under the name chosen by ``_unique`` (a module-level
        helper not shown in this part of the file). A failed move is logged, not raised."""
        try:
            path.replace(_unique(dest_dir / path.name))
        except OSError as exc:
            # A stuck file (locked / dest unwritable) stays and is re-read; log it (FILE-4).
            logger.warning("could not move %s to %s: %s", path.name, dest_dir.name, exc)
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# --- helpers -----------------------------------------------------------------
|
|
439
|
+
|
|
440
|
+
|
|
441
|
+
# The segment ids an HL7 v2 payload may open with: a single message (MSH) or a file/batch
# envelope (FHS/BHS).
_HL7_LEADING_SEGMENTS = (b"MSH", b"FHS", b"BHS")


def _looks_like_hl7(raw: bytes) -> bool:
    """Cheap content sniff: report whether ``raw`` opens with an HL7 v2 header segment (ASVS 5.2.2).

    Leading MLLP start bytes (``0x0b``) and whitespace are skipped, then one optional UTF-8 BOM,
    then MLLP start bytes/whitespace again; the three bytes that follow must be ``MSH``, ``FHS``
    or ``BHS``. Only the very start of the content is inspected, so a binary or non-HL7 file that
    merely carries an ``.hl7`` name is rejected, while an oddly structured but textual HL7 message
    still passes and is left for the parser to judge.
    """
    padding = b"\x0b\r\n \t"
    utf8_bom = b"\xef\xbb\xbf"
    body = raw.lstrip(padding)
    if body[:3] == utf8_bom:
        body = body[3:].lstrip(padding)
    # Every accepted id is exactly three bytes, so a prefix test equals comparing body[:3].
    return body.startswith(_HL7_LEADING_SEGMENTS)
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
class ScanRejected(Exception):
    """Rejection signal raised by a pre-ingest scan hook for malicious/disallowed content (ASVS 5.4.3).

    When a hook raises this for an inbound file, the connector quarantines that file to its error
    dir and never emits it.
    """
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
#: Pre-ingest content-scan hook: ``(raw_bytes, source_label) -> None``; raise :class:`ScanRejected`
#: to reject. ``(bytes, str)`` so an operator scanner can label its logs. Default = no-op.
ScanHook = Callable[[bytes, str], None]


def _no_scan(raw: bytes, source: str) -> None:
    """Default hook: accept every payload without inspecting it."""
    return None


#: The hook currently in effect; only :func:`set_scan_hook` rebinds it.
_scan_hook: ScanHook = _no_scan


def set_scan_hook(hook: ScanHook | None) -> None:
    """Install (or clear, with ``None``) the pre-ingest content-scan hook (ASVS 5.4.3).

    MessageFoundry ships **no** built-in antivirus/malware scan: the supported model trusts the drop
    directory, and a less-trusted or remote source should be fronted by an AV/ICAP gateway (see
    docs/CONNECTIONS.md). This seam lets an operator/plugin install an in-process scanner that runs over
    the raw bytes of every inbound file — both the local FILE source and the remote SFTP/FTP(S) source —
    *before* they enter the pipeline; it must raise :class:`ScanRejected` to reject content, which the
    connector then quarantines to its error dir (never emitted). Format-agnostic (it sees raw bytes), so
    it works for HL7, X12, or any payload.

    Only ``None`` restores the default no-op. Any other object is installed as given, including a
    callable scanner instance that happens to be falsy (for example one defining ``__len__`` that
    returns 0).
    """
    global _scan_hook
    # Test identity against None rather than truthiness: ``hook or _no_scan`` would silently swap
    # a falsy-but-callable scanner object for the no-op and disable scanning without any error.
    _scan_hook = _no_scan if hook is None else hook


def scan_inbound_file(raw: bytes, source: str) -> None:
    """Run the configured pre-ingest scan hook over ``raw`` (default no-op).

    The hook raises :class:`ScanRejected` to reject; the exception propagates to the caller, which
    quarantines the file and never emits it. Call this off the event loop, because an installed
    hook may do blocking I/O to an AV/ICAP service.
    """
    _scan_hook(raw, source)
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
def _claim_unique(tmp: Path, target: Path) -> Path:
|
|
498
|
+
"""Claim ``target`` (or ``name-1.ext``, ``name-2.ext``, … if taken) for ``tmp``, atomically.
|
|
499
|
+
|
|
500
|
+
Prefers ``os.link`` (the target becomes a hard link to ``tmp``); ``FileExistsError`` means the
|
|
501
|
+
name is taken, so claiming a free name is a single atomic step — no check-then-act window where
|
|
502
|
+
a concurrent writer could clobber us. Where hard links aren't supported (FAT/exFAT, many SMB/NAS
|
|
503
|
+
mounts) ``os.link`` raises a different ``OSError``; fall back to an exclusive-create copy
|
|
504
|
+
(``O_CREAT | O_EXCL``), which is also atomic no-clobber but works cross-filesystem (review low-5)."""
|
|
505
|
+
stem, suffix = target.stem, target.suffix
|
|
506
|
+
candidate, n = target, 0
|
|
507
|
+
linkable = True
|
|
508
|
+
while True:
|
|
509
|
+
if linkable:
|
|
510
|
+
try:
|
|
511
|
+
os.link(tmp, candidate)
|
|
512
|
+
return candidate
|
|
513
|
+
except FileExistsError:
|
|
514
|
+
n += 1
|
|
515
|
+
candidate = target.with_name(f"{stem}-{n}{suffix}")
|
|
516
|
+
continue
|
|
517
|
+
except OSError:
|
|
518
|
+
linkable = False # hard links unusable on this filesystem — copy instead
|
|
519
|
+
try:
|
|
520
|
+
fd = os.open(candidate, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
|
|
521
|
+
except FileExistsError:
|
|
522
|
+
n += 1
|
|
523
|
+
candidate = target.with_name(f"{stem}-{n}{suffix}")
|
|
524
|
+
continue
|
|
525
|
+
with os.fdopen(fd, "wb") as handle:
|
|
526
|
+
handle.write(tmp.read_bytes())
|
|
527
|
+
return candidate
|
|
528
|
+
|
|
529
|
+
|
|
530
|
+
def _mtime(p: Path) -> float:
|
|
531
|
+
try:
|
|
532
|
+
return p.stat().st_mtime
|
|
533
|
+
except OSError:
|
|
534
|
+
return 0.0
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _unique(target: Path) -> Path:
|
|
538
|
+
"""Return ``target`` or, if it exists, ``name-1.ext``, ``name-2.ext``, …"""
|
|
539
|
+
if not target.exists():
|
|
540
|
+
return target
|
|
541
|
+
stem, suffix = target.stem, target.suffix
|
|
542
|
+
n = 1
|
|
543
|
+
while True:
|
|
544
|
+
candidate = target.with_name(f"{stem}-{n}{suffix}")
|
|
545
|
+
if not candidate.exists():
|
|
546
|
+
return candidate
|
|
547
|
+
n += 1
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
# Register this module's connector classes under ConnectorType.FILE: FileDestination as the
# destination and FileSource as the source.
# NOTE(review): the registry functions are defined outside this view; these look like
# module-level calls that run at import time — confirm.
register_destination(ConnectorType.FILE, FileDestination)
|
|
551
|
+
register_source(ConnectorType.FILE, FileSource)
|