messagefoundry 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142) hide show
  1. messagefoundry/__init__.py +108 -0
  2. messagefoundry/__main__.py +1155 -0
  3. messagefoundry/api/__init__.py +27 -0
  4. messagefoundry/api/app.py +1581 -0
  5. messagefoundry/api/approvals.py +184 -0
  6. messagefoundry/api/auth_models.py +211 -0
  7. messagefoundry/api/auth_routes.py +655 -0
  8. messagefoundry/api/field_authz.py +96 -0
  9. messagefoundry/api/models.py +374 -0
  10. messagefoundry/api/security.py +247 -0
  11. messagefoundry/api/tls.py +47 -0
  12. messagefoundry/auth/__init__.py +39 -0
  13. messagefoundry/auth/data/common_passwords.NOTICE +13 -0
  14. messagefoundry/auth/data/common_passwords.txt +10000 -0
  15. messagefoundry/auth/identity.py +71 -0
  16. messagefoundry/auth/ldap.py +264 -0
  17. messagefoundry/auth/notifications.py +68 -0
  18. messagefoundry/auth/passwords.py +53 -0
  19. messagefoundry/auth/permissions.py +120 -0
  20. messagefoundry/auth/policy.py +153 -0
  21. messagefoundry/auth/ratelimit.py +55 -0
  22. messagefoundry/auth/service.py +1323 -0
  23. messagefoundry/auth/tokens.py +26 -0
  24. messagefoundry/auth/totp.py +174 -0
  25. messagefoundry/checks.py +174 -0
  26. messagefoundry/config/__init__.py +30 -0
  27. messagefoundry/config/active_environment.py +80 -0
  28. messagefoundry/config/ai_policy.py +140 -0
  29. messagefoundry/config/code_sets.py +260 -0
  30. messagefoundry/config/connections_edit.py +200 -0
  31. messagefoundry/config/connections_file.py +287 -0
  32. messagefoundry/config/db_lookup.py +117 -0
  33. messagefoundry/config/environments.py +116 -0
  34. messagefoundry/config/ingest_time.py +83 -0
  35. messagefoundry/config/models.py +240 -0
  36. messagefoundry/config/reference.py +158 -0
  37. messagefoundry/config/response.py +83 -0
  38. messagefoundry/config/run_context.py +153 -0
  39. messagefoundry/config/settings.py +1311 -0
  40. messagefoundry/config/state.py +99 -0
  41. messagefoundry/config/tls_policy.py +110 -0
  42. messagefoundry/config/wiring.py +1918 -0
  43. messagefoundry/console/__init__.py +20 -0
  44. messagefoundry/console/__main__.py +274 -0
  45. messagefoundry/console/_async.py +107 -0
  46. messagefoundry/console/change_password.py +111 -0
  47. messagefoundry/console/client.py +552 -0
  48. messagefoundry/console/connections.py +324 -0
  49. messagefoundry/console/login.py +107 -0
  50. messagefoundry/console/mfa.py +205 -0
  51. messagefoundry/console/reauth.py +94 -0
  52. messagefoundry/console/search.py +57 -0
  53. messagefoundry/console/service_control.py +137 -0
  54. messagefoundry/console/sessions.py +122 -0
  55. messagefoundry/console/shell.py +410 -0
  56. messagefoundry/console/status.py +377 -0
  57. messagefoundry/console/users_page.py +282 -0
  58. messagefoundry/console/widgets.py +553 -0
  59. messagefoundry/generators/README.md +27 -0
  60. messagefoundry/generators/__init__.py +15 -0
  61. messagefoundry/generators/_core.py +589 -0
  62. messagefoundry/generators/_hl7data.py +428 -0
  63. messagefoundry/generators/adt.py +286 -0
  64. messagefoundry/generators/all_types.py +24 -0
  65. messagefoundry/generators/bar.py +28 -0
  66. messagefoundry/generators/dft.py +20 -0
  67. messagefoundry/generators/mdm.py +39 -0
  68. messagefoundry/generators/mfn.py +46 -0
  69. messagefoundry/generators/oml.py +32 -0
  70. messagefoundry/generators/orl.py +30 -0
  71. messagefoundry/generators/orm.py +23 -0
  72. messagefoundry/generators/oru.py +21 -0
  73. messagefoundry/generators/ras.py +20 -0
  74. messagefoundry/generators/rde.py +54 -0
  75. messagefoundry/generators/siu.py +64 -0
  76. messagefoundry/generators/vxu.py +20 -0
  77. messagefoundry/hl7schema.py +75 -0
  78. messagefoundry/last_resort.py +55 -0
  79. messagefoundry/logging_setup.py +332 -0
  80. messagefoundry/parsing/__init__.py +64 -0
  81. messagefoundry/parsing/consistency.py +166 -0
  82. messagefoundry/parsing/groups.py +228 -0
  83. messagefoundry/parsing/message.py +453 -0
  84. messagefoundry/parsing/peek.py +237 -0
  85. messagefoundry/parsing/split.py +120 -0
  86. messagefoundry/parsing/summary.py +46 -0
  87. messagefoundry/parsing/tree.py +128 -0
  88. messagefoundry/parsing/validate.py +95 -0
  89. messagefoundry/parsing/x12/__init__.py +46 -0
  90. messagefoundry/parsing/x12/delimiters.py +140 -0
  91. messagefoundry/parsing/x12/errors.py +30 -0
  92. messagefoundry/parsing/x12/interchange.py +232 -0
  93. messagefoundry/parsing/x12/message.py +200 -0
  94. messagefoundry/parsing/x12/peek.py +207 -0
  95. messagefoundry/pipeline/__init__.py +21 -0
  96. messagefoundry/pipeline/alert_sinks.py +486 -0
  97. messagefoundry/pipeline/alerts.py +100 -0
  98. messagefoundry/pipeline/cert_expiry.py +219 -0
  99. messagefoundry/pipeline/cluster.py +955 -0
  100. messagefoundry/pipeline/cluster_sqlserver.py +444 -0
  101. messagefoundry/pipeline/config_convergence.py +137 -0
  102. messagefoundry/pipeline/dryrun.py +450 -0
  103. messagefoundry/pipeline/engine.py +756 -0
  104. messagefoundry/pipeline/leader_tasks.py +158 -0
  105. messagefoundry/pipeline/reference_sync.py +369 -0
  106. messagefoundry/pipeline/retention.py +289 -0
  107. messagefoundry/pipeline/security_notify.py +168 -0
  108. messagefoundry/pipeline/state_convergence.py +143 -0
  109. messagefoundry/pipeline/wiring_runner.py +1722 -0
  110. messagefoundry/py.typed +0 -0
  111. messagefoundry/redaction.py +71 -0
  112. messagefoundry/scaffold.py +321 -0
  113. messagefoundry/secrets_dpapi.py +129 -0
  114. messagefoundry/store/__init__.py +46 -0
  115. messagefoundry/store/audit_tee.py +67 -0
  116. messagefoundry/store/base.py +758 -0
  117. messagefoundry/store/crypto.py +166 -0
  118. messagefoundry/store/keyprovider.py +192 -0
  119. messagefoundry/store/postgres.py +3447 -0
  120. messagefoundry/store/sqlserver.py +3014 -0
  121. messagefoundry/store/store.py +3790 -0
  122. messagefoundry/timezone.py +207 -0
  123. messagefoundry/transports/__init__.py +50 -0
  124. messagefoundry/transports/base.py +269 -0
  125. messagefoundry/transports/database.py +693 -0
  126. messagefoundry/transports/file.py +551 -0
  127. messagefoundry/transports/framing.py +164 -0
  128. messagefoundry/transports/loopback.py +53 -0
  129. messagefoundry/transports/mllp.py +644 -0
  130. messagefoundry/transports/remotefile.py +664 -0
  131. messagefoundry/transports/rest.py +281 -0
  132. messagefoundry/transports/signing.py +321 -0
  133. messagefoundry/transports/soap.py +507 -0
  134. messagefoundry/transports/tcp.py +307 -0
  135. messagefoundry/transports/timer.py +146 -0
  136. messagefoundry/transports/x12.py +323 -0
  137. messagefoundry-0.1.0.dist-info/METADATA +212 -0
  138. messagefoundry-0.1.0.dist-info/RECORD +142 -0
  139. messagefoundry-0.1.0.dist-info/WHEEL +4 -0
  140. messagefoundry-0.1.0.dist-info/entry_points.txt +2 -0
  141. messagefoundry-0.1.0.dist-info/licenses/LICENSE +662 -0
  142. messagefoundry-0.1.0.dist-info/licenses/NOTICE +27 -0
@@ -0,0 +1,551 @@
1
+ # SPDX-License-Identifier: AGPL-3.0-or-later
2
+ # Copyright (C) 2026 MessageFoundry Organization and contributors
3
+ """File transport: directory destination + directory-polling source.
4
+
5
+ **Destination** writes each payload to a file in a directory. The filename may contain
6
+ ``{HL7-path}`` placeholders (e.g. ``{MSH-10}.hl7``) resolved by peeking the payload, so
7
+ archived files are named by control id / message type. Writes are atomic (write to a
8
+ temp name, then ``rename``) so a reader watching the directory never sees a partial file.
9
+
10
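+ **Source** polls a directory for files, hands each to the pipeline handler, then moves the
11
+ file into a ``.processed`` subdirectory (or deletes it, per ``after_read``). Files rejected before hand-off (oversize, not HL7, or refused by the pre-ingest scan hook) are moved to ``.error``; if the handler raises, the file is left in place and retried on the next scan. Files have
12
+ no reply channel, so the handler's return value is ignored.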
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import asyncio
18
+ import logging
19
+ import os
20
+ import re
21
+ import tempfile
22
+ import time
23
+ from collections.abc import Callable
24
+ from contextlib import suppress
25
+ from pathlib import Path
26
+
27
+ from messagefoundry.config.models import ConnectorType, Destination, Source
28
+ from messagefoundry.parsing.peek import HL7PeekError, Peek
29
+ from messagefoundry.parsing.split import split_batch
30
+ from messagefoundry.transports.base import (
31
+ DeliveryError,
32
+ DestinationConnector,
33
+ InboundHandler,
34
+ SourceConnector,
35
+ register_destination,
36
+ register_source,
37
+ )
38
+
39
# Public API of this module. The scan-hook names are listed too: they are defined here without
# a leading underscore and are the seam an operator/plugin uses to install a pre-ingest scanner.
__all__ = [
    "FileDestination",
    "FileSource",
    "render_filename",
    "DEFAULT_MAX_FILE_BYTES",
    "ScanHook",
    "ScanRejected",
    "scan_inbound_file",
    "set_scan_hook",
]
40
+
41
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Cap a single inbound file read so a multi-GB drop can't OOM the engine (DoS guard). A
# falsy value (None/0) in settings disables the cap; see docs/CONNECTIONS.md.
DEFAULT_MAX_FILE_BYTES = 16 * 1024 * 1024  # 16 MiB — matches the MLLP frame cap

# A ``{...}`` filename placeholder: an upper-case letter plus two upper-case letters/digits, a
# dash, a number, then up to two optional ``.number`` parts (e.g. ``{MSH-10}``, ``{PID-5.1}``).
# Group 1 captures the text between the braces.
_PLACEHOLDER = re.compile(r"\{([A-Z][A-Z0-9]{2}-\d+(?:\.\d+){0,2})\}")
# Strip characters that are unsafe in filenames on Windows and POSIX alike (path separators
# included, so a resolved value can never introduce a directory component).
_UNSAFE = re.compile(r'[<>:"/\\|?*\x00-\x1f]')
# Windows reserved device names (case-insensitive, optionally with an extension) — never usable.
# Stored upper-case; callers are expected to upper-case the stem before the membership test.
_RESERVED = {
    "CON",
    "PRN",
    "AUX",
    "NUL",
    *(f"COM{i}" for i in range(1, 10)),
    *(f"LPT{i}" for i in range(1, 10)),
}
60
+
61
+
62
def render_filename(template: str, payload: str, *, fallback: str) -> str:
    """Resolve ``{HL7-path}`` placeholders in ``template`` against ``payload``, producing a single
    safe filename (never a path).

    Args:
        template: Filename template, e.g. ``"{MSH-10}.hl7"``.
        payload: The message text the placeholders are resolved against.
        fallback: Text used for any placeholder that can't be resolved, and returned as the
            whole name when the rendered name is unusable.

    Returns:
        A single path component. Unresolvable placeholders (missing field, or an unparseable
        payload) become ``fallback`` so a delivery never fails merely because a name couldn't be
        built. Unsafe characters are replaced and leading dots removed; an empty name, a name made
        only of spaces/dots, ``.``/``..`` or a reserved device name yields ``fallback`` — so an
        attacker-controlled field can't write outside the target directory or shadow
        ``.processed``/``.error`` (FILE-1).
    """
    try:
        peek: Peek | None = Peek.parse(payload)
    except HL7PeekError:
        peek = None

    def repl(match: re.Match[str]) -> str:
        # Explicit None test: only "payload could not be parsed" should skip the lookup.
        value = peek.field(match.group(1)) if peek is not None else None
        return _sanitize(value) if value else fallback

    name = _sanitize(_PLACEHOLDER.sub(repl, template))
    # Windows ignores spaces between a device name and the extension ("NUL .hl7" still names the
    # NUL device), so trim them before the reserved-name test.
    stem = name.split(".", 1)[0].rstrip(" ").upper()
    # A name consisting solely of spaces/dots collapses to nothing on Windows; treat it as empty.
    if not name.strip(" .") or name in (".", "..") or stem in _RESERVED:
        return fallback
    return name
85
+
86
+
87
def _sanitize(value: str) -> str:
    """Turn ``value`` into text usable as one filename component.

    Every character matched by ``_UNSAFE`` becomes ``_``, then any leading dots are removed so the
    result can be neither a hidden file nor a ``.``/``..`` traversal component.
    """
    replaced = _UNSAFE.sub("_", value)
    return replaced.lstrip(".")
91
+
92
+
93
def _probe_dir_writable(directory: Path) -> None:
    """Check that ``directory`` exists (creating it if needed) and accepts a write.

    Shared by both FILE connectors: the destination writes messages into the directory and the
    source moves processed files into its subdirectories, so writability is the meaningful
    reachability check for either. A ``.probe`` temp file is created and immediately removed.

    Raises:
        OSError: the directory can't be created or written to.
    """
    directory.mkdir(parents=True, exist_ok=True)
    handle, probe_path = tempfile.mkstemp(suffix=".probe", dir=directory)
    os.close(handle)
    os.unlink(probe_path)
102
+
103
+
104
class FileDestination(DestinationConnector):
    """Destination connector that writes each payload to its own file in a directory."""

    def __init__(self, config: Destination) -> None:
        """Read the connector settings.

        ``directory`` is required; ``filename`` (default ``"{MSH-10}.hl7"``), ``overwrite``
        (default ``False``) and ``encoding`` (default ``"utf-8"``) are optional.

        Raises:
            ValueError: no ``directory`` setting was supplied.
        """
        settings = config.settings
        if "directory" not in settings:
            raise ValueError("file destination requires a 'directory' setting")
        self.directory = Path(settings["directory"])
        self.filename_template: str = settings.get("filename", "{MSH-10}.hl7")
        # With overwrite off, a name collision gets a numeric suffix instead of clobbering.
        self._overwrite: bool = bool(settings.get("overwrite", False))
        self.encoding: str = settings.get("encoding", "utf-8")

    async def send(self, payload: str) -> None:
        """Write ``payload`` to a file on a worker thread; ``OSError`` becomes ``DeliveryError``."""
        try:
            await asyncio.to_thread(self._write, payload)
        except OSError as err:
            raise DeliveryError(f"file write failed: {err}") from err

    async def test_connection(self) -> None:
        """Probe that the target directory is writable; ``OSError`` becomes ``DeliveryError``."""
        try:
            await asyncio.to_thread(_probe_dir_writable, self.directory)
        except OSError as err:
            raise DeliveryError(f"file directory {self.directory} not writable: {err}") from err

    def _write(self, payload: str) -> None:
        """Blocking write of one payload: render the name, stage to a temp file, then publish."""
        self.directory.mkdir(parents=True, exist_ok=True)
        name = render_filename(self.filename_template, payload, fallback="message.hl7")
        target = self.directory / name
        # Second line of defence behind the filename sanitization (FILE-1): the resolved target
        # must sit underneath the resolved destination directory.
        root = self.directory.resolve()
        if root not in target.resolve().parents:
            raise DeliveryError(f"refusing to write outside the destination directory: {name!r}")
        data = payload.encode(self.encoding)
        # Stage into a uniquely-named temp file (mkstemp — no shared counter, no name race), then
        # publish. Overwrite mode replaces the target atomically; otherwise the final name is
        # claimed exclusively so concurrent deliveries can't clobber each other (FILE-5).
        fd, tmp_name = tempfile.mkstemp(dir=self.directory, suffix=".part")
        tmp = Path(tmp_name)
        try:
            with os.fdopen(fd, "wb") as handle:
                handle.write(data)
            publish = os.replace if self._overwrite else _claim_unique
            publish(tmp, target)
        finally:
            # Drop the staging file. After os.replace it no longer exists, hence the suppress.
            with suppress(OSError):
                os.unlink(tmp)
152
+
153
+
154
class FileSource(SourceConnector):
    """Poll a directory for files and feed each to the pipeline handler.

    Each scan lists candidate files, rejects oversize / non-HL7 / scan-rejected ones into the
    error directory, hands every remaining file's message(s) to the handler, and then moves the
    file to the processed directory (or deletes it). A file whose hand-off raised is left in
    place and retried on the next scan.
    """

    polls_shared_resource = True  # a directory is a shared external resource — leader-gate it

    def __init__(self, config: Source) -> None:
        """Read the connector settings (``directory`` is required).

        Raises:
            ValueError: no ``directory`` setting was supplied.
        """
        s = config.settings
        if "directory" not in s:
            raise ValueError("file source requires a 'directory' setting")
        self.directory = Path(s["directory"])
        # Resolved watch root for path-confinement: a recursive scan must not be walked out of the
        # configured directory via a symlinked file/subdir (see _within_root). resolve() is
        # non-strict, so it's fine that the directory is created later in start().
        self._root_real = self.directory.resolve()
        self.pattern: str = s.get("pattern", "*")
        self.poll_seconds: float = float(s.get("poll_seconds", 1.0))
        self.min_age_seconds: float = float(s.get("min_age_seconds", 0.0))
        self.after_read: str = s.get("after_read", "move")  # "move" | "delete"
        self.sort: str = s.get("sort", "name")  # "name" | "mtime"
        self.recursive: bool = bool(s.get("recursive", False))
        # Encoding used to re-encode split batch messages back to bytes for the handler. A single
        # (non-batch) message is handed off verbatim, so its bytes never round-trip through this.
        self.encoding: str = s.get("encoding", "utf-8")
        mfb = s.get("max_file_bytes", DEFAULT_MAX_FILE_BYTES)
        # Any falsy setting (None / 0) disables the size cap.
        self.max_file_bytes: int | None = int(mfb) if mfb else None
        self.processed_dir = self.directory / s.get("processed_subdir", ".processed")
        self.error_dir = self.directory / s.get("error_subdir", ".error")
        self._handler: InboundHandler | None = None
        # Leader-gate (Track B Step 4b): when set, this directory (a shared external resource) is
        # polled only while the gate returns True, so in a cluster exactly one node ingests its
        # files. None = always poll (single-node / direct callers / tests) — byte-identical.
        self._leader_gate: Callable[[], bool] | None = None
        self._skipping = False  # whether the last tick was gated out (for a single transition log)
        self._stop = asyncio.Event()
        self._task: asyncio.Task[None] | None = None

    async def start(
        self, handler: InboundHandler, *, leader_gate: Callable[[], bool] | None = None
    ) -> None:
        """Begin polling in the background. Returns once the source is set up so the
        caller can rely on it being live (consistent with the TCP sources).

        Args:
            handler: Coroutine function that receives each message's bytes.
            leader_gate: Optional callable; polling happens only on ticks where it returns True.
        """
        self._handler = handler
        self._leader_gate = leader_gate
        self._stop.clear()
        # Create the subdirectories off the event loop, like every other filesystem call in this
        # class — the watch directory may live on a slow or remote mount.
        await asyncio.to_thread(self.processed_dir.mkdir, parents=True, exist_ok=True)
        await asyncio.to_thread(self.error_dir.mkdir, parents=True, exist_ok=True)
        self._task = asyncio.create_task(self._run())

    async def test_connection(self) -> None:
        """Probe that the watch directory is writable; ``OSError`` becomes ``DeliveryError``."""
        try:
            await asyncio.to_thread(_probe_dir_writable, self.directory)
        except OSError as exc:
            raise DeliveryError(f"file directory {self.directory} not writable: {exc}") from exc

    async def _run(self) -> None:
        """Poll loop: scan (when allowed), then sleep ``poll_seconds`` or until stop is set."""
        while not self._stop.is_set():
            try:
                if self._may_poll():
                    await self._scan_once()
            except asyncio.CancelledError:
                raise
            except Exception:
                # A scan error (watch dir vanished/unreadable, a bad glob, a move/read failure) must
                # NOT kill the poller — that would silently stop the connection from receiving while
                # it still reports running, and re-raise inside stop()/reload (review H-4). Log and
                # retry on the next interval.
                logger.exception(
                    "file source scan failed for %s; retrying next poll", self.directory
                )
            try:
                await asyncio.wait_for(self._stop.wait(), self.poll_seconds)
            except asyncio.TimeoutError:
                pass  # poll interval elapsed; scan again

    def _may_poll(self) -> bool:
        """Whether this tick may scan the directory. False on a follower (leader-gated, Step 4b):
        a non-leader must NOT read or move/delete files, since the directory is shared and two
        nodes ingesting it would duplicate intake. The loop still ticks, so a node that becomes
        leader scans on its next tick (reactive-by-polling, no restart). When the gate is None or
        True, behaves exactly as before. Logged once on each transition (never per skipped tick —
        that would spam a follower's log every poll interval)."""
        if self._leader_gate is None or self._leader_gate():
            if self._skipping:
                self._skipping = False
                logger.debug("file source resuming polling of %s (now leader)", self.directory)
            return True
        if not self._skipping:
            self._skipping = True
            logger.debug(
                "file source skipping polling of %s (not leader; another node ingests it)",
                self.directory,
            )
        return False

    async def stop(self) -> None:
        """Signal the poll loop to stop and wait for the background task to finish."""
        self._stop.set()
        if self._task is not None:
            # return_exceptions: a faulted poll task must not re-raise here — stop() runs during
            # reload quiesce, outside its rollback (review H-4). _run already guards scans; this is
            # the belt-and-suspenders.
            await asyncio.gather(self._task, return_exceptions=True)
            self._task = None

    async def _scan_once(self) -> None:
        """Process every current candidate file once, in candidate order."""
        assert self._handler is not None
        for path in await asyncio.to_thread(self._candidates):
            if await asyncio.to_thread(self._oversize, path):
                # Transport-level reject *before* any message is read — parallels MLLP dropping an
                # over-cap frame. It never became a "received message", so (like MLLP) there's no
                # store disposition to record; preserve the file in .error for the operator and log it.
                logger.warning(
                    "file %s exceeds max_file_bytes (%s); routing to error dir",
                    path.name,
                    self.max_file_bytes,
                )
                await asyncio.to_thread(self._move, path, self.error_dir)
                continue
            try:
                raw = await asyncio.to_thread(path.read_bytes)
            except OSError as exc:
                # Transient (file locked / vanished mid-scan): leave it in place to retry next scan
                # rather than quarantining a healthy file. Logged, never silently swallowed.
                logger.warning("could not read %s (will retry next scan): %s", path.name, exc)
                continue
            if not _looks_like_hl7(raw):
                # Content doesn't match the declared .hl7 type (binary / non-HL7 text) — quarantine
                # before its bytes reach the pipeline (ASVS 5.2.2). Like the oversize reject above, it
                # never became a "received message", so there's no store disposition; preserve it in
                # .error and log it (never a silent drop).
                logger.warning(
                    "file %s is not HL7 (no MSH/FHS/BHS header); routing to error dir", path.name
                )
                await asyncio.to_thread(self._move, path, self.error_dir)
                continue
            try:
                await asyncio.to_thread(scan_inbound_file, raw, path.name)
            except ScanRejected as exc:
                # A configured pre-ingest scanner (AV/ICAP/plugin) rejected the content before it
                # entered the pipeline (ASVS 5.4.3). Like the oversize / non-HL7 rejects above, it
                # never became a "received message", so there's no store disposition; quarantine + log.
                logger.warning(
                    "file %s rejected by the pre-ingest scan hook (%s); routing to error dir",
                    path.name,
                    exc,
                )
                await asyncio.to_thread(self._move, path, self.error_dir)
                continue
            try:
                await self._emit(raw)
            except Exception as exc:
                # The handler records every message-level outcome (parse/validation/routing → ERROR)
                # itself and returns, so an exception escaping here is an infrastructure failure: the
                # durable store write failed (DB locked, disk full). Leave the file in place so the
                # next scan retries once the store recovers (at-least-once) — moving it to .error would
                # drop a *received* message that was never recorded, an accept-and-drop (review M-15).
                #
                # CRITICAL (Tier 2.2 batch split): a batch is split into N hand-offs (_emit), and the
                # file is moved/deleted ONLY after ALL of them succeed (below). If hand-off K fails,
                # we `continue` WITHOUT moving the file, so the next scan re-reads the WHOLE file and
                # re-emits every message 1..N. That is at-least-once: messages 1..K-1 may be re-emitted
                # (duplicates, acceptable — handlers are idempotent), but the file is NEVER moved with
                # only some of its messages emitted (no accept-and-drop of the tail).
                logger.warning("handler failed for %s (will retry next scan): %s", path.name, exc)
                continue
            await asyncio.to_thread(self._after_processing, path)

    async def _emit(self, raw: bytes) -> None:
        """Hand every HL7 message in ``raw`` to the pipeline handler, in file order (FIFO).

        Corepoint-style **batch split** (Tier 2.2-A): a dropped file may hold several MSH-delimited
        messages (a batch, or an FHS/BHS envelope). Each becomes one pipeline hand-off — the same
        per-message split a dry-run / ``messagefoundry check`` sees, via the shared
        :func:`~messagefoundry.parsing.split.split_batch`.

        Splitting must decode the bytes to find the MSH boundaries, so we decode with the
        connection's **declared encoding** (``errors="strict"``) — never UTF-8 by accident — so a
        non-UTF-8 batch (e.g. latin-1) splits without mojibake. If the file isn't decodable in that
        encoding, or the split yields fewer than two messages, the **original bytes are handed off
        verbatim** (one hand-off): a single-message file is then byte-for-byte identical to before
        the split existed, and an undecodable file flows to the pipeline unchanged so its
        ``normalize(errors="strict")`` records the proper ``ERROR`` disposition exactly as today
        (we don't pre-empt that here). A true batch is split and each message **re-encoded with the
        same declared encoding**, so the handler still receives ``bytes`` exactly as in the
        un-split path.

        Any exception (a durable-store failure on hand-off K) propagates to the caller, which then
        leaves the whole file in place for the next scan — preserving at-least-once with no partial
        move (see :meth:`_scan_once`)."""
        assert self._handler is not None
        try:
            text = raw.decode(self.encoding)
        except (UnicodeDecodeError, LookupError):
            # Not decodable in the declared encoding (or an unknown codec name): can't safely find MSH
            # boundaries, so hand the raw bytes off unchanged — the pipeline's strict-decode then
            # records ERROR for it, exactly as in the pre-split single-hand-off path. Never a drop.
            await self._handler(raw)
            return
        messages = split_batch(
            text
        )  # str in → no UTF-8 re-decode (normalize only fixes line endings)
        if len(messages) <= 1:
            # Fast path / strict back-compat: a lone message is handed off verbatim (its original
            # bytes), so a non-batch file behaves byte-for-byte as before the split was introduced.
            # The same path covers a split that yields nothing (presumably a header-only envelope —
            # split_batch's contract isn't visible here): without it the loop below would emit
            # nothing and the caller would still move the file away, a silent drop. Handing the
            # raw bytes off lets the pipeline see the file and record whatever outcome applies.
            await self._handler(raw)
            return
        for message in messages:
            # FIFO per connection: emit in file order, awaiting each so a slow/failing hand-off
            # back-pressures the rest (and a failure stops the file from being moved — see above).
            await self._handler(message.encode(self.encoding))

    def _oversize(self, path: Path) -> bool:
        """True if ``path`` is larger than the configured cap (checked before reading it)."""
        if self.max_file_bytes is None:
            return False
        try:
            return path.stat().st_size > self.max_file_bytes
        except OSError:
            return False  # vanished/locked — let the read path handle it

    def _candidates(self) -> list[Path]:
        """Files ready to process, honoring recursion, min-age, and sort order."""
        globber = self.directory.rglob if self.recursive else self.directory.glob
        try:
            matched = list(globber(self.pattern))
        except (OSError, ValueError) as exc:
            # Watch dir vanished/unreadable, or an invalid glob pattern: treat as "nothing this
            # scan" (logged) rather than letting it propagate and kill the poller (review H-4).
            logger.warning(
                "file source could not list %s (pattern %r): %s", self.directory, self.pattern, exc
            )
            return []
        # Keep regular files only, never anything under the processed/error subdirectories, and
        # nothing that resolves outside the watch root.
        files = [
            p
            for p in matched
            if p.is_file()
            and self.processed_dir not in p.parents
            and self.error_dir not in p.parents
            and self._within_root(p)
        ]
        if self.min_age_seconds > 0:
            cutoff = time.time() - self.min_age_seconds
            files = [p for p in files if _mtime(p) <= cutoff]  # skip files still being written
        if self.sort == "mtime":
            files.sort(key=_mtime)
        else:
            files.sort(key=lambda p: p.name)
        return files

    def _within_root(self, path: Path) -> bool:
        """True if ``path`` resolves inside the configured watch root.

        A symlinked file or subdirectory that points outside the root (e.g. ``in/link -> /etc``)
        resolves elsewhere and is skipped, so a recursive scan can't be walked out of its directory
        to read arbitrary files (path-confinement / symlink-escape guard)."""
        try:
            resolved = path.resolve()
        except OSError:
            return False
        if resolved == self._root_real or self._root_real in resolved.parents:
            return True
        logger.warning(
            "file source: skipping %s — it resolves outside the watch root (symlink escape?)",
            path.name,
        )
        return False

    def _after_processing(self, path: Path) -> None:
        """Dispose of a fully handed-off file: delete it, or move it to the processed dir."""
        if self.after_read == "delete":
            try:
                path.unlink()
            except OSError as exc:
                # A processed file we can't delete will be re-read (duplicate); surface it (FILE-4).
                logger.warning("could not delete processed file %s: %s", path.name, exc)
        else:
            self._move(path, self.processed_dir)

    @staticmethod
    def _move(path: Path, dest_dir: Path) -> None:
        """Move ``path`` into ``dest_dir`` under a not-yet-used name; a failure is only logged."""
        try:
            path.replace(_unique(dest_dir / path.name))
        except OSError as exc:
            # A stuck file (locked / dest unwritable) stays and is re-read; log it (FILE-4).
            logger.warning("could not move %s to %s: %s", path.name, dest_dir.name, exc)
436
+
437
+
438
+ # --- helpers -----------------------------------------------------------------
439
+
440
+
441
# Segment ids a valid HL7 v2 payload (single message or batch file) may start with.
_HL7_LEADING_SEGMENTS = (b"MSH", b"FHS", b"BHS")


def _looks_like_hl7(raw: bytes) -> bool:
    """Cheap content sniff: does ``raw`` begin with an HL7 v2 header segment (ASVS 5.2.2)?

    Leading framing is skipped first — the MLLP start byte (``0x0b``), CR/LF, spaces and tabs, and
    one UTF-8 BOM (with any further framing bytes after it). What remains must start with MSH
    (message), FHS (file) or BHS (batch). Only these leading bytes are inspected; nothing after
    the segment id is validated here.
    """
    framing = b"\x0b\r\n \t"
    bom = b"\xef\xbb\xbf"
    head = raw.lstrip(framing)
    if head[:3] == bom:
        head = head[3:].lstrip(framing)
    return head.startswith(_HL7_LEADING_SEGMENTS)
457
+
458
+
459
class ScanRejected(Exception):
    """Signal from a pre-ingest scan hook that inbound file content must not be ingested.

    A hook raises this to reject malicious/disallowed content (ASVS 5.4.3); the connector then
    quarantines the file to its error dir and never emits it.
    """
462
+
463
+
464
#: Signature of a pre-ingest content-scan hook: ``(raw_bytes, source_label) -> None``. A hook
#: rejects content by raising :class:`ScanRejected`; the label lets an operator's scanner tag its
#: own log lines.
ScanHook = Callable[[bytes, str], None]


def _no_scan(raw: bytes, source: str) -> None:
    """Default hook: accept everything (does nothing)."""


# The hook currently in effect; starts out as the no-op.
_scan_hook: ScanHook = _no_scan
474
+
475
+
476
def set_scan_hook(hook: ScanHook | None) -> None:
    """Install the pre-ingest content-scan hook, or restore the no-op with ``None`` (ASVS 5.4.3).

    MessageFoundry ships **no** built-in antivirus/malware scan: the supported model trusts the
    drop directory, and a less-trusted or remote source should be fronted by an AV/ICAP gateway
    (see docs/CONNECTIONS.md). This seam lets an operator/plugin install an in-process scanner that
    runs over the raw bytes of every inbound file — both the local FILE source and the remote
    SFTP/FTP(S) source — *before* they enter the pipeline. The hook must raise
    :class:`ScanRejected` to reject content, which the connector then quarantines to its error dir
    (never emitted). It sees raw bytes only, so it is format-agnostic (HL7, X12, or any payload).
    """
    global _scan_hook
    if hook:
        _scan_hook = hook
    else:
        _scan_hook = _no_scan
488
+
489
+
490
def scan_inbound_file(raw: bytes, source: str) -> None:
    """Pass ``raw`` (labelled ``source``) through the currently installed pre-ingest scan hook.

    The default hook is a no-op. A hook rejects content by raising :class:`ScanRejected`, which
    propagates to the caller — the caller quarantines the file and never emits it. Call this off
    the event loop: a hook may do blocking I/O to an AV/ICAP service.
    """
    active_hook = _scan_hook
    active_hook(raw, source)
495
+
496
+
497
def _claim_unique(tmp: Path, target: Path) -> Path:
    """Claim ``target`` (or ``name-1.ext``, ``name-2.ext``, … if taken) for ``tmp``, atomically.

    Prefers ``os.link`` (the target becomes a hard link to ``tmp``); ``FileExistsError`` means the
    name is taken, so claiming a free name is a single atomic step — no check-then-act window where
    a concurrent writer could clobber us. Where hard links aren't supported (FAT/exFAT, many SMB/NAS
    mounts) ``os.link`` raises a different ``OSError``; fall back to an exclusive-create copy
    (``O_CREAT | O_EXCL``), which is also no-clobber but works cross-filesystem (review low-5).

    Args:
        tmp: Fully written staging file. It is never removed here; the caller cleans it up.
        target: Preferred final path.

    Returns:
        The path that was actually claimed.

    Raises:
        OSError: ``tmp`` can't be read, or the copy can't be created or written. In that case no
            file is left behind under the name this call claimed.
    """
    stem, suffix = target.stem, target.suffix
    candidate, n = target, 0
    linkable = True
    data: bytes | None = None
    while True:
        if linkable:
            try:
                os.link(tmp, candidate)
                return candidate
            except FileExistsError:
                n += 1
                candidate = target.with_name(f"{stem}-{n}{suffix}")
                continue
            except OSError:
                linkable = False  # hard links unusable on this filesystem — copy instead
        if data is None:
            # Read the staging file BEFORE claiming a name, so an unreadable/missing temp fails
            # here instead of leaving an empty file published under the final name.
            data = tmp.read_bytes()
        try:
            fd = os.open(candidate, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
        except FileExistsError:
            n += 1
            candidate = target.with_name(f"{stem}-{n}{suffix}")
            continue
        try:
            with os.fdopen(fd, "wb") as handle:
                handle.write(data)
        except BaseException:
            # A failed write (disk full, I/O error) must not leave a truncated message under the
            # published name where a consumer would take it for a complete file.
            with suppress(OSError):
                os.unlink(candidate)
            raise
        return candidate
528
+
529
+
530
def _mtime(p: Path) -> float:
    """Return the modification time of ``p``, or ``0.0`` when it can't be stat'ed."""
    try:
        stamp = p.stat().st_mtime
    except OSError:
        stamp = 0.0
    return stamp
535
+
536
+
537
def _unique(target: Path) -> Path:
    """Return the first path that doesn't exist among ``target``, ``name-1.ext``, ``name-2.ext``, …

    This is a plain existence check, not a claim: nothing is created.
    """
    candidate = target
    counter = 0
    while candidate.exists():
        counter += 1
        candidate = target.with_name(f"{target.stem}-{counter}{target.suffix}")
    return candidate
548
+
549
+
550
+ register_destination(ConnectorType.FILE, FileDestination)
551
+ register_source(ConnectorType.FILE, FileSource)