loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,326 @@
1
+ """HDBSCAN backend shim — prefer fast_hdbscan, fall back to stock hdbscan.
2
+
3
+ The dns detector was originally built and calibrated on stock ``hdbscan``;
4
+ ``fast_hdbscan`` was added later as a drop-in accelerator with an identical
5
+ ``HDBSCAN(min_cluster_size=, min_samples=)`` plus ``.fit_predict(X)`` API.
6
+ Both produce equivalent findings.
7
+
8
+ This module resolves which implementation is in use exactly once at import
9
+ time and exposes that class at module level as ``HDBSCAN`` so callers can
10
+ ``from loghunter.common.clustering import HDBSCAN`` and construct it
11
+ directly — no factory, no rename. ``ACTIVE_BACKEND`` records which one
12
+ resolved.
13
+
14
+ Resolution order: ``fast_hdbscan`` first (the accelerator, opt-in via the
15
+ ``loghunt[fast]`` extra), then stock ``hdbscan`` (the guaranteed base
16
+ dependency). If neither is importable we raise ``ImportError`` at import
17
+ time rather than letting the failure surface later at a construction site.
18
+
19
+ Process-isolated entry point
20
+ ----------------------------
21
+
22
+ ``fit_predict_interruptible`` is the new shared call site for DNS clustering
23
+ (both Zeek and Pi-hole paths). It runs ``HDBSCAN(...).fit_predict(X)`` in a
24
+ spawned child process so the parent main thread can honour Ctrl-C
25
+ regardless of the native call's GIL state. On ``KeyboardInterrupt`` the
26
+ child is terminated, the queue is drained and closed, and the exception
27
+ re-raises to the caller (the existing ``liveness()`` teardown + the
28
+ ``cli.main()`` top-level handler print "Stopped." and exit 130).
29
+
30
+ For notebook / standalone callers (or any environment where ``spawn`` is
31
+ fragile — Jupyter is the canonical case), the module-level switch
32
+ ``_CLUSTERING_ISOLATE_ENABLED`` may be flipped to ``False``; the helper
33
+ then runs in-process via ``_inline_fit_predict`` and preserves today's
34
+ ``HDBSCAN(min_cluster_size=, min_samples=).fit_predict(X)`` call verbatim.
35
+ The CLI/runner path inherits the default ON.
36
+ """
37
+
38
+ from __future__ import annotations
39
+
40
+ import multiprocessing
41
+ import queue as _queue
42
+ from typing import Any
43
+
44
+ import numpy as np
45
+
46
# Resolve the clustering backend exactly once, at import time. The
# accelerator (fast_hdbscan) is tried first; stock hdbscan is the fallback.
# ACTIVE_BACKEND records which import succeeded so later code can branch on
# the backend name without re-importing.
try:
    from fast_hdbscan import HDBSCAN
    ACTIVE_BACKEND = "fast_hdbscan"
except ImportError:
    try:
        from hdbscan import HDBSCAN
        ACTIVE_BACKEND = "hdbscan"
    except ImportError as e:
        # Neither backend is importable: fail now, with installation
        # guidance, instead of at the first construction site.
        raise ImportError(
            "No HDBSCAN backend available — neither 'hdbscan' nor "
            "'fast_hdbscan' was importable. The 'hdbscan' package is a base "
            "dependency of loghunt and should always be present; reinstall "
            "with 'pip install hdbscan', or 'pip install loghunt[fast]' for "
            "the fast_hdbscan accelerator."
        ) from e
61
+
62
+
63
# ── Process-isolation knobs ──────────────────────────────────────────────────

# Quarantine-style switch — mirrors digest/blob.py:_BLOB_DRAIN3_ENABLED.
# When False, fit_predict_interruptible runs in-process (today's behaviour
# exactly). The CLI/runner path inherits the default True; notebook /
# standalone callers can flip it for in-process determinism, or to keep
# multiprocessing out of a Jupyter kernel where spawn is fragile.
# DO NOT route this through DetectorContext — detector signatures stay
# clean; this is environment-shaped, not detector-shaped.
_CLUSTERING_ISOLATE_ENABLED: bool = True

# Queue polling interval (seconds) — the timeout passed to each
# ``queue.get`` in _await_child_result; the child's liveness is checked
# between polls. 100 ms is fast enough that Ctrl-C feels instant without
# spinning the CPU.
_POLL_INTERVAL_SEC: float = 0.1

# Bounded join window (seconds) after terminate() before escalating to
# kill().
_TERMINATE_TIMEOUT_SEC: float = 1.0

# Brief join (seconds) on the NORMAL-return path so a healthy child exit
# isn't converted into a SIGTERM for no reason. Not used on the interrupt
# path — the committed interrupt sequence is terminate → join → kill (no
# grace).
_GRACEFUL_JOIN_SEC: float = 0.2
86
+
87
+
88
def _build_clusterer(
    backend: str, *, min_cluster_size: int, min_samples: int,
) -> Any:
    """Build an HDBSCAN instance, adding backend-specific keyword arguments.

    Both backends receive ``min_cluster_size`` and ``min_samples``. Only when
    ``backend == "hdbscan"`` is ``core_dist_n_jobs=1`` added as well; any
    other backend string gets the two shared arguments and nothing else.

    The original author's rationale: stock ``hdbscan`` with
    ``core_dist_n_jobs=1`` spawns no nested multiprocessing pool, so the
    worker child stays the only extra process and can be killed cleanly,
    while ``fast_hdbscan`` does not accept that parameter.

    Used by ``_cluster_worker`` (the spawned-child path). The in-process
    path, ``_inline_fit_predict``, constructs ``HDBSCAN`` directly and does
    not go through this helper.
    """
    ctor_kwargs: dict[str, int] = {
        "min_cluster_size": min_cluster_size,
        "min_samples": min_samples,
    }
    if backend == "hdbscan":
        # Stock hdbscan only — fast_hdbscan has no such parameter.
        ctor_kwargs["core_dist_n_jobs"] = 1
    return HDBSCAN(**ctor_kwargs)
116
+
117
+
118
def _cluster_worker(
    result_queue: Any,
    X: "np.ndarray",
    min_cluster_size: int,
    min_samples: int,
    backend: str,
) -> None:
    """Child-process entry point: cluster ``X`` and report via the queue.

    Defined at module level so it can be used as a ``spawn`` process target
    (nested functions, lambdas and closures would not survive the spawn
    re-import).

    Builds the clusterer through ``_build_clusterer`` using the explicitly
    passed ``backend`` string (it is not re-read from ``ACTIVE_BACKEND`` in
    the child), runs ``fit_predict(X)``, and puts exactly one 2-tuple on
    ``result_queue``:

    * ``("ok", labels)`` on success;
    * ``("error", "<ExcType>: <msg>")`` when any ``Exception`` is raised.

    Failures are never raised to the parent; they are serialised as text so
    arbitrary exception objects do not have to round-trip through pickle.
    """
    try:
        model = _build_clusterer(
            backend,
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
        )
        result_queue.put(("ok", model.fit_predict(X)))
    except Exception as failure:  # noqa: BLE001 — serialised, not raised
        description = f"{type(failure).__name__}: {failure}"
        result_queue.put(("error", description))


# Indirection seam for tests: rebind this to one of the module-level test
# helpers in tests/test_clustering_interruptible.py to exercise specific
# child-process behaviours (block, raise, die without queueing) without
# closures crossing the spawn boundary.
_WORKER_TARGET = _cluster_worker
160
+
161
+
162
def _await_child_result(
    result_queue: Any, child: "multiprocessing.Process",
) -> tuple:
    """Poll the result queue and child liveness until one of them yields.

    Never issues an indefinite ``queue.get()`` — that would hang forever if
    the child segfaults / OOMs / exits without putting a result. Each
    iteration waits at most ``_POLL_INTERVAL_SEC`` on the queue and then
    checks ``child.is_alive()``.

    A child can put its result and exit in the gap between a timed-out
    ``get()`` and the liveness check. To avoid reporting that successful run
    as a crash, a dead child triggers one final bounded read of the queue
    before the failure is raised.

    Args:
        result_queue: queue the worker puts its single result tuple on.
        child: the worker process handle.

    Returns:
        The tuple the worker queued. Never returns ``None``.

    Raises:
        RuntimeError: the child is dead and the queue stayed empty through
            the final read.
        KeyboardInterrupt: propagates out of the wait unchanged; the caller's
            cleanup path is responsible for the child and the queue.
    """
    while True:
        try:
            return result_queue.get(timeout=_POLL_INTERVAL_SEC)
        except _queue.Empty:
            pass
        if child.is_alive():
            continue
        # Dead child: the result may have landed after the get() above timed
        # out, so look once more before declaring the worker lost.
        try:
            return result_queue.get(timeout=_POLL_INTERVAL_SEC)
        except _queue.Empty:
            pass
        exitcode = child.exitcode
        raise RuntimeError(
            "DNS clustering worker died "
            f"(exitcode={exitcode}) without returning a result. "
            "The clustering library may have crashed; try the "
            "alternate backend (pip install 'loghunt[fast]' or "
            "pip install hdbscan)."
        )
195
+
196
+
197
def _drain_and_close_queue(result_queue: Any) -> None:
    """Drain pending items, then close the queue and join its feeder thread.

    Called from the caller's cleanup path (normal return and interrupt
    alike) once the child is no longer alive. Items are discarded with
    ``get_nowait()`` until the queue reports ``Empty``; then ``close()`` and
    ``join_thread()`` are called, in that order.

    The drain is best-effort. A worker that was terminated while writing can
    leave a truncated or corrupt payload behind, and reading it may raise
    something other than ``queue.Empty``. Because this helper runs inside
    the caller's ``finally``, letting that escape would both skip the
    close/join and replace the exception already in flight (for example the
    ``KeyboardInterrupt`` being re-raised). Such errors are therefore
    discarded, and the close/join always runs.
    """
    try:
        while True:
            result_queue.get_nowait()
    except _queue.Empty:
        pass
    except Exception:  # noqa: BLE001 — unreadable leftovers; nothing to salvage
        pass
    finally:
        result_queue.close()
        result_queue.join_thread()
215
+
216
+
217
def _inline_fit_predict(
    X: "np.ndarray", min_cluster_size: int, min_samples: int,
) -> "np.ndarray":
    """Cluster ``X`` in the current process and return ``fit_predict``'s result.

    This is the in-process escape hatch. It constructs ``HDBSCAN`` with only
    ``min_cluster_size`` and ``min_samples`` and deliberately bypasses
    ``_build_clusterer``: the backend-conditional ``core_dist_n_jobs=1`` is a
    child-process concern and is not applied here.
    """
    return HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
    ).fit_predict(X)
230
+
231
+
232
def fit_predict_interruptible(
    X: "np.ndarray", *, min_cluster_size: int, min_samples: int,
) -> "np.ndarray":
    """Run HDBSCAN ``fit_predict(X)`` so that Ctrl-C is honoured mid-compute.

    With ``_CLUSTERING_ISOLATE_ENABLED`` false, this simply delegates to
    ``_inline_fit_predict`` in the current process.

    Otherwise the compute runs in a child process created from the
    ``"spawn"`` multiprocessing context, with ``_WORKER_TARGET`` as the
    target and ``ACTIVE_BACKEND`` passed as the backend name. The parent
    waits in ``_await_child_result`` and then always performs the same
    cleanup, in this order:

    1. if the child is still alive: ``terminate()``, join for
       ``_TERMINATE_TIMEOUT_SEC``, and if it is *still* alive ``kill()`` and
       join without a timeout;
    2. drain and close the result queue;
    3. close the child handle.

    On the normal-return path a short join (``_GRACEFUL_JOIN_SEC``) is given
    first so a child that is merely finishing up is not terminated. On any
    exception from the wait (including ``KeyboardInterrupt``) that grace
    join is skipped, cleanup runs, and the exception propagates.

    Returns:
        The second element of the worker's ``("ok", labels)`` tuple.

    Raises:
        ValueError: the worker reported ``("error", message)``.
        RuntimeError: raised by ``_await_child_result`` when the child died
            without queueing a result.
        KeyboardInterrupt: re-raised after the child has been taken down.
    """
    if not _CLUSTERING_ISOLATE_ENABLED:
        return _inline_fit_predict(X, min_cluster_size, min_samples)

    spawn_ctx = multiprocessing.get_context("spawn")
    results = spawn_ctx.Queue()
    worker_args = (results, X, min_cluster_size, min_samples, ACTIVE_BACKEND)
    child = spawn_ctx.Process(target=_WORKER_TARGET, args=worker_args)
    child.start()

    try:
        outcome = _await_child_result(results, child)
        # Normal return only: give a healthy child a moment to exit on its
        # own so the cleanup below does not have to terminate it.
        if child.is_alive():
            child.join(_GRACEFUL_JOIN_SEC)
    finally:
        # Shared cleanup for every exit path. The child must be gone before
        # the queue is drained and closed.
        if child.is_alive():
            child.terminate()
            child.join(_TERMINATE_TIMEOUT_SEC)
            if child.is_alive():
                child.kill()
                child.join()
        _drain_and_close_queue(results)
        child.close()

    status, payload = outcome[0], outcome[1]
    if status != "ok":
        raise ValueError(f"DNS clustering failed in worker: {payload}")
    return payload
320
+
321
+
322
# Public surface of this module: the resolved backend class, the name of the
# backend that resolved, and the interruptible clustering entry point.
__all__ = [
    "HDBSCAN",
    "ACTIVE_BACKEND",
    "fit_predict_interruptible",
]
@@ -0,0 +1,221 @@
1
+ """Config loading with precedence chain: CLI arg > user > system.
2
+
3
+ Precedence (highest to lowest):
4
+ 1. Explicit --config=FILE argument
5
+ 2. ~/.loghunter/config.toml (user default)
6
+ 3. /etc/loghunter/config.toml (system-wide)
7
+
8
+ When no config file is found, returns a deep copy of built-in defaults — no exception raised.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import copy
14
+ import tomllib
15
+ from datetime import timedelta
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+
20
class ConfigError(Exception):
    """Raised for config problems that the user needs to act on.

    Within this module it is raised for an explicitly requested config file
    that does not exist, for TOML syntax errors, and for an invalid
    ``default_window`` value.
    """
22
+
23
+
24
def parse_window_span(spec: str | None) -> timedelta | None:
    """Convert a ``default_window`` config value into a ``timedelta``.

    ``None``, an empty/whitespace-only string and ``"all"`` (any case) all
    mean "no default window" and yield ``None``.

    Otherwise the stripped value must be ``"Nd"`` (days) or ``"Nh"`` (hours)
    with ``N`` parsing as an integer greater than zero.

    Raises:
        ConfigError: for every other value. Failing loudly is intentional —
            a silent fallback would hide real config mistakes.
    """
    if spec is None:
        return None

    text = str(spec).strip()
    if not text or text.lower() == "all":
        return None

    # Final character selects the timedelta keyword; anything else is invalid.
    unit_keyword = {"d": "days", "h": "hours"}.get(text[-1])
    if unit_keyword is not None:
        try:
            amount = int(text[:-1])
        except ValueError:
            amount = 0  # non-numeric prefix: fall through to the error below
        if amount > 0:
            return timedelta(**{unit_keyword: amount})

    raise ConfigError(
        f"default_window={spec!r} is not a valid duration. "
        f"Use 'Nd' (days), 'Nh' (hours), '' or 'all' to disable."
    )
51
+
52
+
53
# Built-in configuration defaults. load() returns a deep copy of this when no
# config file is found, and _load_file() deep-merges the user's TOML over a
# deep copy of it, so this dict itself is never handed out.
_DEFAULTS: dict[str, Any] = {
    "loghunter": {
        # LH_ROOT — base for RELATIVE paths in config-file values. Empty = use
        # CWD for relative paths. Absolute and ~-anchored paths ignore it.
        # Env override: LOGHUNTER_ROOT (env wins over config).
        "root": "~/.loghunter",
        "detect": "all",
        # Conventional source locations; tried out-of-box. pihole/cloudtrail
        # stay None (opt-in — no missing-file warning when absent).
        "zeek_dir": "/var/log/zeek",
        "syslog_dir": "/var/log",
        "pihole_dir": None,
        "cloudtrail_dir": None,
        # Internal networks for traffic-direction classification. Topology
        # fact, not detector tuning. RFC1918 default; override only if your
        # internal address plan differs.
        "home_net": ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"],
        # Where exporters write pulled logs. Backends and per-query stanzas
        # may override per the precedence cascade.
        # Trailing slash communicates directory intent to be_like_water —
        # without it, a non-existent path would be interpreted as a FILE.
        "export_dir": "exports/",
        # report_dir intentionally OMITTED — no shipped default. Setting it is
        # an explicit opt-in to file-mode analyze output. Bare analyze prints
        # to stdout when report_dir is unset and --out is not passed.
        "output_format": "text",
        # Validated at load time by parse_window_span ("Nd", "Nh", "" or
        # "all").
        "default_window": "1d",
        "warn_above": 5_000_000,
        # Per-detector total row cap for TEXT output only (json/csv/html
        # render everything — machine formats must not lose data). The cap
        # is a running budget across the detector's subsections in declared
        # order; the disclosure line reports rendered-vs-total. 0 = unlimited.
        "max_findings_per_detector": 100,
    },
    # Per-detector overrides, keyed by detector name; read by
    # get_detector_config.
    "detectors": {},
    "allowlist": {
        "domain_patterns": [
            "~/.loghunter/allowlist.d/domains_user.txt",
        ],
        "connection_rules": [
            "~/.loghunter/allowlist.d/connections.txt",
        ],
        "allowlist_dir": "~/.loghunter/allowlist.d/",
    },
    "export": {
        "splunk": {
            "host": "",
            "port": 8089,
            "username": "",
            "password": "",
        },
        # cloudtrail exporter — boto3 pull from S3, writes CloudTrail JSON locally.
        # path is the s3:// URL to the CloudTrail tree; egress_warn_gb is the
        # cost guard threshold. Activation is a non-empty path.
        "cloudtrail": {
            "path": "",
            "egress_warn_gb": 5.0,
        },
    },
}
113
+
114
# Config locations probed, in order, by _find_config_file when no explicit
# file is given: the user file first, then the system-wide file. The "~" is
# expanded once, when this module is imported.
SEARCH_PATHS: list[Path] = [
    Path("~/.loghunter/config.toml").expanduser(),
    Path("/etc/loghunter/config.toml"),
]
118
+
119
+
120
def load(config_file: str | Path | None = None) -> dict[str, Any]:
    """Load config from the precedence chain and return the merged config dict.

    Args:
        config_file: explicit config path. When given it is used directly and
            the search path is not consulted.

    Returns:
        The merged configuration. When no explicit file is given and none is
        found on the search path, a deep copy of the built-in defaults.

    Raises:
        ConfigError: the explicit ``config_file`` does not exist; the
            ``loghunter`` entry is not a TOML table; ``default_window`` is not
            a valid duration; or the file fails to parse (raised by
            ``_load_file``).
    """
    if config_file is not None:
        path = Path(config_file)
        if not path.exists():
            raise ConfigError(
                f"Config file not found: {path}\n"
                f"Check the path or run 'loghunter init' to create a config."
            )
        config = _load_file(path)
    else:
        found = _find_config_file()
        if found is None:
            config = copy.deepcopy(_DEFAULTS)
        else:
            config = _load_file(found)

    # A scalar such as `loghunter = 5` replaces the whole defaults table in
    # the merge. Report that as a config problem instead of crashing on
    # `.get` below.
    core_section = config.get("loghunter", {})
    if not isinstance(core_section, dict):
        raise ConfigError(
            f"'loghunter' must be a table ([loghunter] section), "
            f"got {type(core_section).__name__}: {core_section!r}"
        )

    # Validate default_window eagerly so typos in user config fail at load time —
    # not lazily during the run, where bounded paths would never notice.
    parse_window_span(core_section.get("default_window"))
    return config
145
+
146
+
147
def default_allowlist_paths() -> dict[str, Any]:
    """Return an independent copy of the built-in ``allowlist`` defaults.

    The result is a deep copy of ``_DEFAULTS["allowlist"]``, so callers may
    mutate it freely. Per the original author it serves as the fallback for
    ``common/allowlist.py:build_matcher`` when a raw / notebook config lacks
    ``domain_patterns``, ``connection_rules`` or ``allowlist_dir`` (keys the
    deep-merge in the normal load path would have filled in).
    """
    builtin_allowlist = _DEFAULTS["allowlist"]
    return copy.deepcopy(builtin_allowlist)
157
+
158
+
159
def get_detector_config(
    config: dict[str, Any],
    detector_name: str,
    detector_defaults: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Return the effective settings for one detector.

    Starts from a deep copy of ``detector_defaults`` (or an empty dict when
    none are supplied) and deep-merges ``config["detectors"][detector_name]``
    on top, so values from the loaded config take priority. A missing
    ``detectors`` section or detector entry is treated as empty.
    """
    if detector_defaults:
        starting_point = copy.deepcopy(detector_defaults)
    else:
        starting_point = {}

    detectors_section = config.get("detectors", {})
    overrides = detectors_section.get(detector_name, {})
    return _deep_merge(starting_point, overrides)
171
+
172
+
173
def _find_config_file() -> Path | None:
    """Return the first entry of ``SEARCH_PATHS`` that exists, else ``None``.

    Entries are probed in list order with ``Path.exists()``.
    """
    existing = (candidate for candidate in SEARCH_PATHS if candidate.exists())
    return next(existing, None)
179
+
180
+
181
def _load_file(path: Path) -> dict[str, Any]:
    """Parse a TOML config file and deep-merge it over built-in defaults.

    Attaches a ``__user_set__`` sidecar to the returned merged dict: a mapping
    from top-level section name to the set of key names the operator declared
    in that section (an empty set for a top-level entry that is not a table).
    This is provenance metadata for runner-level disclosures (e.g. "default
    RFC1918 vs. operator-declared home_net") — a value-only check cannot
    distinguish a defaulted value from a user-declared value that happens to
    equal the default. The "no config file found" path in load() skips
    _load_file entirely; absence of the sidecar is correctly read as "no user
    declarations".

    Raises:
        ConfigError: the file is not valid TOML, or is not valid UTF-8 (TOML
            documents must be UTF-8; without this the decode failure would
            surface as a bare ``UnicodeDecodeError`` that does not name the
            file).
    """
    try:
        with path.open("rb") as fh:
            user_config = tomllib.load(fh)
    except tomllib.TOMLDecodeError as exc:
        raise ConfigError(
            f"Config file parse error in {path}:\n {exc}\n"
            f"Check the file for TOML syntax errors."
        ) from exc
    except UnicodeDecodeError as exc:
        # tomllib decodes the raw bytes as UTF-8 before parsing; a file saved
        # in another encoding fails here rather than as a TOMLDecodeError.
        raise ConfigError(
            f"Config file {path} is not valid UTF-8:\n {exc}\n"
            f"Re-save the file with UTF-8 encoding."
        ) from exc

    merged = _deep_merge(copy.deepcopy(_DEFAULTS), user_config)
    merged["__user_set__"] = {
        section: set(content.keys()) if isinstance(content, dict) else set()
        for section, content in user_config.items()
    }
    return merged
208
+
209
+
210
def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Layer ``override`` on top of ``base`` and return the result as a new dict.

    When a key holds a dict on both sides the two dicts are merged
    recursively. In every other case (scalars, lists, mismatched types, keys
    absent from ``base``) a deep copy of the override value replaces whatever
    was there. Neither input dict is modified.
    """
    merged = dict(base)
    for name, incoming in override.items():
        current = merged.get(name)
        both_are_tables = (
            name in merged
            and isinstance(current, dict)
            and isinstance(incoming, dict)
        )
        if both_are_tables:
            merged[name] = _deep_merge(current, incoming)
        else:
            merged[name] = copy.deepcopy(incoming)
    return merged