loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
"""HDBSCAN backend shim — prefer fast_hdbscan, fall back to stock hdbscan.
|
|
2
|
+
|
|
3
|
+
The dns detector was originally built and calibrated on stock ``hdbscan``;
|
|
4
|
+
``fast_hdbscan`` was added later as a drop-in accelerator with an identical
|
|
5
|
+
``HDBSCAN(min_cluster_size=, min_samples=)`` plus ``.fit_predict(X)`` API.
|
|
6
|
+
Both produce equivalent findings.
|
|
7
|
+
|
|
8
|
+
This module resolves which implementation is in use exactly once at import
|
|
9
|
+
time and exposes that class at module level as ``HDBSCAN`` so callers can
|
|
10
|
+
``from loghunter.common.clustering import HDBSCAN`` and construct it
|
|
11
|
+
directly — no factory, no rename. ``ACTIVE_BACKEND`` records which one
|
|
12
|
+
resolved.
|
|
13
|
+
|
|
14
|
+
Resolution order: ``fast_hdbscan`` first (the accelerator, opt-in via the
|
|
15
|
+
``loghunt[fast]`` extra), then stock ``hdbscan`` (the guaranteed base
|
|
16
|
+
dependency). If neither is importable we raise ``ImportError`` at import
|
|
17
|
+
time rather than letting the failure surface later at a construction site.
|
|
18
|
+
|
|
19
|
+
Process-isolated entry point
|
|
20
|
+
----------------------------
|
|
21
|
+
|
|
22
|
+
``fit_predict_interruptible`` is the new shared call site for DNS clustering
|
|
23
|
+
(both Zeek and Pi-hole paths). It runs ``HDBSCAN(...).fit_predict(X)`` in a
|
|
24
|
+
spawned child process so the parent main thread can honour Ctrl-C
|
|
25
|
+
regardless of the native call's GIL state. On ``KeyboardInterrupt`` the
|
|
26
|
+
child is terminated, the queue is drained and closed, and the exception
|
|
27
|
+
re-raises to the caller (the existing ``liveness()`` teardown + the
|
|
28
|
+
``cli.main()`` top-level handler print "Stopped." and exit 130).
|
|
29
|
+
|
|
30
|
+
For notebook / standalone callers (or any environment where ``spawn`` is
|
|
31
|
+
fragile — Jupyter is the canonical case), the module-level switch
|
|
32
|
+
``_CLUSTERING_ISOLATE_ENABLED`` may be flipped to ``False``; the helper
|
|
33
|
+
then runs in-process via ``_inline_fit_predict`` and preserves today's
|
|
34
|
+
``HDBSCAN(min_cluster_size=, min_samples=).fit_predict(X)`` call verbatim.
|
|
35
|
+
The CLI/runner path inherits the default ON.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
from __future__ import annotations
|
|
39
|
+
|
|
40
|
+
import multiprocessing
|
|
41
|
+
import queue as _queue
|
|
42
|
+
from typing import Any
|
|
43
|
+
|
|
44
|
+
import numpy as np
|
|
45
|
+
|
|
46
|
+
try:
|
|
47
|
+
from fast_hdbscan import HDBSCAN
|
|
48
|
+
ACTIVE_BACKEND = "fast_hdbscan"
|
|
49
|
+
except ImportError:
|
|
50
|
+
try:
|
|
51
|
+
from hdbscan import HDBSCAN
|
|
52
|
+
ACTIVE_BACKEND = "hdbscan"
|
|
53
|
+
except ImportError as e:
|
|
54
|
+
raise ImportError(
|
|
55
|
+
"No HDBSCAN backend available — neither 'hdbscan' nor "
|
|
56
|
+
"'fast_hdbscan' was importable. The 'hdbscan' package is a base "
|
|
57
|
+
"dependency of loghunt and should always be present; reinstall "
|
|
58
|
+
"with 'pip install hdbscan', or 'pip install loghunt[fast]' for "
|
|
59
|
+
"the fast_hdbscan accelerator."
|
|
60
|
+
) from e
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
# ── Process-isolation knobs ──────────────────────────────────────────────────
#
# All durations below are in seconds.

# Quarantine-style switch — mirrors digest/blob.py:_BLOB_DRAIN3_ENABLED.
# When False, fit_predict_interruptible runs in-process (today's behaviour
# exactly). The CLI/runner path inherits the default True; notebook /
# standalone callers can flip it for in-process determinism, or to keep
# multiprocessing out of a Jupyter kernel where spawn is fragile.
# DO NOT route this through DetectorContext — detector signatures stay
# clean; this is environment-shaped, not detector-shaped.
# The flag is read on every fit_predict_interruptible call, so flipping it
# takes effect on the next call.
_CLUSTERING_ISOLATE_ENABLED: bool = True

# Queue polling interval — the main thread wakes this often to check the
# child's exit status. 100 ms is fast enough that Ctrl-C feels instant
# without spinning the CPU. Used as the ``timeout`` of the queue ``get`` in
# _await_child_result.
_POLL_INTERVAL_SEC: float = 0.1

# Bounded join window after SIGTERM before escalating to SIGKILL.
_TERMINATE_TIMEOUT_SEC: float = 1.0

# Brief join on the NORMAL-return path so a healthy child exit isn't
# converted into a SIGTERM for no reason. Not used on the interrupt path —
# the committed interrupt sequence is terminate → join → kill (no grace).
_GRACEFUL_JOIN_SEC: float = 0.2
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _build_clusterer(
    backend: str, *, min_cluster_size: int, min_samples: int,
) -> Any:
    """Instantiate ``HDBSCAN`` with the keyword set appropriate to ``backend``.

    When ``backend`` is ``"hdbscan"`` (the stock package) the clusterer is
    additionally given ``core_dist_n_jobs=1`` so it spawns no nested
    multiprocessing pool: our one child is then the only extra process and
    SIGKILL is clean (no ``resource_tracker`` "leaked semaphore" warning on
    shutdown). Any other backend string gets only the two calibration
    parameters — ``fast_hdbscan`` has no ``core_dist_n_jobs`` parameter
    (passing it would TypeError) and uses numba threads anyway, so there are
    no semaphores to leak.

    Called by ``_cluster_worker`` in the SPAWNED CHILD only. The in-process
    escape hatch (``_inline_fit_predict``) deliberately does NOT use this
    helper, keeping its ``HDBSCAN(min_cluster_size=, min_samples=)``
    construction unchanged so the detector's calibration surface cannot
    drift.
    """
    clusterer_kwargs: dict[str, Any] = {
        "min_cluster_size": min_cluster_size,
        "min_samples": min_samples,
    }
    if backend == "hdbscan":
        # Stock hdbscan only: keep core-distance computation single-process.
        clusterer_kwargs["core_dist_n_jobs"] = 1
    return HDBSCAN(**clusterer_kwargs)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _cluster_worker(
    result_queue: Any,
    X: "np.ndarray",
    min_cluster_size: int,
    min_samples: int,
    backend: str,
) -> None:
    """Child-process entry point: cluster ``X`` and report via ``result_queue``.

    Defined at module level so that ``spawn`` can pickle a reference to it —
    nested functions, lambdas, and closures would not survive the re-import
    in the child.

    The clusterer comes from ``_build_clusterer`` (which adds the
    backend-conditional ``core_dist_n_jobs=1`` for stock hdbscan). Exactly
    one tuple is put on the queue:

    * ``("ok", labels)`` when ``fit_predict`` succeeds, or
    * ``("error", "<ExcType>: <msg>")`` when anything in the attempt raises
      an ``Exception``.

    Nothing is raised to the parent: failures are flattened to strings so the
    parent handles one uniform shape and arbitrary exception objects never
    need to round-trip through pickle.

    ``backend`` is an explicit argument (not re-read from ``ACTIVE_BACKEND``
    in the child) so tests can pin a backend string without monkeypatching
    across the spawn boundary.
    """
    try:
        labels = _build_clusterer(
            backend,
            min_cluster_size=min_cluster_size,
            min_samples=min_samples,
        ).fit_predict(X)
        result_queue.put(("ok", labels))
    except Exception as exc:  # noqa: BLE001 — serialised, not raised
        failure = f"{type(exc).__name__}: {exc}"
        result_queue.put(("error", failure))
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Indirection seam for tests: rebind this to one of the module-level test
# helpers in tests/test_clustering_interruptible.py to exercise specific
# child-process behaviours (block, raise, die without queueing) without
# closures crossing the spawn boundary.
# fit_predict_interruptible passes this name as the ``target`` of the child
# process and looks it up at call time, so a rebind applies to the next call.
_WORKER_TARGET = _cluster_worker
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _await_child_result(
    result_queue: Any, child: "multiprocessing.Process",
) -> tuple:
    """Poll the result queue + child liveness until one of them yields.

    Does NOT call an indefinite ``queue.get()`` — that would hang forever if
    the child segfaults / OOMs / exits without putting a result. Instead
    polls with ``queue.get(timeout=_POLL_INTERVAL_SEC)`` and, between polls,
    checks ``child.is_alive()``.

    Once the child is seen dead, the queue is read ONE more time before
    giving up. Without that, a child that queued its result and exited in
    the gap between a timed-out ``get`` and the liveness check would be
    misreported as having died without a result.

    Args:
        result_queue: queue-like object exposing ``get(timeout=...)`` that
            raises ``queue.Empty`` on timeout.
        child: process-like object exposing ``is_alive()`` and ``exitcode``.

    Returns:
        Whatever item the worker put on the queue. Never returns ``None``
        on its own account.

    Raises:
        RuntimeError: the child is dead and the queue is still empty after
            the final read.
        KeyboardInterrupt: propagates naturally — ``queue.get`` is a
            Python-level wait, so SIGINT delivered to the main thread is
            raised out of this function and the caller's cleanup handles it.
    """
    while True:
        try:
            return result_queue.get(timeout=_POLL_INTERVAL_SEC)
        except _queue.Empty:
            pass

        if child.is_alive():
            continue

        # The child has exited. Give the queue one last bounded chance to
        # deliver a result that landed after the timed-out get above.
        try:
            return result_queue.get(timeout=_POLL_INTERVAL_SEC)
        except _queue.Empty:
            pass

        exitcode = child.exitcode
        raise RuntimeError(
            "DNS clustering worker died "
            f"(exitcode={exitcode}) without returning a result. "
            "The clustering library may have crashed; try the "
            "alternate backend (pip install 'loghunt[fast]' or "
            "pip install hdbscan)."
        )
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def _drain_and_close_queue(result_queue: Any) -> None:
    """Empty ``result_queue``, then close it and join its feeder thread.

    Shared by the normal-return and interrupt cleanup paths, and always
    invoked AFTER the child is no longer alive. ``close()`` followed by
    ``join_thread()`` is what prevents the
    ``multiprocessing.resource_tracker`` "leaked semaphore" warning at
    process shutdown; ``close()`` alone is not enough (the feeder thread may
    still hold a reference). Pending items are discarded first so that
    ``close()`` does not block the feeder on unsent data.
    """
    while True:
        try:
            result_queue.get_nowait()
        except _queue.Empty:
            break
    result_queue.close()
    result_queue.join_thread()
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _inline_fit_predict(
    X: "np.ndarray", min_cluster_size: int, min_samples: int,
) -> "np.ndarray":
    """Cluster ``X`` in the current process (the escape-hatch path).

    Builds ``HDBSCAN`` with only ``min_cluster_size`` and ``min_samples`` and
    returns its ``fit_predict(X)`` result. It intentionally bypasses
    ``_build_clusterer``: the backend-conditional ``core_dist_n_jobs=1`` is a
    child-process resource-tracker concern, not a calibration choice that
    should drift into notebook / standalone callers.
    """
    return HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
    ).fit_predict(X)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def fit_predict_interruptible(
    X: "np.ndarray", *, min_cluster_size: int, min_samples: int,
) -> "np.ndarray":
    """HDBSCAN ``.fit_predict(X)`` with Ctrl-C honoured on a long compute.

    Replaces the inline ``clusterer = HDBSCAN(...); labels =
    clusterer.fit_predict(X)`` shape at the two DNS call sites. Return
    contract is identical (label int array, shape ``(len(X),)``); the
    detector logic above/below is unchanged.

    When ``_CLUSTERING_ISOLATE_ENABLED`` is True (the CLI/runner default),
    runs the compute in a spawned child process so SIGINT delivered to
    the parent main thread is honoured regardless of the native call's
    GIL state. On ``KeyboardInterrupt`` the child is terminated and the
    exception re-raises to the caller (``liveness()`` teardown +
    ``cli.main()``'s top-level handler print "Stopped." and exit 130).

    When False (notebook / standalone escape hatch), runs in-process via
    ``_inline_fit_predict`` and preserves today's behaviour byte-for-byte.

    Args:
        X: feature matrix handed unchanged to the worker (pickled across the
            spawn boundary on the isolated path).
        min_cluster_size: forwarded to the HDBSCAN constructor.
        min_samples: forwarded to the HDBSCAN constructor.

    Returns:
        The second element of the worker's ``("ok", labels)`` tuple, or the
        direct ``fit_predict`` result on the in-process path.

    Raises:
        ValueError: when the child reports a clustering failure
            (degenerate input, etc.) — preserves the detector contract
            that a clustering failure surfaces as a normal exception.
        RuntimeError: when the child dies without putting a result
            (segfault, OOM kill, etc.) — never silently hangs.
        KeyboardInterrupt: re-raised after child termination so the
            existing ``liveness()`` + ``cli.main()`` machinery handles
            teardown.
    """
    if not _CLUSTERING_ISOLATE_ENABLED:
        return _inline_fit_predict(X, min_cluster_size, min_samples)

    # "spawn" is requested explicitly rather than relying on the platform
    # default start method.
    ctx = multiprocessing.get_context("spawn")
    result_queue = ctx.Queue()
    child = ctx.Process(
        target=_WORKER_TARGET,
        args=(result_queue, X, min_cluster_size, min_samples, ACTIVE_BACKEND),
    )
    # NOTE(review): start() runs outside the try/finally below, so if it
    # raises, the queue created above is not drained/closed — confirm this
    # is acceptable.
    child.start()

    # Universal cleanup discipline. The try body runs the await + an
    # optional graceful join on the normal-return path; the finally
    # ALWAYS runs the committed sequence
    #   terminate → join → kill → drain/close queue → close child handle
    # regardless of how the body exited. This covers four cases with one
    # structure:
    #
    #   - Normal return: await yields, graceful join runs in the body,
    #     finally's terminate/kill are skipped (child already exited),
    #     queue is drained+closed, child handle closed.
    #   - KeyboardInterrupt: await raises, body skips straight to
    #     finally. Graceful join is NEVER reached on this path — the
    #     operator wants the helper out NOW — so the finally's
    #     terminate fires immediately. The re-raise is the implicit
    #     post-finally propagation.
    #   - RuntimeError (dead child without result): await raises,
    #     finally runs. Child is already dead so terminate/kill are
    #     skipped, but queue drain/close + child.close still run —
    #     preventing the resource_tracker leak on the exact abnormal-
    #     death path the helper exists to handle.
    #   - Any future non-KBI exception from the await path: same
    #     guarantee as the RuntimeError case. If the child happens to
    #     still be alive, the finally's terminate/kill takes it down
    #     before draining the queue (closing the queue with a live
    #     child would hang the feeder thread).
    try:
        result = _await_child_result(result_queue, child)
        # NORMAL-RETURN PATH continues here. Brief join so a healthy
        # exit isn't converted to SIGTERM by the finally below; if
        # the child stalls past the grace window, finally's
        # terminate/kill takes over.
        if child.is_alive():
            child.join(_GRACEFUL_JOIN_SEC)
    finally:
        if child.is_alive():
            child.terminate()
            child.join(_TERMINATE_TIMEOUT_SEC)
            if child.is_alive():
                # Still alive after the bounded join: escalate, then wait
                # without a timeout.
                child.kill()
                child.join()
        _drain_and_close_queue(result_queue)
        child.close()

    # Only reached when the await returned a tuple; its first element is the
    # status tag written by the worker.
    if result[0] == "ok":
        return result[1]
    raise ValueError(f"DNS clustering failed in worker: {result[1]}")
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
# Public surface of this module: the resolved clusterer class, the name of
# the backend it came from, and the interruptible entry point. Everything
# underscore-prefixed above is an implementation detail.
__all__ = [
    "HDBSCAN",
    "ACTIVE_BACKEND",
    "fit_predict_interruptible",
]
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""Config loading with precedence chain: CLI arg > user > system.
|
|
2
|
+
|
|
3
|
+
Precedence (highest to lowest):
|
|
4
|
+
1. Explicit --config=FILE argument
|
|
5
|
+
2. ~/.loghunter/config.toml (user default)
|
|
6
|
+
3. /etc/loghunter/config.toml (system-wide)
|
|
7
|
+
|
|
8
|
+
When no config file is found, returns a deep copy of built-in defaults — no exception raised.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import copy
|
|
14
|
+
import tomllib
|
|
15
|
+
from datetime import timedelta
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ConfigError(Exception):
    """Raised for config problems that the user needs to act on."""


def parse_window_span(spec: str | None) -> timedelta | None:
    """Parse a default_window config value into a timedelta.

    Returns None for: None, "", "all" (case-insensitive) — meaning "no default".
    Accepts: "Nd" (days), "Nh" (hours) where N is a positive integer.
    Surrounding whitespace is ignored; the unit suffix is case-sensitive.

    Raises ConfigError for any other value — silent fallback hides real config
    bugs. That includes counts too large for ``timedelta`` to represent, which
    previously escaped as a bare ``OverflowError``.
    """
    if spec is None:
        return None
    s = str(spec).strip()
    if s == "" or s.lower() == "all":
        return None

    # Map the trailing unit letter onto the matching timedelta keyword.
    unit_kwarg = {"d": "days", "h": "hours"}.get(s[-1])
    if unit_kwarg is not None:
        try:
            n = int(s[:-1])
            if n > 0:
                return timedelta(**{unit_kwarg: n})
        except (ValueError, OverflowError):
            # ValueError: the count is not an integer.
            # OverflowError: the count exceeds what timedelta can hold.
            pass

    raise ConfigError(
        f"default_window={spec!r} is not a valid duration. "
        f"Use 'Nd' (days), 'Nh' (hours), '' or 'all' to disable."
    )
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# Built-in configuration defaults. Within this module the table is only ever
# read through copy.deepcopy (load, default_allowlist_paths, _load_file), so
# the shared module-level dict itself is never handed out to callers.
_DEFAULTS: dict[str, Any] = {
    "loghunter": {
        # LH_ROOT — base for RELATIVE paths in config-file values. Empty = use
        # CWD for relative paths. Absolute and ~-anchored paths ignore it.
        # Env override: LOGHUNTER_ROOT (env wins over config).
        "root": "~/.loghunter",
        "detect": "all",
        # Conventional source locations; tried out-of-box. pihole/cloudtrail
        # stay None (opt-in — no missing-file warning when absent).
        "zeek_dir": "/var/log/zeek",
        "syslog_dir": "/var/log",
        "pihole_dir": None,
        "cloudtrail_dir": None,
        # Internal networks for traffic-direction classification. Topology
        # fact, not detector tuning. RFC1918 default; override only if your
        # internal address plan differs.
        "home_net": ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"],
        # Where exporters write pulled logs. Backends and per-query stanzas
        # may override per the precedence cascade.
        # Trailing slash communicates directory intent to be_like_water —
        # without it, a non-existent path would be interpreted as a FILE.
        "export_dir": "exports/",
        # report_dir intentionally OMITTED — no shipped default. Setting it is
        # an explicit opt-in to file-mode analyze output. Bare analyze prints
        # to stdout when report_dir is unset and --out is not passed.
        "output_format": "text",
        # Validated eagerly by load() via parse_window_span.
        "default_window": "1d",
        "warn_above": 5_000_000,
        # Per-detector total row cap for TEXT output only (json/csv/html
        # render everything — machine formats must not lose data). The cap
        # is a running budget across the detector's subsections in declared
        # order; the disclosure line reports rendered-vs-total. 0 = unlimited.
        "max_findings_per_detector": 100,
    },
    # Per-detector overrides, keyed by detector name; read by
    # get_detector_config. Empty by default.
    "detectors": {},
    "allowlist": {
        "domain_patterns": [
            "~/.loghunter/allowlist.d/domains_user.txt",
        ],
        "connection_rules": [
            "~/.loghunter/allowlist.d/connections.txt",
        ],
        "allowlist_dir": "~/.loghunter/allowlist.d/",
    },
    "export": {
        "splunk": {
            "host": "",
            "port": 8089,
            "username": "",
            "password": "",
        },
        # cloudtrail exporter — boto3 pull from S3, writes CloudTrail JSON locally.
        # path is the s3:// URL to the CloudTrail tree; egress_warn_gb is the
        # cost guard threshold. Activation is a non-empty path.
        "cloudtrail": {
            "path": "",
            "egress_warn_gb": 5.0,
        },
    },
}
|
|
113
|
+
|
|
114
|
+
# Config locations tried when no explicit file is given, highest precedence
# first; _find_config_file returns the first match. The "~" in the user
# path is expanded once, when this module is imported.
SEARCH_PATHS: list[Path] = [
    Path("~/.loghunter/config.toml").expanduser(),
    Path("/etc/loghunter/config.toml"),
]
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def load(config_file: str | Path | None = None) -> dict[str, Any]:
    """Load config from the precedence chain and return the merged config dict.

    If ``config_file`` is given it is used directly (with a leading ``~``
    expanded, since shells do not expand it inside ``--config=~/x``).
    If it is None, the search path is walked; when nothing is found there,
    a deep copy of the built-in defaults is returned cleanly.

    Raises:
        ConfigError: the explicit ``config_file`` is missing or is a
            directory; the file fails to parse; the ``loghunter`` section is
            not a table; or ``default_window`` is not a valid duration.
    """
    if config_file is not None:
        path = Path(config_file)
        try:
            path = path.expanduser()
        except RuntimeError:
            # Home directory could not be resolved; fall through with the
            # literal path so the "not found" error below names it.
            pass
        if not path.exists():
            raise ConfigError(
                f"Config file not found: {path}\n"
                f"Check the path or run 'loghunter init' to create a config."
            )
        if path.is_dir():
            # exists() is true for directories, which would otherwise fail
            # later with a raw IsADirectoryError when opened.
            raise ConfigError(
                f"Config path is a directory, not a file: {path}\n"
                f"Pass the path of the config.toml file itself."
            )
        config = _load_file(path)
    else:
        found = _find_config_file()
        if found is None:
            config = copy.deepcopy(_DEFAULTS)
        else:
            config = _load_file(found)

    # A scalar `loghunter = ...` in the file replaces the default table during
    # the merge; report that as a config problem instead of an AttributeError.
    main_section = config.get("loghunter", {})
    if not isinstance(main_section, dict):
        raise ConfigError(
            f"Config key 'loghunter' must be a table, got "
            f"{type(main_section).__name__}."
        )

    # Validate default_window eagerly so typos in user config fail at load time —
    # not lazily during the run, where bounded paths would never notice.
    parse_window_span(main_section.get("default_window"))
    return config
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def default_allowlist_paths() -> dict[str, Any]:
    """Hand back an independent copy of the built-in allowlist defaults.

    ``_DEFAULTS["allowlist"]`` is the single source of truth for fallback
    paths when an allowlist config key is absent. It is used by
    ``common/allowlist.py:build_matcher`` when a raw / notebook config
    arrives without ``domain_patterns``, ``connection_rules``, or
    ``allowlist_dir`` set (the ``cfg.load`` deep-merge would otherwise have
    supplied them from ``_DEFAULTS``).

    The result is deep-copied, so callers may mutate it freely.
    """
    allowlist_defaults = _DEFAULTS["allowlist"]
    return copy.deepcopy(allowlist_defaults)
|
|
157
|
+
|
|
158
|
+
|
|
159
|
+
def get_detector_config(
    config: dict[str, Any],
    detector_name: str,
    detector_defaults: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Build the effective settings for one detector.

    Starts from a deep copy of ``detector_defaults`` (an empty dict when none
    are supplied) and layers ``config["detectors"][detector_name]`` on top,
    so values from the config win over the detector's own defaults. A missing
    ``detectors`` section or a missing entry for this detector contributes
    nothing.
    """
    defaults_copy = copy.deepcopy(detector_defaults or {})
    all_detectors = config.get("detectors", {})
    overrides = all_detectors.get(detector_name, {})
    return _deep_merge(defaults_copy, overrides)
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def _find_config_file() -> Path | None:
    """Walk SEARCH_PATHS and return the first entry that is an existing file.

    Entries that do not exist, or that exist but are not files (for example a
    directory that happens to be named ``config.toml``), are skipped so the
    caller never tries to open something unreadable as TOML. Returns None
    when no entry qualifies.
    """
    return next((path for path in SEARCH_PATHS if path.is_file()), None)
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _load_file(path: Path) -> dict[str, Any]:
    """Read the TOML file at ``path`` and layer it over the built-in defaults.

    The returned dict also carries a ``__user_set__`` sidecar: for every
    top-level key in the file, the set of key names the operator declared
    beneath it (an empty set when that top-level value is not a table). This
    is provenance metadata for runner-level disclosures (e.g. "default
    RFC1918 vs. operator-declared home_net") — comparing values alone cannot
    tell a defaulted value from a user-declared one that happens to equal the
    default. The "no config file found" path in load() never calls this
    function, so a missing sidecar correctly means "no user declarations".

    Raises:
        ConfigError: the file is not valid TOML.
    """
    try:
        with path.open("rb") as handle:
            user_config = tomllib.load(handle)
    except tomllib.TOMLDecodeError as exc:
        raise ConfigError(
            f"Config file parse error in {path}:\n  {exc}\n"
            f"Check the file for TOML syntax errors."
        ) from exc

    merged = _deep_merge(copy.deepcopy(_DEFAULTS), user_config)

    # Record which keys the operator actually wrote, section by section.
    declared: dict[str, set] = {}
    for section, content in user_config.items():
        if isinstance(content, dict):
            declared[section] = set(content.keys())
        else:
            declared[section] = set()
    merged["__user_set__"] = declared
    return merged
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Recursively merge override into base, returning a new dict.

    Scalars and lists in override replace those in base. Dicts are merged
    recursively. Neither input is modified, and the result shares no mutable
    containers with either of them: values taken from ``base`` are
    deep-copied just like values taken from ``override``, so mutating the
    result can never leak back into a defaults table.

    Key order is: every key of ``base`` in its original order, followed by
    keys that appear only in ``override``.
    """
    merged: dict[str, Any] = {}
    for key, base_val in base.items():
        if key not in override:
            # Untouched by the override: copy so the result stays independent.
            merged[key] = copy.deepcopy(base_val)
            continue
        new_val = override[key]
        if isinstance(base_val, dict) and isinstance(new_val, dict):
            merged[key] = _deep_merge(base_val, new_val)
        else:
            merged[key] = copy.deepcopy(new_val)

    for key, new_val in override.items():
        if key not in base:
            merged[key] = copy.deepcopy(new_val)
    return merged
|