loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
"""Shared display constants and helpers for human-facing terminal output.
|
|
2
|
+
|
|
3
|
+
The ``liveness`` context manager is the shared primitive for ROADMAP-12 TTY
|
|
4
|
+
progress narration. It draws an indeterminate spinner on stderr for opaque
|
|
5
|
+
blocking phases when stderr is a tty, and seals a permanent one-line record
|
|
6
|
+
on the way out (visible on tty AND non-tty — only the live animation is
|
|
7
|
+
tty-gated). Countable phases stay on ``tqdm``; ``liveness`` is for the cases
|
|
8
|
+
where there is no natural tick.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
import threading
|
|
16
|
+
from contextlib import contextmanager
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any, Iterable, Iterator
|
|
19
|
+
|
|
20
|
+
from tqdm import tqdm
|
|
21
|
+
|
|
22
|
+
TEXT_RULE_WIDTH = 80
|
|
23
|
+
TEXT_RULE = "─" * TEXT_RULE_WIDTH
|
|
24
|
+
TEXT_RULE_DOUBLE = "═" * TEXT_RULE_WIDTH
|
|
25
|
+
|
|
26
|
+
# Spinner frames cycle in this exact order. ASCII only — no unicode-width
|
|
27
|
+
# surprises in clearing math.
|
|
28
|
+
_SPINNER_FRAMES = ("|", "/", "-", "\\")
|
|
29
|
+
# Per-frame interval. 120ms sits in the middle of the 100-150ms band that
|
|
30
|
+
# reads as "alive" without thrashing the terminal.
|
|
31
|
+
_SPINNER_INTERVAL_S = 0.12
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ── Method-chrome color seam (text handler only) ────────────────────────────
|
|
35
|
+
#
|
|
36
|
+
# Minimal: a single SGR constant for the method-glow paint, a TTY/NO_COLOR
|
|
37
|
+
# gate, and one paint() helper. Not a general terminal-capability layer; the
|
|
38
|
+
# only consumer is the text handler's Detectors: line. Rebound for retuning
|
|
39
|
+
# the glow in a single place — Dave will experiment live.
|
|
40
|
+
_METHOD_SGR = "\x1b[96;1m" # bright-cyan + bold
|
|
41
|
+
_RESET = "\x1b[0m"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _stream_isatty(stream: Any) -> bool:
|
|
45
|
+
"""Raw TTY probe. No color-policy coupling.
|
|
46
|
+
|
|
47
|
+
Shared by ``_color_enabled``, ``_LivenessHandle``, and ``progress``. Color
|
|
48
|
+
layers ``NO_COLOR``/``TERM=dumb`` on top; liveness and progress gate on
|
|
49
|
+
TTY only — a color preference is not a progress preference.
|
|
50
|
+
"""
|
|
51
|
+
isatty = getattr(stream, "isatty", lambda: False)
|
|
52
|
+
try:
|
|
53
|
+
return bool(isatty())
|
|
54
|
+
except Exception:
|
|
55
|
+
return False
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _color_enabled(stream: Any) -> bool:
    """Decide whether ANSI color may be written to ``stream``.

    Color is allowed only when all three conditions hold: the stream is a
    TTY, the ``NO_COLOR`` environment variable is absent (any value,
    including empty, opts out), and ``TERM`` is not ``"dumb"``. Pipes and
    file streams fail the TTY check and therefore stay plain.
    """
    env = os.environ
    return (
        _stream_isatty(stream)
        and env.get("NO_COLOR") is None
        and env.get("TERM") != "dumb"
    )
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def paint(text: str, *, stream: Any) -> str:
    """Return ``text`` wrapped in the method SGR sequence, or unchanged.

    The text is returned as-is whenever ``_color_enabled(stream)`` is false
    (non-TTY, ``NO_COLOR`` set, or ``TERM=dumb``). Otherwise it is prefixed
    with ``_METHOD_SGR`` and followed by ``_RESET``.
    """
    if not _color_enabled(stream):
        return text
    return f"{_METHOD_SGR}{text}{_RESET}"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def human_bytes(n: float) -> str:
    """Render a byte count in ``-h`` style (B / KB / MB / GB / TB).

    Values below 1024 are truncated to a whole number of bytes. Larger
    values are divided by the matching power of 1024 and shown with one
    decimal place. TB is the ceiling unit, so anything at or above 1024**4
    is expressed in TB.

    Consolidated from outputs/text.py's prior ``_format_bytes``. Digest
    ``conn.py`` keeps its own deliberate local helper (see DESIGN.md) — do
    not repoint that one here.
    """
    if n < 1024:
        return f"{int(n)} B"
    # (suffix, divisor) in ascending order; the last rung has no upper bound.
    ladder = (
        ("KB", 1024),
        ("MB", 1024 ** 2),
        ("GB", 1024 ** 3),
        ("TB", 1024 ** 4),
    )
    for suffix, divisor in ladder[:-1]:
        if n < divisor * 1024:
            return f"{n / divisor:.1f} {suffix}"
    suffix, divisor = ladder[-1]
    return f"{n / divisor:.1f} {suffix}"
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def compact_home(path: "str | Path") -> str:
    """Abbreviate the user's home directory at the front of ``path`` to ``~``.

    Works purely on the string form of ``path`` so that details such as a
    trailing separator survive untouched. The input string is returned
    unchanged when ``os.path.expanduser("~")`` cannot resolve a home
    directory, or when ``path`` is not the home directory itself or
    something beneath it.
    """
    shown = str(path)
    home_dir = os.path.expanduser("~")
    if home_dir in ("", "~"):
        # expanduser could not resolve anything useful.
        return shown
    if shown == home_dir:
        return "~"
    # Compare against "home + separator" so /home/al does not match /home/alice.
    anchored = home_dir if home_dir.endswith(os.sep) else home_dir + os.sep
    if not shown.startswith(anchored):
        return shown
    return "~" + os.sep + shown[len(anchored):]
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def progress(
    iterable: Iterable[Any],
    *,
    desc: str,
    show_progress: bool = True,
    unit: str = " lines",
    total: int | None = None,
    stream: Any = None,
) -> Iterator[Any]:
    """Wrap ``iterable`` in a TTY-gated tqdm counter for loader read loops.

    When ``show_progress`` is false or the target stream (``stream``,
    defaulting to ``sys.stderr``) is not a TTY, the result is simply
    ``iter(iterable)`` and ``tqdm`` is never constructed. Only the raw TTY
    probe and the explicit flag are consulted; color policy
    (``NO_COLOR``/``TERM=dumb``) plays no part — a color preference is not a
    progress preference.

    On a TTY the bar is built WITHOUT an iterable and advanced with
    ``update(1)`` from a generator. Because ``iter(gen) is gen``, parsers
    that call ``iter()`` on the returned object a second time (Zeek's
    ``itertools.chain(prefix, line_iter)``, CloudTrail's second loop) keep
    feeding the same counter; a bare tqdm's own ``__iter__`` would be
    orphaned on the second call (the ``loaded X: 0.00 lines`` bug).

    The pinned ``bar_format`` reproduces the long-standing NDJSON loader bar
    when ``unit=" lines"``; ``unit`` is a parameter so non-line-oriented
    callers can label what they actually count.

    NOTE(review): the bar is created before the generator is first advanced,
    so a result that is never iterated never reaches ``close()`` — confirm
    callers always consume it.
    """
    target = sys.stderr if stream is None else stream
    if not (show_progress and _stream_isatty(target)):
        return iter(iterable)

    meter = tqdm(
        desc=desc,
        unit=unit,
        unit_scale=True,
        leave=True,
        mininterval=0.5,
        total=total,
        file=target,
        bar_format=f"{{desc}}: {{n_fmt}}{unit} [{{elapsed}}]",
    )

    def _tally() -> Iterator[Any]:
        # Count each item just before handing it to the consumer; the bar is
        # closed on exhaustion, on error, or when the generator is closed.
        try:
            for entry in iterable:
                meter.update(1)
                yield entry
        finally:
            meter.close()

    return _tally()
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class _LivenessHandle:
    """Handle returned from a ``liveness`` context manager.

    Owns the spinner thread, the captured stderr stream, and the bookkeeping
    needed to keep ``seal()`` and ``__exit__`` honest about whether the
    spinner actually drew anything.
    """

    def __init__(self, label: str, delay: float) -> None:
        """Record the label/delay and capture ``sys.stderr``; starts no thread.

        ``label`` is the text drawn after the spinner frame; ``delay`` is the
        number of seconds the spinner thread waits before its first frame.
        """
        self._label = label
        self._delay = delay
        # Capture the stream at construction so a single
        # monkeypatch.setattr(sys, "stderr", fake) before __enter__ is
        # observed for the whole lifecycle, and so a later redirect of
        # sys.stderr does not steal our writes.
        self._stream = sys.stderr
        self._isatty = _stream_isatty(self._stream)
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        # _drew flips True inside the spinner thread immediately before its
        # first frame write. seal() / __exit__ only emit a clearing sequence
        # when _drew is True — a phase that seals before the spinner ever
        # drew prints exactly the sealed line, no \r flicker.
        self._drew = False
        self._sealed = False
        # Guard concurrent writes between the spinner thread and seal/exit.
        self._lock = threading.Lock()

    # ── lifecycle ───────────────────────────────────────────────────────────

    def _start(self) -> None:
        """Launch the daemon spinner thread; a no-op when the stream is not a TTY."""
        if not self._isatty:
            return
        self._thread = threading.Thread(target=self._spin, daemon=True)
        self._thread.start()

    def _spin(self) -> None:
        """Spinner thread body: wait out the delay, then redraw until stopped.

        Each frame is written as ``"\\r<frame> <label>"`` under the lock. The
        loop exits when the stop event is set or when a write/flush raises.
        """
        # Wait out the initial delay. If sealed during the delay, leave
        # without ever writing — this is the seal-before-delay invariant.
        if self._stop.wait(self._delay):
            return
        i = 0
        frames = _SPINNER_FRAMES
        while not self._stop.is_set():
            with self._lock:
                # Re-check under the lock: stop may have been set between the
                # loop test and acquiring the lock.
                if self._stop.is_set():
                    return
                # Flip _drew before the first write so the buffer state
                # and the flag agree from the same critical section.
                self._drew = True
                try:
                    self._stream.write(f"\r{frames[i % len(frames)]} {self._label}")
                    self._stream.flush()
                except Exception:
                    # Best-effort: a broken stream just ends the animation.
                    return
            i += 1
            # Event.wait doubles as the frame timer and the stop signal.
            if self._stop.wait(_SPINNER_INTERVAL_S):
                return

    def _clear_line(self) -> None:
        """Overwrite the spinner line with spaces and return the cursor to column 0.

        Write/flush failures are swallowed; clearing is best-effort.
        """
        # Pad-erase wide enough to cover one frame char + space + label.
        width = 2 + len(self._label)
        try:
            self._stream.write("\r" + (" " * width) + "\r")
            self._stream.flush()
        except Exception:
            pass

    def seal(self, text: str) -> None:
        """Commit a permanent one-line record and stop the spinner.

        Idempotent — a second seal is a no-op. On a tty, clears the spinner
        line first (only if the spinner actually drew); otherwise writes the
        record straight to the stream. The sealed record is the only stderr
        artifact that promises "this phase finished cleanly."
        """
        with self._lock:
            if self._sealed:
                return
            self._sealed = True
        # Stop and join outside the lock: the spinner thread takes the same
        # lock for each frame, so joining while holding it could block.
        self._stop.set()
        if self._thread is not None:
            self._thread.join()
        with self._lock:
            if self._isatty and self._drew:
                self._clear_line()
            try:
                self._stream.write(f"{text}\n")
                self._stream.flush()
            except Exception:
                pass

    def _teardown(self, had_exception: bool) -> None:
        """Single teardown path called from __exit__.

        Stops the spinner thread, then clears the partial spinner line if
        the spinner drew anything. Never writes a sealed record — that is
        seal()'s job, and a body that did not call seal() (whether by
        exception or by choice) leaves no record.
        """
        with self._lock:
            already_sealed = self._sealed
        self._stop.set()
        if self._thread is not None:
            self._thread.join()
        if already_sealed:
            # seal() already cleared and wrote; nothing more to do.
            return
        with self._lock:
            if self._isatty and self._drew:
                self._clear_line()
        # had_exception is informational — same teardown either way once
        # we know seal() did not fire. A partial spinner gets cleared; no
        # record is written. The exception (if any) propagates from
        # __exit__'s caller.
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
@contextmanager
def liveness(label: str, delay: float = 0.0) -> Iterator[_LivenessHandle]:
    """Show an indeterminate spinner while an opaque blocking phase runs.

    Usage::

        with liveness("running beacon") as ln:
            findings = run_beacon(ctx)
            ln.seal(f"beacon: {len(findings)} findings")

    On a tty a single-line spinner ``"<frame> <label>"`` appears on stderr
    once ``delay`` seconds have passed, so fast phases never flicker. On a
    non-tty nothing is animated. ``ln.seal(text)`` commits the permanent
    record line.

    When the body raises — KeyboardInterrupt from Ctrl-C included — the
    spinner line is cleared, no sealed record is written, and the exception
    propagates. That is what lets the runner's top-level Ctrl-C handler
    print "Stopped." without a false-success seal landing just before it.
    """
    handle = _LivenessHandle(label, delay)
    handle._start()
    # Assume failure until the body returns control normally; any exception
    # thrown in at the yield (BaseException included) leaves the flag set.
    body_failed = True
    try:
        yield handle
        body_failed = False
    finally:
        handle._teardown(had_exception=body_failed)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Shared exception types used across runner, exporters, and CLI.
|
|
2
|
+
|
|
3
|
+
ExportAborted lives here (not under exporters/) so that runner.py can raise it
|
|
4
|
+
without creating a runner → exporter dependency. The CLI catches it once and
|
|
5
|
+
translates to a clean exit 0.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ExportAborted(Exception):
    """Raised when the operator declines an advisory confirmation prompt.

    Used by the runner's large-dataset prompt and by exporter backends'
    cost-prompts (e.g. CloudTrail's S3 egress guard). Caught by cli.main()
    and translated to a clean exit 0 with the message.

    Deliberately a direct ``Exception`` subclass and not a ``ValueError``,
    so it stays distinct from the user-facing error path.
    """
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DigestEmpty(Exception):
    """Raised by run_digest when a RECOGNIZED schema loads to an empty frame.

    Not an error — the file was understood, it simply had no parseable
    records (e.g. a Zeek conn.log with header rows but zero data rows).
    Callers catch this and narrate it without rendering a card.

    Explicitly NOT a subclass of ValueError: catch-arms in cli.py that
    handle real per-path failures (corrupt gzip, parser errors) MUST NOT
    consume DigestEmpty, which is a control signal carrying a successful
    "the file was understood and contained nothing to render" outcome.

    basename: filename when the digest source was a file (sniff-driven
    fan-out, single-file source_dir); directory name when the source was
    a configured directory (bare-config branch). The stderr narration
    "recognized as <schema> but no parseable records" reads correctly
    in both cases.

    schema: name of the recognized schema; kept as an attribute and
    embedded in the exception message.
    """

    def __init__(self, basename: str, schema: str) -> None:
        super().__init__(
            f"recognized {basename} as {schema} but no parseable records"
        )
        self.basename = basename
        self.schema = schema

    def __reduce__(self):
        """Rebuild from ``(basename, schema)`` when copied or pickled.

        The default ``BaseException.__reduce__`` replays ``self.args`` — the
        single formatted message — into the two-argument ``__init__``, which
        raises ``TypeError``. Supplying the constructor arguments explicitly
        keeps ``copy.copy`` and pickle round-trips working; the instance
        ``__dict__`` is passed along as state so extra attributes survive.
        """
        return (type(self), (self.basename, self.schema), vars(self) or None)
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
"""Finding dataclass and Severity enum — the shared contract between detectors and outputs.
|
|
2
|
+
|
|
3
|
+
Detectors produce list[Finding]. Output handlers consume list[Finding].
|
|
4
|
+
Nothing else crosses that boundary.
|
|
5
|
+
|
|
6
|
+
DigestCard and DigestSlot are peer types to Finding/RunSummary — used by the
|
|
7
|
+
digest verb to render an orient-before-the-hunt card. They carry no severity,
|
|
8
|
+
no evidence, no next_steps; digest never produces a verdict.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import copy
|
|
14
|
+
import enum
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from datetime import datetime, timedelta
|
|
17
|
+
from typing import TYPE_CHECKING, Any
|
|
18
|
+
|
|
19
|
+
if TYPE_CHECKING:
|
|
20
|
+
import pandas as pd
|
|
21
|
+
|
|
22
|
+
from loghunter.common.allowlist import AllowlistMatcher
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class Severity(enum.Enum):
    """Detection severity levels, rendered as bracketed tags in text output.

    Ordering is "most severe first": ``HIGH < MEDIUM < LOW < INFO``, so a
    plain ascending sort lists the most severe entries at the top.
    """

    HIGH = "H"
    MEDIUM = "M"
    LOW = "L"
    INFO = "I"

    def __str__(self) -> str:
        """Return the bracketed one-letter tag, e.g. ``[H]``."""
        return f"[{self.value}]"

    def __lt__(self, other: object) -> bool:
        """Return True when ``self`` ranks ahead of (is more severe than) ``other``.

        Comparing against anything that is not a ``Severity`` returns
        ``NotImplemented`` so Python raises its usual ``TypeError``, instead
        of the ``ValueError`` a failed ``index`` lookup would produce.
        """
        if not isinstance(other, Severity):
            return NotImplemented
        # Explicit ranking rather than definition order, so reordering the
        # members above cannot silently change comparisons.
        ranking = (Severity.HIGH, Severity.MEDIUM, Severity.LOW, Severity.INFO)
        return ranking.index(self) < ranking.index(other)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class Finding:
    """A single detection result produced by a detector.

    Detectors return list[Finding]. Output handlers render them.
    The evidence dict is detector-specific — no fixed schema.
    description and next_steps are only shown in verbose output.
    """

    detector: str
    severity: Severity
    title: str
    description: str  # shown only in verbose output
    evidence: dict[str, Any]  # detector-specific payload; no fixed schema
    next_steps: list[str]  # shown only in verbose output
    ts_generated: datetime  # presumably when the finding was produced — confirm with detectors
    data_window: tuple[datetime, datetime]  # presumably (start, end) of the analysed data — confirm
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(frozen=True)
class MethodTag:
    """Per-detector method label, surfaced in the run-summary banner.

    ``named=True`` marks a published algorithm (FFT, drain3, fast-HDBSCAN) —
    rendered with parentheses and painted by the text handler. ``named=False``
    marks an honest house badge (pattern, heuristics, statistical) — rendered
    in plain brackets, never painted. The parens-vs-brackets carry 100% of
    the meaning; color is enhancement only.

    Instances are immutable (``frozen=True``).
    """

    label: str  # the method text shown in the banner
    named: bool  # True: published algorithm (parens, painted); False: house badge (brackets)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@dataclass
class RunSummary:
    """Metadata about a loghunter run, printed before analysis begins and passed to output handlers."""

    data_window: tuple[datetime, datetime]
    record_counts: dict[str, int]  # presumably keyed by log/source name — confirm against the runner
    data_size_bytes: int
    detectors_run: list[str]
    detectors_skipped: dict[str, str]  # name → reason
    notes: list[str] = field(default_factory=list)
    data_sources: list[str] = field(default_factory=list)
    # Detector name → its MethodTag; the value type allows None for a
    # detector without a tag.
    detector_methods: dict[str, "MethodTag | None"] = field(default_factory=dict)
    # The window the operator asked for (default-window spec, explicit since/until
    # span, or since→now), used by the text handler's data-found underfill
    # parenthetical. None = unconstrained (--all / until-only / bounded full-load).
    requested_span: timedelta | None = None
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
@dataclass
class DigestSlot:
    """One row in a DigestCard's fields block.

    Bi-state:

    - SPEAKING: cells (pre-formatted column strings) AND
      entity/magnitude/ratio (raw values) are populated together. Cells and
      raw values come from the same source numbers; keeping the raw values
      lets insight selection sort by salience without parsing "3.7x" back
      out of a display string.

    - NON-SPEAKING (applicable but nothing notable, or feed cannot compute):
      all four value fields are None. The renderer never sees these slots
      — the summariser filters them out before handing fields to the card.
    """

    label: str  # "conn-share", "fan-out", ...
    statistic: str  # "cliff" | "tail" | "rate" | "share" | "dist"
    # The four value fields below are all set (SPEAKING) or all None
    # (NON-SPEAKING); the dataclass itself does not enforce this pairing.
    cells: list[str] | None = None  # pre-formatted column strings
    entity: str | None = None  # raw value
    magnitude: float | None = None  # raw value
    ratio: float | None = None  # raw value
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@dataclass
class DigestCard:
    """A digest's per-schema rendered body. Peer to Finding, not a subclass.

    Carries the spine-derived ambient facts (window, record count, histogram,
    bytes) plus the summariser-derived ``zone1_extras`` ambient block,
    ``insights`` (prose sentences mechanically derived from speaking gated
    slots), and ``fields`` (the display-ready, already-filtered speaking
    non-insight slots). Selection happens in the summariser; the renderer
    is dumb.

    ``data_window`` may be ``(None, None)`` when timestamps in the loaded
    frame are unparseable below the confidence floor; the renderer renders
    identity line 2 as a bare ``—`` and the histogram line as
    ``(timeline unavailable)``.

    ``timeline_unavailable`` is the explicit sentinel for the histogram-
    suppression path. Without it, an empty ``histogram_counts`` could also
    mean "no events in window" (the valid empty-frame case) — a renderer
    looking only at counts cannot disambiguate.
    """

    schema: str
    source_name: str  # identity-line-1; basename of source file or dir
    data_window: tuple[datetime | None, datetime | None]  # (None, None) when timestamps are unusable
    record_count: int
    histogram_counts: list[int]
    histogram_unit: str  # "hr" | "day"
    histogram_peak: int
    zone1_extras: list[tuple[str, str]] = field(default_factory=list)  # summariser-derived ambient block
    insights: list[str] = field(default_factory=list)  # prose sentences derived from speaking slots
    fields: list[DigestSlot] = field(default_factory=list)  # already-filtered speaking, non-insight slots
    data_size_bytes: int = 0
    timeline_unavailable: bool = False  # explicit histogram-suppression sentinel
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
@dataclass
class BlobCard:
    """Unrecognized-source panel for the blob digest path. Peer to DigestCard,
    not a subclass. Carries the description-of-bytes-as-bytes panel the blob
    renderer needs — no slots, no histogram, no data_window — by design.

    The blob path describes bytes and never extracts fields. No timestamp is
    read, no schema is assumed. ``line_length_shape`` is exactly ``"uniform"``
    or ``"varied"`` when set. The char-class fractions are computed over RAW
    sampled BYTES (before UTF-8 decode) so binary-looking input shows up as
    low ``printable_pct`` rather than being masked by ``errors="replace"``.

    Vanish-don't-dash: optional fields default to None when their slot does
    not apply (binary terminal magic, drain3 dormant, freeform-no-structure
    template floor). The renderer omits the row entirely — no "-", no
    placeholder. Required fields are sample-derived facts that always exist
    for any input the profiler can read.

    O(sample) rail: every field is computed from the bounded sample. The only
    whole-file fact is ``byte_size`` (a stat). ``sampled_line_count`` is the
    line count over the sample, NEVER a whole-file total.
    """

    # always present — sample-derived facts that exist for any input
    source_name: str  # identity-line-1; basename of the source path
    byte_size: int  # on-disk size from stat (compressed for .gz)
    sampled_line_count: int  # line count over the sample only, never a whole-file total
    sample_read_count: int  # 1 head + K seeks (plain); 1 (compressed head-only)
    is_compressed: bool
    printable_pct: float  # computed over raw sampled bytes, before UTF-8 decode
    nonprintable_pct: float  # computed over raw sampled bytes, before UTF-8 decode
    utf8_clean: bool  # strict-decode probe over the sample

    # identification — exactly one of file_type_guess or shape_guess set
    file_type_guess: str | None = None  # terminal magic label (e.g. "PNG image")
    file_type_magic: bytes | None = None  # matched magic bytes for the Bytes row repr
    shape_guess: str | None = None  # text-shape cascade result

    # text slots — None on binary terminal hit
    mean_line_length: float | None = None
    median_line_length: float | None = None
    line_length_p95: int | None = None
    max_line_length: int | None = None
    line_length_stdev: float | None = None
    line_length_shape: str | None = None  # "uniform" | "varied" | None

    # (token, count) pairs presumably — confirm ordering with the profiler
    top_tokens: list[tuple[str, int]] | None = None

    # JSON shape-guess only — first-seen union of top-level object keys
    # across the sample. None on binary, on non-JSON text, and on top-
    # level-array/scalar JSON (no object keys to list). When set, the
    # renderer emits a `fields:` row in place of `tokens:` — names-no-
    # values, structurally describing the shape one rung deeper than the
    # `shape: JSON` label.
    json_field_names: list[str] | None = None

    # templates — None on binary / freeform-no-structure / drain3 dormant
    distinct_templates: int | None = None
    top_template_coverage_pct: float | None = None
    top_template_n: int | None = None
    singleton_template_count: int | None = None
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
@dataclass
class DetectorContext:
    """Everything a detector needs to do its job.

    The framework constructs this and passes it to each detector's run() function.
    Detectors never open files, read config, or format output directly.

    home_net is run/environment metadata (operator-declared internal networks
    for traffic-direction classification) — peer to data_window and data_sources,
    not detector tuning. Empty list means "not supplied"; detectors that need
    direction classification may apply a sensible fallback in that case.

    Verbosity is intentionally absent: the result set is verbosity-invariant
    by construction (W6). Level-aware filtering happens at the text handler,
    not in detector ``run()``.
    """

    # "pd.DataFrame" and "AllowlistMatcher" are string annotations because
    # both names are imported only under TYPE_CHECKING.
    logs: dict[str, "pd.DataFrame"]  # presumably log name → loaded frame — confirm with the loader
    config: dict[str, Any]
    allowlist: "AllowlistMatcher"
    data_window: tuple[datetime, datetime]
    data_sources: list[str] = field(default_factory=list)
    home_net: list[str] = field(default_factory=list)  # empty list means "not supplied"
|