loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,323 @@
1
+ """Shared display constants and helpers for human-facing terminal output.
2
+
3
+ The ``liveness`` context manager is the shared primitive for ROADMAP-12 TTY
4
+ progress narration. It draws an indeterminate spinner on stderr for opaque
5
+ blocking phases when stderr is a tty, and seals a permanent one-line record
6
+ on the way out (visible on tty AND non-tty — only the live animation is
7
+ tty-gated). Countable phases stay on ``tqdm``; ``liveness`` is for the cases
8
+ where there is no natural tick.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import os
14
+ import sys
15
+ import threading
16
+ from contextlib import contextmanager
17
+ from pathlib import Path
18
+ from typing import Any, Iterable, Iterator
19
+
20
+ from tqdm import tqdm
21
+
22
# Column width of the horizontal rules drawn in human-facing text output.
TEXT_RULE_WIDTH = 80
# Light and heavy rules, each exactly TEXT_RULE_WIDTH glyphs long.
TEXT_RULE = "─" * TEXT_RULE_WIDTH
TEXT_RULE_DOUBLE = "═" * TEXT_RULE_WIDTH

# Spinner animation frames, shown in this order and then repeated. They are
# deliberately plain ASCII so that every frame is one column wide and the
# line-clearing arithmetic never has to reason about unicode display width.
_SPINNER_FRAMES = ("|", "/", "-", "\\")
# Seconds between frames. 120ms is the midpoint of the 100-150ms range that
# looks "alive" without redrawing the terminal excessively.
_SPINNER_INTERVAL_S = 0.12


# ── Method-chrome color seam (text handler only) ────────────────────────────
#
# Intentionally tiny: one SGR constant for the method-glow paint, one
# TTY/NO_COLOR gate, and one paint() helper. This is not a general
# terminal-capability layer; per the original note the only consumer is the
# text handler's "Detectors:" line. Retune the glow by rebinding the single
# constant below.
_METHOD_SGR = "\x1b[96;1m"  # bright-cyan + bold
_RESET = "\x1b[0m"
42
+
43
+
44
def _stream_isatty(stream: Any) -> bool:
    """Report whether ``stream`` claims to be a terminal.

    This is the raw probe only: it knows nothing about color policy.
    ``NO_COLOR`` / ``TERM=dumb`` are layered on top by the color gate, while
    liveness and progress narration consult this probe alone (a color
    preference is not a progress preference).

    Any stream without a usable ``isatty`` — missing attribute, non-callable
    attribute, or a call that raises — is treated as "not a tty".
    """
    probe = getattr(stream, "isatty", None)
    if probe is None:
        return False
    try:
        # bool() stays inside the try so a misbehaving return value is also
        # reported as "not a tty" rather than escaping.
        return bool(probe())
    except Exception:
        return False
56
+
57
+
58
def _color_enabled(stream: Any) -> bool:
    """Return True when ``stream`` is a real TTY and color is not opted out.

    Three gates, checked in order:

    1. ``stream`` must be a terminal. File streams and pipes are not TTYs, so
       they come out plain with no extra wiring at call sites.
    2. ``NO_COLOR`` must not be set to a non-empty value. The NO_COLOR
       convention defines the opt-out as "present and not an empty string",
       so an empty ``NO_COLOR=`` does NOT disable color.
    3. ``TERM`` must not be the legacy ``dumb`` signal.
    """
    if not _stream_isatty(stream):
        return False
    # Truthiness covers both "unset" (None) and "set but empty" ("").
    if os.environ.get("NO_COLOR"):
        return False
    return os.environ.get("TERM") != "dumb"
72
+
73
+
74
def paint(text: str, *, stream: Any) -> str:
    """Return ``text`` wrapped in the method SGR if ``stream`` admits color.

    When ``_color_enabled(stream)`` is false the text is returned untouched.
    Otherwise it is bracketed by ``_METHOD_SGR`` and ``_RESET``; that single
    SGR constant is the one place to retune the glow.
    """
    if not _color_enabled(stream):
        return text
    return f"{_METHOD_SGR}{text}{_RESET}"
81
+
82
+
83
def human_bytes(n: float) -> str:
    """Format a byte count the way ``-h`` style tools do.

    Values below 1024 are shown as a whole number of bytes (``"512 B"``);
    larger values are scaled by powers of 1024 and shown with one decimal in
    KB, MB, GB or TB. TB is the largest unit, so anything at or above
    1024**4 is expressed in TB.

    Original author's note: this was consolidated from a former
    ``_format_bytes`` in outputs/text.py, and digest ``conn.py`` deliberately
    keeps its own local helper — do not repoint that one here.
    """
    if n < 1024:
        return f"{int(n)} B"
    # (power of 1024 to divide by, unit suffix); a value belongs to the first
    # unit whose next power it does not reach.
    for power, suffix in ((1, "KB"), (2, "MB"), (3, "GB")):
        if n < 1024 ** (power + 1):
            return f"{n / (1024 ** power):.1f} {suffix}"
    return f"{n / (1024 ** 4):.1f} TB"
100
+
101
+
102
def compact_home(path: "str | Path") -> str:
    """Render ``path`` as a string, abbreviating the home directory to ``~``.

    Display-only helper. It works on the string form of ``path`` rather than
    on path components, which means a trailing separator on the input is kept
    in the output. The home directory is whatever ``os.path.expanduser("~")``
    resolves to; if that resolution yields nothing usable (empty, or an
    unexpanded ``"~"``), or if ``path`` is not under the home directory, the
    string form of ``path`` is returned unchanged.
    """
    rendered = str(path)
    home_dir = os.path.expanduser("~")
    if home_dir in ("", "~"):
        return rendered
    if rendered == home_dir:
        return "~"
    # Match on "home + separator" so that a sibling such as /home/userX is not
    # mistaken for something inside /home/user.
    anchor = home_dir if home_dir.endswith(os.sep) else home_dir + os.sep
    if not rendered.startswith(anchor):
        return rendered
    return "~" + os.sep + rendered[len(anchor):]
120
+
121
+
122
def progress(
    iterable: Iterable[Any],
    *,
    desc: str,
    show_progress: bool = True,
    unit: str = " lines",
    total: int | None = None,
    stream: Any = None,
) -> Iterator[Any]:
    """Wrap ``iterable`` in a TTY-aware ``tqdm`` counter for loader read loops.

    Gate: the bar is shown only when ``show_progress`` is true AND the target
    stream (``stream``, defaulting to ``sys.stderr``) is a terminal. Color
    policy (``NO_COLOR`` / ``TERM=dumb``) is deliberately not consulted. When
    the gate is closed, ``iter(iterable)`` is returned and ``tqdm`` is never
    constructed.

    When the gate is open, the bar is built WITHOUT an iterable and advanced
    by hand from a generator. Design note carried over from the original: some
    parsers sniff a prefix and then resume, calling ``iter()`` on the returned
    object a second time. ``iter(gen) is gen`` for a generator, so the same
    counter carries on; a tqdm object iterated twice would lose its count
    (the "loaded X: 0.00 lines" bug).

    ``bar_format`` is pinned so the bar reads ``<desc>: <count><unit>
    [<elapsed>]``; ``unit`` is a parameter so callers that do not count lines
    can label what they actually count. The bar is closed when the generator
    finishes, is closed, or is torn down by an exception.
    """
    target = sys.stderr if stream is None else stream
    # Short-circuit keeps the tty probe from running when the caller has
    # already opted out of progress.
    if not (show_progress and _stream_isatty(target)):
        return iter(iterable)

    bar = tqdm(
        desc=desc,
        unit=unit,
        unit_scale=True,
        leave=True,
        mininterval=0.5,
        total=total,
        file=target,
        bar_format=f"{{desc}}: {{n_fmt}}{unit} [{{elapsed}}]",
    )

    def _counting() -> Iterator[Any]:
        # Tick before handing the item over, so the count reflects items read
        # even if the consumer stops early.
        try:
            for element in iterable:
                bar.update(1)
                yield element
        finally:
            bar.close()

    return _counting()
176
+
177
+
178
class _LivenessHandle:
    """State object yielded by the ``liveness`` context manager.

    It holds the spinner thread, the stderr stream captured at construction,
    and the flags that let ``seal()`` and teardown know whether the spinner
    ever actually drew, so a clearing sequence is only written when there is
    something to clear.
    """

    def __init__(self, label: str, delay: float) -> None:
        self._label = label
        self._delay = delay
        # The stream is captured once, here. A test that swaps sys.stderr
        # before construction is therefore honored for the whole lifecycle,
        # and a later redirect of sys.stderr cannot divert our writes.
        self._stream = sys.stderr
        self._isatty = _stream_isatty(self._stream)
        self._stop = threading.Event()
        self._thread: threading.Thread | None = None
        # Set by the spinner thread just before its first frame is written.
        # seal() and teardown erase the line only when this is True, so a
        # phase that seals before any frame was drawn emits the sealed line
        # alone, with no "\r" flicker.
        self._drew = False
        self._sealed = False
        # Serializes stream writes between the spinner thread and the
        # seal/teardown callers.
        self._lock = threading.Lock()

    # ── lifecycle ───────────────────────────────────────────────────────────

    def _start(self) -> None:
        """Launch the spinner thread; does nothing when stderr is not a tty."""
        if self._isatty:
            self._thread = threading.Thread(target=self._spin, daemon=True)
            self._thread.start()

    def _spin(self) -> None:
        """Spinner thread body: wait out the delay, then draw frames until stopped."""
        # Seal-before-delay invariant: if the stop event fires during the
        # initial delay, leave without having written anything.
        if self._stop.wait(self._delay):
            return
        frames = _SPINNER_FRAMES
        tick = 0
        while not self._stop.is_set():
            with self._lock:
                # Re-check under the lock: a sealer may have set the event
                # between the loop test and lock acquisition.
                if self._stop.is_set():
                    return
                # Flag and first write happen in the same critical section so
                # they can never disagree.
                self._drew = True
                try:
                    self._stream.write(f"\r{frames[tick % len(frames)]} {self._label}")
                    self._stream.flush()
                except Exception:
                    return
                tick += 1
            if self._stop.wait(_SPINNER_INTERVAL_S):
                return

    def _clear_line(self) -> None:
        """Overwrite the spinner line with spaces and return the cursor to column 0."""
        # One frame character + one space + the label.
        blanks = " " * (2 + len(self._label))
        try:
            self._stream.write("\r" + blanks + "\r")
            self._stream.flush()
        except Exception:
            pass

    def _join_spinner(self) -> None:
        """Wait for the spinner thread to finish, if one was ever started."""
        worker = self._thread
        if worker is not None:
            worker.join()

    def _erase_if_drawn(self) -> None:
        """Clear the spinner line only when on a tty and a frame was drawn."""
        if self._isatty and self._drew:
            self._clear_line()

    def seal(self, text: str) -> None:
        """Stop the spinner and commit ``text`` as a permanent one-line record.

        The first call wins; later calls return without writing. If the
        spinner drew on a tty, its line is erased before the record is
        written; otherwise the record goes straight to the stream. Stream
        write errors are swallowed. The sealed record is the only stderr
        artifact that promises "this phase finished cleanly."
        """
        with self._lock:
            if self._sealed:
                return
            self._sealed = True
            self._stop.set()
        # Join outside the lock: the spinner thread needs the lock to exit.
        self._join_spinner()
        with self._lock:
            self._erase_if_drawn()
            try:
                self._stream.write(f"{text}\n")
                self._stream.flush()
            except Exception:
                pass

    def _teardown(self, had_exception: bool) -> None:
        """Shut the spinner down on context exit; never writes a record.

        Stops and joins the spinner thread. If ``seal()`` already ran there
        is nothing left to do; otherwise any partially drawn spinner line is
        erased. ``had_exception`` is informational only — the teardown is the
        same either way, and any exception propagates from the caller.
        """
        with self._lock:
            sealed_before = self._sealed
            self._stop.set()
        self._join_spinner()
        if sealed_before:
            # seal() has already erased the line and written its record.
            return
        with self._lock:
            self._erase_if_drawn()
293
+
294
+
295
@contextmanager
def liveness(label: str, delay: float = 0.0) -> Iterator[_LivenessHandle]:
    """Show an indeterminate spinner while an opaque blocking phase runs.

    Usage::

        with liveness("running beacon") as ln:
            findings = run_beacon(ctx)
            ln.seal(f"beacon: {len(findings)} findings")

    When stderr is a tty, a one-line spinner ``"<frame> <label>"`` appears
    after ``delay`` seconds, so quick phases never flicker. When stderr is
    not a tty nothing is animated. ``ln.seal(text)`` commits a permanent
    record line.

    Teardown always runs on the way out. If the body raises — including
    KeyboardInterrupt from Ctrl-C — the spinner line is cleared, no sealed
    record is written, and the exception propagates, so a top-level Ctrl-C
    handler can report the stop without a false-success seal landing first.
    """
    handle = _LivenessHandle(label, delay)
    handle._start()
    # Assume failure until the body hands control back normally.
    body_raised = True
    try:
        yield handle
        body_raised = False
    finally:
        handle._teardown(had_exception=body_raised)
@@ -0,0 +1,45 @@
1
+ """Shared exception types used across runner, exporters, and CLI.
2
+
3
+ ExportAborted lives here (not under exporters/) so that runner.py can raise it
4
+ without creating a runner → exporter dependency. The CLI catches it once and
5
+ translates to a clean exit 0.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+
11
class ExportAborted(Exception):
    """Signal that the operator said "no" to an advisory confirmation prompt.

    Per the original notes, it is raised by the runner's large-dataset prompt
    and by exporter cost prompts (for example CloudTrail's S3 egress guard),
    and ``cli.main()`` catches it and turns it into a clean exit 0 carrying
    the message. It derives from ``Exception`` directly rather than from
    ``ValueError`` so it stays separate from the user-facing error path.
    """
19
+
20
+
21
class DigestEmpty(Exception):
    """Raised by run_digest when a RECOGNIZED schema loads to an empty frame.

    This is a control signal, not an error: the file was understood, it just
    held no parseable records (e.g. a Zeek conn.log with header rows but zero
    data rows). Callers catch it and narrate the outcome without rendering a
    card.

    Deliberately NOT a subclass of ValueError: handlers for real per-path
    failures (corrupt gzip, parser errors) must not swallow this successful
    "understood, but nothing to render" outcome.

    Attributes:
        basename: filename when the digest source was a file; directory name
            when the source was a configured directory. The narration
            "recognized <basename> as <schema> but no parseable records"
            reads correctly in both cases.
        schema: name of the recognized schema.
    """

    def __init__(self, basename: str, schema: str) -> None:
        super().__init__(
            f"recognized {basename} as {schema} but no parseable records"
        )
        self.basename = basename
        self.schema = schema

    def __reduce__(self):
        """Rebuild from ``(basename, schema)`` when pickled or copied.

        The default exception reduction replays ``self.args``, which here is
        the single formatted message; that would call ``DigestEmpty(message)``
        and fail with ``TypeError`` because ``schema`` is missing. Supplying
        the real constructor arguments keeps the exception usable across
        ``pickle`` and ``copy``; the instance ``__dict__`` is passed as state
        so any extra attributes survive too.
        """
        return (type(self), (self.basename, self.schema), dict(self.__dict__))
@@ -0,0 +1,239 @@
1
+ """Finding dataclass and Severity enum — the shared contract between detectors and outputs.
2
+
3
+ Detectors produce list[Finding]. Output handlers consume list[Finding].
4
+ Nothing else crosses that boundary.
5
+
6
+ DigestCard and DigestSlot are peer types to Finding/RunSummary — used by the
7
+ digest verb to render an orient-before-the-hunt card. They carry no severity,
8
+ no evidence, no next_steps; digest never produces a verdict.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import copy
14
+ import enum
15
+ from dataclasses import dataclass, field
16
+ from datetime import datetime, timedelta
17
+ from typing import TYPE_CHECKING, Any
18
+
19
+ if TYPE_CHECKING:
20
+ import pandas as pd
21
+
22
+ from loghunter.common.allowlist import AllowlistMatcher
23
+
24
+
25
class Severity(enum.Enum):
    """Detection severity levels, rendered as bracketed tags in text output.

    Ordering is by urgency, most severe first: ``HIGH < MEDIUM < LOW < INFO``.
    Sorting a list of severities therefore puts HIGH at the front.
    """

    HIGH = "H"
    MEDIUM = "M"
    LOW = "L"
    INFO = "I"

    def __str__(self) -> str:
        """Return the bracketed tag, e.g. ``"[H]"``."""
        return f"[{self.value}]"

    def __lt__(self, other: object) -> bool:
        """Return True when ``self`` is more severe than ``other``.

        Comparing against anything that is not a ``Severity`` returns
        ``NotImplemented``, so Python raises its standard ``TypeError`` rather
        than the opaque ``ValueError`` a failed list lookup would produce.
        """
        if not isinstance(other, Severity):
            return NotImplemented
        _order = (Severity.HIGH, Severity.MEDIUM, Severity.LOW, Severity.INFO)
        return _order.index(self) < _order.index(other)
39
+
40
+
41
@dataclass
class Finding:
    """One detection result emitted by a detector.

    Detectors hand back ``list[Finding]`` and output handlers render them.
    ``evidence`` has no fixed schema — its keys are chosen by each detector.
    Per the original note, ``description`` and ``next_steps`` appear only in
    verbose output.
    """

    detector: str                           # name of the producing detector
    severity: Severity
    title: str
    description: str
    evidence: dict[str, Any]                # detector-specific payload
    next_steps: list[str]
    ts_generated: datetime                  # when the finding was produced
    data_window: tuple[datetime, datetime]  # pair of datetimes bounding the data
58
+
59
+
60
@dataclass(frozen=True)
class MethodTag:
    """Immutable per-detector method label shown in the run-summary banner.

    ``named=True`` means a published algorithm (FFT, drain3, fast-HDBSCAN):
    it is rendered in parentheses and painted by the text handler.
    ``named=False`` means an honest house badge (pattern, heuristics,
    statistical): it is rendered in plain brackets and never painted. The
    parentheses-versus-brackets distinction carries all of the meaning;
    color is only an enhancement.
    """

    label: str
    named: bool
73
+
74
+
75
@dataclass
class RunSummary:
    """Run-level metadata: printed before analysis starts and handed to output handlers."""

    data_window: tuple[datetime, datetime]
    record_counts: dict[str, int]
    data_size_bytes: int
    detectors_run: list[str]
    detectors_skipped: dict[str, str]  # name → reason
    notes: list[str] = field(default_factory=list)
    data_sources: list[str] = field(default_factory=list)
    detector_methods: dict[str, "MethodTag | None"] = field(default_factory=dict)
    # What the operator asked for: the default-window spec, an explicit
    # since/until span, or since→now. The text handler uses it for the
    # data-found underfill parenthetical. None means unconstrained
    # (--all / until-only / bounded full-load).
    requested_span: timedelta | None = None
91
+
92
+
93
@dataclass
class DigestSlot:
    """A single row of a DigestCard's fields block.

    A slot is in one of two states:

    - SPEAKING — ``cells`` (column strings already formatted for display) and
      the raw ``entity`` / ``magnitude`` / ``ratio`` values are filled in
      together. Both come from the same source numbers; keeping the raw
      values lets insight selection rank by salience without parsing
      something like "3.7x" back out of a display string.

    - NON-SPEAKING — the slot applies but has nothing notable, or the feed
      cannot compute it. All four value fields are None. The summariser
      filters these out before the card gets its fields, so the renderer
      never sees them.
    """

    label: str      # "conn-share", "fan-out", ...
    statistic: str  # "cliff" | "tail" | "rate" | "share" | "dist"
    cells: list[str] | None = None
    entity: str | None = None
    magnitude: float | None = None
    ratio: float | None = None
116
+
117
+
118
@dataclass
class DigestCard:
    """Rendered body of a digest for one schema. A peer of Finding, not a subclass.

    Holds the spine-derived ambient facts (window, record count, histogram,
    bytes) together with three summariser-derived pieces: the
    ``zone1_extras`` ambient block, ``insights`` (prose sentences derived
    mechanically from speaking gated slots), and ``fields`` (the speaking,
    non-insight slots, already filtered and ready to display). All selection
    is done by the summariser; the renderer stays dumb.

    ``data_window`` can be ``(None, None)`` when the loaded frame's
    timestamps are unparseable below the confidence floor. In that case the
    renderer shows identity line 2 as a bare ``—`` and the histogram line as
    ``(timeline unavailable)``.

    ``timeline_unavailable`` is the explicit marker for that histogram-
    suppression path. It is needed because an empty ``histogram_counts`` can
    also mean "no events in window" (the valid empty-frame case), and a
    renderer that looked only at the counts could not tell the two apart.
    """

    schema: str
    source_name: str  # identity-line-1; basename of source file or dir
    data_window: tuple[datetime | None, datetime | None]
    record_count: int
    histogram_counts: list[int]
    histogram_unit: str  # "hr" | "day"
    histogram_peak: int
    zone1_extras: list[tuple[str, str]] = field(default_factory=list)
    insights: list[str] = field(default_factory=list)
    fields: list[DigestSlot] = field(default_factory=list)
    data_size_bytes: int = 0
    timeline_unavailable: bool = False
152
+
153
+
154
@dataclass
class BlobCard:
    """Panel describing an unrecognized source on the blob digest path.

    A peer of DigestCard, not a subclass. By design it carries only what is
    needed to describe bytes as bytes: no slots, no histogram, no
    data_window.

    The blob path never extracts fields, reads no timestamp, and assumes no
    schema. When set, ``line_length_shape`` is exactly ``"uniform"`` or
    ``"varied"``. The character-class fractions are measured over the RAW
    sampled BYTES, before any UTF-8 decode, so binary-looking input shows up
    as a low ``printable_pct`` instead of being hidden by
    ``errors="replace"``.

    Vanish-don't-dash: an optional field stays None when its slot does not
    apply (binary terminal magic, drain3 dormant, freeform-no-structure
    template floor), and the renderer then drops that row entirely rather
    than printing "-" or a placeholder. The required fields are
    sample-derived facts that exist for any input the profiler can read.

    O(sample) rail: everything is computed from the bounded sample. The one
    whole-file fact is ``byte_size`` (a stat). ``sampled_line_count`` counts
    lines in the sample and is NEVER a whole-file total.
    """

    # Required — sample-derived facts available for every readable input.
    source_name: str        # identity-line-1; basename of the source path
    byte_size: int          # on-disk size from stat (compressed for .gz)
    sampled_line_count: int
    sample_read_count: int  # 1 head + K seeks (plain); 1 (compressed head-only)
    is_compressed: bool
    printable_pct: float
    nonprintable_pct: float
    utf8_clean: bool        # strict-decode probe over the sample

    # Identification — exactly one of file_type_guess or shape_guess is set.
    file_type_guess: str | None = None    # terminal magic label (e.g. "PNG image")
    file_type_magic: bytes | None = None  # matched magic bytes for the Bytes row repr
    shape_guess: str | None = None        # text-shape cascade result

    # Text slots — all None on a binary terminal hit.
    mean_line_length: float | None = None
    median_line_length: float | None = None
    line_length_p95: int | None = None
    max_line_length: int | None = None
    line_length_stdev: float | None = None
    line_length_shape: str | None = None  # "uniform" | "varied" | None

    top_tokens: list[tuple[str, int]] | None = None

    # Only for the JSON shape guess: the first-seen union of top-level object
    # keys across the sample. None for binary, for non-JSON text, and for
    # JSON whose top level is an array or scalar (there are no object keys to
    # list). When set, the renderer emits a `fields:` row in place of
    # `tokens:` — names without values, describing the structure one rung
    # deeper than the `shape: JSON` label.
    json_field_names: list[str] | None = None

    # Templates — None on binary / freeform-no-structure / drain3 dormant.
    distinct_templates: int | None = None
    top_template_coverage_pct: float | None = None
    top_template_n: int | None = None
    singleton_template_count: int | None = None
216
+
217
@dataclass
class DetectorContext:
    """The complete input bundle for one detector run.

    The framework builds this object and passes it to each detector's
    ``run()`` function. Detectors never open files, read config, or format
    output themselves.

    ``home_net`` is run/environment metadata — the operator-declared internal
    networks used to classify traffic direction. It sits alongside
    ``data_window`` and ``data_sources`` and is not a detector tuning knob.
    An empty list means "not supplied"; a detector that needs direction
    classification may apply a sensible fallback in that case.

    There is deliberately no verbosity field: the result set is
    verbosity-invariant by construction (W6), and level-aware filtering is
    done by the text handler, not inside detector ``run()``.
    """

    logs: dict[str, "pd.DataFrame"]
    config: dict[str, Any]
    allowlist: "AllowlistMatcher"
    data_window: tuple[datetime, datetime]
    data_sources: list[str] = field(default_factory=list)
    home_net: list[str] = field(default_factory=list)