loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,1422 @@
1
+ """Text output handler — default stdout format.
2
+
3
+ Output is grouped by detector, each section with a header and ───── separator.
4
+ Default output: title, severity tag, key evidence fields only.
5
+ Verbose adds: description, full evidence dict, next_steps, data window.
6
+ next_steps are never shown in default output.
7
+
8
+ Looks crafted, not generated. Minimal ASCII decoration.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import sys
14
+ import textwrap
15
+ from collections import defaultdict
16
+ from dataclasses import dataclass, field
17
+ from datetime import timedelta
18
+ from typing import Any, TextIO
19
+
20
+ from loghunter.common.display import (
21
+ TEXT_RULE,
22
+ TEXT_RULE_DOUBLE,
23
+ TEXT_RULE_WIDTH,
24
+ human_bytes,
25
+ paint,
26
+ )
27
+ from loghunter.common.finding import (
28
+ BlobCard,
29
+ DigestCard,
30
+ Finding,
31
+ MethodTag,
32
+ RunSummary,
33
+ Severity,
34
+ )
35
+ from loghunter.common.output import OutputHandler, register_handler
36
+
37
# Short private aliases for the shared display constants.
_WIDTH = TEXT_RULE_WIDTH
_SEP = TEXT_RULE
_SEP_DOUBLE = TEXT_RULE_DOUBLE

# Width of the left-justified label column used by ``_summary_line``.
_SUMMARY_LABEL_WIDTH = 14

# Minimum (requested_span − data_span) before the data-found line discloses
# an underfilled window. Below this the operator effectively got what they
# asked for.
_UNDERFILL_TOLERANCE = timedelta(hours=1)
45
+
46
+
47
+ def _fmt_window(window: tuple) -> str:
48
+ s, e = window
49
+ span = e - s
50
+ days = span.days + span.seconds / 86400
51
+ return (
52
+ f"{s.strftime('%Y-%m-%d %H:%M')} → {e.strftime('%Y-%m-%d %H:%M')}"
53
+ f" ({days:.1f}d)"
54
+ )
55
+
56
+
57
+ def _fmt_span(td: timedelta) -> str:
58
+ """Compact span for the data-found underfill parenthetical.
59
+
60
+ ``< 24h`` → integer hours (``"18h"``); ``>= 24h`` → days, integer when whole
61
+ else one decimal (``"2d"``, ``"1.5d"``). Rounding never crosses a unit
62
+ surprisingly: an hours value that rounds up to a full day prints ``"1d"``,
63
+ not ``"24h"``.
64
+ """
65
+ hours = td.total_seconds() / 3600
66
+ if hours < 24:
67
+ rounded = int(round(hours))
68
+ if rounded < 24:
69
+ return f"{rounded}h"
70
+ # rounded up to a full day — promote the unit rather than print "24h"
71
+ return "1d"
72
+ days = td.total_seconds() / 86400
73
+ if abs(days - round(days)) < 1e-9:
74
+ return f"{int(round(days))}d"
75
+ return f"{days:.1f}d"
76
+
77
+
78
+ def _aws_span_str(seconds: float) -> str:
79
+ """Compact span used by burst rows: 45s / 7m / 3h / 2d."""
80
+ s = int(seconds)
81
+ if s < 60:
82
+ return f"{s}s"
83
+ if s < 3600:
84
+ return f"{s // 60}m"
85
+ if s < 86400:
86
+ return f"{s // 3600}h"
87
+ return f"{s // 86400}d"
88
+
89
+
90
+ # ── digest helpers (used by TextHandler.render_digest) ───────────────────────
91
+
92
+ _HIST_GLYPHS = "▁▂▃▄▅▆▇█" # U+2581..U+2588
93
+
94
+
95
+ def _bar_glyph(value: int, peak: int) -> str:
96
+ """Map a per-bin count to one of 8 block-character glyphs.
97
+
98
+ Zero and below render as the lowest glyph (▁) — visual continuity beats
99
+ a blank space when the histogram band is meant to be read as a line.
100
+ Values at or above peak render as the highest glyph (█).
101
+ """
102
+ if peak <= 0 or value <= 0:
103
+ return _HIST_GLYPHS[0]
104
+ if value >= peak:
105
+ return _HIST_GLYPHS[-1]
106
+ idx = int((value / peak) * (len(_HIST_GLYPHS) - 1))
107
+ return _HIST_GLYPHS[max(0, min(len(_HIST_GLYPHS) - 1, idx))]
108
+
109
+
110
+ def _format_count(n: int) -> str:
111
+ """Compact-number formatter for histogram peak anchors."""
112
+ if n < 1000:
113
+ return str(n)
114
+ if n < 1_000_000:
115
+ return f"{n / 1000:.1f}k"
116
+ if n < 1_000_000_000:
117
+ return f"{n / 1_000_000:.1f}M"
118
+ return f"{n / 1_000_000_000:.1f}B"
119
+
120
+
121
+ def _render_histogram(
122
+ counts: list[int], unit: str, peak: int, *, unavailable: bool = False,
123
+ ) -> str:
124
+ """Render the temporal histogram as a single-line band, flush-left.
125
+
126
+ The line carries BOTH an axis unit label ("hourly bins" / "daily bins")
127
+ AND a scale anchor ("peak: N"). Without both, a busy-flat and a
128
+ quiet-flat timeline render identically — the unit names the bar width
129
+ and the anchor names the bar height.
130
+
131
+ Three rendering branches, in precedence order:
132
+
133
+ 1. ``unavailable=True`` → bare ``(timeline unavailable)`` — the caller
134
+ suppressed the histogram because timestamps in the source frame
135
+ could not be parsed with confidence. Distinct from "no events".
136
+ Both former failure modes (low-coverage / zero-span) render here
137
+ identically; the flat card has no footer to differentiate them.
138
+ 2. ``peak <= 0`` or empty ``counts`` → ``(no events in window)`` — the
139
+ valid empty-timeline case: no records in the loaded window.
140
+ 3. Otherwise → the bar render with unit label + peak anchor.
141
+ """
142
+ if unavailable:
143
+ return "(timeline unavailable)"
144
+ if peak <= 0 or not counts:
145
+ return "(no events in window)"
146
+ bars = "".join(_bar_glyph(c, peak) for c in counts)
147
+ unit_label = "hourly bins" if unit == "hr" else "daily bins"
148
+ return f"{bars} {unit_label} · peak: {_format_count(peak)}"
149
+
150
+
151
+ def _render_label_value_block(rows: list[tuple[str, str]]) -> list[str]:
152
+ """Flush-left ``label: value`` block with the value column aligned.
153
+
154
+ Shared by the ambient block (Zone 1) and the fields block (former
155
+ Zone 3) on every digest card. Label width is computed from the rows
156
+ in this block only — no cross-block alignment. The labels are
157
+ flush-left at column 0; alignment is in the value column.
158
+
159
+ Long entities (flows, domains) render in FULL — never truncated. The
160
+ text-output rail forbids truncating naturally-long values on schema
161
+ cards. Blob's wide-list slots use a separate blob-local clamp; see
162
+ ``_wrap_blob_slot_value`` below.
163
+ """
164
+ if not rows:
165
+ return []
166
+ label_w = max(len(label) for label, _ in rows)
167
+ return [
168
+ f"{(label + ':').ljust(label_w + 2)}{value}"
169
+ for label, value in rows
170
+ ]
171
+
172
+
173
+ # ── Blob-only: two-line clamp for the wide-list slot (`fields:` / `tokens:`)
174
+ #
175
+ # Blob's `fields:` row carries a top-level-keys list that on a Zeek conn
176
+ # log can easily run 20+ names; the `tokens:` row defensively shares the
177
+ # same clamp so a degenerate token row cannot blow past the 80-col frame.
178
+ # Schema cards keep rendering through ``_render_label_value_block`` and
179
+ # are exempt from this clamp — long entities like flows/domains must
180
+ # render in full per the text-output rail.
181
+
182
def _wrap_blob_slot_value(
    value: str, *, label_col: int, sep: str,
) -> list[str]:
    """Clamp blob's wide-list slot value to at most two lines.

    ``label_col`` is the column where the value starts. Line 1 is returned
    without indent because the caller's label fills those columns. Line 2 is
    hang-indented by ``label_col`` spaces. The value is split only on
    ``sep``, so an individual part is never broken.

    A value that fits the available width (``_WIDTH - label_col``) comes back
    as a single line. When the parts do not all fit on two lines, line 2 is
    shortened until the ``… +N more`` suffix fits, where N is the number of
    parts not rendered. A single over-wide part is kept whole even though it
    overflows the frame.
    """
    room = _WIDTH - label_col
    pieces = value.split(sep)

    # Single-line short-circuit.
    if len(value) <= room:
        return [value]

    def take_greedy(pool: list[str]) -> list[str]:
        """Longest prefix of ``pool`` that fits in ``room``.

        The first part is always taken, even if it is too wide on its own.
        """
        taken: list[str] = []
        used = 0
        for piece in pool:
            cost = len(piece) if not taken else len(sep) + len(piece)
            if taken and used + cost > room:
                break
            taken.append(piece)
            used += cost
        return taken

    first = take_greedy(pieces)
    line1 = sep.join(first)

    leftover = pieces[len(first):]
    second = take_greedy(leftover)
    hang = " " * label_col
    suffix_template = f"{sep}… +{{n}} more"

    if len(second) < len(leftover):
        # Not everything fits on line 2. Drop trailing parts until the suffix
        # fits too; the suffix text depends on how many parts are hidden, so
        # it is rebuilt after each drop. A lone part is kept even if the line
        # still overflows.
        while len(second) > 1:
            tail = suffix_template.format(n=len(leftover) - len(second))
            if len(sep.join(second)) + len(tail) <= room:
                break
            second.pop()
        hidden = len(leftover) - len(second)
        line2 = hang + sep.join(second) + suffix_template.format(n=hidden)
    elif second:
        line2 = hang + sep.join(second)
    else:
        line2 = ""

    return [line1, line2] if line2 else [line1]
275
+
276
+
277
def _summary_line(label: str, value: object) -> list[str]:
    """Render one run-summary row, wrapping long values.

    The label is left-justified to ``_SUMMARY_LABEL_WIDTH`` and followed by
    one space. Continuation lines are indented by the prefix width so the
    wrapped text lines up under the first line's value. Words and hyphenated
    tokens are never split. The wrap width never drops below 20 columns. An
    empty value still yields one line holding only the prefix.
    """
    head = f"{label:<{_SUMMARY_LABEL_WIDTH}} "
    hang = " " * len(head)
    chunks = textwrap.wrap(
        str(value),
        width=max(20, _WIDTH - len(head)),
        break_long_words=False,
        break_on_hyphens=False,
    ) or [""]

    lines = [f"{head}{chunks[0]}"]
    lines.extend(f"{hang}{chunk}" for chunk in chunks[1:])
    return lines
295
+
296
+
297
+ # ── W2 card pipeline — structured findings before row formatting ────────────
298
+
299
+
300
@dataclass
class Section:
    """One subsection of a detector's findings, ready for rendering.

    By the time a renderer sees a Section its ``findings`` have been through
    the level filter, the severity sort and the cap. Per-detector row
    formatters must not filter, sort or cap again.
    """

    # Subsection label; None marks a flat detector (no subsection line).
    label: str | None
    # Findings to render, post-cap.
    findings: list[Finding]
    # Level-visible size of this section BEFORE the cap. The subsection
    # label always reports this count, not len(findings).
    pre_cap_count: int
314
+
315
+
316
@dataclass
class DetectorRenderable:
    """Result of the per-detector pipeline, produced by ``_build_renderable``.

    The totals and the severity breakdown are captured before the cap and
    carried alongside the sections, so the group header never has to re-read
    severity from the post-cap ``Section.findings``.
    """

    # Sections in render order; findings inside are post-cap.
    sections: list[Section]
    # Number of level-visible findings before the cap.
    level_visible_total: int
    # Per-severity counts of the level-visible findings before the cap.
    severity_breakdown: dict[Severity, int]
    # How many findings the cap removed (0 when the cap did not fire).
    cap_truncated: int = 0
327
+
328
+
329
# Display / sort order for severities, most urgent first.
_SEVERITY_ORDER: tuple[Severity, ...] = (
    Severity.HIGH,
    Severity.MEDIUM,
    Severity.LOW,
    Severity.INFO,
)


def _severity_sort_key(f: Finding) -> int:
    """Sort key placing findings in ``_SEVERITY_ORDER`` (HIGH=0 … INFO=3).

    Python's sort is stable, so findings that share a severity keep whatever
    relative order the detector emitted them in.
    """
    rank = _SEVERITY_ORDER.index(f.severity)
    return rank
342
+
343
+
344
+ def _partition_dns(findings: list[Finding]) -> list[Section]:
345
+ """DNS: singletons FIRST (no subdomain_count), then groups (Dave's call —
346
+ the singletons tier is consistently the more interesting one). Each
347
+ speaks-iff-non-empty: an empty subsection vanishes entirely."""
348
+ singletons = [f for f in findings if "subdomain_count" not in f.evidence]
349
+ groups = [f for f in findings if "subdomain_count" in f.evidence]
350
+ out: list[Section] = []
351
+ if singletons:
352
+ out.append(Section("singletons", singletons, len(singletons)))
353
+ if groups:
354
+ out.append(Section("groups", groups, len(groups)))
355
+ return out
356
+
357
+
358
+ def _partition_aws(findings: list[Finding]) -> list[Section]:
359
+ """AWS: bursts first, then ranked (+ synthetic ranked_summary). The ranked
360
+ section bundles per-principal and the summary line together."""
361
+ bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
362
+ ranked = [f for f in findings if f.evidence.get("tier") in ("ranked", "ranked_summary")]
363
+ out: list[Section] = []
364
+ if bursts:
365
+ out.append(Section("burst sweeps", bursts, len(bursts)))
366
+ if ranked:
367
+ out.append(Section("ranked principals", ranked, len(ranked)))
368
+ return out
369
+
370
+
371
def _partition_flat(findings: list[Finding]) -> list[Section]:
    """Default partitioner: wrap every finding in one unlabeled section."""
    only = Section(label=None, findings=findings, pre_cap_count=len(findings))
    return [only]
374
+
375
+
376
# Detector name → section partitioner. Detectors without an entry fall back
# to ``_partition_flat`` at the lookup site.
_PARTITIONERS = {
    "dns": _partition_dns,
    "aws": _partition_aws,
}
380
+
381
+ # Per-detector severity-sort opt-out (CR #2 from James). Severity sort is the
382
+ # right DEFAULT — within a flat or per-section list, H → M → L → I reads as
383
+ # urgency-first. But syslog's row order CARRIES meaning: the detector emits
384
+ # chronologically so a synthetic reboot INFO annotation sits AMONG the rare
385
+ # MEDIUM template events near it (the "these rare events cluster around this
386
+ # reboot" narrative). Severity-sorting regroups all-MEDIUM-then-all-INFO and
387
+ # divorces each reboot from its context. Detectors listed here keep their
388
+ # incoming order.
389
+ _SEVERITY_SORT_EXEMPT: frozenset[str] = frozenset({"syslog"})
390
+
391
+ # Synthetic always-show finding tiers (CR #4 from James). These are
392
+ # all-clear / quiet-summary rows the detector designed to render
393
+ # unconditionally. They are exempt from the W5 cap budget — they neither
394
+ # count against the budget nor get dropped when the budget runs out. Today
395
+ # the only entry is aws's ``ranked_summary`` (the "nothing stood out" line).
396
+ # New synthetic all-show tiers join this set; the renderer is otherwise
397
+ # unchanged.
398
+ _ALWAYS_SHOW_TIERS: frozenset[str] = frozenset({"ranked_summary"})
399
+
400
+
401
+ def _is_always_show(finding: Finding) -> bool:
402
+ """True for synthetic always-show findings (CR #4). Exempt from the cap."""
403
+ return finding.evidence.get("tier") in _ALWAYS_SHOW_TIERS
404
+
405
+
406
+ def _level_filter(detector: str, findings: list[Finding], verbose_level: int) -> list[Finding]:
407
+ """W2 pipeline step 1 — the one finding-visibility-by-level rule.
408
+
409
+ Duration hides LOW findings at verbose_level 0 (W6 moved this off the
410
+ detector's run() and into the text-render seam). Every other detector is
411
+ a no-op. The result-set returned to machine handlers is invariant; only
412
+ the text handler applies this filter.
413
+ """
414
+ if detector == "duration" and verbose_level == 0:
415
+ return [f for f in findings if f.severity != Severity.LOW]
416
+ return findings
417
+
418
+
419
def _build_renderable(
    detector: str,
    findings: list[Finding],
    verbose_level: int,
    max_per_detector: int,
) -> DetectorRenderable:
    """Run the W2 pipeline on one detector's findings.

    Order is binding:
      1. level-filter (duration LOW at level 0)
      2. partition into Sections (detector-specific; flat by default)
      3. capture pre-cap ``level_visible_total`` + ``severity_breakdown``
      4. severity-sort each section (skipped for exempt detectors)
      5. cap walks sections in declared order and truncates findings;
         later sections may end up with ``findings=[]``

    Both ``level_visible_total`` and ``severity_breakdown`` are captured
    BEFORE the cap so the group header never drifts to post-cap counts.

    Args:
        detector: detector name; selects the partitioner and sort exemption.
        findings: this detector's findings. The list is NOT mutated: the
            sort builds a new list per section rather than sorting in place,
            because the flat partitioner hands the caller's own list to its
            Section.
        verbose_level: forwarded to the level filter.
        max_per_detector: cap on rendered (cappable) findings; a value
            ``<= 0`` disables the cap.

    Returns:
        A ``DetectorRenderable`` with post-cap sections and pre-cap sidecars.
    """
    level_visible = _level_filter(detector, findings, verbose_level)

    partition = _PARTITIONERS.get(detector, _partition_flat)
    sections = partition(level_visible)

    level_visible_total = len(level_visible)
    breakdown: dict[Severity, int] = {}
    for f in level_visible:
        breakdown[f.severity] = breakdown.get(f.severity, 0) + 1

    if detector not in _SEVERITY_SORT_EXEMPT:
        for s in sections:
            # sorted() is stable like list.sort() but leaves the source list
            # (possibly the caller's) in its original order.
            s.findings = sorted(s.findings, key=_severity_sort_key)

    # CR #4: synthetic always-show findings are exempt from the cap. Pull
    # them out per-section before the budget walk so they neither consume
    # the budget nor risk being dropped, then re-append them at the tail of
    # their section afterwards.
    always_show_by_section: list[list[Finding]] = []
    for s in sections:
        always = [f for f in s.findings if _is_always_show(f)]
        if always:
            s.findings = [f for f in s.findings if not _is_always_show(f)]
        always_show_by_section.append(always)

    cap_truncated = 0
    # Cap accounting runs against the cappable count only (always-show
    # findings live outside the budget).
    cappable_total = sum(len(s.findings) for s in sections)
    if max_per_detector > 0 and cappable_total > max_per_detector:
        remaining = max_per_detector
        for s in sections:
            if remaining <= 0:
                # Budget already spent by earlier sections.
                cap_truncated += len(s.findings)
                s.findings = []
                continue
            if len(s.findings) > remaining:
                cap_truncated += len(s.findings) - remaining
                s.findings = s.findings[:remaining]
                remaining = 0
            else:
                remaining -= len(s.findings)

    # Re-append the held-back always-show findings at the tail of their
    # section, so they stay visible even when the cap emptied the cappable
    # rows. Any section with held-back rows had ``findings`` rebound to a
    # fresh list above, so extend() cannot touch the caller's list.
    for s, always in zip(sections, always_show_by_section):
        if always:
            s.findings.extend(always)

    return DetectorRenderable(
        sections=sections,
        level_visible_total=level_visible_total,
        severity_breakdown=breakdown,
        cap_truncated=cap_truncated,
    )
498
+
499
+
500
+ def _verbose_tail(finding: Finding, indent: str, extras: dict[str, Any] | None = None) -> list[str]:
501
+ """Curated 'why it scored' — level 1. Returns [] when no material to show.
502
+
503
+ Vanish discipline: a Finding with empty description / next_steps and an
504
+ empty curated-evidence subset renders the title line ALONE — no empty
505
+ headers, no dangling indents, NO trailing ``data window:`` line. The
506
+ data-window line appears only when at least one other body element is
507
+ present.
508
+ """
509
+ body: list[str] = []
510
+ if finding.description:
511
+ body.append(f"{indent}{finding.description}")
512
+ if extras:
513
+ body.append(f"{indent}evidence:")
514
+ for k, v in extras.items():
515
+ body.append(f"{indent} {k}: {v}")
516
+ if finding.next_steps:
517
+ body.append(f"{indent}next steps:")
518
+ for step in finding.next_steps:
519
+ body.append(f"{indent} · {step}")
520
+ if not body:
521
+ return []
522
+ body.append(f"{indent}data window: {_fmt_window(finding.data_window)}")
523
+ return body
524
+
525
+
526
+ def _debug_tail(finding: Finding, indent: str) -> list[str]:
527
+ """Raw debug — level 2. Full evidence dict. Same vanish discipline as
528
+ ``_verbose_tail``: empty description / evidence / next_steps → ``[]``."""
529
+ body: list[str] = []
530
+ if finding.description:
531
+ body.append(f"{indent}{finding.description}")
532
+ if finding.evidence:
533
+ body.append(f"{indent}evidence:")
534
+ for k, v in finding.evidence.items():
535
+ body.append(f"{indent} {k}: {v}")
536
+ if finding.next_steps:
537
+ body.append(f"{indent}next steps:")
538
+ for step in finding.next_steps:
539
+ body.append(f"{indent} · {step}")
540
+ if not body:
541
+ return []
542
+ body.append(f"{indent}data window: {_fmt_window(finding.data_window)}")
543
+ return body
544
+
545
+
546
+ # Per-detector curated-evidence subsets for level 1 — tolerant: omit absent
547
+ # keys rather than printing ``None``. Per-variant lookup uses existing
548
+ # evidence keys (scan's scan_type, dns's source, aws's tier, syslog by
549
+ # template-vs-reboot shape).
550
+ def _curated_evidence(finding: Finding) -> dict[str, Any]:
551
+ """Return ONLY the keys present on this Finding from the curated set for
552
+ its detector (and variant where applicable)."""
553
+ ev = finding.evidence
554
+ keys: tuple[str, ...] = ()
555
+ det = finding.detector
556
+
557
+ if det == "beacon":
558
+ keys = (
559
+ "beacon_score", "spectral_ratio", "prominence_norm",
560
+ "jitter_cv", "conn_count", "period_str",
561
+ )
562
+ elif det == "dns":
563
+ src = ev.get("source")
564
+ if "subdomain_count" in ev: # group
565
+ base = ("sample_domains", "unique_sources", "min_entropy", "max_entropy")
566
+ extra = ("was_blocked", "block_ratio", "qtype_counts") if src == "pihole" else ()
567
+ keys = base + extra
568
+ elif src == "pihole": # pihole singleton
569
+ keys = (
570
+ "unique_sources", "querier_ips",
571
+ "was_blocked", "block_ratio",
572
+ "cache_ratio", "forward_ratio", "qtype_counts",
573
+ )
574
+ else: # zeek singleton (and both-mode Zeek with pihole enrichment)
575
+ base = ("rcode_distribution", "unique_sources", "querier_ips")
576
+ extra = ("was_blocked", "block_ratio") if "was_blocked" in ev else ()
577
+ keys = base + extra
578
+ elif det == "syslog":
579
+ if "template_str" in ev:
580
+ keys = ("template_str", "host", "count", "threshold")
581
+ else: # reboot annotation
582
+ keys = ("host", "reboot_ts", "suppressed_window_seconds")
583
+ elif det == "scan":
584
+ keys = ("scan_state_ratio", "top_states", "direction", "pattern_tag")
585
+ elif det == "duration":
586
+ keys = ("avg_bytes_per_second", "conn_states", "connection_count")
587
+ elif det == "aws":
588
+ tier = ev.get("tier")
589
+ if tier == "burst":
590
+ keys = ("new_actions", "new_services", "error_rate", "mean_rarity")
591
+ else: # ranked / ranked_summary
592
+ keys = (
593
+ "composite_z", "z_error_rate", "event_count",
594
+ "top_actions", "distinct_event_source",
595
+ )
596
+
597
+ return {k: ev[k] for k in keys if k in ev and not _is_empty(ev[k])}
598
+
599
+
600
+ def _is_empty(value: Any) -> bool:
601
+ """numpy-safe emptiness test for curated evidence values.
602
+
603
+ The naive ``value not in (None, [], {})`` idiom evaluates ``value == []``
604
+ which a numpy scalar broadcasts into an empty boolean array; ``bool()`` of
605
+ that array raises ``ValueError`` and propagates straight out through
606
+ ``reporter.write``. ``aws`` burst ``error_rate``, ``scan``
607
+ ``scan_state_ratio`` (a rounded pandas mean), and beacon's spectral
608
+ scores all reach this path as numpy scalars under real data —
609
+ ``loghunter aws -v`` / ``scan -v`` died on every run.
610
+
611
+ Guard explicitly on the container types we want to omit (None / empty
612
+ str/list/tuple/dict). Anything else — including numpy scalars and
613
+ arrays — is treated as "has content" for display purposes; the renderer
614
+ formats them via the default ``f"{value}"`` path downstream.
615
+ """
616
+ if value is None:
617
+ return True
618
+ if isinstance(value, (list, tuple, dict, str)) and len(value) == 0:
619
+ return True
620
+ return False
621
+
622
+
623
+ def _level_tail(finding: Finding, indent: str, verbose_level: int) -> list[str]:
624
+ """Dispatch to the right tail by level. Level 0 returns []; level 1 emits
625
+ the curated tail; level 2 emits the full debug tail. Both tails honor
626
+ vanish-don't-dash."""
627
+ if verbose_level <= 0:
628
+ return []
629
+ if verbose_level >= 2:
630
+ return _debug_tail(finding, indent)
631
+ return _verbose_tail(finding, indent, _curated_evidence(finding))
632
+
633
+
634
def _render_group_header(detector: str, renderable: DetectorRenderable) -> list[str]:
    """Build the two-line group header: a summary line plus the rule.

    The summary line reads ``detector — N finding(s)``, followed by
    `` · `` and one ``count severity`` cell per non-zero severity in
    ``_SEVERITY_ORDER`` when any exist.

    Counts are PRE-CAP: they come from the renderable's
    ``level_visible_total`` and ``severity_breakdown`` sidecars, never from
    the post-cap ``Section.findings``.
    """
    total = renderable.level_visible_total
    noun = "finding" if total == 1 else "findings"
    counts = renderable.severity_breakdown

    cells = [
        f"{counts[sev]} {sev.value}"
        for sev in _SEVERITY_ORDER
        if counts.get(sev, 0) > 0
    ]

    header = f"{detector} — {total} {noun}"
    if cells:
        header += " · " + " ".join(cells)
    return [header, TEXT_RULE]
652
+
653
+
654
+ def _render_cap_disclosure(detector: str, renderable: DetectorRenderable, cap: int) -> str:
655
+ """W5 disclosure: factual; deferred error-voice pass owns final wording.
656
+
657
+ Honesty rail (CR #3): the cap trims sections in DECLARED order, NOT
658
+ global severity. For a FLAT detector (one implicit section) the cap is
659
+ indeed by severity — sort-then-cap retains the highest tiers. For a
660
+ SUBSECTIONED detector (dns: singletons-first; aws: bursts-first) a
661
+ later section's HIGH row can be dropped while an earlier section's LOW
662
+ row survives the cap. So "by severity" is true only in the flat case.
663
+
664
+ Rather than spell out the cross-section non-guarantee in two arms, the
665
+ wording simply drops the severity claim. The hidden count and the cap
666
+ cap are what the operator needs to act on. Wording is placeholder-tier
667
+ pending the error-voice pass — the binding constraint is that we MUST
668
+ NOT claim severity-retention the cap doesn't provide.
669
+ """
670
+ hidden = renderable.cap_truncated
671
+ return (
672
+ f"… {hidden:,} more not shown (showing first {cap:,}). "
673
+ f"Unusually high — narrow with the allowlist, or this detector may be "
674
+ f"misbehaving."
675
+ )
676
+
677
+
678
+ class TextHandler(OutputHandler):
679
+ """Write findings as aligned plain text to stdout (or a given stream)."""
680
+
681
+ def __init__(
682
+ self,
683
+ stream: TextIO = sys.stdout,
684
+ verbose_level: int = 0,
685
+ max_findings_per_detector: int = 100,
686
+ ) -> None:
687
+ self._stream = stream
688
+ self._verbose_level = verbose_level
689
+ self._max_findings_per_detector = max_findings_per_detector
690
+
691
def begin(self, run_summary: RunSummary) -> None:
    """Emit a blank line followed by the run-summary banner.

    Called before any findings are written; the banner text comes from
    ``self._render_run_summary``.
    """
    out = self._stream
    print(file=out)
    print(self._render_run_summary(run_summary), file=out)
695
+
696
def write(self, findings: list[Finding]) -> None:
    """Print findings grouped by detector.

    Each detector's findings go through the W2 pipeline in
    ``_build_renderable`` (level-filter → partition → pre-cap stats →
    severity-sort → cap). A detector whose level-visible set is empty
    prints NOTHING — no header, no rule, no label (the vanish rule). The
    cap-disclosure line is printed only when the cap actually trimmed rows
    (``cap_truncated > 0``). An empty ``findings`` list prints nothing.
    """
    if not findings:
        return

    out = self._stream
    cap = self._max_findings_per_detector

    # Bucket per detector; dict insertion order keeps detectors in the
    # order they first appear in ``findings``.
    grouped: dict[str, list[Finding]] = {}
    for finding in findings:
        grouped.setdefault(finding.detector, []).append(finding)

    for detector, members in grouped.items():
        renderable = _build_renderable(detector, members, self._verbose_level, cap)
        if renderable.level_visible_total == 0:
            continue

        print(file=out)
        for header_line in _render_group_header(detector, renderable):
            print(header_line, file=out)
        for row in self._render_group(detector, renderable.sections):
            print(row, file=out)

        if renderable.cap_truncated > 0:
            print(file=out)
            print(_render_cap_disclosure(detector, renderable, cap), file=out)

    # NOTE(review): the source view lost its indentation; this closing blank
    # line is reconstructed as "once, after all detector groups" — confirm
    # against the original file.
    print(file=out)
730
+
731
def end(self) -> None:
    """Finish the run. Intentionally does nothing: text output on stdout
    (or the supplied stream) needs no closing step."""
    return None
733
+
734
def _render_run_summary(
    self,
    run_summary: RunSummary,
    banner: str = "LogHunter · Threat Hunt",
) -> str:
    """Render the detect-run banner as one newline-joined string.

    Layout: ``banner``, a ``_SEP_DOUBLE`` rule, the optional summary rows
    (Data found / Records / Detectors / Skipped / Note — each produced by
    ``_summary_line``), and a closing ``_SEP_DOUBLE`` rule. A row is emitted
    only when its source field on ``run_summary`` is populated.

    Digest no longer flows through this helper — each digest card carries
    its own identity block. This remains the single source of truth for the
    detect-path banner, and its output must stay byte-identical to its
    pre-flat-digest form on a normal detect run.
    """
    rows: list[str] = []

    window = run_summary.data_window
    if window[0] and window[1]:
        rows += _summary_line("Data found:", self._fmt_data_found(run_summary))

    counts = run_summary.record_counts
    if counts:
        joined = " · ".join(f"{total:,} {kind}" for kind, total in counts.items())
        rows += _summary_line("Records:", joined)

    if run_summary.detectors_run:
        rows += _summary_line("Detectors:", self._render_detectors_value(run_summary))

    # One "Skipped:" row per skipped detector; a falsy mapping yields none.
    for name, reason in (run_summary.detectors_skipped or {}).items():
        rows += _summary_line("Skipped:", f"{name} — {reason}")

    for note in run_summary.notes:
        rows += _summary_line("Note:", note)

    return "\n".join([banner, _SEP_DOUBLE, *rows, _SEP_DOUBLE])
777
+
778
@staticmethod
def _fmt_data_found(run_summary: RunSummary) -> str:
    """Format the value shown on the "Data found:" row.

    Full / disjoint runs delegate to ``_fmt_window`` UNCHANGED (the same
    helper feeds the digest card and verbose finding tails, so it must stay
    byte-identical). Only an underfilled window — a ``requested_span`` is
    set and the data span falls short of it by at least
    ``_UNDERFILL_TOLERANCE`` — gets the start/end stamps plus an
    informative parenthetical with both spans rendered via ``_fmt_span``.
    """
    window = run_summary.data_window
    start, end = window
    covered = end - start
    requested = run_summary.requested_span

    underfilled = (
        requested is not None
        and (requested - covered) >= _UNDERFILL_TOLERANCE
    )
    if not underfilled:
        return _fmt_window(window)

    stamp = "%Y-%m-%d %H:%M"
    return (
        f"{start.strftime(stamp)} → {end.strftime(stamp)}"
        f" ({_fmt_span(covered)} data span in {_fmt_span(requested)} window)"
    )
797
+
798
+ def _render_detectors_value(self, run_summary: RunSummary) -> str:
799
+ """Build the right-hand side of the Detectors: row.
800
+
801
+ Named methods (``MethodTag(named=True)``) render as ``name (label)``
802
+ with the label painted when ``self._stream`` is a real TTY (and
803
+ NO_COLOR / TERM=dumb don't opt out). Honest house badges
804
+ (``named=False``) render as ``name [label]`` plain. Detectors with
805
+ no ``DETECTOR_METHOD`` constant fall back to the bare name —
806
+ forward-compat for any future detector that ships without one.
807
+ Detectors joined by `` · ``.
808
+ """
809
+ parts: list[str] = []
810
+ for name in run_summary.detectors_run:
811
+ tag: "MethodTag | None" = run_summary.detector_methods.get(name)
812
+ if tag is None:
813
+ parts.append(name)
814
+ elif tag.named:
815
+ parts.append(f"{name} ({paint(tag.label, stream=self._stream)})")
816
+ else:
817
+ parts.append(f"{name} [{tag.label}]")
818
+ return " · ".join(parts)
819
+
820
+ def _render_group(self, detector: str, sections: list[Section]) -> list[str]:
821
+ """Render a detector's already-prepared sections (post level-filter,
822
+ sort, and cap). Per-detector renderers do pure row formatting only —
823
+ no filtering, sorting, counting, or capping leaks back here.
824
+ """
825
+ # Drop empty sections (cap may have emptied a later section — its label
826
+ # vanishes, no lonely label).
827
+ live = [s for s in sections if s.findings]
828
+ if not live:
829
+ return []
830
+ if detector == "beacon":
831
+ return self._render_beacon_group(live)
832
+ if detector == "dns":
833
+ return self._render_dns_group(live)
834
+ if detector == "scan":
835
+ return self._render_scan_group(live)
836
+ if detector == "syslog":
837
+ return self._render_syslog_group(live)
838
+ if detector == "duration":
839
+ return self._render_duration_group(live)
840
+ if detector == "aws":
841
+ return self._render_aws_group(live)
842
+ # Generic fallback — flat detector, one Section with label=None.
843
+ out: list[str] = []
844
+ for s in live:
845
+ for f in s.findings:
846
+ out.append(self._render_finding(f))
847
+ return out
848
+
849
def _render_beacon_group(self, sections: list[Section]) -> list[str]:
    """Render beacon findings as aligned rows.

    Beacon is a flat detector — one Section with label=None — so only
    ``sections[0]`` is read and per-section column widths equal
    per-detector widths. Source and destination are padded independently
    so the ``→`` arrows line up. Each returned entry is one row, with the
    verbosity tail (if any) attached on following lines.
    """
    tail_indent = " "

    records = []
    for finding in sections[0].findings:
        evidence = finding.evidence
        src = evidence.get("src_ip", "")
        endpoint = (
            f"{evidence.get('dst_ip', '')}:{evidence.get('dst_port', '')}"
            f"/{evidence.get('proto', '')}"
        )
        records.append((
            str(finding.severity),
            src,
            endpoint,
            f"period={evidence.get('period_str', '?')}",
            f"score={evidence.get('beacon_score', 0):.3f}",
            f"{evidence.get('conn_count', 0):,} conns",
            finding,
        ))

    def widest(index: int) -> int:
        return max(len(record[index]) for record in records)

    # Widths are fixed from the whole section before any row is formatted.
    # The score cell is deliberately left unpadded (the original author
    # notes it is always "score=0.XXX").
    src_w, dst_w, period_w, conns_w = widest(1), widest(2), widest(3), widest(5)

    rendered: list[str] = []
    for severity, src, endpoint, period, score, conns, finding in records:
        flow = f"{severity} {src:<{src_w}} → {endpoint:<{dst_w}}"
        stats = f"{period:<{period_w}} {score} {conns:>{conns_w}}"
        pieces = [f"{flow} {stats}"]
        pieces.extend(_level_tail(finding, tail_indent, self._verbose_level) or [])
        rendered.append("\n".join(pieces))
    return rendered
885
+
886
def _render_dns_group(self, sections: list[Section]) -> list[str]:
    """Render DNS findings section by section.

    Sections are rendered in the order given (singletons FIRST, then
    groups — Dave redline: singletons-first preserved). Each section opens
    with a plain lowercase ``label (N)`` line, N being the section's
    pre-cap count; no ── rules. A blank entry separates consecutive
    sections. A section labelled ``singletons`` uses the singleton row
    shape; any other label is rendered with the groups row shape.

    Column widths are derived from the section's findings before any row
    is formatted. The blocked column is omitted when no row in the section
    has was_blocked set — this preserves the pre-pihole Zeek-only output
    exactly.
    """
    tail_indent = " "
    level = self._verbose_level

    def blocked_cell(evidence) -> str:
        return "BLOCKED" if evidence.get("was_blocked") else ""

    def singleton_row(finding):
        evidence = finding.evidence
        tag = f"{str(finding.severity):<4}"
        cells = (
            f"ent={evidence['entropy']:.2f}",
            f"{evidence['query_count']} qry",
            f"{evidence['unique_sources']} src",
        )
        return tag, cells, blocked_cell(evidence), finding.title, finding

    def group_row(finding):
        evidence = finding.evidence
        tag = f"{str(finding.severity):<4}"
        sub = f"{evidence['subdomain_count']} sub"
        high, low = evidence["max_entropy"], evidence["min_entropy"]
        # A single value when the range is degenerate, "max–min" otherwise.
        spread = f"ent={high:.2f}" if high == low else f"ent={high:.2f}–{low:.2f}"
        cells = (
            sub,
            spread,
            f"{evidence['total_queries']} qry",
            f"{evidence['unique_sources']} src",
        )
        return tag, cells, blocked_cell(evidence), evidence["registrable_domain"], finding

    def lay_out(rows, aligns: str) -> list[str]:
        """Format rows; ``aligns`` holds one '<' or '>' per metric cell."""
        widths = [max(len(row[1][i]) for row in rows) for i in range(len(aligns))]
        blocked_w = max(len(row[2]) for row in rows)
        lines = []
        for tag, cells, blocked, domain, finding in rows:
            columns = " ".join(
                f"{cell:{align}{width}}"
                for cell, align, width in zip(cells, aligns, widths)
            )
            # The BLOCKED column exists only if some row in the section has it.
            badge = f"{blocked:<{blocked_w}} " if blocked_w > 0 else ""
            line = f" {tag} {columns} {badge}{domain}"
            tail = _level_tail(finding, tail_indent, level)
            lines.append(line + "\n" + "\n".join(tail) if tail else line)
        return lines

    rendered: list[str] = []
    for position, section in enumerate(sections):
        if position > 0:
            rendered.append("")
        rendered.append(f"{section.label} ({section.pre_cap_count})")

        if section.label == "singletons":
            rows = [singleton_row(finding) for finding in section.findings]
            rendered.extend(lay_out(rows, "<>>"))
        else:  # "groups"
            rows = [group_row(finding) for finding in section.findings]
            rendered.extend(lay_out(rows, "><>>"))

    return rendered
978
+
979
def _render_scan_group(self, sections: list[Section]) -> list[str]:
    """Render scan findings with columns aligned across all scan types.

    Flat detector: only ``sections[0]`` is read. Columns are
    severity | scan_type | ratio | src | type-specific middle | metric,
    and every width is derived from the section's findings before any row
    is formatted. ``vertical``, ``horizontal`` and ``block`` scans each
    have their own middle/metric shape; any other scan type is rendered
    with the "slow" shape (empty middle, ports/windows metric).
    """
    tail_indent = " "

    def target_and_metric(kind, evidence):
        if kind == "vertical":
            return (
                f"→ {evidence.get('dst', '')}",
                f"{evidence.get('distinct_ports', 0)} ports",
            )
        if kind == "horizontal":
            return (
                f"→ *:{evidence.get('port', '')}",
                f"{evidence.get('distinct_hosts', 0)} hosts",
            )
        if kind == "block":
            return (
                "→ *",
                f"{evidence.get('distinct_ports', 0)}p × {evidence.get('distinct_hosts', 0)}h",
            )
        # slow
        return (
            "",
            f"{evidence.get('distinct_ports', 0)} ports/{evidence.get('active_buckets', 0)} win",
        )

    records = []
    for finding in sections[0].findings:
        evidence = finding.evidence
        kind = evidence.get("scan_type", "")
        middle, metric = target_and_metric(kind, evidence)
        records.append((
            f"{str(finding.severity):<4}",
            kind,
            f"ratio={evidence.get('scan_state_ratio', 0):.2f}",
            evidence.get("src", ""),
            middle,
            metric,
            finding,
        ))

    type_w, ratio_w, src_w, middle_w, metric_w = (
        max(len(record[index]) for record in records) for index in range(1, 6)
    )

    rendered: list[str] = []
    for tag, kind, ratio, src, middle, metric, finding in records:
        left = f"{tag} {kind:<{type_w}} {ratio:<{ratio_w}}"
        right = f"{src:<{src_w}} {middle:<{middle_w}} {metric:>{metric_w}}"
        row = f"{left} {right}"
        tail = _level_tail(finding, tail_indent, self._verbose_level)
        rendered.append("\n".join([row, *tail]) if tail else row)
    return rendered
1029
+
1030
def _render_syslog_group(self, sections: list[Section]) -> list[str]:
    """Render syslog findings (flat detector — only ``sections[0]``).

    A MEDIUM finding prints its severity and title (the raw event line).
    Every other severity is rendered as the compact INFO reboot
    annotation: host, reboot timestamp (``unknown`` when missing/empty)
    and the suppressed window in seconds. Verbose tails are shared via
    ``_level_tail`` — for MEDIUM the curated subset surfaces the template
    & count details (drain3 internals that stay behind -v / -vv).
    """
    tail_indent = " "
    findings = sections[0].findings

    # Host column width spans every finding in the section.
    host_w = max(len(entry.evidence.get("host", "")) for entry in findings)

    rendered: list[str] = []
    for entry in findings:
        severity = str(entry.severity)
        if entry.severity == Severity.MEDIUM:
            headline = f"{severity} {entry.title}"
        else:  # INFO — reboot annotation
            evidence = entry.evidence
            host = evidence.get("host", "")
            when = evidence.get("reboot_ts") or "unknown"
            suppressed = evidence.get("suppressed_window_seconds", 0)
            headline = (
                f"{severity} {host:<{host_w}} reboot @ {when}"
                f" suppressed {suppressed}s window"
            )

        tail = _level_tail(entry, tail_indent, self._verbose_level)
        rendered.append(headline + "\n" + "\n".join(tail) if tail else headline)

    return rendered
1064
+
1065
def _render_duration_group(self, sections: list[Section]) -> list[str]:
    """Render duration findings with aligned columns (flat detector).

    Columns: severity | src → dst:port/proto | max_dur_str | avg_bps |
    N_conns | states. Source and destination are padded independently so
    the ``→`` arrows align vertically. Trailing whitespace is stripped
    from each row, so an empty states cell leaves no padding behind.
    """
    tail_indent = " "

    def rate(bps) -> str:
        """Humanise bytes/second; blank when the value is missing."""
        if bps is None:
            return ""
        for floor, unit in ((1_000_000, "mbps"), (1_000, "kbps")):
            if bps >= floor:
                return f"{bps / floor:.1f}{unit}"
        return f"{bps:.0f}bps"

    records = []
    for finding in sections[0].findings:
        evidence = finding.evidence
        src = evidence.get("src", "")
        port = evidence.get("port")
        proto = evidence.get("proto", "")
        port_text = "?" if port is None else str(port)
        endpoint = f"{evidence.get('dst', '')}:{port_text}/{proto}"
        duration = evidence.get("max_duration_str", "")
        throughput = rate(evidence.get("avg_bytes_per_second"))
        count = evidence.get("connection_count", 1)
        conns = f"{count} conn" if count == 1 else f"{count} conns"
        states = evidence.get("conn_states", [])
        state_text = ", ".join(states) if states else ""
        records.append((
            str(finding.severity), src, endpoint, duration,
            throughput, conns, state_text, finding,
        ))

    src_w, dst_w, dur_w, bps_w, conns_w, state_w = (
        max(len(record[index]) for record in records) for index in range(1, 7)
    )

    rendered: list[str] = []
    for tag, src, endpoint, duration, throughput, conns, state_text, finding in records:
        flow = f"{tag} {src:<{src_w}} → {endpoint:<{dst_w}}"
        stats = (
            f"{duration:<{dur_w}} {throughput:>{bps_w}} "
            f"{conns:>{conns_w}} {state_text:>{state_w}}"
        )
        row = f"{flow} {stats}".rstrip()
        tail = _level_tail(finding, tail_indent, self._verbose_level)
        rendered.append("\n".join([row, *tail]) if tail else row)
    return rendered
1116
+
1117
def _render_aws_group(self, sections: list[Section]) -> list[str]:
    """Render AWS findings as labelled subsections.

    Sections are rendered in the order given (burst sweeps, then ranked
    principals). Each opens with a plain lowercase ``label (N)`` line
    (N = the section's pre-cap count, no ── rules); a blank entry
    separates consecutive sections. A section labelled ``burst sweeps``
    is rendered as burst rows; any other section is treated as ranked
    principals: its ``tier == "ranked"`` findings become aligned rows,
    followed by its ``tier == "ranked_summary"`` quiet lines (the
    partitioner glued both tiers into the one section). Each tier computes
    its own column widths.
    """
    tail_indent = " "
    level = self._verbose_level

    def with_tail(line: str, finding) -> str:
        tail = _level_tail(finding, tail_indent, level)
        return "\n".join([line, *tail]) if tail else line

    def tag_of(finding) -> str:
        return f"{str(finding.severity):<4}"

    def error_cell(evidence) -> str:
        return f"err={float(evidence.get('error_rate', 0.0)):.0%}"

    def table(rows) -> list[str]:
        """Rows are (tag, principal, four metric cells..., finding):
        principal left-aligned, metric cells right-aligned."""
        widths = [max(len(row[index]) for row in rows) for index in range(1, 6)]
        lines = []
        for tag, principal, *cells, finding in rows:
            metrics = " ".join(
                f"{cell:>{width}}" for cell, width in zip(cells, widths[1:])
            )
            lines.append(
                with_tail(f" {tag} {principal:<{widths[0]}} {metrics}", finding)
            )
        return lines

    def burst_row(finding):
        evidence = finding.evidence
        return (
            tag_of(finding),
            str(evidence.get("principal", "")),
            f"{int(evidence.get('new_action_count', 0))} new",
            f"{int(evidence.get('new_service_count', 0))} svc",
            _aws_span_str(float(evidence.get("span_seconds", 0.0))),
            error_cell(evidence),
            finding,
        )

    def ranked_row(finding):
        evidence = finding.evidence
        return (
            tag_of(finding),
            str(evidence.get("principal", "")),
            f"z={float(evidence.get('composite_z', 0.0)):.2f}",
            error_cell(evidence),
            f"{int(evidence.get('event_count', 0))} ev",
            f"{int(evidence.get('distinct_source_ip', 0))} ip",
            finding,
        )

    def summary_line(finding) -> str:
        evidence = finding.evidence
        return (
            f" {tag_of(finding)} {finding.title} "
            f"({int(evidence.get('scorable_count', 0))} scored; "
            f"top {evidence.get('top_principal', '')} "
            f"z={float(evidence.get('top_composite_z', 0.0)):.2f})"
        )

    rendered: list[str] = []
    for position, section in enumerate(sections):
        if position > 0:
            rendered.append("")
        rendered.append(f"{section.label} ({section.pre_cap_count})")

        if section.label == "burst sweeps":
            rendered.extend(table([burst_row(f) for f in section.findings]))
            continue

        # "ranked principals"
        ranked = [f for f in section.findings if f.evidence.get("tier") == "ranked"]
        summary = [
            f for f in section.findings if f.evidence.get("tier") == "ranked_summary"
        ]
        if ranked:
            rendered.extend(table([ranked_row(f) for f in ranked]))
        for finding in summary:
            rendered.append(with_tail(summary_line(finding), finding))

    return rendered
1211
+
1212
def _render_finding(self, finding: Finding) -> str:
    """Generic one-finding renderer: ``<severity> <title>``, followed by
    the verbosity tail lines (if any) on subsequent lines."""
    headline = f"{str(finding.severity)} {finding.title}"
    tail = _level_tail(finding, " ", self._verbose_level)
    return "\n".join([headline, *tail]) if tail else headline
1221
+
1222
def render_digest(self, card: DigestCard) -> None:
    """Print a digest schema card — flat, flush-left, no banner.

    Order: 3-line identity block · ambient block · histogram · insights ·
    fields, each block separated by one blank line. No header rule, no
    N.B. footer, no trailing rule. The ambient, insights and fields blocks
    vanish (including their separating blank line) when they are empty.
    The inter-card separator on a multi-card run is emitted by the caller
    (run_digest) immediately before invoking this method.

    Called directly by ``runner.run_digest`` — bypassing the Finding-
    shaped Reporter.begin/write/end lifecycle, because a digest run
    produces ONE card. The Finding render path is intentionally untouched.
    """
    out = self._stream

    def emit(*text) -> None:
        print(*text, file=out)

    def emit_block(lines) -> None:
        """Blank separator, then each line; nothing when ``lines`` is empty."""
        if lines:
            emit()
            for line in lines:
                emit(line)

    # ── Identity block ────────────────────────────────────────────────
    emit(card.source_name)
    window = card.data_window
    if window[0] and window[1]:
        emit(_fmt_window(window))
    else:
        # Timeline unavailable: line 2 dashes; the histogram line below
        # carries the descriptive "(timeline unavailable)".
        emit("—")
    emit(
        f"{card.schema} · {card.record_count:,} lines · "
        f"{human_bytes(card.data_size_bytes)}"
    )

    # ── Ambient (former Zone 1) block ─────────────────────────────────
    emit_block(_render_label_value_block(card.zone1_extras))

    # ── Histogram ─────────────────────────────────────────────────────
    emit()
    emit(
        _render_histogram(
            card.histogram_counts,
            card.histogram_unit,
            card.histogram_peak,
            unavailable=card.timeline_unavailable,
        )
    )

    # ── Insights ──────────────────────────────────────────────────────
    emit_block(card.insights)

    # ── Fields (former Zone 3) block ──────────────────────────────────
    # Slots whose cells are None are skipped entirely.
    populated = [
        (slot.label, " ".join(slot.cells))
        for slot in card.fields
        if slot.cells is not None
    ]
    emit_block(_render_label_value_block(populated))
1286
+
1287
+
1288
def render_blob(self, card: BlobCard) -> None:
    """Print a blob digest card — flat, flush-left, no banner.

    Layout: a two-line identity block (blob has no window), a labeled
    best-guess headline, then a vanish-don't-dash slot list in
    ``label: value`` form. No footer, no inner separator, no trailing
    rule. The inter-card separator on a multi-card run is emitted by the
    caller (_render_blob_for_path) immediately before invoking this
    method.
    """
    out = self._stream
    is_binary = card.file_type_guess is not None

    # ── Identity block (two lines — blob has no window) ───────────────
    print(card.source_name, file=out)
    # Provenance line — blob's own; not the schema cards' rows/size line.
    # Terminal-binary is checked FIRST: a positive-magic ID has no line
    # concept. For today's gzip-container path file_type_guess is None
    # (containers profile the content under decompression), so this
    # ordering does not steal the compressed branch. "binary, sampled from
    # head" is card grammar, not a literal I/O trace: a large plain binary
    # may have done seek reads before the terminal verdict held; the
    # user-facing fact is "we ID'd it from the head and stopped looking
    # for log content."
    size = human_bytes(card.byte_size)
    if is_binary:
        provenance = f"{size} · binary, sampled from head"
    elif card.is_compressed:
        provenance = f"{size} compressed · sampled from head"
    else:
        provenance = (
            f"{size} · "
            f"sampled {card.sampled_line_count:,} lines across "
            f"{card.sample_read_count} reads"
        )
    print(provenance, file=out)
    print(file=out)

    # ── Headline — labeled best-guess ─────────────────────────────────
    if is_binary:
        headline = f"This looks like a {card.file_type_guess}, not a log."
    else:
        headline = f"Unrecognized source — but this looks like {card.shape_guess}."
    print(headline, file=out)
    print(file=out)

    # ── Slot list (vanish-don't-dash) ─────────────────────────────────
    slots: list[tuple[str, str]] = []

    # bytes: always present.
    printable = f"{card.printable_pct:.1f}% printable"
    if is_binary:
        if card.file_type_magic is None:
            magic_text = "?"
        else:
            magic_text = repr(card.file_type_magic)[2:-1]  # strip b'...' wrapper
        slots.append(("bytes", f"binary ({printable}), magic {magic_text}"))
    else:
        utf8_note = ", UTF-8 clean" if card.utf8_clean else ""
        slots.append(("bytes", f"text ({printable}){utf8_note}"))

    # shape: text only.
    if card.shape_guess is not None:
        slots.append(("shape", card.shape_guess))

    # lines: text only; absent on binary terminal.
    if card.mean_line_length is not None:
        length_shape = f", {card.line_length_shape}" if card.line_length_shape else ""
        slots.append((
            "lines",
            f"mean {card.mean_line_length:.0f} chars, "
            f"p95 {card.line_length_p95}, "
            f"max {card.max_line_length}{length_shape}",
        ))

    # fields: / tokens: — one or the other, never both. json_field_names
    # (names-no-values, set on a JSON shape-guess) wins; otherwise the
    # top-tokens row carries the literal-token spray. Neither populated →
    # the row vanishes. Whichever row is added is also the one that gets
    # the two-line wrap clamp below.
    wrap_label: str | None = None
    wrap_sep: str = ", "
    if card.json_field_names:
        wrap_label, wrap_sep = "fields", ", "
        slots.append((wrap_label, ", ".join(card.json_field_names)))
    elif card.top_tokens:
        wrap_label, wrap_sep = "tokens", " "
        quoted = " ".join(f'"{tok}"' for tok, _ in card.top_tokens[:5])
        slots.append((wrap_label, f"{quoted} [literal]"))

    # templates: text only; vanish on freeform floor / drain3 dormant.
    if card.distinct_templates is not None:
        slots.append((
            "templates",
            f"~{card.distinct_templates} distinct structures over "
            f"{card.sampled_line_count:,} sampled lines",
        ))

    # Render. The label column is the widest label plus two (kept in step
    # with _render_label_value_block's sizing so the two cannot drift);
    # the wrap-label row goes through the blob-local
    # _wrap_blob_slot_value clamp.
    gutter = max(len(label) for label, _ in slots) + 2
    for label, value in slots:
        head = f"{label}:".ljust(gutter)
        if label != wrap_label:
            print(f"{head}{value}", file=out)
            continue
        wrapped = _wrap_blob_slot_value(value, label_col=gutter, sep=wrap_sep)
        print(f"{head}{wrapped[0]}", file=out)
        for continuation in wrapped[1:]:
            print(continuation, file=out)
1420
+
1421
+
1422
+ register_handler("text", TextHandler)