loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,1225 @@
1
+ """Tests for general text output formatting."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ from datetime import datetime, timedelta, timezone
7
+
8
+ from loghunter.common.display import TEXT_RULE_WIDTH
9
+ from loghunter.common.finding import RunSummary
10
+ from loghunter.outputs.text import (
11
+ TextHandler,
12
+ _fmt_span,
13
+ _fmt_window,
14
+ _partition_aws as _aws_sections,
15
+ _partition_dns as _dns_sections,
16
+ )
17
+
18
+ _NOW = datetime(2026, 6, 2, tzinfo=timezone.utc)
19
+ _WINDOW = (_NOW, _NOW)
20
+
21
+
22
def _summary(notes: list[str] | None = None, skipped: dict[str, str] | None = None) -> RunSummary:
    """Build a single-detector (dns) RunSummary over the fixed ``_WINDOW``.

    Falsy ``notes`` / ``skipped`` are replaced by a fresh empty list / dict.
    """
    skipped_reasons = skipped if skipped else {}
    note_lines = notes if notes else []
    return RunSummary(
        data_window=_WINDOW,
        record_counts={"pihole*.log*": 3_235_587},
        data_size_bytes=0,
        detectors_run=["dns"],
        detectors_skipped=skipped_reasons,
        notes=note_lines,
    )
31
+
32
+
33
def test_run_summary_wraps_long_notes_with_aligned_continuation() -> None:
    """A long note wraps within TEXT_RULE_WIDTH; continuation lines sit under the label."""
    long_note = (
        "running on Pi-hole/dnsmasq logs - RTT, TTL, and connection correlation "
        "unavailable. Add Zeek for richer DNS analysis and conn.log correlation."
    )
    indent = " " * len("Note: ")
    rendered = TextHandler()._render_run_summary(_summary(notes=[long_note]))

    # Keep the labelled line plus every line indented to the label's width.
    note_lines = [
        line for line in rendered.splitlines()
        if line.startswith(("Note:", indent))
    ]

    assert len(note_lines) >= 2
    first, second = note_lines[0], note_lines[1]
    assert first.startswith("Note: running")
    for line in note_lines:
        assert len(line) <= TEXT_RULE_WIDTH
    assert second.startswith(indent)
    # The continuation must carry text, not just the indent.
    assert second[len(indent):]
51
+
52
+
53
def test_run_summary_wraps_long_skipped_reasons_generally() -> None:
    """A long skip reason wraps within TEXT_RULE_WIDTH with an indented continuation."""
    reason = (
        "no DNS source found - need zeek_dir DNS logs or pihole_dir logs "
        "before dns detection can run"
    )
    indent = " " * len("Skipped: ")
    rendered = TextHandler()._render_run_summary(_summary(skipped={"dns": reason}))

    # Keep the labelled line plus every line indented to the label's width.
    skipped_lines = [
        line for line in rendered.splitlines()
        if line.startswith(("Skipped:", indent))
    ]

    assert len(skipped_lines) >= 2
    assert skipped_lines[0].startswith("Skipped: dns")
    for line in skipped_lines:
        assert len(line) <= TEXT_RULE_WIDTH
    assert skipped_lines[1].startswith(indent)
72
+
73
+
74
def test_text_rule_width_is_design_constant() -> None:
    """Every double rule around the run-summary banner is exactly 80 columns."""
    banner = TextHandler()._render_run_summary(_summary())
    # A rule line consists solely of the double-rule glyph.
    rule_lines = []
    for line in banner.splitlines():
        if set(line) == {"═"}:
            rule_lines.append(line)

    assert rule_lines
    for rule in rule_lines:
        assert len(rule) == 80
    assert TEXT_RULE_WIDTH == 80
82
+
83
+
84
+ # ── _render_aws_group ────────────────────────────────────────────────────────
85
+
86
+ from loghunter.common.finding import Finding, Severity # noqa: E402
87
+
88
+
89
def _aws_finding(severity: Severity, evidence: dict, title: str | None = None,
                 description: str = "", next_steps: list[str] | None = None) -> Finding:
    """Build an ``aws`` Finding stamped with ``_NOW`` / ``_WINDOW``.

    When ``title`` is None it falls back to ``str(evidence["principal"])``,
    or the empty string if the evidence has no principal.
    """
    if title is None:
        title = str(evidence.get("principal", ""))
    steps = next_steps or []
    return Finding(
        detector="aws",
        severity=severity,
        title=title,
        description=description,
        evidence=evidence,
        next_steps=steps,
        ts_generated=_NOW,
        data_window=_WINDOW,
    )
101
+
102
+
103
def _burst_finding(principal: str, severity: Severity = Severity.MEDIUM, **overrides) -> Finding:
    """Build a burst-tier aws finding for *principal*.

    Keyword ``overrides`` replace (or extend) the default evidence entries.
    """
    defaults = {
        "tier": "burst",
        "principal": principal,
        "start_time": "2026-06-01T12:00:00+00:00",
        "span_seconds": 120.0,
        "new_action_count": 5,
        "new_service_count": 2,
        "error_rate": 0.0,
        "mean_rarity": 1.2,
        "new_actions": ["A", "B", "C", "D", "E"],
        "new_services": ["s3.amazonaws.com", "ec2.amazonaws.com"],
        "source_ips": ["192.0.2.10"],
        "aws_regions": ["us-east-1"],
        "sample_event_ids": ["e1", "e2"],
    }
    return _aws_finding(severity, {**defaults, **overrides})
121
+
122
+
123
def _ranked_finding(principal: str, severity: Severity = Severity.MEDIUM, **overrides) -> Finding:
    """Build a ranked-tier aws finding for *principal*.

    Keyword ``overrides`` replace (or extend) the default evidence entries.
    """
    defaults = {
        "tier": "ranked",
        "principal": principal,
        "composite_z": 2.4,
        "z_error_rate": 0.5,
        "z_distinct_source_ip": 1.0,
        "z_distinct_event_name": 0.7,
        "z_action_entropy": 0.2,
        "event_count": 120,
        "error_rate": 0.05,
        "distinct_source_ip": 8,
        "distinct_event_name": 15,
        "distinct_event_source": 3,
        "action_entropy": 2.1,
        "read_ratio": 0.7,
        "distinct_aws_region": 2,
        "distinct_hours_active": 6,
        "top_actions": ["GetObject", "ListBuckets"],
        "source_ips": ["192.0.2.10"],
        "aws_regions": ["us-east-1"],
        "sample_event_ids": ["e1"],
    }
    return _aws_finding(severity, {**defaults, **overrides})
148
+
149
+
150
def _ranked_summary_finding(scorable: int = 4, top: str = "placeholder-top",
                            top_z: float = 0.1) -> Finding:
    """Build the INFO-severity synthetic ``ranked_summary`` aws finding."""
    evidence = {
        "tier": "ranked_summary",
        "scorable_count": scorable,
        "top_principal": top,
        "top_composite_z": top_z,
    }
    headline = "ranked tier: no principals cleared the LOW band"
    return _aws_finding(Severity.INFO, evidence, title=headline)
162
+
163
+
164
def test_render_aws_group_orders_bursts_before_ranked() -> None:
    """Burst sweeps render ahead of ranked principals regardless of input order."""
    handler = TextHandler(verbose_level=0)
    # Deliberately supply the ranked finding first.
    sections = _aws_sections([
        _ranked_finding("alice"),
        _burst_finding("attacker", severity=Severity.HIGH),
    ])
    text = "\n".join(handler._render_aws_group(sections))
    assert text.index("burst sweeps") < text.index("ranked principals")
175
+
176
+
177
def test_render_aws_burst_line_is_glanceable_with_aligned_columns() -> None:
    """Burst row carries principal, new-action count, new-service count, span,
    error rate — all aligned, severity tag at the front."""
    handler = TextHandler(verbose_level=0)
    lines = handler._render_aws_group(_aws_sections([
        _burst_finding("attacker-short", severity=Severity.HIGH,
                       new_action_count=10, new_service_count=3,
                       span_seconds=60.0, error_rate=0.8),
        _burst_finding("longer-principal-name", severity=Severity.MEDIUM,
                       new_action_count=5, new_service_count=1,
                       span_seconds=300.0, error_rate=0.0),
    ]))
    # Row lines start with the 2-space indent + severity tag. The prefix is
    # spelled with an explicit repeat so the indent width cannot be silently
    # collapsed to one space by whitespace-normalising tools.
    row_prefix = " " * 2 + "["
    body = [ln for ln in lines if ln.startswith(row_prefix)]
    assert len(body) == 2
    high_row, medium_row = body
    # severity tags appear at the start
    assert "[H]" in high_row and "[M]" in medium_row
    # principal name visible
    assert "attacker-short" in high_row
    assert "longer-principal-name" in medium_row
    # the structured columns appear
    assert "10 new" in high_row and "5 new" in medium_row
    assert "3 svc" in high_row and "1 svc" in medium_row
    # span formatting
    assert "1m" in high_row    # 60s → 1m
    assert "5m" in medium_row  # 300s → 5m
    # error-rate formatting
    assert "err=80%" in high_row
    assert "err=0%" in medium_row
206
+
207
+
208
def test_render_aws_ranked_summary_appears_in_ranked_tier() -> None:
    """The synthetic ranked_summary finding must render inside the ranked tier,
    so the quiet 'nothing stood out' line is never silently dropped."""
    summary = _ranked_summary_finding(scorable=7, top="placeholder-top", top_z=0.4)
    rendered = TextHandler(verbose_level=0)._render_aws_group(_aws_sections([summary]))
    text = "\n".join(rendered)

    expected_fragments = (
        "ranked principals",                   # the tier header fires
        "no principals cleared the LOW band",  # summary title
        "[I]",                                 # info severity tag
        "7 scored",                            # scorable_count
        "placeholder-top",                     # top pivot
        "z=0.40",                              # top composite
    )
    for fragment in expected_fragments:
        assert fragment in text
223
+
224
+
225
def test_render_aws_ranked_per_principal_then_summary() -> None:
    """In the mixed case, per-principal rows come before the summary line,
    both under the same ranked-principals header."""
    sections = _aws_sections([
        _ranked_finding("alice", severity=Severity.MEDIUM, composite_z=2.4),
        _ranked_summary_finding(),
    ])
    text = "\n".join(TextHandler(verbose_level=0)._render_aws_group(sections))

    header_at = text.index("ranked principals")
    summary_at = text.index("no principals cleared the LOW band")
    alice_at = text.index("alice")

    # header → alice row → summary line
    assert header_at < alice_at
    assert alice_at < summary_at
243
+
244
+
245
def test_render_aws_empty_input_produces_no_output() -> None:
    """No findings in, no lines out."""
    rendered = TextHandler(verbose_level=0)._render_aws_group(_aws_sections([]))
    assert rendered == []
248
+
249
+
250
def test_render_aws_verbose_adds_description_evidence_next_steps_window() -> None:
    """At verbose_level=1 the burst row is accompanied by the description,
    an evidence block, next steps and the data window."""
    evidence = {
        "tier": "burst",
        "principal": "attacker",
        "start_time": "2026-06-01T12:00:00+00:00",
        "span_seconds": 60.0,
        "new_action_count": 5,
        "new_service_count": 2,
        "error_rate": 0.6,
        "mean_rarity": 1.5,
        "new_actions": ["X"],
        "new_services": ["s3.amazonaws.com"],
        "source_ips": ["192.0.2.10"],
        "aws_regions": ["us-east-1"],
        "sample_event_ids": ["evt-1"],
    }
    handler = TextHandler(verbose_level=1)
    finding = _aws_finding(
        Severity.HIGH,
        evidence,
        description="detected an enumeration sweep",
        next_steps=["review CloudTrail for attacker", "check source IP"],
    )
    text = "\n".join(handler._render_aws_group(_aws_sections([finding])))

    for fragment in (
        "detected an enumeration sweep",  # description
        "evidence:",                      # evidence block
        "next steps:",
        "data window:",
    ):
        assert fragment in text
279
+
280
+
281
def test_render_aws_burst_only_no_ranked_section() -> None:
    """With only burst findings, the ranked-principals subsection is absent."""
    sections = _aws_sections([_burst_finding("only-burst")])
    text = "\n".join(TextHandler(verbose_level=0)._render_aws_group(sections))
    assert "burst sweeps" in text
    assert "ranked principals" not in text
288
+
289
+
290
def test_render_aws_ranked_only_no_burst_section() -> None:
    """With only ranked findings, the burst-sweeps subsection is absent."""
    sections = _aws_sections([_ranked_finding("alice")])
    text = "\n".join(TextHandler(verbose_level=0)._render_aws_group(sections))
    assert "ranked principals" in text
    assert "burst sweeps" not in text
297
+
298
+
299
+ # ── Detect-banner: byte-identical regression snapshot ────────────────────────
300
+ #
301
+ # When the digest grammar collapsed, `_render_run_summary`'s digest branch
302
+ # went with it (Source: line, Lines: / Records: label seam, Data found: —
303
+ # parity rail). The detect path uses the same helper and MUST emit the
304
+ # same banner it did before that surgery. This snapshot locks the detect
305
+ # banner shape; any drift fails the test loudly.
306
+
307
+
308
def test_render_run_summary_detect_banner_snapshot() -> None:
    """Byte-for-byte snapshot of a reference detect-run banner.

    Covers banner, window, records, detectors, skipped and note rows. Named
    methods render as ``name (label)`` and detectors are joined by `` · ``.
    The stream is a StringIO (not a TTY), so the labels appear unpainted.

    NOTE(review): the label/value padding in the expected lines below may have
    been whitespace-collapsed by the diff rendering this file was read from —
    confirm against the real renderer before relying on the exact spacing.
    """
    from loghunter.common.finding import MethodTag

    window_start = datetime(2026, 6, 1, 12, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 1, 18, 30, tzinfo=timezone.utc)
    rs = RunSummary(
        data_window=(window_start, window_end),
        record_counts={"conn*.log*": 12_345, "dns*.log*": 678},
        data_size_bytes=0,
        detectors_run=["beacon", "dns"],
        detectors_skipped={"scan": "no conn data"},
        notes=["test note"],
        data_sources=["zeek_conn", "zeek_dns"],
        detector_methods={
            "beacon": MethodTag("FFT", named=True),
            "dns": MethodTag("HDBSCAN", named=True),
        },
    )
    expected_lines = [
        "LogHunter · Threat Hunt",
        "════════════════════════════════════════════════════════════════════════════════",
        "Data found: 2026-06-01 12:00 → 2026-06-01 18:30 (0.3d)",
        "Records: 12,345 conn*.log* · 678 dns*.log*",
        "Detectors: beacon (FFT) · dns (HDBSCAN)",
        "Skipped: scan — no conn data",
        "Note: test note",
        "════════════════════════════════════════════════════════════════════════════════",
    ]
    rendered = TextHandler(stream=io.StringIO())._render_run_summary(rs)
    # No trailing newline after the closing rule.
    assert rendered == "\n".join(expected_lines)
345
+
346
+
347
def test_render_run_summary_detect_omits_optional_rows_when_empty() -> None:
    """A minimal detect run renders the banner without any optional rows.

    With no record counts, detectors, skips or notes configured, none of the
    corresponding labelled rows may appear, and the retired digest-only rows
    (``Source:``, ``Lines:``) must stay gone.
    """
    rs = RunSummary(
        data_window=_WINDOW,
        record_counts={},
        data_size_bytes=0,
        detectors_run=[],
        detectors_skipped={},
        notes=[],
    )
    lines = TextHandler()._render_run_summary(rs).splitlines()

    assert lines[0] == "LogHunter · Threat Hunt"
    # Digest-branch labels plus every truthy-guarded optional row. The
    # data-window row is intentionally not asserted here: whether it renders
    # for a same-instant window is pinned by the data-found tests instead.
    absent_labels = ("Source:", "Lines:", "Records:", "Detectors:", "Skipped:", "Note:")
    for label in absent_labels:
        assert not any(ln.startswith(label) for ln in lines), label
367
+
368
+
369
+ # ── _fmt_span + data-found underfill parenthetical ───────────────────────────
370
+
371
+
372
def test_fmt_span_edges() -> None:
    """Pin ``_fmt_span`` at the hour/day boundaries."""
    cases = [
        (timedelta(hours=18), "18h"),
        (timedelta(hours=1), "1h"),
        (timedelta(days=2), "2d"),
        (timedelta(days=1, hours=12), "1.5d"),
        (timedelta(days=7), "7d"),
        # No surprising unit-crossing: a sub-24h span that rounds up promotes to "1d".
        (timedelta(hours=23, minutes=40), "1d"),
    ]
    for span, expected in cases:
        assert _fmt_span(span) == expected
380
+
381
+
382
def _summary_window(span: timedelta, requested_span: timedelta | None) -> RunSummary:
    """Build an otherwise-empty RunSummary whose data window starts at
    2026-06-01 00:00 UTC and lasts *span*, carrying *requested_span* through."""
    window_start = datetime(2026, 6, 1, 0, 0, tzinfo=timezone.utc)
    window_end = window_start + span
    return RunSummary(
        data_window=(window_start, window_end),
        record_counts={},
        data_size_bytes=0,
        detectors_run=[],
        detectors_skipped={},
        requested_span=requested_span,
    )
392
+
393
+
394
def _data_found(rs: RunSummary) -> str:
    """Render the run summary and collapse every whitespace run (including line
    breaks) to one space, so a phrase that wrapped across physical lines can
    still be matched as a single substring."""
    handler = TextHandler(stream=io.StringIO())
    words = handler._render_run_summary(rs).split()
    return " ".join(words)
399
+
400
+
401
def test_data_found_underfilled_hours() -> None:
    """18h of data in a 1d request renders the underfill parenthetical in hours."""
    rs = _summary_window(timedelta(hours=18), timedelta(days=1))
    assert "(18h data span in 1d window)" in _data_found(rs)
404
+
405
+
406
def test_data_found_underfilled_days() -> None:
    """1.5d of data in a 2d request renders the underfill parenthetical in days."""
    rs = _summary_window(timedelta(days=1, hours=12), timedelta(days=2))
    assert "(1.5d data span in 2d window)" in _data_found(rs)
409
+
410
+
411
def test_data_found_full_uses_fmt_window_unchanged() -> None:
    """requested_span None (full / --all) → the plain _fmt_window form, no clause."""
    rs = _summary_window(timedelta(days=2), None)
    out = _data_found(rs)
    plain_window = " ".join(_fmt_window(rs.data_window).split())
    assert plain_window in out
    assert "data span in" not in out
417
+
418
+
419
def test_data_found_underfill_below_tolerance_no_clause() -> None:
    """A shortfall inside the 1h tolerance renders the plain _fmt_window form."""
    short_by_30m = timedelta(hours=23, minutes=30)
    rs = _summary_window(short_by_30m, timedelta(days=1))
    out = _data_found(rs)
    plain_window = " ".join(_fmt_window(rs.data_window).split())
    assert "data span in" not in out
    assert plain_window in out
425
+
426
+
427
def test_data_found_underfill_at_tolerance_shows_clause() -> None:
    """A shortfall of exactly the tolerance (>=) is enough to show the clause."""
    short_by_1h = timedelta(hours=23)
    out = _data_found(_summary_window(short_by_1h, timedelta(days=1)))
    assert "data span in" in out
431
+
432
+
433
def test_data_found_disjoint_no_clause() -> None:
    """When the data span exceeds the requested span (disjoint archive), no clause renders."""
    out = _data_found(_summary_window(timedelta(days=3), timedelta(days=1)))
    assert "data span in" not in out
437
+
438
+
439
def test_data_found_no_data_window_instant_no_clause() -> None:
    """The no-data case: a zero-length window with requested_span=None falls
    through to _fmt_window and renders no underfill clause."""
    rs = _summary_window(timedelta(0), None)
    out = _data_found(rs)
    plain_window = " ".join(_fmt_window(rs.data_window).split())
    assert "data span in" not in out
    assert plain_window in out
446
+
447
+
448
+ # ── Flat digest renderer ────────────────────────────────────────────────────
449
+
450
+ from loghunter.common.finding import DigestCard, DigestSlot # noqa: E402
451
+ from loghunter.common.display import human_bytes # noqa: E402
452
+ from loghunter.outputs.text import ( # noqa: E402
453
+ _render_histogram,
454
+ _render_label_value_block,
455
+ )
456
+
457
+
458
def _digest_card(
    schema: str = "conn",
    source_name: str = "conn.log",
    record_count: int = 100_000,
    data_size_bytes: int = 38 * 1024 * 1024,
    zone1_extras: list[tuple[str, str]] | None = None,
    insights: list[str] | None = None,
    fields: list[DigestSlot] | None = None,
    histogram_counts: list[int] | None = None,
    histogram_peak: int = 0,
    timeline_unavailable: bool = False,
    data_window: tuple | None = None,
) -> DigestCard:
    """Build a DigestCard with test defaults.

    Falsy list arguments fall back to defaults: ``zone1_extras`` becomes
    ``[("hosts", "1462")]`` and the other lists become empty. A falsy
    ``data_window`` falls back to ``_WINDOW``; a ``(None, None)`` tuple is
    truthy and is therefore passed through unchanged. The histogram unit is
    always ``"hr"``.
    """
    return DigestCard(
        schema=schema,
        source_name=source_name,
        data_window=data_window or _WINDOW,
        record_count=record_count,
        histogram_counts=histogram_counts or [],
        histogram_unit="hr",
        histogram_peak=histogram_peak,
        zone1_extras=zone1_extras or [("hosts", "1462")],
        insights=insights or [],
        fields=fields or [],
        data_size_bytes=data_size_bytes,
        timeline_unavailable=timeline_unavailable,
    )
485
+
486
+
487
def _render(card: DigestCard) -> list[str]:
    """Render *card* via a StringIO-backed TextHandler and return the emitted lines."""
    digest_handler = TextHandler(stream=io.StringIO())
    digest_handler.render_digest(card)
    emitted = digest_handler._stream.getvalue()
    return emitted.splitlines()
491
+
492
+
493
def test_render_digest_identity_block_three_lines_flush_left() -> None:
    """The first three lines are filename / window / 'schema · N lines · size':
    no banner, no header rule, flush-left, and the count is exact with
    thousands separators (not the rounded 100.0K form)."""
    size_bytes = int(38.3 * 1024 * 1024)
    card = _digest_card(
        schema="conn",
        source_name="conn.100K.log",
        record_count=100_000,
        data_size_bytes=size_bytes,
    )
    name_line, window_line, identity_line = _render(card)[:3]
    assert name_line == "conn.100K.log"
    assert window_line == _fmt_window(_WINDOW)
    assert identity_line == "conn · 100,000 lines · 38.3 MB"
505
+
506
+
507
def test_render_digest_no_banner_no_header_rule_no_trailing_sep() -> None:
    """The flat card carries no LogHunter banner, no ``── digest · X ──`` rule,
    no inner separators and no trailing separator. Any U+2500 rule between
    cards is the caller's job, not render_digest's."""
    rendered = "\n".join(_render(_digest_card()))
    forbidden = (
        "LogHunter",
        "── digest",
        "─",  # no rule from render_digest
        "N.B.",
    )
    for fragment in forbidden:
        assert fragment not in rendered
518
+
519
+
520
def test_render_digest_identity_line_2_dashes_on_timeline_unavailable() -> None:
    """With the timeline unavailable, identity line 2 is a bare em-dash and
    the '(timeline unavailable)' text appears as its own line."""
    card = _digest_card(timeline_unavailable=True, data_window=(None, None))
    lines = _render(card)
    window_line = lines[1]
    assert window_line == "—"
    # List membership: the phrase must be an entire line, not a substring.
    assert "(timeline unavailable)" in lines
530
+
531
+
532
def test_render_digest_record_count_uses_exact_thousands_separator() -> None:
    """Identity line 3 shows the exact comma-separated count, never a rounded form."""
    identity_line = _render(_digest_card(record_count=560_742))[2]
    assert "560,742 lines" in identity_line
    assert "560.7K" not in identity_line
539
+
540
+
541
def test_render_digest_fields_block_uses_two_space_cell_join() -> None:
    """The fields block renders each speaking slot's cells joined by two
    spaces, with the value column aligned by max label width + 2."""
    cells = ["host-a", "12000", "2.5x"]
    cliff = DigestSlot(
        label="conn-share", statistic="cliff",
        cells=cells,
        entity="host-a", magnitude=12000, ratio=2.5,
    )
    card = _digest_card(fields=[cliff])
    lines = _render(card)
    # label_w = 10 ('conn-share'); (label + ':').ljust(12) = "conn-share: "
    # (one trailing space after the colon) → single space before host-a.
    # The cells themselves are separated by TWO spaces; the separator is
    # spelled explicitly so the double space cannot be collapsed unnoticed.
    cell_sep = " " * 2
    expected = "conn-share: " + cell_sep.join(cells)
    assert expected in lines
555
+
556
+
557
def test_render_digest_field_block_empty_when_no_fields() -> None:
    """With no fields (every speaking slot became an insight) the card ends on
    its last insight — no trailing field block follows it."""
    insight_text = (
        "192.0.2.41:53 reaches 1008 distinct destinations, 2.2x the next-broadest source."
    )
    card = _digest_card(insights=[insight_text], fields=[])
    rendered = "\n".join(_render(card))

    marker = "192.0.2.41:53 reaches"
    assert marker in rendered
    # The final line of the output must be the insight itself.
    final_line = rendered.rstrip("\n").rpartition("\n")[2]
    assert marker in final_line
572
+
573
+
574
+ # ── _render_histogram — flush-left, three branches ──────────────────────────
575
+
576
+
577
def test_render_histogram_unavailable_is_bare_no_indent() -> None:
    """unavailable=True yields the bare marker line with no leading indent."""
    rendered = _render_histogram([], "hr", 0, unavailable=True)
    assert rendered == "(timeline unavailable)"
580
+
581
+
582
def test_render_histogram_unavailable_wins_over_populated_counts() -> None:
    """unavailable=True takes precedence even when counts are populated."""
    rendered = _render_histogram([3, 2, 5], "hr", 5, unavailable=True)
    assert rendered == "(timeline unavailable)"
587
+
588
+
589
def test_render_histogram_empty_counts_no_events_branch_flush_left() -> None:
    """Empty counts yield the bare no-events line with no leading indent."""
    rendered = _render_histogram([], "hr", 0)
    assert rendered == "(no events in window)"
592
+
593
+
594
def test_render_histogram_populated_renders_bars_unit_peak_flush_left() -> None:
    """Populated counts render a non-empty, flush-left line that names the
    hourly bin unit and the peak."""
    rendered = _render_histogram([1, 2, 3], "hr", 3)
    # The previous check, rendered.startswith(rendered[0]), was true for any
    # non-empty string; assert the intended properties directly instead.
    assert rendered                      # something was drawn
    assert not rendered[:1].isspace()    # flush-left: no leading whitespace
    assert "hourly bins" in rendered
    assert "peak:" in rendered
600
+
601
+
602
+ # ── _render_label_value_block — alignment math ──────────────────────────────
603
+
604
+
605
def test_label_value_block_aligns_value_column() -> None:
    """Every value starts at the same column: widest label width + 2."""
    rows = [
        ("hosts", "1462 (22 internal, 1440 external)"),
        ("outbound bytes", "6.1 GB"),
        ("inbound bytes", "0 B"),
    ]
    lines = _render_label_value_block(rows)
    # All values start at the same column (label_w + 2 = 16 for "outbound bytes").
    # Padding is written as explicit repeats so the expected alignment stays
    # exact even if whitespace runs in this file get collapsed by tooling:
    #   "hosts:"          (6)  + 10 spaces → column 16
    #   "outbound bytes:" (15) +  1 space  → column 16
    #   "inbound bytes:"  (14) +  2 spaces → column 16
    assert lines[0] == "hosts:" + " " * 10 + "1462 (22 internal, 1440 external)"
    assert lines[1] == "outbound bytes:" + " " * 1 + "6.1 GB"
    assert lines[2] == "inbound bytes:" + " " * 2 + "0 B"
    # Direct statement of the alignment property itself.
    for line, (_, value) in zip(lines, rows):
        assert line.index(value) == 16
616
+
617
+
618
def test_label_value_block_empty_returns_empty() -> None:
    """No rows in, no lines out."""
    rendered = _render_label_value_block([])
    assert rendered == []
620
+
621
+
622
+ # ── DigestCard contract additions ───────────────────────────────────────────
623
+
624
+
625
def test_digest_card_defaults_timeline_available() -> None:
    """A default helper card has the timeline available and no insights or fields."""
    default_card = _digest_card()
    assert default_card.timeline_unavailable is False
    assert default_card.insights == []
    assert default_card.fields == []
630
+
631
+
632
def test_digest_card_accepts_none_window_on_timeline_unavailable() -> None:
    """A card may be constructed with a (None, None) window when the timeline
    is unavailable, and it keeps both values as given.

    The card is built in that state rather than mutated afterwards, so the
    test exercises what DigestCard accepts instead of a plain attribute
    assignment.
    """
    card = _digest_card(timeline_unavailable=True, data_window=(None, None))
    assert card.timeline_unavailable is True
    assert card.data_window == (None, None)
636
+
637
+
638
def test_human_bytes_handles_thresholds() -> None:
    """Pin human_bytes at the B, KB and GB scales."""
    expectations = {
        0: "0 B",
        847 * 1024: "847.0 KB",
        2_576_980_377: "2.4 GB",
    }
    for size, label in expectations.items():
        assert human_bytes(size) == label
642
+
643
+
644
+ # ── W1: methods chrome + color seam + compact_home ──────────────────────────
645
+
646
+
647
+ def _tty_stream() -> io.StringIO:
648
+ """StringIO that reports isatty=True — used to exercise the paint branch."""
649
+ class _TTYIO(io.StringIO):
650
+ def isatty(self) -> bool:
651
+ return True
652
+ return _TTYIO()
653
+
654
+
655
+ def _strip_sgr(text: str) -> str:
656
+ """Strip CSI escape sequences for canonical-text comparisons."""
657
+ import re
658
+ return re.sub(r"\x1b\[[0-9;]*m", "", text)
659
+
660
+
661
def test_render_detectors_named_method_painted_on_tty(monkeypatch) -> None:
    """Positive color test: a named method label is painted on a TTY.

    NO_COLOR is cleared and TERM set to a color-capable value so that an
    ambient ``NO_COLOR=1`` / ``TERM=dumb`` cannot silently disable paint.
    """
    from loghunter.common.finding import MethodTag

    monkeypatch.delenv("NO_COLOR", raising=False)
    monkeypatch.setenv("TERM", "xterm-256color")
    methods = {"beacon": MethodTag("FFT", named=True)}
    rs = RunSummary(
        data_window=_WINDOW,
        record_counts={},
        data_size_bytes=0,
        detectors_run=["beacon"],
        detectors_skipped={},
        detector_methods=methods,
    )
    handler = TextHandler(stream=_tty_stream())
    rendered = handler._render_run_summary(rs)
    # Paint applied — SGR brackets the label only.
    painted = "beacon (\x1b[96;1mFFT\x1b[0m)"
    assert painted in rendered
    # With SGR stripped, the plain canonical form remains.
    assert "beacon (FFT)" in _strip_sgr(rendered)
681
+
682
+
683
def test_render_detectors_named_method_plain_on_non_tty(monkeypatch) -> None:
    """A named method label renders as plain ``(FFT)`` on a non-TTY stream.

    Both ``NO_COLOR`` and ``TERM`` are removed from the environment first.
    """
    from loghunter.common.finding import MethodTag
    for variable in ("NO_COLOR", "TERM"):
        monkeypatch.delenv(variable, raising=False)
    method_tags = {"beacon": MethodTag("FFT", named=True)}
    summary = RunSummary(
        data_window=_WINDOW,
        record_counts={},
        data_size_bytes=0,
        detectors_run=["beacon"],
        detectors_skipped={},
        detector_methods=method_tags,
    )
    handler = TextHandler(stream=io.StringIO())
    banner = handler._render_run_summary(summary)
    assert "beacon (FFT)" in banner
    # No escape-sequence introducer anywhere in the output.
    assert "\x1b[" not in banner
698
+
699
+
700
def test_render_detectors_honest_badge_never_painted(monkeypatch) -> None:
    """The ``[brackets]`` (unnamed) method badge stays plain even on a TTY.

    The environment mirrors the positive paint test: ``NO_COLOR`` removed
    and ``TERM`` set to a colour-capable value. That way the absence of SGR
    sequences is attributable to the badge being unnamed, not to colour
    having been switched off by the environment.
    """
    from loghunter.common.finding import MethodTag
    monkeypatch.delenv("NO_COLOR", raising=False)
    # Same colour-capable TERM as the positive test, so this negative
    # assertion cannot pass merely because TERM was missing.
    monkeypatch.setenv("TERM", "xterm-256color")
    rs = RunSummary(
        data_window=_WINDOW,
        record_counts={},
        data_size_bytes=0,
        detectors_run=["scan"],
        detectors_skipped={},
        detector_methods={"scan": MethodTag("pattern", named=False)},
    )
    rendered = TextHandler(stream=_tty_stream())._render_run_summary(rs)
    assert "scan [pattern]" in rendered
    assert "\x1b[" not in rendered
716
+
717
+
718
def test_render_detectors_no_tag_renders_bare_name() -> None:
    """A detector whose method tag is ``None`` renders as its bare name.

    The name must not be followed by an opening paren or bracket.
    """
    summary = RunSummary(
        data_window=_WINDOW,
        record_counts={},
        data_size_bytes=0,
        detectors_run=["mystery"],
        detectors_skipped={},
        detector_methods={"mystery": None},
    )
    handler = TextHandler(stream=io.StringIO())
    banner = handler._render_run_summary(summary)
    assert "mystery" in banner
    for decorated in ("mystery (", "mystery ["):
        assert decorated not in banner
733
+
734
+
735
def test_render_detectors_joined_by_middle_dot() -> None:
    """Detectors are joined with `` · `` and each carries its own badge:
    parens for named methods, brackets for unnamed ones."""
    from loghunter.common.finding import MethodTag
    tags = {
        "beacon": MethodTag("FFT", named=True),
        "scan": MethodTag("pattern", named=False),
        "syslog": MethodTag("drain3", named=True),
    }
    summary = RunSummary(
        data_window=_WINDOW,
        record_counts={},
        data_size_bytes=0,
        detectors_run=["beacon", "scan", "syslog"],
        detectors_skipped={},
        detector_methods=tags,
    )
    handler = TextHandler(stream=io.StringIO())
    banner = handler._render_run_summary(summary)
    assert "beacon (FFT) · scan [pattern] · syslog (drain3)" in banner
751
+
752
+
753
def test_render_detectors_no_color_env_suppresses_paint(monkeypatch) -> None:
    """With ``NO_COLOR=1`` set, a named method label is plain even on a TTY."""
    from loghunter.common.finding import MethodTag
    monkeypatch.setenv("NO_COLOR", "1")
    method_tags = {"beacon": MethodTag("FFT", named=True)}
    summary = RunSummary(
        data_window=_WINDOW,
        record_counts={},
        data_size_bytes=0,
        detectors_run=["beacon"],
        detectors_skipped={},
        detector_methods=method_tags,
    )
    handler = TextHandler(stream=_tty_stream())
    banner = handler._render_run_summary(summary)
    assert "\x1b[" not in banner
    assert "beacon (FFT)" in banner
767
+
768
+
769
def test_render_detectors_term_dumb_suppresses_paint(monkeypatch) -> None:
    """With ``TERM=dumb`` (and ``NO_COLOR`` removed), a named method label is
    plain even on a TTY."""
    from loghunter.common.finding import MethodTag
    monkeypatch.delenv("NO_COLOR", raising=False)
    monkeypatch.setenv("TERM", "dumb")
    method_tags = {"beacon": MethodTag("FFT", named=True)}
    summary = RunSummary(
        data_window=_WINDOW,
        record_counts={},
        data_size_bytes=0,
        detectors_run=["beacon"],
        detectors_skipped={},
        detector_methods=method_tags,
    )
    handler = TextHandler(stream=_tty_stream())
    banner = handler._render_run_summary(summary)
    assert "\x1b[" not in banner
    assert "beacon (FFT)" in banner
784
+
785
+
786
def test_compact_home_replaces_home_prefix(monkeypatch) -> None:
    """``compact_home`` rewrites a leading ``$HOME`` to ``~`` and leaves
    paths outside it untouched."""
    from loghunter.common.display import compact_home
    monkeypatch.setenv("HOME", "/Users/example")
    cases = (
        ("/Users/example/exports/run.log", "~/exports/run.log"),
        ("/Users/example", "~"),
        ("/Users/example/", "~/"),
        ("/var/log/syslog", "/var/log/syslog"),
    )
    for raw_path, compacted in cases:
        assert compact_home(raw_path) == compacted
793
+
794
+
795
def test_compact_home_passthrough_when_home_unset(monkeypatch) -> None:
    """With ``HOME`` unset, a system path comes back unchanged and nothing
    raises."""
    from loghunter.common.display import compact_home
    monkeypatch.delenv("HOME", raising=False)
    # NOTE(review): on POSIX, ``os.path.expanduser("~")`` falls back to the
    # password database when HOME is unset and only returns "~" verbatim if
    # that lookup fails. Either way this path is not expected to sit under a
    # home directory, so it must pass through untouched.
    system_path = "/var/log/syslog"
    assert compact_home(system_path) == "/var/log/syslog"
802
+
803
+
804
+ # ── W2/W3: group header + vanish-don't-dash + pipeline ──────────────────────
805
+
806
+
807
def _bare_finding(detector: str, severity: Severity, title: str = "the title") -> Finding:
    """Build a ``Finding`` that carries only a detector, severity and title.

    Description, evidence and next_steps are empty; the timestamp and data
    window come from the module-level ``_NOW`` and ``_WINDOW`` fixtures.
    Fresh empty containers are created on every call.
    """
    empty_body = {"description": "", "evidence": {}, "next_steps": []}
    return Finding(
        detector=detector,
        severity=severity,
        title=title,
        ts_generated=_NOW,
        data_window=_WINDOW,
        **empty_body,
    )
820
+
821
+
822
def _capture_write(handler: TextHandler, findings: list[Finding]) -> str:
    """Drive *handler* through ``begin``/``write``/``end`` and return its output.

    A ``RunSummary`` is synthesized whose ``detectors_run`` lists each
    detector named by *findings* once. The handler's ``_stream`` is replaced
    with a fresh ``io.StringIO`` so the text can be captured regardless of
    how the handler was constructed.

    Args:
        handler: The text handler under test; its ``_stream`` is overwritten.
        findings: Findings to pass to ``handler.write``.

    Returns:
        Everything the handler wrote to the captured stream.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. A set would
    # order the names by randomized string hash, so anything derived from
    # ``detectors_run`` could differ from one test run to the next.
    detectors = list(dict.fromkeys(f.detector for f in findings))
    summary = RunSummary(
        data_window=_WINDOW, record_counts={}, data_size_bytes=0,
        detectors_run=detectors,
        detectors_skipped={},
    )
    stream = io.StringIO()
    handler._stream = stream
    handler.begin(summary)
    handler.write(findings)
    handler.end()
    return stream.getvalue()
834
+
835
+
836
def test_group_header_renders_count_and_severity_breakdown() -> None:
    """The group header reads ``<detector> — N findings · <tier counts>``
    followed by an 80-column rule; tiers with a zero count are left out."""
    tier_counts = ((Severity.HIGH, 3), (Severity.MEDIUM, 2), (Severity.INFO, 1))
    findings = [
        _bare_finding("scan", tier)
        for tier, how_many in tier_counts
        for _ in range(how_many)
    ]
    out = _capture_write(TextHandler(verbose_level=0), findings)
    assert "scan — 6 findings · 3 H 2 M 1 I" in out
    rule = "─" * 80
    assert rule in out
848
+
849
+
850
def test_group_header_omits_zero_severity_tiers() -> None:
    """An all-MEDIUM group's header lists the ``M`` tier and nothing else."""
    findings = [_bare_finding("scan", Severity.MEDIUM) for _ in range(4)]
    out = _capture_write(TextHandler(verbose_level=0), findings)
    # First output line that begins with the group-header prefix.
    header = next(filter(lambda ln: ln.startswith("scan —"), out.split("\n")))
    assert header == "scan — 4 findings · 4 M"
856
+
857
+
858
def test_vanish_dont_dash_minimal_finding_renders_title_alone() -> None:
    """A beacon finding with evidence only never yields a triple newline.

    Description and next_steps stay empty while the evidence below is
    populated, so this checks blank-line spacing at verbose levels 0, 1 and
    2 rather than a title-only rendering. The fully empty case is covered by
    ``test_vanish_truly_bare_finding_renders_title_only``.
    """
    finding = _bare_finding("beacon", Severity.HIGH, "the title")
    # jitter_cv / spectral_ratio / prominence_norm are deliberately absent.
    # Per the original author's note, beacon's curated subset still renders
    # beacon_score, conn_count and period_str from this dict.
    finding.evidence = {
        "src_ip": "192.0.2.10",
        "dst_ip": "203.0.113.5",
        "dst_port": 8443,
        "proto": "tcp",
        "period_str": "60.0s",
        "beacon_score": 0.55,
        "conn_count": 200,
    }
    for level in (0, 1, 2):
        out = _capture_write(TextHandler(verbose_level=level), [finding])
        triple_blank_runs = out.count("\n\n\n")
        assert triple_blank_runs == 0, f"no triple blank at level {level}"
883
+
884
+
885
def test_vanish_truly_bare_finding_renders_title_only() -> None:
    """A finding with no description, evidence or next_steps shows its title
    and none of the ``evidence:`` / ``next steps:`` / ``data window:`` labels
    at verbose levels 0, 1 and 2."""
    bare = _bare_finding("unknown-detector", Severity.LOW, "no detail here")
    for level in (0, 1, 2):
        out = _capture_write(TextHandler(verbose_level=level), [bare])
        # Guard against the helper's default title leaking into the output.
        assert "the title" not in out
        assert "no detail here" in out
        assert "evidence:" not in out, f"empty evidence MUST vanish at level {level}"
        assert "next steps:" not in out, f"empty next_steps MUST vanish at level {level}"
        assert "data window:" not in out, (
            f"data window line MUST NOT appear when there's no other body "
            f"content at level {level}"
        )
901
+
902
+
903
def test_duration_low_hidden_at_level_0_visible_at_level_1() -> None:
    """W6 + W2 step 1: a LOW duration finding is absent from level-0 output
    and present at level 1, while the HIGH one shows at both.

    Each finding is identified by its source IP, which appears in the title
    and in the evidence dict.
    """
    def _duration_finding(sev: Severity, src: str) -> Finding:
        finding = _bare_finding("duration", sev, f"{src} → x:443/tcp")
        finding.evidence = {
            "src": src,
            "dst": "203.0.113.1",
            "port": 443,
            "proto": "tcp",
            "max_duration_str": "1h 0m",
            "connection_count": 1,
            "avg_bytes_per_second": None,
            "conn_states": [],
        }
        return finding

    pair = [
        _duration_finding(Severity.HIGH, "192.0.2.10"),
        _duration_finding(Severity.LOW, "192.0.2.20"),
    ]
    quiet = _capture_write(TextHandler(verbose_level=0), pair)
    verbose = _capture_write(TextHandler(verbose_level=1), pair)
    assert "192.0.2.10" in quiet
    assert "192.0.2.20" not in quiet
    assert "192.0.2.10" in verbose
    assert "192.0.2.20" in verbose
922
+
923
+
924
def test_severity_sort_primary_within_subsection() -> None:
    """Findings given as L, H, M, I come out ordered H → M → L → I.

    The ``misc`` detector's rows are matched on their ``title-`` prefix and
    compared as whole ``[X] title`` lines.
    """
    shuffled = [
        _bare_finding("misc", tier, name)
        for tier, name in (
            (Severity.LOW, "title-low"),
            (Severity.HIGH, "title-high"),
            (Severity.MEDIUM, "title-medium"),
            (Severity.INFO, "title-info"),
        )
    ]
    out = _capture_write(TextHandler(verbose_level=0), shuffled)
    # Every matching line contains "title-", so none of them can be empty.
    title_rows = [ln for ln in out.split("\n") if "title-" in ln]
    assert title_rows == [
        "[H] title-high",
        "[M] title-medium",
        "[L] title-low",
        "[I] title-info",
    ]
942
+
943
+
944
def test_digest_card_verbosity_invariant() -> None:
    """W3: ``render_digest`` writes the same text at verbose levels 0, 1, 2."""
    from loghunter.common.finding import DigestCard
    card = DigestCard(
        schema="conn",
        source_name="conn.log",
        data_window=_WINDOW,
        record_count=100,
        histogram_counts=[],
        histogram_unit="hr",
        histogram_peak=0,
        zone1_extras=[("hosts", "5")],
        insights=[],
        fields=[],
        data_size_bytes=2048,
    )

    def _render_at(level: int) -> str:
        # One fresh stream per level so the outputs cannot bleed together.
        buffer = io.StringIO()
        TextHandler(stream=buffer, verbose_level=level).render_digest(card)
        return buffer.getvalue()

    at_zero, at_one, at_two = (_render_at(level) for level in (0, 1, 2))
    assert at_zero == at_one == at_two
967
+
968
+
969
def test_json_handler_serializes_findings_invariant_across_levels() -> None:
    """W3: the JSON payload holds all five findings at verbose levels 0, 1, 2."""
    import json as _json
    from loghunter.outputs.json import JsonHandler
    findings = [_bare_finding("misc", Severity.LOW, f"f-{i}") for i in range(5)]
    summary = RunSummary(
        data_window=_WINDOW, record_counts={}, data_size_bytes=0,
        detectors_run=["misc"], detectors_skipped={},
    )
    finding_counts = []
    for level in (0, 1, 2):
        buffer = io.StringIO()
        handler = JsonHandler(stream=buffer, verbose_level=level)
        handler.begin(summary)
        handler.write(findings)
        handler.end()
        payload = _json.loads(buffer.getvalue())
        finding_counts.append(len(payload["findings"]))
    # Same count at every level: the machine format drops nothing.
    assert finding_counts == [5, 5, 5]
988
+
989
+
990
def test_cr1_curated_evidence_accepts_numpy_scalars() -> None:
    """CR #1: evidence holding numpy scalars renders without raising.

    Background from the original review note: filtering evidence with
    ``value not in (None, [], {})`` breaks on numpy scalars, because
    ``value == []`` yields an empty array whose truth value raises
    ``ValueError``. This test feeds ``np.float64`` values through the level-1
    renderer and expects ``error_rate`` to appear in the output.
    """
    import numpy as np
    finding = _bare_finding("aws", Severity.MEDIUM, "attacker")
    finding.evidence = {
        "tier": "burst",
        "principal": "attacker",
        "error_rate": np.float64(0.5),
        "mean_rarity": np.float64(1.2),
        "new_actions": ["GetObject"],
        "new_services": ["s3"],
    }
    # Reaching the assertion at all means no ValueError escaped rendering.
    out = _capture_write(TextHandler(verbose_level=1), [finding])
    # Only the key's presence matters; the number's formatting is not pinned.
    assert "error_rate" in out
1012
+
1013
+
1014
def test_cr1_curated_evidence_includes_zero_numpy_value() -> None:
    """CR #1: a numpy scalar equal to zero is still rendered as evidence.

    A zero ``error_rate`` is a real measurement, not an empty value, so its
    key must appear in the level-1 output.
    """
    import numpy as np
    finding = _bare_finding("aws", Severity.LOW, "alice")
    finding.evidence = {
        "tier": "burst",
        "principal": "alice",
        "error_rate": np.float64(0.0),
        "new_actions": ["A"],
        "new_services": ["s3"],
        "mean_rarity": np.float64(0.7),
    }
    out = _capture_write(TextHandler(verbose_level=1), [finding])
    # error_rate=0.0 must survive into the rendered evidence.
    assert "error_rate" in out
1032
+
1033
+
1034
def test_cr2_syslog_preserves_chronological_order() -> None:
    """CR #2: syslog findings keep their incoming order in the output.

    An INFO reboot annotation placed between two MEDIUM events must still
    sit between them after rendering. A severity sort would push the INFO
    row behind both MEDIUMs and separate the reboot from its context.
    """
    def _rare_event(title: str) -> Finding:
        event = _bare_finding("syslog", Severity.MEDIUM, title)
        event.evidence = {"host": "router"}
        return event

    def _reboot_note(reboot_ts: str) -> Finding:
        note = _bare_finding("syslog", Severity.INFO, "reboot")
        note.evidence = {
            "host": "router",
            "reboot_ts": reboot_ts,
            "suppressed_window_seconds": 300,
        }
        return note

    timeline = [
        _rare_event("rare-event-1"),
        _reboot_note("REBOOT-MARKER"),
        _rare_event("rare-event-2"),
    ]
    out = _capture_write(TextHandler(verbose_level=0), timeline)
    # Positions of the three markers in the rendered text; the reboot marker
    # has to fall between the two rare events.
    idx_e1, idx_reboot, idx_e2 = (
        out.index(marker)
        for marker in ("rare-event-1", "REBOOT-MARKER", "rare-event-2")
    )
    assert idx_e1 < idx_reboot < idx_e2, (
        f"syslog rows must stay chronological — saw "
        f"e1={idx_e1} reboot={idx_reboot} e2={idx_e2}"
    )
1068
+
1069
+
1070
def test_cr2_non_syslog_detectors_still_severity_sort() -> None:
    """CR #2: detectors other than ``syslog`` keep the default severity sort.

    A ``misc`` group given LOW, HIGH, MEDIUM must render HIGH before MEDIUM
    before LOW.
    """
    # Distinctive titles, matching the sibling severity-sort test: a bare
    # word such as "low" can occur inside unrelated output text, and
    # ``str.index`` would then report that earlier position instead of the
    # finding row.
    fs = [
        _bare_finding("misc", Severity.LOW, "title-low"),
        _bare_finding("misc", Severity.HIGH, "title-high"),
        _bare_finding("misc", Severity.MEDIUM, "title-medium"),
    ]
    out = _capture_write(TextHandler(verbose_level=0), fs)
    assert (
        out.index("title-high")
        < out.index("title-medium")
        < out.index("title-low")
    )
1080
+
1081
+
1082
def test_cr4_aws_ranked_summary_survives_cap() -> None:
    """CR #4: the aws ``ranked_summary`` finding is rendered despite the cap.

    Sixty ``ranked`` findings plus one ``ranked_summary`` go through a
    handler capped at 10 per detector. The summary title must appear, and
    the disclosure must count 50 hidden rows, i.e. only the ranked ones.
    """
    def _ranked(name: str) -> Finding:
        row = _bare_finding("aws", Severity.LOW, name)
        row.evidence = {
            "tier": "ranked",
            "principal": name,
            "composite_z": 1.2,
            "error_rate": 0.05,
            "event_count": 120,
            "distinct_source_ip": 8,
        }
        return row

    summary = _bare_finding("aws", Severity.INFO, "no principals cleared the LOW band")
    summary.evidence = {
        "tier": "ranked_summary",
        "scorable_count": 10,
        "top_principal": "alpha",
        "top_composite_z": 0.5,
    }
    findings = [_ranked(f"p{i}") for i in range(60)]
    findings.append(summary)
    capped = TextHandler(verbose_level=0, max_findings_per_detector=10)
    out = _capture_write(capped, findings)
    # The summary row is still present.
    assert "no principals cleared the LOW band" in out
    # 60 ranked rows, 10 shown, so 50 hidden.
    assert "50 more not shown" in out
1111
+
1112
+
1113
def test_cr5_verbose_help_mentions_vv() -> None:
    """CR #5: generated verb help mentions ``-vv``.

    Checked for the empty verb name (the top-level analyze help, per the
    original note) and for the single-detector ``beacon`` verb.
    """
    from loghunter.cli import _render_verb_help
    for verb in ("", "beacon"):
        assert "-vv" in _render_verb_help(verb)
1123
+
1124
+
1125
def test_w5_cap_trips_on_flat_detector_with_disclosure() -> None:
    """W5: 203 findings against a cap of 100 produce a disclosure line and
    keep all three HIGH rows."""
    high_rows = [_bare_finding("misc", Severity.HIGH, f"high-{i}") for i in range(3)]
    low_rows = [_bare_finding("misc", Severity.LOW, f"low-{i}") for i in range(200)]
    capped = TextHandler(verbose_level=0, max_findings_per_detector=100)
    out = _capture_write(capped, high_rows + low_rows)
    assert "misc — 203 findings · 3 H 200 L" in out
    # Every HIGH row is retained.
    for i in range(3):
        assert f"high-{i}" in out
    # The disclosure states the hidden count and the cap. Per CR #3 it must
    # not say "by severity", a claim noted as false across sections.
    for fragment in ("103 more not shown", "showing first 100"):
        assert fragment in out
    assert "by severity" not in out
1145
+
1146
+
1147
def test_w5_cap_zero_means_unlimited() -> None:
    """W5: ``max_findings_per_detector=0`` switches the cap off."""
    findings = [_bare_finding("misc", Severity.LOW, f"low-{i}") for i in range(500)]
    uncapped = TextHandler(verbose_level=1, max_findings_per_detector=0)
    out = _capture_write(uncapped, findings)
    assert "more not shown" not in out
    # Spot-check the first, a middle and the last row.
    for i in (0, 250, 499):
        assert f"low-{i}" in out
1156
+
1157
+
1158
def test_w5_pre_cap_header_breakdown_regression() -> None:
    """W5 regression: the group header reports pre-cap totals.

    150 findings shaped 5 H · 25 M · 40 L · 80 I are rendered with a cap of
    100. The header must show the full count and full per-tier breakdown,
    not the 30 INFO rows that remain after trimming.
    """
    shape = (
        (Severity.HIGH, "h", 5),
        (Severity.MEDIUM, "m", 25),
        (Severity.LOW, "l", 40),
        (Severity.INFO, "i", 80),
    )
    findings = [
        _bare_finding("misc", tier, f"{prefix}-{i}")
        for tier, prefix, count in shape
        for i in range(count)
    ]
    capped = TextHandler(verbose_level=1, max_findings_per_detector=100)
    out = _capture_write(capped, findings)
    # Header carries the untrimmed numbers.
    assert "misc — 150 findings · 5 H 25 M 40 L 80 I" in out
    # 150 - 100 = 50 rows trimmed, and the disclosure says so.
    assert "50 more not shown" in out
1176
+
1177
+
1178
def test_w5_json_handler_ignores_cap() -> None:
    """W5: the JSON payload contains all 150 findings written to it."""
    import json as _json
    from loghunter.outputs.json import JsonHandler
    findings = [_bare_finding("misc", Severity.LOW, f"l-{i}") for i in range(150)]
    summary = RunSummary(
        data_window=_WINDOW, record_counts={}, data_size_bytes=0,
        detectors_run=["misc"], detectors_skipped={},
    )
    buffer = io.StringIO()
    handler = JsonHandler(stream=buffer, verbose_level=0)
    handler.begin(summary)
    handler.write(findings)
    handler.end()
    serialized = buffer.getvalue()
    payload = _json.loads(serialized)
    assert len(payload["findings"]) == 150
1195
+
1196
+
1197
def test_w5_per_detector_isolation() -> None:
    """W5: the cap applies per detector, so a noisy detector does not
    truncate a quiet one."""
    loud_rows = [_bare_finding("loud", Severity.MEDIUM, f"loud-{i}") for i in range(200)]
    quiet_rows = [_bare_finding("quiet", Severity.MEDIUM, f"quiet-{i}") for i in range(5)]
    capped = TextHandler(verbose_level=0, max_findings_per_detector=50)
    out = _capture_write(capped, loud_rows + quiet_rows)
    # "loud" exceeds the cap: 200 rows, 50 shown, 150 hidden.
    assert "loud — 200 findings · 200 M" in out
    assert "150 more not shown" in out
    # "quiet" stays under the cap and shows all five rows.
    assert "quiet — 5 findings · 5 M" in out
    for i in range(5):
        assert f"quiet-{i}" in out
1213
+
1214
+
1215
def test_empty_level_visible_detector_renders_no_header() -> None:
    """An all-LOW duration group at level 0 renders no group header at all.

    The output still contains exactly two double-line rules (the run-summary
    banner's, per the original note) and no single-line group-header rule.
    """
    findings = [_bare_finding("duration", Severity.LOW, f"low-{i}") for i in range(3)]
    out = _capture_write(TextHandler(verbose_level=0), findings)
    assert "duration —" not in out
    double_rule = "═" * 80
    single_rule = "─" * 80
    # Two double rules around the banner; no single rule may be added for a
    # group that has nothing visible.
    assert out.count(double_rule) == 2, "expected exactly 2 banner (double) rules"
    assert out.count(single_rule) == 0, "no single group-header rule for empty group"