loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,1189 @@
1
+ """Tests for the digest verb and the conn schema summariser.
2
+
3
+ Covers:
4
+ - cliff statistic (gate, population floor, rank2=0)
5
+ - the four conn slots (host involvement, internal/external endpoint rules)
6
+ - histogram adaptive binning + axis label + empty-frame fallback
7
+ - mechanical lede derivation (sorted by raw slot.ratio, never by parsing cells)
8
+ - text renderer (order of zones, scale anchor, axis label)
9
+ - allowlist non-invocation (architectural fork)
10
+ - default-window paths for all three boundedness states
11
+ - CLI dispatch and whitelist enforcement
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import io
17
+ import json
18
+ from datetime import datetime, timedelta, timezone
19
+ from pathlib import Path
20
+ from typing import Any
21
+
22
+ import pandas as pd
23
+ import pytest
24
+
25
+ import loghunter.runner as runner
26
+ from loghunter.common.finding import DigestCard, DigestSlot, RunSummary
27
+ from loghunter.digest import conn as conn_digest
28
+ from loghunter.digest import _stats
29
+ from loghunter.outputs.text import (
30
+ TextHandler,
31
+ _bar_glyph,
32
+ _format_count,
33
+ _render_histogram,
34
+ )
35
+
36
+
37
def _conn_insights_and_fields(slots: list[DigestSlot]) -> tuple[list[str], list[DigestSlot]]:
    """Run the shared insight/field selection over ``slots`` with conn's formatters.

    Thin test adapter: it forwards to ``_stats.select_insights_and_fields``
    using ``conn_digest._INSIGHT_FORMATTERS`` and stands in for the removed
    ``conn_digest._build_ledes``.
    """
    formatters = conn_digest._INSIGHT_FORMATTERS
    selection = _stats.select_insights_and_fields(slots, formatters)
    return selection
41
+
42
+
43
+ # ─── Fixtures ────────────────────────────────────────────────────────────────
44
+
45
# Fixed, timezone-aware reference instant that the fixtures below are built from.
_NOW = datetime(year=2026, month=6, day=11, hour=12, minute=0, tzinfo=timezone.utc)
# The same instant expressed as POSIX epoch seconds (default ``ts`` for rows).
_BASE_TS = _NOW.timestamp()
47
+
48
+
49
def _conn_row(
    src: str = "10.0.0.10",
    dst: str = "192.0.2.20",
    port: int = 443,
    proto: str = "tcp",
    ts: float = _BASE_TS,
    bytes_: float | None = 1000,
    conn_state: str | None = "SF",
    local_orig: bool | None = True,
) -> dict:
    """Return one canonical conn row as a plain dict.

    With no arguments the row describes an RFC1918 source talking to an
    RFC 5737 destination over TCP/443, carrying 1000 originator bytes with
    ``local_orig=True``. Any column can be overridden by keyword; note that
    the ``bytes_`` parameter is stored under the ``"bytes"`` key.
    """
    column_names = ("src", "dst", "port", "proto", "ts", "bytes", "conn_state", "local_orig")
    column_values = (src, dst, port, proto, ts, bytes_, conn_state, local_orig)
    return dict(zip(column_names, column_values))
74
+
75
+
76
def _conn_df(rows: list[dict]) -> pd.DataFrame:
    """Assemble row dicts into a DataFrame with the canonical conn columns.

    An empty ``rows`` list yields a zero-row frame that still carries all
    eight column names.
    """
    columns = ["src", "dst", "port", "proto", "ts", "bytes", "conn_state", "local_orig"]
    return pd.DataFrame(rows, columns=columns) if rows else pd.DataFrame(columns=columns)
82
+
83
+
84
def _run_summary(window: tuple[datetime, datetime] = (_NOW - timedelta(days=1), _NOW)) -> RunSummary:
    """Build a minimal ``RunSummary`` for a single zeek conn source.

    ``window`` becomes ``data_window``; by default it is the 24 hours ending
    at ``_NOW``. Every other field is a fixed placeholder value.
    """
    record_counts = {"conn*.log*": 100}
    data_sources = ["zeek_conn"]
    return RunSummary(
        data_window=window,
        record_counts=record_counts,
        data_size_bytes=0,
        detectors_run=[],
        detectors_skipped={},
        notes=[],
        data_sources=data_sources,
    )
94
+
95
+
96
def _write_conn_ndjson(path: Path, rows: list[dict]) -> None:
    """Write canonical conn rows to ``path`` as Zeek-shaped NDJSON.

    Missing parent directories are created. Each row is renamed to Zeek's
    field names (``id.orig_h``, ``id.resp_h``, ``id.resp_p``, ``orig_bytes``);
    the optional fields are left out of a record when the row's value is
    absent or ``None``. One JSON object is written per line, followed by a
    trailing newline.
    """
    # (zeek field name, canonical row key) for the fields that may be omitted.
    optional_fields = (
        ("orig_bytes", "bytes"),
        ("conn_state", "conn_state"),
        ("local_orig", "local_orig"),
    )

    def _to_zeek(row: dict) -> dict:
        record = {
            "ts": row["ts"],
            "id.orig_h": row["src"],
            "id.resp_h": row["dst"],
            "id.resp_p": row["port"],
            "proto": row["proto"],
        }
        for zeek_key, row_key in optional_fields:
            if row.get(row_key) is not None:
                record[zeek_key] = row[row_key]
        return record

    path.parent.mkdir(parents=True, exist_ok=True)
    encoded = [json.dumps(_to_zeek(row)) for row in rows]
    path.write_text("\n".join(encoded) + "\n", encoding="utf-8")
115
+
116
+
117
+ # ─── Cliff statistic ─────────────────────────────────────────────────────────
118
+
119
def test_cliff_dashes_below_population_floor() -> None:
    """A three-entry ranking is expected to make ``_cliff`` return ``None``."""
    tallies = pd.Series({"a": 100, "b": 10, "c": 5})
    ranked = tallies.sort_values(ascending=False)
    assert conn_digest._cliff(ranked) is None
122
+
123
+
124
def test_cliff_dashes_below_gate() -> None:
    """Six entries with rank1/rank2 = 15/10 = 1.5 (< 2.0): ``_cliff`` returns ``None``."""
    tallies = pd.Series(dict(zip("abcdef", (15, 10, 9, 8, 7, 6))))
    ranked = tallies.sort_values(ascending=False)
    assert conn_digest._cliff(ranked) is None
127
+
128
+
129
def test_cliff_names_rank1_when_speaking() -> None:
    """With 40 against 10 the cliff speaks: it names ``a``, magnitude 40, ratio 4."""
    tallies = pd.Series(dict(zip("abcdef", (40, 10, 9, 8, 7, 6))))
    outcome = conn_digest._cliff(tallies.sort_values(ascending=False))
    assert outcome is not None
    top_entity, top_magnitude, top_ratio = outcome
    assert top_entity == "a"
    assert top_magnitude == 40.0
    assert top_ratio == pytest.approx(4.0)
137
+
138
+
139
def test_cliff_handles_rank2_zero() -> None:
    """A zero runner-up value must yield ``None`` from ``_cliff``."""
    tallies = pd.Series(dict(zip("abcde", (10, 0, 0, 0, 0))))
    ranked = tallies.sort_values(ascending=False)
    assert conn_digest._cliff(ranked) is None
142
+
143
+
144
+ # ─── conn-share semantics ────────────────────────────────────────────────────
145
+
146
def test_conn_share_counts_host_involvement_across_src_and_dst() -> None:
    """A host seen only as a destination still counts toward involvement.

    Five distinct sources (10.0.0.10 to 10.0.0.14) each connect once to
    10.0.0.50, which never appears as a source.
    """
    shared_dst = "10.0.0.50"
    frame = _conn_df(
        [_conn_row(src=f"10.0.0.{octet}", dst=shared_dst) for octet in range(10, 15)]
    )
    slot = conn_digest._slot_conn_share(frame)
    # Five sources plus the common destination make six hosts, so the
    # population floor is met and the slot speaks.
    assert slot.cells is not None
    assert slot.entity == "10.0.0.50"
    # The destination is involved in 5 of 5 rows, i.e. 100%.
    assert slot.magnitude == pytest.approx(100.0)
    # rank1 = 5 involvements; every other host has exactly 1.
    assert slot.ratio == pytest.approx(5.0)
164
+
165
+
166
def test_conn_share_speaks_with_dominant_host() -> None:
    """Ten flows from one source against a single other flow make the slot speak."""
    dominant_flows = [
        _conn_row(src="10.0.0.50", dst=f"192.0.2.{n}") for n in range(10)
    ]
    lone_flow = _conn_row(src="10.0.0.11", dst="192.0.2.11")
    slot = conn_digest._slot_conn_share(_conn_df(dominant_flows + [lone_flow]))
    assert slot.cells is not None
    assert slot.entity == "10.0.0.50"
    identity_cell, share_cell, ratio_cell = slot.cells[0], slot.cells[1], slot.cells[2]
    assert identity_cell == "10.0.0.50"
    assert "%" in share_cell
    assert ratio_cell.endswith("x")
    # The raw cliff ratio is carried on the slot for lede sorting.
    assert slot.ratio is not None
    assert slot.ratio >= 2.0
178
+
179
+
180
def test_conn_share_dashes_on_flat_pile() -> None:
    """Five one-off flows between distinct hosts leave every slot field unset."""
    flat_rows = [
        _conn_row(src=f"10.0.0.{n}", dst=f"192.0.2.{n}") for n in range(10, 15)
    ]
    slot = conn_digest._slot_conn_share(_conn_df(flat_rows))
    for field_name in ("cells", "entity", "magnitude", "ratio"):
        assert getattr(slot, field_name) is None
188
+
189
+
190
+ # ─── densest-tuple, fan-out ──────────────────────────────────────────────────
191
+
192
def test_densest_tuple_speaks_with_dominant_flow() -> None:
    """Twenty repeats of one src/dst/port tuple dominate five one-off flows."""
    repeated = [_conn_row(src="10.0.0.10", dst="10.0.0.1", port=22) for _ in range(20)]
    background = [
        _conn_row(src=f"10.0.0.{octet}", dst="192.0.2.99", port=443)
        for octet in range(20, 25)
    ]
    slot = conn_digest._slot_densest_tuple(_conn_df(repeated + background))
    expected_label = "10.0.0.10 → 10.0.0.1:22"
    assert slot.cells is not None
    assert slot.entity == expected_label
    assert slot.cells[0] == expected_label
201
+
202
+
203
def test_fan_out_speaks_with_dominant_source() -> None:
    """One source reaching twenty destinations on port 53 wins the fan-out slot."""
    wide_source = [
        _conn_row(src="10.0.0.53", dst=f"192.0.2.{n}", port=53) for n in range(20)
    ]
    narrow_sources = [
        _conn_row(src=f"10.0.0.{octet}", dst="198.51.100.1", port=80)
        for octet in range(100, 105)
    ]
    slot = conn_digest._slot_fan_out(_conn_df(wide_source + narrow_sources))
    assert slot.cells is not None
    assert slot.entity == "10.0.0.53:53"
    assert "dsts" in slot.cells[1]
215
+
216
+
217
+ # ─── byte-direction: internal/external endpoint rules ───────────────────────
218
+
219
def test_byte_direction_requires_internal_src_and_external_dst() -> None:
    """Only internal-to-external flows count as outbound.

    The frame mixes one internal-to-internal flow, one external-to-internal
    flow, and a single genuinely outbound flow. With one outbound
    destination the population floor (5) is not met, so the slot must dash.
    """
    internal_to_internal = _conn_row(src="10.0.0.10", dst="10.0.0.20", bytes_=10_000)
    external_to_internal = _conn_row(
        src="198.51.100.1", dst="10.0.0.10", bytes_=10_000, local_orig=False
    )
    internal_to_external = _conn_row(src="10.0.0.10", dst="192.0.2.1", bytes_=10_000)
    frame = _conn_df([internal_to_internal, external_to_internal, internal_to_external])
    slot = conn_digest._slot_byte_direction(frame)
    assert slot.cells is None
232
+
233
+
234
def test_byte_direction_uses_local_orig_when_present() -> None:
    """``local_orig=True`` marks a public-looking source as internal.

    The 50k-byte destination must dominate the six 1k-byte destinations.
    """
    public_src = "203.0.113.10"
    heavy = _conn_row(src=public_src, dst="192.0.2.50", local_orig=True, bytes_=50_000)
    light = [
        _conn_row(src=public_src, dst=f"198.51.100.{octet}", local_orig=True, bytes_=1_000)
        for octet in range(1, 7)
    ]
    slot = conn_digest._slot_byte_direction(_conn_df([heavy, *light]))
    assert slot.cells is not None
    assert slot.entity == "192.0.2.50"
247
+
248
+
249
def test_byte_direction_local_orig_false_excludes_rfc1918_src() -> None:
    """``local_orig=False`` overrides an RFC1918 source address.

    The 999_999-byte row to 192.0.2.50 is therefore not outbound and must
    not win; six outbound rows with varied byte counts provide a clear
    rank-1 elsewhere.
    """
    excluded = _conn_row(
        src="10.0.0.10", dst="192.0.2.50", local_orig=False, bytes_=999_999
    )
    outbound_sizes = [10_000, 1_000, 500, 200, 100, 50]
    outbound = [
        _conn_row(src="10.0.0.11", dst=f"198.51.100.{octet}", local_orig=True, bytes_=size)
        for octet, size in enumerate(outbound_sizes, start=1)
    ]
    slot = conn_digest._slot_byte_direction(_conn_df([excluded, *outbound]))
    assert slot.cells is not None
    assert slot.entity == "198.51.100.1"
    assert slot.entity != "192.0.2.50"
265
+
266
+
267
def test_byte_direction_falls_back_to_rfc1918_when_local_orig_nan() -> None:
    """Without ``local_orig`` the RFC1918 status of the source decides direction.

    An RFC1918 source with external destinations is outbound, so the
    50k-byte destination dominates six 1k-byte ones. 50k/1k = 50.0 sits
    exactly on CLIFF_DISPLAY_CAP: the rendered cell is capped while
    ``slot.ratio`` keeps the raw float, which pins the separation of
    display from storage at a realistic call site.
    """
    internal_src = "10.0.0.10"
    heavy = _conn_row(src=internal_src, dst="192.0.2.50", local_orig=None, bytes_=50_000)
    light = [
        _conn_row(src=internal_src, dst=f"198.51.100.{octet}", local_orig=None, bytes_=1_000)
        for octet in range(1, 7)
    ]
    slot = conn_digest._slot_byte_direction(_conn_df([heavy, *light]))
    assert slot.cells is not None
    assert slot.entity == "192.0.2.50"
    # Raw ratio is preserved on the slot; only the rendered cell is capped.
    assert slot.ratio == pytest.approx(50.0)
    assert slot.cells[2] == ">50x"
286
+
287
+
288
def test_byte_direction_treats_nan_bytes_as_zero() -> None:
    """A row with missing bytes contributes 0 and cannot take rank 1.

    192.0.2.50 carries no byte count; five other outbound rows with varied
    sizes provide a clear rank-1 elsewhere.
    """
    byteless = _conn_row(src="10.0.0.10", dst="192.0.2.50", bytes_=None)
    sized = [
        _conn_row(src="10.0.0.10", dst=f"198.51.100.{octet}", bytes_=size)
        for octet, size in enumerate([50_000, 1_000, 500, 200, 100], start=1)
    ]
    slot = conn_digest._slot_byte_direction(_conn_df([byteless, *sized]))
    assert slot.cells is not None
    assert slot.entity == "198.51.100.1"
    assert slot.entity != "192.0.2.50"
302
+
303
+
304
+ # ─── Zone-1 extras ───────────────────────────────────────────────────────────
305
+
306
def test_zone1_split_uses_rfc1918_per_host_not_local_orig() -> None:
    """A destination-only RFC1918 host must be counted as internal.

    10.0.0.50 appears only as a destination, so ``local_orig`` says nothing
    about it; its address alone must classify it as internal.
    """
    inbound = _conn_row(src="198.51.100.5", dst="10.0.0.50", local_orig=False)
    body = conn_digest.summarize(_conn_df([inbound]))
    # The first zone1_extras entry is the combined "hosts" line.
    first_extra = body["zone1_extras"][0]
    label, value = first_extra
    assert label == "hosts"
    assert "1 internal" in value
319
+
320
+
321
def test_zone1_byte_totals_outbound_and_inbound() -> None:
    """Outbound and inbound byte totals are reported on separate extras lines."""
    outbound = _conn_row(src="10.0.0.10", dst="192.0.2.1", bytes_=1000, local_orig=True)
    inbound = _conn_row(src="198.51.100.5", dst="10.0.0.10", bytes_=500, local_orig=False)
    body = conn_digest.summarize(_conn_df([outbound, inbound]))
    extras = dict(body["zone1_extras"])
    assert extras["outbound bytes"] == "1000 B"
    assert extras["inbound bytes"] == "500 B"
331
+
332
+
333
+ # ─── Histogram ───────────────────────────────────────────────────────────────
334
+
335
def test_histogram_picks_hourly_for_short_span() -> None:
    """A 24-hour window with one event per hour yields 24 hourly bins of height 1."""
    window_open = datetime(2026, 6, 11, tzinfo=timezone.utc)
    window_close = window_open + timedelta(hours=24)
    one_hour = timedelta(hours=1)
    stamps = pd.Series([(window_open + n * one_hour).timestamp() for n in range(24)])
    counts, unit, peak = runner._compute_histogram(stamps, (window_open, window_close))
    assert unit == "hr"
    assert len(counts) == 24
    assert peak == 1
345
+
346
+
347
def test_histogram_picks_daily_for_long_span() -> None:
    """A 30-day window with one noon event per day yields 30 daily bins of height 1."""
    window_open = datetime(2026, 5, 1, tzinfo=timezone.utc)
    window_close = window_open + timedelta(days=30)
    noon = timedelta(hours=12)
    stamps = pd.Series(
        [(window_open + timedelta(days=day) + noon).timestamp() for day in range(30)]
    )
    counts, unit, peak = runner._compute_histogram(stamps, (window_open, window_close))
    assert unit == "day"
    assert len(counts) == 30
    assert peak == 1
357
+
358
+
359
def test_histogram_peak_reflects_max_bin() -> None:
    """Five events inside hour 1 of a 4-hour window give bin 1 and the peak a value of 5."""
    window_open = datetime(2026, 6, 11, tzinfo=timezone.utc)
    window_close = window_open + timedelta(hours=4)
    hour_one = window_open + timedelta(hours=1)
    stamps = pd.Series(
        [(hour_one + timedelta(minutes=minute)).timestamp() for minute in (1, 2, 3, 4, 5)]
    )
    counts, _unit, peak = runner._compute_histogram(stamps, (window_open, window_close))
    assert peak == 5
    assert counts[1] == 5
369
+
370
+
371
def test_histogram_zero_span_single_record_emits_one_bin() -> None:
    """When min-ts equals max-ts the histogram is one bin, not the empty fallback.

    This covers a single event as well as several events sharing one
    timestamp. Regression for the zero-span defect: an earlier
    implementation returned `[], "hr", 0` whenever start == end, silently
    discarding non-empty timestamps.
    """
    shared_ts = datetime(2026, 6, 11, 12, 0, tzinfo=timezone.utc).timestamp()
    stamps = pd.Series([shared_ts] * 3)
    instant = datetime.fromtimestamp(shared_ts, tz=timezone.utc)
    counts, unit, peak = runner._compute_histogram(stamps, (instant, instant))
    assert counts == [3]
    assert peak == 3
    assert unit == "hr"
385
+
386
+
387
def test_histogram_right_edge_event_lands_in_final_bin() -> None:
    """An event sitting exactly on ``data_window[1]`` belongs to the last bin.

    Regression for the half-open-window defect: when the span is an exact
    multiple of the bin width (24 hours with hourly bins here), an earlier
    implementation discarded offsets equal to the bin count and so
    undercounted the most recent bin. Because ``data_window`` is derived
    from min(ts)/max(ts), the max-ts event is on the right edge by
    construction and must be kept.
    """
    window_open = datetime(2026, 6, 11, tzinfo=timezone.utc)
    window_close = window_open + timedelta(hours=24)  # exact 24h, so 24 bins
    edge_events = [window_open.timestamp(), window_close.timestamp()]
    counts, unit, peak = runner._compute_histogram(
        pd.Series(edge_events), (window_open, window_close)
    )
    assert unit == "hr"  # pins the hourly binning branch
    assert len(counts) == 24
    assert counts[0] == 1
    assert counts[-1] == 1  # the right-edge event is in the final bin
    assert peak == 1
    assert sum(counts) == 2  # nothing was dropped
407
+
408
+
409
def test_histogram_caps_long_span_to_max_bins() -> None:
    """219 raw daily bins are folded so the one-line renderer fits the terminal."""
    total_days = 219
    window_open = datetime(2026, 1, 1, tzinfo=timezone.utc)
    window_close = window_open + timedelta(days=total_days)
    noon = timedelta(hours=12)
    stamps = pd.Series(
        [(window_open + timedelta(days=day) + noon).timestamp() for day in range(total_days)]
    )
    counts, unit, _peak = runner._compute_histogram(stamps, (window_open, window_close))
    assert unit == "day"  # the label stays nominal even when bins are folded
    assert len(counts) <= runner._HISTOGRAM_MAX_BINS
    # group_size = ceil(219 / 60) = 4, giving ceil(219 / 4) = 55 folded buckets.
    assert len(counts) == 55
422
+
423
+
424
def test_histogram_downsampling_preserves_total_event_count() -> None:
    """Summing adjacent bins loses nothing: all 219 raw events survive the fold."""
    total_days = 219
    window_open = datetime(2026, 1, 1, tzinfo=timezone.utc)
    window_close = window_open + timedelta(days=total_days)
    noon = timedelta(hours=12)
    stamps = pd.Series(
        [(window_open + timedelta(days=day) + noon).timestamp() for day in range(total_days)]
    )
    counts, _unit, _peak = runner._compute_histogram(stamps, (window_open, window_close))
    assert sum(counts) == 219
434
+
435
+
436
def test_histogram_peak_reflects_post_fold_bucket() -> None:
    """The peak is taken after folding, matching what the tallest glyph shows.

    Fixture: a 219-day span, which forces daily binning and a cap to 55
    buckets at group_size=4. Days 0-3 carry 3 events each and days 4-218
    carry 1 event each, so the largest single-day raw count is 3 while the
    first folded bucket sums to 12 (3+3+3+3).
    """
    window_open = datetime(2026, 1, 1, tzinfo=timezone.utc)
    window_close = window_open + timedelta(days=219)
    noon = timedelta(hours=12)
    events_per_day = [3] * 4 + [1] * 215
    events: list[float] = [
        (window_open + timedelta(days=day) + noon).timestamp()
        for day, how_many in enumerate(events_per_day)
        for _ in range(how_many)
    ]
    raw_max_single_bin = 3  # largest single-day raw count by construction
    counts, _unit, peak = runner._compute_histogram(
        pd.Series(events), (window_open, window_close)
    )
    assert peak == max(counts)
    assert peak > raw_max_single_bin
    assert peak == 12  # days 0-3 fold into bucket 0
459
+
460
+
461
def test_histogram_short_span_unchanged_by_cap() -> None:
    """Spans producing at most 60 raw bins come back untouched by the cap.

    Pins the exact pre-cap values so the common case is locked down
    concretely instead of loosely mirroring the daily-switch test.
    """
    window_open = datetime(2026, 5, 1, tzinfo=timezone.utc)
    window_close = window_open + timedelta(days=30)
    noon = timedelta(hours=12)
    stamps = pd.Series(
        [(window_open + timedelta(days=day) + noon).timestamp() for day in range(30)]
    )
    counts, unit, peak = runner._compute_histogram(stamps, (window_open, window_close))
    assert unit == "day"
    assert counts == [1] * 30  # exact pre-cap values, no folding
    assert peak == 1
475
+
476
+
477
def test_histogram_empty_frame_renders_no_events_line() -> None:
    """An empty count list renders the "no events in window" message."""
    assert "no events in window" in _render_histogram([], "hr", 0)
480
+
481
+
482
def test_render_histogram_carries_axis_unit_label() -> None:
    """The rendered histogram names its bin unit for both "hr" and "day"."""
    sample_counts = [1, 2, 3]
    expected_label_by_unit = {"hr": "hourly bins", "day": "daily bins"}
    for unit, expected_label in expected_label_by_unit.items():
        assert expected_label in _render_histogram(sample_counts, unit, 3)
487
+
488
+
489
def test_render_histogram_carries_scale_anchor() -> None:
    """The rendered histogram includes a "peak: N" scale anchor."""
    assert "peak: 5" in _render_histogram([1, 5, 3], "hr", 5)
492
+
493
+
494
def test_bar_glyph_low_and_high() -> None:
    """Pin the glyphs at both ends of the scale and bound the midpoint glyph.

    Zero maps to the lowest bar, the maximum maps to the full block, and
    the midpoint must be exactly one of the three middle glyphs.
    """
    assert _bar_glyph(0, 10) == "▁"
    assert _bar_glyph(10, 10) == "█"
    # Tuple membership on purpose: `x in "▃▄▅"` is a substring test, which
    # would also pass for an empty string or a multi-glyph result.
    assert _bar_glyph(5, 10) in ("▃", "▄", "▅")
498
+
499
+
500
def test_format_count_thresholds() -> None:
    """Counts render bare, with a ``k`` suffix, or with an ``M`` suffix."""
    expected_by_count = {
        42: "42",
        1500: "1.5k",
        14_200: "14.2k",
        3_400_000: "3.4M",
    }
    for count, expected in expected_by_count.items():
        assert _format_count(count) == expected
505
+
506
+
507
+ # ─── Ledes ───────────────────────────────────────────────────────────────────
508
+
509
def test_insights_silent_on_flat_pile() -> None:
    """Five one-off flows between distinct hosts produce no insights."""
    flat_rows = [
        _conn_row(src=f"10.0.0.{n}", dst=f"192.0.2.{n}") for n in range(5)
    ]
    summary = conn_digest.summarize(_conn_df(flat_rows))
    assert summary["insights"] == []
514
+
515
+
516
def test_insights_sort_by_slot_ratio_not_cell_string() -> None:
    """Insights verbalize the top three slots ordered by raw ``ratio``, descending.

    Four hand-built speaking slots with distinct ratios go through the
    shared selection helper.
    """
    # (label, cells, entity, magnitude, ratio)
    slot_specs = [
        ("conn-share", ["A", "10%", "2.0x"], "A", 10.0, 2.0),
        ("densest-tuple", ["B → C:1", "5", "5.0x"], "B → C:1", 5.0, 5.0),
        ("fan-out", ["D:2", "8 dsts", "4.0x"], "D:2", 8.0, 4.0),
        ("byte-direction", ["E", "30%", "3.0x"], "E", 30.0, 3.0),
    ]
    slots = [
        DigestSlot(label=label, statistic="cliff", cells=cells,
                   entity=entity, magnitude=magnitude, ratio=ratio)
        for label, cells, entity, magnitude, ratio in slot_specs
    ]
    insights, _ = _conn_insights_and_fields(slots)
    assert len(insights) == 3
    # Ratio-descending order: densest-tuple (5.0), fan-out (4.0), byte-direction (3.0).
    first, second, third = insights[0], insights[1], insights[2]
    assert "B → C:1" in first
    assert "D:2" in second
    # The byte-direction lede must not start with the vestigial " → " glyph:
    # that slot now stores the bare dst, and only densest-tuple owns the
    # between-endpoints arrow.
    assert third.startswith("E ")
    assert "→" not in third
539
+
540
+
541
def test_insights_verbalize_identity_and_magnitude() -> None:
    """A lone speaking slot yields one line with its entity and magnitude."""
    only_slot = DigestSlot(
        label="densest-tuple",
        statistic="cliff",
        cells=["X → Y:22", "482", "3.7x"],
        entity="X → Y:22",
        magnitude=482.0,
        ratio=3.7,
    )
    insights, _ = _conn_insights_and_fields([only_slot])
    assert len(insights) == 1
    sentence = insights[0]
    assert "X → Y:22" in sentence
    assert "482" in sentence
    # The raw statistic name must never surface in the prose.
    lowered = sentence.lower()
    assert "cliff" not in lowered
    assert "rank1" not in lowered
553
+
554
+
555
+ # ─── Display cap ─────────────────────────────────────────────────────────────
556
+
557
def test_format_ratio_cell_below_cap_renders_one_decimal() -> None:
    """Ratios below the cap render as a one-decimal ``N.Nx`` cell."""
    for ratio, expected in ((3.7, "3.7x"), (49.9, "49.9x")):
        assert conn_digest._format_ratio_cell(ratio) == expected
560
+
561
+
562
def test_format_ratio_cell_at_or_above_cap_renders_capped_form() -> None:
    """Ratios at or above the cap render as ``>50x``; the boundary is inclusive."""
    for ratio in (50.0, 625000.0):
        assert conn_digest._format_ratio_cell(ratio) == ">50x"
566
+
567
+
568
def test_format_ratio_lede_below_cap_renders_one_decimal() -> None:
    """Ratios below the cap render in ledes as a one-decimal ``N.Nx``."""
    for ratio, expected in ((3.7, "3.7x"), (49.9, "49.9x")):
        assert conn_digest._format_ratio_lede(ratio) == expected
571
+
572
+
573
def test_format_ratio_lede_at_or_above_cap_renders_prose_form() -> None:
    """Ratios at or above the cap render in ledes as "more than 50x"."""
    for ratio in (50.0, 625000.0):
        assert conn_digest._format_ratio_lede(ratio) == "more than 50x"
576
+
577
+
578
def test_ledes_sort_by_true_ratio_when_one_slot_is_capped() -> None:
    """Capping the displayed ratio must not disturb lede ordering.

    A slot whose huge raw ratio is rendered in capped form still has to
    outrank a slot whose smaller ratio is rendered literally. This checks
    that the stored ``slot.ratio`` (raw float, drives the sort) is kept
    apart from the rendered string (capped at CLIFF_DISPLAY_CAP).
    """
    huge_ratio_slot = DigestSlot(
        label="byte-direction",
        statistic="cliff",
        cells=["A", "100%", ">50x"],
        entity="A",
        magnitude=100.0,
        ratio=625000.0,
    )
    small_ratio_slot = DigestSlot(
        label="densest-tuple",
        statistic="cliff",
        cells=["B → C:22", "9", "5.0x"],
        entity="B → C:22",
        magnitude=9.0,
        ratio=5.0,
    )
    # The smaller slot goes in first on purpose, so the outcome shows
    # sorting rather than input order.
    insights, _ = _conn_insights_and_fields([small_ratio_slot, huge_ratio_slot])
    assert len(insights) == 2
    leading, trailing = insights[0], insights[1]
    # The capped slot (raw 625000) sorts first by its true ratio.
    assert leading.startswith("A ")
    assert "more than 50x" in leading
    assert "625000" not in leading  # the raw number must not leak into the prose
    # The uncapped slot follows and is rendered literally.
    assert "B → C:22" in trailing
    assert "5.0x" in trailing
606
+
607
+
608
+ # ─── Summariser shape ────────────────────────────────────────────────────────
609
+
610
def test_summarizer_returns_zone1_insights_fields_keys() -> None:
    """``summarize`` returns exactly the three expected top-level keys."""
    summary = conn_digest.summarize(_conn_df([_conn_row()]))
    returned_keys = set(summary.keys())
    assert returned_keys == {"zone1_extras", "insights", "fields"}
614
+
615
+
616
def test_summarizer_zone1_extras_lead_with_hosts() -> None:
    """The first ``zone1_extras`` entry is labelled "hosts"."""
    summary = conn_digest.summarize(_conn_df([_conn_row()]))
    leading_entry = summary["zone1_extras"][0]
    assert leading_entry[0] == "hosts"
620
+
621
+
622
+ # ─── Renderer (flat shape) ──────────────────────────────────────────────────
623
+
624
def _render_card(card: DigestCard) -> str:
    """Render ``card`` through a ``TextHandler`` and return the captured text.

    The handler writes into an in-memory buffer owned by this helper. The
    buffer is read back through the local reference rather than the
    handler's private ``_stream`` attribute, so the helper does not depend
    on ``TextHandler`` internals.
    """
    buffer = io.StringIO()
    handler = TextHandler(stream=buffer)
    handler.render_digest(card)
    return buffer.getvalue()
628
+
629
+
630
def _empty_card() -> DigestCard:
    """Return a conn ``DigestCard`` with no records, insights, or fields.

    The window is the 24 hours ending at ``_NOW``; the histogram is empty
    and the zone-1 extras carry zeroed values.
    """
    window = (_NOW - timedelta(days=1), _NOW)
    zeroed_extras = [
        ("hosts", "0"),
        ("outbound bytes", "0 B"),
        ("inbound bytes", "0 B"),
    ]
    return DigestCard(
        schema="conn",
        source_name="conn.log",
        data_window=window,
        record_count=0,
        histogram_counts=[],
        histogram_unit="hr",
        histogram_peak=0,
        zone1_extras=zeroed_extras,
        insights=[],
        # Non-speaking slots are filtered out before they reach the card.
        fields=[],
    )
643
+
644
+
645
def test_render_digest_identity_then_ambient() -> None:
    """An empty card renders identity lines, then the ambient block, with no legacy chrome."""
    text = _render_card(_empty_card())
    rows = text.splitlines()
    # Identity line 1 is the source name; identity line 3 is
    # "schema · N lines · size" (line 2, the window, sits between them).
    assert rows[0] == "conn.log"
    assert rows[2].startswith("conn · 0 lines ·")
    # The ambient block is label-aligned and flush-left.
    hosts_rows = [row for row in rows if row.startswith("hosts:")]
    assert hosts_rows
    # No banner, no schema rule, no N.B. footer.
    for forbidden in ("LogHunter", "── digest", "N.B."):
        assert forbidden not in text
658
+
659
+
660
def test_render_digest_non_speaking_slots_are_filtered_by_summariser() -> None:
    """Slot selection is the summariser's job, not the renderer's.

    A non-speaking slot never lands in ``card.fields``, so the renderer —
    which prints only what it is handed — must not emit a ``label: -`` row
    for it under the flat grammar.
    """
    text = _render_card(_empty_card())
    for absent_label in ("conn-share:", "fan-out:"):
        assert absent_label not in text
667
+
668
+
669
def test_render_digest_field_block_shows_cells_for_speaking_non_insight_slot() -> None:
    """A speaking slot placed in ``fields`` renders as ``label: cell cell cell``."""
    entity = "X → Y:22"
    speaking_slot = DigestSlot(
        label="densest-tuple",
        statistic="cliff",
        cells=[entity, "482", "3.7x"],
        entity=entity,
        magnitude=482.0,
        ratio=3.7,
    )
    window_end = _NOW
    card_kwargs = {
        "schema": "conn",
        "source_name": "conn.log",
        "data_window": (window_end - timedelta(days=1), window_end),
        "record_count": 10,
        "histogram_counts": [1, 2, 3],
        "histogram_unit": "hr",
        "histogram_peak": 3,
        "zone1_extras": [("hosts", "1")],
        "insights": [],
        "fields": [speaking_slot],
        "data_size_bytes": 0,
    }
    text = _render_card(DigestCard(**card_kwargs))
    assert "densest-tuple: X → Y:22 482 3.7x" in text
690
+
691
+
692
+ # ─── Architectural fork: allowlist non-invocation ────────────────────────────
693
+
694
def test_run_digest_does_not_call_allowlist(tmp_path: Path, monkeypatch) -> None:
    """The digest path must stay clear of the allowlist machinery.

    Both ``build_matcher`` and ``AllowlistMatcher.filter_df`` are replaced
    with stand-ins that raise; the digest run has to finish without
    triggering either of them.
    """
    from loghunter.common import allowlist as allowlist_mod

    zeek_dir = tmp_path / "zeek"
    hourly_rows = [
        _conn_row(
            src="10.0.0.10",
            dst="192.0.2.50",
            ts=_BASE_TS - 3600 * hours_back,
            local_orig=True,
        )
        for hours_back in range(1, 7)
    ]
    _write_conn_ndjson(zeek_dir / "conn.log", hourly_rows)

    sentinel = RuntimeError("digest path violated pre-allowlist tap")

    def explode(*_args, **_kwargs):
        raise sentinel

    def explode_filter(self, df, name):
        raise sentinel

    monkeypatch.setattr(allowlist_mod, "build_matcher", explode)
    monkeypatch.setattr(allowlist_mod.AllowlistMatcher, "filter_df", explode_filter)

    settings: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    # Any call into either patched hook raises the sentinel and fails the
    # test; a clean return means there was no allowlist interaction.
    runner.run_digest(
        config=settings,
        zeek_dir=zeek_dir,
        load_all=True,
        skip_confirm=True,
    )
722
+
723
+
724
+ # ─── Default-window paths ────────────────────────────────────────────────────
725
+
726
def test_run_digest_flat_layout_default_window_uses_data_max_ts(
    tmp_path: Path, monkeypatch, capsys
) -> None:
    """Flat-layout default window is anchored to the data's max-ts, not to now.

    Regression lock: an earlier plan draft used (now - span, now) as the
    flat-layout fallback. For archived logs whose newest record lies in the
    past, that window silently discards everything. The corrected window is
    [max_ts - span, max_ts], derived from the data itself.
    """
    seconds_per_day = 86400
    zeek_dir = tmp_path / "zeek"
    # Newest record sits roughly five years before the base timestamp.
    far_past_max = _BASE_TS - 5 * 365 * seconds_per_day
    # Six rows, one per day, walking backwards from that newest record.
    daily_rows = [
        _conn_row(
            src=f"10.0.0.{days_back}",
            dst="192.0.2.20",
            ts=far_past_max - days_back * seconds_per_day,
        )
        for days_back in range(6)
    ]
    _write_conn_ndjson(zeek_dir / "conn.log", daily_rows)

    settings: dict[str, Any] = {"loghunter": {"default_window": "1d"}}
    runner.run_digest(config=settings, zeek_dir=zeek_dir, skip_confirm=True)
    rendered = capsys.readouterr().out
    # The flat card has no banner, so the old "Default window" note has no
    # surface; the date of the data's newest record appearing in the output
    # is the signal that the window was anchored to the data rather than now.
    newest_day = datetime.fromtimestamp(far_past_max, tz=timezone.utc).strftime("%Y-%m-%d")
    assert newest_day in rendered
757
+
758
+
759
def test_run_digest_dated_layout_default_window_uses_zeek_dated_helper(
    tmp_path: Path, monkeypatch, capsys
) -> None:
    """Dated-layout default window must use zeek_dated_default_window."""
    zeek_dir = tmp_path / "zeek"

    def noon_rows(day_of_may: int, host_base: int, dst: str) -> list:
        # Three records one second apart, starting at 12:00 UTC on the given day.
        noon = datetime(2026, 5, day_of_may, 12, 0, tzinfo=timezone.utc).timestamp()
        return [
            _conn_row(src=f"10.0.0.{offset + host_base}", dst=dst, ts=noon + offset)
            for offset in range(3)
        ]

    # Two dated subdirectories, one conn.log each.
    older_rows = noon_rows(30, 0, "192.0.2.10")
    newer_rows = noon_rows(31, 10, "192.0.2.20")
    _write_conn_ndjson(zeek_dir / "2026-05-30" / "conn.log", older_rows)
    _write_conn_ndjson(zeek_dir / "2026-05-31" / "conn.log", newer_rows)

    settings: dict[str, Any] = {"loghunter": {"default_window": "1d"}}
    runner.run_digest(config=settings, zeek_dir=zeek_dir, skip_confirm=True)
    rendered = capsys.readouterr().out
    # The most recent dated directory (2026-05-31) must show up in the output.
    assert "2026-05-31" in rendered
781
+
782
+
783
def test_run_digest_bounded_target_skips_default_window(
    tmp_path: Path, monkeypatch, capsys
) -> None:
    """A single conn.log file (bounded) must load in full — no default-window filter.

    The ten rows are spread one day apart while the configured default
    window is ``1d``; if the window were applied, most rows would be
    dropped. The record count on the card's identity row is therefore the
    observable signal that the whole file was loaded.
    """
    log_file = tmp_path / "conn.log"
    far_past_max = _BASE_TS - 5 * 365 * 86400
    rows = [
        _conn_row(
            src=f"10.0.0.{i}",
            dst="192.0.2.20",
            ts=far_past_max - i * 86400,
        )
        for i in range(10)
    ]
    _write_conn_ndjson(log_file, rows)

    config: dict[str, Any] = {"loghunter": {"default_window": "1d"}}
    runner.run_digest(config=config, zeek_dir=log_file, skip_confirm=True)
    out = capsys.readouterr().out
    # No "Default window" note should appear — bounded targets load full.
    assert "Default window" not in out
    # The note has no surface on the flat card, so the check above cannot
    # fail on its own; pin the full load through the rendered record count.
    schema_line = next(ln for ln in out.splitlines() if ln.startswith("conn · "))
    assert f"{len(rows)} lines" in schema_line
800
+
801
+
802
+ # ─── Single-file Zeek bypass: filename-era basename gate retired ─────────────
803
+
804
+ # The single-file Zeek loader path used to route through discover_zeek_files,
805
+ # whose single-file branch gates on fnmatch(basename, pattern). After
806
+ # content-sniffing was added at the CLI layer, that filename gate started
807
+ # silently dropping date-prefixed files (e.g. 2026-06-09.conn.log) into
808
+ # zero-row cards. run_digest now bypasses discover_zeek_files for an explicit
809
+ # single Zeek file — only the Zeek path needed the fix (pihole/syslog/cloudtrail
810
+ # loaders already accept explicit files without a basename gate); the detect
811
+ # path is unchanged (it still uses the basename gate as a type check).
812
+
813
+ _TSV_CONN_HEADER = (
814
+ "#separator \\x09\n"
815
+ "#set_separator\t,\n"
816
+ "#empty_field\t(empty)\n"
817
+ "#unset_field\t-\n"
818
+ "#path\tconn\n"
819
+ "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
820
+ "\tproto\tservice\tduration\torig_bytes\tresp_bytes"
821
+ "\tconn_state\tlocal_orig\tlocal_resp\ttunnel_parents\n"
822
+ "#types\ttime\tstring\taddr\tport\taddr\tport"
823
+ "\tenum\tstring\tinterval\tcount\tcount"
824
+ "\tstring\tbool\tbool\tset[string]\n"
825
+ )
826
+
827
+
828
def test_run_digest_date_prefixed_zeek_ndjson_renders_card_with_rows(
    tmp_path: Path, capsys
) -> None:
    """A date-prefixed Zeek NDJSON file renders a conn card backed by real rows.

    Before the fix, the basename gate in discover_zeek_files rejected this
    file for not matching ``conn*.log*``; run_digest was left with an empty
    frame and rendered ``(no events in window)``.
    """
    target = tmp_path / "2026-06-09.conn.log"
    _write_conn_ndjson(
        target,
        [
            _conn_row(src=f"10.0.0.{n}", dst="192.0.2.20", ts=_BASE_TS - n)
            for n in range(6)
        ],
    )

    settings: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(config=settings, zeek_dir=target, load_all=True, skip_confirm=True)
    rendered = capsys.readouterr().out
    # A histogram peak is present — this is not the empty-frame fallback.
    assert "(no events in window)" not in rendered
    assert "peak:" in rendered
853
+
854
+
855
def test_run_digest_date_prefixed_zeek_tsv_renders_card_with_rows(
    tmp_path: Path, capsys
) -> None:
    """A date-prefixed Zeek TSV file with a full header and data rows renders a conn card.

    Shows that the bypass reaches the Zeek strategy's prefix-preserving
    sniff (which dispatches TSV vs NDJSON across ``run_load``) and applies
    the conn normalizer — not merely that sniff picked the right schema.
    """
    target = tmp_path / "2026-06-09.conn.log"
    # Two records with distinct timestamps give the timeline a non-zero
    # span, which the ts-confidence guard in run_digest requires. The
    # bypass under test is about file discovery and TSV parser routing,
    # not about span.
    records = (
        (
            "1748649600.000000", "CTest01", "192.0.2.10", "51514", "203.0.113.20", "443",
            "tcp", "ssl", "3.5", "1500", "8200", "SF", "T", "F", "(empty)",
        ),
        (
            "1748649660.000000", "CTest02", "192.0.2.11", "51515", "203.0.113.20", "443",
            "tcp", "ssl", "2.1", "800", "4400", "SF", "T", "F", "(empty)",
        ),
    )
    data_rows = "".join("\t".join(record) + "\n" for record in records)
    footer = "#close\t2026-01-01-00:00:00\n"
    target.write_text(_TSV_CONN_HEADER + data_rows + footer, encoding="utf-8")

    settings: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(config=settings, zeek_dir=target, load_all=True, skip_confirm=True)
    rendered = capsys.readouterr().out
    assert "(no events in window)" not in rendered
    assert "peak:" in rendered
887
+
888
+
889
def test_run_digest_zeek_tsv_header_only_raises_digest_empty(
    tmp_path: Path
) -> None:
    """A header-only Zeek TSV conn file is recognized-but-empty.

    The complete ``#path conn`` header carries the schema, so sniff routes
    the file as conn; the loader then yields an empty frame and run_digest
    raises DigestEmpty (a control signal, not an error).

    Gate 2 seam: a zero-row schema card was misleading — it read as "we
    hunted and found nothing" when the truth is "we recognized it, there
    was nothing to read". The CLI catches DigestEmpty in both entry paths
    and narrates "recognized X as conn but no parseable records —
    skipping"; this test pins the runner-level raise.
    """
    from loghunter.common.errors import DigestEmpty

    target = tmp_path / "2026-06-09.conn.log"
    header_only = _TSV_CONN_HEADER + "#close\t2026-01-01-00:00:00\n"
    target.write_text(header_only, encoding="utf-8")

    settings: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    with pytest.raises(DigestEmpty) as raised:
        runner.run_digest(
            config=settings,
            zeek_dir=target,
            load_all=True,
            skip_confirm=True,
        )
    signal = raised.value
    assert signal.schema == "conn"
    assert signal.basename == target.name
918
+
919
+
920
def test_run_digest_plain_conn_log_still_renders_card_with_rows(
    tmp_path: Path, capsys
) -> None:
    """Regression: a file literally named ``conn.log`` still loads and renders.

    That name matches the old basename pattern, so this confirms the bypass
    left the previously-working filename case intact.
    """
    target = tmp_path / "conn.log"
    _write_conn_ndjson(
        target,
        [
            _conn_row(src=f"10.0.0.{n}", dst="192.0.2.20", ts=_BASE_TS - n)
            for n in range(6)
        ],
    )

    settings: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(config=settings, zeek_dir=target, load_all=True, skip_confirm=True)
    rendered = capsys.readouterr().out
    assert "(no events in window)" not in rendered
    assert "peak:" in rendered
940
+
941
+
942
+ # ─── Identity line 1: every card carries its source name ────────────────────
943
+
944
+
945
def test_run_digest_single_file_identity_line_is_basename(
    tmp_path: Path, capsys,
) -> None:
    """End-to-end: a single-file digest leads with the file's basename.

    There is no banner, and identity line 3 carries the exact record count
    (with no glob-pattern key).
    """
    target = tmp_path / "2026-05-30.conn.log"
    six_rows = [
        _conn_row(src=f"10.0.0.{n}", dst="192.0.2.20", ts=_BASE_TS - n)
        for n in range(6)
    ]
    _write_conn_ndjson(target, six_rows)

    settings: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(config=settings, zeek_dir=target, load_all=True, skip_confirm=True)
    rendered_rows = capsys.readouterr().out.splitlines()
    assert rendered_rows[0] == "2026-05-30.conn.log"
    # Identity line 3 — exact count, no glob-pattern key.
    schema_row = next(row for row in rendered_rows if row.startswith("conn · "))
    assert "6 lines" in schema_row
    assert "conn*.log*" not in schema_row
    # The flat grammar has no "Source:" banner row.
    assert all(not row.startswith("Source:") for row in rendered_rows)
971
+
972
+
973
def test_run_digest_directory_mode_identity_line_is_dir_name(
    tmp_path: Path, capsys,
) -> None:
    """A directory-mode digest leads with the directory's basename.

    source_name no longer discriminates file from directory — every card
    gets an identity line.
    """
    zeek_dir = tmp_path / "zeek"
    six_rows = [
        _conn_row(src=f"10.0.0.{n}", dst="192.0.2.20", ts=_BASE_TS - n)
        for n in range(6)
    ]
    _write_conn_ndjson(zeek_dir / "conn.log", six_rows)

    settings: dict[str, Any] = {"loghunter": {"default_window": "all"}}
    runner.run_digest(config=settings, zeek_dir=zeek_dir, load_all=True, skip_confirm=True)
    rendered_rows = capsys.readouterr().out.splitlines()
    assert rendered_rows[0] == "zeek"
    # The flat grammar has no banner, "Source:" or "Records:" rows.
    assert all(not row.startswith("Source:") for row in rendered_rows)
    assert all(not row.startswith("Records:") for row in rendered_rows)
996
+
997
+
998
+ # ─── CLI dispatch and whitelist enforcement ──────────────────────────────────
999
+
1000
+ _ZEEK_NDJSON_CONN_LINE = (
1001
+ '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10", "id.resp_h": "198.51.100.20",'
1002
+ ' "id.resp_p": 443, "proto": "tcp", "duration": 1.23}\n'
1003
+ )
1004
+
1005
+
1006
def _write_zeek_conn_file(tmp_path: Path) -> Path:
    """Write the one-line NDJSON conn fixture to ``tmp_path/conn.log`` and return that path."""
    target = tmp_path.joinpath("conn.log")
    target.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    return target
1010
+
1011
+
1012
def test_cli_digest_dispatch_routes_to_run_digest(tmp_path: Path, monkeypatch) -> None:
    """``digest <file> --all`` reaches run_digest with the conn schema, the raw path and load_all."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    seen: dict[str, Any] = {}
    # Record the keyword arguments instead of running a real digest.
    monkeypatch.setattr(runner_mod, "run_digest", lambda **kwargs: seen.update(kwargs))
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    conn_file = _write_zeek_conn_file(tmp_path)
    argv = ["digest", str(conn_file), "--all"]
    cli._main(argv)

    assert seen.get("schema") == "conn"
    # The CLI forwards raw strings; Path conversion belongs to the resolver.
    assert seen.get("zeek_dir") == str(conn_file)
    assert seen.get("load_all") is True
1028
+
1029
+
1030
def test_cli_digest_rejects_detect_flag(monkeypatch) -> None:
    """Passing ``--detect`` to ``digest`` raises a ValueError that names the flag."""
    import loghunter.cli as cli

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    argv = ["digest", "--detect=beacon"]
    with pytest.raises(ValueError, match="--detect"):
        cli._main(argv)
1035
+
1036
+
1037
def test_cli_digest_rejects_non_text_output(tmp_path: Path, monkeypatch) -> None:
    """``digest --output=json`` raises a ValueError whose message mentions ``text``."""
    import loghunter.cli as cli

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    conn_file = _write_zeek_conn_file(tmp_path)
    argv = ["digest", str(conn_file), "--output=json", "--all"]
    with pytest.raises(ValueError, match="text"):
        cli._main(argv)
1043
+
1044
+
1045
def test_cli_digest_rejects_filter_flag() -> None:
    """Filter / field flags appear nowhere in the spec, so they are plain unknown flags."""
    import loghunter.cli as cli

    argv = ["digest", "--filter=src=192.0.2.10"]
    with pytest.raises(ValueError, match="unknown flag --filter"):
        cli._main(argv)
1050
+
1051
+
1052
def test_cli_digest_rejects_arbitrary_unknown_long_flag() -> None:
    """An unrecognised long flag (``--field``) is reported as an unknown flag."""
    import loghunter.cli as cli

    argv = ["digest", "--field=src"]
    with pytest.raises(ValueError, match="unknown flag --field"):
        cli._main(argv)
1056
+
1057
+
1058
def test_cli_digest_rejects_unknown_short_flag() -> None:
    """An unrecognised short flag (``-x``) is reported as an unknown flag."""
    import loghunter.cli as cli

    argv = ["digest", "-x"]
    with pytest.raises(ValueError, match=r"unknown flag -x"):
        cli._main(argv)
1062
+
1063
+
1064
def test_cli_digest_accepts_y_short_flag(tmp_path: Path, monkeypatch) -> None:
    """The ``-y`` short flag is accepted and reaches run_digest as ``skip_confirm=True``."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    seen: dict[str, Any] = {}
    # Record the keyword arguments instead of running a real digest.
    monkeypatch.setattr(runner_mod, "run_digest", lambda **kwargs: seen.update(kwargs))
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    conn_file = _write_zeek_conn_file(tmp_path)
    argv = ["digest", str(conn_file), "-y", "--all"]
    cli._main(argv)
    assert seen.get("skip_confirm") is True
1077
+
1078
+
1079
def test_cli_digest_missing_path_surfaces_actionable_error_and_exits_nonzero(
    monkeypatch, capsys,
) -> None:
    """A missing path is reported inline on stderr and the run exits 1.

    With no card rendered and at least one path errored, the three-way
    tally (0 rendered, ≥1 errored) yields exit code 1.
    """
    import loghunter.cli as cli

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    exit_code = cli._main(["digest", "/no/such/file/here.log"])
    stderr_text = capsys.readouterr().err
    assert "digest: path not found" in stderr_text
    assert exit_code == 1
1090
+
1091
+
1092
def test_cli_digest_directory_positional_is_rejected_and_exits_nonzero(
    tmp_path: Path, monkeypatch, capsys,
) -> None:
    """A directory positional is rejected: v1 sniff wants filenames, with no fan-out.

    The directory is reported inline on stderr and the run exits 1.
    """
    import loghunter.cli as cli

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    logs_dir = tmp_path / "logs"
    logs_dir.mkdir()
    exit_code = cli._main(["digest", str(logs_dir)])
    stderr_text = capsys.readouterr().err
    assert "must be a file, not a directory" in stderr_text
    assert exit_code == 1
1105
+
1106
+
1107
def test_cli_digest_empty_file_prints_message_and_skips(
    tmp_path: Path, monkeypatch, capsys,
) -> None:
    """A zero-byte file prints the "is empty" message and never reaches run_digest."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    seen: dict[str, Any] = {}
    # Record the keyword arguments so any invocation becomes visible.
    monkeypatch.setattr(runner_mod, "run_digest", lambda **kwargs: seen.update(kwargs))
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    zero_byte = tmp_path / "nothing.log"
    zero_byte.write_text("", encoding="utf-8")
    cli._main(["digest", str(zero_byte)])
    stdout_text = capsys.readouterr().out
    assert "nothing.log is empty. Nothing to do!" in stdout_text
    assert seen == {}, "run_digest must NOT be invoked for an empty file"
1124
+
1125
+
1126
def test_cli_digest_whitespace_only_file_prints_message_and_skips(
    tmp_path: Path, monkeypatch, capsys,
) -> None:
    """A whitespace-only file is treated as empty: message printed, run_digest not called."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    seen: dict[str, Any] = {}
    # Record the keyword arguments so any invocation becomes visible.
    monkeypatch.setattr(runner_mod, "run_digest", lambda **kwargs: seen.update(kwargs))
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    whitespace_only = tmp_path / "blanks.log"
    whitespace_only.write_text("\n \n\t\n", encoding="utf-8")
    cli._main(["digest", str(whitespace_only)])
    stdout_text = capsys.readouterr().out
    assert "blanks.log is empty. Nothing to do!" in stdout_text
    assert seen == {}
1143
+
1144
+
1145
def test_cli_digest_unrecognized_text_routes_to_blob(
    tmp_path: Path, monkeypatch,
) -> None:
    """Text no schema recognises is dispatched as ``blob`` with the file as ``blob_path``."""
    import loghunter.cli as cli
    import loghunter.runner as runner_mod

    seen: dict[str, Any] = {}
    # Record the keyword arguments instead of running a real digest.
    monkeypatch.setattr(runner_mod, "run_digest", lambda **kwargs: seen.update(kwargs))
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    unknown_text = tmp_path / "mystery.txt"
    unknown_text.write_text("hello world\nlorem ipsum\n", encoding="utf-8")
    cli._main(["digest", str(unknown_text)])
    assert seen.get("schema") == "blob"
    assert seen.get("blob_path") == unknown_text
1161
+
1162
+
1163
+ def test_cli_digest_bare_no_positional_uses_config_zeek_dir(
1164
+ tmp_path: Path, monkeypatch,
1165
+ ) -> None:
1166
+ """No positional → CLI passes config through unchanged; the config-driven
1167
+ conn fallback fires inside ``resolve_digest_source`` in ``run_digest``.
1168
+
1169
+ This test asserts the CLI seam shape (zeek_dir override is None — the
1170
+ config flows in via the config dict). The actual config-fallback
1171
+ resolution is tested at the resolver layer
1172
+ (tests/test_sources.py:test_digest_conn_override_wins-style coverage
1173
+ + tests/test_root_provenance.py:test_runner_run_digest_applies_root_to_config_source_dirs).
1174
+ """
1175
+ import loghunter.cli as cli
1176
+ import loghunter.runner as runner_mod
1177
+ called: dict[str, Any] = {}
1178
+ def fake_run_digest(**kwargs):
1179
+ called.update(kwargs)
1180
+ monkeypatch.setattr(runner_mod, "run_digest", fake_run_digest)
1181
+ zeek = tmp_path / "zeek"
1182
+ zeek.mkdir()
1183
+ monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {"zeek_dir": str(zeek)}})
1184
+
1185
+ cli._main(["digest"])
1186
+ assert called.get("schema") == "conn"
1187
+ # CLI seam: no override (None); config flows in via the config dict.
1188
+ assert called.get("zeek_dir") is None
1189
+ assert called["config"]["loghunter"]["zeek_dir"] == str(zeek)