loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,1237 @@
1
+ """Tests for the blob digest path — describes unrecognized bytes.
2
+
3
+ Pins the architectural rails of Gate 2:
4
+ - O(sample): the profiler reads a bounded sample, never the whole file.
5
+ - Zero field extraction: no timestamp, no fields — bytes and shape-guesses.
6
+ - Shared banner: blob's RunSummary routes through _render_run_summary like
7
+ schema cards, via the additive record_label / data_window seams.
8
+ - Vanish-don't-dash: optional slots that don't apply are omitted entirely.
9
+ - Sniff-only entry: blob is reached via the sniff floor, never an operator
10
+ token.
11
+
12
+ All synthetic content. Per the project's data-privacy rule, no real network
13
+ artifacts.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import gzip
19
+ import io
20
+ import math
21
+ from pathlib import Path
22
+ from typing import Any
23
+
24
+ import pandas as pd
25
+ import pytest
26
+
27
+ import loghunter.cli as cli
28
+ import loghunter.outputs.text as text_module
29
+ import loghunter.runner as runner
30
+ from loghunter.common.errors import DigestEmpty
31
+ from loghunter.common.finding import BlobCard, RunSummary
32
+ from loghunter.digest import blob as blob_digest
33
+ from loghunter.outputs.text import TextHandler
34
+
35
+
36
+ # ─── Helpers ────────────────────────────────────────────────────────────────
37
+
38
+
39
def _render(card: BlobCard, source_name: str = "mystery.txt") -> str:
    """Return the text rendering of ``card`` under a pinned source name.

    ``card.source_name`` is overwritten in place before rendering, so older
    fixtures keep identity-line-1 tied to a known name.
    """
    card.source_name = source_name
    buffer = io.StringIO()
    TextHandler(stream=buffer, verbose_level=0).render_blob(card)
    return buffer.getvalue()
47
+
48
+
49
def _binary_blob_card() -> BlobCard:
    """Build a BlobCard that mimics a terminal-magic binary hit (PNG).

    The text-path slot ``shape_guess`` is ``None`` and no lines are sampled.
    """
    png_signature = b"\x89PNG\r\n\x1a\n"
    attrs = {
        "source_name": "mystery.bin",
        "byte_size": 4096,
        "sampled_line_count": 0,
        "sample_read_count": 1,
        "is_compressed": False,
        "printable_pct": 0.1,
        "nonprintable_pct": 99.9,
        "utf8_clean": False,
        "file_type_guess": "PNG image",
        "file_type_magic": png_signature,
        "shape_guess": None,
    }
    return BlobCard(**attrs)
64
+
65
+
66
def _text_blob_card(
    *,
    shape_guess: str = "freeform text",
    sampled_line_count: int = 1000,
    utf8_clean: bool = True,
    has_templates: bool = True,
    has_tokens: bool = True,
) -> BlobCard:
    """Build a text-path BlobCard with every text slot populated.

    ``has_tokens`` / ``has_templates`` switch the corresponding slots to
    ``None`` so callers can exercise the vanish-don't-dash rendering.
    """
    tokens = None
    if has_tokens:
        token_names = ("level", "ts", "msg", "service", "trace_id")
        tokens = [(name, 200) for name in token_names]

    if has_templates:
        distinct, coverage_pct, top_n, singletons = 140, 78.0, 6, 12
    else:
        distinct = coverage_pct = top_n = singletons = None

    return BlobCard(
        source_name="mystery.txt",
        byte_size=64_000,
        sampled_line_count=sampled_line_count,
        sample_read_count=6,
        is_compressed=False,
        printable_pct=99.7,
        nonprintable_pct=0.3,
        utf8_clean=utf8_clean,
        file_type_guess=None,
        file_type_magic=None,
        shape_guess=shape_guess,
        mean_line_length=412.0,
        median_line_length=400.0,
        line_length_p95=980,
        max_line_length=4201,
        line_length_stdev=120.0,
        line_length_shape="varied",
        top_tokens=tokens,
        distinct_templates=distinct,
        top_template_coverage_pct=coverage_pct,
        top_template_n=top_n,
        singleton_template_count=singletons,
    )
102
+
103
+
104
+ # ─── O(sample) rail (PRIMARY: deterministic byte counter) ───────────────────
105
+
106
+
107
def test_o_sample_rail_bounded_byte_budget(tmp_path, monkeypatch) -> None:
    """Pin the O(sample) rail with a deterministic byte counter.

    Binary-mode ``read`` / ``readline`` calls on the target file are wrapped
    so the bytes pulled through them can be tallied; the tally must stay
    within the head + seek budget. This is the PRIMARY rail enforcement — a
    determinism gate, not a wall-clock smoke test.
    """
    big = tmp_path / "big.log"
    # 80,000 lines of 65 bytes (~5 MB) — enough to exercise the seek path
    # (well above _SEEK_MIN_SIZE).
    big.write_bytes((b"a" * 64 + b"\n") * 80_000)

    pulled = {"read": 0, "readline": 0}
    original_open = Path.open

    def spy_open(self, mode="r", *args, **kwargs):
        handle = original_open(self, mode, *args, **kwargs)
        if "b" not in mode or self != big:
            return handle

        inner_read = handle.read
        inner_readline = handle.readline

        def counted_read(n=-1):
            chunk = inner_read(n)
            pulled["read"] += len(chunk)
            return chunk

        def counted_readline(*a, **kw):
            line = inner_readline(*a, **kw)
            pulled["readline"] += len(line)
            return line

        handle.read = counted_read
        handle.readline = counted_readline
        return handle

    monkeypatch.setattr(Path, "open", spy_open)

    card = blob_digest.summarize_blob(big)

    # Hard budget: head + seeks * seek_bytes — NO slack for readline,
    # because the seek skip is now a bounded read, not a readline scan.
    budget = (
        blob_digest._HEAD_BYTES
        + blob_digest._SEEK_COUNT * blob_digest._SEEK_BYTES
    )
    total = pulled["read"] + pulled["readline"]
    assert total <= budget, (
        f"profiler pulled {total:,} bytes "
        f"(read={pulled['read']:,}, readline={pulled['readline']:,}); "
        f"budget {budget:,}"
    )
    # The card must still characterise the file.
    assert card.shape_guess is not None
    assert card.sampled_line_count > 0
165
+
166
+
167
def test_o_sample_rail_holds_on_long_line_no_newline_file(
    tmp_path, monkeypatch
) -> None:
    """REGRESSION: an earlier impl used fh.readline() to discard the partial
    first line after each seek. With a 5 MB single-line file, that pulled
    13 MB through readline() (scanning to EOF) — violating the rail and
    invisible to the read()-only spy.

    The fix is a hard-bounded seek window: read EXACTLY _SEEK_BYTES, find
    the first newline within it, return the post-newline slice; if no
    newline in the window, return empty and let the head sample carry the
    cascade. Total disk I/O per seek is _SEEK_BYTES — no more.

    This test spies on bytes-mode .read() AND .readline(); both together
    must total within budget. Long lines are a real shape for minified
    logs, single-line dumps, and certain export formats — the rail has to
    hold there too.

    The patch is installed through ``monkeypatch.context()`` (matching the
    fixture used by the sibling rail test) so it is undone as soon as the
    profiler call returns.
    """
    big = tmp_path / "longline.log"
    # 5 MB of a single line — no newline anywhere except at the end.
    payload = b"a" * (5 * 1024 * 1024) + b"\n"
    big.write_bytes(payload)

    bytes_read = 0
    readline_bytes = 0

    real_open = Path.open

    def spy_open(self, mode="r", *args, **kwargs):
        fh = real_open(self, mode, *args, **kwargs)
        if "b" in mode and self == big:
            real_read = fh.read
            real_readline = fh.readline

            def counted_read(n=-1):
                nonlocal bytes_read
                data = real_read(n)
                bytes_read += len(data)
                return data

            def counted_readline(*a, **kw):
                nonlocal readline_bytes
                data = real_readline(*a, **kw)
                readline_bytes += len(data)
                return data

            fh.read = counted_read
            fh.readline = counted_readline
        return fh

    with monkeypatch.context() as scoped:
        scoped.setattr(Path, "open", spy_open)
        card = blob_digest.summarize_blob(big)

    head = blob_digest._HEAD_BYTES
    seeks = blob_digest._SEEK_COUNT
    seek_bytes = blob_digest._SEEK_BYTES
    # Hard budget: head once + at most seek_bytes per seek. No readline.
    budget = head + seeks * seek_bytes
    total = bytes_read + readline_bytes
    assert total <= budget, (
        f"long-line file pulled {total:,} bytes "
        f"(read={bytes_read:,}, readline={readline_bytes:,}); "
        f"budget {budget:,}"
    )
    # And readline() must contribute zero — the fix is "no readline at all".
    assert readline_bytes == 0
    # Card still well-formed even with all-empty body chunks.
    assert isinstance(card, BlobCard)
234
+
235
+
236
def test_determinism_same_file_yields_identical_card(tmp_path) -> None:
    """Profiling the same file twice must give the same card fields.

    Seek offsets must be derived from file size — no unseeded randomness.
    Same file → same sample → identical card.
    """
    p = tmp_path / "log.txt"
    p.write_bytes(
        b"".join(
            f"event {i} payload alpha beta gamma\n".encode()
            for i in range(20_000)
        )
    )

    first = blob_digest.summarize_blob(p)
    second = blob_digest.summarize_blob(p)

    compared_fields = (
        "sampled_line_count",
        "sample_read_count",
        "top_tokens",
        "mean_line_length",
        "line_length_p95",
        "shape_guess",
    )
    for field_name in compared_fields:
        assert getattr(first, field_name) == getattr(second, field_name)
252
+
253
+
254
+ # ─── Magic-byte identification ──────────────────────────────────────────────
255
+
256
+
257
def test_terminal_magic_png_skips_text_cascade(tmp_path) -> None:
    """A PNG signature is reported as the file type and text slots vanish."""
    png_signature = b"\x89PNG\r\n\x1a\n"
    p = tmp_path / "img.png"
    # PNG header followed by an arbitrary binary tail.
    p.write_bytes(png_signature + bytes(range(256)) * 64)

    card = blob_digest.summarize_blob(p)

    assert card.file_type_guess == "PNG image"
    assert card.file_type_magic == png_signature
    # Text slots vanish.
    for slot in (
        card.shape_guess,
        card.mean_line_length,
        card.top_tokens,
        card.distinct_templates,
    ):
        assert slot is None
270
+
271
+
272
@pytest.mark.parametrize(
    "magic,label",
    [
        (b"%PDF-1.4\n", "PDF document"),
        (b"\x7fELF\x02\x01\x01", "ELF binary"),
        (b"PK\x03\x04stuff", "zip archive"),
    ],
)
def test_terminal_magic_set_identifies(tmp_path, magic, label) -> None:
    """Each terminal magic prefix yields its label and no text slots."""
    target = tmp_path / "f.bin"
    binary_tail = bytes(range(256)) * 16
    target.write_bytes(magic + binary_tail)

    card = blob_digest.summarize_blob(target)

    assert card.file_type_guess == label
    assert card.shape_guess is None
    assert card.mean_line_length is None
288
+
289
+
290
def test_container_gzip_decompresses_and_profiles_content(tmp_path) -> None:
    """gzip is a CONTAINER, not terminal — the content underneath is
    profiled and the card is labelled as compressed."""
    p = tmp_path / "data.log.gz"
    with gzip.open(p, "wt", encoding="utf-8") as fh:
        fh.writelines(
            f'{{"event": "login", "user": "u{i}"}}\n' for i in range(1000)
        )

    card = blob_digest.summarize_blob(p)

    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    # Terminal magic is NOT set — gzip is a container, not a final answer.
    assert card.file_type_guess is None
    # byte_size is the on-disk size (compressed), NOT the decompressed total.
    assert card.byte_size == p.stat().st_size
306
+
307
+
308
def test_container_bz2_decompresses_and_profiles_content(tmp_path) -> None:
    """bzip2 is a CONTAINER — the content shape underneath is profiled and
    the card is labelled as compressed."""
    import bz2 as bz2_mod

    p = tmp_path / "data.log.bz2"
    with bz2_mod.open(p, "wt", encoding="utf-8") as fh:
        fh.writelines(
            f'{{"event": "login", "user": "u{i}"}}\n' for i in range(1000)
        )

    card = blob_digest.summarize_blob(p)

    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    assert card.file_type_guess is None
    # On-disk (compressed) size, as reported by stat().
    assert card.byte_size == p.stat().st_size
322
+
323
+
324
def test_container_xz_decompresses_and_profiles_content(tmp_path) -> None:
    """xz is a CONTAINER — the content shape underneath is profiled and the
    card is labelled as compressed."""
    import lzma as lzma_mod

    p = tmp_path / "data.log.xz"
    with lzma_mod.open(p, "wt", encoding="utf-8") as fh:
        fh.writelines(
            f'{{"event": "login", "user": "u{i}"}}\n' for i in range(1000)
        )

    card = blob_digest.summarize_blob(p)

    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    assert card.file_type_guess is None
    # On-disk (compressed) size, as reported by stat().
    assert card.byte_size == p.stat().st_size
338
+
339
+
340
def test_misnamed_xz_routes_by_magic_not_suffix(tmp_path) -> None:
    """An xz-compressed file saved under a non-.xz name (mystery.log) must
    still come back compressed with its JSON content profiled.

    This proves the magic table actually drives routing rather than being
    ornamental: without it only correctly-suffixed containers would work,
    and the bz2/xz magic-table entries would be vestigial.
    """
    import lzma as lzma_mod

    p = tmp_path / "mystery.log"
    with lzma_mod.open(p, "wt", encoding="utf-8") as fh:
        fh.writelines(
            f'{{"event": "login", "user": "u{i}"}}\n' for i in range(1000)
        )

    card = blob_digest.summarize_blob(p)

    # The suffix gives no hint, so a compressed JSON verdict can only come
    # from identifying the xz magic and opening the file accordingly.
    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    assert card.file_type_guess is None
358
+
359
+
360
def test_misnamed_bz2_routes_by_magic_not_suffix(tmp_path) -> None:
    """A bzip2-compressed file saved under a non-.bz2 name (unknown.dat)
    must still come back compressed with its JSON content profiled."""
    import bz2 as bz2_mod

    p = tmp_path / "unknown.dat"
    with bz2_mod.open(p, "wt", encoding="utf-8") as fh:
        fh.writelines(
            f'{{"event": "login", "user": "u{i}"}}\n' for i in range(1000)
        )

    card = blob_digest.summarize_blob(p)

    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    assert card.file_type_guess is None
373
+
374
+
375
+ # ─── Shape cascade ──────────────────────────────────────────────────────────
376
+
377
+
378
def test_shape_cascade_json(tmp_path) -> None:
    """500 one-object-per-line JSON records are guessed as "JSON"."""
    p = tmp_path / "j.log"
    with p.open("w") as fh:
        fh.writelines(f'{{"k": {i}, "v": "x"}}\n' for i in range(500))

    result = blob_digest.summarize_blob(p)
    assert result.shape_guess == "JSON"
386
+
387
+
388
def test_shape_cascade_csv_recognized_from_body_not_header(tmp_path) -> None:
    """A lone comma-rich header over a non-CSV body must NOT be classified
    as CSV. The cascade prefers body (seek) lines."""
    p = tmp_path / "fake.csv"
    header = "a,b,c,d,e,f,g,h\n"
    body_line = "this is a freeform line with no commas in it\n"
    # 20,000 body lines: large enough to trigger seeks (above _SEEK_MIN_SIZE).
    with p.open("w") as fh:
        fh.write(header)
        fh.write(body_line * 20_000)

    guess = blob_digest.summarize_blob(p).shape_guess

    assert guess != "CSV"
    # Probably freeform; at least, the comma count test must fail.
    assert "CSV" not in (guess or "")
405
+
406
+
407
def test_shape_cascade_tsv_recognized_from_body(tmp_path) -> None:
    """Tab-separated body lines yield a shape guess that names TSV and
    carries an approximate ("~") column count."""
    p = tmp_path / "data.tsv"
    with p.open("w") as fh:
        # Body lines: seven "x" fields plus a trailing counter field,
        # i.e. 7 tabs = 8 columns each.
        for i in range(2000):
            fh.write("\t".join(["x"] * 7) + f"\t{i}\n")

    card = blob_digest.summarize_blob(p)
    assert card.shape_guess is not None
    assert "TSV" in card.shape_guess
    assert "~" in card.shape_guess and "columns" in card.shape_guess
418
+
419
+
420
def test_shape_cascade_html(tmp_path) -> None:
    """Repeated tag-wrapped lines are guessed as "HTML/XML"."""
    p = tmp_path / "page.html"
    markup_line = "<div><span>some content</span></div>\n"
    with p.open("w") as fh:
        fh.write(markup_line * 500)

    result = blob_digest.summarize_blob(p)
    assert result.shape_guess == "HTML/XML"
428
+
429
+
430
def test_shape_cascade_key_value(tmp_path) -> None:
    """Lines made of ``key=value`` pairs are guessed as "key-value text"."""
    p = tmp_path / "kv.log"
    with p.open("w") as fh:
        fh.writelines(
            f"key1=val{i} key2=alpha key3=beta key4=gamma\n"
            for i in range(500)
        )

    result = blob_digest.summarize_blob(p)
    assert result.shape_guess == "key-value text"
438
+
439
+
440
def test_shape_cascade_freeform(tmp_path) -> None:
    """Unstructured prose lines are guessed as "freeform text"."""
    p = tmp_path / "free.log"
    with p.open("w") as fh:
        fh.writelines(
            f"plain prose sentence number {i} with no structure.\n"
            for i in range(1000)
        )

    result = blob_digest.summarize_blob(p)
    assert result.shape_guess == "freeform text"
448
+
449
+
450
+ # ─── Char-class / UTF-8 honesty ────────────────────────────────────────────
451
+
452
+
453
def test_char_class_flags_nonprintable_in_sample(tmp_path) -> None:
    """Char-class is computed over the sample bytes BEFORE decode — a
    binary-heavy sample produces a low printable fraction even without a
    magic hit."""
    p = tmp_path / "binary.bin"
    # No known magic; just a mass of non-printable bytes.
    noise = b"\x01\x02\x03\xfe\xfd\xfc"
    p.write_bytes(noise * 1024)

    card = blob_digest.summarize_blob(p)

    assert card.printable_pct < 20.0
    assert card.nonprintable_pct > 80.0
    # The two percentages must sum to 100 within rounding.
    combined = card.printable_pct + card.nonprintable_pct
    assert math.isclose(combined, 100.0, abs_tol=0.01)
466
+
467
+
468
def test_utf8_clean_true_on_ascii(tmp_path) -> None:
    """Pure-ASCII content is reported as UTF-8 clean."""
    p = tmp_path / "ascii.log"
    p.write_text("hello world\n" * 100)

    result = blob_digest.summarize_blob(p)
    assert result.utf8_clean is True
473
+
474
+
475
def test_utf8_clean_false_on_latin1_high_bytes(tmp_path) -> None:
    """Bytes that fail strict UTF-8 decode set utf8_clean=False; the
    renderer's Bytes row drops the 'UTF-8 clean' tail."""
    p = tmp_path / "latin1.log"
    # 0xc0 alone is an invalid UTF-8 start; 0xa3 (£) without lead is also bad.
    bad_line = b"hello \xc0\xa3 world\n"
    p.write_bytes(bad_line * 200)

    card = blob_digest.summarize_blob(p)
    assert card.utf8_clean is False

    rendered = _render(card)
    assert "UTF-8 clean" not in rendered
    assert "% printable" in rendered
487
+
488
+
489
def test_utf8_clean_true_renders_clean_tail(tmp_path) -> None:
    """A UTF-8-clean card renders the 'UTF-8 clean' text."""
    p = tmp_path / "clean.log"
    p.write_text("alpha beta gamma\n" * 200)

    card = blob_digest.summarize_blob(p)
    assert card.utf8_clean is True

    rendered = _render(card)
    assert "UTF-8 clean" in rendered
496
+
497
+
498
+ # ─── Line-length p95 ────────────────────────────────────────────────────────
499
+
500
+
501
def test_line_length_shape_returns_p95() -> None:
    """``_line_length_shape`` returns a 6-tuple with p95 sitting between
    median and max. Card field line_length_p95 is populated from it."""
    sample = list(range(1, 101))  # 1..100
    stats = blob_digest._line_length_shape(sample)
    mean, median, p95, max_len, stdev, shape = stats

    assert mean == pytest.approx(50.5)
    assert median == 50.5
    assert max_len == 100
    # 95th percentile of 1..100 with quantiles(n=20)[18] ≈ 95 or 96.
    assert 90 <= p95 <= 100
    assert shape in ("uniform", "varied")
514
+
515
+
516
def test_p95_never_exceeds_max_on_tiny_sample() -> None:
    """REGRESSION: statistics.quantiles(n=20) interpolates exclusively and
    EXTRAPOLATES past max on small samples — lengths=[1, 100] yielded
    p95=184 with max=100, which is nonsense (p95 must be an order
    statistic from the sample). The fix is to fall back to max when the
    sample is smaller than 20 lines, plus a defensive clamp."""
    tiny_sample = [1, 100]
    _, _, p95, max_len, _, _ = blob_digest._line_length_shape(tiny_sample)

    assert max_len == 100
    assert p95 == 100, f"p95 must not exceed max; got p95={p95}, max={max_len}"
527
+
528
+
529
@pytest.mark.parametrize(
    "lengths",
    [
        [10],  # single line
        [10, 1000],  # 2-line extreme spread
        [1, 2, 3, 4, 5],  # tiny ascending
        [100, 100, 100, 100, 100],  # tiny uniform
        [1] * 19,  # just below the n=20 threshold
        list(range(1, 21)),  # exactly at threshold
        list(range(1, 1001)),  # plenty of data
    ],
)
def test_p95_le_max_invariant(lengths) -> None:
    """Invariant: p95 <= max for any non-empty sample, big or small."""
    stats = blob_digest._line_length_shape(lengths)
    _, _, p95, max_len, _, _ = stats
    failure_note = f"p95={p95} exceeded max={max_len} for lengths={lengths!r}"
    assert p95 <= max_len, failure_note
547
+
548
+
549
+ # ─── drain3 quarantine + meaninglessness floor ──────────────────────────────
550
+
551
+
552
def test_quarantine_drain3_dormant_vanishes_templates(monkeypatch, tmp_path) -> None:
    """_BLOB_DRAIN3_ENABLED=False → no template fields, card otherwise OK."""
    monkeypatch.setattr(blob_digest, "_BLOB_DRAIN3_ENABLED", False)

    p = tmp_path / "rep.log"
    p.write_text("host svc: event N\n" * 500)
    card = blob_digest.summarize_blob(p)

    for template_slot in (
        card.distinct_templates,
        card.top_template_coverage_pct,
        card.top_template_n,
        card.singleton_template_count,
    ):
        assert template_slot is None

    # Card otherwise renders fine. Lowercase labels under flat grammar.
    rendered = _render(card)
    assert "templates:" not in rendered
    assert "shape:" in rendered
567
+
568
+
569
def test_meaninglessness_floor_vanishes_templates_on_freeform(tmp_path) -> None:
    """Near-1:1 templates/lines → suppress Templates (better silent than
    vacuous '~480 distinct over 500 lines')."""
    p = tmp_path / "free.log"
    with p.open("w") as fh:
        # Each line structurally distinct.
        fh.writelines(
            f"line{i:04d} verb_{i % 7} adverb_{i % 11} "
            f"noun_{i % 13} {chr(65 + i % 26)}\n"
            for i in range(500)
        )

    card = blob_digest.summarize_blob(p)

    # On a deliberately-freeform input the floor should hit: either the
    # template count vanished, or its ratio to sampled lines is under the
    # floor.
    if card.distinct_templates is not None:
        ratio = card.distinct_templates / max(card.sampled_line_count, 1)
        assert ratio < blob_digest._TEMPLATE_RATIO_FLOOR
588
+
589
+
590
+ # ─── Renderer: no banner, no Lines: / Data found: rows in flat grammar ─────
591
+ #
592
+ # The old Lines: / Records: / Data found: banner rows lived on RunSummary
593
+ # and went away with it. Blob now uses its identity-line provenance and
594
+ # never participates in a banner.
595
+
596
+
597
def test_blob_card_has_no_banner_lines_or_data_found_rows() -> None:
    """The flat blob card has no banner rows at all."""
    banned_fragments = ("Lines:", "Records:", "Data found:", "LogHunter")
    for card in (_binary_blob_card(), _text_blob_card()):
        rendered = _render(card)
        for fragment in banned_fragments:
            assert fragment not in rendered
605
+
606
+
607
+ # ─── Renderer: flat-grammar vanish-don't-dash ───────────────────────────────
608
+
609
+
610
def test_render_binary_vanishes_text_slots() -> None:
    """Terminal binary magic → only the `bytes:` row remains; shape /
    lines / tokens / templates are absent (vanish-don't-dash)."""
    rendered = _render(_binary_blob_card())

    for expected in ("bytes:", "PNG image", "binary"):
        assert expected in rendered
    for vanished in ("shape:", "lines:", "tokens:", "templates:"):
        assert vanished not in rendered
621
+
622
+
623
def test_render_text_blob_shows_all_text_slots() -> None:
    """A fully populated text card renders every row label and the
    `[literal]` marker."""
    rendered = _render(_text_blob_card())
    row_labels = ("bytes:", "shape:", "lines:", "tokens:", "templates:")
    for row_label in row_labels:
        assert row_label in rendered
    assert "[literal]" in rendered
629
+
630
+
631
def test_render_text_blob_with_no_templates_vanishes_only_that_slot() -> None:
    """Without template data only the `templates:` row disappears."""
    rendered = _render(_text_blob_card(has_templates=False))
    for kept in ("shape:", "lines:", "tokens:"):
        assert kept in rendered
    assert "templates:" not in rendered
638
+
639
+
640
def test_render_no_footer_no_header_rule_no_trailing_sep() -> None:
    """Flat grammar: no `── digest · blob ─` header, no `inner_sep`, no
    `No parser claims…` footer, no trailing `_SEP`."""
    forbidden = ("── digest", "No parser claims", "─")
    for card in (_binary_blob_card(), _text_blob_card()):
        rendered = _render(card)
        for fragment in forbidden:
            assert fragment not in rendered
648
+
649
+
650
def test_render_headline_labels_guess() -> None:
    """Headline always labels itself as a guess. Under the flat grammar
    the headline is flush-left (no leading indent).

    Both cards come from in-memory fixtures, so no filesystem fixture is
    requested.
    """
    text_out = _render(_text_blob_card(shape_guess="JSON"))
    assert "looks like JSON" in text_out
    # Flush-left — no two-space indent prefix.
    headline = next(
        ln for ln in text_out.splitlines() if "looks like JSON" in ln
    )
    assert not headline.startswith(" ")

    bin_out = _render(_binary_blob_card())
    assert "looks like a PNG image, not a log" in bin_out
661
+
662
+
663
def test_render_provenance_line_plain_text() -> None:
    """Plain-text provenance reports the sampled count and reads."""
    rendered = _render(_text_blob_card())
    for phrase in ("sampled", "lines across", "reads"):
        assert phrase in rendered
669
+
670
+
671
def test_render_provenance_line_compressed() -> None:
    """Compressed provenance labels 'compressed' and 'sampled from head'."""
    # Clone the text fixture's attributes, flipping only is_compressed.
    attrs = dict(_text_blob_card().__dict__)
    attrs["is_compressed"] = True
    rendered = _render(BlobCard(**attrs))

    assert "compressed" in rendered
    assert "from head" in rendered
678
+
679
+
680
def test_render_provenance_line_terminal_binary() -> None:
    """Terminal-binary provenance: 'binary, sampled from head'. Does NOT
    count lines (a binary has no line concept)."""
    rendered = _render(_binary_blob_card())
    # Identity line 1 = source name; line 2 = provenance.
    provenance = rendered.splitlines()[1]

    assert "binary, sampled from head" in provenance
    assert "lines across" not in provenance
    assert "reads" not in provenance
689
+
690
+
691
def test_render_identity_line_is_source_name_flush_left() -> None:
    """Identity line 1 = card.source_name, flush-left, no banner above."""
    rendered_lines = _render(
        _text_blob_card(), source_name="mystery.txt"
    ).splitlines()
    assert rendered_lines[0] == "mystery.txt"
695
+
696
+
697
def test_round_2sf_helper_is_gone() -> None:
    """_round_2sf existed only for the 'Lines: sampled ~N' rendering that
    earlier work removed. Pin its removal so it does not creep back."""
    helper_present = hasattr(text_module, "_round_2sf")
    assert helper_present is False
701
+
702
+
703
+ # ─── JSON blob: `fields:` (names) replaces `tokens:` (record dump) ──────────
704
+ #
705
+ # Glob-digest of a Zeek directory routes every non-claimed NDJSON log
706
+ # (http, ssl, ssh, dhcp, ntp, weird, …) to the blob floor. The legacy
707
+ # tokens row dumped raw records (and sometimes mid-value garbage from
708
+ # whitespace splits inside string fields). The replacement lists
709
+ # top-level JSON KEY NAMES only — structural description, no values.
710
+
711
def _write_json_lines(path, lines: list[str]) -> None:
    """Write every entry of *lines* to *path* as one newline-terminated
    line, encoded as UTF-8 (replaces any previous file content)."""
    newline = "\n"
    payload = "".join([entry + newline for entry in lines])
    path.write_text(payload, encoding="utf-8")
713
+
714
+
715
def test_json_blob_renders_fields_row_not_tokens_row(tmp_path) -> None:
    """A JSON blob renders a `fields:` row and no `tokens:` row.

    The field list is the union of keys across rows in first-seen order,
    including an optional key that appears on only some rows.
    """
    log_path = tmp_path / "ssh.log"
    base_row = '{"ts": 1779750000.0, "uid": "C001", "id.orig_h": "192.0.2.10"}'
    row_with_optional_key = (
        '{"ts": 1779750001.0, "uid": "C002", "id.orig_h": "192.0.2.11",'
        ' "auth_attempts": 3}'
    )
    _write_json_lines(log_path, [base_row, row_with_optional_key] * 3)

    card = blob_digest.summarize_blob(log_path)
    assert card.shape_guess == "JSON"
    expected_names = ["ts", "uid", "id.orig_h", "auth_attempts"]
    assert card.json_field_names == expected_names

    rendered = _render(card, source_name="ssh.log")
    assert "fields:" in rendered
    assert "tokens:" not in rendered
    fields_line = next(
        row for row in rendered.splitlines() if row.startswith("fields:")
    )
    # Names are emitted comma-separated, in first-seen order.
    assert "ts, uid, id.orig_h, auth_attempts" in fields_line
736
+
737
+
738
def test_json_arrays_and_scalars_fall_back_to_tokens_row(tmp_path) -> None:
    """Top-level JSON arrays and scalars expose no object keys, so the
    field-name helper yields None and no `fields:` row is rendered."""
    non_object_rows = ['[1, 2, 3]', '42', '"hello"']
    log_path = tmp_path / "weird.log"
    _write_json_lines(log_path, non_object_rows * 5)
    card = blob_digest.summarize_blob(log_path)

    # Helper level: nothing to list.
    assert blob_digest._json_field_names(list(non_object_rows)) is None
    # Card level: the same None is carried through.
    assert card.json_field_names is None

    rendered = _render(card, source_name="weird.log")
    # Whether the shape guess lands on JSON or freeform depends on the
    # cascade; the invariant here is only that no `fields:` row appears.
    assert "fields:" not in rendered
754
+
755
+
756
def _json_keys_line(n: int) -> str:
    """Return one JSON object literal with N placeholder keys
    (`field_00`, `field_01`, …), each mapped to its own index."""
    members = [f'"field_{index:02d}": {index}' for index in range(n)]
    return "".join(("{", ", ".join(members), "}"))
760
+
761
+
762
def test_fields_row_clamps_to_two_lines_with_correct_remainder(tmp_path) -> None:
    """30 generic keys → exactly two lines, second hang-indented to the
    value column, ends with `… +N more`, and N equals total minus rendered."""
    import re

    p = tmp_path / "wide.log"
    _write_json_lines(p, [_json_keys_line(30)] * 5)
    card = blob_digest.summarize_blob(p)
    assert card.json_field_names is not None
    assert len(card.json_field_names) == 30

    out = _render(card, source_name="wide.log")
    lines = out.splitlines()
    fields_idx = next(i for i, ln in enumerate(lines) if ln.startswith("fields:"))
    line1 = lines[fields_idx]
    line2 = lines[fields_idx + 1]

    # The value column is shared by every slot on the card and is set by
    # the longest label, not by `fields`. Slicing by `len("fields: ")`
    # could land mid-padding, so measure the column on a single-line slot
    # instead: it is the first non-space position after the `:` of the
    # `bytes:` row.
    bytes_line = next(ln for ln in lines if ln.startswith("bytes:"))
    label_col = bytes_line.find(":") + 1
    while label_col < len(bytes_line) and bytes_line[label_col] == " ":
        label_col += 1

    # Line 2 hang-indents to label_col.
    assert line2.startswith(" " * label_col)
    # Line 2 ends with the truncation suffix and an accurate N.
    m = re.search(r"… \+(\d+) more$", line2)
    assert m is not None, f"expected `… +N more` suffix; got {line2!r}"
    n_more = int(m.group(1))

    # Count the field names that actually rendered across both lines.
    rendered_text = (
        line1[label_col:]
        + ", "
        + line2[label_col:].rsplit("… +", 1)[0].rstrip(", ")
    )
    rendered_names = [n for n in rendered_text.split(", ") if n.startswith("field_")]
    assert len(rendered_names) + n_more == 30
    # Each rendered name appears whole — no mid-name break.
    for name in rendered_names:
        assert name.startswith("field_") and len(name) == len("field_NN")
812
+
813
+
814
def test_fields_row_renders_single_line_no_suffix_when_narrow(tmp_path) -> None:
    """A four-key record yields exactly one `fields:` row, with no
    `more` wording and no ellipsis on it."""
    log_path = tmp_path / "narrow.log"
    _write_json_lines(log_path, [_json_keys_line(4)] * 5)
    card = blob_digest.summarize_blob(log_path)
    rendered_rows = _render(card, source_name="narrow.log").splitlines()

    fields_rows = [row for row in rendered_rows if row.startswith("fields:")]
    assert len(fields_rows) == 1
    (only_row,) = fields_rows
    assert "more" not in only_row
    assert "…" not in only_row
827
+
828
+
829
def test_fields_row_wrap_never_splits_a_name(tmp_path) -> None:
    """Wrap respects part boundaries — every rendered field name appears
    whole; none is cut at the line break or glued to stray characters.

    Two complementary checks run on the `fields:` row and the line after it:

    1. Extension check: a complete name followed directly by a word
       character must itself be a known name.
    2. Token check: every comma-separated token in the value text must be
       exactly one of the original names. This is the check that catches a
       name truncated by a mid-name wrap, which check 1 cannot see because
       its pattern only matches complete names.
    """
    import re

    p = tmp_path / "mixed.log"
    # Mix short and longer names; enough to wrap.
    names = (
        ["ts", "uid", "id.orig_h", "id.resp_h"]
        + [f"long_field_name_{i:02d}" for i in range(20)]
    )
    record = "{" + ", ".join(f'"{n}": {i}' for i, n in enumerate(names)) + "}"
    _write_json_lines(p, [record] * 3)
    card = blob_digest.summarize_blob(p)
    out = _render(card, source_name="mixed.log")
    lines = out.splitlines()
    fields_idx = next(i for i, ln in enumerate(lines) if ln.startswith("fields:"))
    following = lines[fields_idx + 1] if fields_idx + 1 < len(lines) else ""
    rendered = lines[fields_idx] + " " + following

    # Check 1: an extension of `name` by a word char is only OK if the
    # extended name is itself an emitted name (e.g. "ts" prefix of
    # "ts_extra"). The names list has no such overlap, so flag it.
    for name in names:
        for m in re.finditer(re.escape(name) + r"(\w)", rendered):
            extended = name + m.group(1)
            assert extended in names, (
                f"field name appears to be split: {name!r} extended by "
                f"{m.group(1)!r} in rendered output"
            )

    # Check 2: tokenise the value text. The first line's value is whatever
    # follows the label's colon; the next line only belongs to the row when
    # it is hang-indented (a flush-left line would be a different slot).
    value_text = lines[fields_idx].split(":", 1)[1]
    if following.startswith(" "):
        value_text += ", " + following
    # Drop the `… +N more` truncation suffix, if present, before splitting.
    value_text = re.sub(r"…\s*\+\d+ more\s*$", "", value_text)
    tokens = [tok.strip() for tok in value_text.split(",") if tok.strip()]
    assert tokens, "fields row rendered no names"
    for token in tokens:
        assert token in names, (
            f"rendered token {token!r} is not a complete field name"
        )
866
+
867
+
868
+ # ─── Cross-helper regression: schema-card values stay UN-clamped ────────────
869
+
870
def test_schema_card_long_value_renders_full_no_truncation() -> None:
    """Schema-card values are rendered un-clamped.

    A densest-tuple flow longer than 80 characters on a conn DigestCard
    must come out of TextHandler.render_digest in full: the whole string
    present, no `… +` suffix anywhere, and no `more` in the text that
    follows the first occurrence of the flow. This pins that the blob
    card's two-line clamp has not leaked into schema-card rendering.
    """
    from datetime import datetime, timezone
    from loghunter.common.finding import DigestCard, DigestSlot

    long_flow = (
        "192.0.2.10:51234 → 198.51.100.20:443 (very-long-tag-padding-out-"
        "to-exceed-the-eighty-col-line-frame-on-purpose)"
    )
    assert len(long_flow) > 80

    densest_slot = DigestSlot(
        label="densest-tuple",
        statistic="cliff",
        cells=[long_flow, "482", "3.7x"],
        entity=long_flow,
        magnitude=482.0,
        ratio=3.7,
    )
    instant = datetime(2026, 6, 14, tzinfo=timezone.utc)
    card = DigestCard(
        schema="conn",
        source_name="conn.log",
        data_window=(instant, instant),
        record_count=1000,
        histogram_counts=[1, 2, 3],
        histogram_unit="hr",
        histogram_peak=3,
        zone1_extras=[("hosts", "5")],
        insights=[],
        fields=[densest_slot],
        data_size_bytes=0,
    )

    sink = io.StringIO()
    handler = TextHandler(stream=sink)
    handler.render_digest(card)
    rendered = sink.getvalue()

    # The full flow string survives and no truncation suffix is emitted.
    assert long_flow in rendered
    assert "… +" not in rendered
    text_after_flow = rendered.split(long_flow)[1]
    assert "more" not in text_after_flow
911
+
912
+
913
+ # ─── Runner fold + CLI invariants ───────────────────────────────────────────
914
+
915
+
916
def test_run_digest_blob_no_longer_exists() -> None:
    """The runner module exposes no `_run_digest_blob`; blob handling is
    folded into run_digest's terminal branch."""
    standalone_present = hasattr(runner, "_run_digest_blob")
    assert not standalone_present
920
+
921
+
922
def test_cli_digest_blob_token_is_rejected(monkeypatch) -> None:
    """`digest blob` no longer selects a schema — the CLI exits 1."""

    def _stub_load(_path):
        return {"loghunter": {}}

    monkeypatch.setattr(cli.cfg, "load", _stub_load)
    # "blob" is treated as a path; that path does not exist → errored=1.
    exit_code = cli._main(["digest", "blob"])
    assert exit_code == 1
928
+
929
+
930
def test_cli_sniff_floor_routes_to_blob_path(tmp_path, monkeypatch) -> None:
    """An unrecognized text positional reaches run_digest with
    schema="blob" and blob_path set to that file, and with every
    source-directory argument left as None."""
    seen_kwargs: dict[str, Any] = {}

    def _record_call(**kwargs: Any) -> None:
        seen_kwargs.update(kwargs)

    def _stub_load(_path):
        return {"loghunter": {}}

    monkeypatch.setattr(runner, "run_digest", _record_call)
    monkeypatch.setattr(cli.cfg, "load", _stub_load)

    mystery = tmp_path / "mystery.txt"
    mystery.write_text("hello world\nlorem ipsum\n")
    cli._main(["digest", str(mystery)])

    assert seen_kwargs.get("schema") == "blob"
    assert seen_kwargs.get("blob_path") == mystery
    for dir_key in ("zeek_dir", "pihole_dir", "syslog_dir", "cloudtrail_dir"):
        assert seen_kwargs.get(dir_key) is None
950
+
951
+
952
def test_cli_blob_path_not_advertised(monkeypatch) -> None:
    """`--blob-path` is rejected as an unknown flag, so an operator cannot
    set blob_path directly from the command line."""

    def _stub_load(_path):
        return {"loghunter": {}}

    monkeypatch.setattr(cli.cfg, "load", _stub_load)
    argv = ["digest", "--blob-path=/tmp/x"]
    with pytest.raises(ValueError, match=r"unknown flag --blob-path"):
        cli._main(argv)
958
+
959
+
960
def test_run_digest_blob_requires_blob_path() -> None:
    """schema="blob" without a blob_path raises ValueError ("PATH not provided")."""
    call_kwargs = {"config": {"loghunter": {}}, "schema": "blob"}
    with pytest.raises(ValueError, match=r"PATH not provided"):
        runner.run_digest(**call_kwargs)
963
+
964
+
965
def test_run_digest_blob_rejects_directory(tmp_path) -> None:
    """Passing a directory as blob_path raises ValueError ("not a file");
    the blob terminal branch works on a single file only."""
    directory = tmp_path
    with pytest.raises(ValueError, match=r"not a file"):
        runner.run_digest(
            config={"loghunter": {}},
            schema="blob",
            blob_path=directory,
        )
972
+
973
+
974
def test_run_digest_blob_path_only_valid_for_blob_schema(tmp_path) -> None:
    """Supplying blob_path together with a non-blob schema (here "conn")
    raises ValueError."""
    stray_path = tmp_path / "input.txt"
    expected_message = r"blob_path is only valid for the blob schema"
    with pytest.raises(ValueError, match=expected_message):
        runner.run_digest(
            config={"loghunter": {}},
            schema="conn",
            blob_path=stray_path,
        )
984
+
985
+
986
+ # ─── Recognized-but-empty seam ──────────────────────────────────────────────
987
+
988
+
989
def test_digest_empty_is_not_value_error_subclass() -> None:
    """DigestEmpty must not derive from ValueError.

    It is a control signal rather than an error; real per-path failures
    are ValueErrors, and a handler for those must not swallow DigestEmpty.
    """
    derives_from_value_error = issubclass(DigestEmpty, ValueError)
    assert not derives_from_value_error
994
+
995
+
996
def test_digest_empty_carries_basename_and_schema() -> None:
    """DigestEmpty exposes basename and schema as attributes and includes
    both in its string form."""
    signal = DigestEmpty(basename="conn.log", schema="conn")
    assert signal.basename == "conn.log"
    assert signal.schema == "conn"
    message = str(signal)
    assert "conn.log" in message
    assert "conn" in message
1002
+
1003
+
1004
def test_run_digest_raises_digest_empty_on_empty_frame(
    tmp_path, monkeypatch,
) -> None:
    """When the loader hands back an empty DataFrame for the conn pattern,
    run_digest raises DigestEmpty carrying the directory's basename and
    the schema, instead of producing a zero-row card."""
    from loghunter.common import loader

    empty_result = loader.LoadResult(
        logs={"conn*.log*": pd.DataFrame()},
        record_counts={},
        data_window=None,
        warnings=[],
        data_size_bytes=0,
    )

    def _stub_load_required_logs(*_args, **_kwargs):
        return empty_result

    monkeypatch.setattr(loader, "load_required_logs", _stub_load_required_logs)

    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    with pytest.raises(DigestEmpty) as exc_info:
        runner.run_digest(
            config={"loghunter": {}},
            schema="conn",
            zeek_dir=zeek_dir,
        )
    raised = exc_info.value
    assert raised.basename == "zeek"
    assert raised.schema == "conn"
1028
+
1029
+
1030
def test_run_digest_raises_digest_empty_when_frame_missing(
    tmp_path, monkeypatch,
) -> None:
    """A LoadResult whose `logs` mapping lacks the pattern key entirely is
    treated as empty too: run_digest raises DigestEmpty."""
    from loghunter.common import loader

    # No entry for the conn pattern at all.
    keyless_result = loader.LoadResult(
        logs={},
        record_counts={},
        data_window=None,
        warnings=[],
        data_size_bytes=0,
    )

    def _stub_load_required_logs(*_args, **_kwargs):
        return keyless_result

    monkeypatch.setattr(loader, "load_required_logs", _stub_load_required_logs)

    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    with pytest.raises(DigestEmpty):
        runner.run_digest(
            config={"loghunter": {}},
            schema="conn",
            zeek_dir=zeek_dir,
        )
1052
+
1053
+
1054
def test_cli_fanout_recognized_empty_narrates_and_exits_zero(
    tmp_path, monkeypatch, capsys,
) -> None:
    """Fan-out path: when run_digest raises DigestEmpty the CLI writes the
    'recognized as … but no parseable records' note to stderr and exits 0."""
    import loghunter.common.loader as loader_mod

    def _stub_load(_path):
        return {"loghunter": {}}

    def _raise_empty(**kwargs):
        raise DigestEmpty(basename="conn.log", schema="conn")

    def _classify_as_conn(p):
        # Force the sniff step to classify the file as conn.
        return loader_mod.SniffResult(
            state="classified", schema="conn", origin="zeek",
        )

    monkeypatch.setattr(cli.cfg, "load", _stub_load)
    monkeypatch.setattr(runner, "run_digest", _raise_empty)

    conn_file = tmp_path / "conn.log"
    conn_file.write_text("#fields\tts\tsrc\n")
    monkeypatch.setattr(loader_mod, "sniff_format_detailed", _classify_as_conn)

    exit_code = cli._main(["digest", str(conn_file)])
    stderr_text = capsys.readouterr().err
    assert "recognized as conn but no parseable records" in stderr_text
    assert exit_code == 0
1082
+
1083
+
1084
def test_cli_bare_config_recognized_empty_narrates_and_exits_zero(
    tmp_path, monkeypatch, capsys,
) -> None:
    """Bare-config path (no positional argument): DigestEmpty raised by
    run_digest is narrated on stderr and the CLI exits 0."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    def _stub_load(_path):
        return {"loghunter": {"zeek_dir": str(zeek_dir)}}

    def _raise_empty(**kwargs):
        raise DigestEmpty(basename=zeek_dir.name, schema="conn")

    monkeypatch.setattr(cli.cfg, "load", _stub_load)
    monkeypatch.setattr(runner, "run_digest", _raise_empty)

    exit_code = cli._main(["digest"])
    stderr_text = capsys.readouterr().err
    assert "recognized as conn but no parseable records" in stderr_text
    assert exit_code == 0
1105
+
1106
+
1107
def test_cli_fanout_corrupt_gzip_exits_clean(tmp_path, monkeypatch, capsys) -> None:
    """REGRESSION: a `.gz` file that is not a gzip stream is handled as a
    per-path failure on the fan-out path.

    Expected outcome: exit code 1, the file name mentioned on stderr, and
    no Python traceback text in stderr.
    """

    def _stub_load(_path):
        return {"loghunter": {}}

    monkeypatch.setattr(cli.cfg, "load", _stub_load)

    broken = tmp_path / "broken.gz"
    broken.write_bytes(b"not a real gzip stream, just text")

    exit_code = cli._main(["digest", str(broken)])
    stderr_text = capsys.readouterr().err
    assert exit_code == 1
    # The per-path message names the offending file.
    assert "broken.gz" in stderr_text
    # And no Python traceback markers.
    assert "Traceback" not in stderr_text
1124
+
1125
+
1126
def test_cli_bare_config_corrupt_gzip_handled_gracefully(
    tmp_path, monkeypatch, capsys,
) -> None:
    """REGRESSION: a corrupt `conn.log.gz` inside the configured zeek_dir
    does not surface as a traceback on the bare-config path.

    Expected outcome: exit code 0, no traceback text, a skip notice for
    the unreadable file, and the recognized-but-empty narration on stderr.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    corrupt = zeek_dir / "conn.log.gz"
    corrupt.write_bytes(b"not gzip, just text")

    def _stub_load(_path):
        return {
            "loghunter": {
                "zeek_dir": str(zeek_dir),
                "default_window": "all",
            }
        }

    monkeypatch.setattr(cli.cfg, "load", _stub_load)

    exit_code = cli._main(["digest"])
    stderr_text = capsys.readouterr().err
    assert exit_code == 0
    assert "Traceback" not in stderr_text
    # Either wording of the graceful-skip notice is accepted, followed by
    # the recognized-empty narration.
    assert "could not be read" in stderr_text or "incomplete or corrupt" in stderr_text
    assert "recognized as conn but no parseable records" in stderr_text
1155
+
1156
+
1157
def test_main_oserror_arm_translates_to_exit_one(monkeypatch, capsys) -> None:
    """Defensive: an OSError escaping `_main()` is turned by `main()` into
    SystemExit(1) with a 'loghunter:' message on stderr, not a traceback.

    `_main` is replaced wholesale so the arm is pinned independently of
    the digest code paths.
    """

    def _raise_os_error(_argv):
        raise OSError("synthetic disk failure for test")

    monkeypatch.setattr(cli, "_main", _raise_os_error)
    with pytest.raises(SystemExit) as exit_info:
        cli.main(["digest", "/nonexistent"])
    assert exit_info.value.code == 1

    stderr_text = capsys.readouterr().err
    assert "Traceback" not in stderr_text
    for expected in ("loghunter:", "synthetic disk failure"):
        assert expected in stderr_text
1173
+
1174
+
1175
def test_cli_fanout_value_error_still_exits_one(tmp_path, monkeypatch, capsys) -> None:
    """A ValueError raised by run_digest on the fan-out path is reported
    on stderr and, with nothing rendered, the CLI exits 1."""
    import loghunter.common.loader as loader_mod

    def _stub_load(_path):
        return {"loghunter": {}}

    def _raise_value_error(**kwargs):
        raise ValueError("simulated parser failure")

    def _classify_as_blob(p):
        return loader_mod.SniffResult(
            state="classified", schema="blob", origin=None,
        )

    monkeypatch.setattr(cli.cfg, "load", _stub_load)
    monkeypatch.setattr(runner, "run_digest", _raise_value_error)

    data_file = tmp_path / "data.log"
    data_file.write_text("hello world\n")
    monkeypatch.setattr(loader_mod, "sniff_format_detailed", _classify_as_blob)

    exit_code = cli._main(["digest", str(data_file)])
    stderr_text = capsys.readouterr().err
    assert "simulated parser failure" in stderr_text
    assert exit_code == 1
1200
+
1201
+
1202
+ # ─── End-to-end: run_digest blob terminal branch through the renderer ──────
1203
+
1204
+
1205
def test_run_digest_blob_end_to_end_renders_card(tmp_path, capsys) -> None:
    """Calling run_digest with schema="blob" on a real file writes a flat
    blob card to stdout."""
    free_log = tmp_path / "free.log"
    free_log.write_text("alpha beta gamma\ndelta epsilon\n" * 200)

    runner.run_digest(
        config={"loghunter": {}},
        schema="blob",
        blob_path=free_log,
    )
    rendered = capsys.readouterr().out

    # Flat card: line 1 is the source basename, the headline marks the
    # source as unrecognized, and the lowercase `bytes:` label is present.
    assert rendered.splitlines()[0] == "free.log"
    assert "Unrecognized source" in rendered
    assert "bytes:" in rendered
    # The flat grammar has no banner, no header rule, and no footer.
    for absent in ("LogHunter", "── digest", "No parser claims"):
        assert absent not in rendered
1224
+
1225
+
1226
+ def test_run_digest_blob_dry_run_skips_render(tmp_path, capsys) -> None:
1227
+ """Dry-run prints a plan note and returns without sampling or rendering."""
1228
+ f = tmp_path / "free.log"
1229
+ f.write_text("hello\n")
1230
+ runner.run_digest(
1231
+ config={"loghunter": {}}, schema="blob", blob_path=f, dry_run=True,
1232
+ )
1233
+ out = capsys.readouterr().out
1234
+ assert "digest dry run" in out
1235
+ assert "schema:" in out and "blob" in out
1236
+ assert "── digest · blob" not in out
1237
+ assert "No parser claims" not in out