loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,1237 @@
|
|
|
1
|
+
"""Tests for the blob digest path — describes unrecognized bytes.
|
|
2
|
+
|
|
3
|
+
Pins the architectural rails of Gate 2:
|
|
4
|
+
- O(sample): the profiler reads a bounded sample, never the whole file.
|
|
5
|
+
- Zero field extraction: no timestamp, no fields — bytes and shape-guesses.
|
|
6
|
+
- Shared banner: blob's RunSummary routes through _render_run_summary like
|
|
7
|
+
schema cards, via the additive record_label / data_window seams.
|
|
8
|
+
- Vanish-don't-dash: optional slots that don't apply are omitted entirely.
|
|
9
|
+
- Sniff-only entry: blob is reached via the sniff floor, never an operator
|
|
10
|
+
token.
|
|
11
|
+
|
|
12
|
+
All synthetic content. Per the project's data-privacy rule, no real network
|
|
13
|
+
artifacts.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import gzip
|
|
19
|
+
import io
|
|
20
|
+
import math
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import pandas as pd
|
|
25
|
+
import pytest
|
|
26
|
+
|
|
27
|
+
import loghunter.cli as cli
|
|
28
|
+
import loghunter.outputs.text as text_module
|
|
29
|
+
import loghunter.runner as runner
|
|
30
|
+
from loghunter.common.errors import DigestEmpty
|
|
31
|
+
from loghunter.common.finding import BlobCard, RunSummary
|
|
32
|
+
from loghunter.digest import blob as blob_digest
|
|
33
|
+
from loghunter.outputs.text import TextHandler
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# ─── Helpers ────────────────────────────────────────────────────────────────
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _render(card: BlobCard, source_name: str = "mystery.txt") -> str:
    """Return the text produced by ``TextHandler.render_blob`` for ``card``.

    ``card.source_name`` is overwritten with ``source_name`` before rendering
    (the card is mutated in place), so fixtures render under a known name.
    """
    card.source_name = source_name
    buffer = io.StringIO()
    TextHandler(stream=buffer, verbose_level=0).render_blob(card)
    return buffer.getvalue()
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _binary_blob_card() -> BlobCard:
    """Build a BlobCard resembling a terminal-magic binary hit (a PNG).

    ``shape_guess`` is ``None`` and no line/token/template fields are passed,
    which is the "binary" fixture the renderer tests rely on.
    """
    png_signature = b"\x89PNG\r\n\x1a\n"
    fields = {
        "source_name": "mystery.bin",
        "byte_size": 4096,
        "sampled_line_count": 0,
        "sample_read_count": 1,
        "is_compressed": False,
        "printable_pct": 0.1,
        "nonprintable_pct": 99.9,
        "utf8_clean": False,
        "file_type_guess": "PNG image",
        "file_type_magic": png_signature,
        "shape_guess": None,
    }
    return BlobCard(**fields)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _text_blob_card(
    *,
    shape_guess: str = "freeform text",
    sampled_line_count: int = 1000,
    utf8_clean: bool = True,
    has_templates: bool = True,
    has_tokens: bool = True,
) -> BlobCard:
    """Build a BlobCard resembling the text path with its text slots filled.

    ``has_tokens=False`` sets ``top_tokens`` to ``None``;
    ``has_templates=False`` sets all four template fields to ``None``.
    """
    tokens = None
    if has_tokens:
        # Five token names, each with the same count of 200.
        tokens = [
            (name, 200)
            for name in ("level", "ts", "msg", "service", "trace_id")
        ]

    template_fields = {
        "distinct_templates": 140,
        "top_template_coverage_pct": 78.0,
        "top_template_n": 6,
        "singleton_template_count": 12,
    }
    if not has_templates:
        template_fields = dict.fromkeys(template_fields)

    return BlobCard(
        source_name="mystery.txt",
        byte_size=64_000,
        sampled_line_count=sampled_line_count,
        sample_read_count=6,
        is_compressed=False,
        printable_pct=99.7,
        nonprintable_pct=0.3,
        utf8_clean=utf8_clean,
        file_type_guess=None,
        file_type_magic=None,
        shape_guess=shape_guess,
        mean_line_length=412.0,
        median_line_length=400.0,
        line_length_p95=980,
        max_line_length=4201,
        line_length_stdev=120.0,
        line_length_shape="varied",
        top_tokens=tokens,
        **template_fields,
    )
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ─── O(sample) rail (PRIMARY: deterministic byte counter) ───────────────────
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def test_o_sample_rail_bounded_byte_budget(tmp_path, monkeypatch) -> None:
    """PRIMARY O(sample) rail: bytes pulled off disk stay within a fixed budget.

    ``Path.open`` is wrapped so that binary-mode handles on the fixture file
    count every byte returned by ``read()`` and ``readline()``. The combined
    total must not exceed ``_HEAD_BYTES + _SEEK_COUNT * _SEEK_BYTES``. This
    is a deterministic byte count, not a wall-clock smoke test.
    """
    big = tmp_path / "big.log"
    # 80,000 lines of 65 bytes (64 x "a" + newline), about 5 MB. This is
    # meant to sit well above _SEEK_MIN_SIZE so the seek path is exercised.
    big.write_bytes((b"a" * 64 + b"\n") * 80_000)

    pulled = {"read": 0, "readline": 0}
    original_open = Path.open

    def spy_open(self, mode="r", *args, **kwargs):
        handle = original_open(self, mode, *args, **kwargs)
        # Only instrument binary-mode handles on the fixture file.
        if self != big or "b" not in mode:
            return handle
        inner_read = handle.read
        inner_readline = handle.readline

        def counted_read(n=-1):
            chunk = inner_read(n)
            pulled["read"] += len(chunk)
            return chunk

        def counted_readline(*a, **kw):
            line = inner_readline(*a, **kw)
            pulled["readline"] += len(line)
            return line

        handle.read = counted_read
        handle.readline = counted_readline
        return handle

    monkeypatch.setattr(Path, "open", spy_open)

    card = blob_digest.summarize_blob(big)

    # Hard budget with no slack for readline: the seek skip is expected to
    # be a bounded read rather than a readline scan.
    budget = (
        blob_digest._HEAD_BYTES
        + blob_digest._SEEK_COUNT * blob_digest._SEEK_BYTES
    )
    bytes_read = pulled["read"]
    readline_bytes = pulled["readline"]
    total = bytes_read + readline_bytes
    assert total <= budget, (
        f"profiler pulled {total:,} bytes "
        f"(read={bytes_read:,}, readline={readline_bytes:,}); "
        f"budget {budget:,}"
    )
    # The card must still characterise the file.
    assert card.shape_guess is not None
    assert card.sampled_line_count > 0
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def test_o_sample_rail_holds_on_long_line_no_newline_file(tmp_path) -> None:
    """REGRESSION: the byte budget must hold for a 5 MB single-line file.

    An earlier implementation discarded the partial first line after each
    seek with ``fh.readline()``. On a single-line file that scanned to EOF,
    pulling about 13 MB through ``readline()``, which a ``read()``-only spy
    could not see.

    The fix is a hard-bounded seek window: read exactly ``_SEEK_BYTES``,
    find the first newline inside it and return the slice after it. If the
    window has no newline, return empty and let the head sample carry the
    cascade.

    Both ``read()`` and ``readline()`` are counted here. Their sum must stay
    within budget and ``readline()`` must contribute nothing. Long lines are
    a real shape (minified logs, single-line dumps, some export formats).
    """
    big = tmp_path / "longline.log"
    # One 5 MiB line; the only newline is the final byte.
    big.write_bytes(b"a" * (5 * 1024 * 1024) + b"\n")

    pulled = {"read": 0, "readline": 0}
    original_open = Path.open

    def spy_open(self, mode="r", *args, **kwargs):
        handle = original_open(self, mode, *args, **kwargs)
        # Only instrument binary-mode handles on the fixture file.
        if self != big or "b" not in mode:
            return handle
        inner_read = handle.read
        inner_readline = handle.readline

        def counted_read(n=-1):
            chunk = inner_read(n)
            pulled["read"] += len(chunk)
            return chunk

        def counted_readline(*a, **kw):
            line = inner_readline(*a, **kw)
            pulled["readline"] += len(line)
            return line

        handle.read = counted_read
        handle.readline = counted_readline
        return handle

    import unittest.mock

    # The patch is active only for the duration of the summarize call.
    with unittest.mock.patch.object(Path, "open", spy_open):
        card = blob_digest.summarize_blob(big)

    # Hard budget: the head once plus at most _SEEK_BYTES per seek.
    budget = (
        blob_digest._HEAD_BYTES
        + blob_digest._SEEK_COUNT * blob_digest._SEEK_BYTES
    )
    bytes_read = pulled["read"]
    readline_bytes = pulled["readline"]
    total = bytes_read + readline_bytes
    assert total <= budget, (
        f"long-line file pulled {total:,} bytes "
        f"(read={bytes_read:,}, readline={readline_bytes:,}); "
        f"budget {budget:,}"
    )
    # The fix is "no readline at all", so its counter must stay at zero.
    assert readline_bytes == 0
    # A card is still produced even when every body chunk is empty.
    assert isinstance(card, BlobCard)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
def test_determinism_same_file_yields_identical_card(tmp_path) -> None:
    """Summarizing the same file twice yields matching card fields.

    Guards against unseeded randomness in sampling: seek offsets are expected
    to derive from file size, so the same file gives the same sample.
    """
    p = tmp_path / "log.txt"
    p.write_bytes(
        b"".join(
            f"event {i} payload alpha beta gamma\n".encode()
            for i in range(20_000)
        )
    )

    first = blob_digest.summarize_blob(p)
    second = blob_digest.summarize_blob(p)

    compared_fields = (
        "sampled_line_count",
        "sample_read_count",
        "top_tokens",
        "mean_line_length",
        "line_length_p95",
        "shape_guess",
    )
    for field in compared_fields:
        assert getattr(first, field) == getattr(second, field), field
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
# ─── Magic-byte identification ──────────────────────────────────────────────
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def test_terminal_magic_png_skips_text_cascade(tmp_path) -> None:
    """A PNG signature is reported as the file type and text slots stay None."""
    png_signature = b"\x89PNG\r\n\x1a\n"
    p = tmp_path / "img.png"
    # PNG signature followed by an arbitrary binary tail.
    p.write_bytes(png_signature + bytes(range(256)) * 64)

    card = blob_digest.summarize_blob(p)

    assert card.file_type_guess == "PNG image"
    assert card.file_type_magic == png_signature
    # Text slots vanish on a terminal-magic hit.
    for text_slot in (
        card.shape_guess,
        card.mean_line_length,
        card.top_tokens,
        card.distinct_templates,
    ):
        assert text_slot is None
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
@pytest.mark.parametrize(
    ("magic", "label"),
    [
        (b"%PDF-1.4\n", "PDF document"),
        (b"\x7fELF\x02\x01\x01", "ELF binary"),
        (b"PK\x03\x04stuff", "zip archive"),
    ],
)
def test_terminal_magic_set_identifies(tmp_path, magic, label) -> None:
    """Each listed leading signature yields its label and no text slots."""
    target = tmp_path / "f.bin"
    binary_tail = bytes(range(256)) * 16
    target.write_bytes(magic + binary_tail)

    card = blob_digest.summarize_blob(target)

    assert card.file_type_guess == label
    assert card.shape_guess is None
    assert card.mean_line_length is None
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def test_container_gzip_decompresses_and_profiles_content(tmp_path) -> None:
    """gzip is treated as a container: the content underneath is profiled.

    The card is flagged compressed, the shape comes from the decompressed
    JSON lines, and no terminal file-type label is set.
    """
    archive = tmp_path / "data.log.gz"
    with gzip.open(archive, "wt", encoding="utf-8") as out:
        out.writelines(
            f'{{"event": "login", "user": "u{n}"}}\n' for n in range(1000)
        )

    card = blob_digest.summarize_blob(archive)

    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    # No terminal magic label: gzip is a container, not a final answer.
    assert card.file_type_guess is None
    # byte_size is the compressed on-disk size, not the decompressed total.
    assert card.byte_size == archive.stat().st_size
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def test_container_bz2_decompresses_and_profiles_content(tmp_path) -> None:
    """bzip2 is treated as a container: the content underneath is profiled."""
    import bz2 as bz2_mod

    archive = tmp_path / "data.log.bz2"
    with bz2_mod.open(archive, "wt", encoding="utf-8") as out:
        out.writelines(
            f'{{"event": "login", "user": "u{n}"}}\n' for n in range(1000)
        )

    card = blob_digest.summarize_blob(archive)

    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    assert card.file_type_guess is None
    # byte_size is the compressed on-disk size.
    assert card.byte_size == archive.stat().st_size
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def test_container_xz_decompresses_and_profiles_content(tmp_path) -> None:
    """xz is treated as a container: the content underneath is profiled."""
    import lzma as lzma_mod

    archive = tmp_path / "data.log.xz"
    with lzma_mod.open(archive, "wt", encoding="utf-8") as out:
        out.writelines(
            f'{{"event": "login", "user": "u{n}"}}\n' for n in range(1000)
        )

    card = blob_digest.summarize_blob(archive)

    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    assert card.file_type_guess is None
    # byte_size is the compressed on-disk size.
    assert card.byte_size == archive.stat().st_size
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def test_misnamed_xz_routes_by_magic_not_suffix(tmp_path) -> None:
    """xz data saved without a ``.xz`` suffix is still profiled as compressed.

    The file is named ``mystery.log``, so a correct result shows that
    routing is driven by the leading magic bytes, not the file name.
    Otherwise only correctly suffixed containers would work and the bz2/xz
    magic-table entries would be vestigial.
    """
    import lzma as lzma_mod

    disguised = tmp_path / "mystery.log"
    with lzma_mod.open(disguised, "wt", encoding="utf-8") as out:
        out.writelines(
            f'{{"event": "login", "user": "u{n}"}}\n' for n in range(1000)
        )

    card = blob_digest.summarize_blob(disguised)

    # Expected path: xz magic "\xfd7zXZ\x00" is detected, opener is lzma.open.
    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    assert card.file_type_guess is None
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def test_misnamed_bz2_routes_by_magic_not_suffix(tmp_path) -> None:
    """bzip2 data saved as ``unknown.dat`` is still profiled as compressed.

    Identification is expected to come from the "BZh" magic, not the suffix.
    """
    import bz2 as bz2_mod

    disguised = tmp_path / "unknown.dat"
    with bz2_mod.open(disguised, "wt", encoding="utf-8") as out:
        out.writelines(
            f'{{"event": "login", "user": "u{n}"}}\n' for n in range(1000)
        )

    card = blob_digest.summarize_blob(disguised)

    assert card.is_compressed is True
    assert card.shape_guess == "JSON"
    assert card.file_type_guess is None
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
# ─── Shape cascade ──────────────────────────────────────────────────────────
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
def test_shape_cascade_json(tmp_path) -> None:
    """A file of one JSON object per line gets the shape guess "JSON"."""
    p = tmp_path / "j.log"
    p.write_text("".join(f'{{"k": {i}, "v": "x"}}\n' for i in range(500)))

    card = blob_digest.summarize_blob(p)

    assert card.shape_guess == "JSON"
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def test_shape_cascade_csv_recognized_from_body_not_header(tmp_path) -> None:
    """One comma-rich header over a comma-free body must not read as CSV.

    The cascade is expected to prefer body (seek) lines over the header.
    """
    header = "a,b,c,d,e,f,g,h\n"
    body_line = "this is a freeform line with no commas in it\n"

    p = tmp_path / "fake.csv"
    # 20,000 body lines, meant to be large enough to trigger seeks
    # (above _SEEK_MIN_SIZE).
    p.write_text(header + body_line * 20_000)

    card = blob_digest.summarize_blob(p)

    assert card.shape_guess != "CSV"
    # Probably freeform; at minimum no CSV-flavoured label may appear.
    assert "CSV" not in (card.shape_guess or "")
|
|
405
|
+
|
|
406
|
+
|
|
407
|
+
def test_shape_cascade_tsv_recognized_from_body(tmp_path) -> None:
    """Tab-separated body lines get a "TSV" guess with a "~N columns" hint."""
    # Each line is seven "x" fields plus a row counter: 7 tabs, 8 fields.
    fixed_fields = "\t".join(["x"] * 7)
    p = tmp_path / "data.tsv"
    p.write_text("".join(f"{fixed_fields}\t{i}\n" for i in range(2000)))

    card = blob_digest.summarize_blob(p)

    guess = card.shape_guess
    assert guess is not None
    assert "TSV" in guess
    assert "~" in guess and "columns" in guess
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def test_shape_cascade_html(tmp_path) -> None:
    """A file of repeated tag markup gets the shape guess "HTML/XML"."""
    p = tmp_path / "page.html"
    p.write_text("<div><span>some content</span></div>\n" * 500)

    card = blob_digest.summarize_blob(p)

    assert card.shape_guess == "HTML/XML"
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def test_shape_cascade_key_value(tmp_path) -> None:
    """Lines of ``key=value`` pairs get the shape guess "key-value text"."""
    p = tmp_path / "kv.log"
    p.write_text(
        "".join(
            f"key1=val{i} key2=alpha key3=beta key4=gamma\n"
            for i in range(500)
        )
    )

    card = blob_digest.summarize_blob(p)

    assert card.shape_guess == "key-value text"
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def test_shape_cascade_freeform(tmp_path) -> None:
    """Plain prose sentences get the shape guess "freeform text"."""
    p = tmp_path / "free.log"
    p.write_text(
        "".join(
            f"plain prose sentence number {i} with no structure.\n"
            for i in range(1000)
        )
    )

    card = blob_digest.summarize_blob(p)

    assert card.shape_guess == "freeform text"
|
|
448
|
+
|
|
449
|
+
|
|
450
|
+
# ─── Char-class / UTF-8 honesty ────────────────────────────────────────────
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def test_char_class_flags_nonprintable_in_sample(tmp_path) -> None:
    """A binary-heavy sample with no known magic has a low printable share.

    The char-class split is expected to be computed over the raw sample
    bytes before any decode. The two percentages must also sum to 100.
    """
    p = tmp_path / "binary.bin"
    # Six non-printable byte values repeated; no recognised magic prefix.
    p.write_bytes(b"\x01\x02\x03\xfe\xfd\xfc" * 1024)

    card = blob_digest.summarize_blob(p)

    assert card.printable_pct < 20.0
    assert card.nonprintable_pct > 80.0
    pct_sum = card.printable_pct + card.nonprintable_pct
    assert math.isclose(pct_sum, 100.0, abs_tol=0.01)
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def test_utf8_clean_true_on_ascii(tmp_path) -> None:
    """Pure ASCII content is reported as UTF-8 clean."""
    ascii_log = tmp_path / "ascii.log"
    ascii_log.write_text("hello world\n" * 100)

    assert blob_digest.summarize_blob(ascii_log).utf8_clean is True
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def test_utf8_clean_false_on_latin1_high_bytes(tmp_path) -> None:
    """Bytes that fail strict UTF-8 decoding set ``utf8_clean`` to False.

    The rendered card then omits the "UTF-8 clean" tail but still shows
    the printable percentage.
    """
    p = tmp_path / "latin1.log"
    # 0xC0 is not a valid UTF-8 lead byte, and 0xA3 is a stray continuation.
    p.write_bytes(b"hello \xc0\xa3 world\n" * 200)

    card = blob_digest.summarize_blob(p)
    assert card.utf8_clean is False

    rendered = _render(card)
    assert "UTF-8 clean" not in rendered
    assert "% printable" in rendered
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def test_utf8_clean_true_renders_clean_tail(tmp_path) -> None:
    """A UTF-8 clean card renders with the "UTF-8 clean" text present."""
    p = tmp_path / "clean.log"
    p.write_text("alpha beta gamma\n" * 200)

    card = blob_digest.summarize_blob(p)

    assert card.utf8_clean is True
    assert "UTF-8 clean" in _render(card)
|
|
496
|
+
|
|
497
|
+
|
|
498
|
+
# ─── Line-length p95 ────────────────────────────────────────────────────────
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def test_line_length_shape_returns_p95() -> None:
    """``_line_length_shape`` returns a 6-tuple with p95 in third position.

    The order is (mean, median, p95, max, stdev, shape), checked here on
    the lengths 1..100.
    """
    result = blob_digest._line_length_shape(list(range(1, 101)))
    mean, median, p95, max_len, _stdev, shape = result

    assert mean == pytest.approx(50.5)
    assert median == 50.5
    assert max_len == 100
    # quantiles(n=20)[18] over 1..100 lands near 95 or 96; allow a band.
    assert 90 <= p95 <= 100
    assert shape in ("uniform", "varied")
|
|
514
|
+
|
|
515
|
+
|
|
516
|
+
def test_p95_never_exceeds_max_on_tiny_sample() -> None:
    """REGRESSION: on a two-element sample p95 must equal max.

    ``statistics.quantiles(n=20)`` uses the exclusive method and
    extrapolates past the maximum on small samples. ``[1, 100]`` once gave
    p95=184 against max=100, which is nonsense for an order statistic. The
    fix falls back to max when the sample has fewer than 20 lines, plus a
    defensive clamp.
    """
    _, _, p95, max_len, _, _ = blob_digest._line_length_shape([1, 100])

    assert max_len == 100
    assert p95 == 100, f"p95 must not exceed max; got p95={p95}, max={max_len}"
|
|
527
|
+
|
|
528
|
+
|
|
529
|
+
@pytest.mark.parametrize(
    "lengths",
    [
        [10],                       # single line
        [10, 1000],                 # 2-line extreme spread
        [1, 2, 3, 4, 5],            # tiny ascending
        [100, 100, 100, 100, 100],  # tiny uniform
        [1] * 19,                   # just below the n=20 threshold
        list(range(1, 21)),         # exactly at threshold
        list(range(1, 1001)),       # plenty of data
    ],
)
def test_p95_le_max_invariant(lengths) -> None:
    """Invariant: p95 never exceeds max for any non-empty sample."""
    stats = blob_digest._line_length_shape(lengths)
    # Tuple layout: (mean, median, p95, max, stdev, shape).
    p95, max_len = stats[2], stats[3]
    assert p95 <= max_len, (
        f"p95={p95} exceeded max={max_len} for lengths={lengths!r}"
    )
|
|
547
|
+
|
|
548
|
+
|
|
549
|
+
# ─── drain3 quarantine + meaninglessness floor ──────────────────────────────
|
|
550
|
+
|
|
551
|
+
|
|
552
|
+
def test_quarantine_drain3_dormant_vanishes_templates(monkeypatch, tmp_path) -> None:
    """With ``_BLOB_DRAIN3_ENABLED`` False the card has no template fields.

    The rest of the card still renders: the ``shape:`` row is present and
    the ``templates:`` row is absent.
    """
    monkeypatch.setattr(blob_digest, "_BLOB_DRAIN3_ENABLED", False)

    p = tmp_path / "rep.log"
    p.write_text("host svc: event N\n" * 500)
    card = blob_digest.summarize_blob(p)

    for template_field in (
        card.distinct_templates,
        card.top_template_coverage_pct,
        card.top_template_n,
        card.singleton_template_count,
    ):
        assert template_field is None

    # Flat grammar uses lowercase row labels.
    rendered = _render(card)
    assert "templates:" not in rendered
    assert "shape:" in rendered
|
|
567
|
+
|
|
568
|
+
|
|
569
|
+
def test_meaninglessness_floor_vanishes_templates_on_freeform(tmp_path) -> None:
    """Near 1:1 templates-to-lines input suppresses the template slot.

    Staying silent is preferred to a vacuous "~480 distinct over 500 lines".
    The test passes if ``distinct_templates`` is ``None``, or if the ratio
    of templates to sampled lines is below ``_TEMPLATE_RATIO_FLOOR``.
    """
    p = tmp_path / "free.log"
    # Every line is structurally distinct.
    p.write_text(
        "".join(
            f"line{i:04d} verb_{i % 7} adverb_{i % 11} "
            f"noun_{i % 13} {chr(65 + i % 26)}\n"
            for i in range(500)
        )
    )

    card = blob_digest.summarize_blob(p)

    # On deliberately freeform input the floor should hit.
    if card.distinct_templates is not None:
        ratio = card.distinct_templates / max(card.sampled_line_count, 1)
        assert ratio < blob_digest._TEMPLATE_RATIO_FLOOR
|
|
588
|
+
|
|
589
|
+
|
|
590
|
+
# ─── Renderer: no banner, no Lines: / Data found: rows in flat grammar ─────
|
|
591
|
+
#
|
|
592
|
+
# The old Lines: / Records: / Data found: banner rows lived on RunSummary
|
|
593
|
+
# and went away with it. Blob now uses its identity-line provenance and
|
|
594
|
+
# never participates in a banner.
|
|
595
|
+
|
|
596
|
+
|
|
597
|
+
def test_blob_card_has_no_banner_lines_or_data_found_rows() -> None:
    """Neither the binary nor the text fixture renders any banner row."""
    banner_markers = ("Lines:", "Records:", "Data found:", "LogHunter")
    for card in (_binary_blob_card(), _text_blob_card()):
        rendered = _render(card)
        for marker in banner_markers:
            assert marker not in rendered
|
|
605
|
+
|
|
606
|
+
|
|
607
|
+
# ─── Renderer: flat-grammar vanish-don't-dash ───────────────────────────────
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
def test_render_binary_vanishes_text_slots() -> None:
    """A binary card keeps its `bytes:` row and drops every text-only slot.

    The shape / lines / tokens / templates labels must be absent altogether
    (vanish-don't-dash), not rendered with a placeholder.
    """
    rendered = _render(_binary_blob_card())
    for expected in ("bytes:", "PNG image", "binary"):
        assert expected in rendered
    for text_only_label in ("shape:", "lines:", "tokens:", "templates:"):
        assert text_only_label not in rendered
|
|
621
|
+
|
|
622
|
+
|
|
623
|
+
def test_render_text_blob_shows_all_text_slots() -> None:
    """A text blob card renders every slot label plus the `[literal]` tag."""
    rendered = _render(_text_blob_card())
    expected_labels = ("bytes:", "shape:", "lines:", "tokens:", "templates:")
    missing = [label for label in expected_labels if label not in rendered]
    assert not missing
    assert "[literal]" in rendered
|
|
629
|
+
|
|
630
|
+
|
|
631
|
+
def test_render_text_blob_with_no_templates_vanishes_only_that_slot() -> None:
    """Without templates only the `templates:` row disappears; the rest stay."""
    rendered = _render(_text_blob_card(has_templates=False))
    for kept_label in ("shape:", "lines:", "tokens:"):
        assert kept_label in rendered
    assert "templates:" not in rendered
|
|
638
|
+
|
|
639
|
+
|
|
640
|
+
def test_render_no_footer_no_header_rule_no_trailing_sep() -> None:
    """Flat grammar renders no header rule, footer, or separator rule.

    Checks both card flavours for the `── digest` header, the
    `No parser claims` footer, and any `─` rule character at all.
    """
    cards = (_binary_blob_card(), _text_blob_card())
    forbidden = ("── digest", "No parser claims", "─")
    for card in cards:
        rendered = _render(card)
        for fragment in forbidden:
            assert fragment not in rendered
|
|
648
|
+
|
|
649
|
+
|
|
650
|
+
def test_render_headline_labels_guess(tmp_path) -> None:
    """The headline is phrased as a guess and starts at column zero."""
    json_phrase = "looks like JSON"
    text_out = _render(_text_blob_card(shape_guess="JSON"))
    assert json_phrase in text_out

    # Flush-left: the headline line must not begin with a space.
    headline_candidates = (
        line for line in text_out.splitlines() if json_phrase in line
    )
    headline = next(headline_candidates)
    assert not headline.startswith(" ")

    bin_out = _render(_binary_blob_card())
    assert "looks like a PNG image, not a log" in bin_out
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
def test_render_provenance_line_plain_text() -> None:
    """A plain-text card's output mentions the sample, line count and reads."""
    rendered = _render(_text_blob_card())
    for phrase in ("sampled", "lines across", "reads"):
        assert phrase in rendered
|
|
669
|
+
|
|
670
|
+
|
|
671
|
+
def test_render_provenance_line_compressed() -> None:
    """A compressed card's output says 'compressed' and 'from head'."""
    base = _text_blob_card()
    # Rebuild the card from the base card's attributes with one field flipped.
    attributes = dict(vars(base))
    attributes["is_compressed"] = True
    compressed = BlobCard(**attributes)

    rendered = _render(compressed)
    assert "compressed" in rendered
    assert "from head" in rendered
|
|
678
|
+
|
|
679
|
+
|
|
680
|
+
def test_render_provenance_line_terminal_binary() -> None:
    """Binary provenance reads 'binary, sampled from head' with no line counts."""
    # Output line 1 is the source name; line 2 carries the provenance.
    provenance = _render(_binary_blob_card()).splitlines()[1]
    assert "binary, sampled from head" in provenance
    for line_oriented_phrase in ("lines across", "reads"):
        assert line_oriented_phrase not in provenance
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def test_render_identity_line_is_source_name_flush_left() -> None:
    """The first rendered line is exactly the source name, nothing above it."""
    rendered_lines = _render(
        _text_blob_card(), source_name="mystery.txt"
    ).splitlines()
    assert rendered_lines[0] == "mystery.txt"
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
def test_round_2sf_helper_is_gone() -> None:
    """Pin the removal of `_round_2sf` from the text module.

    The helper only served the retired 'Lines: sampled ~N' rendering, so it
    must not be reintroduced.
    """
    helper_present = hasattr(text_module, "_round_2sf")
    assert not helper_present
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
# ─── JSON blob: `fields:` (names) replaces `tokens:` (record dump) ──────────
|
|
704
|
+
#
|
|
705
|
+
# Glob-digest of a Zeek directory routes every non-claimed NDJSON log
|
|
706
|
+
# (http, ssl, ssh, dhcp, ntp, weird, …) to the blob floor. The legacy
|
|
707
|
+
# tokens row dumped raw records (and sometimes mid-value garbage from
|
|
708
|
+
# whitespace splits inside string fields). The replacement lists
|
|
709
|
+
# top-level JSON KEY NAMES only — structural description, no values.
|
|
710
|
+
|
|
711
|
+
def _write_json_lines(path, lines: list[str]) -> None:
|
|
712
|
+
path.write_text("".join(line + "\n" for line in lines), encoding="utf-8")
|
|
713
|
+
|
|
714
|
+
|
|
715
|
+
def test_json_blob_renders_fields_row_not_tokens_row(tmp_path) -> None:
    """A JSON blob renders a `fields:` row and no `tokens:` row.

    The field list is the union of keys across rows in first-seen order, so
    a key present on only some rows (`auth_attempts`) still appears, last.
    """
    expected_names = ["ts", "uid", "id.orig_h", "auth_attempts"]
    short_record = '{"ts": 1779750000.0, "uid": "C001", "id.orig_h": "192.0.2.10"}'
    long_record = (
        '{"ts": 1779750001.0, "uid": "C002", "id.orig_h": "192.0.2.11",'
        ' "auth_attempts": 3}'
    )
    log_path = tmp_path / "ssh.log"
    _write_json_lines(log_path, [short_record, long_record] * 3)

    card = blob_digest.summarize_blob(log_path)
    assert card.shape_guess == "JSON"
    assert card.json_field_names == expected_names

    rendered = _render(card, source_name="ssh.log")
    assert "fields:" in rendered
    assert "tokens:" not in rendered
    fields_line = next(
        line for line in rendered.splitlines() if line.startswith("fields:")
    )
    # Names are emitted comma-separated in first-seen order.
    assert "ts, uid, id.orig_h, auth_attempts" in fields_line
|
|
736
|
+
|
|
737
|
+
|
|
738
|
+
def test_json_arrays_and_scalars_fall_back_to_tokens_row(tmp_path) -> None:
    """Top-level arrays and scalars yield no field names and no `fields:` row.

    Both the `_json_field_names` helper and the summarized card must report
    ``None``, and the rendered card must omit the `fields:` label.
    """
    keyless_rows = ['[1, 2, 3]', '42', '"hello"']
    log_path = tmp_path / "weird.log"
    _write_json_lines(log_path, keyless_rows * 5)
    card = blob_digest.summarize_blob(log_path)

    # Helper level first, then the card built from the file.
    helper_result = blob_digest._json_field_names([
        '[1, 2, 3]', '42', '"hello"',
    ])
    assert helper_result is None
    assert card.json_field_names is None

    # Whatever shape the cascade guesses, a None name list means no row.
    rendered = _render(card, source_name="weird.log")
    assert "fields:" not in rendered
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
def _json_keys_line(n: int) -> str:
|
|
757
|
+
"""One JSON record with N generic placeholder keys."""
|
|
758
|
+
pairs = ", ".join(f'"field_{i:02d}": {i}' for i in range(n))
|
|
759
|
+
return "{" + pairs + "}"
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
def test_fields_row_clamps_to_two_lines_with_correct_remainder(tmp_path) -> None:
    """30 generic keys render as a two-line `fields:` row with an exact remainder.

    The second line must be hang-indented to the value column and end with
    `… +N more`, where N plus the number of names actually rendered is 30.
    """
    import re

    p = tmp_path / "wide.log"
    _write_json_lines(p, [_json_keys_line(30)] * 5)
    card = blob_digest.summarize_blob(p)
    assert card.json_field_names is not None
    assert len(card.json_field_names) == 30

    out = _render(card, source_name="wide.log")
    lines = out.splitlines()
    fields_idx = next(
        i for i, text in enumerate(lines) if text.startswith("fields:")
    )
    line1 = lines[fields_idx]
    line2 = lines[fields_idx + 1]

    # The value column is shared by every slot on the card, so measure it on
    # the single-line `bytes:` row: it is the index of the first non-space
    # character after the colon. Slicing by `len("fields: ")` instead would
    # land inside the alignment padding and corrupt the first name.
    bytes_line = next(text for text in lines if text.startswith("bytes:"))
    _, _, after_label = bytes_line.partition(":")
    label_col = len(bytes_line) - len(after_label.lstrip(" "))

    # Line 2 hang-indents to the value column.
    assert line2.startswith(" " * label_col)
    # Line 2 ends with the truncation suffix and an accurate N.
    m = re.search(r"… \+(\d+) more$", line2)
    assert m is not None, f"expected `… +N more` suffix; got {line2!r}"
    n_more = int(m.group(1))

    # Count the field names that actually rendered across both lines.
    rendered_text = (
        line1[label_col:]
        + ", "
        + line2[label_col:].rsplit("… +", 1)[0].rstrip(", ")
    )
    rendered_names = [
        part for part in rendered_text.split(", ") if part.startswith("field_")
    ]
    assert len(rendered_names) + n_more == 30
    # Each rendered name appears whole — no mid-name break.
    for name in rendered_names:
        assert name.startswith("field_") and len(name) == len("field_NN")
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
def test_fields_row_renders_single_line_no_suffix_when_narrow(tmp_path) -> None:
    """Four keys produce exactly one `fields:` line with no truncation suffix."""
    log_path = tmp_path / "narrow.log"
    _write_json_lines(log_path, [_json_keys_line(4)] * 5)
    card = blob_digest.summarize_blob(log_path)
    rendered_lines = _render(card, source_name="narrow.log").splitlines()

    fields_rows = [
        line for line in rendered_lines if line.startswith("fields:")
    ]
    assert len(fields_rows) == 1
    only_row = fields_rows[0]
    assert "more" not in only_row
    assert "…" not in only_row
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def test_fields_row_wrap_never_splits_a_name(tmp_path) -> None:
    """Wrapping the `fields:` row never breaks a field name across lines.

    Two checks run over the rendered row (the `fields:` line plus its
    hang-indented continuation, if any):

    1. No written name is directly extended by a word character, unless the
       extended form is itself a written name.
    2. Every comma/whitespace-separated token left after dropping the label
       and the `… +N more` suffix is one of the written names. This is the
       check that catches a fragment such as `long_field_name_0`; the first
       check alone cannot, because it only ever matches complete names.
    """
    import re

    p = tmp_path / "mixed.log"
    # Mix short and longer names; enough to wrap.
    names = (
        ["ts", "uid", "id.orig_h", "id.resp_h"]
        + [f"long_field_name_{i:02d}" for i in range(20)]
    )
    record = "{" + ", ".join(f'"{n}": {i}' for i, n in enumerate(names)) + "}"
    _write_json_lines(p, [record] * 3)
    card = blob_digest.summarize_blob(p)
    out = _render(card, source_name="mixed.log")
    lines = out.splitlines()
    fields_idx = next(
        i for i, text in enumerate(lines) if text.startswith("fields:")
    )

    # Only an indented follower belongs to the fields row; a flush-left line
    # is the next slot and must not be tokenized as field names.
    row_lines = [lines[fields_idx]]
    if fields_idx + 1 < len(lines) and lines[fields_idx + 1].startswith(" "):
        row_lines.append(lines[fields_idx + 1])
    rendered = " ".join(row_lines)

    # Check 1: a name glued to a trailing word character is only acceptable
    # when the longer string is itself a written name.
    for name in names:
        for m in re.finditer(re.escape(name) + r"(\w)", rendered):
            extended = name + m.group(1)
            assert extended in names, (
                f"field name appears to be split: {name!r} extended by "
                f"{m.group(1)!r} in rendered output"
            )

    # Check 2: every rendered token must be a complete written name.
    value_text = rendered.split(":", 1)[1]
    value_text = re.sub(r"…\s*\+\d+\s+more\s*$", "", value_text)
    tokens = [tok for tok in re.split(r"[,\s]+", value_text) if tok]
    assert tokens, f"fields row rendered no names: {rendered!r}"
    known = set(names)
    for tok in tokens:
        assert tok in known, (
            f"rendered token {tok!r} is not a complete field name "
            f"(possible mid-name break) in {rendered!r}"
        )
|
|
866
|
+
|
|
867
|
+
|
|
868
|
+
# ─── Cross-helper regression: schema-card values stay UN-clamped ────────────
|
|
869
|
+
|
|
870
|
+
def test_schema_card_long_value_renders_full_no_truncation() -> None:
    """Schema-card values are rendered whole, never clamped like blob slots.

    Builds a conn DigestCard whose densest-tuple flow string is longer than
    80 characters, renders it through TextHandler, and checks the flow
    appears in full with no `… +N more` truncation suffix.
    """
    from datetime import datetime, timezone
    from loghunter.common.finding import DigestCard, DigestSlot

    long_flow = (
        "192.0.2.10:51234 → 198.51.100.20:443 (very-long-tag-padding-out-"
        "to-exceed-the-eighty-col-line-frame-on-purpose)"
    )
    assert len(long_flow) > 80

    cliff = DigestSlot(
        label="densest-tuple",
        statistic="cliff",
        cells=[long_flow, "482", "3.7x"],
        entity=long_flow,
        magnitude=482.0,
        ratio=3.7,
    )
    now = datetime(2026, 6, 14, tzinfo=timezone.utc)
    card_fields = {
        "schema": "conn",
        "source_name": "conn.log",
        "data_window": (now, now),
        "record_count": 1000,
        "histogram_counts": [1, 2, 3],
        "histogram_unit": "hr",
        "histogram_peak": 3,
        "zone1_extras": [("hosts", "5")],
        "insights": [],
        "fields": [cliff],
        "data_size_bytes": 0,
    }
    card = DigestCard(**card_fields)

    stream = io.StringIO()
    handler = TextHandler(stream=stream)
    handler.render_digest(card)
    out = stream.getvalue()

    # The whole flow string survives and nothing after it reads as a suffix.
    assert long_flow in out
    assert "… +" not in out
    text_after_flow = out.split(long_flow)[1]
    assert "more" not in text_after_flow
|
|
911
|
+
|
|
912
|
+
|
|
913
|
+
# ─── Runner fold + CLI invariants ───────────────────────────────────────────
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
def test_run_digest_blob_no_longer_exists() -> None:
    """The runner module no longer exposes a standalone `_run_digest_blob`."""
    standalone_present = hasattr(runner, "_run_digest_blob")
    assert not standalone_present
|
|
920
|
+
|
|
921
|
+
|
|
922
|
+
def test_cli_digest_blob_token_is_rejected(tmp_path, monkeypatch) -> None:
    """`digest blob` no longer selects a schema; `blob` is just a path.

    The test runs from an empty temporary directory so that the relative
    path `blob` is guaranteed not to exist, whatever happens to be in the
    directory pytest was launched from. A missing path is a per-path
    failure, so the exit code is 1.
    """
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    # Isolate the CWD: a stray file named "blob" must not change the outcome.
    monkeypatch.chdir(tmp_path)
    rc = cli._main(["digest", "blob"])
    assert rc == 1
|
|
928
|
+
|
|
929
|
+
|
|
930
|
+
def test_cli_sniff_floor_routes_to_blob_path(tmp_path, monkeypatch) -> None:
    """An unrecognized text file reaches run_digest as schema=blob.

    `run_digest` is replaced by a recorder; the recorded kwargs must carry
    the file as `blob_path` and leave every source-directory kwarg unset.
    """
    recorded: dict[str, Any] = {}

    def _record_call(**kwargs: Any) -> None:
        recorded.update(kwargs)

    monkeypatch.setattr(runner, "run_digest", _record_call)
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    mystery = tmp_path / "mystery.txt"
    mystery.write_text("hello world\nlorem ipsum\n")
    cli._main(["digest", str(mystery)])

    assert recorded.get("schema") == "blob"
    assert recorded.get("blob_path") == mystery
    for dir_kwarg in ("zeek_dir", "pihole_dir", "syslog_dir", "cloudtrail_dir"):
        assert recorded.get(dir_kwarg) is None
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
def test_cli_blob_path_not_advertised(monkeypatch) -> None:
    """Passing `--blob-path` on the command line raises an unknown-flag error."""

    def _empty_config(_path):
        return {"loghunter": {}}

    monkeypatch.setattr(cli.cfg, "load", _empty_config)
    argv = ["digest", "--blob-path=/tmp/x"]
    with pytest.raises(ValueError, match=r"unknown flag --blob-path"):
        cli._main(argv)
|
|
958
|
+
|
|
959
|
+
|
|
960
|
+
def test_run_digest_blob_requires_blob_path() -> None:
    """schema=blob without a `blob_path` raises ValueError('…PATH not provided…')."""
    empty_config = {"loghunter": {}}
    with pytest.raises(ValueError, match=r"PATH not provided"):
        runner.run_digest(config=empty_config, schema="blob")
|
|
963
|
+
|
|
964
|
+
|
|
965
|
+
def test_run_digest_blob_rejects_directory(tmp_path) -> None:
    """A directory passed as `blob_path` raises ValueError('…not a file…')."""
    call_kwargs = {
        "config": {"loghunter": {}},
        "schema": "blob",
        "blob_path": tmp_path,
    }
    with pytest.raises(ValueError, match=r"not a file"):
        runner.run_digest(**call_kwargs)
|
|
972
|
+
|
|
973
|
+
|
|
974
|
+
def test_run_digest_blob_path_only_valid_for_blob_schema(tmp_path) -> None:
    """Supplying `blob_path` together with schema="conn" raises ValueError."""
    expected_message = r"blob_path is only valid for the blob schema"
    stray_path = tmp_path / "input.txt"
    with pytest.raises(ValueError, match=expected_message):
        runner.run_digest(
            config={"loghunter": {}},
            schema="conn",
            blob_path=stray_path,
        )
|
|
984
|
+
|
|
985
|
+
|
|
986
|
+
# ─── Recognized-but-empty seam ──────────────────────────────────────────────
|
|
987
|
+
|
|
988
|
+
|
|
989
|
+
def test_digest_empty_is_not_value_error_subclass() -> None:
    """DigestEmpty must stay outside the ValueError hierarchy.

    It is a control signal; an `except ValueError` arm meant for real
    per-path failures must not swallow it.
    """
    caught_by_value_error_arm = issubclass(DigestEmpty, ValueError)
    assert not caught_by_value_error_arm
|
|
994
|
+
|
|
995
|
+
|
|
996
|
+
def test_digest_empty_carries_basename_and_schema() -> None:
    """DigestEmpty exposes `basename` and `schema` and names both in its message.

    The basename is chosen so that it does not contain the schema name:
    with `conn.log` / `conn`, the schema substring check would pass on the
    basename alone and prove nothing about the schema being in the message.
    """
    exc = DigestEmpty(basename="capture.log", schema="conn")
    assert exc.basename == "capture.log"
    assert exc.schema == "conn"
    message = str(exc)
    assert "capture.log" in message
    assert "conn" in message
|
|
1002
|
+
|
|
1003
|
+
|
|
1004
|
+
def test_run_digest_raises_digest_empty_on_empty_frame(
    tmp_path, monkeypatch,
) -> None:
    """An empty frame from the loader makes run_digest raise DigestEmpty.

    The loader is stubbed to return a LoadResult whose only frame is an
    empty DataFrame; the raised exception must name the directory basename
    and the requested schema.
    """
    from loghunter.common import loader

    stub_result = loader.LoadResult(
        logs={"conn*.log*": pd.DataFrame()},
        record_counts={},
        data_window=None,
        warnings=[],
        data_size_bytes=0,
    )

    def _stub_loader(*args, **kwargs):
        return stub_result

    monkeypatch.setattr(loader, "load_required_logs", _stub_loader)

    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    with pytest.raises(DigestEmpty) as exc_info:
        runner.run_digest(
            config={"loghunter": {}}, schema="conn", zeek_dir=zeek_dir,
        )
    raised = exc_info.value
    assert raised.basename == "zeek"
    assert raised.schema == "conn"
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
def test_run_digest_raises_digest_empty_when_frame_missing(
    tmp_path, monkeypatch,
) -> None:
    """A LoadResult with no frames at all also makes run_digest raise DigestEmpty."""
    from loghunter.common import loader

    # `logs` is empty, so the pattern key is missing entirely.
    stub_result = loader.LoadResult(
        logs={},
        record_counts={},
        data_window=None,
        warnings=[],
        data_size_bytes=0,
    )

    def _stub_loader(*args, **kwargs):
        return stub_result

    monkeypatch.setattr(loader, "load_required_logs", _stub_loader)

    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    with pytest.raises(DigestEmpty):
        runner.run_digest(
            config={"loghunter": {}}, schema="conn", zeek_dir=zeek_dir,
        )
|
|
1052
|
+
|
|
1053
|
+
|
|
1054
|
+
def test_cli_fanout_recognized_empty_narrates_and_exits_zero(
    tmp_path, monkeypatch, capsys,
) -> None:
    """Fan-out: a DigestEmpty from run_digest is narrated on stderr, exit 0."""
    import loghunter.common.loader as loader_mod

    def _raise_empty(**kwargs):
        raise DigestEmpty(basename="conn.log", schema="conn")

    def _sniff_as_conn(_path):
        # Force the sniffer to classify the file as conn.
        return loader_mod.SniffResult(
            state="classified", schema="conn", origin="zeek",
        )

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    monkeypatch.setattr(runner, "run_digest", _raise_empty)

    target = tmp_path / "conn.log"
    target.write_text("#fields\tts\tsrc\n")
    monkeypatch.setattr(loader_mod, "sniff_format_detailed", _sniff_as_conn)

    exit_code = cli._main(["digest", str(target)])
    stderr_text = capsys.readouterr().err
    assert "recognized as conn but no parseable records" in stderr_text
    assert exit_code == 0
|
|
1082
|
+
|
|
1083
|
+
|
|
1084
|
+
def test_cli_bare_config_recognized_empty_narrates_and_exits_zero(
    tmp_path, monkeypatch, capsys,
) -> None:
    """Bare `digest` (no positional): DigestEmpty is narrated on stderr, exit 0."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    def _config_with_zeek_dir(_path):
        return {"loghunter": {"zeek_dir": str(zeek_dir)}}

    def _raise_empty(**kwargs):
        raise DigestEmpty(basename=zeek_dir.name, schema="conn")

    monkeypatch.setattr(cli.cfg, "load", _config_with_zeek_dir)
    monkeypatch.setattr(runner, "run_digest", _raise_empty)

    exit_code = cli._main(["digest"])
    stderr_text = capsys.readouterr().err
    assert "recognized as conn but no parseable records" in stderr_text
    assert exit_code == 0
|
|
1105
|
+
|
|
1106
|
+
|
|
1107
|
+
def test_cli_fanout_corrupt_gzip_exits_clean(tmp_path, monkeypatch, capsys) -> None:
    """REGRESSION: a corrupt `.gz` positional is a per-path failure, not a crash.

    The file carries a `.gz` suffix but plain-text bytes. `cli._main` must
    return 1, mention the file name on stderr, and print no traceback.
    """
    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})

    corrupt = tmp_path / "broken.gz"
    corrupt.write_bytes(b"not a real gzip stream, just text")

    exit_code = cli._main(["digest", str(corrupt)])
    stderr_text = capsys.readouterr().err
    assert exit_code == 1
    # The per-path message names the offending file.
    assert "broken.gz" in stderr_text
    # No Python traceback markers.
    assert "Traceback" not in stderr_text
|
|
1124
|
+
|
|
1125
|
+
|
|
1126
|
+
def test_cli_bare_config_corrupt_gzip_handled_gracefully(
    tmp_path, monkeypatch, capsys,
) -> None:
    """REGRESSION: a corrupt `.gz` in the configured zeek_dir is narrated, exit 0.

    Stderr must carry a skip warning for the unreadable file and the
    recognized-but-empty narration, and must not contain a traceback.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    corrupt = zeek_dir / "conn.log.gz"
    corrupt.write_bytes(b"not gzip, just text")

    def _config_with_zeek_dir(_path):
        return {
            "loghunter": {
                "zeek_dir": str(zeek_dir),
                "default_window": "all",
            }
        }

    monkeypatch.setattr(cli.cfg, "load", _config_with_zeek_dir)

    exit_code = cli._main(["digest"])
    stderr_text = capsys.readouterr().err
    assert exit_code == 0
    assert "Traceback" not in stderr_text
    # Either wording of the skip warning is accepted.
    skip_warned = (
        "could not be read" in stderr_text
        or "incomplete or corrupt" in stderr_text
    )
    assert skip_warned
    assert "recognized as conn but no parseable records" in stderr_text
|
|
1155
|
+
|
|
1156
|
+
|
|
1157
|
+
def test_main_oserror_arm_translates_to_exit_one(monkeypatch, capsys) -> None:
    """An OSError escaping `_main` becomes a clean `loghunter:` exit 1 in `main`.

    `_main` is replaced by a stub that always raises, so the arm is pinned
    independently of any digest code path.
    """

    def _always_fail(_argv):
        raise OSError("synthetic disk failure for test")

    monkeypatch.setattr(cli, "_main", _always_fail)
    with pytest.raises(SystemExit) as exit_info:
        cli.main(["digest", "/nonexistent"])
    assert exit_info.value.code == 1

    stderr_text = capsys.readouterr().err
    assert "Traceback" not in stderr_text
    for expected in ("loghunter:", "synthetic disk failure"):
        assert expected in stderr_text
|
|
1173
|
+
|
|
1174
|
+
|
|
1175
|
+
def test_cli_fanout_value_error_still_exits_one(tmp_path, monkeypatch, capsys) -> None:
    """A ValueError from run_digest is reported on stderr and yields exit 1."""
    import loghunter.common.loader as loader_mod

    def _raise_value_error(**kwargs):
        raise ValueError("simulated parser failure")

    def _sniff_as_blob(_path):
        return loader_mod.SniffResult(
            state="classified", schema="blob", origin=None,
        )

    monkeypatch.setattr(cli.cfg, "load", lambda _path: {"loghunter": {}})
    monkeypatch.setattr(runner, "run_digest", _raise_value_error)

    target = tmp_path / "data.log"
    target.write_text("hello world\n")
    monkeypatch.setattr(loader_mod, "sniff_format_detailed", _sniff_as_blob)

    exit_code = cli._main(["digest", str(target)])
    stderr_text = capsys.readouterr().err
    assert "simulated parser failure" in stderr_text
    assert exit_code == 1
|
|
1200
|
+
|
|
1201
|
+
|
|
1202
|
+
# ─── End-to-end: run_digest blob terminal branch through the renderer ──────
|
|
1203
|
+
|
|
1204
|
+
|
|
1205
|
+
def test_run_digest_blob_end_to_end_renders_card(tmp_path, capsys) -> None:
    """run_digest with schema=blob prints a flat blob card to stdout.

    The first output line is the file's basename, the card carries the
    `Unrecognized source` headline and a `bytes:` row, and none of the
    banner / header-rule / footer fragments appear.
    """
    source = tmp_path / "free.log"
    source.write_text("alpha beta gamma\ndelta epsilon\n" * 200)

    runner.run_digest(
        config={"loghunter": {}}, schema="blob", blob_path=source,
    )
    stdout_text = capsys.readouterr().out

    assert stdout_text.splitlines()[0] == "free.log"
    for expected in ("Unrecognized source", "bytes:"):
        assert expected in stdout_text
    # Flat grammar: no banner, no header rule, no footer.
    for forbidden in ("LogHunter", "── digest", "No parser claims"):
        assert forbidden not in stdout_text
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def test_run_digest_blob_dry_run_skips_render(tmp_path, capsys) -> None:
|
|
1227
|
+
"""Dry-run prints a plan note and returns without sampling or rendering."""
|
|
1228
|
+
f = tmp_path / "free.log"
|
|
1229
|
+
f.write_text("hello\n")
|
|
1230
|
+
runner.run_digest(
|
|
1231
|
+
config={"loghunter": {}}, schema="blob", blob_path=f, dry_run=True,
|
|
1232
|
+
)
|
|
1233
|
+
out = capsys.readouterr().out
|
|
1234
|
+
assert "digest dry run" in out
|
|
1235
|
+
assert "schema:" in out and "blob" in out
|
|
1236
|
+
assert "── digest · blob" not in out
|
|
1237
|
+
assert "No parser claims" not in out
|