loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,373 @@
|
|
|
1
|
+
"""Integration tests for sniff_format — the loader-layer orchestrator.
|
|
2
|
+
|
|
3
|
+
Verifies file I/O integration (`_open_log`, gzip transparency), precedence,
|
|
4
|
+
the blob floor, and the bounded-read perf guarantee. All sample data is
|
|
5
|
+
synthetic per the privacy rail — RFC 5737 IPs and placeholder hostnames.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import gzip
|
|
11
|
+
import json
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
from loghunter.common import loader
|
|
17
|
+
from loghunter.common.loader import _SNIFF_MAX_PEEK, sniff_format, sniff_format_detailed
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# ── File fixture helpers ──────────────────────────────────────────────────────
|
|
21
|
+
|
|
22
|
+
def _write(path: Path, lines: list[str]) -> None:
    """Write *lines* back-to-back into *path* as UTF-8 text.

    No separator is inserted between entries; each entry supplies its own
    line terminator.
    """
    payload = "".join(lines)
    path.write_text(payload, encoding="utf-8")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _write_gz(path: Path, lines: list[str]) -> None:
    """Write *lines* into *path* as a gzip-compressed UTF-8 byte stream.

    Entries are concatenated without a separator before compression.
    """
    raw = "".join(lines).encode("utf-8")
    compressed = gzip.compress(raw)
    path.write_bytes(compressed)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _tsv(*cells: str) -> str:
    """Join *cells* with tabs and terminate the row with a newline."""
    return "\t".join(cells) + "\n"


# Zeek TSV header for a conn log. The "#separator" row is the one line that
# is space-delimited and spells the tab as a literal backslash escape.
ZEEK_TSV_CONN_LINES = [
    "#separator \\x09\n",
    _tsv("#set_separator", ","),
    _tsv("#empty_field", "(empty)"),
    _tsv("#unset_field", "-"),
    _tsv("#path", "conn"),
    _tsv("#open", "2026-06-01-12-00-00"),
    _tsv(
        "#fields",
        "ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p",
        "proto", "duration",
    ),
    _tsv(
        "#types",
        "time", "string", "addr", "port", "addr", "port",
        "enum", "interval",
    ),
]

# Shorter Zeek TSV header declaring a dns log.
ZEEK_TSV_DNS_LINES = [
    "#separator \\x09\n",
    _tsv("#path", "dns"),
    _tsv("#fields", "ts", "uid", "id.orig_h", "query"),
    _tsv("#types", "time", "string", "addr", "string"),
]

# One Zeek NDJSON record per schema; json.dumps' default separators give the
# same ", " / ": " spacing as a hand-written single-line object.
ZEEK_NDJSON_CONN_LINE = json.dumps({
    "ts": 1779750000.0,
    "id.orig_h": "192.0.2.10",
    "id.resp_h": "198.51.100.20",
    "id.resp_p": 443,
    "proto": "tcp",
    "duration": 1.23,
}) + "\n"

ZEEK_NDJSON_DNS_LINE = json.dumps({
    "ts": 1779750000.0,
    "id.orig_h": "192.0.2.10",
    "query": "example.test",
}) + "\n"

# A single synthetic CloudTrail event, shared by the per-event NDJSON line
# and the pretty-printed {"Records": [...]} envelope.
_CLOUDTRAIL_EVENT = {
    "eventVersion": "1.08",
    "eventTime": "2026-06-01T12:00:00Z",
    "userIdentity": {"type": "IAMUser"},
    "eventName": "GetObject",
    "eventSource": "s3.amazonaws.com",
    "sourceIPAddress": "192.0.2.10",
}

CLOUDTRAIL_NDJSON_LINE = json.dumps(_CLOUDTRAIL_EVENT) + "\n"

CLOUDTRAIL_ENVELOPE_PAYLOAD = (
    json.dumps({"Records": [_CLOUDTRAIL_EVENT]}, indent=2) + "\n"
)

# dnsmasq query/forward pair as logged through syslog.
DNSMASQ_LINES = [
    "Jun 1 12:00:00 piholehost dnsmasq[123]: query[A] example.test from 192.0.2.10\n",
    "Jun 1 12:00:01 piholehost dnsmasq[123]: forwarded example.test to 198.51.100.53\n",
]

# Generic syslog lines: one with a <PRI> prefix, one without.
SYSLOG_LINES = [
    "<13>Jun 1 12:00:00 examplehost sshd[1234]: Accepted publickey for placeholder\n",
    "Jun 1 12:00:01 examplehost cron[5678]: (root) CMD (placeholder)\n",
]
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# ── Per-format classification ─────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
def test_sniff_format_zeek_tsv_conn(tmp_path: Path) -> None:
    """A Zeek TSV header with ``#path conn`` sniffs as ``"conn"``."""
    conn_log = tmp_path / "conn.log"
    _write(conn_log, ZEEK_TSV_CONN_LINES)

    detected = sniff_format(conn_log)

    assert detected == "conn"
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def test_sniff_format_zeek_tsv_dns(tmp_path: Path) -> None:
    """A Zeek TSV header with ``#path dns`` sniffs as ``"dns"``."""
    dns_log = tmp_path / "dns.log"
    _write(dns_log, ZEEK_TSV_DNS_LINES)

    detected = sniff_format(dns_log)

    assert detected == "dns"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def test_sniff_format_zeek_ndjson_conn(tmp_path: Path) -> None:
    """A single Zeek NDJSON conn record sniffs as ``"conn"``."""
    conn_log = tmp_path / "conn.log"
    _write(conn_log, [ZEEK_NDJSON_CONN_LINE])

    detected = sniff_format(conn_log)

    assert detected == "conn"
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def test_sniff_format_zeek_ndjson_dns(tmp_path: Path) -> None:
    """A single Zeek NDJSON dns record sniffs as ``"dns"``."""
    dns_log = tmp_path / "dns.log"
    _write(dns_log, [ZEEK_NDJSON_DNS_LINE])

    detected = sniff_format(dns_log)

    assert detected == "dns"
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_sniff_format_cloudtrail_ndjson(tmp_path: Path) -> None:
    """A per-event CloudTrail NDJSON line sniffs as ``"cloudtrail"``."""
    events_file = tmp_path / "cloudtrail.json.log"
    _write(events_file, [CLOUDTRAIL_NDJSON_LINE])

    detected = sniff_format(events_file)

    assert detected == "cloudtrail"
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def test_sniff_format_cloudtrail_envelope(tmp_path: Path) -> None:
    """A pretty-printed ``{"Records": [...]}`` envelope sniffs as ``"cloudtrail"``."""
    envelope_file = tmp_path / "cloudtrail.json"
    # The payload is one multi-line string; the helper writes it verbatim.
    _write(envelope_file, [CLOUDTRAIL_ENVELOPE_PAYLOAD])

    detected = sniff_format(envelope_file)

    assert detected == "cloudtrail"
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def test_sniff_format_cloudtrail_envelope_gz(tmp_path: Path) -> None:
    """A gzip-compressed CloudTrail envelope still sniffs as ``"cloudtrail"``.

    Exercises the _open_log gzip path in the orchestrator end-to-end.
    """
    gz_file = tmp_path / "cloudtrail.json.gz"
    _write_gz(gz_file, [CLOUDTRAIL_ENVELOPE_PAYLOAD])

    detected = sniff_format(gz_file)

    assert detected == "cloudtrail"
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def test_sniff_format_dnsmasq(tmp_path: Path) -> None:
    """dnsmasq query/forward lines sniff as ``"dns"``."""
    pihole_log = tmp_path / "pihole.log"
    _write(pihole_log, DNSMASQ_LINES)

    detected = sniff_format(pihole_log)

    assert detected == "dns"
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_sniff_format_syslog(tmp_path: Path) -> None:
    """Generic syslog lines sniff as ``"syslog"``."""
    syslog_file = tmp_path / "syslog"
    _write(syslog_file, SYSLOG_LINES)

    detected = sniff_format(syslog_file)

    assert detected == "syslog"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def test_sniff_format_zeek_tsv_gz(tmp_path: Path) -> None:
    """A gzip-compressed Zeek TSV conn header still sniffs as ``"conn"``."""
    gz_log = tmp_path / "conn.log.gz"
    _write_gz(gz_log, ZEEK_TSV_CONN_LINES)

    detected = sniff_format(gz_log)

    assert detected == "conn"
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
# ── Ambiguity / precedence ────────────────────────────────────────────────────
|
|
154
|
+
|
|
155
|
+
def test_zeek_ndjson_not_claimed_as_cloudtrail(tmp_path: Path) -> None:
    """Zeek NDJSON must fall through the cloudtrail recognizer to zeek.

    The conn record is valid JSON but carries none of the CloudTrail event
    keys, so cloudtrail must not claim it; the zeek recognizer downstream
    wins.
    """
    conn_log = tmp_path / "conn.log"
    _write(conn_log, [ZEEK_NDJSON_CONN_LINE])

    detected = sniff_format(conn_log)

    assert detected == "conn"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def test_cloudtrail_event_not_claimed_as_zeek(tmp_path: Path) -> None:
    """CloudTrail NDJSON is claimed by cloudtrail, not by zeek.

    The per-event line is valid JSON but lacks Zeek's key sets, and the
    cloudtrail recognizer runs before zeek in the precedence order.
    """
    events_file = tmp_path / "events.json.log"
    _write(events_file, [CLOUDTRAIL_NDJSON_LINE])

    detected = sniff_format(events_file)

    assert detected == "cloudtrail"
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def test_zeek_ndjson_notice_no_path_routes_to_blob(tmp_path: Path) -> None:
    """A pathless notice.log-shaped NDJSON record lands on the blob floor.

    The record carries the conn 5-tuple via ``id.*`` AND its own native
    ``src``/``dst`` (the original incident shape). The Layer-2 conn fallback
    rejects the rename-collision; sniff returns None and the orchestrator
    drops to the blob floor.
    """
    # json.dumps' default separators produce a single-line object with
    # ", " between members and ": " after keys.
    notice_record = json.dumps({
        "ts": 1779750000.0,
        "uid": "Cxxxxxx",
        "id.orig_h": "192.0.2.10",
        "id.orig_p": 41514,
        "id.resp_h": "198.51.100.20",
        "id.resp_p": 443,
        "proto": "tcp",
        "src": "192.0.2.10",
        "dst": "198.51.100.20",
        "note": "Placeholder::Note",
        "msg": "placeholder message",
    }) + "\n"
    notice_log = tmp_path / "notice.log"
    _write(notice_log, [notice_record])

    detected = sniff_format(notice_log)

    assert detected == "blob"
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def test_dnsmasq_wins_over_syslog(tmp_path: Path) -> None:
    """dnsmasq lines resolve to ``"dns"`` even though they are also syslog.

    Dnsmasq IS RFC 3164, so both recognizers would match at the recognizer
    level; the orchestrator runs dnsmasq first.
    """
    pihole_log = tmp_path / "pihole.log"
    _write(pihole_log, DNSMASQ_LINES)

    detected = sniff_format(pihole_log)

    assert detected == "dns"
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
# ── Blob floor ────────────────────────────────────────────────────────────────
|
|
197
|
+
|
|
198
|
+
def test_sniff_format_unrecognized_text_returns_blob(tmp_path: Path) -> None:
    """Free-form text that matches no recognizer sniffs as ``"blob"``."""
    mystery_file = tmp_path / "mystery.txt"
    free_text = ["hello world\n", "this is not a log\n", "lorem ipsum\n"]
    _write(mystery_file, free_text)

    detected = sniff_format(mystery_file)

    assert detected == "blob"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def test_sniff_format_empty_file_returns_blob(tmp_path: Path) -> None:
    """A zero-byte file sniffs as ``"blob"`` through the simple API."""
    empty_log = tmp_path / "empty.log"
    _write(empty_log, [])  # no lines -> empty file on disk

    detected = sniff_format(empty_log)

    assert detected == "blob"
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def test_sniff_format_blanks_only_returns_blob(tmp_path: Path) -> None:
    """A file holding only blank/whitespace lines sniffs as ``"blob"``."""
    blanks_log = tmp_path / "blanks.log"
    blank_lines = ["\n", "\n", " \n"]
    _write(blanks_log, blank_lines)

    detected = sniff_format(blanks_log)

    assert detected == "blob"
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# ── Bounded-read perf guarantee ───────────────────────────────────────────────
|
|
217
|
+
|
|
218
|
+
class _CountingHandle:
    """Stand-in file handle that records how many lines were consumed.

    Acts as both a context manager and an iterator, so it can be handed back
    wherever a ``with``-opened, line-iterable handle is expected.
    ``read_count`` holds the number of lines successfully pulled so far.
    """

    def __init__(self, lines):
        # iter() is applied eagerly so a non-iterable argument fails here.
        self._source = iter(lines)
        self.read_count = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Returning False lets any exception from the with-body propagate.
        return False

    def __iter__(self):
        return self

    def __next__(self):
        # StopIteration from the underlying iterator propagates before the
        # counter is touched, so exhaustion is never counted as a read.
        pulled = next(self._source)
        self.read_count = self.read_count + 1
        return pulled
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
def test_sniff_format_caps_reads_at_max_peek(monkeypatch, tmp_path: Path) -> None:
    """Orchestrator pulls at most _SNIFF_MAX_PEEK lines, even for huge inputs."""
    total_lines = _SNIFF_MAX_PEEK + 100_000
    # Lazy generator: lines beyond the peek budget are never materialized.
    handle = _CountingHandle(
        f"random text line {n}\n" for n in range(total_lines)
    )
    # Swap the loader's opener for one that always yields the counting handle.
    monkeypatch.setattr(loader, "_open_log", lambda path: handle)

    detected = sniff_format(tmp_path / "fake")

    assert detected == "blob"
    assert handle.read_count == _SNIFF_MAX_PEEK
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
def test_sniff_format_pulls_only_as_many_lines_as_file_has(
    monkeypatch, tmp_path: Path
) -> None:
    """When the file is smaller than the budget, only the file's lines are pulled."""
    few_lines = ["hello\n", "world\n", "shorter than budget\n"]
    line_total = len(few_lines)
    # Guard the premise: the fixture must sit below the peek budget.
    assert line_total < _SNIFF_MAX_PEEK
    handle = _CountingHandle(few_lines)
    monkeypatch.setattr(loader, "_open_log", lambda path: handle)

    detected = sniff_format(tmp_path / "fake")

    assert detected == "blob"
    assert handle.read_count == line_total
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
# ── sniff_format_detailed: schema + origin + empty-state ─────────────────────
|
|
275
|
+
|
|
276
|
+
def test_detailed_zeek_ndjson_conn_origin_zeek(tmp_path: Path) -> None:
    """Zeek NDJSON conn: classified, schema ``conn``, origin ``zeek``."""
    conn_log = tmp_path / "conn.log"
    _write(conn_log, [ZEEK_NDJSON_CONN_LINE])

    outcome = sniff_format_detailed(conn_log)

    observed = (outcome.state, outcome.schema, outcome.origin)
    assert observed == ("classified", "conn", "zeek")
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def test_detailed_zeek_ndjson_dns_origin_zeek(tmp_path: Path) -> None:
    """Zeek NDJSON dns: classified, schema ``dns``, origin ``zeek``."""
    dns_log = tmp_path / "dns.log"
    _write(dns_log, [ZEEK_NDJSON_DNS_LINE])

    outcome = sniff_format_detailed(dns_log)

    observed = (outcome.state, outcome.schema, outcome.origin)
    assert observed == ("classified", "dns", "zeek")
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def test_detailed_dnsmasq_origin_pihole(tmp_path: Path) -> None:
    """dnsmasq lines: classified, schema ``dns``, origin ``pihole``."""
    pihole_log = tmp_path / "pihole.log"
    _write(pihole_log, DNSMASQ_LINES)

    outcome = sniff_format_detailed(pihole_log)

    observed = (outcome.state, outcome.schema, outcome.origin)
    assert observed == ("classified", "dns", "pihole")
|
|
301
|
+
|
|
302
|
+
|
|
303
|
+
def test_detailed_cloudtrail_origin_cloudtrail(tmp_path: Path) -> None:
    """CloudTrail NDJSON: classified, schema and origin both ``cloudtrail``."""
    events_file = tmp_path / "cloudtrail.json.log"
    _write(events_file, [CLOUDTRAIL_NDJSON_LINE])

    outcome = sniff_format_detailed(events_file)

    observed = (outcome.state, outcome.schema, outcome.origin)
    assert observed == ("classified", "cloudtrail", "cloudtrail")
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def test_detailed_syslog_origin_syslog(tmp_path: Path) -> None:
    """Generic syslog: classified, schema and origin both ``syslog``."""
    syslog_file = tmp_path / "syslog"
    _write(syslog_file, SYSLOG_LINES)

    outcome = sniff_format_detailed(syslog_file)

    observed = (outcome.state, outcome.schema, outcome.origin)
    assert observed == ("classified", "syslog", "syslog")
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def test_detailed_zero_byte_file_is_empty(tmp_path: Path) -> None:
    """A zero-byte file reports state ``empty`` with no schema or origin."""
    empty_log = tmp_path / "empty.log"
    _write(empty_log, [])  # no lines -> empty file on disk

    outcome = sniff_format_detailed(empty_log)

    assert outcome.state == "empty"
    assert outcome.schema is None
    assert outcome.origin is None
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def test_detailed_short_whitespace_only_file_is_empty(tmp_path: Path) -> None:
    """A short file of only whitespace lines reports state ``empty``."""
    blanks_log = tmp_path / "blanks.log"
    whitespace_lines = ["\n", " \n", "\t\n"]
    _write(blanks_log, whitespace_lines)

    outcome = sniff_format_detailed(blanks_log)

    assert outcome.state == "empty"
    assert outcome.schema is None
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def test_detailed_long_whitespace_falls_to_blob_not_empty(
    monkeypatch, tmp_path: Path
) -> None:
    """More leading-whitespace lines than the bounded peek can prove → blob, not empty.

    The EOF probe cannot confirm the file is truly empty when it has more
    content past the peek. Whitespace beyond what we read must NOT short-
    circuit to the empty path. Locks the EOF-sensitive contract.
    """
    # Supply more whitespace-only lines than the peek budget: after the peek,
    # the EOF probe pulls one extra line, finds the stream is not exhausted,
    # and so the result must NOT be "empty".
    # Plain literal: the original used an f-string with no placeholders.
    extra_whitespace = (" \n" for _ in range(_SNIFF_MAX_PEEK + 5))
    handle = _CountingHandle(extra_whitespace)

    def fake_open_log(path):
        return handle

    fake_path = tmp_path / "fake"
    fake_path.write_text("placeholder", encoding="utf-8")  # nonzero size to pass stat()
    monkeypatch.setattr(loader, "_open_log", fake_open_log)
    result = sniff_format_detailed(fake_path)
    assert result.state == "classified"
    assert result.schema == "blob"
    assert result.origin is None
    # Peek + 1 EOF probe; never more.
    assert handle.read_count == _SNIFF_MAX_PEEK + 1
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def test_detailed_unrecognized_text_is_blob(tmp_path: Path) -> None:
    """Unrecognized text: classified as schema ``blob`` with no origin."""
    mystery_file = tmp_path / "mystery.txt"
    free_text = ["hello world\n", "this is not a log\n"]
    _write(mystery_file, free_text)

    outcome = sniff_format_detailed(mystery_file)

    assert outcome.state == "classified"
    assert outcome.schema == "blob"
    assert outcome.origin is None
|