loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,582 @@
|
|
|
1
|
+
"""Tests for the syslog parser (parsers/syslog.py) and load_syslog() integration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import bz2
|
|
6
|
+
import gzip
|
|
7
|
+
import lzma
|
|
8
|
+
import math
|
|
9
|
+
from datetime import datetime, timedelta, timezone
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import pytest
|
|
13
|
+
|
|
14
|
+
from loghunter.common.loader import (
|
|
15
|
+
_stem_hostname,
|
|
16
|
+
load_required_logs,
|
|
17
|
+
load_syslog,
|
|
18
|
+
)
|
|
19
|
+
from loghunter.parsers.syslog import (
|
|
20
|
+
is_reboot_signal,
|
|
21
|
+
parse_program,
|
|
22
|
+
parse_timestamp,
|
|
23
|
+
strip_header,
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
# ── parse_timestamp ────────────────────────────────────────────────────────────
|
|
28
|
+
|
|
29
|
+
def test_parse_timestamp_year_rollback() -> None:
    """A timestamp about 10 days in the future is rolled back to the previous year.

    The fixture date is derived from the wall clock, so two guards keep the
    test deterministic on every calendar day and in every environment:

    * Feb 29 is nudged to Mar 1, because ``replace(year=year - 1)`` raises
      ``ValueError`` when the previous year has no Feb 29.
    * The month abbreviation comes from a fixed English table instead of
      ``strftime('%b')``, whose output follows the process locale.
    """
    month_abbr = (
        "Jan", "Feb", "Mar", "Apr", "May", "Jun",
        "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
    )
    future = (datetime.now(timezone.utc) + timedelta(days=10)).replace(
        hour=12, minute=0, second=0, microsecond=0
    )
    if (future.month, future.day) == (2, 29):
        # Mar 1 exists in every year, and is still in the future.
        future += timedelta(days=1)

    raw = f"<134>{month_abbr[future.month - 1]} {future.day} 12:00:00 router sshd: message"
    result = parse_timestamp(raw)

    assert result is not None
    assert result == future.replace(year=future.year - 1)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_parse_timestamp_returns_utc_aware() -> None:
    """A successfully parsed timestamp carries ``timezone.utc`` as its tzinfo."""
    parsed = parse_timestamp("<134>May 31 12:00:00 router sshd: message")
    assert parsed is not None
    assert parsed.tzinfo == timezone.utc
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def test_parse_timestamp_unparseable_returns_none() -> None:
    """Input with no parseable timestamp yields ``None``."""
    not_syslog = "not a valid syslog line at all"
    assert parse_timestamp(not_syslog) is None
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
# ── is_reboot_signal ───────────────────────────────────────────────────────────
|
|
51
|
+
|
|
52
|
+
def test_is_reboot_signal_logind_reboot() -> None:
    """A systemd-logind "System is rebooting." line is flagged as a reboot signal."""
    verdict = is_reboot_signal(
        "<165>May 31 06:00:00 router systemd-logind[42]: System is rebooting."
    )
    assert verdict is True
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def test_is_reboot_signal_rsyslogd_exit() -> None:
    """An rsyslogd "exiting on signal 15." line is flagged as a reboot signal."""
    verdict = is_reboot_signal(
        "<165>May 31 06:00:00 router rsyslogd: exiting on signal 15."
    )
    assert verdict is True
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def test_is_reboot_signal_false_for_normal_line() -> None:
    """An ordinary sshd login line is not a reboot signal."""
    verdict = is_reboot_signal(
        "<134>May 31 12:00:00 router sshd[1234]: Accepted publickey for user"
    )
    assert verdict is False
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# ── parse_program ──────────────────────────────────────────────────────────────
|
|
68
|
+
|
|
69
|
+
@pytest.mark.parametrize(
    "body, expected",
    [
        # Well-formed program tags, with and without a PID suffix.
        ("sshd[1234]: Accepted publickey", "sshd"),
        ("postfix/smtpd[889]: connect from", "postfix/smtpd"),
        ("kernel: Linux version 6.1", "kernel"),
        ("audisp: node=... type=...", "audisp"),
        # No usable leading token: the "unknown" fallback applies.
        ("", "unknown"),
        (" ", "unknown"),
        (": payload", "unknown"),
        ("[123]: payload", "unknown"),
    ],
)
def test_parse_program(body: str, expected: str) -> None:
    """parse_program yields the leading non-whitespace token that precedes
    '[' or ':', and 'unknown' when the body offers no such token."""
    program = parse_program(body)
    assert program == expected
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# ── load_syslog ────────────────────────────────────────────────────────────────
|
|
89
|
+
|
|
90
|
+
def test_load_syslog_per_host_files(tmp_path: Path) -> None:
    """Two per-host files: H4 reads the in-content RFC-3164 host (which here
    equals the filename stem), correct schema, correct row count. Both files
    pass the content-sniff gate (real RFC-3164 lines).

    Rows are checked per host by count. The previous form filtered on a host
    and then asserted that same host, which could never fail.
    """
    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    (syslog_dir / "router.log").write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n"
        "<134>May 31 12:01:00 router sshd[101]: session opened for user\n",
        encoding="utf-8",
    )
    (syslog_dir / "webserver.log").write_text(
        "<134>May 31 12:02:00 webserver nginx[200]: GET / HTTP/1.1 200\n",
        encoding="utf-8",
    )

    df = load_syslog(syslog_dir)

    assert list(df.columns) == ["ts", "host", "program", "raw", "message"]
    assert len(df) == 3
    assert set(df["host"]) == {"router", "webserver"}

    router_rows = df[df["host"] == "router"]
    webserver_rows = df[df["host"] == "webserver"]
    # Each fixture line must land under the host named in its own header.
    assert len(router_rows) == 2
    assert len(webserver_rows) == 1
    assert set(router_rows["program"]) == {"sshd"}
    assert webserver_rows["program"].iloc[0] == "nginx"
    # Lock the byte-identical `message` invariant directly at this surface —
    # adding `program` must not perturb the drain3 input.
    assert set(router_rows["message"]) == {
        "sshd[*]: Accepted publickey for user",
        "sshd[*]: session opened for user",
    }
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def test_load_syslog_non_host_filename_reads_in_content_host(tmp_path: Path) -> None:
    """With a non-host filename stem (syslog.log), H4 takes the host from each
    line's own content; nothing is inherited from the filename."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    fixture_lines = [
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        "<134>May 31 12:01:00 webserver nginx[200]: GET / HTTP/1.1 200\n",
    ]
    (log_root / "syslog.log").write_text("".join(fixture_lines), encoding="utf-8")

    frame = load_syslog(log_root)

    assert list(frame.columns) == ["ts", "host", "program", "raw", "message"]
    assert len(frame) == 2
    assert set(frame["host"]) == {"router", "webserver"}
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_load_syslog_multi_host_dump_keeps_distinct_in_content_hosts(tmp_path: Path) -> None:
    """A flat multi-host dump under a non-host stem (syslog.2M.log): H4 keeps
    each line's distinct in-content host, so nothing collapses onto the
    filename (the bug-C collapse caused by the old whole-stem inheritance)."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    fixture_lines = [
        "<134>May 31 12:00:00 routerA sshd[100]: Accepted publickey for user\n",
        "<134>May 31 12:01:00 webserverB nginx[200]: GET / HTTP/1.1 200\n",
        "<134>May 31 12:02:00 dbhostC cron[300]: (root) CMD (placeholder)\n",
    ]
    (log_root / "syslog.2M.log").write_text("".join(fixture_lines), encoding="utf-8")

    frame = load_syslog(log_root)

    assert len(frame) == 3
    assert set(frame["host"]) == {"routerA", "webserverB", "dbhostC"}
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def test_load_syslog_hostless_line_falls_back_to_filename_stem(tmp_path: Path) -> None:
    """H4 fallback: a truly hostless line (parse_host → "unknown", <4 tokens)
    inherits the filename stem.

    The input is an EXPLICIT FILE because a named file bypasses the gate. A
    file found by directory discovery must pass the RFC-3164 gate, and a
    gate-passing line almost always yields a non-"unknown" host, so the
    fallback arm is not reachable from directory discovery.
    """
    hostless_file = tmp_path / "relay1.log"
    hostless_file.write_text("boot sequence done\n", encoding="utf-8")

    frame = load_syslog(hostless_file)

    assert len(frame) == 1
    only_row = frame.iloc[0]
    assert only_row["host"] == "relay1"
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def test_load_syslog_unparseable_timestamps_produce_nan_not_dropped(tmp_path: Path) -> None:
    """A line without a parseable timestamp stays in the DataFrame with ts=nan."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    fixture_lines = [
        "not a valid syslog line at all\n",
        "<134>May 31 12:00:00 192.0.2.1 sshd[100]: normal line\n",
    ]
    (log_root / "router.log").write_text("".join(fixture_lines), encoding="utf-8")

    frame = load_syslog(log_root)

    # Both lines survive; exactly one of them has no timestamp.
    assert len(frame) == 2
    missing_ts = frame[frame["ts"].isna()]
    assert len(missing_ts) == 1
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
# ── single-file input, NDJSON skip, _stem_hostname, load_required_logs() wiring ──
|
|
193
|
+
|
|
194
|
+
def test_load_syslog_with_single_file(tmp_path: Path) -> None:
    """load_syslog() takes a single file path wherever it takes a directory."""
    single_log = tmp_path / "router.log"
    single_log.write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )

    frame = load_syslog(single_log)

    assert list(frame.columns) == ["ts", "host", "program", "raw", "message"]
    assert len(frame) == 1
    only_row = frame.iloc[0]
    assert only_row["host"] == "router"
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def test_load_syslog_directory_silently_drops_ndjson(tmp_path: Path, capsys) -> None:
    """A wrong-family NDJSON file inside a syslog DIRECTORY is dropped by the
    content-sniff gate, silently at EVERY verbosity (decision C: no per-file
    stderr for rejected candidates). The real syslog file still loads."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    ndjson_record = '{"ts": 1.0, "id.orig_h": "192.0.2.1"}\n'
    (log_root / "conn.log").write_text(ndjson_record, encoding="utf-8")
    (log_root / "router.log").write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )

    # Default verbosity: one row, nothing at all on stderr.
    quiet_frame = load_syslog(log_root)
    assert len(quiet_frame) == 1
    assert capsys.readouterr().err == ""

    # Verbose: still one row, and the rejected file is never named.
    verbose_frame = load_syslog(log_root, verbose=True)
    assert len(verbose_frame) == 1
    assert "conn.log" not in capsys.readouterr().err
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def test_load_syslog_explicit_ndjson_file_skipped_and_warns(tmp_path: Path, capsys) -> None:
    """An EXPLICITLY-NAMED NDJSON file bypasses the gate (operator intent) yet
    is skipped by `_syslog_should_skip` at load. The skip note reaches stderr
    ONLY in verbose mode."""
    ndjson_file = tmp_path / "conn.log"
    ndjson_file.write_text('{"ts": 1.0, "id.orig_h": "192.0.2.1"}\n', encoding="utf-8")

    # Default verbosity: no rows, no stderr output.
    quiet_frame = load_syslog(ndjson_file)
    assert len(quiet_frame) == 0
    assert capsys.readouterr().err == ""

    # Verbose: still no rows, but the skip is explained on stderr.
    verbose_frame = load_syslog(ndjson_file, verbose=True)
    assert len(verbose_frame) == 0
    stderr_text = capsys.readouterr().err
    assert "conn.log" in stderr_text
    assert "NDJSON" in stderr_text
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def test_stem_hostname_variants() -> None:
    """_stem_hostname drops log suffixes and rotation numbers while keeping
    dotted hostnames intact."""
    expected_by_filename = {
        "router.log": "router",
        "router.log.gz": "router",
        "host1.example.com.log": "host1.example.com",
        "syslog.log.1": "syslog",
    }
    for filename, hostname in expected_by_filename.items():
        assert _stem_hostname(filename) == hostname
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
def test_load_required_logs_routes_syslog_dir(tmp_path: Path) -> None:
    """load_required_logs() takes the syslog_dir branch and hands back a frame
    with the syslog schema."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    (log_root / "router.log").write_text(
        "<134>May 31 12:00:00 192.0.2.1 sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )

    required = {"*": "syslog_dir"}
    sources = {"syslog_dir": [log_root]}
    outcome = load_required_logs(required, sources)

    assert "*" in outcome.logs
    frame = outcome.logs["*"]
    assert list(frame.columns) == ["ts", "host", "program", "raw", "message"]
    assert len(frame) == 1
    assert outcome.record_counts == {"*": 1}
    assert outcome.warnings == []
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
# ── strip_header doubled-timestamp invariant ──────────────────────────────────
|
|
278
|
+
|
|
279
|
+
def test_strip_header_preserves_inner_timestamp_in_body() -> None:
    """SYSLOG_HDR_RE is `^`-anchored, so only the LEADING transport header is
    removed; an application's own RFC 3164-shaped timestamp inside the body
    is left verbatim.

    The Zeek syslog.log normalizer relies on this: both feeds share
    strip_header, so a regression here would misderive `program`/`message`
    on either path.
    """
    inner = "prog: payload Jan 02 03:04:05 host2 prog2: inner"
    line_with_two_headers = "Jan 02 03:04:05 host1 " + inner
    assert strip_header(line_with_two_headers) == inner
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def test_strip_header_idempotent_when_no_leading_header() -> None:
    """A body with no leading transport header comes back unchanged (PRI
    prefix stripping aside, and no PRI prefix is present here either)."""
    headerless = "prog: body without any leading transport header"
    stripped = strip_header(headerless)
    assert stripped == "prog: body without any leading transport header"
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# ── load_syslog defensive Zeek-TSV skip (gated on #separator) ─────────────────
|
|
298
|
+
|
|
299
|
+
def test_load_syslog_directory_silently_drops_zeek_tsv(tmp_path: Path, capsys) -> None:
    """A Zeek-TSV syslog.log inside a syslog DIRECTORY is dropped by the
    content-sniff gate (it has no RFC-3164 header line), silently at EVERY
    verbosity. The real syslog file still loads and is not garbled into
    NaN-ts rows."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    zeek_tsv = "".join(
        [
            "#separator \\x09\n",
            "#set_separator\t,\n",
            "#path\tsyslog\n",
            "#fields\tts\thost\tmessage\n",
            "#types\ttime\tstring\tstring\n",
            "1779750000.0\thost1\tplaceholder\n",
        ]
    )
    (log_root / "syslog.log").write_text(zeek_tsv, encoding="utf-8")
    (log_root / "router.log").write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )

    # Default verbosity: only the real syslog row, empty stderr.
    quiet_frame = load_syslog(log_root)
    assert len(quiet_frame) == 1
    assert capsys.readouterr().err == ""

    # Verbose: same single row, and the rejected file is never named.
    verbose_frame = load_syslog(log_root, verbose=True)
    assert len(verbose_frame) == 1
    assert "syslog.log" not in capsys.readouterr().err
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def test_load_syslog_explicit_zeek_tsv_file_skipped_and_warns(tmp_path: Path, capsys) -> None:
    """An EXPLICITLY-NAMED Zeek-TSV file bypasses the gate but is skipped by
    `_syslog_should_skip` at load. That check is narrow on the `#separator`
    directive (the exact signal the Zeek strategy parse uses). In verbose
    mode an actionable note pointing at zeek_dir is emitted."""
    zeek_file = tmp_path / "syslog.log"
    zeek_tsv = "".join(
        [
            "#separator \\x09\n",
            "#set_separator\t,\n",
            "#path\tsyslog\n",
            "#fields\tts\thost\tmessage\n",
            "#types\ttime\tstring\tstring\n",
            "1779750000.0\thost1\tplaceholder\n",
        ]
    )
    zeek_file.write_text(zeek_tsv, encoding="utf-8")

    # Default verbosity: no rows, no stderr output.
    quiet_frame = load_syslog(zeek_file)
    assert len(quiet_frame) == 0
    assert capsys.readouterr().err == ""

    # Verbose: still no rows, plus a note naming the file, format and remedy.
    verbose_frame = load_syslog(zeek_file, verbose=True)
    assert len(verbose_frame) == 0
    stderr_text = capsys.readouterr().err
    assert "syslog.log" in stderr_text
    assert "Zeek TSV" in stderr_text
    assert "zeek_dir" in stderr_text
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def test_load_syslog_does_not_skip_hash_comment_flat_syslog(tmp_path: Path) -> None:
    """An ordinary flat syslog file that merely contains `#` comments is NOT
    skipped: the Zeek-TSV gate keys on `#separator`, not on a generic `#`.
    Regression check for the gate-narrowness rail. (Explicit file → gate
    bypassed; should_skip must still not skip it.)"""
    commented_log = tmp_path / "router.log"
    fixture_lines = [
        "# this is a leading comment, not a Zeek header\n",
        "# another comment\n",
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
    ]
    commented_log.write_text("".join(fixture_lines), encoding="utf-8")

    frame = load_syslog(commented_log)

    assert len(frame) == 1
    only_row = frame.iloc[0]
    assert only_row["host"] == "router"
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
# ── bz2 / xz transparent decompression at load_syslog ────────────────────────
|
|
375
|
+
#
|
|
376
|
+
# This is the bug that triggered the prompt: a rotated `system.log.bz2` in
|
|
377
|
+
# `/var/log` was read as replacement-char garbage and the syslog detector
|
|
378
|
+
# titled findings with binary soup. With bz2/xz in `_open_log`, the public
|
|
379
|
+
# `load_syslog` path ingests the file as text rows like any other syslog file.
|
|
380
|
+
|
|
381
|
+
_SYSLOG_BZ2_XZ_LINES = (
|
|
382
|
+
"<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n"
|
|
383
|
+
"<134>May 31 12:01:00 router sshd[101]: session opened for user\n"
|
|
384
|
+
)
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def test_load_syslog_decompresses_bz2(tmp_path: Path) -> None:
    """A rotated `system.log.bz2` is ingested as text rows, not binary soup.

    `system` is a generic stem, so per-line `parse_host` runs and recovers
    the embedded `router` host from the fixture lines. The load-bearing
    invariant is that the rows render as TEXT, not as bzip2-magic / soup.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    compressed = bz2.compress(_SYSLOG_BZ2_XZ_LINES.encode("utf-8"))
    (log_root / "system.log.bz2").write_bytes(compressed)

    frame = load_syslog(log_root)

    assert len(frame) == 2
    assert set(frame["host"]) == {"router"}
    assert set(frame["program"]) == {"sshd"}
    # Sanity: neither the bzip2 magic nor replacement characters leaked into
    # the raw text that feeds finding titles.
    raw_lines = frame["raw"]
    assert not any("BZh" in r for r in raw_lines)
    assert not any("�" in r for r in raw_lines)
|
|
408
|
+
|
|
409
|
+
|
|
410
|
+
def test_load_syslog_decompresses_xz(tmp_path: Path) -> None:
    """The xz counterpart of the bz2 decompression test, with the same shape."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    compressed = lzma.compress(_SYSLOG_BZ2_XZ_LINES.encode("utf-8"))
    (log_root / "messages.log.xz").write_bytes(compressed)

    frame = load_syslog(log_root)

    assert len(frame) == 2
    assert set(frame["host"]) == {"router"}
    assert set(frame["program"]) == {"sshd"}
    # Neither the xz magic (`\xfd7zXZ`) nor replacement characters appear in
    # the raw text.
    raw_lines = frame["raw"]
    assert not any("7zXZ" in r for r in raw_lines)
    assert not any("�" in r for r in raw_lines)
|
|
426
|
+
|
|
427
|
+
|
|
428
|
+
# ── load_syslog: corrupt compressed-file skip-with-warning ──────────────────
|
|
429
|
+
#
|
|
430
|
+
# `_open_log` is lazy — corrupt compressed files raise at the READ site, not
|
|
431
|
+
# the open. The flat-syslog reader catches the decode-error family per-file,
|
|
432
|
+
# emits the standard read-warning, and continues so one bad file never aborts
|
|
433
|
+
# the load. `lzma.LZMAError` is NOT an `OSError` — without the explicit
|
|
434
|
+
# catch, a corrupt `.xz` would leak past the CLI as a raw traceback.
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
@pytest.mark.parametrize(
    "suffix, corrupt_bytes",
    [
        (".gz", b"NOTGZIP garbage"),
        (".bz2", b"NOTBZIP2 garbage"),
        (".xz", b"NOTXZ garbage"),
    ],
)
def test_load_syslog_corrupt_compressed_file_skipped_with_warning(
    tmp_path: Path, suffix: str, corrupt_bytes: bytes,
) -> None:
    """A corrupt compressed file is skipped on its own, with the actionable
    read-warning, while good files in the same directory still load (the
    skip is per-file, not whole-run).

    The warning wording depends on the corruption shape: .gz/.xz land in the
    "incomplete or corrupt" branch, whereas .bz2's OSError falls to the
    generic class-name fallback (per the prompt's "do not contort to
    special-case" note). Both satisfy the load-bearing rail of "warned, not
    traceback'd".
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    # A healthy file sits next to the corrupt one.
    (log_root / "router.log").write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )
    (log_root / f"system.log{suffix}").write_bytes(corrupt_bytes)

    collected: list[str] = []
    frame = load_syslog(log_root, _warnings=collected)

    # The healthy file was still loaded.
    assert len(frame) == 1
    assert frame.iloc[0]["host"] == "router"
    # The corrupt file surfaced as an actionable warning, not a traceback.
    expected_fragment = f"system.log{suffix} could not be read"
    assert any(expected_fragment in w for w in collected)
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def test_load_syslog_corrupt_xz_lands_in_incomplete_or_corrupt_branch(
    tmp_path: Path,
) -> None:
    """The wrinkle assertion: a corrupt `.xz` ends up in
    `_zeek_file_read_warning`'s "compressed file is incomplete or corrupt"
    branch rather than the generic class-name fallback. That shows
    `lzma.LZMAError` is recognised by the warning helper, not merely caught
    at the loop."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    (log_root / "system.log.xz").write_bytes(b"NOTXZ garbage")

    collected: list[str] = []
    load_syslog(log_root, _warnings=collected)

    def names_file_and_branch(w: str) -> bool:
        # One warning must carry both the file name and the branch wording.
        return "system.log.xz could not be read" in w and "incomplete or corrupt" in w

    assert any(names_file_and_branch(w) for w in collected)
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def test_load_syslog_corrupt_compressed_file_without_warnings_buffer(
    tmp_path: Path,
) -> None:
    """With `_warnings` left as None (notebook callers, direct library use) a
    corrupt file still does not raise; it is skipped silently. This pins the
    warnings=None branch so a later tightening cannot regress it."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    (log_root / "system.log.xz").write_bytes(b"NOTXZ garbage")

    # No `_warnings` argument is passed on purpose.
    frame = load_syslog(log_root)

    assert frame.empty
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
# ── load_syslog: truncated (trailer-corrupt) compressed file honesty rail ──
|
|
508
|
+
#
|
|
509
|
+
# Invalid-magic corruption raises immediately on read. Truncated compressed
|
|
510
|
+
# files are nastier: the decompressor yields valid-looking lines and only
|
|
511
|
+
# raises at the EOF/trailer check. Pre-honesty-fix, a file the loader warned
|
|
512
|
+
# it had "skipped" still leaked rows into the returned frame.
|
|
513
|
+
# Honesty rail: a file the loader warns it skipped contributes ZERO rows.
|
|
514
|
+
|
|
515
|
+
_SYSLOG_TRUNCATE_PAYLOAD = (
|
|
516
|
+
"<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user a\n"
|
|
517
|
+
"<134>May 31 12:01:00 router sshd[101]: Accepted publickey for user b\n"
|
|
518
|
+
"<134>May 31 12:02:00 router sshd[102]: Accepted publickey for user c\n"
|
|
519
|
+
"<134>May 31 12:03:00 router sshd[103]: Accepted publickey for user d\n"
|
|
520
|
+
"<134>May 31 12:04:00 router sshd[104]: Accepted publickey for user e\n"
|
|
521
|
+
"<134>May 31 12:05:00 router sshd[105]: Accepted publickey for user f\n"
|
|
522
|
+
"<134>May 31 12:06:00 router sshd[106]: Accepted publickey for user g\n"
|
|
523
|
+
"<134>May 31 12:07:00 router sshd[107]: Accepted publickey for user h\n"
|
|
524
|
+
"<134>May 31 12:08:00 router sshd[108]: Accepted publickey for user i\n"
|
|
525
|
+
"<134>May 31 12:09:00 router sshd[109]: Accepted publickey for user j\n"
|
|
526
|
+
"<134>May 31 12:10:00 router sshd[110]: Accepted publickey for user k\n"
|
|
527
|
+
"<134>May 31 12:11:00 router sshd[111]: Accepted publickey for user l\n"
|
|
528
|
+
"<134>May 31 12:12:00 router sshd[112]: Accepted publickey for user m\n"
|
|
529
|
+
"<134>May 31 12:13:00 router sshd[113]: Accepted publickey for user n\n"
|
|
530
|
+
"<134>May 31 12:14:00 router sshd[114]: Accepted publickey for user o\n"
|
|
531
|
+
"<134>May 31 12:15:00 router sshd[115]: Accepted publickey for user p\n"
|
|
532
|
+
"<134>May 31 12:16:00 router sshd[116]: Accepted publickey for user q\n"
|
|
533
|
+
"<134>May 31 12:17:00 router sshd[117]: Accepted publickey for user r\n"
|
|
534
|
+
"<134>May 31 12:18:00 router sshd[118]: Accepted publickey for user s\n"
|
|
535
|
+
"<134>May 31 12:19:00 router sshd[119]: Accepted publickey for user t\n"
|
|
536
|
+
)
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
def _truncated_compressed(payload: bytes, suffix: str) -> bytes:
|
|
540
|
+
"""Compress ``payload`` with the suffix's algorithm and lop off the last
|
|
541
|
+
byte so the trailer fails. The decompressor yields valid-looking lines
|
|
542
|
+
until it hits the broken trailer, then raises — exactly the shape Glenn
|
|
543
|
+
flagged."""
|
|
544
|
+
if suffix == ".gz":
|
|
545
|
+
return gzip.compress(payload)[:-1]
|
|
546
|
+
if suffix == ".bz2":
|
|
547
|
+
return bz2.compress(payload)[:-1]
|
|
548
|
+
if suffix == ".xz":
|
|
549
|
+
return lzma.compress(payload)[:-1]
|
|
550
|
+
raise ValueError(f"unsupported suffix {suffix!r}")
|
|
551
|
+
|
|
552
|
+
|
|
553
|
+
@pytest.mark.parametrize("suffix", [".gz", ".bz2", ".xz"])
def test_load_syslog_trailer_corrupt_compressed_contributes_zero_rows(
    tmp_path: Path, suffix: str,
) -> None:
    """For a truncated `.gz` / `.bz2` / `.xz` syslog file the warning fires AND
    the corrupt file contributes ZERO rows, while a good companion file in
    the same directory still loads (the skip is per-file, not whole-run)."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    # The good companion holds exactly one identifiable line.
    (log_root / "router.log").write_text(
        "<134>May 31 23:59:00 router sshd[999]: Accepted publickey for COMPANION\n",
        encoding="utf-8",
    )
    truncated = _truncated_compressed(_SYSLOG_TRUNCATE_PAYLOAD.encode("utf-8"), suffix)
    (log_root / f"system.log{suffix}").write_bytes(truncated)

    collected: list[str] = []
    frame = load_syslog(log_root, _warnings=collected)

    # The corrupt file produced a warning…
    expected_fragment = f"system.log{suffix} could not be read"
    assert any(expected_fragment in w for w in collected)
    # …and no rows: the companion's single row is the ONLY row in the frame.
    # Pre-honesty-fix, the truncated file's pre-EOF rows leaked in here too.
    assert len(frame) == 1
    assert "COMPANION" in frame.iloc[0]["raw"]
|