loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,582 @@
1
+ """Tests for the syslog parser (parsers/syslog.py) and load_syslog() integration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import bz2
6
+ import gzip
7
+ import lzma
8
+ import math
9
+ from datetime import datetime, timedelta, timezone
10
+ from pathlib import Path
11
+
12
+ import pytest
13
+
14
+ from loghunter.common.loader import (
15
+ _stem_hostname,
16
+ load_required_logs,
17
+ load_syslog,
18
+ )
19
+ from loghunter.parsers.syslog import (
20
+ is_reboot_signal,
21
+ parse_program,
22
+ parse_timestamp,
23
+ strip_header,
24
+ )
25
+
26
+
27
+ # ── parse_timestamp ────────────────────────────────────────────────────────────
28
+
29
def test_parse_timestamp_year_rollback() -> None:
    """A month/day that lies ten days ahead of now parses into the prior year.

    The header carries no year, so the expectation is the same wall-clock
    instant with the year decremented by one.
    """
    ten_days_ahead = datetime.now(timezone.utc) + timedelta(days=10)
    stamp = ten_days_ahead.replace(hour=12, minute=0, second=0, microsecond=0)
    month_abbr = stamp.strftime("%b")
    line = f"<134>{month_abbr} {stamp.day} 12:00:00 router sshd: message"

    parsed = parse_timestamp(line)

    assert parsed is not None
    # NOTE(review): if `stamp` ever falls on Feb 29, `replace(year=...)` raises
    # ValueError for the non-leap previous year — confirm whether that matters.
    assert parsed == stamp.replace(year=stamp.year - 1)
38
+
39
+
40
def test_parse_timestamp_returns_utc_aware() -> None:
    """A parseable header yields a datetime whose tzinfo is UTC."""
    line = "<134>May 31 12:00:00 router sshd: message"
    parsed = parse_timestamp(line)
    assert parsed is not None
    assert parsed.tzinfo == timezone.utc
44
+
45
+
46
def test_parse_timestamp_unparseable_returns_none() -> None:
    """Text with no syslog timestamp header parses to None."""
    parsed = parse_timestamp("not a valid syslog line at all")
    assert parsed is None
48
+
49
+
50
+ # ── is_reboot_signal ───────────────────────────────────────────────────────────
51
+
52
def test_is_reboot_signal_logind_reboot() -> None:
    """The systemd-logind "System is rebooting." line is a reboot signal."""
    verdict = is_reboot_signal(
        "<165>May 31 06:00:00 router systemd-logind[42]: System is rebooting."
    )
    assert verdict is True
55
+
56
+
57
def test_is_reboot_signal_rsyslogd_exit() -> None:
    """The rsyslogd "exiting on signal 15." line is a reboot signal."""
    verdict = is_reboot_signal(
        "<165>May 31 06:00:00 router rsyslogd: exiting on signal 15."
    )
    assert verdict is True
60
+
61
+
62
def test_is_reboot_signal_false_for_normal_line() -> None:
    """An ordinary sshd login line is not a reboot signal."""
    verdict = is_reboot_signal(
        "<134>May 31 12:00:00 router sshd[1234]: Accepted publickey for user"
    )
    assert verdict is False
65
+
66
+
67
+ # ── parse_program ──────────────────────────────────────────────────────────────
68
+
69
@pytest.mark.parametrize(
    ("body", "expected"),
    [
        # Well-formed program tokens, with and without a PID bracket.
        ("sshd[1234]: Accepted publickey", "sshd"),
        ("postfix/smtpd[889]: connect from", "postfix/smtpd"),
        ("kernel: Linux version 6.1", "kernel"),
        ("audisp: node=... type=...", "audisp"),
        # Bodies with no leading token fall back to "unknown".
        ("", "unknown"),
        (" ", "unknown"),
        (": payload", "unknown"),
        ("[123]: payload", "unknown"),
    ],
)
def test_parse_program(body: str, expected: str) -> None:
    """parse_program yields the leading token that precedes '[' or ':'.

    When the body has no such token the result is the literal 'unknown'.
    """
    program = parse_program(body)
    assert program == expected
86
+
87
+
88
+ # ── load_syslog ────────────────────────────────────────────────────────────────
89
+
90
def test_load_syslog_per_host_files(tmp_path: Path) -> None:
    """Two per-host files load with the right schema, hosts and row counts.

    Each file contains real RFC-3164 lines whose in-content host equals the
    filename stem. The test checks the column order, the total and per-host
    row counts, the derived ``program`` values, and the exact ``message``
    strings for the router rows.
    """
    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    (syslog_dir / "router.log").write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n"
        "<134>May 31 12:01:00 router sshd[101]: session opened for user\n",
        encoding="utf-8",
    )
    (syslog_dir / "webserver.log").write_text(
        "<134>May 31 12:02:00 webserver nginx[200]: GET / HTTP/1.1 200\n",
        encoding="utf-8",
    )

    df = load_syslog(syslog_dir)

    assert list(df.columns) == ["ts", "host", "program", "raw", "message"]
    assert len(df) == 3
    assert set(df["host"]) == {"router", "webserver"}

    router_rows = df[df["host"] == "router"]
    webserver_rows = df[df["host"] == "webserver"]
    # Per-host row counts: the previous form filtered on a host and then
    # asserted that same host, which can never fail. Counting the rows is the
    # check that actually detects a line attributed to the wrong host.
    assert len(router_rows) == 2
    assert len(webserver_rows) == 1
    assert set(router_rows["program"]) == {"sshd"}
    assert set(webserver_rows["program"]) == {"nginx"}
    # Lock the byte-identical `message` invariant directly at this surface —
    # adding `program` must not perturb the drain3 input.
    assert set(router_rows["message"]) == {
        "sshd[*]: Accepted publickey for user",
        "sshd[*]: session opened for user",
    }
121
+
122
+
123
def test_load_syslog_non_host_filename_reads_in_content_host(tmp_path: Path) -> None:
    """With a generic stem (syslog.log) each row takes its host from the line.

    The filename is not inherited as the host.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    content = (
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n"
        "<134>May 31 12:01:00 webserver nginx[200]: GET / HTTP/1.1 200\n"
    )
    (log_root / "syslog.log").write_text(content, encoding="utf-8")

    frame = load_syslog(log_root)

    assert list(frame.columns) == ["ts", "host", "program", "raw", "message"]
    assert len(frame) == 2
    assert set(frame["host"]) == {"router", "webserver"}
139
+
140
+
141
def test_load_syslog_multi_host_dump_keeps_distinct_in_content_hosts(tmp_path: Path) -> None:
    """A flat multi-host dump (syslog.2M.log) keeps one host per line.

    Three lines from three different hosts must surface as three distinct
    hosts; none of them collapses to the filename stem.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    content = (
        "<134>May 31 12:00:00 routerA sshd[100]: Accepted publickey for user\n"
        "<134>May 31 12:01:00 webserverB nginx[200]: GET / HTTP/1.1 200\n"
        "<134>May 31 12:02:00 dbhostC cron[300]: (root) CMD (placeholder)\n"
    )
    (log_root / "syslog.2M.log").write_text(content, encoding="utf-8")

    frame = load_syslog(log_root)

    assert len(frame) == 3
    assert set(frame["host"]) == {"routerA", "webserverB", "dbhostC"}
158
+
159
+
160
def test_load_syslog_hostless_line_falls_back_to_filename_stem(tmp_path: Path) -> None:
    """A line with no recoverable host takes the filename stem as its host.

    The input is passed as an explicit FILE rather than a directory, so the
    single headerless line is loaded and its host resolves to ``relay1``.
    """
    log_file = tmp_path / "relay1.log"
    log_file.write_text("boot sequence done\n", encoding="utf-8")

    frame = load_syslog(log_file)

    assert len(frame) == 1
    first_row = frame.iloc[0]
    assert first_row["host"] == "relay1"
173
+
174
+
175
def test_load_syslog_unparseable_timestamps_produce_nan_not_dropped(tmp_path: Path) -> None:
    """A line without a parseable timestamp stays in the frame with a null ts."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    content = (
        "not a valid syslog line at all\n"
        "<134>May 31 12:00:00 192.0.2.1 sshd[100]: normal line\n"
    )
    (log_root / "router.log").write_text(content, encoding="utf-8")

    frame = load_syslog(log_root)

    # Both lines are kept; exactly one of them has a missing timestamp.
    assert len(frame) == 2
    missing_ts = frame["ts"].isna()
    assert len(frame[missing_ts]) == 1
190
+
191
+
192
+ # ── load_syslog single-file / wrong-family inputs and load_required_logs() wiring ──
193
+
194
def test_load_syslog_with_single_file(tmp_path: Path) -> None:
    """load_syslog() works when handed one file path instead of a directory."""
    single = tmp_path / "router.log"
    single.write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )

    frame = load_syslog(single)

    assert list(frame.columns) == ["ts", "host", "program", "raw", "message"]
    assert len(frame) == 1
    only_row = frame.iloc[0]
    assert only_row["host"] == "router"
205
+
206
+
207
def test_load_syslog_directory_silently_drops_ndjson(tmp_path: Path, capsys) -> None:
    """An NDJSON file inside a syslog DIRECTORY is dropped without any stderr.

    Only the genuine syslog file contributes a row. In default mode stderr is
    empty; in verbose mode the dropped file's name must not be mentioned.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    ndjson_file = log_root / "conn.log"
    ndjson_file.write_text(
        '{"ts": 1.0, "id.orig_h": "192.0.2.1"}\n', encoding="utf-8"
    )
    real_file = log_root / "router.log"
    real_file.write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )

    # Default verbosity: one row, nothing on stderr.
    quiet_frame = load_syslog(log_root)
    assert len(quiet_frame) == 1
    assert capsys.readouterr().err == ""

    # Verbose: still one row, and no note about the rejected candidate.
    verbose_frame = load_syslog(log_root, verbose=True)
    assert len(verbose_frame) == 1
    assert "conn.log" not in capsys.readouterr().err
227
+
228
+
229
def test_load_syslog_explicit_ndjson_file_skipped_and_warns(tmp_path: Path, capsys) -> None:
    """An explicitly named NDJSON file loads zero rows; verbose mode says why.

    Default mode is silent on stderr. Verbose mode emits a note that names the
    file and mentions "NDJSON".
    """
    ndjson_file = tmp_path / "conn.log"
    ndjson_file.write_text('{"ts": 1.0, "id.orig_h": "192.0.2.1"}\n', encoding="utf-8")

    quiet_frame = load_syslog(ndjson_file)
    assert len(quiet_frame) == 0
    assert capsys.readouterr().err == ""

    verbose_frame = load_syslog(ndjson_file, verbose=True)
    assert len(verbose_frame) == 0
    stderr_text = capsys.readouterr().err
    assert "conn.log" in stderr_text
    assert "NDJSON" in stderr_text
245
+
246
+
247
def test_stem_hostname_variants() -> None:
    """_stem_hostname drops log/compression suffixes and rotation numbers.

    Dots that belong to the hostname itself are preserved.
    """
    expectations = {
        "router.log": "router",
        "router.log.gz": "router",
        "host1.example.com.log": "host1.example.com",
        "syslog.log.1": "syslog",
    }
    for filename, hostname in expectations.items():
        assert _stem_hostname(filename) == hostname
253
+
254
+
255
def test_load_required_logs_routes_syslog_dir(tmp_path: Path) -> None:
    """load_required_logs() routes a syslog_dir request to the syslog schema.

    The result exposes the frame under the requested key, a matching record
    count, and no warnings.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    (log_root / "router.log").write_text(
        "<134>May 31 12:00:00 192.0.2.1 sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )
    required = {"*": "syslog_dir"}
    sources = {"syslog_dir": [log_root]}

    outcome = load_required_logs(required, sources)

    assert "*" in outcome.logs
    frame = outcome.logs["*"]
    assert list(frame.columns) == ["ts", "host", "program", "raw", "message"]
    assert len(frame) == 1
    assert outcome.record_counts == {"*": 1}
    assert outcome.warnings == []
275
+
276
+
277
+ # ── strip_header doubled-timestamp invariant ──────────────────────────────────
278
+
279
def test_strip_header_preserves_inner_timestamp_in_body() -> None:
    """Only the leading transport header is removed by strip_header.

    A second RFC 3164-shaped timestamp that appears later in the body must
    come through untouched. Both syslog feeds rely on strip_header, so a
    regression here would skew `program`/`message` on either path.
    """
    header = "Jan 02 03:04:05 host1 "
    body = "prog: payload Jan 02 03:04:05 host2 prog2: inner"
    assert header + body == (
        "Jan 02 03:04:05 host1 prog: payload Jan 02 03:04:05 host2 prog2: inner"
    )

    result = strip_header(header + body)

    assert result == "prog: payload Jan 02 03:04:05 host2 prog2: inner"
288
+
289
+
290
def test_strip_header_idempotent_when_no_leading_header() -> None:
    """Input with no leading transport header (and no PRI) comes back as-is."""
    headerless = "prog: body without any leading transport header"
    result = strip_header(headerless)
    assert result == "prog: body without any leading transport header"
295
+
296
+
297
+ # ── load_syslog defensive Zeek-TSV skip (gated on #separator) ─────────────────
298
+
299
def test_load_syslog_directory_silently_drops_zeek_tsv(tmp_path: Path, capsys) -> None:
    """A Zeek-TSV syslog.log inside a syslog DIRECTORY is dropped silently.

    The real syslog file beside it still yields its single row. Nothing is
    written to stderr in default mode, and verbose mode does not mention the
    dropped file.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    zeek_tsv = (
        "#separator \\x09\n"
        "#set_separator\t,\n"
        "#path\tsyslog\n"
        "#fields\tts\thost\tmessage\n"
        "#types\ttime\tstring\tstring\n"
        "1779750000.0\thost1\tplaceholder\n"
    )
    (log_root / "syslog.log").write_text(zeek_tsv, encoding="utf-8")
    (log_root / "router.log").write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )

    quiet_frame = load_syslog(log_root)
    assert len(quiet_frame) == 1
    assert capsys.readouterr().err == ""

    verbose_frame = load_syslog(log_root, verbose=True)
    assert len(verbose_frame) == 1
    assert "syslog.log" not in capsys.readouterr().err
326
+
327
+
328
def test_load_syslog_explicit_zeek_tsv_file_skipped_and_warns(tmp_path: Path, capsys) -> None:
    """An explicitly named Zeek-TSV file loads zero rows; verbose explains.

    Default mode writes nothing to stderr. Verbose mode emits a note that
    names the file, identifies it as "Zeek TSV", and points at ``zeek_dir``.
    """
    zeek_file = tmp_path / "syslog.log"
    zeek_tsv = (
        "#separator \\x09\n"
        "#set_separator\t,\n"
        "#path\tsyslog\n"
        "#fields\tts\thost\tmessage\n"
        "#types\ttime\tstring\tstring\n"
        "1779750000.0\thost1\tplaceholder\n"
    )
    zeek_file.write_text(zeek_tsv, encoding="utf-8")

    quiet_frame = load_syslog(zeek_file)
    assert len(quiet_frame) == 0
    assert capsys.readouterr().err == ""

    verbose_frame = load_syslog(zeek_file, verbose=True)
    assert len(verbose_frame) == 0
    stderr_text = capsys.readouterr().err
    assert "syslog.log" in stderr_text
    assert "Zeek TSV" in stderr_text
    assert "zeek_dir" in stderr_text
354
+
355
+
356
def test_load_syslog_does_not_skip_hash_comment_flat_syslog(tmp_path: Path) -> None:
    """Plain `#` comment lines do not make a flat syslog file look like Zeek.

    Regression check that the Zeek-TSV skip keys on `#separator` specifically
    rather than on any leading `#`. The file is passed explicitly and must
    still load, yielding exactly one row with host ``router``.
    """
    log_file = tmp_path / "router.log"
    content = (
        "# this is a leading comment, not a Zeek header\n"
        "# another comment\n"
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n"
    )
    log_file.write_text(content, encoding="utf-8")

    frame = load_syslog(log_file)

    assert len(frame) == 1
    only_row = frame.iloc[0]
    assert only_row["host"] == "router"
372
+
373
+
374
+ # ── bz2 / xz transparent decompression at load_syslog ────────────────────────
375
+ #
376
+ # This is the bug that motivated these tests: a rotated `system.log.bz2` in
377
+ # `/var/log` was read as replacement-char garbage and the syslog detector
378
+ # titled findings with binary soup. With bz2/xz in `_open_log`, the public
379
+ # `load_syslog` path ingests the file as text rows like any other syslog file.
380
+
381
# Two RFC-3164 lines shared by the bz2 and xz decompression tests.
_SYSLOG_BZ2_XZ_LINES = "".join(
    [
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        "<134>May 31 12:01:00 router sshd[101]: session opened for user\n",
    ]
)
385
+
386
+
387
def test_load_syslog_decompresses_bz2(tmp_path: Path) -> None:
    """A rotated `system.log.bz2` is read as text rows, not compressed bytes.

    The fixture lines carry the host ``router`` and program ``sshd``; both
    must be recovered, and no bzip2 magic or U+FFFD replacement characters
    may appear in the ``raw`` column.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    compressed = bz2.compress(_SYSLOG_BZ2_XZ_LINES.encode("utf-8"))
    (log_root / "system.log.bz2").write_bytes(compressed)

    frame = load_syslog(log_root)

    assert len(frame) == 2
    assert set(frame["host"]) == {"router"}
    assert set(frame["program"]) == {"sshd"}
    # Sanity: neither the bzip2 magic nor replacement characters leaked through.
    assert all("BZh" not in raw for raw in frame["raw"])
    assert all("�" not in raw for raw in frame["raw"])
408
+
409
+
410
def test_load_syslog_decompresses_xz(tmp_path: Path) -> None:
    """The xz counterpart of the bz2 test: `messages.log.xz` loads as text."""
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    compressed = lzma.compress(_SYSLOG_BZ2_XZ_LINES.encode("utf-8"))
    (log_root / "messages.log.xz").write_bytes(compressed)

    frame = load_syslog(log_root)

    assert len(frame) == 2
    assert set(frame["host"]) == {"router"}
    assert set(frame["program"]) == {"sshd"}
    # No xz magic (`\xfd7zXZ`) and no replacement characters in the raw text.
    assert all("7zXZ" not in raw for raw in frame["raw"])
    assert all("�" not in raw for raw in frame["raw"])
426
+
427
+
428
+ # ── load_syslog: corrupt compressed-file skip-with-warning ──────────────────
429
+ #
430
+ # `_open_log` is lazy — corrupt compressed files raise at the READ site, not
431
+ # the open. The flat-syslog reader catches the decode-error family per-file,
432
+ # emits the standard read-warning, and continues so one bad file never aborts
433
+ # the load. `lzma.LZMAError` is NOT an `OSError` — without the explicit
434
+ # catch, a corrupt `.xz` would leak past the CLI as a raw traceback.
435
+
436
+
437
@pytest.mark.parametrize(
    ("suffix", "corrupt_bytes"),
    [
        (".gz", b"NOTGZIP garbage"),
        (".bz2", b"NOTBZIP2 garbage"),
        (".xz", b"NOTXZ garbage"),
    ],
)
def test_load_syslog_corrupt_compressed_file_skipped_with_warning(
    tmp_path: Path, suffix: str, corrupt_bytes: bytes,
) -> None:
    """A corrupt compressed file is skipped with a warning; good files load.

    The skip is per-file: the healthy ``router.log`` in the same directory
    still contributes its row. Only the shared "could not be read" phrase is
    asserted here, because the rest of the warning wording may differ between
    compression formats.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    bad_name = f"system.log{suffix}"
    # Healthy companion placed next to the corrupt file.
    (log_root / "router.log").write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )
    (log_root / bad_name).write_bytes(corrupt_bytes)

    collected: list[str] = []
    frame = load_syslog(log_root, _warnings=collected)

    # The healthy file still loaded.
    assert len(frame) == 1
    assert frame.iloc[0]["host"] == "router"
    # The corrupt file surfaced as a warning rather than an exception.
    expected_fragment = f"system.log{suffix} could not be read"
    assert any(expected_fragment in message for message in collected)
471
+
472
+
473
def test_load_syslog_corrupt_xz_lands_in_incomplete_or_corrupt_branch(
    tmp_path: Path,
) -> None:
    """A corrupt `.xz` produces the "incomplete or corrupt" warning wording.

    This pins the specific phrasing rather than a generic fallback, showing
    the xz decode error is recognised where the warning text is built and not
    merely caught.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    (log_root / "system.log.xz").write_bytes(b"NOTXZ garbage")

    collected: list[str] = []
    load_syslog(log_root, _warnings=collected)

    matching = [
        message
        for message in collected
        if "system.log.xz could not be read" in message
        and "incomplete or corrupt" in message
    ]
    assert any(matching)
491
+
492
+
493
def test_load_syslog_corrupt_compressed_file_without_warnings_buffer(
    tmp_path: Path,
) -> None:
    """Without a `_warnings` list a corrupt file is skipped and nothing raises.

    Covers callers that use load_syslog directly and never pass a warnings
    buffer: the call returns an empty frame instead of propagating an error.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    (log_root / "system.log.xz").write_bytes(b"NOTXZ garbage")

    # `_warnings` is deliberately left out.
    frame = load_syslog(log_root)

    assert frame.empty
505
+
506
+
507
+ # ── load_syslog: truncated (trailer-corrupt) compressed file honesty rail ──
508
+ #
509
+ # Invalid-magic corruption raises immediately on read. Truncated compressed
510
+ # files are nastier: the decompressor yields valid-looking lines and only
511
+ # raises at the EOF/trailer check. Pre-honesty-fix, a file the loader warned
512
+ # it had "skipped" still leaked rows into the returned frame.
513
+ # Honesty rail: a file the loader warns it skipped contributes ZERO rows.
514
+
515
# Twenty consecutive RFC-3164 sshd lines (12:00–12:19, PIDs 100–119, users
# a–t) used as the payload for the truncated-archive tests.
_SYSLOG_TRUNCATE_PAYLOAD = "".join(
    f"<134>May 31 12:{minute:02d}:00 router sshd[{100 + minute}]: "
    f"Accepted publickey for user {user}\n"
    for minute, user in enumerate("abcdefghijklmnopqrst")
)
537
+
538
+
539
def _truncated_compressed(payload: bytes, suffix: str) -> bytes:
    """Return ``payload`` compressed for ``suffix`` with its final byte removed.

    ``suffix`` selects the algorithm: ``.gz`` (gzip), ``.bz2`` (bz2) or
    ``.xz`` (lzma). Dropping the last byte damages the end of the stream, so a
    reader can decode the leading data before it fails at the end.

    Raises:
        ValueError: if ``suffix`` is not one of the three supported values.
    """
    compressors = {
        ".gz": gzip.compress,
        ".bz2": bz2.compress,
        ".xz": lzma.compress,
    }
    compress = compressors.get(suffix)
    if compress is None:
        raise ValueError(f"unsupported suffix {suffix!r}")
    return compress(payload)[:-1]
551
+
552
+
553
@pytest.mark.parametrize("suffix", (".gz", ".bz2", ".xz"))
def test_load_syslog_trailer_corrupt_compressed_contributes_zero_rows(
    tmp_path: Path, suffix: str,
) -> None:
    """A truncated archive triggers the warning and adds no rows at all.

    The healthy companion file's single row must be the only row in the
    frame, so any row decoded from the truncated archive before the failure
    would show up as an extra row and fail the test.
    """
    log_root = tmp_path / "syslog"
    log_root.mkdir()
    # Healthy companion with one uniquely identifiable line.
    (log_root / "router.log").write_text(
        "<134>May 31 23:59:00 router sshd[999]: Accepted publickey for COMPANION\n",
        encoding="utf-8",
    )
    damaged = _truncated_compressed(_SYSLOG_TRUNCATE_PAYLOAD.encode("utf-8"), suffix)
    (log_root / f"system.log{suffix}").write_bytes(damaged)

    collected: list[str] = []
    frame = load_syslog(log_root, _warnings=collected)

    # The truncated file was reported…
    expected_fragment = f"system.log{suffix} could not be read"
    assert any(expected_fragment in message for message in collected)
    # …and none of its rows reached the frame: only the companion row remains.
    assert len(frame) == 1
    assert "COMPANION" in frame.iloc[0]["raw"]