loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
tests/test_loader.py ADDED
@@ -0,0 +1,3639 @@
1
+ """Tests for log loading metadata, normalization, and schema warnings."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import bz2
6
+ import gzip
7
+ import json
8
+ import lzma
9
+ from datetime import datetime, timezone
10
+ from pathlib import Path
11
+
12
+ import pandas as pd
13
+ import pytest
14
+
15
+ from datetime import date, timedelta
16
+
17
+ from loghunter.common.loader import (
18
+ _CLOUDTRAIL_COLUMNS,
19
+ _PIHOLE_COLUMNS,
20
+ _SYSLOG_SNIFF_BYTES,
21
+ _SOURCE_LOADERS,
22
+ _apply_ts_filter,
23
+ _classify_rotation_name,
24
+ _discover_syslog_files,
25
+ _flat_default_floor,
26
+ _looks_like_syslog,
27
+ _peek_first_ts,
28
+ _rotation_windowed_files,
29
+ _schema_warning,
30
+ _select_group,
31
+ _syslog_files,
32
+ _zeek_dated_window,
33
+ CoverageTracker,
34
+ RotationSkipInfo,
35
+ SourceCoverage,
36
+ discover_cloudtrail_files,
37
+ discover_zeek_files,
38
+ is_bounded,
39
+ is_zeek_bounded,
40
+ load_cloudtrail,
41
+ load_logs,
42
+ load_pihole,
43
+ load_required_logs,
44
+ load_syslog,
45
+ )
46
+ from loghunter.exporters import _auto_filename
47
+ from loghunter.parsers.syslog import parse_timestamp
48
+ from loghunter.parsers.zeek import _normalize_dns_df
49
+
50
+
51
+ def _write_ndjson(path: Path, records: list[dict]) -> None:
52
+ path.write_text(
53
+ "\n".join(json.dumps(record) for record in records) + "\n",
54
+ encoding="utf-8",
55
+ )
56
+
57
+
58
+ # ── CloudTrail fixture helpers ────────────────────────────────────────────────
59
+
60
+ _CT_DOCS_ACCOUNT = "123456789012"
61
+
62
+
63
+ def _ct_event(**overrides) -> dict:
64
+ """Build a minimal valid CloudTrail event dict for loader fixtures."""
65
+ base: dict = {
66
+ "eventTime": "2026-06-01T12:00:00Z",
67
+ "eventSource": "s3.amazonaws.com",
68
+ "eventName": "GetObject",
69
+ "eventID": "11111111-1111-1111-1111-111111111111",
70
+ "awsRegion": "us-east-1",
71
+ "sourceIPAddress": "192.0.2.10",
72
+ "userIdentity": {
73
+ "type": "IAMUser",
74
+ "userName": "placeholder-user",
75
+ "principalId": "AIDAEXAMPLE",
76
+ "arn": f"arn:aws:iam::{_CT_DOCS_ACCOUNT}:user/placeholder-user",
77
+ },
78
+ "readOnly": True,
79
+ }
80
+ base.update(overrides)
81
+ return base
82
+
83
+
84
+ def _ct_write_ndjson(path: Path, events: list[dict]) -> None:
85
+ """Write events as one JSON object per line (the exporter wire shape)."""
86
+ path.parent.mkdir(parents=True, exist_ok=True)
87
+ path.write_text(
88
+ "\n".join(json.dumps(e) for e in events) + "\n",
89
+ encoding="utf-8",
90
+ )
91
+
92
+
93
+ def _ct_write_envelope_gz(path: Path, events: list[dict]) -> None:
94
+ """Write a gzipped single-line ``{"Records":[...]}`` envelope (native S3 shape)."""
95
+ path.parent.mkdir(parents=True, exist_ok=True)
96
+ payload = json.dumps({"Records": events}).encode("utf-8")
97
+ path.write_bytes(gzip.compress(payload))
98
+
99
+
100
def test_load_required_logs_normalizes_conn_and_reports_window(tmp_path: Path) -> None:
    """conn.log rows expose src/dst/port and the data window spans first..last ts."""
    first_ts = 1_779_750_000.0
    last_ts = 1_779_753_600.0

    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    conn_records = [
        {
            "ts": first_ts,
            "id.orig_h": "192.0.2.10",
            "id.resp_h": "198.51.100.20",
            "id.resp_p": 443,
            "proto": "tcp",
        },
        {
            "ts": last_ts,
            "id.orig_h": "192.0.2.11",
            "id.resp_h": "203.0.113.20",
            "id.resp_p": 22,
            "proto": "tcp",
        },
    ]
    _write_ndjson(zeek_dir / "conn.log", conn_records)

    patterns = {"conn*.log*": "zeek_dir"}
    sources = {"zeek_dir": [zeek_dir]}
    result = load_required_logs(patterns, sources)

    first_row = result.logs["conn*.log*"][["src", "dst", "port"]].iloc[0]
    assert list(first_row) == ["192.0.2.10", "198.51.100.20", 443]
    assert result.record_counts == {"conn*.log*": 2}

    expected_window = tuple(
        datetime.fromtimestamp(epoch, tz=timezone.utc) for epoch in (first_ts, last_ts)
    )
    assert result.data_window == expected_window
    assert result.warnings == []
140
+
141
+
142
def test_load_required_logs_warns_on_missing_canonical_fields(tmp_path: Path) -> None:
    """A conn.log row lacking dst/port/proto is counted but yields one schema warning."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    sparse_row = {"ts": 1_779_750_000.0, "id.orig_h": "192.0.2.10"}
    _write_ndjson(zeek_dir / "conn.log", [sparse_row])

    patterns = {"conn*.log*": "zeek_dir"}
    sources = {"zeek_dir": [zeek_dir]}
    result = load_required_logs(patterns, sources)

    expected_warning = (
        "conn.log fields not found: dst, port, proto — is this a Zeek conn.log?"
    )
    assert result.record_counts == {"conn*.log*": 1}
    assert result.warnings == [expected_warning]
164
+
165
+
166
def test_load_required_logs_warns_when_source_missing() -> None:
    """With no sources configured, nothing loads and a single warning names the gap."""
    patterns = {"conn*.log*": "zeek_dir"}
    no_sources: dict = {}

    result = load_required_logs(patterns, no_sources)

    assert result.logs == {}
    assert result.record_counts == {}
    assert result.data_window is None
    assert result.warnings == ["zeek_dir not configured — conn*.log* not loaded"]
176
+
177
+
178
def test_schema_warning_does_not_fire_for_missing_duration() -> None:
    """A conn frame without a ``duration`` column produces no schema warning."""
    row = {
        "src": "192.0.2.10",
        "dst": "198.51.100.20",
        "port": 443,
        "proto": "tcp",
        "ts": 1_779_750_000.0,
    }
    frame = pd.DataFrame([row])

    assert _schema_warning("conn*.log*", frame) is None
185
+
186
+
187
def test_schema_warning_fires_for_missing_required_conn_field() -> None:
    """A conn frame that has ``duration`` but no ``proto`` still triggers a warning."""
    # "proto" is intentionally left out of this row.
    row = {
        "src": "192.0.2.10",
        "dst": "198.51.100.20",
        "port": 443,
        "ts": 1_779_750_000.0,
        "duration": 600.0,
    }
    frame = pd.DataFrame([row])

    message = _schema_warning("conn*.log*", frame)

    assert message is not None
    assert "proto" in message
197
+
198
+
199
def test_load_required_logs_routes_pihole_dir(tmp_path: Path) -> None:
    """A ``pihole_dir`` source yields a frame carrying every _PIHOLE_COLUMNS column."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    log_path = pihole_dir / "pihole.log"
    log_path.write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.1\n",
        encoding="utf-8",
    )

    patterns = {"pihole*.log*": "pihole_dir"}
    sources = {"pihole_dir": [pihole_dir]}
    result = load_required_logs(patterns, sources)

    frame = result.logs["pihole*.log*"]
    assert set(_PIHOLE_COLUMNS).issubset(set(frame.columns))
    assert len(frame) == 1
214
+
215
+
216
def test_load_required_logs_routes_cloudtrail_dir(tmp_path: Path) -> None:
    """A ``cloudtrail_dir`` source yields a frame with exactly _CLOUDTRAIL_COLUMNS."""
    pattern = "*.json*"
    cloudtrail_dir = tmp_path / "cloudtrail"
    cloudtrail_dir.mkdir()
    _ct_write_ndjson(cloudtrail_dir / "events.json.log", [_ct_event()])

    result = load_required_logs(
        {pattern: "cloudtrail_dir"},
        {"cloudtrail_dir": [cloudtrail_dir]},
    )

    frame = result.logs[pattern]
    assert list(frame.columns) == _CLOUDTRAIL_COLUMNS
    assert len(frame) == 1
    assert result.record_counts == {pattern: 1}
    assert result.data_size_bytes > 0
    assert result.warnings == []
233
+
234
+
235
def test_load_required_logs_raises_for_unknown_source_key(tmp_path: Path) -> None:
    """An unrecognized source key raises ValueError whose message names the key."""
    bogus_dir = tmp_path / "bogus"
    bogus_dir.mkdir()
    patterns = {"*.log*": "bogus_dir"}
    sources = {"bogus_dir": [bogus_dir]}

    with pytest.raises(ValueError, match="bogus_dir"):
        load_required_logs(patterns, sources)
240
+
241
+
242
def test_normalize_dns_df_renames_and_applies_qclass_aperture() -> None:
    """Check the column renames and qclass filtering done by _normalize_dns_df.

    Only the qclass==1 row survives, the Zeek-native column names are replaced
    by canonical ones, ``qclass`` is removed and ``qtype`` is kept.
    """
    # qclass=1 (internet): the only row expected in the output.
    internet_row = {
        "id.orig_h": "192.0.2.1", "TTLs": [300.0], "answers": ["198.51.100.1"],
        "TC": 0, "qclass": 1, "qtype": 1, "query": "example.com",
        "ts": 1.0, "rtt": 0.05, "rcode": 0,
    }
    # qclass=2 (CSNET, obsolete): expected to be filtered out.
    csnet_row = {
        "id.orig_h": "192.0.2.2", "TTLs": [60.0], "answers": ["198.51.100.2"],
        "TC": 0, "qclass": 2, "qtype": 1, "query": "other.com",
        "ts": 2.0, "rtt": 0.03, "rcode": 0,
    }
    # qclass=None: expected to be filtered out as well.
    null_class_row = {
        "id.orig_h": "192.0.2.3", "TTLs": None, "answers": None,
        "TC": 0, "qclass": None, "qtype": 1, "query": "null-class.com",
        "ts": 3.0, "rtt": None, "rcode": None,
    }
    frame = pd.DataFrame([internet_row, csnet_row, null_class_row])

    result = _normalize_dns_df(frame)

    assert len(result) == 1, "only the qclass=1 row should survive"

    renamed_checks = (
        ("src", "id.orig_h should be renamed to src"),
        ("ttl", "TTLs should be renamed to ttl"),
        ("answer", "answers should be renamed to answer"),
        ("tc", "TC should be renamed to tc"),
    )
    for column, message in renamed_checks:
        assert column in result.columns, message

    assert "qclass" not in result.columns, "qclass must be dropped"
    assert "qtype" in result.columns, "qtype must be carried through (raw numeric code)"
    assert "id.orig_h" not in result.columns, "Zeek-native id.orig_h must not remain"

    assert result.iloc[0]["src"] == "192.0.2.1"

    # rtt and rcode already carry canonical names and must still be present.
    for passthrough in ("rtt", "rcode"):
        assert passthrough in result.columns
284
+
285
+
286
def test_normalize_dns_df_carries_qtype_as_raw_numeric() -> None:
    """``qtype`` survives normalization as the numeric code, unrenamed and untranslated.

    The qclass filter and the removal of the ``qclass`` column are checked too.
    """
    # (source address, query, ts, qclass, qtype); the third entry is CSNET (qclass=2).
    specs = (
        ("192.0.2.10", "alpha.invalid", 1.0, 1, 1),    # A record, kept
        ("192.0.2.11", "beta.invalid", 2.0, 1, 28),    # AAAA record, kept
        ("192.0.2.12", "gamma.invalid", 3.0, 2, 1),    # dropped by the qclass filter
    )
    rows = [
        {
            "id.orig_h": origin, "query": name,
            "ts": stamp, "qclass": qclass, "qtype": qtype, "rcode": 0,
        }
        for origin, name, stamp, qclass, qtype in specs
    ]

    normalized = _normalize_dns_df(pd.DataFrame(rows))
    result = normalized.reset_index(drop=True)

    # The CSNET row is gone.
    assert len(result) == 2, "qclass=2 row must be dropped by the aperture"
    # qclass itself is not part of the output frame.
    assert "qclass" not in result.columns, "qclass must be dropped"
    # qtype holds the raw codes (1 for A, 28 for AAAA), not mnemonics.
    assert "qtype" in result.columns, "qtype must be carried through"
    assert list(result["qtype"]) == [1, 28]
316
+
317
+
318
+ # ── Pi-hole / dnsmasq loader tests ───────────────────────────────────────────
319
+
320
+ _PIHOLE_LINE_QUERY = "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.1"
321
+ _PIHOLE_LINE_REPLY = "Jun 1 12:00:01 dnsmasq[1]: reply example.test is 203.0.113.1"
322
+
323
+
324
def test_load_pihole_plain_fixture(tmp_path: Path) -> None:
    """A directory with one query line and one reply line loads as two typed rows."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    log_text = f"{_PIHOLE_LINE_QUERY}\n{_PIHOLE_LINE_REPLY}\n"
    (pihole_dir / "pihole.log").write_text(log_text, encoding="utf-8")

    frame = load_pihole(pihole_dir)

    assert list(frame.columns) == _PIHOLE_COLUMNS
    assert len(frame) == 2
    query_row = frame.iloc[0]
    assert query_row["event_type"] == "query"
    assert query_row["src"] == "192.0.2.1"
    assert frame.iloc[1]["event_type"] == "reply"
337
+
338
+
339
def test_load_pihole_single_file_path(tmp_path: Path) -> None:
    """Passing a file path (rather than a directory) to load_pihole loads that file."""
    log_file = tmp_path / "pihole.log"
    log_file.write_text(f"{_PIHOLE_LINE_QUERY}\n", encoding="utf-8")

    frame = load_pihole(log_file)

    assert list(frame.columns) == _PIHOLE_COLUMNS
    assert len(frame) == 1
346
+
347
+
348
def test_load_pihole_gzip_fixture(tmp_path: Path) -> None:
    """A gzip-compressed dnsmasq log loads with the canonical columns and both rows."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    archive = pihole_dir / "pihole.log.gz"
    log_text = f"{_PIHOLE_LINE_QUERY}\n{_PIHOLE_LINE_REPLY}\n"
    with gzip.open(archive, "wt", encoding="utf-8") as handle:
        handle.write(log_text)

    frame = load_pihole(pihole_dir)

    assert list(frame.columns) == _PIHOLE_COLUMNS
    assert len(frame) == 2
358
+
359
+
360
def test_load_pihole_ndjson_skipped(tmp_path: Path, capsys: pytest.CaptureFixture) -> None:
    """An NDJSON file matching the pihole glob adds no rows and is only reported when verbose.

    The NDJSON file is deliberately named so that ``pihole*.log*`` matches it.
    By default stderr stays empty; with ``verbose=True`` stderr mentions the
    file name and "NDJSON". The dnsmasq file beside it loads either way.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    wrong_format = pihole_dir / "pihole.ndjson.log"
    wrong_format.write_text('{"ts": 1.0}\n', encoding="utf-8")
    good_log = pihole_dir / "pihole.log"
    good_log.write_text(f"{_PIHOLE_LINE_QUERY}\n", encoding="utf-8")

    # Default (quiet) mode.
    quiet_frame = load_pihole(pihole_dir)
    assert len(quiet_frame) == 1
    quiet_output = capsys.readouterr()
    assert quiet_output.err == ""

    # Verbose mode.
    verbose_frame = load_pihole(pihole_dir, verbose=True)
    assert len(verbose_frame) == 1
    verbose_output = capsys.readouterr()
    assert "pihole.ndjson.log" in verbose_output.err
    assert "NDJSON" in verbose_output.err
378
+
379
+
380
def test_load_pihole_empty_file(tmp_path: Path) -> None:
    """A zero-byte log yields a zero-row frame that still has the canonical columns."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    empty_log = pihole_dir / "pihole.log"
    empty_log.write_text("", encoding="utf-8")

    frame = load_pihole(pihole_dir)

    assert list(frame.columns) == _PIHOLE_COLUMNS
    assert len(frame) == 0
388
+
389
+
390
def test_load_pihole_malformed_lines_dropped(tmp_path: Path) -> None:
    """A junk line between two valid dnsmasq lines is dropped; both valid lines load."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    lines = [_PIHOLE_LINE_QUERY, "not a dnsmasq line at all", _PIHOLE_LINE_REPLY]
    log_text = "".join(f"{line}\n" for line in lines)
    (pihole_dir / "pihole.log").write_text(log_text, encoding="utf-8")

    frame = load_pihole(pihole_dir)

    assert len(frame) == 2
400
+
401
+
402
def test_load_pihole_hostname_from_stem(tmp_path: Path) -> None:
    """Every row's ``host`` equals the log file's stem (``pihole-router``)."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    router_log = pihole_dir / "pihole-router.log"
    router_log.write_text(
        f"{_PIHOLE_LINE_QUERY}\n{_PIHOLE_LINE_REPLY}\n",
        encoding="utf-8",
    )

    frame = load_pihole(pihole_dir)

    host_matches = frame["host"] == "pihole-router"
    assert host_matches.all()
411
+
412
+
413
def test_load_pihole_timefilter_keeps_nan_ts(tmp_path: Path) -> None:
    """A row whose ``ts`` is NaN is still returned when since/until bounds are given."""
    import math

    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    # "Xxx" is not a real month. Per the original author's note, the outer regex
    # (\w{3}) accepts it but strptime rejects it, so parse_timestamp returns None.
    nan_ts_line = "Xxx 1 12:00:00 dnsmasq[1]: query[A] other.test from 192.0.2.2"
    log_text = f"{_PIHOLE_LINE_QUERY}\n{nan_ts_line}\n"
    (pihole_dir / "pihole.log").write_text(log_text, encoding="utf-8")

    # One-hour margin on either side of June 1 12:00 UTC in the current year.
    current_year = datetime.now(timezone.utc).year
    noon = datetime(current_year, 6, 1, 12, 0, 0, tzinfo=timezone.utc)
    since = noon - timedelta(hours=1)
    until = noon + timedelta(hours=1)

    frame = load_pihole(pihole_dir, since, until)

    assert len(frame) == 2
    other_ts = frame.loc[frame["query"] == "other.test", "ts"]
    assert math.isnan(other_ts.iloc[0])
429
+
430
+
431
def test_schema_warning_no_ops_for_pihole_pattern(tmp_path: Path) -> None:
    """_schema_warning returns None when asked about a ``pihole*.log*`` frame."""
    row = {"ts": 1.0, "query": "example.test", "src": "192.0.2.1"}
    frame = pd.DataFrame([row])

    warning = _schema_warning("pihole*.log*", frame)

    assert warning is None
435
+
436
+
437
+ # ── TSV load path tests ───────────────────────────────────────────────────────
438
+
439
+ _CONN_TSV_HEADER = (
440
+ "#separator \\x09\n"
441
+ "#set_separator ,\n"
442
+ "#empty_field (empty)\n"
443
+ "#unset_field -\n"
444
+ "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\n"
445
+ "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\n"
446
+ )
447
+
448
+
449
+ def _write_tsv(path: Path, content: str) -> None:
450
+ path.write_text(content, encoding="utf-8")
451
+
452
+
453
def test_load_logs_mixed_ndjson_and_tsv(tmp_path: Path) -> None:
    """One NDJSON and one TSV conn file in the same directory load into a single frame."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    ndjson_rows = [
        {"ts": stamp, "id.orig_h": "192.0.2.1", "id.resp_h": "198.51.100.1",
         "id.resp_p": port, "proto": "tcp"}
        for stamp, port in ((1000.0, 443), (1001.0, 80))
    ]
    _write_ndjson(zeek_dir / "conn-ndjson.log", ndjson_rows)

    tsv_rows = (
        "2000.0\tCabc1\t192.0.2.2\t54321\t198.51.100.2\t443\ttcp\n"
        "2001.0\tCabc2\t192.0.2.2\t54322\t198.51.100.2\t80\ttcp\n"
    )
    _write_tsv(zeek_dir / "conn-tsv.log", _CONN_TSV_HEADER + tsv_rows)

    frame = load_logs(zeek_dir, "conn*.log*")

    assert len(frame) == 4
    for col in ("src", "dst", "port", "proto"):
        assert col in frame.columns, f"canonical column {col!r} missing"
    sources_seen = set(frame["src"].tolist())
    assert sources_seen == {"192.0.2.1", "192.0.2.2"}
480
+
481
+
482
def test_load_logs_timeframe_filter_applies_across_encodings(tmp_path: Path) -> None:
    """The since/until window keeps one row from the NDJSON file and one from the TSV file."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    # NDJSON file: ts=100.0 falls inside the window, ts=50.0 precedes it.
    ndjson_rows = [
        {"ts": stamp, "id.orig_h": "192.0.2.1", "id.resp_h": "198.51.100.1",
         "id.resp_p": 443, "proto": "tcp"}
        for stamp in (100.0, 50.0)
    ]
    _write_ndjson(zeek_dir / "conn-ndjson.log", ndjson_rows)

    # TSV file: ts=200.0 falls inside the window, ts=300.0 follows it.
    tsv_rows = (
        "200.0\tCabc1\t192.0.2.3\t54321\t198.51.100.3\t443\ttcp\n"
        "300.0\tCabc2\t192.0.2.3\t54322\t198.51.100.3\t443\ttcp\n"
    )
    _write_tsv(zeek_dir / "conn-tsv.log", _CONN_TSV_HEADER + tsv_rows)

    window_start = datetime.fromtimestamp(75.0, tz=timezone.utc)
    window_end = datetime.fromtimestamp(250.0, tz=timezone.utc)
    frame = load_logs(zeek_dir, "conn*.log*", since=window_start, until=window_end)

    assert len(frame) == 2
    assert set(frame["ts"].tolist()) == {100.0, 200.0}
    assert set(frame["src"].tolist()) == {"192.0.2.1", "192.0.2.3"}
512
+
513
+
514
def test_load_logs_tsv_vector_addr_and_set_enum(tmp_path: Path) -> None:
    """``vector[addr]`` and ``set[enum]`` TSV columns come back from load_logs as lists."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    header = (
        "#separator \\x09\n"
        "#set_separator ,\n"
        "#empty_field (empty)\n"
        "#unset_field -\n"
        "#fields\tts\taddrs\tactions\n"
        "#types\ttime\tvector[addr]\tset[enum]\n"
    )
    data_row = "1000.0\t192.0.2.1,192.0.2.2\tWeird::ACTIVITY,Weird::NOTICE\n"
    _write_tsv(zeek_dir / "weird-tsv.log", header + data_row)

    frame = load_logs(zeek_dir, "weird*.log*")

    assert len(frame) == 1
    only_row = frame.iloc[0]

    addr_values = only_row["addrs"]
    assert isinstance(addr_values, list)
    assert addr_values == ["192.0.2.1", "192.0.2.2"]

    # The actions are compared as a set, so their order is not checked.
    action_values = only_row["actions"]
    assert isinstance(action_values, list)
    assert set(action_values) == {"Weird::ACTIVITY", "Weird::NOTICE"}
540
+
541
+
542
+ # ── Dated-directory layout tests ─────────────────────────────────────────────
543
+ #
544
+ # Epoch timestamps used below map to the following UTC calendar dates:
545
+ # 1767225600 → 2026-01-01 00:00:00 UTC
546
+ # 1767312000 → 2026-01-02 00:00:00 UTC
547
+ # 1767398400 → 2026-01-03 00:00:00 UTC
548
+ # 1767427200 → 2026-01-03 08:00:00 UTC
549
+ # 1767463200 → 2026-01-03 18:00:00 UTC
550
+ # 1767484800 → 2026-01-04 00:00:00 UTC
551
+
552
+ _JAN1 = datetime(2026, 1, 1, tzinfo=timezone.utc)
553
+ _JAN2 = datetime(2026, 1, 2, tzinfo=timezone.utc)
554
+ _JAN3 = datetime(2026, 1, 3, tzinfo=timezone.utc)
555
+ _JAN4 = datetime(2026, 1, 4, tzinfo=timezone.utc)
556
+ _JAN5 = datetime(2026, 1, 5, tzinfo=timezone.utc)
557
+
558
+ _TS_JAN1 = _JAN1.timestamp() # 1767225600.0
559
+ _TS_JAN2 = _JAN2.timestamp() # 1767312000.0
560
+ _TS_JAN3 = _JAN3.timestamp() # 1767398400.0
561
+ _TS_JAN3_08 = datetime(2026, 1, 3, 8, tzinfo=timezone.utc).timestamp() # 1767427200.0
562
+ _TS_JAN3_18 = datetime(2026, 1, 3, 18, tzinfo=timezone.utc).timestamp() # 1767463200.0
563
+ _TS_JAN4 = _JAN4.timestamp() # 1767484800.0
564
+
565
+
566
def test_load_logs_flat_layout_unchanged(tmp_path: Path) -> None:
    """Two conn files directly under the root (no dated subdirs) both load."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    # (file name, ts, source address, destination address, destination port)
    fixtures = (
        ("conn-a.log", _TS_JAN1, "192.0.2.1", "198.51.100.1", 443),
        ("conn-b.log", _TS_JAN2, "192.0.2.2", "198.51.100.2", 80),
    )
    for file_name, stamp, origin, responder, port in fixtures:
        _write_ndjson(
            zeek_dir / file_name,
            [{"ts": stamp, "id.orig_h": origin, "id.resp_h": responder,
              "id.resp_p": port, "proto": "tcp"}],
        )

    frame = load_logs(zeek_dir, "conn*.log*")

    assert len(frame) == 2
    for col in ("src", "dst", "port", "proto"):
        assert col in frame.columns
    assert set(frame["src"].tolist()) == {"192.0.2.1", "192.0.2.2"}
587
+
588
+
589
def test_load_logs_dated_layout_discovers_subdirs(tmp_path: Path) -> None:
    """conn.log files inside two YYYY-MM-DD subdirectories are found and combined."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    first_day = zeek_dir / "2026-01-01"
    second_day = zeek_dir / "2026-01-02"
    first_day.mkdir()
    second_day.mkdir()

    first_day_rows = [
        {"ts": stamp, "id.orig_h": "192.0.2.1", "id.resp_h": "198.51.100.1",
         "id.resp_p": port, "proto": "tcp"}
        for stamp, port in ((_TS_JAN1, 443), (_TS_JAN1 + 1, 80))
    ]
    second_day_rows = [
        {"ts": stamp, "id.orig_h": "198.51.100.2", "id.resp_h": "203.0.113.1",
         "id.resp_p": port, "proto": "tcp"}
        for stamp, port in ((_TS_JAN2, 443), (_TS_JAN2 + 1, 22))
    ]
    _write_ndjson(first_day / "conn.log", first_day_rows)
    _write_ndjson(second_day / "conn.log", second_day_rows)

    frame = load_logs(zeek_dir, "conn*.log*")

    assert len(frame) == 4
    assert "192.0.2.1" in frame["src"].tolist()
    assert "198.51.100.2" in frame["src"].tolist()
619
+
620
+
621
def test_load_logs_date_pruning_skips_out_of_window_dirs(tmp_path: Path) -> None:
    """Subdirectories dated outside the window are never opened.

    The 2026-01-01 directory holds a ``.gz`` file whose bytes are not gzip data.
    Per the original author's note, opening it would raise BadGzipFile, so a
    clean run shows the directory was pruned by name rather than read and
    filtered afterwards.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    # Out-of-window day containing the deliberately corrupt archive.
    poisoned_day = zeek_dir / "2026-01-01"
    poisoned_day.mkdir()
    (poisoned_day / "conn.00:00:00-01:00:00.log.gz").write_bytes(b"NOTGZIP")

    # The only day inside the window.
    wanted_day = zeek_dir / "2026-01-02"
    wanted_day.mkdir()
    wanted_rows = [
        {"ts": stamp, "id.orig_h": "192.0.2.10", "id.resp_h": "198.51.100.10",
         "id.resp_p": port, "proto": "tcp"}
        for stamp, port in ((_TS_JAN2, 443), (_TS_JAN2 + 1, 80))
    ]
    _write_ndjson(wanted_day / "conn.log", wanted_rows)

    # A valid day that lies after the window.
    later_day = zeek_dir / "2026-01-03"
    later_day.mkdir()
    _write_ndjson(
        later_day / "conn.log",
        [{"ts": _TS_JAN3, "id.orig_h": "192.0.2.11", "id.resp_h": "198.51.100.11",
          "id.resp_p": 443, "proto": "tcp"}],
    )

    window_end = datetime(2026, 1, 2, 23, 59, 59, tzinfo=timezone.utc)
    frame = load_logs(zeek_dir, "conn*.log*", since=_JAN2, until=window_end)

    assert len(frame) == 2
    assert set(frame["src"].tolist()) == {"192.0.2.10"}
661
+
662
+
663
def test_load_logs_dated_boundary_day_included(tmp_path: Path) -> None:
    """A window opening at noon still reads that day's subdir and keeps only later rows."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    boundary_day = zeek_dir / "2026-01-03"
    boundary_day.mkdir()

    # One row at 08:00 (before the window opens) and one at 18:00 (after).
    specs = (
        (_TS_JAN3_08, "192.0.2.20", "198.51.100.20"),
        (_TS_JAN3_18, "192.0.2.21", "198.51.100.21"),
    )
    rows = [
        {"ts": stamp, "id.orig_h": origin, "id.resp_h": responder,
         "id.resp_p": 443, "proto": "tcp"}
        for stamp, origin, responder in specs
    ]
    _write_ndjson(boundary_day / "conn.log", rows)

    noon = datetime(2026, 1, 3, 12, 0, 0, tzinfo=timezone.utc)
    frame = load_logs(zeek_dir, "conn*.log*", since=noon)

    assert len(frame) == 1
    assert frame.iloc[0]["src"] == "192.0.2.21"
684
+
685
+
686
def test_load_logs_dated_suffix_dir_treated_as_date(tmp_path: Path) -> None:
    """A ``YYYY-MM-DD-SUFFIX`` directory is windowed by its date prefix."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    suffixed_day = zeek_dir / "2026-01-02-TSVPRE"
    plain_day = zeek_dir / "2026-01-04"
    suffixed_day.mkdir()
    plain_day.mkdir()

    def _two_rows(base_ts: float, origin: str, responder: str) -> list[dict]:
        # Two tcp rows one second apart, to ports 443 and 80.
        return [
            {"ts": stamp, "id.orig_h": origin, "id.resp_h": responder,
             "id.resp_p": port, "proto": "tcp"}
            for stamp, port in ((base_ts, 443), (base_ts + 1, 80))
        ]

    _write_ndjson(
        suffixed_day / "conn.log",
        _two_rows(_TS_JAN2, "192.0.2.30", "198.51.100.30"),
    )
    _write_ndjson(
        plain_day / "conn.log",
        _two_rows(_TS_JAN4, "192.0.2.31", "198.51.100.31"),
    )

    # Jan 2 to Jan 3: only the suffixed directory contributes.
    early = load_logs(zeek_dir, "conn*.log*", since=_JAN2, until=_JAN3)
    assert len(early) == 2
    assert set(early["src"].tolist()) == {"192.0.2.30"}

    # Jan 4 to Jan 5: only the plain Jan 4 directory contributes.
    late = load_logs(zeek_dir, "conn*.log*", since=_JAN4, until=_JAN5)
    assert len(late) == 2
    assert set(late["src"].tolist()) == {"192.0.2.31"}
716
+
717
+
718
def test_load_logs_dated_symlink_deduplication(tmp_path: Path) -> None:
    """Symlink pointing at a date subdir: data loads exactly once in both window cases.

    The test is skipped (rather than erroring) on platforms or accounts where
    a directory symlink cannot be created, since the behaviour under test
    cannot be exercised there.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    day1 = zeek_dir / "2026-01-01"
    day1.mkdir()
    _write_ndjson(
        day1 / "conn.log",
        [
            {"ts": _TS_JAN1, "id.orig_h": "192.0.2.40", "id.resp_h": "198.51.100.40",
             "id.resp_p": 443, "proto": "tcp"},
            {"ts": _TS_JAN1 + 1, "id.orig_h": "192.0.2.40", "id.resp_h": "198.51.100.40",
             "id.resp_p": 80, "proto": "tcp"},
        ],
    )
    try:
        # target_is_directory only matters on Windows; it is ignored elsewhere.
        (zeek_dir / "current").symlink_to(day1, target_is_directory=True)
    except (OSError, NotImplementedError) as exc:
        pytest.skip(f"directory symlinks are not available here: {exc}")

    # No window: current symlink is deduped; data appears exactly once.
    df = load_logs(zeek_dir, "conn*.log*")
    assert len(df) == 2, f"expected 2 rows (deduped), got {len(df)}"

    # With window covering Jan 1: current (non-date name) skipped; 2026-01-01 loads once.
    df = load_logs(zeek_dir, "conn*.log*", since=_JAN1, until=_JAN2)
    assert len(df) == 2, f"expected 2 rows under window, got {len(df)}"
740
+
741
+
742
def test_load_logs_dated_non_date_dir_skipped_with_window(tmp_path: Path) -> None:
    """With a window set, dirs without a date name are skipped, so rows are not doubled."""
    root = tmp_path / "zeek"
    root.mkdir()
    dated_dir = root / "2026-01-02"
    export_dir = root / "export"
    dated_dir.mkdir()
    export_dir.mkdir()

    shared_rows = [
        {"ts": _TS_JAN2, "id.orig_h": "192.0.2.50", "id.resp_h": "198.51.100.50",
         "id.resp_p": 443, "proto": "tcp"},
        {"ts": _TS_JAN2 + 1, "id.orig_h": "192.0.2.50", "id.resp_h": "198.51.100.50",
         "id.resp_p": 80, "proto": "tcp"},
    ]
    # Identical rows go into both dirs; reading "export" too would double the count.
    for folder in (dated_dir, export_dir):
        _write_ndjson(folder / "conn.log", shared_rows)

    frame = load_logs(root, "conn*.log*", since=_JAN2, until=_JAN3)
    assert len(frame) == 2
760
+
761
+
762
def test_load_required_logs_size_matches_pruned_files(tmp_path: Path) -> None:
    """``data_size_bytes`` covers only files inside the pruned window, not excluded days."""
    root = tmp_path / "zeek"
    root.mkdir()
    jan1_dir = root / "2026-01-01"
    jan2_dir = root / "2026-01-02"
    jan1_dir.mkdir()
    jan2_dir.mkdir()

    jan1_row = {"ts": _TS_JAN1, "id.orig_h": "192.0.2.60", "id.resp_h": "198.51.100.60",
                "id.resp_p": 443, "proto": "tcp"}
    jan2_row = {"ts": _TS_JAN2, "id.orig_h": "192.0.2.61", "id.resp_h": "198.51.100.61",
                "id.resp_p": 443, "proto": "tcp"}
    _write_ndjson(jan1_dir / "conn.log", [jan1_row])
    _write_ndjson(jan2_dir / "conn.log", [jan2_row])

    # The window spans Jan 2 only, so the Jan 1 file must not be counted.
    end_of_jan2 = datetime(2026, 1, 2, 23, 59, 59, tzinfo=timezone.utc)
    result = load_required_logs(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [root]},
        since=_JAN2,
        until=end_of_jan2,
    )

    in_window_file = jan2_dir / "conn.log"
    assert result.data_size_bytes == in_window_file.stat().st_size
789
+
790
+
791
def test_load_required_logs_warns_and_skips_truncated_zeek_gzip(
    tmp_path: Path,
) -> None:
    """A truncated gzip among the selected files yields a warning, not an aborted Zeek load."""
    root = tmp_path / "zeek"
    root.mkdir()

    raw_line = (
        b'{"ts":1767312000.0,"id.orig_h":"192.0.2.10",'
        b'"id.resp_h":"198.51.100.10","id.resp_p":443,"proto":"tcp"}\n'
    )
    # Cutting the last 8 bytes off the compressed stream leaves it incomplete.
    truncated = gzip.compress(raw_line)[:-8]
    (root / "conn-bad.log.gz").write_bytes(truncated)

    good_row = {
        "ts": _TS_JAN2,
        "id.orig_h": "192.0.2.11",
        "id.resp_h": "198.51.100.11",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(root / "conn-good.log", [good_row])

    result = load_required_logs(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [root]},
    )

    frame = result.logs["conn*.log*"]
    assert len(frame) == 1
    assert frame.iloc[0]["src"] == "192.0.2.11"

    # At least one warning must name the bad file and explain the cause.
    matching = [
        warning
        for warning in result.warnings
        if "conn-bad.log.gz could not be read" in warning
        and "compressed file is incomplete or corrupt" in warning
    ]
    assert matching
827
+
828
+
829
def test_load_logs_dated_layout_ignores_root_level_files(tmp_path: Path) -> None:
    """Mixed-root policy: once a YYYY-MM-DD subdir exists, root-level files are ignored."""
    root = tmp_path / "zeek"
    root.mkdir()
    dated_dir = root / "2026-01-02"
    dated_dir.mkdir()

    # Sits directly in the root, so it should be ignored.
    root_row = {"ts": _TS_JAN2, "id.orig_h": "192.0.2.99", "id.resp_h": "198.51.100.99",
                "id.resp_p": 443, "proto": "tcp"}
    _write_ndjson(root / "conn.log", [root_row])

    # Sits in the dated subdir, so it should be loaded.
    dated_row = {"ts": _TS_JAN2 + 1, "id.orig_h": "192.0.2.50", "id.resp_h": "198.51.100.50",
                 "id.resp_p": 443, "proto": "tcp"}
    _write_ndjson(dated_dir / "conn.log", [dated_row])

    frame = load_logs(root, "conn*.log*")

    assert len(frame) == 1
    only_row = frame.iloc[0]
    assert only_row["src"] == "192.0.2.50"
852
+
853
+
854
+ # ── Stage 4: boundedness + default window helpers ─────────────────────────────
855
+
856
+
857
def test_is_zeek_bounded_returns_true_for_file(tmp_path: Path) -> None:
    """A single existing file input is classified as bounded."""
    log_file = tmp_path / "conn.log"
    log_file.write_text("", encoding="utf-8")
    verdict = is_zeek_bounded([log_file])
    assert verdict is True
861
+
862
+
863
def test_is_zeek_bounded_returns_false_for_directory(tmp_path: Path) -> None:
    """A directory input is classified as not bounded."""
    verdict = is_zeek_bounded([tmp_path])
    assert verdict is False
865
+
866
+
867
def test_is_zeek_bounded_returns_false_for_glob_string() -> None:
    """A glob-pattern path is classified as unbounded.

    This pins the Stage 4 helper contract only; load wiring is deferred.
    """
    glob_input = Path("conn*.log")
    verdict = is_zeek_bounded([glob_input])
    assert verdict is False
870
+
871
+
872
def test_is_zeek_bounded_empty_list_returns_false() -> None:
    """An empty bucket is not bounded.

    The runner short-circuits before calling the predicate, but the predicate
    itself stays explicit: with no Zeek inputs there is nothing to bound.
    """
    no_inputs: list = []
    assert is_zeek_bounded(no_inputs) is False
876
+
877
+
878
def test_zeek_dated_default_window_flat_layout_returns_none(tmp_path: Path) -> None:
    """A directory holding only a root-level log (no date subdirs) yields no window."""
    flat_log = tmp_path / "conn.log"
    flat_log.write_text("", encoding="utf-8")
    window = _zeek_dated_window([tmp_path], timedelta(days=1))
    assert window is None
881
+
882
+
883
def test_zeek_dated_default_window_1d_picks_newest_subdir_only(tmp_path: Path) -> None:
    """GUARDRAIL: single-input dated selection that the union path must generalize.

    The rule is newest N=ceil(span_days) date subdirs, spanning earliest
    midnight to newest 23:59:59 UTC. The one-element input list IS the
    degenerate single-input case; do NOT reinterpret these assertions.
    """
    for day in ("2026-01-01", "2026-01-05"):
        (tmp_path / day).mkdir()

    window = _zeek_dated_window([tmp_path], timedelta(days=1))
    window_start, window_end = window

    assert window_start == datetime(2026, 1, 5, 0, 0, 0, tzinfo=timezone.utc)
    assert window_end == datetime(2026, 1, 5, 23, 59, 59, tzinfo=timezone.utc)
893
+
894
+
895
def test_zeek_dated_default_window_2d_picks_newest_2_subdirs_even_when_sparse(
    tmp_path: Path,
) -> None:
    """GUARDRAIL: sparse-archive selection that the union path must generalize.

    With subdirs [2026-01-01, 2026-01-05] and span=2d, BOTH dirs are selected
    and the window runs Jan 1 → Jan 5. Do NOT reinterpret.
    """
    for day in ("2026-01-01", "2026-01-05"):
        (tmp_path / day).mkdir()

    window = _zeek_dated_window([tmp_path], timedelta(days=2))
    window_start, window_end = window

    assert window_start == datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
    assert window_end == datetime(2026, 1, 5, 23, 59, 59, tzinfo=timezone.utc)
906
+
907
+
908
def test_zeek_dated_default_window_span_exceeds_subdir_count(tmp_path: Path) -> None:
    """A span larger than the number of date subdirs covers all of them."""
    day_names = ("2026-01-01", "2026-01-03", "2026-01-05")
    for day_name in day_names:
        (tmp_path / day_name).mkdir()

    window = _zeek_dated_window([tmp_path], timedelta(days=7))
    window_start, window_end = window

    assert window_start.date() == date(2026, 1, 1)
    assert window_end.date() == date(2026, 1, 5)
914
+
915
+
916
def test_discover_zeek_files_file_input_matching_pattern_returns_file(
    tmp_path: Path,
) -> None:
    """A file input whose name matches the pattern is returned as the only hit."""
    conn_log = tmp_path / "conn.log"
    _write_ndjson(conn_log, [{"ts": _TS_JAN2}])
    found = discover_zeek_files(conn_log, "conn*.log*")
    assert found == [conn_log]
922
+
923
+
924
def test_discover_zeek_files_file_input_nonmatching_pattern_returns_empty(
    tmp_path: Path,
) -> None:
    """A file input whose name does not match the pattern produces no hits."""
    dns_log = tmp_path / "dns.log"
    _write_ndjson(dns_log, [{"ts": _TS_JAN2}])
    found = discover_zeek_files(dns_log, "conn*.log*")
    assert found == []
930
+
931
+
932
+ # ── CloudTrail loader: per-file shapes ────────────────────────────────────────
933
+
934
def test_load_cloudtrail_ndjson_multiple_events_preserves_first_event(
    tmp_path: Path,
) -> None:
    """Regression guard: the NDJSON branch must seed with the parsed first line.

    An earlier draft walked only the 'remaining lines' without seeding and so
    silently lost the first event of every exporter .json.log file. The check
    here is on the observed output, not on which internal route was taken.
    """
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    stamped_ids = (
        ("aaaa", "2026-06-01T12:00:00Z"),
        ("bbbb", "2026-06-01T12:01:00Z"),
        ("cccc", "2026-06-01T12:02:00Z"),
    )
    events = [_ct_event(eventID=eid, eventTime=when) for eid, when in stamped_ids]
    _ct_write_ndjson(ct_dir / "events.json.log", events)

    frame = load_cloudtrail(ct_dir)

    assert list(frame.columns) == _CLOUDTRAIL_COLUMNS
    assert len(frame) == 3
    assert set(frame["event_id"]) == {"aaaa", "bbbb", "cccc"}
957
+
958
+
959
def test_load_cloudtrail_bare_one_line_dict_event_loads_as_single_row(
    tmp_path: Path,
) -> None:
    """A file holding one bare event dict loads as exactly one row.

    The first line parses as a dict without ``Records``, the NDJSON branch
    seeds with it, and no further lines follow.
    """
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    serialized = json.dumps(_ct_event(eventID="only-one"))
    (ct_dir / "one.json").write_text(serialized, encoding="utf-8")

    frame = load_cloudtrail(ct_dir)

    assert len(frame) == 1
    only_row = frame.iloc[0]
    assert only_row["event_id"] == "only-one"
975
+
976
+
977
def test_load_cloudtrail_one_line_bare_list_loads_as_event_list(
    tmp_path: Path,
) -> None:
    """A one-line JSON array of events loads as one row per element."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    events = [_ct_event(eventID=eid) for eid in ("list-1", "list-2")]
    serialized = json.dumps(events)
    (ct_dir / "list.json").write_text(serialized, encoding="utf-8")

    frame = load_cloudtrail(ct_dir)

    assert len(frame) == 2
    assert set(frame["event_id"]) == {"list-1", "list-2"}
995
+
996
+
997
def test_load_cloudtrail_gzipped_envelope_loads_identically(tmp_path: Path) -> None:
    """Native S3 wire shape: one gzipped JSON document of the form {"Records": [...]}."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    events = [_ct_event(eventID=eid) for eid in ("env-1", "env-2")]
    _ct_write_envelope_gz(ct_dir / "envelope.json.gz", events)

    frame = load_cloudtrail(ct_dir)

    assert list(frame.columns) == _CLOUDTRAIL_COLUMNS
    assert len(frame) == 2
    assert set(frame["event_id"]) == {"env-1", "env-2"}
1012
+
1013
+
1014
def test_load_cloudtrail_pretty_printed_multiline_envelope_loads(tmp_path: Path) -> None:
    """Whole-file fallback: the first line is only a '{' fragment; the full text is the doc."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    events = [_ct_event(eventID=eid) for eid in ("pretty-1", "pretty-2")]
    indented = json.dumps({"Records": events}, indent=2)
    (ct_dir / "pretty.json").write_text(indented, encoding="utf-8")

    frame = load_cloudtrail(ct_dir)

    assert len(frame) == 2
    assert set(frame["event_id"]) == {"pretty-1", "pretty-2"}
1028
+
1029
+
1030
def test_load_cloudtrail_mixed_formats_in_one_directory_loads_union(
    tmp_path: Path,
) -> None:
    """An NDJSON file and a gzipped envelope in one directory load as their union."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()

    ndjson_events = [_ct_event(eventID="nd-1"), _ct_event(eventID="nd-2")]
    _ct_write_ndjson(ct_dir / "ndjson.json.log", ndjson_events)

    envelope_events = [_ct_event(eventID="env-1")]
    _ct_write_envelope_gz(ct_dir / "env.json.gz", envelope_events)

    frame = load_cloudtrail(ct_dir)

    assert len(frame) == 3
    assert set(frame["event_id"]) == {"nd-1", "nd-2", "env-1"}
1048
+
1049
+
1050
+ # ── CloudTrail loader: discovery ──────────────────────────────────────────────
1051
+
1052
def test_load_cloudtrail_native_nested_aws_logs_tree_discovered_recursively(
    tmp_path: Path,
) -> None:
    """Recursive *.json* discovery — what makes a native AWSLogs tree just work.

    A single gzipped envelope is written several levels below ``tmp_path``;
    discovery from the root must return exactly that file, and loading from
    the root must yield its one event.
    """
    nested = (
        tmp_path
        / "AWSLogs" / _CT_DOCS_ACCOUNT / "CloudTrail" / "us-east-1"
        / "2026" / "06" / "01"
    )
    _ct_write_envelope_gz(nested / "events.json.gz", [_ct_event(eventID="nested-1")])

    files = discover_cloudtrail_files(tmp_path)
    assert len(files) == 1
    assert files[0].name == "events.json.gz"
    # The hit has to come from inside the nested tree, which is what proves
    # the search recursed rather than only scanning the root.
    assert "AWSLogs" in files[0].parts

    df = load_cloudtrail(tmp_path)
    assert len(df) == 1
    assert df.iloc[0]["event_id"] == "nested-1"
1071
+
1072
+
1073
def test_discover_cloudtrail_files_excludes_cloud_trail_digest_tree(
    tmp_path: Path,
) -> None:
    """Digest files are integrity manifests rather than events, so discovery omits them."""
    date_tail = Path("us-east-1") / "2026" / "06" / "01"
    events_dir = tmp_path / "CloudTrail" / date_tail
    digest_dir = tmp_path / "CloudTrail-Digest" / date_tail
    _ct_write_envelope_gz(events_dir / "events.json.gz", [_ct_event(eventID="evt-1")])
    _ct_write_envelope_gz(digest_dir / "digest.json.gz", [_ct_event(eventID="digest-1")])

    discovered_names = {found.name for found in discover_cloudtrail_files(tmp_path)}

    assert "events.json.gz" in discovered_names
    assert "digest.json.gz" not in discovered_names
1086
+
1087
+
1088
def test_load_cloudtrail_single_file_path_works(tmp_path: Path) -> None:
    """Passing a file path (not a directory) loads that file's events."""
    ndjson_file = tmp_path / "events.json.log"
    _ct_write_ndjson(ndjson_file, [_ct_event(eventID="single-file-event")])

    frame = load_cloudtrail(ndjson_file)

    assert len(frame) == 1
    only_row = frame.iloc[0]
    assert only_row["event_id"] == "single-file-event"
1096
+
1097
+
1098
def test_load_cloudtrail_empty_directory_returns_column_stable_empty_frame(
    tmp_path: Path,
) -> None:
    """An empty directory yields a zero-row frame that still has the CloudTrail columns."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()

    frame = load_cloudtrail(ct_dir)

    assert list(frame.columns) == _CLOUDTRAIL_COLUMNS
    assert len(frame) == 0
1108
+
1109
+
1110
+ # ── CloudTrail loader: tolerance, warnings, filtering ─────────────────────────
1111
+
1112
def test_load_cloudtrail_undecodable_ndjson_lines_silently_skipped(
    tmp_path: Path,
) -> None:
    """A non-JSON line between two valid NDJSON events is dropped; both events load."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    first_good = json.dumps(_ct_event(eventID="good-a"))
    second_good = json.dumps(_ct_event(eventID="good-b"))
    file_text = f"{first_good}\nnot json at all\n{second_good}\n"
    (ct_dir / "events.json.log").write_text(file_text, encoding="utf-8")

    frame = load_cloudtrail(ct_dir)

    assert len(frame) == 2
    assert set(frame["event_id"]) == {"good-a", "good-b"}
1128
+
1129
+
1130
def test_load_required_logs_warns_and_skips_corrupt_cloudtrail_gzip(
    tmp_path: Path,
) -> None:
    """Corrupt gzip: a warning lands in ``LoadResult.warnings`` and the sibling file loads."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()

    # Build a valid gzipped envelope, then truncate it by 8 bytes.
    envelope_bytes = json.dumps({"Records": [_ct_event(eventID="bad-evt")]}).encode("utf-8")
    compressed = gzip.compress(envelope_bytes)
    (ct_dir / "broken.json.gz").write_bytes(compressed[:-8])
    _ct_write_ndjson(ct_dir / "ok.json.log", [_ct_event(eventID="good-evt")])

    result = load_required_logs(
        {"*.json*": "cloudtrail_dir"},
        {"cloudtrail_dir": [ct_dir]},
    )

    frame = result.logs["*.json*"]
    assert len(frame) == 1
    assert frame.iloc[0]["event_id"] == "good-evt"

    matching = [
        w
        for w in result.warnings
        if "broken.json.gz could not be read" in w
        and "compressed file is incomplete or corrupt" in w
    ]
    assert matching
1156
+
1157
+
1158
def test_load_required_logs_warns_and_skips_unparseable_json_file(
    tmp_path: Path,
) -> None:
    """A non-gzip file that is not valid JSON is warned about and skipped.

    The warning must carry the 'not valid JSON' message, and the sibling
    NDJSON file must still load.
    """
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    garbage_file = ct_dir / "garbage.json"
    garbage_file.write_text("this is not json at all\nstill not\n", encoding="utf-8")
    _ct_write_ndjson(ct_dir / "ok.json.log", [_ct_event(eventID="evt-ok")])

    result = load_required_logs(
        {"*.json*": "cloudtrail_dir"},
        {"cloudtrail_dir": [ct_dir]},
    )

    frame = result.logs["*.json*"]
    assert len(frame) == 1
    assert frame.iloc[0]["event_id"] == "evt-ok"

    matching = [
        w
        for w in result.warnings
        if "garbage.json could not be read" in w and "not valid JSON" in w
    ]
    assert matching
1183
+
1184
+
1185
def test_load_cloudtrail_drops_events_with_missing_event_time(
    tmp_path: Path,
) -> None:
    """An event lacking ``eventTime`` is dropped; the timestamped one is kept."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()

    untimed = _ct_event(eventID="no-ts")
    del untimed["eventTime"]
    timed = _ct_event(eventID="has-ts")
    _ct_write_ndjson(ct_dir / "events.json.log", [timed, untimed])

    frame = load_cloudtrail(ct_dir)

    assert len(frame) == 1
    kept = frame.iloc[0]
    assert kept["event_id"] == "has-ts"
1201
+
1202
+
1203
def test_load_cloudtrail_applies_since_and_until_window(tmp_path: Path) -> None:
    """Only the event whose ``eventTime`` lies inside the since/until window is returned."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    stamped_ids = (
        ("too-early", "2026-05-31T11:00:00Z"),
        ("inside", "2026-06-01T12:00:00Z"),
        ("too-late", "2026-06-02T13:00:00Z"),
    )
    events = [_ct_event(eventID=eid, eventTime=when) for eid, when in stamped_ids]
    _ct_write_ndjson(ct_dir / "events.json.log", events)

    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)
    frame = load_cloudtrail(ct_dir, since=window_start, until=window_end)

    assert len(frame) == 1
    kept = frame.iloc[0]
    assert kept["event_id"] == "inside"
1223
+
1224
+
1225
+ # ── Liveness: loader leaves a permanent record line ──────────────────────────
1226
+
1227
+
1228
class _FakeTTYStream:
    """Stand-in for ``sys.stderr`` that claims to be a TTY and records every write.

    It exists for the byte-identical-on-TTY rail: on a real TTY ``progress``
    builds a tqdm bar and its text reaches the stream. ``capsys`` is no use
    for that case because its captured stderr reports isatty()=False, which
    is the non-TTY suppression rail covered by a separate test.
    """

    def __init__(self) -> None:
        # Pieces of text in the order they were written.
        self._chunks: list[str] = []

    def isatty(self) -> bool:
        """Always report a terminal."""
        return True

    def write(self, s: str) -> int:
        """Record ``s`` and return its length."""
        self._chunks.append(s)
        written = len(s)
        return written

    def flush(self) -> None:  # pragma: no cover - no-op
        """Do nothing; there is no underlying buffer to flush."""
        return None

    @property
    def output(self) -> str:
        """Everything written so far, concatenated in write order."""
        pieces = self._chunks
        return "".join(pieces)
1253
+
1254
+
1255
def test_parse_ndjson_leaves_permanent_record_line_on_tty(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Byte-identical-on-TTY regression for the NDJSON loader.

    On a real TTY stream the loader still writes a ``loaded <file>: N lines``
    permanent record line through the shared progress helper. The bar_format
    is pinned in ``common/display.py:progress`` and reproduces the pre-helper
    inline bar_format byte-for-byte when ``unit=" lines"``.
    """
    from loghunter.common.loader import _parse_ndjson_file

    tty = _FakeTTYStream()
    monkeypatch.setattr("sys.stderr", tty)

    records = [
        json.dumps({"ts": float(n), "id.orig_h": f"192.0.2.{n}"})
        for n in range(1, 6)
    ]
    log_path = tmp_path / "conn.log"
    log_path.write_text("\n".join(records) + "\n", encoding="utf-8")

    _parse_ndjson_file(log_path)

    rendered = tty.output
    # tqdm with leave=True commits the summary line for that file.
    assert "loaded conn.log" in rendered
    # The line count, as formatted by tqdm's n_fmt.
    assert "5" in rendered
1283
+
1284
+
1285
def test_parse_ndjson_non_tty_stream_suppresses_loader_bar(
    tmp_path: Path, capsys: pytest.CaptureFixture
) -> None:
    """On a non-TTY stream the loader bar is suppressed.

    Under the codified within-loader TTY policy the progress helper hands back
    the bare iterable and tqdm is never constructed. capsys's stderr reports
    isatty()=False, which is what exercises the non-TTY arm.
    """
    from loghunter.common.loader import _parse_ndjson_file

    records = [
        json.dumps({"ts": float(n), "id.orig_h": f"192.0.2.{n}"})
        for n in range(1, 6)
    ]
    log_path = tmp_path / "conn.log"
    log_path.write_text("\n".join(records) + "\n", encoding="utf-8")

    frame = _parse_ndjson_file(log_path)  # default show_progress=True

    stderr_text = capsys.readouterr().err
    assert "loaded conn.log" not in stderr_text
    assert len(frame) == 5
1307
+
1308
+
1309
def test_parse_ndjson_show_progress_false_suppresses_loader_bar(
    tmp_path: Path, capsys: pytest.CaptureFixture
) -> None:
    """``show_progress=False`` suppresses the bar without changing the frame.

    The call still goes through the shared progress helper, which returns the
    bare iterable without constructing tqdm. A multi-file digest fan-out
    passes show_progress=False so per-file bars don't interleave between
    rendered cards. Suppression is purely cosmetic: the frame must match what
    the default-True path returns.
    """
    from loghunter.common.loader import _parse_ndjson_file

    records = [
        json.dumps({"ts": float(n), "id.orig_h": f"192.0.2.{n}"})
        for n in range(1, 6)
    ]
    log_path = tmp_path / "conn.log"
    log_path.write_text("\n".join(records) + "\n", encoding="utf-8")

    frame = _parse_ndjson_file(log_path, show_progress=False)

    stderr_text = capsys.readouterr().err
    assert "loaded conn.log" not in stderr_text
    assert len(frame) == 5
1333
+
1334
+
1335
+ # ── Loader progress: seam coverage (mock progress, assert kwargs) ────────────
1336
+ #
1337
+ # Each loader read path routes through the shared
1338
+ # loghunter.common.loader.progress helper. Mocking that seam keeps the tests
1339
+ # off carriage-return-byte scraping (which is brittle) and verifies the desc /
1340
+ # unit / show_progress contract each loader holds with the helper. The two
1341
+ # NDJSON byte-output tests above lock the on-TTY render — these tests lock the
1342
+ # wiring beneath it.
1343
+
1344
+
1345
class _ProgressSpy:
    """Spy that stands in for loghunter.common.loader.progress.

    Each call is logged as a dict with ``desc``, ``unit`` and
    ``show_progress``; the iterable is then handed back as a plain iterator so
    the loader under test still builds a real frame. Tests inspect ``calls``
    to check how often a loader used the helper and with which arguments.
    """

    def __init__(self) -> None:
        # One dict per invocation, in call order.
        self.calls: list[dict] = []

    def __call__(self, iterable, *, desc, show_progress=True, unit=" lines",
                 total=None, stream=None):
        """Log the call's arguments and return ``iter(iterable)``.

        ``total`` and ``stream`` are accepted for signature compatibility but
        are not recorded.
        """
        record = dict(desc=desc, unit=unit, show_progress=show_progress)
        self.calls.append(record)
        return iter(iterable)
1364
+
1365
+
1366
def test_progress_seam_tsv_wraps_pre_materialization(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """The Zeek TSV parse wraps the file handle through ``progress`` exactly once.

    The wrap happens in ``run_load`` BEFORE any per-line work, because the
    materialization that follows the prefix-preserving sniff is the slow part
    on a long log. The spy intercepts that single ``progress`` call so its
    kwargs can be checked.
    """
    from loghunter.common import loader as loader_mod

    progress_spy = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", progress_spy)

    # Minimal valid Zeek TSV — header is enough to claim TSV via #separator.
    tsv_text = (
        "#separator \\x09\n"
        "#fields\tts\tid.orig_h\tid.resp_h\tid.resp_p\tproto\n"
        "#types\ttime\taddr\taddr\tport\tenum\n"
        "1779750000.0\t192.0.2.10\t198.51.100.20\t443\ttcp\n"
        "#close\t2026-06-01-12-00-00\n"
    )
    tsv_path = tmp_path / "conn.tsv"
    tsv_path.write_text(tsv_text, encoding="utf-8")

    loader_mod.load_logs(tsv_path.parent, "*.tsv", _files=[tsv_path])

    assert len(progress_spy.calls) == 1
    only_call = progress_spy.calls[0]
    assert only_call["desc"] == "loaded conn.tsv"
    assert only_call["unit"] == " lines"
    assert only_call["show_progress"] is True
1396
+
1397
+
1398
def test_progress_seam_load_syslog_calls_per_file(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``load_syslog`` goes through ``progress`` once for each file it reads."""
    from loghunter.common import loader as loader_mod

    progress_spy = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", progress_spy)

    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    fixtures = (
        ("router.log", "Jun 1 12:00:00 router sshd[1]: hi\n"),
        ("webserver.log", "Jun 1 12:01:00 web nginx[2]: hi\n"),
    )
    for file_name, content in fixtures:
        (syslog_dir / file_name).write_text(content, encoding="utf-8")

    loader_mod.load_syslog(syslog_dir, show_progress=False)

    # Sort the descriptions so the check does not depend on read order.
    seen_descs = sorted(call["desc"] for call in progress_spy.calls)
    assert seen_descs == ["loaded router.log", "loaded webserver.log"]
    for call in progress_spy.calls:
        assert call["show_progress"] is False
    for call in progress_spy.calls:
        assert call["unit"] == " lines"
1422
+
1423
+
1424
def test_progress_seam_load_pihole_calls_per_file(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``load_pihole`` goes through ``progress`` once for each file it reads."""
    from loghunter.common import loader as loader_mod

    progress_spy = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", progress_spy)

    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    pihole_log = pihole_dir / "pihole.log"
    pihole_log.write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.10\n",
        encoding="utf-8",
    )

    loader_mod.load_pihole(pihole_dir, show_progress=True)

    assert len(progress_spy.calls) == 1
    only_call = progress_spy.calls[0]
    assert only_call["desc"] == "loaded pihole.log"
    assert only_call["unit"] == " lines"
    assert only_call["show_progress"] is True
1446
+
1447
+
1448
+ # ── CloudTrail single-iterator: per-shape input-line accounting ──────────────
1449
+ #
1450
+ # After `line_iter = progress(...)` exists in _cloudtrail_strategy_parse, ALL four
1451
+ # wire-shape branches consume from the same wrapped iterator. The progress bar
1452
+ # therefore reports actual INPUT lines (never parsed events) for every shape —
1453
+ # including the Glenn round-2 regression case: a one-line NDJSON file must
1454
+ # report `loaded x: 1 lines`, NOT zero.
1455
+
1456
+
1457
class _CountingProgressSpy:
    """Progress spy that also counts the lines actually pulled through it.

    Unlike ``_ProgressSpy`` it wraps iteration itself and keeps a per-call
    line count. That is what lets tests assert that the CloudTrail
    single-iterator drives every branch (envelope / multi-line pretty /
    NDJSON / bare-list).
    """

    def __init__(self) -> None:
        # One entry per call, holding "desc" and "line_count".
        self.calls: list[dict] = []

    def __call__(self, iterable, *, desc, show_progress=True, unit=" lines",
                 total=None, stream=None):
        """Register the call and return a generator that tallies items as they are consumed.

        The entry is appended immediately with ``line_count`` 0; the count
        only grows as the caller pulls items from the returned generator.
        """
        record = {"desc": desc, "line_count": 0}
        self.calls.append(record)

        def _tally():
            for pulled, item in enumerate(iterable, start=1):
                record["line_count"] = pulled
                yield item

        return _tally()
1479
+
1480
+
1481
def test_cloudtrail_one_line_ndjson_bar_reports_one_line_not_zero(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Regression guard: a single-event NDJSON CloudTrail file must not record 0 lines.

    The first-nonblank sniff consumes the one line through the shared wrapped
    iterator, so the bar should report 1 line consumed rather than leaving a
    ``loaded x: 0 lines`` record.
    """
    from loghunter.common import loader as loader_mod

    counting_spy = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", counting_spy)

    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    single_event = [_ct_event(eventID="only-one")]
    _ct_write_ndjson(ct_dir / "one.json.log", single_event)

    frame = loader_mod.load_cloudtrail(ct_dir)

    assert len(frame) == 1
    assert len(counting_spy.calls) == 1
    only_call = counting_spy.calls[0]
    assert only_call["desc"] == "loaded one.json.log"
    # The one input line was pulled through the wrapped iterator.
    assert only_call["line_count"] == 1
1505
+
1506
+
1507
def test_cloudtrail_multi_line_ndjson_bar_counts_every_input_line(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """The NDJSON branch's bar total equals the number of input lines.

    Both the first-nonblank sniff and the per-event stream draw from the same
    wrapped iterator.
    """
    from loghunter.common import loader as loader_mod

    counting_spy = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", counting_spy)

    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    events = [_ct_event(eventID=eid) for eid in ("a", "b", "c")]
    _ct_write_ndjson(ct_dir / "events.json.log", events)

    loader_mod.load_cloudtrail(ct_dir)

    first_call = counting_spy.calls[0]
    assert first_call["line_count"] == 3
1528
+
1529
+
1530
def test_cloudtrail_envelope_bar_counts_envelope_line(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A single-line ``{"Records": [...]}`` envelope reports 1 input line.

    The helper-wrapped iterator carries the first line (and any additional
    input lines) before the whole-document join.
    """
    from loghunter.common import loader as loader_mod

    counting_spy = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", counting_spy)

    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    events = [_ct_event(eventID="env-1"), _ct_event(eventID="env-2")]
    _ct_write_envelope_gz(ct_dir / "envelope.json.gz", events)

    loader_mod.load_cloudtrail(ct_dir)

    # Single-line envelope = exactly one input line through the wrapped iter.
    first_call = counting_spy.calls[0]
    assert first_call["line_count"] == 1
1551
+
1552
+
1553
def test_cloudtrail_pretty_multiline_bar_counts_every_input_line(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A pretty-printed multi-line document reports the full file line count.

    This is the single-document fallback, where the first line is only a JSON
    fragment. The wrapped iterator collects all remaining lines via
    ``"".join(line_iter)``, so the bar reports every line, not just 1.
    """
    from loghunter.common import loader as loader_mod

    counting_spy = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", counting_spy)

    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    events = [_ct_event(eventID=eid) for eid in ("pretty-1", "pretty-2")]
    indented = json.dumps({"Records": events}, indent=2)
    (ct_dir / "pretty.json").write_text(indented, encoding="utf-8")
    line_total = len(indented.splitlines())

    loader_mod.load_cloudtrail(ct_dir)

    first_call = counting_spy.calls[0]
    assert first_call["line_count"] == line_total
    # Sanity: this fixture really is multi-line.
    assert line_total > 1
1576
+
1577
+
1578
def test_cloudtrail_bare_list_one_line_doc_bar_reports_one_line(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A bare JSON list written on one line is counted as one input line.

    The single line is consumed by the sniff and nothing further is
    iterated, so the bar must record exactly one line.
    """
    from loghunter.common import loader as loader_mod

    counting_spy = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", counting_spy)

    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    bare_list = [_ct_event(eventID="list-1"), _ct_event(eventID="list-2")]
    serialized = json.dumps(bare_list)
    (ct_dir / "list.json").write_text(serialized, encoding="utf-8")

    loader_mod.load_cloudtrail(ct_dir)

    first_call = counting_spy.calls[0]
    assert first_call["line_count"] == 1
1598
+
1599
+
1600
def test_cloudtrail_bar_unit_is_lines_not_events(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """The CloudTrail bar is labelled ``" lines"`` and appears once per file.

    The bar counts INPUT lines, not parsed events, so the parsed-event
    iteration must never get a bar of its own — that would mislabel events
    as lines.
    """
    from loghunter.common import loader as loader_mod

    progress_spy = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", progress_spy)

    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    two_events = [_ct_event(eventID="a"), _ct_event(eventID="b")]
    _ct_write_ndjson(ct_dir / "events.json.log", two_events)

    loader_mod.load_cloudtrail(ct_dir)

    # A single file must yield a single progress call.
    recorded = progress_spy.calls
    assert len(recorded) == 1
    assert recorded[0]["unit"] == " lines"
1623
+
1624
+
1625
+ # ── show_progress threading: load_required_logs → all four families ─────────
1626
+
1627
+
1628
def test_load_required_logs_threads_show_progress_to_all_families(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``show_progress=False`` reaches every family loader.

    One fixture directory is built per family (zeek, syslog, pihole,
    cloudtrail). Every recorded progress call must carry
    ``show_progress=False``; none may fall back to the default.
    """
    from loghunter.common import loader as loader_mod

    progress_spy = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", progress_spy)

    # Zeek: one conn row.
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    conn_row = {
        "ts": 1.0,
        "id.orig_h": "192.0.2.1",
        "id.resp_h": "198.51.100.1",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(zeek_dir / "conn.log", [conn_row])

    # Flat syslog: one line.
    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    syslog_file = syslog_dir / "router.log"
    syslog_file.write_text(
        "Jun 1 12:00:00 router sshd[1]: hi\n", encoding="utf-8",
    )

    # Pi-hole: one query line.
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    pihole_file = pihole_dir / "pihole.log"
    pihole_file.write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.10\n",
        encoding="utf-8",
    )

    # CloudTrail: one NDJSON event.
    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    _ct_write_ndjson(cloudtrail_dir / "events.json.log", [_ct_event()])

    pattern_to_source = {
        "conn*.log*": "zeek_dir",
        "syslog_dir_pattern": "syslog_dir",
        "pihole_dir_pattern": "pihole_dir",
        "*.json*": "cloudtrail_dir",
    }
    source_to_dirs = {
        "zeek_dir": [zeek_dir],
        "syslog_dir": [syslog_dir],
        "pihole_dir": [pihole_dir],
        "cloudtrail_dir": [cloudtrail_dir],
    }
    loader_mod.load_required_logs(
        pattern_to_source,
        source_to_dirs,
        show_progress=False,
    )

    # The fixture has to have driven at least one reader, and no recorded
    # call may carry anything other than show_progress=False.
    recorded = progress_spy.calls
    assert recorded, "no progress calls recorded — fixture must exercise readers"
    assert all(call["show_progress"] is False for call in recorded)
1683
+
1684
+
1685
+ # ── Zeek syslog.log v1 promotion (fidelity-aware syslog schema) ───────────────
1686
+ #
1687
+ # `syslog*.log*` is the loader's glob pattern for Zeek's own syslog.log; it
1688
+ # routes through the zeek_dir branch (TSV + NDJSON) into the new
1689
+ # _normalize_zeek_syslog_df. Result must be the 7-col canonical frame with
1690
+ # minimal-5 first (ts, host, program, raw, message) and extended last
1691
+ # (facility, severity).
1692
+
1693
def test_log_type_routes_syslog_pattern() -> None:
    """The ``"syslog*.log*"`` glob resolves to the ``"syslog"`` log type."""
    from loghunter.common.loader import _log_type

    resolved = _log_type("syslog*.log*")
    assert resolved == "syslog"
1697
+
1698
+
1699
def test_normalizer_map_contains_syslog_entry() -> None:
    """The dispatch table maps ``"syslog"`` to the Zeek-syslog normalizer."""
    from loghunter.common.loader import _NORMALIZER_MAP
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    registered = _NORMALIZER_MAP["syslog"]
    assert registered is _normalize_zeek_syslog_df
1704
+
1705
+
1706
def test_load_logs_zeek_syslog_ndjson_returns_canonical_seven_columns(
    tmp_path: Path,
) -> None:
    """A Zeek ``syslog.log`` in NDJSON loads as the seven-column syslog frame.

    Expected column order: ts, host, program, raw, message, then
    facility and severity. Zeek-native connection fields must not survive.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    syslog_record = {
        "_path": "syslog",
        "ts": 1779750000.0,
        "uid": "CSL01",
        "id.orig_h": "192.0.2.10",
        "id.orig_p": 41514,
        "id.resp_h": "198.51.100.20",
        "id.resp_p": 514,
        "proto": "udp",
        "facility": "DAEMON",
        "severity": "INFO",
        "message": (
            "Jun 11 12:00:00 host1 sshd[1234]: "
            "Accepted publickey for user from 192.0.2.10"
        ),
    }
    _write_ndjson(zeek_dir / "syslog.log", [syslog_record])

    frame = load_logs(zeek_dir, "syslog*.log*")

    canonical_columns = [
        "ts", "host", "program", "raw", "message", "facility", "severity",
    ]
    assert list(frame.columns) == canonical_columns
    assert len(frame) == 1

    first_row = frame.iloc[0]
    assert first_row["host"] == "host1"
    assert first_row["program"] == "sshd"
    assert first_row["severity"] == "INFO"

    # None of the Zeek-native connection fields may remain in the frame.
    zeek_native = (
        "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto",
    )
    for native_col in zeek_native:
        assert native_col not in frame.columns
1745
+
1746
+
1747
def test_load_logs_zeek_syslog_tsv_returns_canonical_seven_columns(
    tmp_path: Path,
) -> None:
    """A Zeek ``syslog.log`` in TSV loads as the same seven-column frame.

    Mirrors the NDJSON test: the TSV front-end is expected to produce the
    identical canonical column order.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    # Zeek TSV header block followed by one data row.
    tsv_header = (
        "#separator \\x09\n"
        "#set_separator\t,\n"
        "#empty_field\t(empty)\n"
        "#unset_field\t-\n"
        "#path\tsyslog\n"
        "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
        "\tproto\tfacility\tseverity\tmessage\n"
        "#types\ttime\tstring\taddr\tport\taddr\tport"
        "\tenum\tstring\tstring\tstring\n"
    )
    tsv_row = (
        "1779750000.000000\tCSL01\t192.0.2.10\t41514\t198.51.100.20\t514"
        "\tudp\tDAEMON\tINFO"
        "\tJun 11 12:00:00 host1 sshd[1234]: Accepted publickey for user\n"
    )
    (zeek_dir / "syslog.log").write_text(tsv_header + tsv_row, encoding="utf-8")

    frame = load_logs(zeek_dir, "syslog*.log*")

    canonical_columns = [
        "ts", "host", "program", "raw", "message", "facility", "severity",
    ]
    assert list(frame.columns) == canonical_columns

    first_row = frame.iloc[0]
    assert first_row["host"] == "host1"
    assert first_row["severity"] == "INFO"
1775
+
1776
+
1777
def test_schema_warning_fires_for_zeek_syslog_missing_required_field() -> None:
    """A syslog frame lacking ``message`` produces the missing-fields warning.

    The frame carries ts/host/facility/severity only; message, program and
    raw are left out on purpose.
    """
    incomplete_record = {
        "ts": 1779750000.0,
        "host": "host1",
        "facility": "DAEMON",
        "severity": "INFO",
    }
    frame = pd.DataFrame([incomplete_record])

    warning = _schema_warning("syslog*.log*", frame)

    assert warning is not None
    assert "syslog.log fields not found" in warning
    assert "message" in warning
1791
+
1792
+
1793
def test_schema_warning_does_not_fire_for_zeek_syslog_missing_facility() -> None:
    """Absent ``facility``/``severity`` columns do not trigger a warning.

    The frame holds only ts, host, program, raw and message — the flat-feed
    shape, which is also valid for a Zeek frame missing extended fields.
    """
    minimal_record = {
        "ts": 1779750000.0,
        "host": "host1",
        "program": "sshd",
        "raw": "<14>Jun 11 12:00:00 host1 sshd: ok",
        "message": "sshd: ok",
    }
    frame = pd.DataFrame([minimal_record])

    warning = _schema_warning("syslog*.log*", frame)
    assert warning is None
1805
+
1806
+
1807
+ # ── bz2 / xz transparent decompression at _open_log ──────────────────────────
1808
+ #
1809
+ # `_open_log` is the single chokepoint every source flows through, so adding
1810
+ # bz2/xz here covers conn/dns/syslog/pihole/cloudtrail/sniff. These tests
1811
+ # observe the fix through the PUBLIC load_required_logs entry rather than
1812
+ # touching `_open_log` directly — the bug only manifests once discovery feeds
1813
+ # `_open_log`, so a sham helper-only test would miss it.
1814
+
1815
+
1816
def _make_conn_ndjson_payload() -> bytes:
    """Return two Zeek conn rows as newline-terminated NDJSON, UTF-8 encoded.

    Addresses are RFC 5737 documentation placeholders.
    """
    conn_rows = [
        {
            "_path": "conn",
            "ts": 1_779_750_000.0,
            "id.orig_h": "192.0.2.10",
            "id.resp_h": "198.51.100.20",
            "id.resp_p": 443,
            "proto": "tcp",
        },
        {
            "_path": "conn",
            "ts": 1_779_753_600.0,
            "id.orig_h": "192.0.2.11",
            "id.resp_h": "203.0.113.20",
            "id.resp_p": 22,
            "proto": "tcp",
        },
    ]
    # One JSON document per line, each line (including the last) ending in "\n".
    ndjson_lines = [json.dumps(row) + "\n" for row in conn_rows]
    return "".join(ndjson_lines).encode("utf-8")
1838
+
1839
+
1840
def test_load_required_logs_decompresses_bz2(tmp_path: Path) -> None:
    """A bzip2-compressed ``conn.log.bz2`` loads as two normal conn rows."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    compressed = bz2.compress(_make_conn_ndjson_payload())
    (zeek_dir / "conn.log.bz2").write_bytes(compressed)

    pattern = "conn*.log*"
    result = load_required_logs({pattern: "zeek_dir"}, {"zeek_dir": [zeek_dir]})

    frame = result.logs[pattern]
    assert result.record_counts == {"conn*.log*": 2}
    assert result.warnings == []
    first_triplet = list(frame[["src", "dst", "port"]].iloc[0])
    assert first_triplet == ["192.0.2.10", "198.51.100.20", 443]
1859
+
1860
+
1861
def test_load_required_logs_decompresses_xz(tmp_path: Path) -> None:
    """An xz-compressed ``conn.log.xz`` loads as two normal conn rows."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    compressed = lzma.compress(_make_conn_ndjson_payload())
    (zeek_dir / "conn.log.xz").write_bytes(compressed)

    pattern = "conn*.log*"
    result = load_required_logs({pattern: "zeek_dir"}, {"zeek_dir": [zeek_dir]})

    frame = result.logs[pattern]
    assert result.record_counts == {"conn*.log*": 2}
    assert result.warnings == []
    first_triplet = list(frame[["src", "dst", "port"]].iloc[0])
    assert first_triplet == ["192.0.2.10", "198.51.100.20", 443]
1880
+
1881
+
1882
def test_load_required_logs_corrupt_bz2_skips_with_warning(tmp_path: Path) -> None:
    """A ``.bz2`` holding non-bzip2 bytes yields an empty frame and a warning.

    The load must finish with a "could not be read" warning for the file
    rather than raising.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    (zeek_dir / "conn.log.bz2").write_bytes(b"NOTBZIP2 garbage")

    pattern = "conn*.log*"
    result = load_required_logs({pattern: "zeek_dir"}, {"zeek_dir": [zeek_dir]})

    assert result.logs[pattern].empty
    needle = "conn.log.bz2 could not be read"
    assert any(needle in warning for warning in result.warnings)
1898
+
1899
+
1900
def test_load_required_logs_corrupt_xz_skips_with_warning(tmp_path: Path) -> None:
    """A ``.xz`` holding non-xz bytes is skipped with the "corrupt" warning.

    ``lzma.LZMAError`` is not an ``OSError`` subclass, so this exercises the
    separate handling it needs: the load must return an empty frame and a
    read warning instead of propagating the exception.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    (zeek_dir / "conn.log.xz").write_bytes(b"NOTXZ garbage")

    pattern = "conn*.log*"
    result = load_required_logs({pattern: "zeek_dir"}, {"zeek_dir": [zeek_dir]})

    assert result.logs[pattern].empty

    # Both fragments must appear in the SAME warning: that shows the error
    # was classified as "incomplete or corrupt" rather than handled by a
    # generic fallback message.
    read_fragment = "conn.log.xz could not be read"
    corrupt_fragment = "incomplete or corrupt"
    assert any(
        read_fragment in warning and corrupt_fragment in warning
        for warning in result.warnings
    )
1922
+
1923
+
1924
+ # ── load_pihole: corrupt compressed-file skip-with-warning ──────────────────
1925
+ #
1926
+ # Mirror of load_syslog's corrupt-handling: per-file try/except over the
1927
+ # decode-error family (incl. lzma.LZMAError, which isn't an OSError), so a
1928
+ # corrupt .gz/.bz2/.xz in a pihole_dir doesn't take down the whole load.
1929
+
1930
+
1931
@pytest.mark.parametrize("suffix, corrupt_bytes", [
    (".gz", b"NOTGZIP garbage"),
    (".bz2", b"NOTBZIP2 garbage"),
    (".xz", b"NOTXZ garbage"),
])
def test_load_pihole_corrupt_compressed_file_skipped_with_warning(
    tmp_path: Path, suffix: str, corrupt_bytes: bytes,
) -> None:
    """A corrupt compressed Pi-hole file is skipped; its good sibling loads.

    Only the presence of a "could not be read" warning for the corrupt file
    is asserted; which warning wording each codec ends up with is
    deliberately left open.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()

    good_file = pihole_dir / "pihole.log"
    good_file.write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.1\n",
        encoding="utf-8",
    )
    corrupt_name = f"pihole.log{suffix}"
    (pihole_dir / corrupt_name).write_bytes(corrupt_bytes)

    collected: list[str] = []
    frame = load_pihole(pihole_dir, _warnings=collected)

    # The readable file contributes its single row.
    assert len(frame) == 1
    # The unreadable file is reported through the warnings list.
    needle = f"{corrupt_name} could not be read"
    assert any(needle in warning for warning in collected)
1961
+
1962
+
1963
def test_load_pihole_corrupt_xz_lands_in_incomplete_or_corrupt_branch(
    tmp_path: Path,
) -> None:
    """A corrupt ``pihole.log.xz`` gets the "incomplete or corrupt" wording.

    Both fragments must appear in one warning, which shows the
    ``lzma.LZMAError`` was classified as corruption instead of falling
    through to a generic message.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    (pihole_dir / "pihole.log.xz").write_bytes(b"NOTXZ garbage")

    collected: list[str] = []
    load_pihole(pihole_dir, _warnings=collected)

    read_fragment = "pihole.log.xz could not be read"
    corrupt_fragment = "incomplete or corrupt"
    assert any(
        read_fragment in warning and corrupt_fragment in warning
        for warning in collected
    )
1980
+
1981
+
1982
+ # ── load_pihole: truncated (trailer-corrupt) compressed file honesty rail ──
1983
+ #
1984
+ # Truncated compressed files yield valid-looking lines until the trailer
1985
+ # check raises. Pre-honesty-fix, those pre-EOF rows leaked into the returned
1986
+ # frame even as the loader warned the file had been "skipped". Honesty rail:
1987
+ # a file the loader warns it skipped contributes ZERO rows.
1988
+
1989
# Twenty dnsmasq query lines (one per minute, host0..host19), each terminated
# by a newline; used as the plaintext for the truncated-archive fixtures.
_PIHOLE_TRUNCATE_PAYLOAD = "".join(
    f"Jun 1 12:{i:02d}:00 dnsmasq[1]: query[A] host{i}.example.test from 192.0.2.{i + 1}" + "\n"
    for i in range(20)
)
1993
+
1994
+
1995
def _pihole_truncated_compressed(payload: bytes, suffix: str) -> bytes:
    """Compress ``payload`` for ``suffix`` and drop the final byte.

    Removing the last byte leaves the compressed stream incomplete, which is
    the "truncated archive" shape the Pi-hole honesty tests feed the loader.

    Args:
        payload: Plaintext bytes to compress.
        suffix: One of ``".gz"``, ``".bz2"`` or ``".xz"``.

    Returns:
        The compressed bytes with the last byte removed.

    Raises:
        ValueError: If ``suffix`` is not one of the supported values.
    """
    if suffix == ".gz":
        # Pin the header timestamp: by default gzip embeds the current time,
        # which makes the fixture bytes differ from one run to the next.
        compressed = gzip.compress(payload, mtime=0)
    elif suffix == ".bz2":
        compressed = bz2.compress(payload)
    elif suffix == ".xz":
        compressed = lzma.compress(payload)
    else:
        raise ValueError(f"unsupported suffix {suffix!r}")
    return compressed[:-1]
2003
+
2004
+
2005
@pytest.mark.parametrize("suffix", [".gz", ".bz2", ".xz"])
def test_load_pihole_trailer_corrupt_compressed_contributes_zero_rows(
    tmp_path: Path, suffix: str,
) -> None:
    """A truncated compressed Pi-hole file warns and adds no rows.

    The uncompressed companion file must still load, so the resulting frame
    holds exactly its one row.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()

    # Companion file with a single, recognisable query.
    companion = pihole_dir / "pihole.log"
    companion.write_text(
        "Jun 1 23:59:00 dnsmasq[1]: query[A] companion.example.test from 192.0.2.99\n",
        encoding="utf-8",
    )

    truncated_name = f"pihole.log{suffix}"
    truncated_bytes = _pihole_truncated_compressed(
        _PIHOLE_TRUNCATE_PAYLOAD.encode("utf-8"), suffix,
    )
    (pihole_dir / truncated_name).write_bytes(truncated_bytes)

    collected: list[str] = []
    frame = load_pihole(pihole_dir, _warnings=collected)

    needle = f"{truncated_name} could not be read"
    assert any(needle in warning for warning in collected)

    # Rows decoded before the truncation point must not leak into the frame:
    # the companion row is the only one allowed.
    assert len(frame) == 1
    assert frame.iloc[0]["query"] == "companion.example.test"
2034
+
2035
+
2036
+ # ── load_required_logs threading for the flat readers ──────────────────────
2037
+
2038
+
2039
def test_load_required_logs_syslog_corrupt_xz_does_not_traceback(
    tmp_path: Path,
) -> None:
    """A corrupt ``system.log.xz`` in syslog_dir degrades to a warning.

    ``load_required_logs`` must not let ``lzma.LZMAError`` escape; the good
    companion file still contributes its row and the corrupt file is
    reported in ``LoadResult.warnings``.
    """
    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    (syslog_dir / "system.log.xz").write_bytes(b"NOTXZ garbage")

    # A readable sibling so the load has something to return.
    good_file = syslog_dir / "router.log"
    good_file.write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )

    pattern = "*.log*"
    result = load_required_logs(
        {pattern: "syslog_dir"}, {"syslog_dir": [syslog_dir]},
    )

    frame = result.logs[pattern]
    assert len(frame) == 1
    assert frame.iloc[0]["host"] == "router"

    read_fragment = "system.log.xz could not be read"
    corrupt_fragment = "incomplete or corrupt"
    assert any(
        read_fragment in warning and corrupt_fragment in warning
        for warning in result.warnings
    )
2068
+
2069
+
2070
def test_load_required_logs_pihole_corrupt_xz_does_not_traceback(
    tmp_path: Path,
) -> None:
    """A corrupt ``pihole.log.xz`` in pihole_dir degrades to a warning.

    Same shape as the syslog variant: the good file loads one row and the
    corrupt file is reported with the "incomplete or corrupt" wording.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    (pihole_dir / "pihole.log.xz").write_bytes(b"NOTXZ garbage")

    good_file = pihole_dir / "pihole.log"
    good_file.write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.1\n",
        encoding="utf-8",
    )

    pattern = "pihole*.log*"
    result = load_required_logs(
        {pattern: "pihole_dir"}, {"pihole_dir": [pihole_dir]},
    )

    frame = result.logs[pattern]
    assert len(frame) == 1

    read_fragment = "pihole.log.xz could not be read"
    corrupt_fragment = "incomplete or corrupt"
    assert any(
        read_fragment in warning and corrupt_fragment in warning
        for warning in result.warnings
    )
2094
+
2095
+
2096
def test_load_required_logs_gz_regression(tmp_path: Path) -> None:
    """A gzip-compressed ``conn.log.gz`` still loads two rows, no warnings."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    compressed = gzip.compress(_make_conn_ndjson_payload())
    (zeek_dir / "conn.log.gz").write_bytes(compressed)

    result = load_required_logs(
        {"conn*.log*": "zeek_dir"}, {"zeek_dir": [zeek_dir]},
    )

    assert result.record_counts == {"conn*.log*": 2}
    assert result.warnings == []
2111
+
2112
+
2113
+ # ── CoverageTracker: tri-state SourceCoverage contract ─────────────────────────
2114
+ #
2115
+ # The tracker is the single mechanism every loader (and the runner's flat-Zeek
2116
+ # default-window block) uses to record what was attempted vs what was kept.
2117
+ # These tests pin the four arms of `coverage(frame_empty)` plus the kept
2118
+ # short-circuit.
2119
+
2120
+
2121
def test_coverage_tracker_no_files_read_returns_none_full_rows() -> None:
    """A fresh tracker with no files noted reports ``(None, None)``.

    This is the state a loader leaves behind when discovery finds nothing
    to open (e.g. date-pruned dated Zeek directories).
    """
    tracker = CoverageTracker()

    reported = tracker.coverage(True)
    assert reported == SourceCoverage(None, None)
2126
+
2127
+
2128
def test_coverage_tracker_files_read_no_valid_ts_returns_zero_full_rows() -> None:
    """Files noted as read but no timestamps observed reports ``(0, None)``.

    Models empty, header-only or unparseable-ts files: a parse gap rather
    than a window gap.
    """
    tracker = CoverageTracker()
    for _ in range(2):
        tracker.note_file_read()

    reported = tracker.coverage(True)
    assert reported == SourceCoverage(0, None)
2136
+
2137
+
2138
def test_coverage_tracker_observe_counts_valid_ts_and_tracks_span() -> None:
    """``observe`` counts valid timestamps and tracks their min/max.

    ``None`` and NaN are fed in as well; they must neither be counted nor
    affect the reported span.
    """
    tracker = CoverageTracker()
    tracker.note_file_read()
    for ts in (100.0, 200.0, 50.0, None, float("nan")):
        tracker.observe(ts)

    summary = tracker.coverage(True)

    assert summary is not None
    assert summary.full_rows == 3
    assert summary.full_span is not None
    earliest, latest = summary.full_span
    assert earliest.timestamp() == 50.0
    assert latest.timestamp() == 200.0
2155
+
2156
+
2157
def test_coverage_tracker_observe_frame_counts_valid_ts_and_tracks_span() -> None:
    """``observe_frame`` counts a frame's valid-ts rows and folds in its span.

    The frame has three real timestamps and one NaN; the NaN row must be
    excluded from both the count and the span.
    """
    tracker = CoverageTracker()
    tracker.note_file_read()
    pre_window = pd.DataFrame({"ts": [10.0, 20.0, float("nan"), 30.0]})
    tracker.observe_frame(pre_window)

    summary = tracker.coverage(True)

    assert summary is not None
    assert summary.full_rows == 3
    assert summary.full_span is not None
    assert summary.full_span[0].timestamp() == 10.0
    assert summary.full_span[1].timestamp() == 30.0
2171
+
2172
+
2173
def test_coverage_tracker_kept_short_circuits_to_none() -> None:
    """Once ``mark_kept`` has been called, ``coverage`` returns ``None``.

    Further ``observe`` calls are made after the latch (they are intended to
    be no-ops); this test checks only that ``coverage`` stays ``None`` for
    both ``frame_empty`` values.
    """
    tracker = CoverageTracker()
    tracker.note_file_read()
    tracker.observe(100.0)
    tracker.mark_kept()

    # Observations arriving after the latch.
    for late_ts in (200.0, 300.0):
        tracker.observe(late_ts)

    # Non-empty frame: nothing to disclose.
    assert tracker.coverage(False) is None
    # Empty frame with the latch set: still suppressed (defensive case).
    assert tracker.coverage(True) is None
2189
+
2190
+
2191
def test_coverage_tracker_frame_nonempty_returns_none() -> None:
    """``coverage(False)`` is ``None`` even when ``mark_kept`` was never called."""
    tracker = CoverageTracker()
    tracker.note_file_read()
    tracker.observe(100.0)

    reported = tracker.coverage(False)
    assert reported is None
2198
+
2199
+
2200
+ # ── Per-loader coverage integration (loader-level, no runner) ─────────────────
2201
+
2202
+
2203
def test_load_logs_dated_zeek_outside_window_writes_coverage_none(
    tmp_path: Path,
) -> None:
    """Dated Zeek data entirely outside the window records ``(None, None)``.

    The only dated subdirectory is 2025-01-01 while the requested window is
    all of 2030, so no file is expected to be read. The loader must still
    write a coverage entry on that early-return path.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    dated_subdir = zeek_dir / "2025-01-01"
    dated_subdir.mkdir()
    old_row = {
        "ts": datetime(2025, 1, 1, tzinfo=timezone.utc).timestamp(),
        "id.orig_h": "192.0.2.1",
        "id.resp_h": "198.51.100.1",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(dated_subdir / "conn.log", [old_row])

    coverage_out: dict = {}
    frame = load_logs(
        zeek_dir,
        "conn*.log*",
        since=datetime(2030, 1, 1, tzinfo=timezone.utc),
        until=datetime(2030, 12, 31, tzinfo=timezone.utc),
        _coverage=coverage_out,
    )

    assert frame.empty
    assert "coverage" in coverage_out
    assert coverage_out["coverage"] == SourceCoverage(None, None)
2230
+
2231
+
2232
def test_load_logs_empty_zeek_file_writes_coverage_zero(tmp_path: Path) -> None:
    """A zero-byte ``conn.log`` is read but yields ``(0, None)`` coverage.

    This is the parse-gap case: a file was opened, yet no valid-ts rows came
    out of it, so suggesting a wider window would be misleading.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    empty_file = zeek_dir / "conn.log"
    empty_file.write_text("", encoding="utf-8")

    coverage_out: dict = {}
    frame = load_logs(zeek_dir, "conn*.log*", _coverage=coverage_out)

    assert frame.empty
    assert coverage_out.get("coverage") == SourceCoverage(0, None)
2246
+
2247
+
2248
def test_load_logs_populated_writes_no_coverage_entry(tmp_path: Path) -> None:
    """A normal load that returns rows leaves the coverage dict untouched."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    conn_row = {
        "ts": 1_700_000_000.0,
        "id.orig_h": "192.0.2.1",
        "id.resp_h": "198.51.100.1",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(zeek_dir / "conn.log", [conn_row])

    coverage_out: dict = {}
    frame = load_logs(zeek_dir, "conn*.log*", _coverage=coverage_out)

    assert not frame.empty
    assert "coverage" not in coverage_out
2263
+
2264
+
2265
def test_load_pihole_stale_data_writes_coverage_span(tmp_path: Path) -> None:
    """Pi-hole rows that all predate the window produce a coverage entry.

    When the loader counted valid-ts rows, a span must accompany the count.
    If the year-suffixed fixture lines did not parse to any valid timestamp,
    the test accepts the parse-gap result (``full_rows == 0``) instead.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    # The year is spelled out so year-guess heuristics cannot shift the data.
    stale_lines = (
        "Jun 1 12:00:00 2025 dnsmasq[1]: query[A] example.test from 192.0.2.10\n"
        "Jun 1 12:01:00 2025 dnsmasq[1]: reply example.test is 203.0.113.1\n"
    )
    (pihole_dir / "pihole.log").write_text(stale_lines, encoding="utf-8")

    coverage_out: dict = {}
    frame = load_pihole(
        pihole_dir,
        since=datetime(2030, 1, 1, tzinfo=timezone.utc),
        until=datetime(2030, 12, 31, tzinfo=timezone.utc),
        _coverage=coverage_out,
    )

    assert frame.empty
    summary = coverage_out.get("coverage")
    assert summary is not None

    saw_valid_rows = summary.full_rows is not None and summary.full_rows > 0
    if saw_valid_rows:
        # Span arm: a positive row count must come with a span.
        assert summary.full_span is not None
    else:
        # Parse-gap arm: tolerated when the fixture's format yields no ts.
        assert summary.full_rows == 0
2298
+
2299
+
2300
def test_load_pihole_wrong_family_only_skips_silently(tmp_path: Path) -> None:
    """An NDJSON file in pihole_dir is not counted as a file that was read.

    The directory holds only a wrong-family file, so the loader must return
    an empty frame and record ``(None, None)`` — the "no files read" state —
    rather than ``(0, None)``.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    foreign_row = {"ts": 1.0, "extra": "irrelevant"}
    _write_ndjson(pihole_dir / "looks-like-zeek.log", [foreign_row])

    coverage_out: dict = {}
    frame = load_pihole(pihole_dir, _coverage=coverage_out)

    assert frame.empty
    # The skipped file must not register as read, hence full_rows is None.
    summary = coverage_out.get("coverage")
    assert summary == SourceCoverage(None, None)
2321
+
2322
+
2323
def test_load_cloudtrail_all_unparseable_eventtime_writes_coverage_zero(
    tmp_path: Path,
) -> None:
    """CloudTrail events with unparseable ``eventTime`` record ``(0, None)``.

    The file is read, but no event contributes a valid timestamp, so the
    result is the parse-gap state.
    """
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    bad_events = [
        _ct_event(eventTime="not-a-timestamp", eventID="bad-1"),
        _ct_event(eventTime="also-not-a-time", eventID="bad-2"),
    ]
    _ct_write_ndjson(ct_dir / "events.json.log", bad_events)

    coverage_out: dict = {}
    frame = load_cloudtrail(ct_dir, _coverage=coverage_out)

    assert frame.empty
    assert coverage_out.get("coverage") == SourceCoverage(0, None)
2341
+
2342
+
2343
def test_load_cloudtrail_stale_data_writes_coverage_span(tmp_path: Path) -> None:
    """CloudTrail events that all predate the window record a count and span."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    stale_events = [
        _ct_event(eventTime="2025-06-01T12:00:00Z", eventID="a"),
        _ct_event(eventTime="2025-06-02T12:00:00Z", eventID="b"),
    ]
    _ct_write_ndjson(ct_dir / "events.json.log", stale_events)

    coverage_out: dict = {}
    frame = load_cloudtrail(
        ct_dir,
        since=datetime(2030, 1, 1, tzinfo=timezone.utc),
        until=datetime(2030, 12, 31, tzinfo=timezone.utc),
        _coverage=coverage_out,
    )

    assert frame.empty
    summary = coverage_out.get("coverage")
    assert summary is not None
    assert summary.full_rows == 2
    assert summary.full_span is not None
2366
+
2367
+
2368
def test_load_required_logs_assembles_per_pattern_coverage(tmp_path: Path) -> None:
    """``LoadResult.coverage`` is keyed by the requested pattern.

    A dated Zeek directory from 2025 is loaded with a 2030 window; the
    coverage for ``"conn*.log*"`` must be the ``(None, None)`` state.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    dated_subdir = zeek_dir / "2025-01-01"
    dated_subdir.mkdir()
    old_row = {
        "ts": datetime(2025, 1, 1, tzinfo=timezone.utc).timestamp(),
        "id.orig_h": "192.0.2.1",
        "id.resp_h": "198.51.100.1",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(dated_subdir / "conn.log", [old_row])

    pattern = "conn*.log*"
    result = load_required_logs(
        {pattern: "zeek_dir"},
        {"zeek_dir": [zeek_dir]},
        since=datetime(2030, 1, 1, tzinfo=timezone.utc),
        until=datetime(2030, 12, 31, tzinfo=timezone.utc),
    )

    assert pattern in result.coverage
    assert result.coverage[pattern] == SourceCoverage(None, None)
2390
+
2391
+
2392
# ── run_load guarantee + _SOURCE_LOADERS tripwire + Zeek TSV regressions ─────
#
# These tests lock the refactor's load-bearing contracts: a fake
# ``SourceLoader`` driven through ``run_load`` exercises the uniform pipeline
# WITHOUT any format-specific wiring (progress + coverage + windowing +
# verbose-gated skip + read-corruption rail); the tripwire asserts every
# detector source-key is registered; the Zeek TSV regressions confirm the
# prefix-preserving sniff hands the full header block to ``parse_tsv_log``.
2400
+
2401
+
2402
def test_run_load_fake_strategy_exercises_pipeline_mechanics(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
    capsys: pytest.CaptureFixture,
) -> None:
    """A FAKE ``SourceLoader`` driven through ``run_load`` exercises the
    pipeline's contract with ZERO format-specific wiring: progress is wrapped
    once per file, coverage is written for empty/window-excluded loads,
    in-window rows survive, NaN-ts under ``keep`` policy bypasses the window,
    verbose=True prints the skip message to stderr while verbose=False stays
    quiet, and a per-file decompression failure rides
    ``_zeek_file_read_warning`` without aborting the load.
    """
    from loghunter.common import loader as loader_mod

    # Spy the progress seam (intercepts kwargs without consuming the iterable).
    calls: list[dict] = []

    def progress_spy(iterable, *, desc, show_progress=True, unit=" lines",
                     total=None, stream=None):
        calls.append({"desc": desc, "unit": unit, "show_progress": show_progress})
        return iter(iterable)

    monkeypatch.setattr(loader_mod, "progress", progress_spy)

    # --- Build a fake stream strategy. parse yields canonical row dicts.
    def fake_parse(line_iter, *, path, warnings):  # noqa: ARG001
        for line in line_iter:
            raw = line.rstrip("\n")
            ts_token, host = raw.split("\t", 1)
            # "NA" is this fixture's marker for a row without a timestamp.
            ts = float(ts_token) if ts_token != "NA" else float("nan")
            yield {"ts": ts, "host": host, "raw": raw}

    def fake_skip(path: Path) -> str | None:
        # Skip files named with .skip extension.
        return f"fake: skipping {path.name}" if path.suffix == ".skip" else None

    strategy_keep = loader_mod.SourceLoader(
        discover=lambda p, pat, s, u: [p],  # noqa: ARG005
        mode="stream",
        parse=fake_parse,
        ts_policy="keep",
        columns=["ts", "host", "raw"],
        should_skip=fake_skip,
        normalize=None,
    )

    # --- File 1: an in-window row + a NaN-ts row + an out-of-window row.
    f_data = tmp_path / "good.log"
    f_data.write_text(
        f"{1.0}\tA\n"
        "NA\tB\n"
        f"{99.0}\tC\n",
        encoding="utf-8",
    )
    # --- File 2: should_skip drops this one.
    f_skip = tmp_path / "wrong.skip"
    f_skip.write_text("ignored\n", encoding="utf-8")
    # --- File 3: corrupt gzip — read-corruption rail catches.
    f_bad = tmp_path / "bad.gz"
    f_bad.write_bytes(b"not a real gzip stream")

    warnings: list[str] = []
    coverage: dict = {}

    # Quiet default: skip message NOT printed.
    df = loader_mod.run_load(
        strategy_keep,
        [f_data, f_skip, f_bad],
        pattern="",
        since=datetime.fromtimestamp(0.0, tz=timezone.utc),
        until=datetime.fromtimestamp(10.0, tz=timezone.utc),
        show_progress=True,
        verbose=False,
        _warnings=warnings,
        _coverage=coverage,
    )

    # In-window row (ts=1.0, host=A) + NaN-ts row (host=B, bypasses window).
    # Out-of-window (ts=99.0) dropped; skipped file contributes zero; corrupt
    # file caught with a read-warning.
    assert sorted(df["host"].tolist()) == ["A", "B"]
    captured = capsys.readouterr()
    assert "fake: skipping" not in captured.err  # quiet default
    # Read-corruption rail: bad.gz produced ONE warning, no traceback.
    assert any("bad.gz" in w for w in warnings)
    assert len(warnings) == 1
    # Progress was wrapped for the two readable files (not the skipped one).
    assert {c["desc"] for c in calls} == {"loaded good.log", "loaded bad.gz"}
    # mark_kept fired → no coverage write needed.
    assert "coverage" not in coverage

    # Now verbose=True surfaces the skip message; rebuild the spy log.
    calls.clear()
    capsys.readouterr()  # drain
    warnings2: list[str] = []
    coverage2: dict = {}
    df2 = loader_mod.run_load(
        strategy_keep,
        [f_data, f_skip],
        pattern="",
        since=datetime.fromtimestamp(0.0, tz=timezone.utc),
        until=datetime.fromtimestamp(10.0, tz=timezone.utc),
        show_progress=True,
        verbose=True,
        _warnings=warnings2,
        _coverage=coverage2,
    )
    captured = capsys.readouterr()
    assert "fake: skipping wrong.skip" in captured.err
    assert sorted(df2["host"].tolist()) == ["A", "B"]

    # Empty-load returns column-stable empty frame AND writes coverage for the
    # date-pruned case (no files).
    coverage3: dict = {}
    df3 = loader_mod.run_load(
        strategy_keep,
        [],
        pattern="",
        since=None,
        until=None,
        show_progress=False,
        verbose=False,
        _warnings=None,
        _coverage=coverage3,
    )
    assert df3.empty
    assert list(df3.columns) == ["ts", "host", "raw"]
    assert coverage3.get("coverage") == SourceCoverage(None, None)
2530
+
2531
+
2532
def test_run_load_drop_policy_discards_nan_ts(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``ts_policy='drop'`` discards NaN-ts rows before windowing — the
    other half of the policy fork (the ``keep`` half is exercised above)."""
    from loghunter.common import loader as loader_mod

    # Replace the progress wrapper with a pass-through iterator.
    monkeypatch.setattr(
        loader_mod,
        "progress",
        lambda iterable, *, desc, show_progress=True, unit=" lines",
        total=None, stream=None: iter(iterable),
    )

    def fake_parse(line_iter, *, path, warnings):  # noqa: ARG001
        for line in line_iter:
            raw = line.rstrip("\n")
            ts_token, host = raw.split("\t", 1)
            # "NA" is this fixture's marker for a row without a timestamp.
            ts = float(ts_token) if ts_token != "NA" else float("nan")
            yield {"ts": ts, "host": host, "raw": raw}

    strategy_drop = loader_mod.SourceLoader(
        discover=lambda p, pat, s, u: [p],  # noqa: ARG005
        mode="stream",
        parse=fake_parse,
        ts_policy="drop",
        columns=["ts", "host", "raw"],
        should_skip=None,
        normalize=None,
    )

    f = tmp_path / "mix.log"
    f.write_text(f"{1.0}\tA\nNA\tB\n", encoding="utf-8")

    df = loader_mod.run_load(
        strategy_drop, [f], pattern="",
        since=None, until=None,
        show_progress=False, verbose=False,
    )
    # NaN-ts row dropped; in-window row kept.
    assert df["host"].tolist() == ["A"]
2573
+
2574
+
2575
def test_source_loaders_keyspace_covers_every_detector_source_key() -> None:
    """Additive tripwire: every detector ``REQUIRED_LOGS``/``OPTIONAL_LOGS``
    source key has a ``_SOURCE_LOADERS`` entry. A new source family that
    skips registry registration will fail this test instead of producing a
    ``ValueError("unknown source key …")`` at runtime."""
    import importlib
    import pkgutil

    from loghunter.common.loader import _SOURCE_LOADERS
    from loghunter import detectors as _detectors_pkg

    seen_keys: set[str] = set()
    for modinfo in pkgutil.iter_modules(_detectors_pkg.__path__):
        mod = importlib.import_module(f"loghunter.detectors.{modinfo.name}")
        # A missing or falsy attribute counts as "declares no logs".
        declared = [
            *(getattr(mod, "REQUIRED_LOGS", []) or []),
            *(getattr(mod, "OPTIONAL_LOGS", []) or []),
        ]
        for log in declared:
            source = log.get("source")
            if source:
                seen_keys.add(source)

    missing = seen_keys - set(_SOURCE_LOADERS)
    assert not missing, f"detector source keys lacking _SOURCE_LOADERS entries: {missing}"
2597
+
2598
+
2599
def test_zeek_tsv_mixed_prefix_preserves_header_directives(tmp_path: Path) -> None:
    """Glenn rev-3 fix: the Zeek frame strategy's prefix-preserving sniff
    hands the FULL header block (#separator, #fields, #types, #path) to
    ``parse_tsv_log`` so a real conn.tsv with a data row parses correctly.
    A one-line peek would discard the header directives and the parser
    would fail or produce a bare frame."""
    f = tmp_path / "conn.log"
    # Header directives, one data row, then the #close footer.
    f.write_text(
        "#separator \\x09\n"
        "#set_separator\t,\n"
        "#empty_field\t(empty)\n"
        "#unset_field\t-\n"
        "#path\tconn\n"
        "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\tservice\tduration\torig_bytes\tresp_bytes\tconn_state\tlocal_orig\tlocal_resp\thistory\n"
        "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\tstring\tinterval\tcount\tcount\tstring\tbool\tbool\tstring\n"
        "1748649600.000000\tCTest01\t192.0.2.10\t51514\t203.0.113.20\t443\ttcp\tssl\t3.5\t1500\t8200\tSF\tT\tF\t(empty)\n"
        "#close\t2026-06-01-12-00-00\n",
        encoding="utf-8",
    )

    df = load_logs(f.parent, "conn*.log*", _files=[f])
    assert not df.empty
    # The conn normalizer runs over the parsed frame; canonical columns appear.
    assert "src" in df.columns
    assert "dst" in df.columns
    assert df.iloc[0]["src"] == "192.0.2.10"
2625
+
2626
+
2627
def test_zeek_tsv_header_only_returns_bare_empty_preserving_header_block(
    tmp_path: Path,
) -> None:
    """A header-only TSV (header block + #close, no data row) flows through
    the same prefix-preserving sniff to ``parse_tsv_log``; behavior matches
    today (parser produces an empty/header-only frame, the load returns
    bare empty after normalize)."""
    f = tmp_path / "conn.log"
    f.write_text(
        "#separator \\x09\n"
        "#path\tconn\n"
        "#fields\tts\tid.orig_h\tid.resp_h\tid.resp_p\tproto\n"
        "#types\ttime\taddr\taddr\tport\tenum\n"
        "#close\t2026-06-01-12-00-00\n",
        encoding="utf-8",
    )

    df = load_logs(f.parent, "conn*.log*", _files=[f])
    # Header-only TSV: parser handles the header block; the load returns an
    # empty frame. Critically, no traceback (the prefix WAS preserved → the
    # parser saw a complete header) and the empty shape is bare — Zeek
    # empties never column-stabilize.
    assert df.empty
2650
+
2651
+
2652
def test_load_logs_single_file_bypass_runs_on_dated_zeek_basename(
    tmp_path: Path,
) -> None:
    """Digest single-file Zeek bypass regression: a Zeek file whose basename
    does NOT match ``conn*.log*`` (e.g. dated rotation
    ``2026-06-09.conn.log``) still loads when ``_files=[file]`` is provided
    — discovery is SKIPPED and the file goes straight through the Zeek
    strategy. ``run_digest`` relies on this for files routed by sniff,
    not by glob."""
    f = tmp_path / "2026-06-09.conn.log"
    f.write_text(
        "#separator \\x09\n"
        "#path\tconn\n"
        "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\tservice\tduration\torig_bytes\tresp_bytes\tconn_state\tlocal_orig\tlocal_resp\thistory\n"
        "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\tstring\tinterval\tcount\tcount\tstring\tbool\tbool\tstring\n"
        "1748649600.000000\tCTest01\t192.0.2.10\t51514\t203.0.113.20\t443\ttcp\tssl\t3.5\t1500\t8200\tSF\tT\tF\t(empty)\n"
        "#close\t2026-06-01-12-00-00\n",
        encoding="utf-8",
    )

    # Note: pattern is the GLOB the digest passes through (``conn*.log*``);
    # the basename here doesn't match it, but ``_files=`` shortcircuits
    # discovery so the file loads anyway.
    df = load_logs(f.parent, "conn*.log*", _files=[f])
    assert not df.empty
    assert df.iloc[0]["src"] == "192.0.2.10"
2678
+
2679
+
2680
# ── Flat-log rotation-peek windowing (syslog + pihole) ───────────────────────
#
# since/until are DERIVED by parsing the fixture lines themselves (not a
# hardcoded year), so the tests are independent of the machine clock AND
# inherently exercise clock parity with parse_timestamp's year-guess heuristic.
2685
+
2686
+
2687
def _dns_line(mon: str, day: int, hh: str = "12:00:00") -> str:
    """A dnsmasq/Pi-hole query line whose timestamp is ``mon day hh``.

    ``day`` is right-aligned to width 2, so single-digit days are
    space-padded.
    """
    return f"{mon} {day:>2} {hh} dnsmasq[1]: query[A] example.test from 192.0.2.1"
2690
+
2691
+
2692
def _sys_line(mon: str, day: int, hh: str = "12:00:00") -> str:
    """An RFC 3164 syslog line whose timestamp is ``mon day hh``.

    ``day`` is right-aligned to width 2, so single-digit days are
    space-padded.
    """
    return f"{mon} {day:>2} {hh} host1 sshd[1]: session opened for user"
2695
+
2696
+
2697
def _write_rot(path: Path, first_line: str, *, compress: bool = False) -> None:
    """Write a one-line rotation file; ``first_line`` is the file's OLDEST row.

    An empty ``first_line`` produces a file holding a single newline. With
    ``compress=True`` the same text is written through ``gzip`` instead of
    as plain text.
    """
    # Appending the newline covers the empty case too ("" + "\n" == "\n").
    body = f"{first_line}\n"
    if compress:
        with gzip.open(path, "wt", encoding="utf-8") as fh:
            fh.write(body)
    else:
        path.write_text(body, encoding="utf-8")
2705
+
2706
+
2707
def _make_rot_family(
    dirpath: Path,
    base: str,
    ts_by_ordinal: dict[int, tuple[str, int]],
    *,
    line_fn=_dns_line,
) -> None:
    """Build a rotation family: ordinal 0 → ``base``; N → ``base.N`` (first line
    carries the given month/day so it controls the file's oldest-row ts).

    ``dirpath`` is created if missing; ``line_fn(mon, day)`` renders each
    file's single line.
    """
    dirpath.mkdir(parents=True, exist_ok=True)
    for idx, (mon, day) in ts_by_ordinal.items():
        name = base if idx == 0 else f"{base}.{idx}"
        _write_rot(dirpath / name, line_fn(mon, day))
2720
+
2721
+
2722
# Clock parity (binding) — peek ts EQUALS the ts the loader filters on.
2723
+
2724
def test_rotation_peek_ts_matches_loader_ts_pihole(tmp_path: Path) -> None:
    """For each fixture line, the timestamp ``_peek_first_ts`` reports equals
    the ``ts`` of the first row ``load_pihole`` returns for the same file."""
    for mon, day in [("Jun", 1), ("Dec", 25)]:  # Dec exercises the year-rollback
        f = tmp_path / f"pihole_{mon}.log"
        _write_rot(f, _dns_line(mon, day))
        peek = _peek_first_ts(f)
        assert peek is not None
        assert load_pihole(f).iloc[0]["ts"] == peek.timestamp()
2731
+
2732
+
2733
def test_rotation_peek_ts_matches_loader_ts_syslog(tmp_path: Path) -> None:
    """For each fixture line, the timestamp ``_peek_first_ts`` reports equals
    the ``ts`` of the first row ``load_syslog`` returns for the same file."""
    for mon, day in [("Jun", 2), ("Dec", 31)]:
        f = tmp_path / f"sys_{mon}.log"
        _write_rot(f, _sys_line(mon, day))
        peek = _peek_first_ts(f)
        assert peek is not None
        assert load_syslog(f).iloc[0]["ts"] == peek.timestamp()
2740
+
2741
+
2742
# Per-group selection.
2743
+
2744
def test_rotation_per_group_two_dirs_keeps_both_straddles(tmp_path: Path) -> None:
    """/a and /b each {log,.1,.2,.3}; BOTH .2 straddle `since` → keep both .2,
    skip only each group's older .3. A single-stream early-stop would skip b.2."""
    tsmap = {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)}
    a, b = tmp_path / "a", tmp_path / "b"
    _make_rot_family(a, "pihole.log", tsmap)
    _make_rot_family(b, "pihole.log", tsmap)
    files = sorted(a.glob("*")) + sorted(b.glob("*"))
    since = parse_timestamp(_dns_line("Jun", 5))
    selected, info = _rotation_windowed_files(files, since, None)
    sel_a = {p.name for p in selected if p.parent == a}
    sel_b = {p.name for p in selected if p.parent == b}
    # One condition per assert so a failure names the exact expectation.
    assert "pihole.log.2" in sel_a
    assert "pihole.log.2" in sel_b
    assert "pihole.log.3" not in sel_a
    assert "pihole.log.3" not in sel_b
    assert info.loaded == 6
    assert info.skipped == 2
    assert not info.fallback
2759
+
2760
+
2761
def test_rotation_per_group_per_host_independent(tmp_path: Path) -> None:
    """router.* (newer) + server.* (older) in ONE dir: router's tail is kept
    while server is pruned independently — grouping is per (parent, base)."""
    d = tmp_path / "sys"
    _make_rot_family(d, "router.log", {0: ("Jun", 10), 1: ("Jun", 9), 2: ("Jun", 8)}, line_fn=_sys_line)
    _make_rot_family(d, "server.log", {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4)}, line_fn=_sys_line)
    files = sorted(d.glob("*"))
    since = parse_timestamp(_sys_line("Jun", 7))
    selected, info = _rotation_windowed_files(files, since, None)
    names = {p.name for p in selected}
    assert "router.log.2" in names  # router tail kept (all ≥ since)
    assert "server.log.1" not in names  # server pruned independently
    assert "server.log.2" not in names
    assert info.loaded == 4
    assert info.skipped == 2
2775
+
2776
+
2777
def test_rotation_early_stop_single_group_skips_old_tail(tmp_path: Path) -> None:
    """active(empty) + .1(in-window) + .2(straddle) selected; older .3 skipped
    and NEVER peeked (recorded with a None ts — no fabricated timestamp)."""
    d = tmp_path / "p"
    d.mkdir()
    _write_rot(d / "pihole.log", "")  # empty active → conservative include
    _write_rot(d / "pihole.log.1", _dns_line("Jun", 6))
    _write_rot(d / "pihole.log.2", _dns_line("Jun", 4))  # straddle
    _write_rot(d / "pihole.log.3", _dns_line("Jun", 2))  # old → skipped, not read
    files = sorted(d.glob("*"))
    since = parse_timestamp(_dns_line("Jun", 5))
    selected, info = _rotation_windowed_files(files, since, None)
    assert {p.name for p in selected} == {"pihole.log", "pihole.log.1", "pihole.log.2"}
    assert info.skipped == 1
    assert ("pihole.log.3", None) in info.skipped_files
2792
+
2793
+
2794
def test_rotation_conservative_includes_unpeekable_and_corrupt(tmp_path: Path) -> None:
    """A blank-only file and a corrupt .gz are INCLUDED (never aborts), and do
    not break the monotonic chain."""
    d = tmp_path / "p"
    d.mkdir()
    _write_rot(d / "pihole.log", _dns_line("Jun", 6))
    (d / "pihole.log.1").write_text("\n \n", encoding="utf-8")  # unpeekable
    (d / "pihole.log.2.gz").write_bytes(b"not a gzip stream")  # corrupt → peek raises
    files = sorted(d.glob("*"))
    since = parse_timestamp(_dns_line("Jun", 1))  # very old → all in window
    selected, info = _rotation_windowed_files(files, since, None)
    assert {p.name for p in selected} == {"pihole.log", "pihole.log.1", "pihole.log.2.gz"}
    assert info.skipped == 0
    assert not info.fallback
2807
+
2808
+
2809
def test_rotation_fallback_is_data_true_whole_pattern(tmp_path: Path) -> None:
    """One out-of-order group disables pruning for the WHOLE pattern: full set
    returned, skipped=0, and the well-formed group's would-be-skipped tail is
    NOT pruned (data-true, not just note-suppressed)."""
    a, b = tmp_path / "a", tmp_path / "b"
    _make_rot_family(a, "pihole.log", {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)})
    # b: log(Jun 8) then .1(Jun 10) — going newest→oldest the first-ts RISES → disorder.
    _make_rot_family(b, "pihole.log", {0: ("Jun", 8), 1: ("Jun", 10)})
    files = sorted(a.glob("*")) + sorted(b.glob("*"))
    since = parse_timestamp(_dns_line("Jun", 5))
    selected, info = _rotation_windowed_files(files, since, None)
    assert info.fallback is True
    assert info.skipped == 0
    assert info.loaded == len(files)
    assert {p.resolve() for p in selected} == {p.resolve() for p in files}
    # Well-formed group A's .3 would be rotation-skipped without fallback — present here.
    assert any(p.parent == a and p.name == "pihole.log.3" for p in selected)
2825
+
2826
+
2827
def test_syslog_files_drops_appledouble_and_orders_numerically(tmp_path: Path) -> None:
    """``_syslog_files`` omits the ``._``-prefixed file and lists rotations in
    numeric order (``.2`` before ``.10``)."""
    d = tmp_path / "p"
    d.mkdir()
    for name in ["._pihole.log", "pihole.log", "pihole.log.1", "pihole.log.2", "pihole.log.10"]:
        _write_rot(d / name, _dns_line("Jun", 1))
    names = [p.name for p in _syslog_files(d)]
    assert "._pihole.log" not in names
    assert names == ["pihole.log", "pihole.log.1", "pihole.log.2", "pihole.log.10"]
2835
+
2836
+
2837
# Explicit-file protection (load_required_logs end-to-end).
2838
+
2839
def test_rotation_lone_explicit_old_file_no_windowing_no_skip(tmp_path: Path) -> None:
    """An explicit OLD file → loaded, never rotation-windowed, no RotationSkipInfo."""
    old = tmp_path / "pihole.log.5"
    _write_rot(old, _dns_line("Jun", 1))
    since = parse_timestamp(_dns_line("Jun", 10))
    res = load_required_logs({"*.log*": "pihole_dir"}, {"pihole_dir": [old]}, since=since)
    assert "*.log*" not in res.rotation_skips
    assert res.data_size_bytes == old.stat().st_size
2847
+
2848
+
2849
def test_rotation_explicit_overlap_loads_not_skipped(tmp_path: Path) -> None:
    """A path the window WOULD skip, also named explicitly AND reachable via the
    dir → loaded (bytes counted) and NOT in the skip count (no fake skip)."""
    d = tmp_path / "p"
    _make_rot_family(d, "pihole.log", {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)})
    explicit = d / "pihole.log.3"  # window would skip .3; protected by the explicit input
    since = parse_timestamp(_dns_line("Jun", 5))
    res = load_required_logs(
        {"*.log*": "pihole_dir"}, {"pihole_dir": [explicit, d]}, since=since,
    )
    info = res.rotation_skips["*.log*"]
    assert info.skipped == 0
    assert info.skipped_files == []
    all_files = [d / n for n in ("pihole.log", "pihole.log.1", "pihole.log.2", "pihole.log.3")]
    assert res.data_size_bytes == sum(p.stat().st_size for p in all_files)
2863
+
2864
+
2865
def test_rotation_no_window_reads_all_no_skip(tmp_path: Path) -> None:
    """Bare load (no since/until) reads everything; no peek, no RotationSkipInfo."""
    d = tmp_path / "p"
    _make_rot_family(d, "pihole.log", {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)})
    res = load_required_logs({"*.log*": "pihole_dir"}, {"pihole_dir": [d]})
    assert "*.log*" not in res.rotation_skips
    all_files = [d / n for n in ("pihole.log", "pihole.log.1", "pihole.log.2", "pihole.log.3")]
    assert res.data_size_bytes == sum(p.stat().st_size for p in all_files)
2873
+
2874
+
2875
def test_rotation_windows_syslog_dir_family(tmp_path: Path) -> None:
    """The shared helper engages for syslog_dir too (both flat families)."""
    d = tmp_path / "s"
    _make_rot_family(d, "router.log", {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)}, line_fn=_sys_line)
    since = parse_timestamp(_sys_line("Jun", 5))
    res = load_required_logs({"*.log*": "syslog_dir"}, {"syslog_dir": [d]}, since=since)
    info = res.rotation_skips["*.log*"]
    assert info.loaded == 3
    assert info.skipped == 1
    assert not info.fallback
2883
+
2884
+
2885
def test_rotation_verbose_skip_lines_tolerate_none_ts(
    tmp_path: Path, capsys: pytest.CaptureFixture
) -> None:
    """verbose=True prints per-file skip lines; an unpeeked tail file (None ts)
    prints NO '(oldest …)' detail (never fabricates), a peeked too-new leading
    file prints its real ts. Default (verbose=False) is quiet."""
    d = tmp_path / "p"
    _make_rot_family(d, "pihole.log", {
        0: ("Jun", 10),  # too-new leading (oldest > until) → skipped, ts known
        1: ("Jun", 8),
        2: ("Jun", 6),
        3: ("Jun", 4),  # straddle since → kept
        4: ("Jun", 2),  # too-old tail → skipped, NOT peeked → ts None
    })
    files = sorted(d.glob("*"))
    since = parse_timestamp(_dns_line("Jun", 5))
    until = parse_timestamp(_dns_line("Jun", 9))

    _rotation_windowed_files(files, since, until, verbose=False)
    assert capsys.readouterr().err == ""

    _rotation_windowed_files(files, since, until, verbose=True)
    err = capsys.readouterr().err
    assert "rotation-peek: skipped pihole.log.4\n" in err  # None ts → no detail
    assert "rotation-peek: skipped pihole.log (oldest " in err  # peeked → real ts
2910
+
2911
+
2912
# ── date-stamped rotation-peek pruning (dateext + exporter output) ───────────
#
# Filename dates are ORDERING/grouping hints (+ the Family-2 structural overlap
# check); the line first-ts stays the sole prune gate. Fixtures keep the
# filename-date order aligned with the line-ts order so the peek's monotonicity
# check does not fire. since/until are derived from fixture LINES, never a
# hardcoded year.
2919
+
2920
+
2921
def _make_dateext_family(
    dirpath: Path,
    base: str,
    dated: list[tuple[str, int, int]],
    *,
    live: tuple[str, int] | None = None,
    year: int = 2026,
    line_fn=_dns_line,
) -> None:
    """Build a logrotate ``dateext`` family: an optional live ``base`` head plus
    ``base.YYYYMMDD`` files. ``dated`` = ``(mon_name, mon_num, day)`` per dated
    file; ``live`` = the undated head's ``(mon_name, day)``. Each file's first
    line carries the matching month/day so its peek ts aligns with the
    filename-date order."""
    dirpath.mkdir(parents=True, exist_ok=True)
    if live is not None:
        _write_rot(dirpath / base, line_fn(live[0], live[1]))
    for mon, mon_num, day in dated:
        # Month and day are zero-padded to two digits in the filename suffix.
        _write_rot(dirpath / f"{base}.{year}{mon_num:02d}{day:02d}", line_fn(mon, day))
2940
+
2941
+
2942
def _make_export_family(
    dirpath: Path,
    base: str,
    days: list[int],
    *,
    year: int = 2026,
    mon_num: int = 6,
    mon_name: str = "Jun",
    line_fn=_dns_line,
) -> None:
    """Build non-overlapping daily exporter files ``{base}_{YYYYMMDD}_1d.log``,
    one per day; the first line carries that day so the peek ts aligns with the
    filename-date order (`_auto_filename`'s whole-day ``_Nd`` shape)."""
    dirpath.mkdir(parents=True, exist_ok=True)
    for day in days:
        name = f"{base}_{year}{mon_num:02d}{day:02d}_1d.log"
        _write_rot(dirpath / name, line_fn(mon_name, day))
2959
+
2960
+
2961
# Classifier-level (helper) coverage.
2962
+
2963
def test_rotation_eight_digit_non_date_stays_numeric() -> None:
    """An 8-digit trailing token that is NOT a valid calendar date (month 13) is
    a numeric ordinal, not dateext — age_rank is the raw int, no window."""
    assert _classify_rotation_name("pihole.log.20241301") == ("pihole.log", 20241301, None)
2967
+
2968
+
2969
def test_rotation_export_to_form_classifies_and_orders() -> None:
    """The ``_to_`` exporter form parses to ``[start, end_date + (HH+1) h)`` (the
    end is CEILed to the next hour so the window is a guaranteed superset of the
    real until) and a newer start date yields a smaller age_rank (sorts newer)."""
    base1, rank1, win1 = _classify_rotation_name("export_20260601_to_20260608_14h.log")
    assert base1 == "export"
    assert win1 == (datetime(2026, 6, 1), datetime(2026, 6, 8, 15))  # 14h → ceil 15h
    base2, rank2, win2 = _classify_rotation_name("export_20260605_to_20260606_00h.log")
    assert base2 == "export"
    assert win2 == (datetime(2026, 6, 5), datetime(2026, 6, 6, 1))
    assert rank2 < rank1  # later start (Jun 5 > Jun 1) → newer → smaller rank
2979
+
2980
+
2981
def test_rotation_export_huge_days_falls_to_floor() -> None:
    """FIX 1 — an unbounded ``_Nd`` day count that overflows the date math is
    caught (not raised) and falls to the floor singleton."""
    assert _classify_rotation_name("foo_20260101_9999999d.log") == (
        "foo_20260101_9999999d.log",
        0,
        None,
    )
2989
+
2990
+
2991
def test_rotation_export_nonpositive_window_falls_to_floor() -> None:
    """A malformed non-positive export window — empty ``_0d`` or an inverted
    ``_to_`` (end ≤ start) — carries NO declared window (would read as disjoint
    and dodge the guards); it floors to a singleton instead."""
    assert _classify_rotation_name("splunk_20260601_0d.log") == (
        "splunk_20260601_0d.log",
        0,
        None,
    )
    assert _classify_rotation_name("export_20260608_to_20260601_00h.log") == (
        "export_20260608_to_20260601_00h.log",
        0,
        None,
    )
3005
+
3006
+
3007
def test_rotation_export_zero_day_does_not_silently_skip_sibling(tmp_path: Path) -> None:
    """P1 regression — a malformed ``_0d`` export-looking file beside a normal
    same-start ``_1d`` export must NOT silently skip the normal file. Flooring the
    ``_0d`` gives it its own base, so each is peeked independently and BOTH survive."""
    d = tmp_path / "s"
    d.mkdir()
    _write_rot(d / "splunk_20260601_0d.log", _sys_line("Jun", 1, "06:00:00"))
    _write_rot(d / "splunk_20260601_1d.log", _sys_line("Jun", 1, "18:00:00"))
    files = sorted(d.glob("*"))
    since = parse_timestamp(_sys_line("Jun", 1, "12:00:00"))
    selected, info = _rotation_windowed_files(files, since, None)
    names = {p.name for p in selected}
    assert "splunk_20260601_1d.log" in names  # the in-window normal file is NOT skipped
    assert "splunk_20260601_0d.log" in names  # the floored _0d singleton survives too
    assert not info.fallback
3022
+
3023
+
3024
def test_rotation_export_classify_superset_of_auto_filename() -> None:
    """FOLD 6 / FIX 3 — the classifier window is always a SUPERSET of the real
    ``[since, until)`` that ``exporters._auto_filename`` encoded. Couples
    ``_EXPORT_WINDOW_RE`` to the exporter format (a future format change that
    disengaged the guard would fail here) and pins the ``_to_`` ceil property."""
    # whole-day _Nd: exact window (both endpoints midnight)
    since, until = datetime(2026, 6, 1), datetime(2026, 6, 8)
    win = _classify_rotation_name(_auto_filename("splunk", since, until))[2]
    assert win is not None
    assert win[0] <= since
    assert win[1] >= until
    # partial-day _to_: non-midnight endpoints → start floors, end ceils → superset
    since, until = datetime(2026, 6, 1, 3, 30), datetime(2026, 6, 8, 14, 45)
    win = _classify_rotation_name(_auto_filename("splunk", since, until))[2]
    assert win is not None
    assert win[0] <= since
    assert win[1] >= until
3037
+
3038
+
3039
def test_rotation_export_partnn_falls_to_floor() -> None:
    """A ``_partNN`` infix is NOT claimed as an export window — it falls to the
    singleton floor (loaded-not-pruned), the safe behavior."""
    assert _classify_rotation_name("splunk_20260601_1d_part01.log") == (
        "splunk_20260601_1d_part01.log",
        0,
        None,
    )
3047
+
3048
+
3049
# Per-group selection (helper) coverage.
3050
+
3051
def test_rotation_dateext_now_prunes(tmp_path: Path) -> None:
    """dateext now PRUNES instead of falling back: a live head + dated files
    order newest→oldest and the old tail is skipped."""
    d = tmp_path / "p"
    _make_dateext_family(
        d, "pihole.log",
        dated=[("Jun", 6, 5), ("Jun", 6, 4), ("Jun", 6, 3)],
        live=("Jun", 6),
    )
    files = sorted(d.glob("*"))
    since = parse_timestamp(_dns_line("Jun", 5))
    selected, info = _rotation_windowed_files(files, since, None)
    names = {p.name for p in selected}
    assert {"pihole.log", "pihole.log.20260605", "pihole.log.20260604"} <= names
    assert "pihole.log.20260603" not in names  # old tail skipped
    assert info.loaded == 3
    assert info.skipped == 1
    assert not info.fallback
3067
+
3068
+
3069
def test_rotation_dateext_peek_ts_matches_loader_ts(tmp_path: Path) -> None:
    """For a dateext-named file, the peeked first timestamp equals the loader's ``ts``."""
    log_path = tmp_path / "auth.log.20260625"
    _write_rot(log_path, _sys_line("Jun", 25))

    peeked = _peek_first_ts(log_path)
    assert peeked is not None

    first_row = load_syslog(log_path).iloc[0]
    assert first_row["ts"] == peeked.timestamp()
3076
+
3077
+
3078
def test_rotation_export_window_prunes(tmp_path: Path) -> None:
    """Daily exporter files with non-overlapping windows are pruned.

    Four one-day ``splunk`` exports (days 6, 5, 4, 3) with ``since`` at Jun 5:
    the 0606/0605/0604 files are selected, the 0603 file is skipped, and no
    fallback is recorded.
    """
    export_dir = tmp_path / "s"
    _make_export_family(export_dir, "splunk", [6, 5, 4, 3], line_fn=_sys_line)
    candidates = sorted(export_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5))

    selected, info = _rotation_windowed_files(candidates, cutoff, None)

    picked = {path.name for path in selected}
    must_keep = {
        "splunk_20260606_1d.log",
        "splunk_20260605_1d.log",
        "splunk_20260604_1d.log",
    }
    assert picked.issuperset(must_keep)
    assert "splunk_20260603_1d.log" not in picked
    assert info.loaded == 3
    assert info.skipped == 1
    assert not info.fallback
3089
+
3090
+
3091
def test_rotation_export_overlap_falls_back(tmp_path: Path) -> None:
    """Overlapping export windows under one base trigger the whole-pattern fallback.

    A ``_7d`` export and a ``_1d`` export whose window lies inside it must yield
    ``fallback=True`` with reason 'overlapping export windows', nothing skipped,
    and the full file set selected.
    """
    export_dir = tmp_path / "s"
    export_dir.mkdir()
    weekly = export_dir / "splunk_20260601_7d.log"
    daily = export_dir / "splunk_20260605_1d.log"
    _write_rot(weekly, _sys_line("Jun", 1))  # window [Jun 1, Jun 8)
    _write_rot(daily, _sys_line("Jun", 5))  # window [Jun 5, Jun 6), inside the weekly one
    candidates = sorted(export_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5))

    selected, info = _rotation_windowed_files(candidates, cutoff, None)

    assert info.fallback is True
    assert info.fallback_reason == "overlapping export windows"
    assert info.skipped == 0
    assert info.loaded == len(candidates)
    resolved_selected = {path.resolve() for path in selected}
    resolved_all = {path.resolve() for path in candidates}
    assert resolved_selected == resolved_all
3105
+
3106
+
3107
def test_rotation_export_equal_window_duplicate_falls_back(tmp_path: Path) -> None:
    """Two exports with the same window (a ``.log`` and its ``.log.gz``) fall back.

    This is the silent-miss class: the pair must NOT be pruned. It also shows
    that only the compression suffix is stripped, so both names classify to the
    same base and window.
    """
    export_dir = tmp_path / "s"
    export_dir.mkdir()
    plain = export_dir / "splunk_20260601_1d.log"
    gzipped = export_dir / "splunk_20260601_1d.log.gz"
    _write_rot(plain, _sys_line("Jun", 1))
    _write_rot(gzipped, _sys_line("Jun", 1), compress=True)
    candidates = sorted(export_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 1))

    _, info = _rotation_windowed_files(candidates, cutoff, None)

    assert info.fallback is True
    assert info.fallback_reason == "overlapping export windows"
    assert info.skipped == 0
    assert info.loaded == len(candidates)
3121
+
3122
+
3123
+ # Same-rank duplicate slots (FIX 2) — un-orderable, fall back for ALL schemes.
3124
+
3125
def test_rotation_dateext_same_date_duplicate_falls_back(tmp_path: Path) -> None:
    """A dateext file and its ``.gz`` sibling cannot be ordered, so fall back.

    Expected outcome: whole-pattern fallback with reason 'duplicate rotation
    files' and nothing skipped — not a silent skip of the in-window ``.gz``.
    """
    log_dir = tmp_path / "s"
    log_dir.mkdir()
    morning = log_dir / "auth.log.20260605"
    evening_gz = log_dir / "auth.log.20260605.gz"
    _write_rot(morning, _sys_line("Jun", 5, "06:00:00"))
    _write_rot(evening_gz, _sys_line("Jun", 5, "18:00:00"), compress=True)
    candidates = sorted(log_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5, "12:00:00"))

    _, info = _rotation_windowed_files(candidates, cutoff, None)

    assert info.fallback is True
    assert info.fallback_reason == "duplicate rotation files"
    assert info.skipped == 0
    assert info.loaded == len(candidates)
3138
+
3139
+
3140
def test_rotation_numeric_duplicate_falls_back(tmp_path: Path) -> None:
    """A numeric rotation file plus its ``.gz`` sibling triggers the duplicate fallback.

    ``pihole.log.2`` and ``pihole.log.2.gz`` share a name once the compression
    suffix is stripped; the expected reason is 'duplicate rotation files' with
    nothing skipped.
    """
    log_dir = tmp_path / "p"
    log_dir.mkdir()
    _write_rot(log_dir / "pihole.log", _dns_line("Jun", 6))
    rotated = log_dir / "pihole.log.2"
    rotated_gz = log_dir / "pihole.log.2.gz"
    _write_rot(rotated, _dns_line("Jun", 4))
    _write_rot(rotated_gz, _dns_line("Jun", 4), compress=True)
    candidates = sorted(log_dir.glob("*"))
    cutoff = parse_timestamp(_dns_line("Jun", 5))

    _, info = _rotation_windowed_files(candidates, cutoff, None)

    assert info.fallback is True
    assert info.fallback_reason == "duplicate rotation files"
    assert info.skipped == 0
3154
+
3155
+
3156
def test_rotation_live_compressed_duplicate_falls_back(tmp_path: Path) -> None:
    """A live ``.log`` next to its ``.log.gz`` is a head-of-group duplicate.

    Both share a stripped name, so the selection must fall back with reason
    'duplicate rotation files'.
    """
    log_dir = tmp_path / "s"
    log_dir.mkdir()
    _write_rot(log_dir / "auth.log", _sys_line("Jun", 6))
    _write_rot(log_dir / "auth.log.gz", _sys_line("Jun", 6), compress=True)
    candidates = sorted(log_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5))

    _, info = _rotation_windowed_files(candidates, cutoff, None)

    assert info.fallback is True
    assert info.fallback_reason == "duplicate rotation files"
3167
+
3168
+
3169
def test_rotation_zero_indexed_prunes_not_dup(tmp_path: Path) -> None:
    """A 0-indexed rotation scheme is pruned normally and not flagged as a duplicate.

    ``auth.log`` and ``auth.log.0`` have distinct stripped names, so the family
    must prune its out-of-window tail with ``fallback=False`` and no
    'duplicate rotation files' reason.
    """
    log_dir = tmp_path / "s"
    log_dir.mkdir()
    layout = (
        ("auth.log", 6),
        ("auth.log.0", 5),
        ("auth.log.1", 4),  # straddles ``since``
        ("auth.log.2", 3),  # out of window, expected to be skipped
    )
    for filename, day in layout:
        _write_rot(log_dir / filename, _sys_line("Jun", day))
    candidates = sorted(log_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5))

    selected, info = _rotation_windowed_files(candidates, cutoff, None)

    assert info.fallback is False
    assert info.fallback_reason is None  # no misleading "duplicate" note
    picked = {path.name for path in selected}
    assert "auth.log.2" not in picked
    assert info.skipped == 1
3187
+
3188
+
3189
def test_rotation_leading_zero_not_a_dup(tmp_path: Path) -> None:
    """``.02`` and ``.2`` suffixes are distinct files, not duplicates.

    Both parse to integer 2 but have distinct stripped names, so the selection
    must not report 'duplicate rotation files'.
    """
    log_dir = tmp_path / "s"
    log_dir.mkdir()
    _write_rot(log_dir / "s.log.2", _sys_line("Jun", 5))
    _write_rot(log_dir / "s.log.02", _sys_line("Jun", 4))
    candidates = sorted(log_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 1))  # very old, so everything is in window

    _, info = _rotation_windowed_files(candidates, cutoff, None)

    assert info.fallback_reason != "duplicate rotation files"
3200
+
3201
+
3202
+ # End-to-end (load_required_logs) coverage — real discovery → window_select →
3203
+ # run_load seam: selected ROWS and the RotationSkipInfo must agree.
3204
+
3205
def test_rotation_dateext_prunes_end_to_end_pihole(tmp_path: Path) -> None:
    """Dateext pruning through the ``pihole_dir`` loader, end to end.

    Three files are loaded and one skipped without fallback; the rows that
    remain in the frame fall on days 5 and 6 only.
    """
    family_dir = tmp_path / "p"
    _make_dateext_family(
        family_dir,
        "pihole.log",
        dated=[("Jun", 6, 5), ("Jun", 6, 4), ("Jun", 6, 3)],
        live=("Jun", 6),
    )
    cutoff = parse_timestamp(_dns_line("Jun", 5))

    result = load_required_logs(
        {"*.log*": "pihole_dir"}, {"pihole_dir": [family_dir]}, since=cutoff
    )

    info = result.rotation_skips["*.log*"]
    assert info.loaded == 3
    assert info.skipped == 1
    assert not info.fallback

    frame = result.logs["*.log*"]
    # NOTE(review): fromtimestamp() converts to LOCAL time; if parse_timestamp is
    # not local-time based this day check could skew in far-from-UTC zones — confirm.
    seen_days = {datetime.fromtimestamp(stamp).day for stamp in frame["ts"]}
    # Jun 3 is pruned at file level; the Jun 4 straddle file is kept but its row trimmed.
    assert seen_days == {5, 6}
3221
+
3222
+
3223
def test_rotation_export_equal_window_fallback_end_to_end_syslog(tmp_path: Path) -> None:
    """Equal-window export duplicates through the ``syslog_dir`` loader, end to end.

    The fallback is recorded with the overlap reason, nothing is skipped, and
    both files' rows appear in the loaded frame.
    """
    export_dir = tmp_path / "s"
    export_dir.mkdir()
    _write_rot(export_dir / "splunk_20260601_1d.log", _sys_line("Jun", 1))
    _write_rot(
        export_dir / "splunk_20260601_1d.log.gz", _sys_line("Jun", 1), compress=True
    )
    cutoff = parse_timestamp(_sys_line("Jun", 1))

    result = load_required_logs(
        {"*.log*": "syslog_dir"}, {"syslog_dir": [export_dir]}, since=cutoff
    )

    info = result.rotation_skips["*.log*"]
    assert info.fallback is True
    assert info.fallback_reason == "overlapping export windows"
    assert info.skipped == 0
    # Both files are read (full archive) and both rows are inside the window.
    assert len(result.logs["*.log*"]) == 2
3237
+
3238
+
3239
def test_rotation_export_huge_days_end_to_end_no_crash(tmp_path: Path) -> None:
    """FIX 1 end to end: an overflow-inducing ``_Nd`` name must not crash the load.

    A file named with ``9999999d`` sits in a flat directory beside a normal log;
    loading completes and both in-window rows are present.
    """
    log_dir = tmp_path / "s"
    log_dir.mkdir()
    _write_rot(log_dir / "foo_20260101_9999999d.log", _sys_line("Jun", 5))
    _write_rot(log_dir / "server.log", _sys_line("Jun", 6))
    cutoff = parse_timestamp(_sys_line("Jun", 5))

    result = load_required_logs(
        {"*.log*": "syslog_dir"}, {"syslog_dir": [log_dir]}, since=cutoff
    )

    # No crash, and both in-window rows were loaded.
    assert len(result.logs["*.log*"]) == 2
3250
+
3251
+
3252
def test_rotation_dateext_duplicate_rows_survive_end_to_end(tmp_path: Path) -> None:
    """FIX 2 end to end: the duplicate's in-window row survives the full read.

    A dateext file (06:00 row) and its ``.gz`` sibling (18:00 row) with ``since``
    at 12:00 must fall back as 'duplicate rotation files'; the 18:00 row is then
    kept and only the 06:00 row is removed by the since-filter.
    """
    log_dir = tmp_path / "s"
    log_dir.mkdir()
    _write_rot(log_dir / "auth.log.20260605", _sys_line("Jun", 5, "06:00:00"))
    _write_rot(
        log_dir / "auth.log.20260605.gz", _sys_line("Jun", 5, "18:00:00"), compress=True
    )
    cutoff = parse_timestamp(_sys_line("Jun", 5, "12:00:00"))

    result = load_required_logs(
        {"*.log*": "syslog_dir"}, {"syslog_dir": [log_dir]}, since=cutoff
    )

    info = result.rotation_skips["*.log*"]
    assert info.fallback
    assert info.fallback_reason == "duplicate rotation files"

    # Compare against timestamps produced by the same parser rather than a local
    # fromtimestamp() conversion, so the check does not depend on the timezone.
    loaded_ts = set(result.logs["*.log*"]["ts"])
    morning_ts = parse_timestamp(_sys_line("Jun", 5, "06:00:00")).timestamp()
    evening_ts = parse_timestamp(_sys_line("Jun", 5, "18:00:00")).timestamp()
    assert len(result.logs["*.log*"]) == 1
    assert evening_ts in loaded_ts
    assert morning_ts not in loaded_ts
3272
+
3273
+
3274
+ # ── universal default window: family helpers ─────────────────────────────────
3275
+
3276
+
3277
def test_is_bounded_family_neutral_and_zeek_alias() -> None:
    """``is_bounded`` is True only for a non-empty list of files; ``is_zeek_bounded`` agrees."""
    this_file = Path(__file__)  # a real regular file
    this_dir = this_file.parent  # a real directory

    assert is_bounded([]) is False
    assert is_bounded([this_file]) is True
    assert is_bounded([this_dir]) is False
    assert is_bounded([this_file, this_dir]) is False

    # The zeek alias must give the same answer for the same input.
    for probe in ([this_file], [this_dir]):
        assert is_zeek_bounded(probe) == is_bounded(probe)
3288
+
3289
+
3290
def test_source_ts_policy() -> None:
    """Each source strategy declares its ``ts_policy``.

    syslog and pihole are 'keep' (unparseable-ts rows retained); zeek and
    cloudtrail are 'drop'. An unknown source key is absent from the registry.
    """
    expected_policy = {
        "syslog_dir": "keep",
        "pihole_dir": "keep",
        "zeek_dir": "drop",
        "cloudtrail_dir": "drop",
    }
    for source, policy in expected_policy.items():
        assert _SOURCE_LOADERS[source].ts_policy == policy
    assert "unknown_dir" not in _SOURCE_LOADERS
3299
+
3300
+
3301
def test_apply_ts_filter_keep_null_retains_nan_rows() -> None:
    """``keep_null=True`` keeps NaN-ts rows with the in-window rows; the default drops them."""
    import math

    anchor = datetime(2026, 6, 5, 12, 0, tzinfo=timezone.utc)
    rows = [
        {"ts": anchor.timestamp(), "m": "in"},
        {"ts": (anchor - timedelta(days=5)).timestamp(), "m": "old"},
        {"ts": float("nan"), "m": "nan"},
    ]
    frame = pd.DataFrame(rows)
    window_start = anchor - timedelta(days=1)

    kept = _apply_ts_filter(frame, window_start, anchor, keep_null=True)
    assert set(kept["m"]) == {"in", "nan"}

    dropped = _apply_ts_filter(frame, window_start, anchor)  # keep_null left at its default
    assert set(dropped["m"]) == {"in"}
    assert all(not math.isnan(value) for value in dropped["ts"])
3317
+
3318
+
3319
def test_flat_family_default_floor_pihole_and_syslog(tmp_path: Path) -> None:
    """The flat default floor is ``(newest first-ts − span, None)`` for directory inputs.

    Checked for both a pihole directory (two rotation files) and a syslog
    directory (one file). The expected anchor is derived with ``_peek_first_ts``
    — the same peek the floor is compared against — so any year handling for
    yearless timestamps applies to both sides of the comparison.
    """
    span = timedelta(days=1)

    # pihole: two rotation files; pihole.log.1 starts Jun 1, pihole.log starts Jun 5.
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    (pihole_dir / "pihole.log.1").write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] a.test from 192.0.2.1\n", encoding="utf-8"
    )
    (pihole_dir / "pihole.log").write_text(
        "Jun 5 12:00:00 dnsmasq[1]: query[A] b.test from 192.0.2.1\n", encoding="utf-8"
    )
    floor = _flat_default_floor(
        _SOURCE_LOADERS["pihole_dir"], [pihole_dir], "pihole*.log*", span
    )
    assert floor is not None
    # Guard the peek: a None here should fail as an assertion, not as a
    # TypeError raised by ``None - timedelta``.
    newest_pihole = _peek_first_ts(pihole_dir / "pihole.log")
    assert newest_pihole is not None
    assert floor[0] == newest_pihole - span
    assert floor[1] is None

    # syslog: same mechanism, with *.log* discovery.
    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    (syslog_dir / "host.log").write_text(
        "Jun 5 12:00:00 host kernel: line\n", encoding="utf-8"
    )
    sfloor = _flat_default_floor(
        _SOURCE_LOADERS["syslog_dir"], [syslog_dir], "*.log*", span
    )
    assert sfloor is not None
    newest_syslog = _peek_first_ts(syslog_dir / "host.log")
    assert newest_syslog is not None
    assert sfloor[0] == newest_syslog - span
    assert sfloor[1] is None
3352
+
3353
+
3354
def test_flat_family_default_floor_unpeekable_returns_none(tmp_path: Path) -> None:
    """With no parseable first timestamp in any candidate, the floor is ``None``."""
    log_dir = tmp_path / "syslog"
    log_dir.mkdir()
    (log_dir / "host.log").write_text(
        "Xxx 1 12:00:00 host kernel: unparseable month\n", encoding="utf-8"
    )

    floor = _flat_default_floor(
        _SOURCE_LOADERS["syslog_dir"], [log_dir], "*.log*", timedelta(days=1)
    )

    assert floor is None
3362
+
3363
+
3364
def test_flat_family_default_floor_excludes_explicit_files(tmp_path: Path) -> None:
    """An explicit file in the input list does not drive the floor anchor.

    The inputs are an explicit Jun-1 file and a directory holding a Jun-5 file;
    the floor must be anchored on the directory file's first timestamp.
    """
    span = timedelta(days=1)
    explicit = tmp_path / "old.log"
    explicit.write_text("Jun 1 12:00:00 host kernel: old\n", encoding="utf-8")
    d = tmp_path / "syslog"
    d.mkdir()
    (d / "host.log").write_text("Jun 5 12:00:00 host kernel: new\n", encoding="utf-8")

    floor = _flat_default_floor(
        _SOURCE_LOADERS["syslog_dir"], [explicit, d], "*.log*", span
    )

    # Fail with a clear assertion (not a TypeError from subscripting None) if the
    # floor could not be computed.
    assert floor is not None
    # Derive the expected anchor with the same peek, so any year handling for
    # yearless timestamps applies to both sides of the comparison.
    anchor = _peek_first_ts(d / "host.log")
    assert anchor is not None
    # The anchor is the directory file (Jun 5), not the explicit Jun-1 file.
    assert floor[0] == anchor - span
3378
+
3379
+
3380
+ # ── pattern-aware flat discovery (pihole narrowing; explicit-file intent) ─────
3381
+
3382
+
3383
def test_source_default_window_eligible_cloudtrail_opts_out() -> None:
    """Each source strategy declares ``default_window_eligible``; only CloudTrail opts out."""
    expected_flags = {
        "cloudtrail_dir": False,
        "zeek_dir": True,
        "syslog_dir": True,
        "pihole_dir": True,
    }
    for source, eligible in expected_flags.items():
        assert _SOURCE_LOADERS[source].default_window_eligible is eligible
    assert "unknown_dir" not in _SOURCE_LOADERS
3392
+
3393
+
3394
def test_pihole_directory_discovery_narrows_to_pattern(tmp_path: Path) -> None:
    """Directory discovery with ``pihole*.log*`` returns only the pihole files.

    Sibling syslog / cloudtrail files in the same directory are excluded, while
    calling ``_syslog_files`` without a pattern still returns every file.
    """
    shared = tmp_path / "shared"
    shared.mkdir()
    contents = (
        ("pihole.log", "x\n"),
        ("pihole.log.1", "y\n"),
        ("syslog_host.log", "z\n"),
        ("cloudtrail.json.log", "{}\n"),
    )
    for filename, text in contents:
        (shared / filename).write_text(text, encoding="utf-8")

    narrowed = {path.name for path in _syslog_files(shared, "pihole*.log*")}
    assert narrowed == {"pihole.log", "pihole.log.1"}

    # Without an explicit pattern `_syslog_files` uses its broad default and
    # grabs everything in the directory.
    everything = {path.name for path in _syslog_files(shared)}
    assert everything == {
        "pihole.log",
        "pihole.log.1",
        "syslog_host.log",
        "cloudtrail.json.log",
    }
3412
+
3413
+
3414
def test_pihole_explicit_nonmatching_file_still_loads(tmp_path: Path) -> None:
    """An explicit file loads as Pi-hole even when its name misses ``pihole*.log*``.

    The pattern only restricts directory discovery; an explicit file path is
    returned as-is and its single row is loaded.
    """
    log_path = tmp_path / "events.log"
    log_path.write_text(
        "Jun 5 12:00:00 dnsmasq[1]: query[A] a.test from 192.0.2.1\n",
        encoding="utf-8",
    )

    assert _syslog_files(log_path, "pihole*.log*") == [log_path]

    frame = load_pihole(log_path)  # goes through the explicit-file path
    assert len(frame) == 1
3423
+
3424
+
3425
def test_pihole_plan_and_loader_one_universe(tmp_path: Path) -> None:
    """Plan-time satisfiability and the loader agree on which pihole files exist.

    For a directory that holds only a non-pihole file, the plan check reports
    not satisfiable and the loader returns an empty frame with the pihole columns.
    """
    from loghunter.runner import _any_input_yields_files

    log_dir = tmp_path / "syslogonly"
    log_dir.mkdir()
    (log_dir / "syslog_host.log").write_text(
        "Jun 5 12:00:00 host kernel: x\n", encoding="utf-8"
    )

    # Plan side: the pihole pattern matches nothing in this directory.
    satisfiable = _any_input_yields_files("pihole_dir", [log_dir], "pihole*.log*")
    assert satisfiable is False

    # Loader side: no pihole files discovered, so the frame is empty but column-stable.
    frame = load_pihole(log_dir)
    assert len(frame) == 0
    assert list(frame.columns) == _PIHOLE_COLUMNS
3440
+
3441
+
3442
def test_pihole_dir_nonmatching_logs_disclosed_not_silent(tmp_path: Path) -> None:
    """A pihole directory whose logs miss ``pihole*.log*`` loads nothing, with a warning.

    The mismatch must be disclosed through a loader warning instead of being a
    silent miss. A correctly named pihole directory loads its row and produces
    no such warning.
    """
    pattern_map = {"pihole*.log*": "pihole_dir"}
    dnsmasq_line = "Jun 5 12:00:00 dnsmasq[1]: query[A] a.test from 192.0.2.1\n"

    # Case 1: only a mis-named log in the configured directory.
    shared = tmp_path / "shared"
    shared.mkdir()
    (shared / "dnsmasq.log").write_text(dnsmasq_line, encoding="utf-8")
    res = load_required_logs(pattern_map, {"pihole_dir": [shared]})
    assert res.record_counts.get("pihole*.log*", 0) == 0, "non-matching name not loaded"
    assert any("none match 'pihole*.log*'" in w for w in res.warnings), res.warnings

    # Case 2: a correctly named pihole log loads and emits no mismatch warning.
    good = tmp_path / "pihole"
    good.mkdir()
    (good / "pihole.log").write_text(dnsmasq_line, encoding="utf-8")
    res2 = load_required_logs(pattern_map, {"pihole_dir": [good]})
    assert res2.record_counts.get("pihole*.log*", 0) == 1
    assert not any("none match" in w for w in res2.warnings), res2.warnings
3466
+
3467
+
3468
+ # ── syslog content-sniff discovery gate (Item E) ───────────────────────────────
3469
+
3470
def test_syslog_gate_accepts_extensionless_rhel_streams(tmp_path: Path) -> None:
    """Extensionless RHEL/Fedora streams are accepted by content, not by name.

    ``messages``, ``secure``, ``maillog`` and ``cron`` carry no ``.log`` suffix;
    all four rows load, each host comes from its line, and no warning is raised.
    """
    varlog = tmp_path / "varlog"
    varlog.mkdir()
    streams = {
        "messages": "<134>May 31 12:00:00 host-a kernel: link up\n",
        "secure": "<134>May 31 12:01:00 host-b sshd[100]: Accepted publickey for user\n",
        "maillog": "<134>May 31 12:02:00 host-c postfix/smtpd[200]: connect from relay1\n",
        "cron": "<134>May 31 12:03:00 host-d CROND[300]: (root) CMD (placeholder)\n",
    }
    for filename, line in streams.items():
        (varlog / filename).write_text(line, encoding="utf-8")

    result = load_required_logs({"*.log*": "syslog_dir"}, {"syslog_dir": [varlog]})

    frame = result.logs["*.log*"]
    assert len(frame) == 4
    assert set(frame["host"]) == {"host-a", "host-b", "host-c", "host-d"}
    assert result.warnings == []
3493
+
3494
+
3495
def test_syslog_gate_rejects_non_syslog_logs_silently(tmp_path: Path, capsys) -> None:
    """Non-syslog ``.log`` files are dropped by the content gate without stderr noise.

    An ISO-timestamped ``dnf.log`` and a systemd ``boot.log`` contribute no rows,
    and neither filename shows up on stderr even with ``verbose=True``.
    """
    varlog = tmp_path / "varlog"
    varlog.mkdir()
    (varlog / "dnf.log").write_text(
        "2026-06-01T12:00:00+0000 INFO --- logging initialized ---\n",
        encoding="utf-8",
    )
    (varlog / "boot.log").write_text("[ OK ] Started Some Service.\n", encoding="utf-8")
    (varlog / "messages").write_text(
        "<134>May 31 12:00:00 host-a kernel: link up\n", encoding="utf-8"
    )

    result = load_required_logs(
        {"*.log*": "syslog_dir"},
        {"syslog_dir": [varlog]},
        verbose=True,
    )

    frame = result.logs["*.log*"]
    assert len(frame) == 1
    assert set(frame["host"]) == {"host-a"}

    stderr_text = capsys.readouterr().err
    assert "dnf.log" not in stderr_text
    assert "boot.log" not in stderr_text
3516
+
3517
+
3518
def test_syslog_gate_read_is_byte_bounded(tmp_path: Path, monkeypatch) -> None:
    """The gate issues exactly one bounded ``read(_SYSLOG_SNIFF_BYTES)`` on a candidate.

    It must never iterate or ``readline`` the handle: a line-bounded read would
    scan a newline-sparse binary (wtmp/btmp/lastlog) all the way to EOF. The spy
    handle below raises on either access and records every ``read`` size.
    """
    import loghunter.common.loader as loader_pkg

    requested_sizes: list[int] = []

    class _Spy:
        """Fake file handle: records ``read`` sizes and forbids line-wise access."""

        def __enter__(self):
            return self

        def __exit__(self, *exc):
            return False

        def read(self, n):
            requested_sizes.append(n)
            return "\x00\x00\x00"  # NUL content, so the gate treats it as binary and rejects

        def __iter__(self):
            raise AssertionError("gate must not iterate the handle")

        def readline(self, *a):
            raise AssertionError("gate must not readline the handle")

    def _fake_open(p):
        return _Spy()

    candidate = tmp_path / "btmp"
    candidate.write_bytes(b"\x00" * 4096)
    monkeypatch.setattr(loader_pkg, "_open_log", _fake_open)

    verdict = _looks_like_syslog(candidate)

    assert verdict is False
    assert requested_sizes == [_SYSLOG_SNIFF_BYTES]
3550
+
3551
+
3552
def test_syslog_gate_accepts_dnsmasq_bearing_messages(tmp_path: Path) -> None:
    """A ``messages`` file made of dnsmasq query lines is still discovered as syslog.

    dnsmasq lines are RFC 3164, so syslog discovery must accept the file rather
    than leaving it out as DNS content.
    """
    varlog = tmp_path / "varlog"
    varlog.mkdir()
    (varlog / "messages").write_text(
        "<30>May 31 12:00:00 host-a dnsmasq[1]: query[A] a.test from 192.0.2.1\n",
        encoding="utf-8",
    )

    discovered = [path.name for path in _discover_syslog_files(varlog)]

    assert discovered == ["messages"]
3562
+
3563
+
3564
def test_syslog_zero_accepted_dir_one_summary_warning(tmp_path: Path) -> None:
    """Only a directory with candidates but zero accepted streams gets a summary warning.

    - Only non-syslog files: exactly one warning naming the directory path and
      no individual file names.
    - At least one accepted stream: no warning.
    - Empty directory: no warning.
    """
    pattern_map = {"*.log*": "syslog_dir"}

    # Directory with only rejected content.
    bad = tmp_path / "bad"
    bad.mkdir()
    (bad / "dnf.log").write_text("2026-06-01T12:00:00 INFO x\n", encoding="utf-8")
    (bad / "junk").write_bytes(b"\x00\x01\x02")
    res = load_required_logs(pattern_map, {"syslog_dir": [bad]})
    assert res.record_counts.get("*.log*", 0) == 0
    matches = [w for w in res.warnings if "looks like syslog (RFC 3164)" in w]
    assert len(matches) == 1, res.warnings
    summary = matches[0]
    assert "nothing in" in summary
    assert str(bad) in summary
    assert "dnf.log" not in summary
    assert "junk" not in summary

    # Directory with one accepted stream.
    good = tmp_path / "good"
    good.mkdir()
    (good / "messages").write_text(
        "<134>May 31 12:00:00 host-a kernel: x\n", encoding="utf-8"
    )
    res2 = load_required_logs(pattern_map, {"syslog_dir": [good]})
    assert not any("looks like syslog" in w for w in res2.warnings)

    # Directory with no files at all.
    empty = tmp_path / "empty"
    empty.mkdir()
    res3 = load_required_logs(pattern_map, {"syslog_dir": [empty]})
    assert not any("looks like syslog" in w for w in res3.warnings)
3591
+
3592
+
3593
def test_syslog_explicit_file_bypasses_gate(tmp_path: Path) -> None:
    """An explicitly named non-RFC-3164 file is loaded; the content gate does not apply."""
    log_path = tmp_path / "dnf.log"
    log_path.write_text("2026-06-01T12:00:00 INFO x\n", encoding="utf-8")

    assert _discover_syslog_files(log_path) == [log_path]

    frame = load_syslog(log_path)
    assert len(frame) == 1
3600
+
3601
+
3602
def test_syslog_plan_time_lockstep_with_gate(tmp_path: Path) -> None:
    """Plan-time satisfiability follows the same content gate as loading.

    A directory holding only ``dnf.log`` is not satisfiable for syslog; a
    directory holding a ``messages`` stream is.
    """
    from loghunter.runner import _any_input_yields_files

    dnf_only = tmp_path / "dnf"
    dnf_only.mkdir()
    (dnf_only / "dnf.log").write_text("2026-06-01T12:00:00 INFO x\n", encoding="utf-8")
    rejected = _any_input_yields_files("syslog_dir", [dnf_only], "*.log*")
    assert rejected is False

    msgs = tmp_path / "msgs"
    msgs.mkdir()
    (msgs / "messages").write_text(
        "<134>May 31 12:00:00 host-a kernel: x\n", encoding="utf-8"
    )
    accepted = _any_input_yields_files("syslog_dir", [msgs], "*.log*")
    assert accepted is True
3617
+
3618
+
3619
+ def test_syslog_default_window_floor_anchors_on_accepted_only(tmp_path: Path) -> None:
3620
+ """flat_family_default_floor over a syslog dir with `dnf.log` (ISO, gate-
3621
+ rejected) + a binary + RFC-3164 streams anchors f_max on the MAX accepted
3622
+ candidate's peek ts — rejected files never contribute a peek."""
3623
+ d = tmp_path / "varlog"
3624
+ d.mkdir()
3625
+ (d / "messages").write_text(
3626
+ "<134>May 31 12:00:00 host-a kernel: x\n", encoding="utf-8")
3627
+ (d / "secure").write_text( # later ts → should win f_max
3628
+ "<134>Jun 1 12:00:00 host-b sshd[1]: x\n", encoding="utf-8")
3629
+ (d / "dnf.log").write_text("2026-06-01T12:00:00 INFO x\n", encoding="utf-8")
3630
+ (d / "junk").write_bytes(b"\x00\x01\x02")
3631
+
3632
+ span = timedelta(days=1)
3633
+ floor = _flat_default_floor(_SOURCE_LOADERS["syslog_dir"], [d], "*.log*", span)
3634
+ assert floor is not None
3635
+ f_max, until = floor
3636
+ assert until is None
3637
+ later = _peek_first_ts(d / "secure")
3638
+ assert later is not None
3639
+ assert f_max == later - span