loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
tests/test_loader.py
ADDED
|
@@ -0,0 +1,3639 @@
|
|
|
1
|
+
"""Tests for log loading metadata, normalization, and schema warnings."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import bz2
|
|
6
|
+
import gzip
|
|
7
|
+
import json
|
|
8
|
+
import lzma
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from datetime import date, timedelta
|
|
16
|
+
|
|
17
|
+
from loghunter.common.loader import (
|
|
18
|
+
_CLOUDTRAIL_COLUMNS,
|
|
19
|
+
_PIHOLE_COLUMNS,
|
|
20
|
+
_SYSLOG_SNIFF_BYTES,
|
|
21
|
+
_SOURCE_LOADERS,
|
|
22
|
+
_apply_ts_filter,
|
|
23
|
+
_classify_rotation_name,
|
|
24
|
+
_discover_syslog_files,
|
|
25
|
+
_flat_default_floor,
|
|
26
|
+
_looks_like_syslog,
|
|
27
|
+
_peek_first_ts,
|
|
28
|
+
_rotation_windowed_files,
|
|
29
|
+
_schema_warning,
|
|
30
|
+
_select_group,
|
|
31
|
+
_syslog_files,
|
|
32
|
+
_zeek_dated_window,
|
|
33
|
+
CoverageTracker,
|
|
34
|
+
RotationSkipInfo,
|
|
35
|
+
SourceCoverage,
|
|
36
|
+
discover_cloudtrail_files,
|
|
37
|
+
discover_zeek_files,
|
|
38
|
+
is_bounded,
|
|
39
|
+
is_zeek_bounded,
|
|
40
|
+
load_cloudtrail,
|
|
41
|
+
load_logs,
|
|
42
|
+
load_pihole,
|
|
43
|
+
load_required_logs,
|
|
44
|
+
load_syslog,
|
|
45
|
+
)
|
|
46
|
+
from loghunter.exporters import _auto_filename
|
|
47
|
+
from loghunter.parsers.syslog import parse_timestamp
|
|
48
|
+
from loghunter.parsers.zeek import _normalize_dns_df
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _write_ndjson(path: Path, records: list[dict]) -> None:
    """Write *records* to *path* as newline-delimited JSON (UTF-8).

    Each record is serialized with ``json.dumps`` onto its own line and the
    file always ends with a trailing newline (an empty *records* list yields
    a file containing a single newline).

    Missing parent directories are created first, matching the behaviour of
    the CloudTrail fixture writers in this module, so callers may target a
    nested path without a separate ``mkdir`` call.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = [json.dumps(record) for record in records]
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ── CloudTrail fixture helpers ────────────────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
# Twelve-digit account id embedded in the fixture user's ARN.
_CT_DOCS_ACCOUNT = "123456789012"


def _ct_event(**overrides) -> dict:
    """Return a fresh, minimal CloudTrail event dict for loader fixtures.

    Keyword arguments replace or extend the top-level keys of the default
    event. The merge is shallow: passing ``userIdentity`` swaps out the
    whole nested mapping rather than merging into it.
    """
    user_name = "placeholder-user"
    identity = {
        "type": "IAMUser",
        "userName": user_name,
        "principalId": "AIDAEXAMPLE",
        "arn": f"arn:aws:iam::{_CT_DOCS_ACCOUNT}:user/placeholder-user",
    }
    defaults: dict = {
        "eventTime": "2026-06-01T12:00:00Z",
        "eventSource": "s3.amazonaws.com",
        "eventName": "GetObject",
        "eventID": "11111111-1111-1111-1111-111111111111",
        "awsRegion": "us-east-1",
        "sourceIPAddress": "192.0.2.10",
        "userIdentity": identity,
        "readOnly": True,
    }
    # Overridden keys keep their original position; new keys are appended.
    return {**defaults, **overrides}
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def _ct_write_ndjson(path: Path, events: list[dict]) -> None:
    """Dump *events* to *path* as NDJSON, one object per line (the exporter wire shape).

    Parent directories are created when absent; the file is UTF-8 and ends
    with a trailing newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    serialized = [json.dumps(event) for event in events]
    body = "\n".join(serialized) + "\n"
    path.write_text(body, encoding="utf-8")
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _ct_write_envelope_gz(path: Path, events: list[dict]) -> None:
    """Store *events* as a gzipped, single-line ``{"Records":[...]}`` envelope (native S3 shape).

    Parent directories are created when absent.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    envelope = {"Records": events}
    raw = json.dumps(envelope).encode("utf-8")
    path.write_bytes(gzip.compress(raw))
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def test_load_required_logs_normalizes_conn_and_reports_window(tmp_path: Path) -> None:
    """Two NDJSON conn records load under canonical column names with a matching data window."""
    first_ts = 1_779_750_000.0
    last_ts = 1_779_753_600.0
    pattern = "conn*.log*"

    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    records = [
        {
            "ts": first_ts,
            "id.orig_h": "192.0.2.10",
            "id.resp_h": "198.51.100.20",
            "id.resp_p": 443,
            "proto": "tcp",
        },
        {
            "ts": last_ts,
            "id.orig_h": "192.0.2.11",
            "id.resp_h": "203.0.113.20",
            "id.resp_p": 22,
            "proto": "tcp",
        },
    ]
    _write_ndjson(zeek_dir / "conn.log", records)

    result = load_required_logs({pattern: "zeek_dir"}, {"zeek_dir": [zeek_dir]})

    # The Zeek-native id.* fields must surface as src / dst / port.
    first_row = result.logs[pattern][["src", "dst", "port"]].iloc[0]
    assert list(first_row) == ["192.0.2.10", "198.51.100.20", 443]
    assert result.record_counts == {pattern: 2}
    assert result.data_window == tuple(
        datetime.fromtimestamp(ts, tz=timezone.utc) for ts in (first_ts, last_ts)
    )
    assert result.warnings == []
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def test_load_required_logs_warns_on_missing_canonical_fields(tmp_path: Path) -> None:
    """A conn record lacking dst/port/proto is still counted but yields one schema warning."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    sparse_record = {"ts": 1_779_750_000.0, "id.orig_h": "192.0.2.10"}
    _write_ndjson(zeek_dir / "conn.log", [sparse_record])

    patterns = {"conn*.log*": "zeek_dir"}
    sources = {"zeek_dir": [zeek_dir]}
    result = load_required_logs(patterns, sources)

    expected_warning = (
        "conn.log fields not found: dst, port, proto — is this a Zeek conn.log?"
    )
    assert result.record_counts == {"conn*.log*": 1}
    assert result.warnings == [expected_warning]
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def test_load_required_logs_warns_when_source_missing() -> None:
    """With no sources configured, nothing is loaded and a single warning is reported."""
    no_sources: dict = {}
    result = load_required_logs({"conn*.log*": "zeek_dir"}, no_sources)

    assert result.logs == {}
    assert result.record_counts == {}
    assert result.data_window is None
    expected = ["zeek_dir not configured — conn*.log* not loaded"]
    assert result.warnings == expected
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def test_schema_warning_does_not_fire_for_missing_duration() -> None:
    """A conn frame without ``duration`` produces no schema warning.

    duration is optional — Zeek omits it for connections that have not closed.
    """
    row = {
        "src": "192.0.2.10",
        "dst": "198.51.100.20",
        "port": 443,
        "proto": "tcp",
        "ts": 1_779_750_000.0,
    }
    frame = pd.DataFrame([row])
    assert _schema_warning("conn*.log*", frame) is None
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def test_schema_warning_fires_for_missing_required_conn_field() -> None:
    """Leaving out ``proto`` must still warn, even though ``duration`` is present.

    Optional-column subtraction must not suppress warnings for truly required fields.
    """
    # proto deliberately absent
    row = {
        "src": "192.0.2.10",
        "dst": "198.51.100.20",
        "port": 443,
        "ts": 1_779_750_000.0,
        "duration": 600.0,
    }
    frame = pd.DataFrame([row])

    warning = _schema_warning("conn*.log*", frame)

    assert warning is not None
    assert "proto" in warning
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def test_load_required_logs_routes_pihole_dir(tmp_path: Path) -> None:
    """The ``pihole_dir`` source key yields a one-row frame carrying every ``_PIHOLE_COLUMNS`` column."""
    pattern = "pihole*.log*"
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    log_text = "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.1\n"
    (pihole_dir / "pihole.log").write_text(log_text, encoding="utf-8")

    result = load_required_logs({pattern: "pihole_dir"}, {"pihole_dir": [pihole_dir]})

    df = result.logs[pattern]
    assert set(_PIHOLE_COLUMNS).issubset(set(df.columns))
    assert len(df) == 1
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def test_load_required_logs_routes_cloudtrail_dir(tmp_path: Path) -> None:
    """The ``cloudtrail_dir`` source key yields a frame whose columns are exactly ``_CLOUDTRAIL_COLUMNS``."""
    pattern = "*.json*"
    cloudtrail_dir = tmp_path / "cloudtrail"
    cloudtrail_dir.mkdir()
    _ct_write_ndjson(cloudtrail_dir / "events.json.log", [_ct_event()])

    sources = {"cloudtrail_dir": [cloudtrail_dir]}
    result = load_required_logs({pattern: "cloudtrail_dir"}, sources)

    df = result.logs[pattern]
    assert list(df.columns) == _CLOUDTRAIL_COLUMNS
    assert len(df) == 1
    assert result.record_counts == {pattern: 1}
    assert result.data_size_bytes > 0
    assert result.warnings == []
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def test_load_required_logs_raises_for_unknown_source_key(tmp_path: Path) -> None:
    """An unrecognised source key raises ``ValueError`` whose message names that key."""
    bogus_dir = tmp_path / "bogus"
    bogus_dir.mkdir()
    patterns = {"*.log*": "bogus_dir"}
    sources = {"bogus_dir": [bogus_dir]}

    with pytest.raises(ValueError, match="bogus_dir"):
        load_required_logs(patterns, sources)
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
def test_normalize_dns_df_renames_and_applies_qclass_aperture() -> None:
    """Check the renames and the qclass filter applied by ``_normalize_dns_df``.

    Zeek-native DNS columns must come back under canonical names, only the
    qclass==1 row may survive, qclass itself is dropped and qtype is kept.
    """
    # qclass=1 (internet) — must be kept
    internet_row = {
        "id.orig_h": "192.0.2.1", "TTLs": [300.0], "answers": ["198.51.100.1"],
        "TC": 0, "qclass": 1, "qtype": 1, "query": "example.com",
        "ts": 1.0, "rtt": 0.05, "rcode": 0,
    }
    # qclass=2 (CSNET, obsolete) — must be dropped
    csnet_row = {
        "id.orig_h": "192.0.2.2", "TTLs": [60.0], "answers": ["198.51.100.2"],
        "TC": 0, "qclass": 2, "qtype": 1, "query": "other.com",
        "ts": 2.0, "rtt": 0.03, "rcode": 0,
    }
    # qclass=None — must be dropped (== 1 drops nulls)
    null_class_row = {
        "id.orig_h": "192.0.2.3", "TTLs": None, "answers": None,
        "TC": 0, "qclass": None, "qtype": 1, "query": "null-class.com",
        "ts": 3.0, "rtt": None, "rcode": None,
    }

    result = _normalize_dns_df(pd.DataFrame([internet_row, csnet_row, null_class_row]))

    assert len(result) == 1, "only the qclass=1 row should survive"

    expected_present = (
        ("src", "id.orig_h should be renamed to src"),
        ("ttl", "TTLs should be renamed to ttl"),
        ("answer", "answers should be renamed to answer"),
        ("tc", "TC should be renamed to tc"),
    )
    for column, reason in expected_present:
        assert column in result.columns, reason

    assert "qclass" not in result.columns, "qclass must be dropped"
    assert "qtype" in result.columns, "qtype must be carried through (raw numeric code)"
    assert "id.orig_h" not in result.columns, "Zeek-native id.orig_h must not remain"

    assert result.iloc[0]["src"] == "192.0.2.1"

    # rtt and rcode are already canonical — must pass through unchanged
    for column in ("rtt", "rcode"):
        assert column in result.columns
|
|
284
|
+
|
|
285
|
+
|
|
286
|
+
def test_normalize_dns_df_carries_qtype_as_raw_numeric() -> None:
    """qtype survives normalization as Zeek's raw numeric code.

    No rename and no mnemonic translation is expected; the qclass aperture
    and the qclass column drop still apply.
    """

    def _row(src: str, query: str, ts: float, qclass: int, qtype: int) -> dict:
        # Same key order for every fixture row.
        return {
            "id.orig_h": src, "query": query,
            "ts": ts, "qclass": qclass, "qtype": qtype, "rcode": 0,
        }

    frame = pd.DataFrame([
        # qclass=1, qtype=1 (A) — must survive with qtype preserved
        _row("192.0.2.10", "alpha.invalid", 1.0, 1, 1),
        # qclass=1, qtype=28 (AAAA) — must survive with qtype preserved
        _row("192.0.2.11", "beta.invalid", 2.0, 1, 28),
        # qclass=2 (CSNET) — must be dropped by the aperture
        _row("192.0.2.12", "gamma.invalid", 3.0, 2, 1),
    ])

    normalized = _normalize_dns_df(frame)
    result = normalized.reset_index(drop=True)

    # Aperture still working — CSNET row dropped
    assert len(result) == 2, "qclass=2 row must be dropped by the aperture"
    # qclass still dropped from the output frame
    assert "qclass" not in result.columns, "qclass must be dropped"
    # qtype carried through as raw numeric (1 for A, 28 for AAAA — no mnemonic)
    assert "qtype" in result.columns, "qtype must be carried through"
    assert list(result["qtype"]) == [1, 28]
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
# ── Pi-hole / dnsmasq loader tests ───────────────────────────────────────────
|
|
319
|
+
|
|
320
|
+
# One dnsmasq query line and a reply line stamped one second later,
# shared by the Pi-hole loader tests.
_PIHOLE_LINE_QUERY = (
    "Jun 1 12:00:00 dnsmasq[1]: "
    "query[A] example.test from 192.0.2.1"
)
_PIHOLE_LINE_REPLY = (
    "Jun 1 12:00:01 dnsmasq[1]: "
    "reply example.test is 203.0.113.1"
)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
def test_load_pihole_plain_fixture(tmp_path: Path) -> None:
    """A directory holding a query line and a reply line loads as two ``_PIHOLE_COLUMNS`` rows."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    log_text = f"{_PIHOLE_LINE_QUERY}\n{_PIHOLE_LINE_REPLY}\n"
    (pihole_dir / "pihole.log").write_text(log_text, encoding="utf-8")

    df = load_pihole(pihole_dir)

    assert list(df.columns) == _PIHOLE_COLUMNS
    assert len(df) == 2
    query_row, reply_row = df.iloc[0], df.iloc[1]
    assert query_row["event_type"] == "query"
    assert query_row["src"] == "192.0.2.1"
    assert reply_row["event_type"] == "reply"
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def test_load_pihole_single_file_path(tmp_path: Path) -> None:
    """``load_pihole`` may be pointed at a single log file rather than a directory."""
    log_file = tmp_path / "pihole.log"
    log_file.write_text(f"{_PIHOLE_LINE_QUERY}\n", encoding="utf-8")

    loaded = load_pihole(log_file)

    assert list(loaded.columns) == _PIHOLE_COLUMNS
    assert len(loaded) == 1
|
|
346
|
+
|
|
347
|
+
|
|
348
|
+
def test_load_pihole_gzip_fixture(tmp_path: Path) -> None:
    """A gzip-compressed dnsmasq log yields the same two rows and columns as the plain fixture."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    archive = pihole_dir / "pihole.log.gz"
    with gzip.open(archive, "wt", encoding="utf-8") as handle:
        handle.write(f"{_PIHOLE_LINE_QUERY}\n{_PIHOLE_LINE_REPLY}\n")

    loaded = load_pihole(pihole_dir)

    assert list(loaded.columns) == _PIHOLE_COLUMNS
    assert len(loaded) == 2
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def test_load_pihole_ndjson_skipped(tmp_path: Path, capsys: pytest.CaptureFixture) -> None:
    """An NDJSON file caught by the pihole glob is skipped while the dnsmasq file loads.

    The wrong-format file is named to match ``pihole*.log*`` so that it is
    discovered in the first place — the skip itself is what is under test.
    By default nothing is written to stderr; with ``verbose=True`` the skip
    is reported there, naming the file and the NDJSON format.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    fixtures = {
        "pihole.ndjson.log": '{"ts": 1.0}\n',
        "pihole.log": f"{_PIHOLE_LINE_QUERY}\n",
    }
    for name, text in fixtures.items():
        (pihole_dir / name).write_text(text, encoding="utf-8")

    # Default run: quiet skip.
    quiet = load_pihole(pihole_dir)
    assert len(quiet) == 1
    assert capsys.readouterr().err == ""

    # Verbose run: the skip is announced on stderr.
    loud = load_pihole(pihole_dir, verbose=True)
    assert len(loud) == 1
    stderr_text = capsys.readouterr().err
    assert "pihole.ndjson.log" in stderr_text
    assert "NDJSON" in stderr_text
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def test_load_pihole_empty_file(tmp_path: Path) -> None:
    """A zero-byte log produces an empty frame that still has the canonical columns."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    empty_log = pihole_dir / "pihole.log"
    empty_log.write_text("", encoding="utf-8")

    loaded = load_pihole(pihole_dir)

    assert list(loaded.columns) == _PIHOLE_COLUMNS
    assert len(loaded) == 0
|
|
388
|
+
|
|
389
|
+
|
|
390
|
+
def test_load_pihole_malformed_lines_dropped(tmp_path: Path) -> None:
    """A non-dnsmasq line between two valid lines is discarded; both valid lines remain."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    log_text = f"{_PIHOLE_LINE_QUERY}\nnot a dnsmasq line at all\n{_PIHOLE_LINE_REPLY}\n"
    (pihole_dir / "pihole.log").write_text(log_text, encoding="utf-8")

    loaded = load_pihole(pihole_dir)

    assert len(loaded) == 2
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def test_load_pihole_hostname_from_stem(tmp_path: Path) -> None:
    """Every row's ``host`` equals the log filename stem rather than anything in the log content."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    router_log = pihole_dir / "pihole-router.log"
    router_log.write_text(
        f"{_PIHOLE_LINE_QUERY}\n{_PIHOLE_LINE_REPLY}\n", encoding="utf-8"
    )

    loaded = load_pihole(pihole_dir)

    host_matches = loaded["host"] == "pihole-router"
    assert host_matches.all()
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def test_load_pihole_timefilter_keeps_nan_ts(tmp_path: Path) -> None:
    """The since/until filter keeps rows whose timestamp could not be parsed (NaN ``ts``)."""
    import math

    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    # Outer regex matches "Xxx" (\w{3}) but strptime fails on it → parse_timestamp returns None
    nan_ts_line = "Xxx 1 12:00:00 dnsmasq[1]: query[A] other.test from 192.0.2.2"
    log_text = f"{_PIHOLE_LINE_QUERY}\n{nan_ts_line}\n"
    (pihole_dir / "pihole.log").write_text(log_text, encoding="utf-8")

    # The window is built around 1 June of the current UTC year, 11:00-13:00.
    current_year = datetime.now(timezone.utc).year
    since, until = (
        datetime(current_year, 6, 1, hour, 0, 0, tzinfo=timezone.utc)
        for hour in (11, 13)
    )

    df = load_pihole(pihole_dir, since, until)

    assert len(df) == 2
    unparsed_ts = df.loc[df["query"] == "other.test", "ts"].iloc[0]
    assert math.isnan(unparsed_ts)
|
|
429
|
+
|
|
430
|
+
|
|
431
|
+
def test_schema_warning_no_ops_for_pihole_pattern(tmp_path: Path) -> None:
    """``_schema_warning`` yields ``None`` for the pihole pattern — no required-column contract."""
    row = {"ts": 1.0, "query": "example.test", "src": "192.0.2.1"}
    frame = pd.DataFrame([row])

    warning = _schema_warning("pihole*.log*", frame)

    assert warning is None
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
# ── TSV load path tests ───────────────────────────────────────────────────────
|
|
438
|
+
|
|
439
|
+
# Header block for conn TSV fixtures: four directive lines followed by the
# tab-separated #fields and #types rows; every line is newline-terminated.
_CONN_TSV_HEADER = "".join(
    f"{line}\n"
    for line in (
        "#separator \\x09",
        "#set_separator ,",
        "#empty_field (empty)",
        "#unset_field -",
        "\t".join(
            ("#fields", "ts", "uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto")
        ),
        "\t".join(
            ("#types", "time", "string", "addr", "port", "addr", "port", "enum")
        ),
    )
)
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
def _write_tsv(path: Path, content: str) -> None:
    """Write *content* verbatim to *path* as UTF-8 text.

    Missing parent directories are created first, matching the behaviour of
    the CloudTrail fixture writers in this module, so callers may target a
    nested path without a separate ``mkdir`` call.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content, encoding="utf-8")
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def test_load_logs_mixed_ndjson_and_tsv(tmp_path: Path) -> None:
    """An NDJSON file and a TSV file in one directory both load, with canonical columns present."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    ndjson_rows = [
        {"ts": ts, "id.orig_h": "192.0.2.1", "id.resp_h": "198.51.100.1",
         "id.resp_p": port, "proto": "tcp"}
        for ts, port in ((1000.0, 443), (1001.0, 80))
    ]
    _write_ndjson(zeek_dir / "conn-ndjson.log", ndjson_rows)

    tsv_rows = (
        "2000.0\tCabc1\t192.0.2.2\t54321\t198.51.100.2\t443\ttcp\n"
        "2001.0\tCabc2\t192.0.2.2\t54322\t198.51.100.2\t80\ttcp\n"
    )
    _write_tsv(zeek_dir / "conn-tsv.log", _CONN_TSV_HEADER + tsv_rows)

    df = load_logs(zeek_dir, "conn*.log*")

    assert len(df) == 4
    for name in ("src", "dst", "port", "proto"):
        assert name in df.columns, f"canonical column {name!r} missing"
    # One source address per encoding.
    assert set(df["src"].tolist()) == {"192.0.2.1", "192.0.2.2"}
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
def test_load_logs_timeframe_filter_applies_across_encodings(tmp_path: Path) -> None:
    """The since/until window trims rows from the NDJSON file and the TSV file alike."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    # NDJSON: ts=100.0 in-window, ts=50.0 out-of-window
    ndjson_rows = [
        {"ts": ts, "id.orig_h": "192.0.2.1", "id.resp_h": "198.51.100.1",
         "id.resp_p": 443, "proto": "tcp"}
        for ts in (100.0, 50.0)
    ]
    _write_ndjson(zeek_dir / "conn-ndjson.log", ndjson_rows)

    # TSV: ts=200.0 in-window, ts=300.0 out-of-window
    tsv_rows = (
        "200.0\tCabc1\t192.0.2.3\t54321\t198.51.100.3\t443\ttcp\n"
        "300.0\tCabc2\t192.0.2.3\t54322\t198.51.100.3\t443\ttcp\n"
    )
    _write_tsv(zeek_dir / "conn-tsv.log", _CONN_TSV_HEADER + tsv_rows)

    window_start, window_end = (
        datetime.fromtimestamp(epoch, tz=timezone.utc) for epoch in (75.0, 250.0)
    )
    df = load_logs(zeek_dir, "conn*.log*", since=window_start, until=window_end)

    assert len(df) == 2
    assert set(df["ts"].tolist()) == {100.0, 200.0}
    assert set(df["src"].tolist()) == {"192.0.2.1", "192.0.2.3"}
|
|
512
|
+
|
|
513
|
+
|
|
514
|
+
def test_load_logs_tsv_vector_addr_and_set_enum(tmp_path: Path) -> None:
    """``vector[addr]`` and ``set[enum]`` TSV fields come out of the load path as Python lists."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    tsv_lines = [
        "#separator \\x09",
        "#set_separator ,",
        "#empty_field (empty)",
        "#unset_field -",
        "#fields\tts\taddrs\tactions",
        "#types\ttime\tvector[addr]\tset[enum]",
        "1000.0\t192.0.2.1,192.0.2.2\tWeird::ACTIVITY,Weird::NOTICE",
    ]
    _write_tsv(zeek_dir / "weird-tsv.log", "\n".join(tsv_lines) + "\n")

    df = load_logs(zeek_dir, "weird*.log*")

    assert len(df) == 1
    only_row = df.iloc[0]

    addrs = only_row["addrs"]
    assert isinstance(addrs, list)
    assert addrs == ["192.0.2.1", "192.0.2.2"]

    # Set members are compared without regard to order.
    actions = only_row["actions"]
    assert isinstance(actions, list)
    assert set(actions) == {"Weird::ACTIVITY", "Weird::NOTICE"}
|
|
540
|
+
|
|
541
|
+
|
|
542
|
+
# ── Dated-directory layout tests ─────────────────────────────────────────────
|
|
543
|
+
#
|
|
544
|
+
# Epoch timestamps used below map to the following UTC calendar dates:
|
|
545
|
+
# 1767225600 → 2026-01-01 00:00:00 UTC
|
|
546
|
+
# 1767312000 → 2026-01-02 00:00:00 UTC
|
|
547
|
+
# 1767398400 → 2026-01-03 00:00:00 UTC
|
|
548
|
+
# 1767427200 → 2026-01-03 08:00:00 UTC
|
|
549
|
+
# 1767463200 → 2026-01-03 18:00:00 UTC
|
|
550
|
+
# 1767484800 → 2026-01-04 00:00:00 UTC
|
|
551
|
+
|
|
552
|
+
# Midnight UTC for 1-5 January 2026.
_JAN1, _JAN2, _JAN3, _JAN4, _JAN5 = (
    datetime(2026, 1, day, tzinfo=timezone.utc) for day in range(1, 6)
)

# Epoch-second equivalents of the instants above, plus 08:00 and 18:00 on 3 January.
_TS_JAN1, _TS_JAN2, _TS_JAN3, _TS_JAN4 = (
    moment.timestamp() for moment in (_JAN1, _JAN2, _JAN3, _JAN4)
)  # 1767225600.0, 1767312000.0, 1767398400.0, 1767484800.0
_TS_JAN3_08 = _JAN3.replace(hour=8).timestamp()  # 1767427200.0
_TS_JAN3_18 = _JAN3.replace(hour=18).timestamp()  # 1767463200.0
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
def test_load_logs_flat_layout_unchanged(tmp_path: Path) -> None:
    """Regression guard: a flat directory (no YYYY-MM-DD subdirs) still loads every matching file."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    # (file name, ts, src, dst, port) — one single-record file each.
    fixtures = (
        ("conn-a.log", _TS_JAN1, "192.0.2.1", "198.51.100.1", 443),
        ("conn-b.log", _TS_JAN2, "192.0.2.2", "198.51.100.2", 80),
    )
    for file_name, ts, src, dst, port in fixtures:
        record = {"ts": ts, "id.orig_h": src, "id.resp_h": dst,
                  "id.resp_p": port, "proto": "tcp"}
        _write_ndjson(zeek_dir / file_name, [record])

    df = load_logs(zeek_dir, "conn*.log*")

    assert len(df) == 2
    for name in ("src", "dst", "port", "proto"):
        assert name in df.columns
    assert set(df["src"].tolist()) == {"192.0.2.1", "192.0.2.2"}
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
def test_load_logs_dated_layout_discovers_subdirs(tmp_path: Path) -> None:
    """Files placed inside YYYY-MM-DD subdirectories are found and concatenated."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()

    # day directory -> (first ts, src, dst, ports of the two records)
    fixtures = {
        "2026-01-01": (_TS_JAN1, "192.0.2.1", "198.51.100.1", (443, 80)),
        "2026-01-02": (_TS_JAN2, "198.51.100.2", "203.0.113.1", (443, 22)),
    }
    for day_name, (base_ts, src, dst, ports) in fixtures.items():
        day_dir = zeek_dir / day_name
        day_dir.mkdir()
        # The second record of each day is stamped one second after the first.
        records = [
            {"ts": base_ts + offset, "id.orig_h": src, "id.resp_h": dst,
             "id.resp_p": port, "proto": "tcp"}
            for offset, port in enumerate(ports)
        ]
        _write_ndjson(day_dir / "conn.log", records)

    df = load_logs(zeek_dir, "conn*.log*")

    assert len(df) == 4
    sources_seen = df["src"].tolist()
    assert "192.0.2.1" in sources_seen
    assert "198.51.100.2" in sources_seen
|
|
619
|
+
|
|
620
|
+
|
|
621
|
+
def test_load_logs_date_pruning_skips_out_of_window_dirs(tmp_path: Path) -> None:
    """Date pruning: only the subdir whose name falls inside the window is loaded.

    The 2026-01-01 directory holds a ``.gz`` file containing non-gzip bytes. Per
    the original author's note, opening it would raise BadGzipFile, so a clean
    run is taken as evidence that the directory was pruned by name rather than
    read and filtered afterwards.
    """
    root = tmp_path / "zeek"
    root.mkdir()

    # Day 1 (before the window): deliberately unreadable gzip payload.
    before_dir = root / "2026-01-01"
    before_dir.mkdir()
    (before_dir / "conn.00:00:00-01:00:00.log.gz").write_bytes(b"NOTGZIP")

    # Day 2 (the window): two well-formed rows from a single source.
    inside_dir = root / "2026-01-02"
    inside_dir.mkdir()
    inside_rows = [
        {"ts": _TS_JAN2, "id.orig_h": "192.0.2.10", "id.resp_h": "198.51.100.10",
         "id.resp_p": 443, "proto": "tcp"},
        {"ts": _TS_JAN2 + 1, "id.orig_h": "192.0.2.10", "id.resp_h": "198.51.100.10",
         "id.resp_p": 80, "proto": "tcp"},
    ]
    _write_ndjson(inside_dir / "conn.log", inside_rows)

    # Day 3 (after the window): one well-formed row that must not appear.
    after_dir = root / "2026-01-03"
    after_dir.mkdir()
    after_rows = [
        {"ts": _TS_JAN3, "id.orig_h": "192.0.2.11", "id.resp_h": "198.51.100.11",
         "id.resp_p": 443, "proto": "tcp"},
    ]
    _write_ndjson(after_dir / "conn.log", after_rows)

    window_end = datetime(2026, 1, 2, 23, 59, 59, tzinfo=timezone.utc)
    frame = load_logs(root, "conn*.log*", since=_JAN2, until=window_end)

    assert len(frame) == 2
    assert set(frame["src"].tolist()) == {"192.0.2.10"}
|
|
661
|
+
|
|
662
|
+
|
|
663
|
+
def test_load_logs_dated_boundary_day_included(tmp_path: Path) -> None:
    """A mid-day ``since`` keeps the boundary day's subdir; only the later row remains."""
    root = tmp_path / "zeek"
    root.mkdir()
    jan3_dir = root / "2026-01-03"
    jan3_dir.mkdir()

    row_08 = {"ts": _TS_JAN3_08, "id.orig_h": "192.0.2.20", "id.resp_h": "198.51.100.20",
              "id.resp_p": 443, "proto": "tcp"}
    row_18 = {"ts": _TS_JAN3_18, "id.orig_h": "192.0.2.21", "id.resp_h": "198.51.100.21",
              "id.resp_p": 443, "proto": "tcp"}
    _write_ndjson(jan3_dir / "conn.log", [row_08, row_18])

    # The window opens at 12:00 UTC on Jan 3; the _TS_JAN3_18 row is the expected survivor.
    window_start = datetime(2026, 1, 3, 12, 0, 0, tzinfo=timezone.utc)
    frame = load_logs(root, "conn*.log*", since=window_start)

    assert len(frame) == 1
    assert frame.iloc[0]["src"] == "192.0.2.21"
|
|
684
|
+
|
|
685
|
+
|
|
686
|
+
def test_load_logs_dated_suffix_dir_treated_as_date(tmp_path: Path) -> None:
    """A ``YYYY-MM-DD-SUFFIX`` directory is windowed by its date prefix alone."""
    root = tmp_path / "zeek"
    root.mkdir()
    suffixed_dir = root / "2026-01-02-TSVPRE"
    plain_dir = root / "2026-01-04"
    suffixed_dir.mkdir()
    plain_dir.mkdir()

    suffixed_rows = [
        {"ts": _TS_JAN2, "id.orig_h": "192.0.2.30", "id.resp_h": "198.51.100.30",
         "id.resp_p": 443, "proto": "tcp"},
        {"ts": _TS_JAN2 + 1, "id.orig_h": "192.0.2.30", "id.resp_h": "198.51.100.30",
         "id.resp_p": 80, "proto": "tcp"},
    ]
    plain_rows = [
        {"ts": _TS_JAN4, "id.orig_h": "192.0.2.31", "id.resp_h": "198.51.100.31",
         "id.resp_p": 443, "proto": "tcp"},
        {"ts": _TS_JAN4 + 1, "id.orig_h": "192.0.2.31", "id.resp_h": "198.51.100.31",
         "id.resp_p": 80, "proto": "tcp"},
    ]
    _write_ndjson(suffixed_dir / "conn.log", suffixed_rows)
    _write_ndjson(plain_dir / "conn.log", plain_rows)

    # Jan 2 – Jan 3 window: the suffixed directory is in, the Jan 4 directory is out.
    early = load_logs(root, "conn*.log*", since=_JAN2, until=_JAN3)
    assert len(early) == 2
    assert set(early["src"].tolist()) == {"192.0.2.30"}

    # Jan 4 – Jan 5 window: the Jan 4 directory is in, the suffixed directory is out.
    late = load_logs(root, "conn*.log*", since=_JAN4, until=_JAN5)
    assert len(late) == 2
    assert set(late["src"].tolist()) == {"192.0.2.31"}
|
|
716
|
+
|
|
717
|
+
|
|
718
|
+
def test_load_logs_dated_symlink_deduplication(tmp_path: Path) -> None:
    """A ``current`` symlink to a date subdir must not double the rows, windowed or not."""
    root = tmp_path / "zeek"
    root.mkdir()
    real_dir = root / "2026-01-01"
    real_dir.mkdir()
    rows = [
        {"ts": _TS_JAN1, "id.orig_h": "192.0.2.40", "id.resp_h": "198.51.100.40",
         "id.resp_p": 443, "proto": "tcp"},
        {"ts": _TS_JAN1 + 1, "id.orig_h": "192.0.2.40", "id.resp_h": "198.51.100.40",
         "id.resp_p": 80, "proto": "tcp"},
    ]
    _write_ndjson(real_dir / "conn.log", rows)
    alias = root / "current"
    alias.symlink_to(real_dir)

    # Unwindowed load: the alias and its target are expected to collapse to one copy.
    unwindowed = load_logs(root, "conn*.log*")
    assert len(unwindowed) == 2, f"expected 2 rows (deduped), got {len(unwindowed)}"

    # Windowed load over Jan 1: still exactly one copy of the two rows.
    windowed = load_logs(root, "conn*.log*", since=_JAN1, until=_JAN2)
    assert len(windowed) == 2, f"expected 2 rows under window, got {len(windowed)}"
|
|
740
|
+
|
|
741
|
+
|
|
742
|
+
def test_load_logs_dated_non_date_dir_skipped_with_window(tmp_path: Path) -> None:
    """With a window set, a non-date directory holding the same rows adds nothing."""
    root = tmp_path / "zeek"
    root.mkdir()
    dated_dir = root / "2026-01-02"
    other_dir = root / "export"
    dated_dir.mkdir()
    other_dir.mkdir()

    shared_rows = [
        {"ts": _TS_JAN2, "id.orig_h": "192.0.2.50", "id.resp_h": "198.51.100.50",
         "id.resp_p": 443, "proto": "tcp"},
        {"ts": _TS_JAN2 + 1, "id.orig_h": "192.0.2.50", "id.resp_h": "198.51.100.50",
         "id.resp_p": 80, "proto": "tcp"},
    ]
    # Identical content in both places: the count would be 4 if "export" were read.
    for target_dir in (dated_dir, other_dir):
        _write_ndjson(target_dir / "conn.log", shared_rows)

    frame = load_logs(root, "conn*.log*", since=_JAN2, until=_JAN3)
    assert len(frame) == 2
|
|
760
|
+
|
|
761
|
+
|
|
762
|
+
def test_load_required_logs_size_matches_pruned_files(tmp_path: Path) -> None:
    """``data_size_bytes`` equals the size of the in-window file only."""
    root = tmp_path / "zeek"
    root.mkdir()
    excluded_dir = root / "2026-01-01"
    included_dir = root / "2026-01-02"
    excluded_dir.mkdir()
    included_dir.mkdir()

    excluded_rows = [
        {"ts": _TS_JAN1, "id.orig_h": "192.0.2.60", "id.resp_h": "198.51.100.60",
         "id.resp_p": 443, "proto": "tcp"},
    ]
    included_rows = [
        {"ts": _TS_JAN2, "id.orig_h": "192.0.2.61", "id.resp_h": "198.51.100.61",
         "id.resp_p": 443, "proto": "tcp"},
    ]
    _write_ndjson(excluded_dir / "conn.log", excluded_rows)
    included_file = included_dir / "conn.log"
    _write_ndjson(included_file, included_rows)

    patterns = {"conn*.log*": "zeek_dir"}
    buckets = {"zeek_dir": [root]}
    window_end = datetime(2026, 1, 2, 23, 59, 59, tzinfo=timezone.utc)
    result = load_required_logs(patterns, buckets, since=_JAN2, until=window_end)

    # Only the Jan 2 file should be counted toward the reported size.
    assert result.data_size_bytes == included_file.stat().st_size
|
|
789
|
+
|
|
790
|
+
|
|
791
|
+
def test_load_required_logs_warns_and_skips_truncated_zeek_gzip(
    tmp_path: Path,
) -> None:
    """A truncated gzip among the selected files yields a warning, not a failed load."""
    root = tmp_path / "zeek"
    root.mkdir()

    raw_line = (
        b'{"ts":1767312000.0,"id.orig_h":"192.0.2.10",'
        b'"id.resp_h":"198.51.100.10","id.resp_p":443,"proto":"tcp"}\n'
    )
    # Drop the final 8 bytes of the compressed stream so the file is incomplete.
    truncated = gzip.compress(raw_line)[:-8]
    (root / "conn-bad.log.gz").write_bytes(truncated)

    good_row = {
        "ts": _TS_JAN2,
        "id.orig_h": "192.0.2.11",
        "id.resp_h": "198.51.100.11",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(root / "conn-good.log", [good_row])

    result = load_required_logs(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [root]},
    )

    frame = result.logs["conn*.log*"]
    assert len(frame) == 1
    assert frame.iloc[0]["src"] == "192.0.2.11"

    # At least one warning must name the bad file and describe the corruption.
    matching = [
        message
        for message in result.warnings
        if "conn-bad.log.gz could not be read" in message
        and "compressed file is incomplete or corrupt" in message
    ]
    assert matching
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def test_load_logs_dated_layout_ignores_root_level_files(tmp_path: Path) -> None:
    """Mixed-root policy: with a YYYY-MM-DD subdir present, a root-level file is not loaded."""
    root = tmp_path / "zeek"
    root.mkdir()
    dated_dir = root / "2026-01-02"
    dated_dir.mkdir()

    # File directly under the root; its source address must NOT show up in the frame.
    root_rows = [
        {"ts": _TS_JAN2, "id.orig_h": "192.0.2.99", "id.resp_h": "198.51.100.99",
         "id.resp_p": 443, "proto": "tcp"},
    ]
    _write_ndjson(root / "conn.log", root_rows)

    # File inside the dated subdir; this is the only row expected back.
    dated_rows = [
        {"ts": _TS_JAN2 + 1, "id.orig_h": "192.0.2.50", "id.resp_h": "198.51.100.50",
         "id.resp_p": 443, "proto": "tcp"},
    ]
    _write_ndjson(dated_dir / "conn.log", dated_rows)

    frame = load_logs(root, "conn*.log*")

    assert len(frame) == 1
    assert frame.iloc[0]["src"] == "192.0.2.50"
|
|
852
|
+
|
|
853
|
+
|
|
854
|
+
# ── Stage 4: boundedness + default window helpers ─────────────────────────────
|
|
855
|
+
|
|
856
|
+
|
|
857
|
+
def test_is_zeek_bounded_returns_true_for_file(tmp_path: Path) -> None:
    """A bucket holding one existing (empty) file is classified as bounded."""
    log_file = tmp_path / "conn.log"
    log_file.write_text("", encoding="utf-8")
    verdict = is_zeek_bounded([log_file])
    assert verdict is True
|
|
861
|
+
|
|
862
|
+
|
|
863
|
+
def test_is_zeek_bounded_returns_false_for_directory(tmp_path: Path) -> None:
    """A bucket holding a directory is classified as not bounded."""
    bucket = [tmp_path]
    assert is_zeek_bounded(bucket) is False
|
|
865
|
+
|
|
866
|
+
|
|
867
|
+
def test_is_zeek_bounded_returns_false_for_glob_string() -> None:
    """A glob-shaped path (``conn*.log``) is classified as not bounded.

    Original author's note: Stage 4 helper contract; load wiring deferred.
    """
    glob_like = Path("conn*.log")
    assert is_zeek_bounded([glob_like]) is False
|
|
870
|
+
|
|
871
|
+
|
|
872
|
+
def test_is_zeek_bounded_empty_list_returns_false() -> None:
    """An empty bucket is not bounded.

    Original author's note: the runner short-circuits before calling, but the
    predicate keeps this case explicit because there is no Zeek input to classify.
    """
    empty_bucket: list = []
    assert is_zeek_bounded(empty_bucket) is False
|
|
876
|
+
|
|
877
|
+
|
|
878
|
+
def test_zeek_dated_default_window_flat_layout_returns_none(tmp_path: Path) -> None:
    """A root with only a flat ``conn.log`` (no date subdirs) yields no dated window."""
    flat_log = tmp_path / "conn.log"
    flat_log.write_text("", encoding="utf-8")
    window = _zeek_dated_window([tmp_path], timedelta(days=1))
    assert window is None
|
|
881
|
+
|
|
882
|
+
|
|
883
|
+
def test_zeek_dated_default_window_1d_picks_newest_subdir_only(tmp_path: Path) -> None:
    """GUARDRAIL — a one-day span over [2026-01-01, 2026-01-05] selects Jan 5 only.

    Original author's note: this is the single-input dated selection that the
    union path must GENERALIZE (newest N=ceil(span_days) date subdirs,
    earliest-midnight → newest-23:59:59 UTC). Do NOT reinterpret these
    assertions; the one-element list IS the degenerate single-input case.
    """
    for day_name in ("2026-01-01", "2026-01-05"):
        (tmp_path / day_name).mkdir()

    window = _zeek_dated_window([tmp_path], timedelta(days=1))
    since, until = window

    expected_since = datetime(2026, 1, 5, 0, 0, 0, tzinfo=timezone.utc)
    expected_until = datetime(2026, 1, 5, 23, 59, 59, tzinfo=timezone.utc)
    assert since == expected_since
    assert until == expected_until
|
|
893
|
+
|
|
894
|
+
|
|
895
|
+
def test_zeek_dated_default_window_2d_picks_newest_2_subdirs_even_when_sparse(
    tmp_path: Path,
) -> None:
    """GUARDRAIL — a two-day span over a sparse archive selects BOTH date subdirs.

    With [2026-01-01, 2026-01-05] and span=2d the window runs from Jan 1
    midnight to Jan 5 23:59:59 UTC. Original author's note: the union path must
    GENERALIZE this sparse-archive selection. Do NOT reinterpret.
    """
    for day_name in ("2026-01-01", "2026-01-05"):
        (tmp_path / day_name).mkdir()

    window = _zeek_dated_window([tmp_path], timedelta(days=2))
    since, until = window

    expected_since = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
    expected_until = datetime(2026, 1, 5, 23, 59, 59, tzinfo=timezone.utc)
    assert since == expected_since
    assert until == expected_until
|
|
906
|
+
|
|
907
|
+
|
|
908
|
+
def test_zeek_dated_default_window_span_exceeds_subdir_count(tmp_path: Path) -> None:
    """A 7-day span over three date subdirs covers the earliest through the newest."""
    day_names = ["2026-01-01", "2026-01-03", "2026-01-05"]
    for day_name in day_names:
        (tmp_path / day_name).mkdir()

    window = _zeek_dated_window([tmp_path], timedelta(days=7))
    since, until = window

    # Only the calendar dates are pinned here, not the time-of-day bounds.
    assert since.date() == date(2026, 1, 1)
    assert until.date() == date(2026, 1, 5)
|
|
914
|
+
|
|
915
|
+
|
|
916
|
+
def test_discover_zeek_files_file_input_matching_pattern_returns_file(
    tmp_path: Path,
) -> None:
    """Given a file path whose name matches the pattern, discovery returns just that file."""
    conn_log = tmp_path / "conn.log"
    _write_ndjson(conn_log, [{"ts": _TS_JAN2}])
    found = discover_zeek_files(conn_log, "conn*.log*")
    assert found == [conn_log]
|
|
922
|
+
|
|
923
|
+
|
|
924
|
+
def test_discover_zeek_files_file_input_nonmatching_pattern_returns_empty(
    tmp_path: Path,
) -> None:
    """Given a file path whose name does not match the pattern, discovery returns nothing."""
    dns_log = tmp_path / "dns.log"
    _write_ndjson(dns_log, [{"ts": _TS_JAN2}])
    found = discover_zeek_files(dns_log, "conn*.log*")
    assert found == []
|
|
930
|
+
|
|
931
|
+
|
|
932
|
+
# ── CloudTrail loader: per-file shapes ────────────────────────────────────────
|
|
933
|
+
|
|
934
|
+
def test_load_cloudtrail_ndjson_multiple_events_preserves_first_event(
    tmp_path: Path,
) -> None:
    """Regression guard: all three NDJSON events load, including the first line.

    Original author's note: a prior draft iterated the "remaining lines" without
    seeding from the already-parsed first line, silently dropping the first
    event of every exporter ``.json.log`` file. The check is on the observed
    output rather than on which internal route produced it.
    """
    ct_root = tmp_path / "ct"
    ct_root.mkdir()
    stamped_ids = (
        ("aaaa", "2026-06-01T12:00:00Z"),
        ("bbbb", "2026-06-01T12:01:00Z"),
        ("cccc", "2026-06-01T12:02:00Z"),
    )
    events = [
        _ct_event(eventID=event_id, eventTime=event_time)
        for event_id, event_time in stamped_ids
    ]
    _ct_write_ndjson(ct_root / "events.json.log", events)

    frame = load_cloudtrail(ct_root)

    assert list(frame.columns) == _CLOUDTRAIL_COLUMNS
    assert len(frame) == 3
    assert set(frame["event_id"]) == {"aaaa", "bbbb", "cccc"}
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
def test_load_cloudtrail_bare_one_line_dict_event_loads_as_single_row(
    tmp_path: Path,
) -> None:
    """A file holding one bare event dict (no ``Records`` wrapper) loads as exactly one row.

    Original author's note: the first line parses as a dict without Records, the
    NDJSON branch seeds with it, and there are no further lines.
    """
    ct_root = tmp_path / "ct"
    ct_root.mkdir()
    serialized = json.dumps(_ct_event(eventID="only-one"))
    (ct_root / "one.json").write_text(serialized, encoding="utf-8")

    frame = load_cloudtrail(ct_root)

    assert len(frame) == 1
    assert frame.iloc[0]["event_id"] == "only-one"
|
|
975
|
+
|
|
976
|
+
|
|
977
|
+
def test_load_cloudtrail_one_line_bare_list_loads_as_event_list(
    tmp_path: Path,
) -> None:
    """A file holding a single-line JSON list of events loads one row per element."""
    ct_root = tmp_path / "ct"
    ct_root.mkdir()
    events = [_ct_event(eventID=event_id) for event_id in ("list-1", "list-2")]
    serialized = json.dumps(events)
    (ct_root / "list.json").write_text(serialized, encoding="utf-8")

    frame = load_cloudtrail(ct_root)

    assert len(frame) == 2
    assert set(frame["event_id"]) == {"list-1", "list-2"}
|
|
995
|
+
|
|
996
|
+
|
|
997
|
+
def test_load_cloudtrail_gzipped_envelope_loads_identically(tmp_path: Path) -> None:
    """A gzipped ``{"Records": [...]}`` document (the native S3 wire shape) loads fully."""
    ct_root = tmp_path / "ct"
    ct_root.mkdir()
    events = [_ct_event(eventID=event_id) for event_id in ("env-1", "env-2")]
    _ct_write_envelope_gz(ct_root / "envelope.json.gz", events)

    frame = load_cloudtrail(ct_root)

    assert list(frame.columns) == _CLOUDTRAIL_COLUMNS
    assert len(frame) == 2
    assert set(frame["event_id"]) == {"env-1", "env-2"}
|
|
1012
|
+
|
|
1013
|
+
|
|
1014
|
+
def test_load_cloudtrail_pretty_printed_multiline_envelope_loads(tmp_path: Path) -> None:
    """An indented, multi-line ``Records`` envelope still loads both events.

    Original author's note: this is the whole-file fallback path — the first
    line alone is only a ``{`` fragment, and the full text is the document.
    """
    ct_root = tmp_path / "ct"
    ct_root.mkdir()
    events = [_ct_event(eventID="pretty-1"), _ct_event(eventID="pretty-2")]
    pretty_text = json.dumps({"Records": events}, indent=2)
    (ct_root / "pretty.json").write_text(pretty_text, encoding="utf-8")

    frame = load_cloudtrail(ct_root)

    assert len(frame) == 2
    assert set(frame["event_id"]) == {"pretty-1", "pretty-2"}
|
|
1028
|
+
|
|
1029
|
+
|
|
1030
|
+
def test_load_cloudtrail_mixed_formats_in_one_directory_loads_union(
    tmp_path: Path,
) -> None:
    """An NDJSON file and a gzipped envelope in one directory load as the union of events."""
    ct_root = tmp_path / "ct"
    ct_root.mkdir()

    ndjson_events = [_ct_event(eventID="nd-1"), _ct_event(eventID="nd-2")]
    _ct_write_ndjson(ct_root / "ndjson.json.log", ndjson_events)

    envelope_events = [_ct_event(eventID="env-1")]
    _ct_write_envelope_gz(ct_root / "env.json.gz", envelope_events)

    frame = load_cloudtrail(ct_root)

    assert len(frame) == 3
    assert set(frame["event_id"]) == {"nd-1", "nd-2", "env-1"}
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
# ── CloudTrail loader: discovery ──────────────────────────────────────────────
|
|
1051
|
+
|
|
1052
|
+
def test_load_cloudtrail_native_nested_aws_logs_tree_discovered_recursively(
    tmp_path: Path,
) -> None:
    """Recursive *.json* discovery — what makes a native AWSLogs tree just work.

    One gzipped envelope is written deep inside an
    ``AWSLogs/<account>/CloudTrail/<region>/YYYY/MM/DD`` tree. Discovery from
    the top of the tree must return exactly that file, and loading from the
    same root must yield its single event.
    """
    nested = (
        tmp_path
        / "AWSLogs" / _CT_DOCS_ACCOUNT / "CloudTrail" / "us-east-1"
        / "2026" / "06" / "01"
    )
    # NOTE(review): the directory is never created here, so _ct_write_envelope_gz
    # presumably creates missing parents — confirm against the helper.
    nested_file = nested / "events.json.gz"
    _ct_write_envelope_gz(nested_file, [_ct_event(eventID="nested-1")])

    files = discover_cloudtrail_files(tmp_path)
    assert len(files) == 1
    assert files[0].name == "events.json.gz"
    # Pin the discovered path to the nested location. The previous check
    # (`"nested" not in p.name`) was true for any non-empty result and so did
    # not demonstrate that the deeply nested file was the one found. Both
    # sides are resolved so a symlinked tmp dir cannot cause a false mismatch.
    assert files[0].resolve() == nested_file.resolve()

    df = load_cloudtrail(tmp_path)
    assert len(df) == 1
    assert df.iloc[0]["event_id"] == "nested-1"
|
|
1071
|
+
|
|
1072
|
+
|
|
1073
|
+
def test_discover_cloudtrail_files_excludes_cloud_trail_digest_tree(
    tmp_path: Path,
) -> None:
    """Files under a ``CloudTrail-Digest`` tree are not returned by discovery.

    Original author's note: digest files are integrity manifests, not events.
    """
    day_parts = ("us-east-1", "2026", "06", "01")
    events_dir = tmp_path.joinpath("CloudTrail", *day_parts)
    digest_dir = tmp_path.joinpath("CloudTrail-Digest", *day_parts)
    _ct_write_envelope_gz(events_dir / "events.json.gz", [_ct_event(eventID="evt-1")])
    _ct_write_envelope_gz(digest_dir / "digest.json.gz", [_ct_event(eventID="digest-1")])

    discovered = discover_cloudtrail_files(tmp_path)
    discovered_names = {path.name for path in discovered}

    assert "events.json.gz" in discovered_names
    assert "digest.json.gz" not in discovered_names
|
|
1086
|
+
|
|
1087
|
+
|
|
1088
|
+
def test_load_cloudtrail_single_file_path_works(tmp_path: Path) -> None:
    """Passing a file path (not a directory) to ``load_cloudtrail`` loads that file's event."""
    event_file = tmp_path / "events.json.log"
    only_event = _ct_event(eventID="single-file-event")
    _ct_write_ndjson(event_file, [only_event])

    frame = load_cloudtrail(event_file)

    assert len(frame) == 1
    assert frame.iloc[0]["event_id"] == "single-file-event"
|
|
1096
|
+
|
|
1097
|
+
|
|
1098
|
+
def test_load_cloudtrail_empty_directory_returns_column_stable_empty_frame(
    tmp_path: Path,
) -> None:
    """An empty directory yields a zero-row frame that still has the full column set."""
    ct_root = tmp_path / "ct"
    ct_root.mkdir()

    frame = load_cloudtrail(ct_root)

    assert list(frame.columns) == _CLOUDTRAIL_COLUMNS
    assert len(frame) == 0
|
|
1108
|
+
|
|
1109
|
+
|
|
1110
|
+
# ── CloudTrail loader: tolerance, warnings, filtering ─────────────────────────
|
|
1111
|
+
|
|
1112
|
+
def test_load_cloudtrail_undecodable_ndjson_lines_silently_skipped(
    tmp_path: Path,
) -> None:
    """A non-JSON line between two valid NDJSON events is dropped; both events load."""
    ct_root = tmp_path / "ct"
    ct_root.mkdir()
    good_a = json.dumps(_ct_event(eventID="good-a"))
    good_b = json.dumps(_ct_event(eventID="good-b"))
    # Valid line, garbage line, valid line — each newline-terminated.
    body = f"{good_a}\nnot json at all\n{good_b}\n"
    (ct_root / "events.json.log").write_text(body, encoding="utf-8")

    frame = load_cloudtrail(ct_root)

    assert len(frame) == 2
    assert set(frame["event_id"]) == {"good-a", "good-b"}
|
|
1128
|
+
|
|
1129
|
+
|
|
1130
|
+
def test_load_required_logs_warns_and_skips_corrupt_cloudtrail_gzip(
    tmp_path: Path,
) -> None:
    """A truncated CloudTrail gzip produces a warning while its sibling file still loads."""
    ct_root = tmp_path / "ct"
    ct_root.mkdir()

    # Build a valid gzipped envelope, then drop its last 8 bytes to truncate it.
    envelope = {"Records": [_ct_event(eventID="bad-evt")]}
    compressed = gzip.compress(json.dumps(envelope).encode("utf-8"))
    (ct_root / "broken.json.gz").write_bytes(compressed[:-8])
    _ct_write_ndjson(ct_root / "ok.json.log", [_ct_event(eventID="good-evt")])

    result = load_required_logs(
        {"*.json*": "cloudtrail_dir"},
        {"cloudtrail_dir": [ct_root]},
    )

    frame = result.logs["*.json*"]
    assert len(frame) == 1
    assert frame.iloc[0]["event_id"] == "good-evt"

    # At least one warning must name the broken file and describe the corruption.
    matching = [
        message
        for message in result.warnings
        if "broken.json.gz could not be read" in message
        and "compressed file is incomplete or corrupt" in message
    ]
    assert matching
|
|
1156
|
+
|
|
1157
|
+
|
|
1158
|
+
def test_load_required_logs_warns_and_skips_unparseable_json_file(
    tmp_path: Path,
) -> None:
    """A plain-text ``.json`` file that is not JSON is skipped with a 'not valid JSON' warning.

    The well-formed sibling file must still load.
    """
    ct_root = tmp_path / "ct"
    ct_root.mkdir()
    garbage_text = "this is not json at all\nstill not\n"
    (ct_root / "garbage.json").write_text(garbage_text, encoding="utf-8")
    _ct_write_ndjson(ct_root / "ok.json.log", [_ct_event(eventID="evt-ok")])

    result = load_required_logs(
        {"*.json*": "cloudtrail_dir"},
        {"cloudtrail_dir": [ct_root]},
    )

    frame = result.logs["*.json*"]
    assert len(frame) == 1
    assert frame.iloc[0]["event_id"] == "evt-ok"

    # At least one warning must name the garbage file and say it is not valid JSON.
    matching = [
        message
        for message in result.warnings
        if "garbage.json could not be read" in message and "not valid JSON" in message
    ]
    assert matching
|
|
1183
|
+
|
|
1184
|
+
|
|
1185
|
+
def test_load_cloudtrail_drops_events_with_missing_event_time(
    tmp_path: Path,
) -> None:
    """An event with no ``eventTime`` key is left out of the loaded frame."""
    ct_root = tmp_path / "ct"
    ct_root.mkdir()

    timeless = _ct_event(eventID="no-ts")
    timeless.pop("eventTime")
    stamped = _ct_event(eventID="has-ts")
    _ct_write_ndjson(ct_root / "events.json.log", [stamped, timeless])

    frame = load_cloudtrail(ct_root)

    assert len(frame) == 1
    assert frame.iloc[0]["event_id"] == "has-ts"
|
|
1201
|
+
|
|
1202
|
+
|
|
1203
|
+
def test_load_cloudtrail_applies_since_and_until_window(tmp_path: Path) -> None:
    """Only the event whose ``eventTime`` lies inside [since, until] is returned."""
    ct_root = tmp_path / "ct"
    ct_root.mkdir()
    stamped_ids = (
        ("too-early", "2026-05-31T11:00:00Z"),
        ("inside", "2026-06-01T12:00:00Z"),
        ("too-late", "2026-06-02T13:00:00Z"),
    )
    events = [
        _ct_event(eventID=event_id, eventTime=event_time)
        for event_id, event_time in stamped_ids
    ]
    _ct_write_ndjson(ct_root / "events.json.log", events)

    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)
    frame = load_cloudtrail(ct_root, since=window_start, until=window_end)

    assert len(frame) == 1
    assert frame.iloc[0]["event_id"] == "inside"
|
|
1223
|
+
|
|
1224
|
+
|
|
1225
|
+
# ── Liveness: loader leaves a permanent record line ──────────────────────────
|
|
1226
|
+
|
|
1227
|
+
|
|
1228
|
+
class _FakeTTYStream:
    """Minimal ``sys.stderr`` replacement that claims to be a TTY and records writes.

    Original author's note: this exists for the byte-identical-on-TTY rail — on
    a real TTY ``progress`` constructs tqdm and the bar text reaches the
    stream. ``capsys`` cannot stand in because its captured stderr reports
    ``isatty()`` as False, which is the non-TTY suppression rail covered by a
    separate test.
    """

    def __init__(self) -> None:
        # Every string passed to write(), in call order.
        self._chunks: list[str] = []

    def isatty(self) -> bool:
        """Always report an interactive terminal."""
        return True

    def write(self, s: str) -> int:
        """Record ``s`` and return its length, as a text stream's write() does."""
        self._chunks += [s]
        return len(s)

    def flush(self) -> None:  # pragma: no cover - no-op
        """Do nothing; writes are kept in memory only."""

    @property
    def output(self) -> str:
        """Everything written so far, concatenated in write order."""
        return "".join(self._chunks)
|
|
1253
|
+
|
|
1254
|
+
|
|
1255
|
+
def test_parse_ndjson_leaves_permanent_record_line_on_tty(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """On a TTY-like stderr, parsing a 5-line NDJSON file leaves a ``loaded conn.log`` line.

    Original author's note: this is the byte-identical-on-TTY regression — the
    NDJSON loader writes a ``loaded <file>: N lines`` record through the shared
    progress helper, whose bar_format is pinned in
    ``common/display.py:progress`` and reproduces the pre-helper inline
    bar_format byte-for-byte when ``unit=" lines"``.
    """
    from loghunter.common.loader import _parse_ndjson_file

    tty_stream = _FakeTTYStream()
    monkeypatch.setattr("sys.stderr", tty_stream)

    log_file = tmp_path / "conn.log"
    json_lines = [
        json.dumps({"ts": float(i), "id.orig_h": f"192.0.2.{i}"})
        for i in range(1, 6)
    ]
    log_file.write_text("\n".join(json_lines) + "\n", encoding="utf-8")

    _parse_ndjson_file(log_file)

    rendered = tty_stream.output
    # Per the original note, tqdm with leave=True commits the summary line for the file.
    assert "loaded conn.log" in rendered
    assert "5" in rendered  # the line count, formatted by tqdm's n_fmt
|
|
1283
|
+
|
|
1284
|
+
|
|
1285
|
+
def test_parse_ndjson_non_tty_stream_suppresses_loader_bar(
    tmp_path: Path, capsys: pytest.CaptureFixture
) -> None:
    """Under captured (non-TTY) stderr, no ``loaded conn.log`` text is emitted.

    Original author's note: on a non-TTY stream the progress helper returns the
    bare iterable and tqdm is never constructed; capsys's stderr reports
    ``isatty()`` as False, which exercises that arm. The five rows must still
    be parsed.
    """
    from loghunter.common.loader import _parse_ndjson_file

    log_file = tmp_path / "conn.log"
    json_lines = [
        json.dumps({"ts": float(i), "id.orig_h": f"192.0.2.{i}"})
        for i in range(1, 6)
    ]
    log_file.write_text("\n".join(json_lines) + "\n", encoding="utf-8")

    frame = _parse_ndjson_file(log_file)  # default show_progress=True

    stderr_text = capsys.readouterr().err
    assert "loaded conn.log" not in stderr_text
    assert len(frame) == 5
|
|
1307
|
+
|
|
1308
|
+
|
|
1309
|
+
def test_parse_ndjson_show_progress_false_suppresses_loader_bar(
    tmp_path: Path, capsys: pytest.CaptureFixture
) -> None:
    """With ``show_progress=False`` no ``loaded conn.log`` text is emitted; rows still load.

    Original author's note: ``show_progress=False`` routes through the shared
    progress helper, which returns the bare iterable without constructing
    tqdm. A multi-file digest fan-out passes it so per-file bars do not
    interleave between rendered cards; suppression is meant to be purely
    cosmetic, so the returned frame must match the default path.
    """
    from loghunter.common.loader import _parse_ndjson_file

    log_file = tmp_path / "conn.log"
    json_lines = [
        json.dumps({"ts": float(i), "id.orig_h": f"192.0.2.{i}"})
        for i in range(1, 6)
    ]
    log_file.write_text("\n".join(json_lines) + "\n", encoding="utf-8")

    frame = _parse_ndjson_file(log_file, show_progress=False)

    stderr_text = capsys.readouterr().err
    assert "loaded conn.log" not in stderr_text
    assert len(frame) == 5
|
|
1333
|
+
|
|
1334
|
+
|
|
1335
|
+
# ── Loader progress: seam coverage (mock progress, assert kwargs) ────────────
|
|
1336
|
+
#
|
|
1337
|
+
# Each loader read path routes through the shared
|
|
1338
|
+
# loghunter.common.loader.progress helper. Mocking that seam keeps the tests
|
|
1339
|
+
# off carriage-return-byte scraping (which is brittle) and verifies the desc /
|
|
1340
|
+
# unit / show_progress contract each loader holds with the helper. The two
|
|
1341
|
+
# NDJSON byte-output tests above lock the on-TTY render — these tests lock the
|
|
1342
|
+
# wiring beneath it.
|
|
1343
|
+
|
|
1344
|
+
|
|
1345
|
+
class _ProgressSpy:
    """Callable stand-in for ``loghunter.common.loader.progress`` that logs its calls.

    Each call appends a ``{"desc", "unit", "show_progress"}`` dict to
    ``calls`` and hands back a plain iterator over the input, so the loader
    under test still consumes real data. ``total`` and ``stream`` are accepted
    for signature compatibility but are not recorded.
    """

    def __init__(self) -> None:
        # One entry per invocation, in call order.
        self.calls: list[dict] = []

    def __call__(self, iterable, *, desc, show_progress=True, unit=" lines",
                 total=None, stream=None):
        recorded = dict(desc=desc, unit=unit, show_progress=show_progress)
        self.calls.append(recorded)
        return iter(iterable)
|
|
1364
|
+
|
|
1365
|
+
|
|
1366
|
+
def test_progress_seam_tsv_wraps_pre_materialization(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Loading a Zeek TSV file makes exactly one ``progress`` call.

    The loader's ``progress`` attribute is swapped for a spy; after
    ``load_logs`` runs on a one-row TSV fixture, the spy must hold a single
    record carrying the file's ``desc``, the ``" lines"`` unit and the
    default ``show_progress=True``.
    """
    from loghunter.common import loader as loader_mod

    recorder = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", recorder)

    # Minimal valid Zeek TSV — header is enough to claim TSV via #separator.
    tsv_lines = [
        "#separator \\x09\n",
        "#fields\tts\tid.orig_h\tid.resp_h\tid.resp_p\tproto\n",
        "#types\ttime\taddr\taddr\tport\tenum\n",
        "1779750000.0\t192.0.2.10\t198.51.100.20\t443\ttcp\n",
        "#close\t2026-06-01-12-00-00\n",
    ]
    tsv_path = tmp_path / "conn.tsv"
    tsv_path.write_text("".join(tsv_lines), encoding="utf-8")

    loader_mod.load_logs(tsv_path.parent, "*.tsv", _files=[tsv_path])

    assert len(recorder.calls) == 1
    only_call = recorder.calls[0]
    assert only_call["desc"] == "loaded conn.tsv"
    assert only_call["unit"] == " lines"
    assert only_call["show_progress"] is True
|
|
1396
|
+
|
|
1397
|
+
|
|
1398
|
+
def test_progress_seam_load_syslog_calls_per_file(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``load_syslog`` routes every file it reads through ``progress``.

    Two one-line syslog files are loaded with ``show_progress=False``; the
    spy must see one call per file, each with the flag off and the
    ``" lines"`` unit.
    """
    from loghunter.common import loader as loader_mod

    recorder = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", recorder)

    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    fixtures = {
        "router.log": "Jun 1 12:00:00 router sshd[1]: hi\n",
        "webserver.log": "Jun 1 12:01:00 web nginx[2]: hi\n",
    }
    for filename, text in fixtures.items():
        (syslog_dir / filename).write_text(text, encoding="utf-8")

    loader_mod.load_syslog(syslog_dir, show_progress=False)

    # Sorted so the check does not depend on the order files were read in.
    seen_descs = sorted([call["desc"] for call in recorder.calls])
    assert seen_descs == ["loaded router.log", "loaded webserver.log"]
    assert all(call["show_progress"] is False for call in recorder.calls)
    assert all(call["unit"] == " lines" for call in recorder.calls)
|
|
1422
|
+
|
|
1423
|
+
|
|
1424
|
+
def test_progress_seam_load_pihole_calls_per_file(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``load_pihole`` routes its per-file read through ``progress``.

    A single pihole log is loaded with ``show_progress=True``; the spy must
    record exactly one call with that file's ``desc``, the ``" lines"`` unit
    and the flag still on.
    """
    from loghunter.common import loader as loader_mod

    recorder = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", recorder)

    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    log_path = pihole_dir / "pihole.log"
    log_path.write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.10\n",
        encoding="utf-8",
    )

    loader_mod.load_pihole(pihole_dir, show_progress=True)

    assert len(recorder.calls) == 1
    only_call = recorder.calls[0]
    assert only_call["desc"] == "loaded pihole.log"
    assert only_call["unit"] == " lines"
    assert only_call["show_progress"] is True
|
|
1446
|
+
|
|
1447
|
+
|
|
1448
|
+
# ── CloudTrail single-iterator: per-shape input-line accounting ──────────────
|
|
1449
|
+
#
|
|
1450
|
+
# After `line_iter = progress(...)` exists in _cloudtrail_strategy_parse, ALL four
|
|
1451
|
+
# wire-shape branches consume from the same wrapped iterator. The progress bar
|
|
1452
|
+
# therefore reports actual INPUT lines (never parsed events) for every shape —
|
|
1453
|
+
# including the Glenn round-2 regression case: a one-line NDJSON file must
|
|
1454
|
+
# report `loaded x: 1 lines`, NOT zero.
|
|
1455
|
+
|
|
1456
|
+
|
|
1457
|
+
class _CountingProgressSpy:
    """Progress stand-in that tallies how many items are pulled through it.

    Unlike ``_ProgressSpy`` this double does not record kwargs; each call
    appends a ``{"desc", "line_count"}`` record to ``calls`` and returns a
    generator that bumps that record's ``line_count`` as items are consumed.
    The CloudTrail tests use the tally to check how many input lines each
    wire shape (envelope / multi-line pretty / NDJSON / bare-list) reads.
    """

    def __init__(self) -> None:
        # One record per call, in call order: {"desc": ..., "line_count": ...}
        self.calls: list[dict] = []

    def __call__(
        self,
        iterable,
        *,
        desc,
        show_progress=True,
        unit=" lines",
        total=None,
        stream=None,
    ):
        record = {"desc": desc, "line_count": 0}
        self.calls.append(record)
        return self._tally(iterable, record)

    @staticmethod
    def _tally(lines, record):
        """Yield each item of *lines*, updating ``record`` just before it."""
        for seen, line in enumerate(lines, start=1):
            record["line_count"] = seen
            yield line
|
|
1479
|
+
|
|
1480
|
+
|
|
1481
|
+
def test_cloudtrail_one_line_ndjson_bar_reports_one_line_not_zero(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """Regression guard: a one-event NDJSON CloudTrail file counts as 1 line.

    The file's only line has to travel through the wrapped iterator, so the
    counting spy must end with a tally of 1 for it — never 0.
    """
    from loghunter.common import loader as loader_mod

    counter = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", counter)

    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    single_event = _ct_event(eventID="only-one")
    _ct_write_ndjson(cloudtrail_dir / "one.json.log", [single_event])

    frame = loader_mod.load_cloudtrail(cloudtrail_dir)

    assert len(frame) == 1
    assert len(counter.calls) == 1
    sole_call = counter.calls[0]
    assert sole_call["desc"] == "loaded one.json.log"
    # The one input line was pulled through the wrapped iterator.
    assert sole_call["line_count"] == 1
|
|
1505
|
+
|
|
1506
|
+
|
|
1507
|
+
def test_cloudtrail_multi_line_ndjson_bar_counts_every_input_line(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A three-event NDJSON CloudTrail file is tallied as three input lines.

    Both the first-nonblank sniff and the per-event stream must read from the
    same wrapped iterator, so the spy's total equals the number of lines in
    the file.
    """
    from loghunter.common import loader as loader_mod

    spy = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", spy)

    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    _ct_write_ndjson(cloudtrail_dir / "events.json.log", [
        _ct_event(eventID="a"),
        _ct_event(eventID="b"),
        _ct_event(eventID="c"),
    ])

    loader_mod.load_cloudtrail(cloudtrail_dir)

    # Pin the call count before indexing: a second progress call would
    # otherwise go unchecked, and a missing one would surface as IndexError.
    assert len(spy.calls) == 1
    assert spy.calls[0]["line_count"] == 3
|
|
1528
|
+
|
|
1529
|
+
|
|
1530
|
+
def test_cloudtrail_envelope_bar_counts_envelope_line(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A single-line ``{"Records": [...]}`` envelope is tallied as one line.

    The envelope fixture holds two events, yet the spy counts input lines
    pulled through the wrapped iterator, so the expected tally is 1.
    """
    from loghunter.common import loader as loader_mod

    spy = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", spy)

    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    _ct_write_envelope_gz(
        cloudtrail_dir / "envelope.json.gz",
        [_ct_event(eventID="env-1"), _ct_event(eventID="env-2")],
    )

    loader_mod.load_cloudtrail(cloudtrail_dir)

    # Pin the call count before indexing: a second progress call would
    # otherwise go unchecked, and a missing one would surface as IndexError.
    assert len(spy.calls) == 1
    # Single-line envelope = exactly one input line through the wrapped iter.
    assert spy.calls[0]["line_count"] == 1
|
|
1551
|
+
|
|
1552
|
+
|
|
1553
|
+
def test_cloudtrail_pretty_multiline_bar_counts_every_input_line(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A pretty-printed (indented) envelope is tallied by its full line count.

    The first line of such a document is only a JSON fragment, so the loader
    has to read the remaining lines as well; the spy must therefore report
    every line of the file, not just 1.
    """
    from loghunter.common import loader as loader_mod

    spy = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", spy)

    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    events = [_ct_event(eventID="pretty-1"), _ct_event(eventID="pretty-2")]
    pretty_text = json.dumps({"Records": events}, indent=2)
    (cloudtrail_dir / "pretty.json").write_text(pretty_text, encoding="utf-8")
    expected_lines = len(pretty_text.splitlines())
    # Sanity check on the fixture itself, done before the loader runs so a
    # broken fixture is reported as such instead of as a loader failure.
    assert expected_lines > 1

    loader_mod.load_cloudtrail(cloudtrail_dir)

    # Pin the call count before indexing: a second progress call would
    # otherwise go unchecked, and a missing one would surface as IndexError.
    assert len(spy.calls) == 1
    assert spy.calls[0]["line_count"] == expected_lines
|
|
1576
|
+
|
|
1577
|
+
|
|
1578
|
+
def test_cloudtrail_bare_list_one_line_doc_bar_reports_one_line(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """A one-line bare JSON list of events is tallied as one input line.

    The whole document sits on a single line, so exactly one line is pulled
    through the wrapped iterator even though it carries two events.
    """
    from loghunter.common import loader as loader_mod

    spy = _CountingProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", spy)

    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    events = [_ct_event(eventID="list-1"), _ct_event(eventID="list-2")]
    (cloudtrail_dir / "list.json").write_text(
        json.dumps(events), encoding="utf-8",
    )

    loader_mod.load_cloudtrail(cloudtrail_dir)

    # Pin the call count before indexing: a second progress call would
    # otherwise go unchecked, and a missing one would surface as IndexError.
    assert len(spy.calls) == 1
    assert spy.calls[0]["line_count"] == 1
|
|
1598
|
+
|
|
1599
|
+
|
|
1600
|
+
def test_cloudtrail_bar_unit_is_lines_not_events(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """The CloudTrail bar is labelled ``" lines"`` and exists once per file.

    The bar counts input lines, so the unit must say so, and the iteration
    over parsed events must not get a bar of its own — that would present
    parsed events under a "lines" label.
    """
    from loghunter.common import loader as loader_mod

    recorder = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", recorder)

    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    two_events = [_ct_event(eventID="a"), _ct_event(eventID="b")]
    _ct_write_ndjson(cloudtrail_dir / "events.json.log", two_events)

    loader_mod.load_cloudtrail(cloudtrail_dir)

    # Exactly ONE progress call per file (no separate event-iteration bar).
    assert len(recorder.calls) == 1
    only_call = recorder.calls[0]
    assert only_call["unit"] == " lines"
|
|
1623
|
+
|
|
1624
|
+
|
|
1625
|
+
# ── show_progress threading: load_required_logs → all four families ─────────
|
|
1626
|
+
|
|
1627
|
+
|
|
1628
|
+
def test_load_required_logs_threads_show_progress_to_all_families(
    tmp_path: Path, monkeypatch: pytest.MonkeyPatch
) -> None:
    """``load_required_logs(show_progress=False)`` silences every reader.

    One fixture file is laid down for each source family (zeek, syslog,
    pihole, cloudtrail) and all four are requested in a single call. Every
    ``progress`` call the spy records must carry ``show_progress=False``.
    """
    from loghunter.common import loader as loader_mod

    recorder = _ProgressSpy()
    monkeypatch.setattr(loader_mod, "progress", recorder)

    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    conn_row = {
        "ts": 1.0,
        "id.orig_h": "192.0.2.1",
        "id.resp_h": "198.51.100.1",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(zeek_dir / "conn.log", [conn_row])

    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    (syslog_dir / "router.log").write_text(
        "Jun 1 12:00:00 router sshd[1]: hi\n", encoding="utf-8",
    )

    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    (pihole_dir / "pihole.log").write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.10\n",
        encoding="utf-8",
    )

    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    _ct_write_ndjson(cloudtrail_dir / "events.json.log", [_ct_event()])

    # Pattern -> source-key mapping, then source-key -> directories.
    required = {
        "conn*.log*": "zeek_dir",
        "syslog_dir_pattern": "syslog_dir",
        "pihole_dir_pattern": "pihole_dir",
        "*.json*": "cloudtrail_dir",
    }
    source_dirs = {
        "zeek_dir": [zeek_dir],
        "syslog_dir": [syslog_dir],
        "pihole_dir": [pihole_dir],
        "cloudtrail_dir": [cloudtrail_dir],
    }
    loader_mod.load_required_logs(required, source_dirs, show_progress=False)

    # Every loader that called progress did so with show_progress=False —
    # no family leaked the default-True flag.
    # NOTE(review): this only proves that *some* reader ran; it does not
    # verify that all four families called progress — confirm whether a
    # per-family desc check is wanted.
    assert recorder.calls, "no progress calls recorded — fixture must exercise readers"
    assert all(call["show_progress"] is False for call in recorder.calls)
|
|
1683
|
+
|
|
1684
|
+
|
|
1685
|
+
# ── Zeek syslog.log v1 promotion (fidelity-aware syslog schema) ───────────────
|
|
1686
|
+
#
|
|
1687
|
+
# `syslog*.log*` is the loader's glob pattern for Zeek's own syslog.log; it
|
|
1688
|
+
# routes through the zeek_dir branch (TSV + NDJSON) into the new
|
|
1689
|
+
# _normalize_zeek_syslog_df. Result must be the 7-col canonical frame with
|
|
1690
|
+
# minimal-5 first (ts, host, program, raw, message) and extended last
|
|
1691
|
+
# (facility, severity).
|
|
1692
|
+
|
|
1693
|
+
def test_log_type_routes_syslog_pattern() -> None:
    """``_log_type`` resolves the ``"syslog*.log*"`` glob to ``"syslog"``."""
    from loghunter.common.loader import _log_type

    resolved = _log_type("syslog*.log*")
    assert resolved == "syslog"
|
|
1697
|
+
|
|
1698
|
+
|
|
1699
|
+
def test_normalizer_map_contains_syslog_entry() -> None:
    """``_NORMALIZER_MAP["syslog"]`` is the Zeek syslog normalizer itself."""
    from loghunter.common.loader import _NORMALIZER_MAP
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    registered = _NORMALIZER_MAP["syslog"]
    # Identity, not equality: the table must reference this exact function.
    assert registered is _normalize_zeek_syslog_df
|
|
1704
|
+
|
|
1705
|
+
|
|
1706
|
+
def test_load_logs_zeek_syslog_ndjson_returns_canonical_seven_columns(
    tmp_path: Path,
) -> None:
    """``load_logs`` turns a Zeek ``syslog.log`` NDJSON file into the 7-col frame.

    Expected column order is the minimal five (``ts``, ``host``, ``program``,
    ``raw``, ``message``) followed by ``facility`` and ``severity``; the
    Zeek connection fields must not survive.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    record = {
        "_path": "syslog",
        "ts": 1779750000.0,
        "uid": "CSL01",
        "id.orig_h": "192.0.2.10",
        "id.orig_p": 41514,
        "id.resp_h": "198.51.100.20",
        "id.resp_p": 514,
        "proto": "udp",
        "facility": "DAEMON",
        "severity": "INFO",
        "message": (
            "Jun 11 12:00:00 host1 sshd[1234]: "
            "Accepted publickey for user from 192.0.2.10"
        ),
    }
    _write_ndjson(zeek_dir / "syslog.log", [record])

    df = load_logs(zeek_dir, "syslog*.log*")

    expected_columns = [
        "ts", "host", "program", "raw", "message", "facility", "severity",
    ]
    assert list(df.columns) == expected_columns
    assert len(df) == 1
    assert df.iloc[0]["host"] == "host1"
    assert df.iloc[0]["program"] == "sshd"
    assert df.iloc[0]["severity"] == "INFO"
    # Dropped Zeek-native fields.
    zeek_only = ("uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto")
    for col in zeek_only:
        assert col not in df.columns
|
|
1745
|
+
|
|
1746
|
+
|
|
1747
|
+
def test_load_logs_zeek_syslog_tsv_returns_canonical_seven_columns(
    tmp_path: Path,
) -> None:
    """``load_logs`` turns a Zeek ``syslog.log`` TSV file into the 7-col frame.

    This is the TSV counterpart of the NDJSON test for the same log type:
    the same column order is expected, and the single data row must yield
    the host, program and severity values carried by the fixture.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    (zeek_dir / "syslog.log").write_text(
        "#separator \\x09\n"
        "#set_separator\t,\n"
        "#empty_field\t(empty)\n"
        "#unset_field\t-\n"
        "#path\tsyslog\n"
        "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
        "\tproto\tfacility\tseverity\tmessage\n"
        "#types\ttime\tstring\taddr\tport\taddr\tport"
        "\tenum\tstring\tstring\tstring\n"
        "1779750000.000000\tCSL01\t192.0.2.10\t41514\t198.51.100.20\t514"
        "\tudp\tDAEMON\tINFO"
        "\tJun 11 12:00:00 host1 sshd[1234]: Accepted publickey for user\n",
        encoding="utf-8",
    )
    df = load_logs(zeek_dir, "syslog*.log*")
    assert list(df.columns) == [
        "ts", "host", "program", "raw", "message", "facility", "severity",
    ]
    # Same row-level checks as the NDJSON variant: the fixture has exactly
    # one data row, and its message names ``sshd`` as the program.
    assert len(df) == 1
    assert df.iloc[0]["host"] == "host1"
    assert df.iloc[0]["program"] == "sshd"
    assert df.iloc[0]["severity"] == "INFO"
|
|
1775
|
+
|
|
1776
|
+
|
|
1777
|
+
def test_schema_warning_fires_for_zeek_syslog_missing_required_field() -> None:
    """A syslog frame lacking ``message`` produces a schema warning.

    The frame carries ``facility`` and ``severity`` but omits three of the
    minimal-five columns; the warning text must name the log and mention
    ``message``.
    """
    partial_row = {
        "ts": 1779750000.0,
        "host": "host1",
        "facility": "DAEMON",
        "severity": "INFO",
        # message / program / raw deliberately absent
    }
    frame = pd.DataFrame([partial_row])

    warning = _schema_warning("syslog*.log*", frame)

    assert warning is not None
    assert "syslog.log fields not found" in warning
    assert "message" in warning
|
|
1791
|
+
|
|
1792
|
+
|
|
1793
|
+
def test_schema_warning_does_not_fire_for_zeek_syslog_missing_facility() -> None:
    """A syslog frame without ``facility``/``severity`` raises no warning.

    Only the minimal five columns are present; the two extended columns are
    optional, so ``_schema_warning`` must return ``None``.
    """
    minimal_row = {
        "ts": 1779750000.0,
        "host": "host1",
        "program": "sshd",
        "raw": "<14>Jun 11 12:00:00 host1 sshd: ok",
        "message": "sshd: ok",
        # facility / severity absent — flat-feed shape, but ALSO valid for a
        # Zeek frame that happens to be missing extended fields.
    }
    frame = pd.DataFrame([minimal_row])

    outcome = _schema_warning("syslog*.log*", frame)
    assert outcome is None
|
|
1805
|
+
|
|
1806
|
+
|
|
1807
|
+
# ── bz2 / xz transparent decompression at _open_log ──────────────────────────
|
|
1808
|
+
#
|
|
1809
|
+
# `_open_log` is the single chokepoint every source flows through, so adding
|
|
1810
|
+
# bz2/xz here covers conn/dns/syslog/pihole/cloudtrail/sniff. These tests
|
|
1811
|
+
# observe the fix through the PUBLIC load_required_logs entry rather than
|
|
1812
|
+
# touching `_open_log` directly — the bug only manifests once discovery feeds
|
|
1813
|
+
# `_open_log`, so a sham helper-only test would miss it.
|
|
1814
|
+
|
|
1815
|
+
|
|
1816
|
+
def _make_conn_ndjson_payload() -> bytes:
    """Return two Zeek ``conn`` rows as UTF-8 NDJSON bytes.

    Each row is serialised with ``json.dumps`` and terminated by a newline.
    All addresses are RFC 5737 documentation placeholders.
    """
    first_row = {
        "_path": "conn",
        "ts": 1_779_750_000.0,
        "id.orig_h": "192.0.2.10",
        "id.resp_h": "198.51.100.20",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    second_row = {
        "_path": "conn",
        "ts": 1_779_753_600.0,
        "id.orig_h": "192.0.2.11",
        "id.resp_h": "203.0.113.20",
        "id.resp_p": 22,
        "proto": "tcp",
    }
    text = "".join(json.dumps(row) + "\n" for row in (first_row, second_row))
    return text.encode("utf-8")
|
|
1838
|
+
|
|
1839
|
+
|
|
1840
|
+
def test_load_required_logs_decompresses_bz2(tmp_path: Path) -> None:
    """A bzip2-compressed ``conn.log.bz2`` loads as two clean conn rows.

    The load must report two records, raise no warnings, and expose the
    first row's source, destination and port as written.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    compressed = bz2.compress(_make_conn_ndjson_payload())
    (zeek_dir / "conn.log.bz2").write_bytes(compressed)

    result = load_required_logs(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [zeek_dir]},
    )

    conn_frame = result.logs["conn*.log*"]
    assert result.record_counts == {"conn*.log*": 2}
    assert result.warnings == []
    first_row = conn_frame[["src", "dst", "port"]].iloc[0]
    assert list(first_row) == ["192.0.2.10", "198.51.100.20", 443]
|
|
1859
|
+
|
|
1860
|
+
|
|
1861
|
+
def test_load_required_logs_decompresses_xz(tmp_path: Path) -> None:
    """An xz-compressed ``conn.log.xz`` loads as two clean conn rows.

    The load must report two records, raise no warnings, and expose the
    first row's source, destination and port as written.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    compressed = lzma.compress(_make_conn_ndjson_payload())
    (zeek_dir / "conn.log.xz").write_bytes(compressed)

    result = load_required_logs(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [zeek_dir]},
    )

    conn_frame = result.logs["conn*.log*"]
    assert result.record_counts == {"conn*.log*": 2}
    assert result.warnings == []
    first_row = conn_frame[["src", "dst", "port"]].iloc[0]
    assert list(first_row) == ["192.0.2.10", "198.51.100.20", 443]
|
|
1880
|
+
|
|
1881
|
+
|
|
1882
|
+
def test_load_required_logs_corrupt_bz2_skips_with_warning(tmp_path: Path) -> None:
    """A ``.bz2`` file holding non-bzip2 bytes is skipped, not fatal.

    The resulting frame must be empty and the warnings must include a
    "could not be read" message naming the file.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    (zeek_dir / "conn.log.bz2").write_bytes(b"NOTBZIP2 garbage")

    result = load_required_logs(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [zeek_dir]},
    )

    assert result.logs["conn*.log*"].empty
    read_warnings = [
        w for w in result.warnings if "conn.log.bz2 could not be read" in w
    ]
    assert read_warnings
|
|
1898
|
+
|
|
1899
|
+
|
|
1900
|
+
def test_load_required_logs_corrupt_xz_skips_with_warning(tmp_path: Path) -> None:
    """A ``.xz`` file holding non-xz bytes is skipped with the corrupt warning.

    ``lzma.LZMAError`` derives directly from ``Exception`` rather than
    ``OSError``, so it needs its own handling; this test checks the failure
    ends up as a warning instead of escaping ``load_required_logs``.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    (zeek_dir / "conn.log.xz").write_bytes(b"NOTXZ garbage")

    result = load_required_logs(
        {"conn*.log*": "zeek_dir"},
        {"zeek_dir": [zeek_dir]},
    )

    assert result.logs["conn*.log*"].empty
    # The warning must land in the "incomplete or corrupt" branch — proves
    # `lzma.LZMAError` is recognised by `_zeek_file_read_warning`, not in the
    # generic class-name fallback. This is the load-bearing wrinkle assertion.
    corrupt_warnings = [
        w
        for w in result.warnings
        if "conn.log.xz could not be read" in w and "incomplete or corrupt" in w
    ]
    assert corrupt_warnings
|
|
1922
|
+
|
|
1923
|
+
|
|
1924
|
+
# ── load_pihole: corrupt compressed-file skip-with-warning ──────────────────
|
|
1925
|
+
#
|
|
1926
|
+
# Mirror of load_syslog's corrupt-handling: per-file try/except over the
|
|
1927
|
+
# decode-error family (incl. lzma.LZMAError, which isn't an OSError), so a
|
|
1928
|
+
# corrupt .gz/.bz2/.xz in a pihole_dir doesn't take down the whole load.
|
|
1929
|
+
|
|
1930
|
+
|
|
1931
|
+
@pytest.mark.parametrize("suffix, corrupt_bytes", [
    (".gz", b"NOTGZIP garbage"),
    (".bz2", b"NOTBZIP2 garbage"),
    (".xz", b"NOTXZ garbage"),
])
def test_load_pihole_corrupt_compressed_file_skipped_with_warning(
    tmp_path: Path, suffix: str, corrupt_bytes: bytes,
) -> None:
    """One corrupt compressed pihole file is skipped; its neighbour still loads.

    For each suffix a garbage-filled ``pihole.log<suffix>`` sits next to a
    valid ``pihole.log``. The single good row must come back and a "could
    not be read" warning naming the corrupt file must be collected. Which
    wording branch the warning takes is not checked here.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    good_file = pihole_dir / "pihole.log"
    good_file.write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.1\n",
        encoding="utf-8",
    )
    bad_file = pihole_dir / f"pihole.log{suffix}"
    bad_file.write_bytes(corrupt_bytes)

    warnings: list[str] = []
    df = load_pihole(pihole_dir, _warnings=warnings)

    # Good file still loaded.
    assert len(df) == 1
    # Corrupt file produced an actionable warning, not a traceback.
    expected_fragment = f"pihole.log{suffix} could not be read"
    assert any(expected_fragment in w for w in warnings)
|
|
1961
|
+
|
|
1962
|
+
|
|
1963
|
+
def test_load_pihole_corrupt_xz_lands_in_incomplete_or_corrupt_branch(
    tmp_path: Path,
) -> None:
    """A corrupt pihole ``.xz`` yields the "incomplete or corrupt" warning.

    Checks that the warning for the garbage ``pihole.log.xz`` carries both
    the "could not be read" text and the "incomplete or corrupt" wording,
    i.e. it did not fall through to a generic message.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    (pihole_dir / "pihole.log.xz").write_bytes(b"NOTXZ garbage")

    warnings: list[str] = []
    load_pihole(pihole_dir, _warnings=warnings)

    corrupt_warnings = [
        w
        for w in warnings
        if "pihole.log.xz could not be read" in w and "incomplete or corrupt" in w
    ]
    assert corrupt_warnings
|
|
1980
|
+
|
|
1981
|
+
|
|
1982
|
+
# ── load_pihole: truncated (trailer-corrupt) compressed file honesty rail ──
|
|
1983
|
+
#
|
|
1984
|
+
# Truncated compressed files yield valid-looking lines until the trailer
|
|
1985
|
+
# check raises. Pre-honesty-fix, those pre-EOF rows leaked into the returned
|
|
1986
|
+
# frame even as the loader warned the file had been "skipped". Honesty rail:
|
|
1987
|
+
# a file the loader warns it skipped contributes ZERO rows.
|
|
1988
|
+
|
|
1989
|
+
# Twenty newline-terminated dnsmasq query lines (minutes 00-19), used as the
# plaintext that the truncation tests compress and then cut short.
_PIHOLE_TRUNCATE_PAYLOAD = "".join(
    f"Jun 1 12:{idx:02d}:00 dnsmasq[1]: query[A] host{idx}.example.test from 192.0.2.{idx + 1}\n"
    for idx in range(20)
)
|
|
1993
|
+
|
|
1994
|
+
|
|
1995
|
+
def _pihole_truncated_compressed(payload: bytes, suffix: str) -> bytes:
    """Compress *payload* for *suffix* and drop the final byte of the result.

    Supported suffixes are ``.gz``, ``.bz2`` and ``.xz``; anything else
    raises ``ValueError``. Cutting the last byte leaves a stream that is
    valid up to its end but incomplete.
    """
    compressors = {
        ".gz": gzip.compress,
        ".bz2": bz2.compress,
        ".xz": lzma.compress,
    }
    compress = compressors.get(suffix)
    if compress is None:
        raise ValueError(f"unsupported suffix {suffix!r}")
    return compress(payload)[:-1]
|
|
2003
|
+
|
|
2004
|
+
|
|
2005
|
+
@pytest.mark.parametrize("suffix", [".gz", ".bz2", ".xz"])
def test_load_pihole_trailer_corrupt_compressed_contributes_zero_rows(
    tmp_path: Path, suffix: str,
) -> None:
    """A truncated compressed pihole file warns and adds no rows at all.

    The truncated file is written next to a valid one-line ``pihole.log``.
    After loading, a "could not be read" warning must name the truncated
    file and the frame must hold only the companion's row.
    """
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    # Good companion — one identifiable query line.
    companion = pihole_dir / "pihole.log"
    companion.write_text(
        "Jun 1 23:59:00 dnsmasq[1]: query[A] companion.example.test from 192.0.2.99\n",
        encoding="utf-8",
    )
    truncated_blob = _pihole_truncated_compressed(
        _PIHOLE_TRUNCATE_PAYLOAD.encode("utf-8"), suffix,
    )
    (pihole_dir / f"pihole.log{suffix}").write_bytes(truncated_blob)

    warnings: list[str] = []
    df = load_pihole(pihole_dir, _warnings=warnings)

    expected_fragment = f"pihole.log{suffix} could not be read"
    assert any(expected_fragment in w for w in warnings)
    # Only the companion row survives; rows decoded from the truncated file
    # before its end must not leak into the frame.
    assert len(df) == 1
    assert df.iloc[0]["query"] == "companion.example.test"
|
|
2035
|
+
|
|
2036
|
+
# ── load_required_logs threading for the flat readers ──────────────────────
|
|
2037
|
+
|
|
2038
|
+
|
|
2039
|
+
def test_load_required_logs_syslog_corrupt_xz_does_not_traceback(
    tmp_path: Path,
) -> None:
    """A corrupt ``system.log.xz`` in syslog_dir degrades to a warning.

    Run through the public ``load_required_logs`` entry: the good
    ``router.log`` row must still load, and the result's warnings must carry
    the "incomplete or corrupt" read warning for the ``.xz`` file rather
    than an ``lzma.LZMAError`` propagating to the caller.
    """
    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    (syslog_dir / "system.log.xz").write_bytes(b"NOTXZ garbage")
    # Good companion file so the load still returns rows.
    companion = syslog_dir / "router.log"
    companion.write_text(
        "<134>May 31 12:00:00 router sshd[100]: Accepted publickey for user\n",
        encoding="utf-8",
    )

    result = load_required_logs(
        {"*.log*": "syslog_dir"},
        {"syslog_dir": [syslog_dir]},
    )

    syslog_frame = result.logs["*.log*"]
    assert len(syslog_frame) == 1
    assert syslog_frame.iloc[0]["host"] == "router"
    corrupt_warnings = [
        w
        for w in result.warnings
        if "system.log.xz could not be read" in w and "incomplete or corrupt" in w
    ]
    assert corrupt_warnings
|
|
2068
|
+
|
|
2069
|
+
|
|
2070
|
+
def test_load_required_logs_pihole_corrupt_xz_does_not_traceback(
    tmp_path: Path,
) -> None:
    """Pi-hole counterpart of the syslog corrupt-xz test.

    A garbage `pihole.log.xz` next to a valid `pihole.log` must yield the
    one valid row plus a read warning rather than a traceback.
    """
    log_root = tmp_path / "pihole"
    log_root.mkdir()
    corrupt_archive = log_root / "pihole.log.xz"
    corrupt_archive.write_bytes(b"NOTXZ garbage")
    companion = log_root / "pihole.log"
    companion.write_text(
        "Jun  1 12:00:00 dnsmasq[1]: query[A] example.test from 192.0.2.1\n",
        encoding="utf-8",
    )

    pattern = "pihole*.log*"
    loaded = load_required_logs(
        {pattern: "pihole_dir"},
        {"pihole_dir": [log_root]},
    )

    frame = loaded.logs[pattern]
    assert len(frame) == 1
    # The warning must both name the file and classify the failure.
    matching = [
        msg
        for msg in loaded.warnings
        if "pihole.log.xz could not be read" in msg
        and "incomplete or corrupt" in msg
    ]
    assert matching
|
|
2094
|
+
|
|
2095
|
+
|
|
2096
|
+
def test_load_required_logs_gz_regression(tmp_path: Path) -> None:
    """Guard that `.gz` ingestion is unchanged by the bz2/xz additions."""
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    compressed = gzip.compress(_make_conn_ndjson_payload())
    (zeek_root / "conn.log.gz").write_bytes(compressed)

    pattern = "conn*.log*"
    loaded = load_required_logs(
        {pattern: "zeek_dir"},
        {"zeek_dir": [zeek_root]},
    )

    # Both payload records load and nothing is reported as a problem.
    assert loaded.record_counts == {pattern: 2}
    assert loaded.warnings == []
|
|
2111
|
+
|
|
2112
|
+
|
|
2113
|
+
# ── CoverageTracker: tri-state SourceCoverage contract ─────────────────────────
|
|
2114
|
+
#
|
|
2115
|
+
# The tracker is the single mechanism every loader (and the runner's flat-Zeek
|
|
2116
|
+
# default-window block) uses to record what was attempted vs what was kept.
|
|
2117
|
+
# These tests pin the four arms of `coverage(frame_empty)` plus the kept
|
|
2118
|
+
# short-circuit.
|
|
2119
|
+
|
|
2120
|
+
|
|
2121
|
+
def test_coverage_tracker_no_files_read_returns_none_full_rows() -> None:
    """A fresh tracker with no file noted reports ``(None, None)``.

    Models the date-pruned dated-Zeek case: discovery returned no files, so
    the loader never entered its per-file loop before asking for
    ``coverage(frame_empty=True)``.
    """
    tracker = CoverageTracker()
    expected = SourceCoverage(None, None)
    assert tracker.coverage(True) == expected
|
|
2126
|
+
|
|
2127
|
+
|
|
2128
|
+
def test_coverage_tracker_files_read_no_valid_ts_returns_zero_full_rows() -> None:
    """Files opened but no valid-ts row observed → ``(0, None)``.

    Covers empty / header-only / unparseable-ts inputs. This is the arm
    that drives the runner's NO-note branch (a parse gap, not a window gap).
    """
    tracker = CoverageTracker()
    for _ in range(2):
        tracker.note_file_read()
    expected = SourceCoverage(0, None)
    assert tracker.coverage(True) == expected
|
|
2136
|
+
|
|
2137
|
+
|
|
2138
|
+
def test_coverage_tracker_observe_counts_valid_ts_and_tracks_span() -> None:
    """``observe(ts)`` counts valid timestamps and folds them into min/max.

    ``None`` and NaN inputs must be ignored so they neither inflate the row
    count nor contaminate the span.
    """
    tracker = CoverageTracker()
    tracker.note_file_read()
    # Three valid samples (deliberately out of order) followed by two
    # invalid ones.
    for sample in (100.0, 200.0, 50.0, None, float("nan")):
        tracker.observe(sample)

    cov = tracker.coverage(True)
    assert cov is not None
    assert cov.full_rows == 3
    assert cov.full_span is not None
    earliest, latest = cov.full_span
    assert earliest.timestamp() == 50.0
    assert latest.timestamp() == 200.0
|
|
2155
|
+
|
|
2156
|
+
|
|
2157
|
+
def test_coverage_tracker_observe_frame_counts_valid_ts_and_tracks_span() -> None:
    """``observe_frame(pre_df)`` counts valid-ts rows of a pre-window frame.

    The frame's min/max are folded into the running span; rows whose ``ts``
    is NaN are excluded from both the count and the span.
    """
    tracker = CoverageTracker()
    tracker.note_file_read()
    ts_values = [10.0, 20.0, float("nan"), 30.0]
    tracker.observe_frame(pd.DataFrame({"ts": ts_values}))

    cov = tracker.coverage(True)
    assert cov is not None
    assert cov.full_rows == 3
    assert cov.full_span is not None
    span_start = cov.full_span[0]
    span_end = cov.full_span[1]
    assert span_start.timestamp() == 10.0
    assert span_end.timestamp() == 30.0
|
|
2171
|
+
|
|
2172
|
+
|
|
2173
|
+
def test_coverage_tracker_kept_short_circuits_to_none() -> None:
    """Once ``mark_kept`` latches, ``coverage`` reports ``None``.

    A row surviving the window means no disclosure is needed. Observations
    made after the latch are intended to be cheap no-ops (the
    zero-normal-path-cost rail).
    """
    tracker = CoverageTracker()
    tracker.note_file_read()
    tracker.observe(100.0)
    tracker.mark_kept()
    # Observes after the latch are not supposed to add to valid_rows.
    for late_ts in (200.0, 300.0):
        tracker.observe(late_ts)

    # Non-empty frame (data survived) → coverage is suppressed.
    assert tracker.coverage(False) is None
    # Defensive: even frame_empty=True is suppressed once kept is set; the
    # runner never passes True when data survived.
    assert tracker.coverage(True) is None
|
|
2189
|
+
|
|
2190
|
+
|
|
2191
|
+
def test_coverage_tracker_frame_nonempty_returns_none() -> None:
    """First branch of ``coverage()``: a surviving frame yields ``None``.

    This holds regardless of the kept latch, which is never set here.
    """
    tracker = CoverageTracker()
    tracker.note_file_read()
    tracker.observe(100.0)
    result = tracker.coverage(False)
    assert result is None
|
|
2198
|
+
|
|
2199
|
+
|
|
2200
|
+
# ── Per-loader coverage integration (loader-level, no runner) ─────────────────
|
|
2201
|
+
|
|
2202
|
+
|
|
2203
|
+
def test_load_logs_dated_zeek_outside_window_writes_coverage_none(
    tmp_path: Path,
) -> None:
    """Date-pruned dated Zeek still records coverage.

    Every dated subdirectory falls outside the requested window, so
    discover_zeek_files returns no files. The early-return branch must
    nevertheless write coverage so the runner's bare-note path fires.
    """
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    stale_day = zeek_root / "2025-01-01"
    stale_day.mkdir()
    stale_ts = datetime(2025, 1, 1, tzinfo=timezone.utc).timestamp()
    record = {
        "ts": stale_ts,
        "id.orig_h": "192.0.2.1",
        "id.resp_h": "198.51.100.1",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(stale_day / "conn.log", [record])

    # Request a window years after the only dated directory.
    window_start = datetime(2030, 1, 1, tzinfo=timezone.utc)
    window_end = datetime(2030, 12, 31, tzinfo=timezone.utc)
    cov_out: dict = {}
    frame = load_logs(
        zeek_root,
        "conn*.log*",
        since=window_start,
        until=window_end,
        _coverage=cov_out,
    )

    assert frame.empty
    assert "coverage" in cov_out
    assert cov_out["coverage"] == SourceCoverage(None, None)
|
|
2230
|
+
|
|
2231
|
+
|
|
2232
|
+
def test_load_logs_empty_zeek_file_writes_coverage_zero(tmp_path: Path) -> None:
    """An empty Zeek file is read but contributes no valid-ts rows.

    Expected coverage is ``(0, None)``, the PARSE-GAP arm. The runner
    suppresses notes here because advising the operator to widen the window
    on a file with no data would mislead (Glenn #2).
    """
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    empty_log = zeek_root / "conn.log"
    empty_log.write_text("", encoding="utf-8")

    cov_out: dict = {}
    frame = load_logs(zeek_root, "conn*.log*", _coverage=cov_out)

    assert frame.empty
    recorded = cov_out.get("coverage")
    assert recorded == SourceCoverage(0, None)
|
|
2246
|
+
|
|
2247
|
+
|
|
2248
|
+
def test_load_logs_populated_writes_no_coverage_entry(tmp_path: Path) -> None:
    """A normal in-window load leaves the coverage out-param untouched.

    This is the mark_kept short-circuit: for a populated frame the
    tracker's ``coverage(False)`` is ``None``, so nothing is written.
    """
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    record = {
        "ts": 1_700_000_000.0,
        "id.orig_h": "192.0.2.1",
        "id.resp_h": "198.51.100.1",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(zeek_root / "conn.log", [record])

    cov_out: dict = {}
    frame = load_logs(zeek_root, "conn*.log*", _coverage=cov_out)

    assert not frame.empty
    assert "coverage" not in cov_out
|
|
2263
|
+
|
|
2264
|
+
|
|
2265
|
+
def test_load_pihole_stale_data_writes_coverage_span(tmp_path: Path) -> None:
    """Stale Pi-hole archive: every timestamp precedes the requested window.

    Coverage should carry ``full_rows`` (valid-ts rows seen before
    windowing) together with a span derived from those rows. This is the
    stale-Pi-hole motivating bug, reproduced at the loader level.
    """
    log_root = tmp_path / "pihole"
    log_root.mkdir()
    # The year is spelled out in the fixture so year-guess heuristics
    # cannot drift it.
    fixture_lines = (
        "Jun  1 12:00:00 2025 dnsmasq[1]: query[A] example.test from 192.0.2.10\n"
        "Jun  1 12:01:00 2025 dnsmasq[1]: reply example.test is 203.0.113.1\n"
    )
    (log_root / "pihole.log").write_text(fixture_lines, encoding="utf-8")

    window_start = datetime(2030, 1, 1, tzinfo=timezone.utc)
    window_end = datetime(2030, 12, 31, tzinfo=timezone.utc)
    cov_out: dict = {}
    frame = load_pihole(
        log_root,
        since=window_start,
        until=window_end,
        _coverage=cov_out,
    )

    assert frame.empty
    cov = cov_out.get("coverage")
    assert cov is not None
    # Rows may year-guess differently; the point is that the loader writes
    # SPAN coverage (full_rows > 0 with a non-None span), not parse-gap.
    if cov.full_rows is None or cov.full_rows <= 0:
        # The year-suffixed format may parse to no valid ts under some
        # heuristics; accept the parse-gap arm instead of failing.
        assert cov.full_rows == 0
    else:
        assert cov.full_span is not None
|
|
2298
|
+
|
|
2299
|
+
|
|
2300
|
+
def test_load_pihole_wrong_family_only_skips_silently(tmp_path: Path) -> None:
    """A wrong-family file in pihole_dir must not register as read.

    The NDJSON guard skips the NDJSON file, so note_file_read never fires
    and the tracker sees zero files read. The runner suppresses notes for
    non-Zeek "no files read" cases anyway, but the loader's contract is to
    record truthfully.
    """
    log_root = tmp_path / "pihole"
    log_root.mkdir()
    zeek_like_rows = [{"ts": 1.0, "extra": "irrelevant"}]
    _write_ndjson(log_root / "looks-like-zeek.log", zeek_like_rows)

    cov_out: dict = {}
    frame = load_pihole(log_root, _coverage=cov_out)

    assert frame.empty
    # files_read=False (note_file_read suppressed by the wrong-family
    # guard) → full_rows is None at the LOADER level. The runner turns this
    # into "no note" because the BARE-note arm is zeek_dir-only.
    recorded = cov_out.get("coverage")
    assert recorded == SourceCoverage(None, None)
|
|
2321
|
+
|
|
2322
|
+
|
|
2323
|
+
def test_load_cloudtrail_all_unparseable_eventtime_writes_coverage_zero(
    tmp_path: Path,
) -> None:
    """CloudTrail file whose events all carry an unparseable eventTime.

    The tracker notes the file read but ``observe()`` ignores the None
    timestamps, so coverage is ``(0, None)`` — the PARSE-GAP arm, which
    produces no note (Glenn #2).
    """
    trail_root = tmp_path / "ct"
    trail_root.mkdir()
    bad_events = [
        _ct_event(eventTime="not-a-timestamp", eventID="bad-1"),
        _ct_event(eventTime="also-not-a-time", eventID="bad-2"),
    ]
    _ct_write_ndjson(trail_root / "events.json.log", bad_events)

    cov_out: dict = {}
    frame = load_cloudtrail(trail_root, _coverage=cov_out)

    assert frame.empty
    recorded = cov_out.get("coverage")
    assert recorded == SourceCoverage(0, None)
|
|
2341
|
+
|
|
2342
|
+
|
|
2343
|
+
def test_load_cloudtrail_stale_data_writes_coverage_span(tmp_path: Path) -> None:
    """CloudTrail events that all predate the window yield SPAN coverage."""
    trail_root = tmp_path / "ct"
    trail_root.mkdir()
    stale_events = [
        _ct_event(eventTime="2025-06-01T12:00:00Z", eventID="a"),
        _ct_event(eventTime="2025-06-02T12:00:00Z", eventID="b"),
    ]
    _ct_write_ndjson(trail_root / "events.json.log", stale_events)

    window_start = datetime(2030, 1, 1, tzinfo=timezone.utc)
    window_end = datetime(2030, 12, 31, tzinfo=timezone.utc)
    cov_out: dict = {}
    frame = load_cloudtrail(
        trail_root,
        since=window_start,
        until=window_end,
        _coverage=cov_out,
    )

    assert frame.empty
    cov = cov_out.get("coverage")
    assert cov is not None
    # Both events were seen pre-window and contribute to the span.
    assert cov.full_rows == 2
    assert cov.full_span is not None
|
|
2366
|
+
|
|
2367
|
+
|
|
2368
|
+
def test_load_required_logs_assembles_per_pattern_coverage(tmp_path: Path) -> None:
    """``LoadResult.coverage`` is keyed by the pattern the runner reads.

    load_required_logs builds the mapping from each load_*'s ``_coverage``
    out-param, filing it under the SAME pattern key.
    """
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    stale_day = zeek_root / "2025-01-01"
    stale_day.mkdir()
    record = {
        "ts": datetime(2025, 1, 1, tzinfo=timezone.utc).timestamp(),
        "id.orig_h": "192.0.2.1",
        "id.resp_h": "198.51.100.1",
        "id.resp_p": 443,
        "proto": "tcp",
    }
    _write_ndjson(stale_day / "conn.log", [record])

    pattern = "conn*.log*"
    loaded = load_required_logs(
        {pattern: "zeek_dir"},
        {"zeek_dir": [zeek_root]},
        since=datetime(2030, 1, 1, tzinfo=timezone.utc),
        until=datetime(2030, 12, 31, tzinfo=timezone.utc),
    )

    assert pattern in loaded.coverage
    assert loaded.coverage[pattern] == SourceCoverage(None, None)
|
|
2390
|
+
|
|
2391
|
+
|
|
2392
|
+
# ── run_load guarantee + _SOURCE_LOADERS tripwire + Zeek TSV regressions ─────
|
|
2393
|
+
#
|
|
2394
|
+
# These tests lock the refactor's load-bearing contracts: a fake
|
|
2395
|
+
# ``SourceLoader`` driven through ``run_load`` exercises the uniform pipeline
|
|
2396
|
+
# WITHOUT any format-specific wiring (progress + coverage + windowing +
|
|
2397
|
+
# verbose-gated skip + read-corruption rail); the tripwire asserts every
|
|
2398
|
+
# detector source-key is registered; the Zeek TSV regressions confirm the
|
|
2399
|
+
# prefix-preserving sniff hands the full header block to ``parse_tsv_log``.
|
|
2400
|
+
|
|
2401
|
+
|
|
2402
|
+
def test_run_load_fake_strategy_exercises_pipeline_mechanics(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
    capsys: pytest.CaptureFixture,
) -> None:
    """Drive a FAKE ``SourceLoader`` through ``run_load``.

    With ZERO format-specific wiring the pipeline must still honour its
    contract:

    * progress is wrapped once per file that is actually read;
    * coverage is written for empty / window-excluded loads;
    * in-window rows survive while out-of-window rows are dropped;
    * NaN-ts rows bypass the window under the ``keep`` policy;
    * the skip message reaches stderr only when ``verbose=True``;
    * a per-file decompression failure rides ``_zeek_file_read_warning``
      without aborting the load.
    """
    from loghunter.common import loader as loader_mod

    # Progress spy: records the kwargs it was handed and passes the
    # iterable through without consuming it.
    progress_calls: list[dict] = []

    def progress_spy(iterable, *, desc, show_progress=True, unit=" lines",
                     total=None, stream=None):
        progress_calls.append(
            {"desc": desc, "unit": unit, "show_progress": show_progress}
        )
        return iter(iterable)

    monkeypatch.setattr(loader_mod, "progress", progress_spy)

    # Fake stream strategy: each line is "<ts-or-NA>\t<host>" and parse
    # yields canonical row dicts.
    def fake_parse(line_iter, *, path, warnings):  # noqa: ARG001
        for raw_line in line_iter:
            stripped = raw_line.rstrip("\n")
            ts_token, host = stripped.split("\t", 1)
            if ts_token == "NA":
                ts = float("nan")
            else:
                ts = float(ts_token)
            yield {"ts": ts, "host": host, "raw": stripped}

    def fake_skip(path: Path) -> str | None:
        # Only files carrying a ".skip" extension are skipped.
        if path.suffix == ".skip":
            return f"fake: skipping {path.name}"
        return None

    strategy_keep = loader_mod.SourceLoader(
        discover=lambda p, pat, s, u: [p],  # noqa: ARG005
        mode="stream",
        parse=fake_parse,
        ts_policy="keep",
        columns=["ts", "host", "raw"],
        should_skip=fake_skip,
        normalize=None,
    )

    # File 1: one in-window row, one NaN-ts row, one out-of-window row.
    good_file = tmp_path / "good.log"
    good_file.write_text(
        f"{1.0}\tA\n"
        f"NA\tB\n"
        f"{99.0}\tC\n",
        encoding="utf-8",
    )
    # File 2: dropped by should_skip.
    skipped_file = tmp_path / "wrong.skip"
    skipped_file.write_text("ignored\n", encoding="utf-8")
    # File 3: corrupt gzip, caught by the read-corruption rail.
    corrupt_file = tmp_path / "bad.gz"
    corrupt_file.write_bytes(b"not a real gzip stream")

    window_start = datetime.fromtimestamp(0.0, tz=timezone.utc)
    window_end = datetime.fromtimestamp(10.0, tz=timezone.utc)

    # --- Run 1: quiet default, so the skip message is NOT printed. ---
    quiet_warnings: list[str] = []
    quiet_coverage: dict = {}
    quiet_frame = loader_mod.run_load(
        strategy_keep,
        [good_file, skipped_file, corrupt_file],
        pattern="",
        since=window_start,
        until=window_end,
        show_progress=True,
        verbose=False,
        _warnings=quiet_warnings,
        _coverage=quiet_coverage,
    )

    # Kept: in-window row (ts=1.0, host=A) and the NaN-ts row (host=B,
    # bypasses the window). Dropped: ts=99.0. The skipped file contributes
    # nothing and the corrupt file becomes a read warning.
    assert sorted(quiet_frame["host"].tolist()) == ["A", "B"]
    quiet_output = capsys.readouterr()
    assert "fake: skipping" not in quiet_output.err  # quiet default
    # Read-corruption rail: bad.gz yields exactly ONE warning, no traceback.
    assert any("bad.gz" in msg for msg in quiet_warnings)
    assert len(quiet_warnings) == 1
    # Progress wrapped the two readable files, not the skipped one.
    wrapped = {entry["desc"] for entry in progress_calls}
    assert wrapped == {"loaded good.log", "loaded bad.gz"}
    # mark_kept fired, so no coverage entry is written.
    assert "coverage" not in quiet_coverage

    # --- Run 2: verbose=True surfaces the skip message. ---
    progress_calls.clear()
    capsys.readouterr()  # drain
    verbose_warnings: list[str] = []
    verbose_coverage: dict = {}
    verbose_frame = loader_mod.run_load(
        strategy_keep,
        [good_file, skipped_file],
        pattern="",
        since=window_start,
        until=window_end,
        show_progress=True,
        verbose=True,
        _warnings=verbose_warnings,
        _coverage=verbose_coverage,
    )
    verbose_output = capsys.readouterr()
    assert "fake: skipping wrong.skip" in verbose_output.err
    assert sorted(verbose_frame["host"].tolist()) == ["A", "B"]

    # --- Run 3: no files at all (the date-pruned case). The result is a
    # column-stable empty frame AND coverage is still written. ---
    empty_coverage: dict = {}
    empty_frame = loader_mod.run_load(
        strategy_keep,
        [],
        pattern="",
        since=None,
        until=None,
        show_progress=False,
        verbose=False,
        _warnings=None,
        _coverage=empty_coverage,
    )
    assert empty_frame.empty
    assert list(empty_frame.columns) == ["ts", "host", "raw"]
    assert empty_coverage.get("coverage") == SourceCoverage(None, None)
|
|
2530
|
+
|
|
2531
|
+
|
|
2532
|
+
def test_run_load_drop_policy_discards_nan_ts(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """``ts_policy='drop'`` removes NaN-ts rows before windowing.

    This is the other half of the policy fork; the ``keep`` half is covered
    by the fake-strategy pipeline test.
    """
    from loghunter.common import loader as loader_mod

    def passthrough_progress(iterable, *, desc, show_progress=True,
                             unit=" lines", total=None, stream=None):
        # Stand-in for the progress wrapper: just iterate the input.
        return iter(iterable)

    monkeypatch.setattr(loader_mod, "progress", passthrough_progress)

    def fake_parse(line_iter, *, path, warnings):  # noqa: ARG001
        for raw_line in line_iter:
            stripped = raw_line.rstrip("\n")
            ts_token, host = stripped.split("\t", 1)
            if ts_token == "NA":
                ts = float("nan")
            else:
                ts = float(ts_token)
            yield {"ts": ts, "host": host, "raw": stripped}

    strategy_drop = loader_mod.SourceLoader(
        discover=lambda p, pat, s, u: [p],  # noqa: ARG005
        mode="stream",
        parse=fake_parse,
        ts_policy="drop",
        columns=["ts", "host", "raw"],
        should_skip=None,
        normalize=None,
    )

    # One valid-ts row followed by one NaN-ts row.
    mixed_file = tmp_path / "mix.log"
    mixed_file.write_text(f"{1.0}\tA\nNA\tB\n", encoding="utf-8")

    frame = loader_mod.run_load(
        strategy_drop,
        [mixed_file],
        pattern="",
        since=None,
        until=None,
        show_progress=False,
        verbose=False,
    )
    # The NaN-ts row is dropped; the valid row is kept.
    assert frame["host"].tolist() == ["A"]
|
|
2573
|
+
|
|
2574
|
+
|
|
2575
|
+
def test_source_loaders_keyspace_covers_every_detector_source_key() -> None:
    """Additive tripwire over the detector → loader registry.

    Every ``source`` key named in any detector module's ``REQUIRED_LOGS``
    or ``OPTIONAL_LOGS`` must have a ``_SOURCE_LOADERS`` entry. A new
    source family that forgets registration fails here rather than raising
    ``ValueError("unknown source key …")`` at runtime.
    """
    import importlib
    import pkgutil

    from loghunter.common.loader import _SOURCE_LOADERS
    from loghunter import detectors as _detectors_pkg

    declared_keys: set[str] = set()
    for modinfo in pkgutil.iter_modules(_detectors_pkg.__path__):
        detector = importlib.import_module(f"loghunter.detectors.{modinfo.name}")
        # A missing or falsy attribute is treated as "no logs declared".
        log_specs = list(getattr(detector, "REQUIRED_LOGS", []) or [])
        log_specs += list(getattr(detector, "OPTIONAL_LOGS", []) or [])
        for spec in log_specs:
            source_key = spec.get("source")
            if not source_key:
                continue
            declared_keys.add(source_key)

    missing = declared_keys - set(_SOURCE_LOADERS)
    assert not missing, f"detector source keys lacking _SOURCE_LOADERS entries: {missing}"
|
|
2597
|
+
|
|
2598
|
+
|
|
2599
|
+
def test_zeek_tsv_mixed_prefix_preserves_header_directives(tmp_path: Path) -> None:
    """Glenn rev-3 fix: the sniff must keep the whole TSV header block.

    The Zeek frame strategy's prefix-preserving sniff hands the FULL header
    (#separator, #fields, #types, #path) to ``parse_tsv_log`` so a real
    conn TSV with one data row parses correctly. A one-line peek would
    discard the directives and the parser would fail or return a bare
    frame.
    """
    tsv_lines = (
        "#separator \\x09\n",
        "#set_separator\t,\n",
        "#empty_field\t(empty)\n",
        "#unset_field\t-\n",
        "#path\tconn\n",
        "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\tservice\tduration\torig_bytes\tresp_bytes\tconn_state\tlocal_orig\tlocal_resp\thistory\n",
        "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\tstring\tinterval\tcount\tcount\tstring\tbool\tbool\tstring\n",
        "1748649600.000000\tCTest01\t192.0.2.10\t51514\t203.0.113.20\t443\ttcp\tssl\t3.5\t1500\t8200\tSF\tT\tF\t(empty)\n",
        "#close\t2026-06-01-12-00-00\n",
    )
    conn_log = tmp_path / "conn.log"
    conn_log.write_text("".join(tsv_lines), encoding="utf-8")

    frame = load_logs(conn_log.parent, "conn*.log*", _files=[conn_log])
    assert not frame.empty
    # The conn normalizer ran over the parsed frame, so the canonical
    # columns are present.
    assert "src" in frame.columns
    assert "dst" in frame.columns
    first_row = frame.iloc[0]
    assert first_row["src"] == "192.0.2.10"
|
|
2625
|
+
|
|
2626
|
+
|
|
2627
|
+
def test_zeek_tsv_header_only_returns_bare_empty_preserving_header_block(
    tmp_path: Path,
) -> None:
    """A header-only TSV loads to an empty frame without a traceback.

    The file holds a header block plus ``#close`` and no data row. It flows
    through the same prefix-preserving sniff into ``parse_tsv_log``, and
    the load returns a bare empty frame after normalize.
    """
    tsv_lines = (
        "#separator \\x09\n",
        "#path\tconn\n",
        "#fields\tts\tid.orig_h\tid.resp_h\tid.resp_p\tproto\n",
        "#types\ttime\taddr\taddr\tport\tenum\n",
        "#close\t2026-06-01-12-00-00\n",
    )
    conn_log = tmp_path / "conn.log"
    conn_log.write_text("".join(tsv_lines), encoding="utf-8")

    frame = load_logs(conn_log.parent, "conn*.log*", _files=[conn_log])
    # The prefix was preserved, so the parser saw a complete header and did
    # not raise. The resulting empty frame is bare: Zeek empties never
    # column-stabilize.
    assert frame.empty
|
|
2650
|
+
|
|
2651
|
+
|
|
2652
|
+
def test_load_logs_single_file_bypass_runs_on_dated_zeek_basename(
    tmp_path: Path,
) -> None:
    """Regression for the digest's single-file Zeek bypass.

    A file whose basename does NOT match ``conn*.log*`` (here the dated
    rotation ``2026-06-09.conn.log``) must still load when passed through
    ``_files=[file]``: discovery is SKIPPED and the file goes straight
    through the Zeek strategy. ``run_digest`` relies on this for files it
    routes by sniff rather than by glob.
    """
    tsv_lines = (
        "#separator \\x09\n",
        "#path\tconn\n",
        "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\tservice\tduration\torig_bytes\tresp_bytes\tconn_state\tlocal_orig\tlocal_resp\thistory\n",
        "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\tstring\tinterval\tcount\tcount\tstring\tbool\tbool\tstring\n",
        "1748649600.000000\tCTest01\t192.0.2.10\t51514\t203.0.113.20\t443\ttcp\tssl\t3.5\t1500\t8200\tSF\tT\tF\t(empty)\n",
        "#close\t2026-06-01-12-00-00\n",
    )
    dated_log = tmp_path / "2026-06-09.conn.log"
    dated_log.write_text("".join(tsv_lines), encoding="utf-8")

    # The pattern is the GLOB the digest passes through (``conn*.log*``).
    # The basename does not match it, but ``_files=`` short-circuits
    # discovery, so the file loads anyway.
    frame = load_logs(dated_log.parent, "conn*.log*", _files=[dated_log])
    assert not frame.empty
    first_row = frame.iloc[0]
    assert first_row["src"] == "192.0.2.10"
|
|
2678
|
+
|
|
2679
|
+
|
|
2680
|
+
# ── Flat-log rotation-peek windowing (syslog + pihole) ───────────────────────
|
|
2681
|
+
#
|
|
2682
|
+
# since/until are DERIVED by parsing the fixture lines themselves (not a
|
|
2683
|
+
# hardcoded year), so the tests are independent of the machine clock AND
|
|
2684
|
+
# inherently exercise clock parity with parse_timestamp's year-guess heuristic.
|
|
2685
|
+
|
|
2686
|
+
|
|
2687
|
+
def _dns_line(mon: str, day: int, hh: str = "12:00:00") -> str:
    """Build a dnsmasq/Pi-hole query line stamped ``mon day hh``.

    The day is right-aligned to width two, so single-digit days get a
    leading space.
    """
    stamp = f"{mon} {day:>2} {hh}"
    return f"{stamp} dnsmasq[1]: query[A] example.test from 192.0.2.1"
|
|
2690
|
+
|
|
2691
|
+
|
|
2692
|
+
def _sys_line(mon: str, day: int, hh: str = "12:00:00") -> str:
    """Build an RFC 3164 syslog line stamped ``mon day hh``.

    The day is right-aligned to width two, so single-digit days get a
    leading space.
    """
    stamp = f"{mon} {day:>2} {hh}"
    return f"{stamp} host1 sshd[1]: session opened for user"
|
|
2695
|
+
|
|
2696
|
+
|
|
2697
|
+
def _write_rot(path: Path, first_line: str, *, compress: bool = False) -> None:
    """Write a one-line rotation file at ``path``.

    ``first_line`` is the file's OLDEST row; a falsy value produces a file
    containing only a newline. With ``compress=True`` the same text is
    written through gzip instead of as plain UTF-8.
    """
    payload = (first_line or "") + "\n"
    if not compress:
        path.write_text(payload, encoding="utf-8")
        return
    with gzip.open(path, "wt", encoding="utf-8") as sink:
        sink.write(payload)
|
|
2705
|
+
|
|
2706
|
+
|
|
2707
|
+
def _make_rot_family(
    dirpath: Path,
    base: str,
    ts_by_ordinal: dict[int, tuple[str, int]],
    *,
    line_fn=_dns_line,
) -> None:
    """Create a rotation family under ``dirpath``.

    Ordinal 0 is written as ``base`` and ordinal N as ``base.N``. Each
    file's single line is ``line_fn(mon, day)`` for that ordinal's
    ``(mon, day)`` pair, so the pair controls the file's oldest-row
    timestamp. ``dirpath`` is created if it does not exist.
    """
    dirpath.mkdir(parents=True, exist_ok=True)
    for ordinal, stamp in ts_by_ordinal.items():
        mon, day = stamp
        if ordinal == 0:
            filename = base
        else:
            filename = f"{base}.{ordinal}"
        _write_rot(dirpath / filename, line_fn(mon, day))
|
|
2720
|
+
|
|
2721
|
+
|
|
2722
|
+
# Clock parity (binding) — peek ts EQUALS the ts the loader filters on.
|
|
2723
|
+
|
|
2724
|
+
def test_rotation_peek_ts_matches_loader_ts_pihole(tmp_path: Path) -> None:
    """Clock parity for Pi-hole: the peeked ts equals the loaded row's ts."""
    # The December case exercises the year-rollback.
    cases = (("Jun", 1), ("Dec", 25))
    for month, day_of_month in cases:
        log_file = tmp_path / f"pihole_{month}.log"
        _write_rot(log_file, _dns_line(month, day_of_month))
        peeked = _peek_first_ts(log_file)
        assert peeked is not None
        loaded_ts = load_pihole(log_file).iloc[0]["ts"]
        assert loaded_ts == peeked.timestamp()
|
|
2731
|
+
|
|
2732
|
+
|
|
2733
|
+
def test_rotation_peek_ts_matches_loader_ts_syslog(tmp_path: Path) -> None:
    """Clock parity for syslog: the peeked ts equals the loaded row's ts."""
    cases = (("Jun", 2), ("Dec", 31))
    for month, day_of_month in cases:
        log_file = tmp_path / f"sys_{month}.log"
        _write_rot(log_file, _sys_line(month, day_of_month))
        peeked = _peek_first_ts(log_file)
        assert peeked is not None
        loaded_ts = load_syslog(log_file).iloc[0]["ts"]
        assert loaded_ts == peeked.timestamp()
|
|
2740
|
+
|
|
2741
|
+
|
|
2742
|
+
# Per-group selection.
|
|
2743
|
+
|
|
2744
|
+
def test_rotation_per_group_two_dirs_keeps_both_straddles(tmp_path: Path) -> None:
    """Directories /a and /b each hold {log, .1, .2, .3} with identical dates.

    BOTH ``.2`` files straddle ``since``, so both must be kept and only each
    group's older ``.3`` skipped. A single-stream early-stop would skip b's ``.2``.
    """
    ordinal_days = {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)}
    dir_a = tmp_path / "a"
    dir_b = tmp_path / "b"
    for group_dir in (dir_a, dir_b):
        _make_rot_family(group_dir, "pihole.log", ordinal_days)
    candidates = sorted(dir_a.glob("*")) + sorted(dir_b.glob("*"))
    cutoff = parse_timestamp(_dns_line("Jun", 5))
    kept, info = _rotation_windowed_files(candidates, cutoff, None)

    kept_in_a = {path.name for path in kept if path.parent == dir_a}
    kept_in_b = {path.name for path in kept if path.parent == dir_b}
    for kept_names in (kept_in_a, kept_in_b):
        assert "pihole.log.2" in kept_names
    for kept_names in (kept_in_a, kept_in_b):
        assert "pihole.log.3" not in kept_names
    assert info.loaded == 6
    assert info.skipped == 2
    assert not info.fallback
|
|
2759
|
+
|
|
2760
|
+
|
|
2761
|
+
def test_rotation_per_group_per_host_independent(tmp_path: Path) -> None:
    """Two families sharing ONE directory are windowed separately.

    ``router.*`` is the newer family and keeps its tail, while the older
    ``server.*`` family is pruned on its own — grouping is per (parent, base).
    """
    log_dir = tmp_path / "sys"
    _make_rot_family(
        log_dir,
        "router.log",
        {0: ("Jun", 10), 1: ("Jun", 9), 2: ("Jun", 8)},
        line_fn=_sys_line,
    )
    _make_rot_family(
        log_dir,
        "server.log",
        {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4)},
        line_fn=_sys_line,
    )
    candidates = sorted(log_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 7))
    kept, info = _rotation_windowed_files(candidates, cutoff, None)
    kept_names = {path.name for path in kept}
    # Router tail kept (all ≥ since).
    assert "router.log.2" in kept_names
    # Server pruned independently of router.
    assert "server.log.1" not in kept_names
    assert "server.log.2" not in kept_names
    assert info.loaded == 4
    assert info.skipped == 2
|
|
2775
|
+
|
|
2776
|
+
|
|
2777
|
+
def test_rotation_early_stop_single_group_skips_old_tail(tmp_path: Path) -> None:
    """Within one group the empty active file, the in-window ``.1`` and the
    straddling ``.2`` are selected; the older ``.3`` is skipped and NEVER peeked,
    so it is recorded with a ``None`` ts (no fabricated timestamp)."""
    group_dir = tmp_path / "p"
    group_dir.mkdir()
    _write_rot(group_dir / "pihole.log", "")  # empty active → conservative include
    _write_rot(group_dir / "pihole.log.1", _dns_line("Jun", 6))
    _write_rot(group_dir / "pihole.log.2", _dns_line("Jun", 4))  # straddle
    _write_rot(group_dir / "pihole.log.3", _dns_line("Jun", 2))  # old → skipped, not read
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_dns_line("Jun", 5))
    kept, info = _rotation_windowed_files(candidates, cutoff, None)
    kept_names = {path.name for path in kept}
    assert kept_names == {"pihole.log", "pihole.log.1", "pihole.log.2"}
    assert info.skipped == 1
    assert ("pihole.log.3", None) in info.skipped_files
|
|
2792
|
+
|
|
2793
|
+
|
|
2794
|
+
def test_rotation_conservative_includes_unpeekable_and_corrupt(tmp_path: Path) -> None:
    """Files that cannot be peeked — a blank-only file and a corrupt ``.gz`` —
    are INCLUDED rather than aborting the selection, and they do not break the
    monotonic chain."""
    group_dir = tmp_path / "p"
    group_dir.mkdir()
    _write_rot(group_dir / "pihole.log", _dns_line("Jun", 6))
    blank_only = group_dir / "pihole.log.1"
    blank_only.write_text("\n \n", encoding="utf-8")  # unpeekable
    corrupt_gz = group_dir / "pihole.log.2.gz"
    corrupt_gz.write_bytes(b"not a gzip stream")  # corrupt → peek raises
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_dns_line("Jun", 1))  # very old → all in window
    kept, info = _rotation_windowed_files(candidates, cutoff, None)
    kept_names = {path.name for path in kept}
    assert kept_names == {"pihole.log", "pihole.log.1", "pihole.log.2.gz"}
    assert info.skipped == 0
    assert not info.fallback
|
|
2807
|
+
|
|
2808
|
+
|
|
2809
|
+
def test_rotation_fallback_is_data_true_whole_pattern(tmp_path: Path) -> None:
    """A single out-of-order group turns pruning off for the WHOLE pattern.

    The full file set comes back with ``skipped == 0``, and the well-formed
    group's would-be-skipped tail is genuinely present in the selection
    (data-true, not merely note-suppressed).
    """
    dir_a = tmp_path / "a"
    dir_b = tmp_path / "b"
    _make_rot_family(
        dir_a,
        "pihole.log",
        {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)},
    )
    # b: log(Jun 8) then .1(Jun 10) — going newest→oldest the first-ts RISES → disorder.
    _make_rot_family(dir_b, "pihole.log", {0: ("Jun", 8), 1: ("Jun", 10)})
    candidates = sorted(dir_a.glob("*")) + sorted(dir_b.glob("*"))
    cutoff = parse_timestamp(_dns_line("Jun", 5))
    kept, info = _rotation_windowed_files(candidates, cutoff, None)
    assert info.fallback is True
    assert info.skipped == 0
    assert info.loaded == len(candidates)
    kept_resolved = {path.resolve() for path in kept}
    all_resolved = {path.resolve() for path in candidates}
    assert kept_resolved == all_resolved
    # Well-formed group A's .3 would be rotation-skipped without fallback — present here.
    a_tail_present = any(
        path.parent == dir_a and path.name == "pihole.log.3" for path in kept
    )
    assert a_tail_present
|
|
2825
|
+
|
|
2826
|
+
|
|
2827
|
+
def test_syslog_files_drops_appledouble_and_orders_numerically(tmp_path: Path) -> None:
    """``_syslog_files`` omits the ``._`` sidecar file and lists rotation
    suffixes in numeric order (``.2`` before ``.10``)."""
    group_dir = tmp_path / "p"
    group_dir.mkdir()
    expected_order = ["pihole.log", "pihole.log.1", "pihole.log.2", "pihole.log.10"]
    for filename in ["._pihole.log", *expected_order]:
        _write_rot(group_dir / filename, _dns_line("Jun", 1))
    listed = [path.name for path in _syslog_files(group_dir)]
    assert "._pihole.log" not in listed
    assert listed == expected_order
|
|
2835
|
+
|
|
2836
|
+
|
|
2837
|
+
# Explicit-file protection (load_required_logs end-to-end).
|
|
2838
|
+
|
|
2839
|
+
def test_rotation_lone_explicit_old_file_no_windowing_no_skip(tmp_path: Path) -> None:
    """A single explicitly named OLD file is loaded as-is: it is never
    rotation-windowed and no RotationSkipInfo is recorded for its pattern."""
    old_file = tmp_path / "pihole.log.5"
    _write_rot(old_file, _dns_line("Jun", 1))
    cutoff = parse_timestamp(_dns_line("Jun", 10))
    result = load_required_logs(
        {"*.log*": "pihole_dir"},
        {"pihole_dir": [old_file]},
        since=cutoff,
    )
    assert "*.log*" not in result.rotation_skips
    assert result.data_size_bytes == old_file.stat().st_size
|
|
2847
|
+
|
|
2848
|
+
|
|
2849
|
+
def test_rotation_explicit_overlap_loads_not_skipped(tmp_path: Path) -> None:
    """A path the window WOULD skip, given both explicitly AND via its
    directory, is loaded (its bytes are counted) and does NOT appear in the skip
    count — no fake skip."""
    group_dir = tmp_path / "p"
    _make_rot_family(
        group_dir,
        "pihole.log",
        {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)},
    )
    # The window would skip .3; the explicit input protects it.
    explicit_file = group_dir / "pihole.log.3"
    cutoff = parse_timestamp(_dns_line("Jun", 5))
    result = load_required_logs(
        {"*.log*": "pihole_dir"},
        {"pihole_dir": [explicit_file, group_dir]},
        since=cutoff,
    )
    info = result.rotation_skips["*.log*"]
    assert info.skipped == 0
    assert info.skipped_files == []
    family_names = ("pihole.log", "pihole.log.1", "pihole.log.2", "pihole.log.3")
    total_bytes = sum((group_dir / name).stat().st_size for name in family_names)
    assert result.data_size_bytes == total_bytes
|
|
2863
|
+
|
|
2864
|
+
|
|
2865
|
+
def test_rotation_no_window_reads_all_no_skip(tmp_path: Path) -> None:
    """A bare load with neither since nor until reads every file: no peek and
    no RotationSkipInfo for the pattern."""
    group_dir = tmp_path / "p"
    _make_rot_family(
        group_dir,
        "pihole.log",
        {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)},
    )
    result = load_required_logs({"*.log*": "pihole_dir"}, {"pihole_dir": [group_dir]})
    assert "*.log*" not in result.rotation_skips
    family_names = ("pihole.log", "pihole.log.1", "pihole.log.2", "pihole.log.3")
    total_bytes = sum((group_dir / name).stat().st_size for name in family_names)
    assert result.data_size_bytes == total_bytes
|
|
2873
|
+
|
|
2874
|
+
|
|
2875
|
+
def test_rotation_windows_syslog_dir_family(tmp_path: Path) -> None:
    """Rotation windowing also engages for ``syslog_dir`` (both flat families
    share the helper)."""
    group_dir = tmp_path / "s"
    _make_rot_family(
        group_dir,
        "router.log",
        {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)},
        line_fn=_sys_line,
    )
    cutoff = parse_timestamp(_sys_line("Jun", 5))
    result = load_required_logs(
        {"*.log*": "syslog_dir"},
        {"syslog_dir": [group_dir]},
        since=cutoff,
    )
    info = result.rotation_skips["*.log*"]
    assert info.loaded == 3
    assert info.skipped == 1
    assert not info.fallback
|
|
2883
|
+
|
|
2884
|
+
|
|
2885
|
+
def test_rotation_verbose_skip_lines_tolerate_none_ts(
    tmp_path: Path, capsys: pytest.CaptureFixture
) -> None:
    """With ``verbose=True`` one skip line is printed to stderr per skipped file.

    An unpeeked tail file (``None`` ts) gets NO '(oldest …)' detail — nothing is
    fabricated — while a peeked too-new leading file shows its real ts. With
    the default ``verbose=False`` stderr stays empty.
    """
    group_dir = tmp_path / "p"
    ordinal_days = {
        0: ("Jun", 10),  # too-new leading (oldest > until) → skipped, ts known
        1: ("Jun", 8),
        2: ("Jun", 6),
        3: ("Jun", 4),  # straddle since → kept
        4: ("Jun", 2),  # too-old tail → skipped, NOT peeked → ts None
    }
    _make_rot_family(group_dir, "pihole.log", ordinal_days)
    candidates = sorted(group_dir.glob("*"))
    lower = parse_timestamp(_dns_line("Jun", 5))
    upper = parse_timestamp(_dns_line("Jun", 9))

    _rotation_windowed_files(candidates, lower, upper, verbose=False)
    quiet_err = capsys.readouterr().err
    assert quiet_err == ""

    _rotation_windowed_files(candidates, lower, upper, verbose=True)
    verbose_err = capsys.readouterr().err
    # None ts → no detail after the file name.
    assert "rotation-peek: skipped pihole.log.4\n" in verbose_err
    # Peeked file → real ts detail follows.
    assert "rotation-peek: skipped pihole.log (oldest " in verbose_err
|
|
2910
|
+
|
|
2911
|
+
|
|
2912
|
+
# ── date-stamped rotation-peek pruning (dateext + exporter output) ───────────
|
|
2913
|
+
#
|
|
2914
|
+
# Filename dates are ORDERING/grouping hints (+ the Family-2 structural overlap
|
|
2915
|
+
# check); the line first-ts stays the sole prune gate. Fixtures keep the
|
|
2916
|
+
# filename-date order aligned with the line-ts order so the peek's monotonicity
|
|
2917
|
+
# check does not fire. since/until are derived from fixture LINES, never a
|
|
2918
|
+
# hardcoded year.
|
|
2919
|
+
|
|
2920
|
+
|
|
2921
|
+
def _make_dateext_family(
    dirpath: Path,
    base: str,
    dated: list[tuple[str, int, int]],
    *,
    live: tuple[str, int] | None = None,
    year: int = 2026,
    line_fn=_dns_line,
) -> None:
    """Create a logrotate ``dateext`` family inside ``dirpath``.

    ``dated`` holds one ``(mon_name, mon_num, day)`` triple per ``base.YYYYMMDD``
    file. ``live``, when given, is the ``(mon_name, day)`` of the undated
    ``base`` head. Every file's first line is built from the matching
    month/day, so its peek ts aligns with the filename-date order.
    """
    dirpath.mkdir(parents=True, exist_ok=True)
    if live is not None:
        head_path = dirpath / base
        _write_rot(head_path, line_fn(live[0], live[1]))
    for month_name, month_number, day_of_month in dated:
        date_suffix = f"{year}{month_number:02d}{day_of_month:02d}"
        dated_path = dirpath / f"{base}.{date_suffix}"
        _write_rot(dated_path, line_fn(month_name, day_of_month))
|
|
2940
|
+
|
|
2941
|
+
|
|
2942
|
+
def _make_export_family(
    dirpath: Path,
    base: str,
    days: list[int],
    *,
    year: int = 2026,
    mon_num: int = 6,
    mon_name: str = "Jun",
    line_fn=_dns_line,
) -> None:
    """Create non-overlapping daily exporter files inside ``dirpath``.

    One ``{base}_{YYYYMMDD}_1d.log`` file is written per entry in ``days``
    (`_auto_filename`'s whole-day ``_Nd`` shape). Each first line carries that
    day, so the peek ts aligns with the filename-date order.
    """
    dirpath.mkdir(parents=True, exist_ok=True)
    for day_of_month in days:
        date_stamp = f"{year}{mon_num:02d}{day_of_month:02d}"
        export_path = dirpath / f"{base}_{date_stamp}_1d.log"
        _write_rot(export_path, line_fn(mon_name, day_of_month))
|
|
2959
|
+
|
|
2960
|
+
|
|
2961
|
+
# Classifier-level (helper) coverage.
|
|
2962
|
+
|
|
2963
|
+
def test_rotation_eight_digit_non_date_stays_numeric() -> None:
    """An 8-digit trailing token that is not a valid calendar date (month 13)
    is classified as a numeric ordinal rather than dateext: the age_rank is the
    raw int and there is no window."""
    classified = _classify_rotation_name("pihole.log.20241301")
    assert classified == ("pihole.log", 20241301, None)
|
|
2967
|
+
|
|
2968
|
+
|
|
2969
|
+
def test_rotation_export_to_form_classifies_and_orders() -> None:
    """The ``_to_`` exporter form parses to ``[start, end_date + (HH+1) h)``.

    The end is CEILed to the next hour so the window is a guaranteed superset
    of the real until, and a newer start date yields a smaller age_rank (it
    sorts as newer).
    """
    older_base, older_rank, older_window = _classify_rotation_name(
        "export_20260601_to_20260608_14h.log"
    )
    assert older_base == "export"
    # 14h → ceil 15h
    assert older_window == (datetime(2026, 6, 1), datetime(2026, 6, 8, 15))

    newer_base, newer_rank, newer_window = _classify_rotation_name(
        "export_20260605_to_20260606_00h.log"
    )
    assert newer_base == "export"
    assert newer_window == (datetime(2026, 6, 5), datetime(2026, 6, 6, 1))
    # Later start (Jun 5 > Jun 1) → newer → smaller rank.
    assert newer_rank < older_rank
|
|
2979
|
+
|
|
2980
|
+
|
|
2981
|
+
def test_rotation_export_huge_days_falls_to_floor() -> None:
    """FIX 1 — an unbounded ``_Nd`` day count that overflows the date math is
    caught instead of raised, and the name falls to the floor singleton."""
    filename = "foo_20260101_9999999d.log"
    floor_singleton = ("foo_20260101_9999999d.log", 0, None)
    assert _classify_rotation_name(filename) == floor_singleton
|
|
2989
|
+
|
|
2990
|
+
|
|
2991
|
+
def test_rotation_export_nonpositive_window_falls_to_floor() -> None:
    """A malformed non-positive export window carries NO declared window.

    Both an empty ``_0d`` and an inverted ``_to_`` (end ≤ start) would read as
    disjoint and dodge the guards, so each floors to a singleton instead.
    """
    empty_window = _classify_rotation_name("splunk_20260601_0d.log")
    assert empty_window == ("splunk_20260601_0d.log", 0, None)

    inverted_window = _classify_rotation_name("export_20260608_to_20260601_00h.log")
    assert inverted_window == ("export_20260608_to_20260601_00h.log", 0, None)
|
|
3005
|
+
|
|
3006
|
+
|
|
3007
|
+
def test_rotation_export_zero_day_does_not_silently_skip_sibling(tmp_path: Path) -> None:
    """P1 regression — a malformed ``_0d`` export-looking file next to a normal
    same-start ``_1d`` export must NOT silently skip the normal file.

    Flooring the ``_0d`` name gives it its own base, so each file is peeked
    independently and BOTH survive.
    """
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    _write_rot(group_dir / "splunk_20260601_0d.log", _sys_line("Jun", 1, "06:00:00"))
    _write_rot(group_dir / "splunk_20260601_1d.log", _sys_line("Jun", 1, "18:00:00"))
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 1, "12:00:00"))
    kept, info = _rotation_windowed_files(candidates, cutoff, None)
    kept_names = {path.name for path in kept}
    # The in-window normal file is NOT skipped.
    assert "splunk_20260601_1d.log" in kept_names
    # The floored _0d singleton survives too.
    assert "splunk_20260601_0d.log" in kept_names
    assert not info.fallback
|
|
3022
|
+
|
|
3023
|
+
|
|
3024
|
+
def test_rotation_export_classify_superset_of_auto_filename() -> None:
    """FOLD 6 / FIX 3 — the classifier window is always a SUPERSET of the real
    ``[since, until)`` that ``exporters._auto_filename`` encoded.

    This couples ``_EXPORT_WINDOW_RE`` to the exporter format (a future format
    change that disengaged the guard would fail here) and pins the ``_to_``
    ceil property.
    """
    real_windows = (
        # whole-day _Nd: exact window (both endpoints midnight)
        (datetime(2026, 6, 1), datetime(2026, 6, 8)),
        # partial-day _to_: non-midnight endpoints → start floors, end ceils → superset
        (datetime(2026, 6, 1, 3, 30), datetime(2026, 6, 8, 14, 45)),
    )
    for since, until in real_windows:
        export_name = _auto_filename("splunk", since, until)
        win = _classify_rotation_name(export_name)[2]
        assert win is not None
        assert win[0] <= since
        assert win[1] >= until
|
|
3037
|
+
|
|
3038
|
+
|
|
3039
|
+
def test_rotation_export_partnn_falls_to_floor() -> None:
    """A name with a ``_partNN`` infix is NOT claimed as an export window; it
    falls to the singleton floor (loaded-not-pruned), the safe behavior."""
    filename = "splunk_20260601_1d_part01.log"
    floor_singleton = ("splunk_20260601_1d_part01.log", 0, None)
    assert _classify_rotation_name(filename) == floor_singleton
|
|
3047
|
+
|
|
3048
|
+
|
|
3049
|
+
# Per-group selection (helper) coverage.
|
|
3050
|
+
|
|
3051
|
+
def test_rotation_dateext_now_prunes(tmp_path: Path) -> None:
    """dateext families now PRUNE instead of falling back: the live head and
    dated files order newest→oldest and the old tail is skipped."""
    group_dir = tmp_path / "p"
    dated_files = [("Jun", 6, 5), ("Jun", 6, 4), ("Jun", 6, 3)]
    _make_dateext_family(group_dir, "pihole.log", dated=dated_files, live=("Jun", 6))
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_dns_line("Jun", 5))
    kept, info = _rotation_windowed_files(candidates, cutoff, None)
    kept_names = {path.name for path in kept}
    must_be_kept = {"pihole.log", "pihole.log.20260605", "pihole.log.20260604"}
    assert must_be_kept <= kept_names
    assert "pihole.log.20260603" not in kept_names  # old tail skipped
    assert info.loaded == 3
    assert info.skipped == 1
    assert not info.fallback
|
|
3067
|
+
|
|
3068
|
+
|
|
3069
|
+
def test_rotation_dateext_peek_ts_matches_loader_ts(tmp_path: Path) -> None:
    """Clock parity for a dateext-named file: the peeked ts EQUALS the ts of
    the loader's first row."""
    log_path = tmp_path / "auth.log.20260625"
    _write_rot(log_path, _sys_line("Jun", 25))
    peeked = _peek_first_ts(log_path)
    assert peeked is not None
    first_row_ts = load_syslog(log_path).iloc[0]["ts"]
    assert first_row_ts == peeked.timestamp()
|
|
3076
|
+
|
|
3077
|
+
|
|
3078
|
+
def test_rotation_export_window_prunes(tmp_path: Path) -> None:
    """Non-overlapping daily exporter files order newest→oldest and the file
    older than the straddle is pruned."""
    group_dir = tmp_path / "s"
    _make_export_family(group_dir, "splunk", [6, 5, 4, 3], line_fn=_sys_line)
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5))
    kept, info = _rotation_windowed_files(candidates, cutoff, None)
    kept_names = {path.name for path in kept}
    must_be_kept = {
        "splunk_20260606_1d.log",
        "splunk_20260605_1d.log",
        "splunk_20260604_1d.log",
    }
    assert must_be_kept <= kept_names
    assert "splunk_20260603_1d.log" not in kept_names
    assert info.loaded == 3
    assert info.skipped == 1
    assert not info.fallback
|
|
3089
|
+
|
|
3090
|
+
|
|
3091
|
+
def test_rotation_export_overlap_falls_back(tmp_path: Path) -> None:
    """A ``_7d`` window that overlaps a ``_1d`` daily under the same base forces
    a whole-pattern fallback: nothing skipped, full set returned, reason
    'overlapping export windows'."""
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    # [Jun 1, Jun 8)
    _write_rot(group_dir / "splunk_20260601_7d.log", _sys_line("Jun", 1))
    # [Jun 5, Jun 6) ⊂ above
    _write_rot(group_dir / "splunk_20260605_1d.log", _sys_line("Jun", 5))
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5))
    kept, info = _rotation_windowed_files(candidates, cutoff, None)
    assert info.fallback is True
    assert info.fallback_reason == "overlapping export windows"
    assert info.skipped == 0
    assert info.loaded == len(candidates)
    kept_resolved = {path.resolve() for path in kept}
    all_resolved = {path.resolve() for path in candidates}
    assert kept_resolved == all_resolved
|
|
3105
|
+
|
|
3106
|
+
|
|
3107
|
+
def test_rotation_export_equal_window_duplicate_falls_back(tmp_path: Path) -> None:
    """Equal-window duplicates (the silent-miss class) trigger fallback, NOT
    pruning.

    Also proves compression stripping: a ``.log`` and its ``.log.gz`` classify
    to the same base+window once only the compression suffix is stripped.
    """
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    _write_rot(group_dir / "splunk_20260601_1d.log", _sys_line("Jun", 1))
    _write_rot(
        group_dir / "splunk_20260601_1d.log.gz",
        _sys_line("Jun", 1),
        compress=True,
    )
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 1))
    _kept, info = _rotation_windowed_files(candidates, cutoff, None)
    assert info.fallback is True
    assert info.fallback_reason == "overlapping export windows"
    assert info.skipped == 0
    assert info.loaded == len(candidates)
|
|
3121
|
+
|
|
3122
|
+
|
|
3123
|
+
# Same-rank duplicate slots (FIX 2) — un-orderable, fall back for ALL schemes.
|
|
3124
|
+
|
|
3125
|
+
def test_rotation_dateext_same_date_duplicate_falls_back(tmp_path: Path) -> None:
    """A dateext file and its ``.gz`` sibling collapse to ONE age_rank, which is
    an un-orderable duplicate → whole-pattern fallback (NOT a silent skip of the
    in-window ``.gz``)."""
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    _write_rot(group_dir / "auth.log.20260605", _sys_line("Jun", 5, "06:00:00"))
    _write_rot(
        group_dir / "auth.log.20260605.gz",
        _sys_line("Jun", 5, "18:00:00"),
        compress=True,
    )
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5, "12:00:00"))
    _kept, info = _rotation_windowed_files(candidates, cutoff, None)
    assert info.fallback is True
    assert info.fallback_reason == "duplicate rotation files"
    assert info.skipped == 0
    assert info.loaded == len(candidates)
|
|
3138
|
+
|
|
3139
|
+
|
|
3140
|
+
def test_rotation_numeric_duplicate_falls_back(tmp_path: Path) -> None:
    """A numeric rotation and its ``.gz`` sibling share a stripped name, so the
    pattern falls back with 'duplicate rotation files' (closes the pre-existing
    numeric-dup silent-miss)."""
    group_dir = tmp_path / "p"
    group_dir.mkdir()
    _write_rot(group_dir / "pihole.log", _dns_line("Jun", 6))
    _write_rot(group_dir / "pihole.log.2", _dns_line("Jun", 4))
    _write_rot(group_dir / "pihole.log.2.gz", _dns_line("Jun", 4), compress=True)
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_dns_line("Jun", 5))
    _kept, info = _rotation_windowed_files(candidates, cutoff, None)
    assert info.fallback is True
    assert info.fallback_reason == "duplicate rotation files"
    assert info.skipped == 0
|
|
3154
|
+
|
|
3155
|
+
|
|
3156
|
+
def test_rotation_live_compressed_duplicate_falls_back(tmp_path: Path) -> None:
    """A live ``.log`` together with its ``.log.gz`` (same stripped name) falls
    back with 'duplicate rotation files' — the head-of-group duplicate slot."""
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    _write_rot(group_dir / "auth.log", _sys_line("Jun", 6))
    _write_rot(group_dir / "auth.log.gz", _sys_line("Jun", 6), compress=True)
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5))
    _kept, info = _rotation_windowed_files(candidates, cutoff, None)
    assert info.fallback is True
    assert info.fallback_reason == "duplicate rotation files"
|
|
3167
|
+
|
|
3168
|
+
|
|
3169
|
+
def test_rotation_zero_indexed_prunes_not_dup(tmp_path: Path) -> None:
    """A 0-indexed scheme (``auth.log`` and ``.0`` BOTH age_rank 0) is NOT a
    duplicate, because the stripped names are distinct.

    The out-of-window tail is PRUNED with ``fallback=False`` and no 'duplicate
    rotation files' note. (The age_rank-tie test flagged this falsely.)
    """
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    _write_rot(group_dir / "auth.log", _sys_line("Jun", 6))
    _write_rot(group_dir / "auth.log.0", _sys_line("Jun", 5))
    _write_rot(group_dir / "auth.log.1", _sys_line("Jun", 4))  # straddle since
    _write_rot(group_dir / "auth.log.2", _sys_line("Jun", 3))  # out of window → skipped
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 5))
    kept, info = _rotation_windowed_files(candidates, cutoff, None)
    assert info.fallback is False
    assert info.fallback_reason is None  # NOT a misleading "duplicate" note
    kept_names = {path.name for path in kept}
    assert "auth.log.2" not in kept_names
    assert info.skipped == 1
|
|
3187
|
+
|
|
3188
|
+
|
|
3189
|
+
def test_rotation_leading_zero_not_a_dup(tmp_path: Path) -> None:
    """``.02`` and ``.2`` both int-rank 2 yet are DISTINCT files (distinct
    stripped names), so they are not flagged as a duplicate and selection
    proceeds past the dup branch."""
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    _write_rot(group_dir / "s.log.2", _sys_line("Jun", 5))
    _write_rot(group_dir / "s.log.02", _sys_line("Jun", 4))
    candidates = sorted(group_dir.glob("*"))
    cutoff = parse_timestamp(_sys_line("Jun", 1))  # very old → all in window
    _kept, info = _rotation_windowed_files(candidates, cutoff, None)
    assert info.fallback_reason != "duplicate rotation files"
|
|
3200
|
+
|
|
3201
|
+
|
|
3202
|
+
# End-to-end (load_required_logs) coverage — real discovery → window_select →
|
|
3203
|
+
# run_load seam: selected ROWS and the RotationSkipInfo must agree.
|
|
3204
|
+
|
|
3205
|
+
def test_rotation_dateext_prunes_end_to_end_pihole(tmp_path: Path) -> None:
    """dateext pruning through the ``pihole_dir`` loader: three files are
    selected, and the straddle file's out-of-window row is then trimmed by the
    precise row filter."""
    group_dir = tmp_path / "p"
    dated_files = [("Jun", 6, 5), ("Jun", 6, 4), ("Jun", 6, 3)]
    _make_dateext_family(group_dir, "pihole.log", dated=dated_files, live=("Jun", 6))
    cutoff = parse_timestamp(_dns_line("Jun", 5))
    result = load_required_logs(
        {"*.log*": "pihole_dir"},
        {"pihole_dir": [group_dir]},
        since=cutoff,
    )
    info = result.rotation_skips["*.log*"]
    assert info.loaded == 3
    assert info.skipped == 1
    assert not info.fallback
    frame = result.logs["*.log*"]
    # NOTE(review): fromtimestamp() converts to LOCAL time; if parse_timestamp
    # yields a tz-aware value this day check may be TZ-sensitive — confirm.
    days_seen = set()
    for row_ts in frame["ts"]:
        days_seen.add(datetime.fromtimestamp(row_ts).day)
    # Jun 3 pruned (file); Jun 4 straddle file kept but its row trimmed.
    assert days_seen == {5, 6}
|
|
3221
|
+
|
|
3222
|
+
|
|
3223
|
+
def test_rotation_export_equal_window_fallback_end_to_end_syslog(tmp_path: Path) -> None:
    """Equal-window export duplicates through the ``syslog_dir`` loader produce
    a full read (both rows) with the fallback recorded under the overlap
    reason."""
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    _write_rot(group_dir / "splunk_20260601_1d.log", _sys_line("Jun", 1))
    _write_rot(
        group_dir / "splunk_20260601_1d.log.gz",
        _sys_line("Jun", 1),
        compress=True,
    )
    cutoff = parse_timestamp(_sys_line("Jun", 1))
    result = load_required_logs(
        {"*.log*": "syslog_dir"},
        {"syslog_dir": [group_dir]},
        since=cutoff,
    )
    info = result.rotation_skips["*.log*"]
    assert info.fallback is True
    assert info.fallback_reason == "overlapping export windows"
    assert info.skipped == 0
    # Both files read (full archive), both in window.
    row_count = len(result.logs["*.log*"])
    assert row_count == 2
|
|
3237
|
+
|
|
3238
|
+
|
|
3239
|
+
def test_rotation_export_huge_days_end_to_end_no_crash(tmp_path: Path) -> None:
    """FIX 1 end-to-end — an overflow-inducing ``_Nd`` name in a flat dir loads
    without a raw OverflowError reaching the runner.

    The name floors to its OWN base (a singleton group) and is peeked
    independently.
    """
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    _write_rot(group_dir / "foo_20260101_9999999d.log", _sys_line("Jun", 5))
    _write_rot(group_dir / "server.log", _sys_line("Jun", 6))
    cutoff = parse_timestamp(_sys_line("Jun", 5))
    result = load_required_logs(
        {"*.log*": "syslog_dir"},
        {"syslog_dir": [group_dir]},
        since=cutoff,
    )
    # No crash; both in-window rows present.
    row_count = len(result.logs["*.log*"])
    assert row_count == 2
|
|
3250
|
+
|
|
3251
|
+
|
|
3252
|
+
def test_rotation_dateext_duplicate_rows_survive_end_to_end(tmp_path: Path) -> None:
    """FIX 2 end-to-end — the duplicate's in-window row survives the full read
    that the duplicate triggers.

    This is the silent-miss the fix closes: without the guard the ``.gz``
    sibling's 18:00 row would be skipped as 'older tail'.
    """
    group_dir = tmp_path / "s"
    group_dir.mkdir()
    _write_rot(group_dir / "auth.log.20260605", _sys_line("Jun", 5, "06:00:00"))
    _write_rot(
        group_dir / "auth.log.20260605.gz",
        _sys_line("Jun", 5, "18:00:00"),
        compress=True,
    )
    cutoff = parse_timestamp(_sys_line("Jun", 5, "12:00:00"))
    result = load_required_logs(
        {"*.log*": "syslog_dir"},
        {"syslog_dir": [group_dir]},
        since=cutoff,
    )
    info = result.rotation_skips["*.log*"]
    assert info.fallback
    assert info.fallback_reason == "duplicate rotation files"
    # The .gz sibling's 18:00 row survived the full read; the 06:00 row is the
    # ONLY one trimmed by the precise since-filter. Compare against the parsed
    # ts — clock parity, TZ-robust (parse_timestamp's tz vs a local
    # fromtimestamp would skew).
    loaded_ts = set(result.logs["*.log*"]["ts"])
    morning_ts = parse_timestamp(_sys_line("Jun", 5, "06:00:00")).timestamp()
    evening_ts = parse_timestamp(_sys_line("Jun", 5, "18:00:00")).timestamp()
    assert len(result.logs["*.log*"]) == 1
    assert evening_ts in loaded_ts
    assert morning_ts not in loaded_ts
|
|
3272
|
+
|
|
3273
|
+
|
|
3274
|
+
# ── universal default window: family helpers ─────────────────────────────────
|
|
3275
|
+
|
|
3276
|
+
|
|
3277
|
+
def test_is_bounded_family_neutral_and_zeek_alias() -> None:
    """``is_bounded`` depends on path shape only, and ``is_zeek_bounded`` gives
    the same answer for the same input."""
    assert is_bounded([]) is False
    this_file = Path(__file__)  # a real regular file
    this_dir = Path(__file__).parent  # a real directory
    assert is_bounded([this_file]) is True
    assert is_bounded([this_dir]) is False
    assert is_bounded([this_file, this_dir]) is False
    # Alias is byte-identical for the digest path.
    for sample in (this_file, this_dir):
        assert is_zeek_bounded([sample]) == is_bounded([sample])
|
|
3288
|
+
|
|
3289
|
+
|
|
3290
|
+
def test_source_ts_policy() -> None:
    """Each registered source strategy declares its own ``ts_policy``.

    syslog and pihole are "keep" (rows with an unparseable ts are retained);
    zeek and cloudtrail are "drop". An unregistered source name has no entry
    in ``_SOURCE_LOADERS`` at all.
    """
    declared = {
        "syslog_dir": "keep",
        "pihole_dir": "keep",
        "zeek_dir": "drop",
        "cloudtrail_dir": "drop",
    }
    for source_name, policy in declared.items():
        assert _SOURCE_LOADERS[source_name].ts_policy == policy
    assert "unknown_dir" not in _SOURCE_LOADERS
|
|
3299
|
+
|
|
3300
|
+
|
|
3301
|
+
def test_apply_ts_filter_keep_null_retains_nan_rows() -> None:
    """``keep_null=True`` keeps NaN-ts rows next to the in-window rows.

    Calling without ``keep_null`` (the default) must drop the NaN-ts row, so
    existing callers see no change.
    """
    import math

    anchor = datetime(2026, 6, 5, 12, 0, tzinfo=timezone.utc)
    since = anchor - timedelta(days=1)
    rows = [
        {"ts": anchor.timestamp(), "m": "in"},
        {"ts": (anchor - timedelta(days=5)).timestamp(), "m": "old"},
        {"ts": float("nan"), "m": "nan"},
    ]
    frame = pd.DataFrame(rows)

    with_nulls = _apply_ts_filter(frame, since, anchor, keep_null=True)
    assert set(with_nulls["m"]) == {"in", "nan"}

    without_nulls = _apply_ts_filter(frame, since, anchor)  # default
    assert set(without_nulls["m"]) == {"in"}
    assert not any(math.isnan(value) for value in without_nulls["ts"])
|
|
3317
|
+
|
|
3318
|
+
|
|
3319
|
+
def test_flat_family_default_floor_pihole_and_syslog(tmp_path: Path) -> None:
    """The flat floor for a directory input is ``(latest first-ts - span, None)``.

    Checked for a pihole directory holding two rotation files and for a syslog
    directory holding one file. Expected values come from ``_peek_first_ts`` so
    both sides of the comparison use the same yearless-timestamp parsing.
    """
    one_day = timedelta(days=1)

    # pihole: rotated file starts Jun 1, live file starts Jun 5.
    ph_root = tmp_path / "pihole"
    ph_root.mkdir()
    rotated = ph_root / "pihole.log.1"
    live = ph_root / "pihole.log"
    rotated.write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] a.test from 192.0.2.1\n", encoding="utf-8"
    )
    live.write_text(
        "Jun 5 12:00:00 dnsmasq[1]: query[A] b.test from 192.0.2.1\n", encoding="utf-8"
    )
    ph_floor = _flat_default_floor(
        _SOURCE_LOADERS["pihole_dir"], [ph_root], "pihole*.log*", one_day
    )
    assert ph_floor is not None
    # The later (Jun 5) file is the anchor; peeking it keeps the check
    # independent of the current clock / year inference.
    ph_expected = _peek_first_ts(live) - one_day
    assert ph_floor[0] == ph_expected
    assert ph_floor[1] is None

    # syslog: same mechanism, *.log* discovery.
    sys_root = tmp_path / "syslog"
    sys_root.mkdir()
    host_log = sys_root / "host.log"
    host_log.write_text("Jun 5 12:00:00 host kernel: line\n", encoding="utf-8")
    sys_floor = _flat_default_floor(
        _SOURCE_LOADERS["syslog_dir"], [sys_root], "*.log*", one_day
    )
    assert sys_floor is not None
    assert sys_floor[0] == _peek_first_ts(host_log) - one_day
    assert sys_floor[1] is None
|
|
3352
|
+
|
|
3353
|
+
|
|
3354
|
+
def test_flat_family_default_floor_unpeekable_returns_none(tmp_path: Path) -> None:
    """When no candidate yields a parseable first timestamp the floor is ``None``."""
    log_dir = tmp_path / "syslog"
    log_dir.mkdir()
    bad_line = "Xxx 1 12:00:00 host kernel: unparseable month\n"
    (log_dir / "host.log").write_text(bad_line, encoding="utf-8")

    result = _flat_default_floor(
        _SOURCE_LOADERS["syslog_dir"], [log_dir], "*.log*", timedelta(days=1)
    )
    assert result is None
|
|
3362
|
+
|
|
3363
|
+
|
|
3364
|
+
def test_flat_family_default_floor_excludes_explicit_files(tmp_path: Path) -> None:
    """Only directory inputs drive the anchor; an explicit file in the input
    list is ignored (1E: explicit files load regardless, must not drive the
    floor)."""
    explicit = tmp_path / "old.log"
    explicit.write_text("Jun 1 12:00:00 host kernel: old\n", encoding="utf-8")
    d = tmp_path / "syslog"
    d.mkdir()
    (d / "host.log").write_text("Jun 5 12:00:00 host kernel: new\n", encoding="utf-8")
    floor = _flat_default_floor(
        _SOURCE_LOADERS["syslog_dir"], [explicit, d], "*.log*", timedelta(days=1)
    )
    # Guard before subscripting so a None floor fails as a readable assertion
    # rather than a TypeError.
    assert floor is not None
    # Anchor is the DIR file (Jun 5), NOT the explicit Jun-1 file — proves exclusion.
    # Derive expected from the same parser (clock-independent year-rollback).
    assert floor[0] == _peek_first_ts(d / "host.log") - timedelta(days=1)
|
|
3378
|
+
|
|
3379
|
+
|
|
3380
|
+
# ── pattern-aware flat discovery (pihole narrowing; explicit-file intent) ─────
|
|
3381
|
+
|
|
3382
|
+
|
|
3383
|
+
def test_source_default_window_eligible_cloudtrail_opts_out() -> None:
    """``default_window_eligible`` is declared per strategy.

    CloudTrail is the only registered source checked here that opts out; zeek,
    syslog and pihole opt in. An unregistered name has no loader entry.
    """
    eligibility = {
        "cloudtrail_dir": False,
        "zeek_dir": True,
        "syslog_dir": True,
        "pihole_dir": True,
    }
    for source_name, eligible in eligibility.items():
        assert _SOURCE_LOADERS[source_name].default_window_eligible is eligible
    assert "unknown_dir" not in _SOURCE_LOADERS
|
|
3392
|
+
|
|
3393
|
+
|
|
3394
|
+
def test_pihole_directory_discovery_narrows_to_pattern(tmp_path: Path) -> None:
    """With the ``pihole*.log*`` pattern, a shared directory yields only the
    pihole files; with the helper's default pattern every ``.log`` file in the
    directory is returned."""
    shared = tmp_path / "shared"
    shared.mkdir()
    fixtures = {
        "pihole.log": "x\n",
        "pihole.log.1": "y\n",
        "syslog_host.log": "z\n",
        "cloudtrail.json.log": "{}\n",
    }
    for filename, body in fixtures.items():
        (shared / filename).write_text(body, encoding="utf-8")

    narrowed = {path.name for path in _syslog_files(shared, "pihole*.log*")}
    assert narrowed == {"pihole.log", "pihole.log.1"}

    # `_syslog_files`' broad default still grabs everything — it is the
    # retained Pi-hole filename helper (and backs the Pi-hole mismatch check).
    # NOTE: syslog discovery no longer uses this glob; it content-sniffs via
    # `_discover_syslog_files`.
    broad = {path.name for path in _syslog_files(shared)}
    assert broad == {
        "pihole.log", "pihole.log.1", "syslog_host.log", "cloudtrail.json.log",
    }
|
|
3412
|
+
|
|
3413
|
+
|
|
3414
|
+
def test_pihole_explicit_nonmatching_file_still_loads(tmp_path: Path) -> None:
    """An explicitly named file is returned and loaded as Pi-hole even though
    its name does not match ``pihole*.log*``; the pattern only narrows
    DIRECTORY discovery."""
    explicit = tmp_path / "events.log"
    explicit.write_text(
        "Jun 5 12:00:00 dnsmasq[1]: query[A] a.test from 192.0.2.1\n",
        encoding="utf-8",
    )

    discovered = _syslog_files(explicit, "pihole*.log*")
    assert discovered == [explicit]

    loaded = load_pihole(explicit)  # routes through the file path → loads
    assert len(loaded) == 1
|
|
3423
|
+
|
|
3424
|
+
|
|
3425
|
+
def test_pihole_plan_and_loader_one_universe(tmp_path: Path) -> None:
    """Plan-time satisfiability and the loader agree on what counts as pihole:
    a directory with only a non-pihole file is not satisfiable AND loads as an
    empty frame with the stable pihole columns."""
    from loghunter.runner import _any_input_yields_files

    only_syslog = tmp_path / "syslogonly"
    only_syslog.mkdir()
    (only_syslog / "syslog_host.log").write_text(
        "Jun 5 12:00:00 host kernel: x\n", encoding="utf-8"
    )

    # Plan: the pihole pattern finds nothing here.
    satisfiable = _any_input_yields_files("pihole_dir", [only_syslog], "pihole*.log*")
    assert satisfiable is False

    # Loader: same universe — no pihole files, so an empty column-stable frame.
    frame = load_pihole(only_syslog)
    assert len(frame) == 0
    assert list(frame.columns) == _PIHOLE_COLUMNS
|
|
3440
|
+
|
|
3441
|
+
|
|
3442
|
+
def test_pihole_dir_nonmatching_logs_disclosed_not_silent(tmp_path: Path) -> None:
    """A pihole DIRECTORY whose ``.log`` files do not match ``pihole*.log*``
    loads nothing, but the miss is surfaced through a loader warning. A
    correctly named directory loads its row and produces no such warning."""
    dnsmasq_line = "Jun 5 12:00:00 dnsmasq[1]: query[A] a.test from 192.0.2.1\n"
    required = {"pihole*.log*": "pihole_dir"}

    # Mis-named log in a shared directory: nothing loads, warning is raised.
    shared = tmp_path / "shared"
    shared.mkdir()
    (shared / "dnsmasq.log").write_text(dnsmasq_line, encoding="utf-8")
    res = load_required_logs(required, {"pihole_dir": [shared]})
    assert res.record_counts.get("pihole*.log*", 0) == 0, "non-matching name not loaded"
    assert any("none match 'pihole*.log*'" in w for w in res.warnings), res.warnings

    # A correctly-named pihole dir loads AND emits no mismatch warning.
    good = tmp_path / "pihole"
    good.mkdir()
    (good / "pihole.log").write_text(dnsmasq_line, encoding="utf-8")
    res2 = load_required_logs(required, {"pihole_dir": [good]})
    assert res2.record_counts.get("pihole*.log*", 0) == 1
    assert not any("none match" in w for w in res2.warnings), res2.warnings
|
|
3466
|
+
|
|
3467
|
+
|
|
3468
|
+
# ── syslog content-sniff discovery gate (Item E) ───────────────────────────────
|
|
3469
|
+
|
|
3470
|
+
def test_syslog_gate_accepts_extensionless_rhel_streams(tmp_path: Path) -> None:
    """Files without a ``.log`` suffix (``messages``/``secure``/``maillog``/
    ``cron``) are accepted on their RFC-3164 content, and the ``host`` column
    is taken from each line rather than from the filename."""
    varlog = tmp_path / "varlog"
    varlog.mkdir()
    streams = {
        "messages": "<134>May 31 12:00:00 host-a kernel: link up\n",
        "secure": "<134>May 31 12:01:00 host-b sshd[100]: Accepted publickey for user\n",
        "maillog": "<134>May 31 12:02:00 host-c postfix/smtpd[200]: connect from relay1\n",
        "cron": "<134>May 31 12:03:00 host-d CROND[300]: (root) CMD (placeholder)\n",
    }
    for stream_name, line in streams.items():
        (varlog / stream_name).write_text(line, encoding="utf-8")

    res = load_required_logs({"*.log*": "syslog_dir"}, {"syslog_dir": [varlog]})
    frame = res.logs["*.log*"]
    assert len(frame) == 4
    assert set(frame["host"]) == {"host-a", "host-b", "host-c", "host-d"}
    assert res.warnings == []
|
|
3493
|
+
|
|
3494
|
+
|
|
3495
|
+
def test_syslog_gate_rejects_non_syslog_logs_silently(tmp_path: Path, capsys) -> None:
    """An ISO-timestamped ``dnf.log`` and a systemd-style ``boot.log`` yield no
    rows, and neither filename appears on stderr even with ``verbose=True``.
    The RFC-3164 ``messages`` file beside them still loads."""
    varlog = tmp_path / "varlog"
    varlog.mkdir()
    rejected_dnf = varlog / "dnf.log"
    rejected_boot = varlog / "boot.log"
    accepted = varlog / "messages"
    rejected_dnf.write_text(
        "2026-06-01T12:00:00+0000 INFO --- logging initialized ---\n",
        encoding="utf-8",
    )
    rejected_boot.write_text("[ OK ] Started Some Service.\n", encoding="utf-8")
    accepted.write_text(
        "<134>May 31 12:00:00 host-a kernel: link up\n", encoding="utf-8"
    )

    res = load_required_logs(
        {"*.log*": "syslog_dir"}, {"syslog_dir": [varlog]}, verbose=True,
    )
    frame = res.logs["*.log*"]
    assert len(frame) == 1
    assert set(frame["host"]) == {"host-a"}

    stderr_text = capsys.readouterr().err
    assert "dnf.log" not in stderr_text
    assert "boot.log" not in stderr_text
|
|
3516
|
+
|
|
3517
|
+
|
|
3518
|
+
def test_syslog_gate_read_is_byte_bounded(tmp_path: Path, monkeypatch) -> None:
    """The gate must issue one bounded ``read(_SYSLOG_SNIFF_BYTES)`` and never
    iterate or ``readline`` the handle.

    A line-bounded read would scan a newline-sparse binary (wtmp/btmp/lastlog)
    all the way to EOF; this test is the regression guard against that.
    """
    import loghunter.common.loader as loader_pkg

    requested_sizes: list[int] = []

    class _Spy:
        """Handle stand-in: records ``read`` sizes and forbids line-wise access."""

        def readline(self, *a):
            raise AssertionError("gate must not readline the handle")

        def __iter__(self):
            raise AssertionError("gate must not iterate the handle")

        def read(self, n):
            requested_sizes.append(n)
            return "\x00\x00\x00"  # NUL → binary → rejected

        def __enter__(self):
            return self

        def __exit__(self, *exc):
            return False

    def _fake_open(p):
        return _Spy()

    candidate = tmp_path / "btmp"
    candidate.write_bytes(b"\x00" * 4096)
    monkeypatch.setattr(loader_pkg, "_open_log", _fake_open)

    verdict = _looks_like_syslog(candidate)
    assert verdict is False
    assert requested_sizes == [_SYSLOG_SNIFF_BYTES]
|
|
3550
|
+
|
|
3551
|
+
|
|
3552
|
+
def test_syslog_gate_accepts_dnsmasq_bearing_messages(tmp_path: Path) -> None:
    """A ``messages`` file made of dnsmasq query lines is still discovered as
    syslog: the gate applies the syslog recognizer directly (dnsmasq lines are
    RFC 3164) rather than the full sniff_format cascade, which would route
    them to dns."""
    varlog = tmp_path / "varlog"
    varlog.mkdir()
    dnsmasq_line = (
        "<30>May 31 12:00:00 host-a dnsmasq[1]: query[A] a.test from 192.0.2.1\n"
    )
    (varlog / "messages").write_text(dnsmasq_line, encoding="utf-8")

    discovered_names = [path.name for path in _discover_syslog_files(varlog)]
    assert discovered_names == ["messages"]
|
|
3562
|
+
|
|
3563
|
+
|
|
3564
|
+
def test_syslog_zero_accepted_dir_one_summary_warning(tmp_path: Path) -> None:
    """A syslog directory with only non-syslog files produces exactly ONE
    summary warning that names the directory but none of the files. A
    directory with at least one accepted stream, and an empty directory, both
    produce no such warning."""
    required = {"*.log*": "syslog_dir"}
    iso_line = "2026-06-01T12:00:00 INFO x\n"

    # Only rejected content: one ISO-timestamped log and one binary blob.
    bad = tmp_path / "bad"
    bad.mkdir()
    (bad / "dnf.log").write_text(iso_line, encoding="utf-8")
    (bad / "junk").write_bytes(b"\x00\x01\x02")
    res = load_required_logs(required, {"syslog_dir": [bad]})
    assert res.record_counts.get("*.log*", 0) == 0
    summary = [w for w in res.warnings if "looks like syslog (RFC 3164)" in w]
    assert len(summary) == 1, res.warnings
    message = summary[0]
    assert "nothing in" in message
    assert str(bad) in message
    assert "dnf.log" not in message and "junk" not in message

    # At least one accepted stream: no summary warning.
    good = tmp_path / "good"
    good.mkdir()
    (good / "messages").write_text(
        "<134>May 31 12:00:00 host-a kernel: x\n", encoding="utf-8"
    )
    res2 = load_required_logs(required, {"syslog_dir": [good]})
    assert not any("looks like syslog" in w for w in res2.warnings)

    # Empty directory: nothing to reject, so no summary warning either.
    empty = tmp_path / "empty"
    empty.mkdir()
    res3 = load_required_logs(required, {"syslog_dir": [empty]})
    assert not any("looks like syslog" in w for w in res3.warnings)
|
|
3591
|
+
|
|
3592
|
+
|
|
3593
|
+
def test_syslog_explicit_file_bypasses_gate(tmp_path: Path) -> None:
    """An explicitly named non-RFC-3164 file is treated as operator intent:
    discovery returns it unchanged and ``load_syslog`` yields its one row."""
    named = tmp_path / "dnf.log"
    named.write_text("2026-06-01T12:00:00 INFO x\n", encoding="utf-8")

    discovered = _discover_syslog_files(named)
    assert discovered == [named]

    loaded = load_syslog(named)
    assert len(loaded) == 1
|
|
3600
|
+
|
|
3601
|
+
|
|
3602
|
+
def test_syslog_plan_time_lockstep_with_gate(tmp_path: Path) -> None:
    """Plan-time satisfiability follows the content gate: a directory holding
    only ``dnf.log`` is NOT satisfiable, while one holding an RFC-3164
    ``messages`` file IS."""
    from loghunter.runner import _any_input_yields_files

    def _dir_with(dirname: str, filename: str, line: str) -> Path:
        # Build a one-file directory under tmp_path and return it.
        root = tmp_path / dirname
        root.mkdir()
        (root / filename).write_text(line, encoding="utf-8")
        return root

    dnf_only = _dir_with("dnf", "dnf.log", "2026-06-01T12:00:00 INFO x\n")
    assert _any_input_yields_files("syslog_dir", [dnf_only], "*.log*") is False

    msgs = _dir_with("msgs", "messages", "<134>May 31 12:00:00 host-a kernel: x\n")
    assert _any_input_yields_files("syslog_dir", [msgs], "*.log*") is True
|
|
3617
|
+
|
|
3618
|
+
|
|
3619
|
+
def test_syslog_default_window_floor_anchors_on_accepted_only(tmp_path: Path) -> None:
|
|
3620
|
+
"""flat_family_default_floor over a syslog dir with `dnf.log` (ISO, gate-
|
|
3621
|
+
rejected) + a binary + RFC-3164 streams anchors f_max on the MAX accepted
|
|
3622
|
+
candidate's peek ts — rejected files never contribute a peek."""
|
|
3623
|
+
d = tmp_path / "varlog"
|
|
3624
|
+
d.mkdir()
|
|
3625
|
+
(d / "messages").write_text(
|
|
3626
|
+
"<134>May 31 12:00:00 host-a kernel: x\n", encoding="utf-8")
|
|
3627
|
+
(d / "secure").write_text( # later ts → should win f_max
|
|
3628
|
+
"<134>Jun 1 12:00:00 host-b sshd[1]: x\n", encoding="utf-8")
|
|
3629
|
+
(d / "dnf.log").write_text("2026-06-01T12:00:00 INFO x\n", encoding="utf-8")
|
|
3630
|
+
(d / "junk").write_bytes(b"\x00\x01\x02")
|
|
3631
|
+
|
|
3632
|
+
span = timedelta(days=1)
|
|
3633
|
+
floor = _flat_default_floor(_SOURCE_LOADERS["syslog_dir"], [d], "*.log*", span)
|
|
3634
|
+
assert floor is not None
|
|
3635
|
+
f_max, until = floor
|
|
3636
|
+
assert until is None
|
|
3637
|
+
later = _peek_first_ts(d / "secure")
|
|
3638
|
+
assert later is not None
|
|
3639
|
+
assert f_max == later - span
|