loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,458 @@
|
|
|
1
|
+
"""Multi-positional source ingestion — CLI primary rail (rev-3 prompt).
|
|
2
|
+
|
|
3
|
+
These tests exercise the REAL CLI ↔ runner path with ``--dry-run`` and
|
|
4
|
+
``runner.run`` UNMOCKED. They prove the property the prior multi-positional
|
|
5
|
+
work could not prove because the bug was at CLI fan-in (analyze/single-detector
|
|
6
|
+
read only ``parsed["path"]`` and silently dropped the rest of ``parsed["paths"]``):
|
|
7
|
+
that N positionals fan into per-family buckets, MERGE with explicit
|
|
8
|
+
``--<family>-dir`` flags (sanctioned rail supersession; both load now), and the
|
|
9
|
+
union load runs across families.
|
|
10
|
+
|
|
11
|
+
Companion to:
|
|
12
|
+
- ``tests/test_source_resolution_seam.py`` (single-positional scope seam),
|
|
13
|
+
- ``tests/test_loader.py`` (loader-level union + dated-window guardrails),
|
|
14
|
+
- ``tests/test_sources.py`` (router + resolver primitives).
|
|
15
|
+
|
|
16
|
+
Privacy rail: RFC 5737 IPs (192.0.2.x / 198.51.100.x / 203.0.113.x) and
|
|
17
|
+
placeholder/example domains only.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import json
|
|
23
|
+
from datetime import timedelta
|
|
24
|
+
from pathlib import Path
|
|
25
|
+
|
|
26
|
+
import pytest
|
|
27
|
+
|
|
28
|
+
from loghunter import cli, runner
|
|
29
|
+
from loghunter.common import config as cfg
|
|
30
|
+
from loghunter.common import loader, sources
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# ── content fixtures (RFC 5737 + placeholder domains) ────────────────────────
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# One flat syslog line (sshd "Accepted publickey") with a placeholder host/user.
_FLAT_SYSLOG_LINE = (
    "<134>Jun 11 12:00:00 examplehost sshd[1234]: "
    "Accepted publickey for placeholder\n"
)

# One dnsmasq query line as written by Pi-hole; the client IP is RFC 5737.
_PIHOLE_LINE = (
    "Jun 11 12:00:00 piholehost dnsmasq[1234]: "
    "query[A] example.test from 192.0.2.10\n"
)

# One Zeek NDJSON conn record. ``json.dumps`` with its default separators
# yields exactly the same text as a hand-written literal would.
_ZEEK_NDJSON_CONN_LINE = json.dumps({
    "ts": 1779750000.0,
    "id.orig_h": "192.0.2.10",
    "id.resp_h": "198.51.100.20",
    "id.resp_p": 443,
    "proto": "tcp",
    "duration": 1.23,
}) + "\n"

# One Zeek NDJSON dns record (timestamp, originator, queried name).
_ZEEK_NDJSON_DNS_LINE = json.dumps({
    "ts": 1779750000.0,
    "id.orig_h": "192.0.2.10",
    "query": "example.test",
}) + "\n"

# One CloudTrail event serialized as a single NDJSON line; key order is kept
# because it determines the serialized text.
_CLOUDTRAIL_NDJSON_LINE = json.dumps(dict(
    eventVersion="1.08",
    eventTime="2026-06-01T12:00:00Z",
    userIdentity={"type": "IAMUser"},
    eventName="GetObject",
    eventSource="s3.amazonaws.com",
    sourceIPAddress="192.0.2.10",
)) + "\n"
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _write_cfg(tmp_path: Path, **keys: str) -> str:
    """Write a minimal TOML config to ``tmp_path / "cfg.toml"``.

    The file always starts with a ``[loghunter]`` header and an empty
    ``root``; each keyword argument is then appended as one
    ``key = "value"`` line, in the order given.

    Values are written as TOML basic strings, so backslashes and double
    quotes are escaped. Without the escaping, a value such as a Windows path
    (``C:\\Users\\x``) would be parsed as escape sequences and produce an
    invalid or silently different config.

    Args:
        tmp_path: Directory that receives ``cfg.toml``.
        **keys: Extra entries written after ``root``.

    Returns:
        The path of the written config file, as a string.
    """

    def _toml_basic(value: object) -> str:
        # Escape the backslash first so the quote escape is not doubled.
        return str(value).replace("\\", "\\\\").replace('"', '\\"')

    lines = ["[loghunter]", 'root = ""']
    lines.extend(f'{k} = "{_toml_basic(v)}"' for k, v in keys.items())
    cfg_path = tmp_path / "cfg.toml"
    cfg_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return str(cfg_path)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# ── PRIMARY RAIL: real cli._main + --dry-run, runner.run UNMOCKED ────────────
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def test_dns_cross_source_positionals_both_families_load(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Two ``dns`` positionals from different families both reach the dry-run.

    Equivalent command: ``loghunter dns zeek_dns.log events.log --dry-run``.
    One input holds a Zeek NDJSON DNS record, the other a Pi-hole dnsmasq
    line. The Pi-hole fixture is deliberately called ``events.log`` (NOT
    ``pihole.log``) so the test can only pass through content-sniffing, never
    through a filename match.
    """
    # Drop LOGHUNTER_ROOT and empty the config search paths so that
    # (presumably) only the explicit --config file shapes the run.
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])

    zeek_input = tmp_path / "zeek_dns.log"
    zeek_input.write_text(_ZEEK_NDJSON_DNS_LINE, encoding="utf-8")
    # Neutral filename: classification has to come from the content.
    pihole_input = tmp_path / "events.log"
    pihole_input.write_text(_PIHOLE_LINE, encoding="utf-8")

    argv = [
        "dns",
        str(zeek_input),
        str(pihole_input),
        f"--config={_write_cfg(tmp_path)}",
        "--dry-run",
    ]
    cli._main(argv)

    stdout = capsys.readouterr().out
    for listed in (zeek_input, pihole_input):
        assert str(listed) in stdout
    # Families that no positional touched must still be reported as
    # "not configured" on their own line (scope = union of touched families).
    for label in ("syslog_dir:", "cloudtrail_dir:"):
        assert label in stdout
        rest_of_line = stdout.split(label)[1].split("\n")[0]
        assert "not configured" in rest_of_line
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def test_beacon_same_family_multi_positionals_both_files_listed(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Two Zeek conn positionals for ``beacon`` both show up in the dry-run.

    Equivalent command: ``loghunter beacon a.log b.log --dry-run`` — what a
    sysadmin gets after a shell glob. A "first positional wins" rule would
    silently drop the second file; this test requires both paths in the
    printed output.
    """
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])

    conn_files = [tmp_path / "conn.day1.log", tmp_path / "conn.day2.log"]
    for conn_file in conn_files:
        conn_file.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path)
    argv = ["beacon", *map(str, conn_files), f"--config={config_file}", "--dry-run"]
    cli._main(argv)

    stdout = capsys.readouterr().out
    for conn_file in conn_files:
        assert str(conn_file) in stdout
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def test_analyze_detect_all_heterogeneous_positionals_bucket_correctly(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Mixed positionals without a detector land in the right family blocks.

    Equivalent command: ``loghunter conn.log dns.log syslog.log --dry-run``
    (the detect=all path). Both Zeek-shaped files must be listed in the
    ``zeek_dir`` section of the dry-run output and the syslog-shaped file in
    the ``syslog_dir`` section, i.e. every positional is classified on its
    own.
    """
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])

    conn_file = tmp_path / "conn.log"
    dns_file = tmp_path / "dns.log"
    syslog_file = tmp_path / "syslog.log"
    conn_file.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    dns_file.write_text(_ZEEK_NDJSON_DNS_LINE, encoding="utf-8")
    syslog_file.write_text(_FLAT_SYSLOG_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path)
    cli._main([
        str(conn_file),
        str(dns_file),
        str(syslog_file),
        f"--config={config_file}",
        "--dry-run",
    ])

    stdout = capsys.readouterr().out
    # Every positional is mentioned somewhere in the output ...
    for positional in (conn_file, dns_file, syslog_file):
        assert str(positional) in stdout

    # ... and in the right section. The slicing relies on the sections being
    # printed in the order zeek_dir, syslog_dir, pihole_dir.
    after_zeek_label = stdout.split("zeek_dir:")[1]
    zeek_section = after_zeek_label.split("syslog_dir:")[0]
    assert str(conn_file) in zeek_section
    assert str(dns_file) in zeek_section

    after_syslog_label = stdout.split("syslog_dir:")[1]
    syslog_section = after_syslog_label.split("pihole_dir:")[0]
    assert str(syslog_file) in syslog_section
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def test_flag_plus_positional_different_family_both_load(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """A positional and a flag for a different family are both loaded.

    Equivalent command: ``loghunter dns zeek.log --pihole-dir=pihole.log``.
    This is the user pattern that motivated the BUGS entry: the operator
    wanted both files, so both paths must appear in the dry-run output.
    """
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])

    positional_zeek = tmp_path / "zeek_dns.log"
    positional_zeek.write_text(_ZEEK_NDJSON_DNS_LINE, encoding="utf-8")
    flagged_pihole = tmp_path / "events.log"
    flagged_pihole.write_text(_PIHOLE_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path)
    argv = [
        "dns",
        str(positional_zeek),
        f"--pihole-dir={flagged_pihole}",
        f"--config={config_file}",
        "--dry-run",
    ]
    cli._main(argv)

    stdout = capsys.readouterr().out
    for expected in (positional_zeek, flagged_pihole):
        assert str(expected) in stdout
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def test_same_family_flag_plus_positionals_all_merge(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Positionals and a same-family flag MERGE instead of one replacing the other.

    Equivalent command: ``loghunter beacon a.log b.log --zeek-dir=c.log``.
    All three entries must contribute to ``zeek_dir`` (the sanctioned rail
    supersession from the rev-3 prompt). The intended order is positionals
    first, flag appended.

    NOTE(review): only presence of the three paths is asserted; the ordering
    described above is not checked here.
    """
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])

    first, second, via_flag = (
        tmp_path / name for name in ("conn.a.log", "conn.b.log", "conn.c.log")
    )
    for conn_file in (first, second, via_flag):
        conn_file.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path)
    cli._main([
        "beacon",
        str(first),
        str(second),
        f"--zeek-dir={via_flag}",
        f"--config={config_file}",
        "--dry-run",
    ])

    stdout = capsys.readouterr().out
    for conn_file in (first, second, via_flag):
        assert str(conn_file) in stdout
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def test_multi_positional_scope_still_suppresses_unrelated_configured_sibling(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """A configured but untouched sibling family stays out of a scoped run.

    Two syslog positionals are passed while the config file names a
    ``zeek_dir``. Scope is the UNION of the families the positionals touch
    (only ``syslog_dir`` here), so the configured Zeek directory must not
    appear and ``zeek_dir`` must read "not configured" — the sibling-leak
    fix has to survive the move to union scoping.
    """
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])

    configured_zeek = tmp_path / "configured_zeek"
    configured_zeek.mkdir()
    syslog_inputs = [tmp_path / "flat1.log", tmp_path / "flat2.log"]
    for syslog_input in syslog_inputs:
        syslog_input.write_text(_FLAT_SYSLOG_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path, zeek_dir=str(configured_zeek))
    argv = ["syslog", *map(str, syslog_inputs), f"--config={config_file}", "--dry-run"]
    cli._main(argv)

    stdout = capsys.readouterr().out
    for syslog_input in syslog_inputs:
        assert str(syslog_input) in stdout
    # The zeek_dir from the config file must not leak into the scoped run.
    assert str(configured_zeek) not in stdout
    assert "zeek_dir:" in stdout
    zeek_status_line = stdout.split("zeek_dir:")[1].split("\n")[0]
    assert "not configured" in zeek_status_line
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
# ── SECONDARY: scalar-vs-list programmatic contract ──────────────────────────
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def test_runner_run_scalar_and_list_produce_identical_dry_run_output(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """Scalar and one-element-list ``zeek_dir`` give identical dry-run output.

    ``runner.run(zeek_dir="/x")`` and ``runner.run(zeek_dir=["/x"])`` MUST
    print byte-identical dry-run text: the scalar form is the degenerate
    one-element list under ``_normalize_overrides``. Roughly 35 programmatic
    scalar callers and the Glenn-P2 rail (tests/test_root_provenance.py)
    depend on this.
    """
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)
    f = tmp_path / "conn.log"
    f.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")

    # A fresh config dict per call, so neither run can see state left behind
    # by the other.
    runner.run(config={"loghunter": {"root": ""}}, zeek_dir=str(f), dry_run=True)
    scalar_out = capsys.readouterr().out

    runner.run(config={"loghunter": {"root": ""}}, zeek_dir=[str(f)], dry_run=True)
    list_out = capsys.readouterr().out

    # Two empty captures would compare equal and prove nothing, so require
    # that the dry-run actually printed something before comparing.
    assert scalar_out.strip(), "dry-run printed nothing; equality would be vacuous"
    assert scalar_out == list_out
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
# ── SECONDARY: detect=all router fallback (None-mode) ────────────────────────
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def test_route_positional_source_none_mode_dir_falls_back_to_zeek(
    tmp_path: Path,
) -> None:
    """A directory positional with no detector module routes to ``zeek_dir``.

    This keeps today's analyze default for the detect=all / unknown-selector
    case when the input is not recognized.
    """
    some_dir = tmp_path / "some_dir"
    some_dir.mkdir()
    routed = sources.route_positional_source(str(some_dir), detector_module=None)
    assert routed == "zeek_dir"
|
|
312
|
+
|
|
313
|
+
|
|
314
|
+
def test_route_positional_source_none_mode_syslog_content_routes_syslog(
    tmp_path: Path,
) -> None:
    """A recognized flat syslog file with no detector module routes to ``syslog_dir``."""
    flat_log = tmp_path / "flat.log"
    flat_log.write_text(_FLAT_SYSLOG_LINE, encoding="utf-8")
    routed = sources.route_positional_source(str(flat_log), detector_module=None)
    assert routed == "syslog_dir"
|
|
321
|
+
|
|
322
|
+
|
|
323
|
+
def test_route_positional_source_none_mode_pihole_content_routes_pihole(
    tmp_path: Path,
) -> None:
    """Pi-hole dnsmasq content with no detector module routes to ``pihole_dir``.

    The file is named ``events.log`` — a neutral name — so the result can
    only come from sniffing the content, not from the filename.
    """
    neutral_log = tmp_path / "events.log"
    neutral_log.write_text(_PIHOLE_LINE, encoding="utf-8")
    routed = sources.route_positional_source(str(neutral_log), detector_module=None)
    assert routed == "pihole_dir"
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def test_route_positional_source_none_mode_cloudtrail_routes_cloudtrail(
    tmp_path: Path,
) -> None:
    """Recognized CloudTrail NDJSON with no detector module routes to ``cloudtrail_dir``."""
    cloudtrail_log = tmp_path / "events.json.log"
    cloudtrail_log.write_text(_CLOUDTRAIL_NDJSON_LINE, encoding="utf-8")
    routed = sources.route_positional_source(str(cloudtrail_log), detector_module=None)
    assert routed == "cloudtrail_dir"
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
def test_route_positional_source_none_mode_unrecognized_falls_back_to_zeek(
    tmp_path: Path,
) -> None:
    """Unclassifiable content with no detector module falls back to ``zeek_dir``.

    This keeps today's analyze default for inputs the sniffer cannot
    classify.
    """
    unclassifiable = tmp_path / "garbage.log"
    unclassifiable.write_text("not log content, just words\n" * 5, encoding="utf-8")
    routed = sources.route_positional_source(str(unclassifiable), detector_module=None)
    assert routed == "zeek_dir"
|
|
352
|
+
|
|
353
|
+
|
|
354
|
+
# ── SECONDARY: plan-time satisfiability lockstep ─────────────────────────────
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def test_pihole_satisfiability_via_neutral_filename_lockstep_with_loader(
    tmp_path: Path,
) -> None:
    """A neutrally named Pi-hole file must be satisfiable at plan time.

    Glenn req #2: plan-time pihole satisfiability uses ``_syslog_files``
    (file-or-dir, ``*.log*``), NOT ``directory.glob(pattern)``. A Pi-hole
    file called ``events.log`` therefore has to count as plan-satisfiable,
    matching what the loader will actually ingest. The old glob-on-pattern
    check would reject ``events.log`` (no ``pihole`` prefix) while the loader
    happily reads it — drift between plan and loader.
    """
    # Imported here rather than at module level, as in the original test;
    # the helper is private to the runner.
    from loghunter.runner import _is_optional_satisfiable

    f = tmp_path / "events.log"
    f.write_text(_PIHOLE_LINE, encoding="utf-8")

    # The requirement's pattern deliberately does NOT match ``events.log``.
    req = {"source": "pihole_dir", "pattern": "pihole*.log*"}
    # Single-input shape (degenerate one-element list).
    assert _is_optional_satisfiable(req, {"pihole_dir": [f]}) is True
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
# ── SECONDARY: union dated-window (multi-input branch of the helper) ─────────
|
|
379
|
+
|
|
380
|
+
|
|
381
|
+
def test_zeek_dated_default_window_union_across_inputs(tmp_path: Path) -> None:
    """The dated window is chosen over the union of all dated inputs.

    Two inputs carry disjoint date directories. With a two-day span the
    helper must pick the newest two distinct dates across both inputs
    (Jan 3 and Jan 5), generalizing the single-input selection to the union —
    the rev-3 algorithm.
    """
    site_a = tmp_path / "siteA"
    site_b = tmp_path / "siteB"
    layout = (
        (site_a, ("2026-01-01", "2026-01-03")),
        (site_b, ("2026-01-05",)),
    )
    for site, days in layout:
        site.mkdir()
        for day in days:
            (site / day).mkdir()

    # span=2d -> newest 2 distinct dates across the union, so the window
    # runs from Jan 3 to Jan 5.
    window = loader._zeek_dated_window([site_a, site_b], timedelta(days=2))
    window_start, window_end = window
    assert window_start.date().isoformat() == "2026-01-03"
    assert window_end.date().isoformat() == "2026-01-05"
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def test_zeek_dated_default_window_returns_none_when_file_alongside_dir(
    tmp_path: Path,
) -> None:
    """A plain file next to a dated directory makes the helper return None.

    The input set is then not purely dated, so the runner is expected to
    fall back to the flat post-load path (max-ts over the combined loaded
    frame). Honesty rail: never silently trim unseen file rows.
    """
    plain_file = tmp_path / "conn.log"
    plain_file.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    dated_root = tmp_path / "dated"
    dated_root.mkdir()
    (dated_root / "2026-01-05").mkdir()

    window = loader._zeek_dated_window([plain_file, dated_root], timedelta(days=1))
    assert window is None
|
|
415
|
+
|
|
416
|
+
|
|
417
|
+
def test_zeek_dated_default_window_returns_none_when_flat_dir_alongside_dated(
    tmp_path: Path,
) -> None:
    """A flat directory next to a dated directory makes the helper return None.

    The input set is not purely dated, so the runner is expected to fall
    back to the flat post-load path.
    """
    flat_root = tmp_path / "flat"
    flat_root.mkdir()
    (flat_root / "conn.log").write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    dated_root = tmp_path / "dated"
    dated_root.mkdir()
    (dated_root / "2026-01-05").mkdir()

    window = loader._zeek_dated_window([flat_root, dated_root], timedelta(days=1))
    assert window is None
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
# ── SECONDARY: dedup accounting (duplicate input → no double-count) ─────────
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
def test_load_required_logs_dedupes_duplicate_inputs_no_double_count(
    tmp_path: Path,
) -> None:
    """A file passed directly AND via its parent directory is counted once.

    The loader's ``_union_dedupe`` (keyed on ``.resolve()``, keeping
    first-seen order) is expected to run BEFORE size/record accounting, so
    both the byte total and the record count reflect a single copy.
    """
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    conn_file = zeek_root / "conn.log"
    conn_file.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    single_copy_bytes = conn_file.stat().st_size

    # The same conn.log is reachable twice: once as a file input and once
    # through the directory that contains it. It must be loaded ONCE, so the
    # byte total equals one file's size, not 2x.
    requirements = {"conn*.log*": "zeek_dir"}
    inputs = {"zeek_dir": [conn_file, zeek_root]}
    result = loader.load_required_logs(requirements, inputs)

    assert result.record_counts == {"conn*.log*": 1}
    assert result.data_size_bytes == single_copy_bytes
|