loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,458 @@
1
+ """Multi-positional source ingestion — CLI primary rail (rev-3 prompt).
2
+
3
+ These tests exercise the REAL CLI ↔ runner path with ``--dry-run`` and
4
+ ``runner.run`` UNMOCKED. They prove the property the prior multi-positional
5
+ work could not prove because the bug was at CLI fan-in (analyze/single-detector
6
+ read only ``parsed["path"]`` and silently dropped the rest of ``parsed["paths"]``):
7
+ that N positionals fan into per-family buckets, MERGE with explicit
8
+ ``--<family>-dir`` flags (sanctioned rail supersession; both load now), and the
9
+ union load runs across families.
10
+
11
+ Companion to:
12
+ - ``tests/test_source_resolution_seam.py`` (single-positional scope seam),
13
+ - ``tests/test_loader.py`` (loader-level union + dated-window guardrails),
14
+ - ``tests/test_sources.py`` (router + resolver primitives).
15
+
16
+ Privacy rail: RFC 5737 IPs (192.0.2.x / 198.51.100.x / 203.0.113.x) and
17
+ placeholder/example domains only.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import json
23
+ from datetime import timedelta
24
+ from pathlib import Path
25
+
26
+ import pytest
27
+
28
+ from loghunter import cli, runner
29
+ from loghunter.common import config as cfg
30
+ from loghunter.common import loader, sources
31
+
32
+
33
+ # ── content fixtures (RFC 5737 + placeholder domains) ────────────────────────
34
+
35
+
36
# One syslog-shaped line with a ``<134>`` priority prefix (placeholder host/user).
_FLAT_SYSLOG_LINE = "<134>Jun 11 12:00:00 examplehost sshd[1234]: Accepted publickey for placeholder\n"

# One dnsmasq query line as used for the Pi-hole fixtures (RFC 5737 client IP).
_PIHOLE_LINE = "Jun 11 12:00:00 piholehost dnsmasq[1234]: query[A] example.test from 192.0.2.10\n"

# One NDJSON record carrying connection-style fields (ts / id.* / proto / duration).
_ZEEK_NDJSON_CONN_LINE = json.dumps({
    "ts": 1779750000.0,
    "id.orig_h": "192.0.2.10",
    "id.resp_h": "198.51.100.20",
    "id.resp_p": 443,
    "proto": "tcp",
    "duration": 1.23,
}) + "\n"

# One NDJSON record carrying DNS-style fields (ts / id.orig_h / query).
_ZEEK_NDJSON_DNS_LINE = json.dumps({
    "ts": 1779750000.0,
    "id.orig_h": "192.0.2.10",
    "query": "example.test",
}) + "\n"

# One NDJSON record with CloudTrail-style event keys.
_CLOUDTRAIL_EVENT = {
    "eventVersion": "1.08",
    "eventTime": "2026-06-01T12:00:00Z",
    "userIdentity": {"type": "IAMUser"},
    "eventName": "GetObject",
    "eventSource": "s3.amazonaws.com",
    "sourceIPAddress": "192.0.2.10",
}
_CLOUDTRAIL_NDJSON_LINE = json.dumps(_CLOUDTRAIL_EVENT) + "\n"
63
+
64
+
65
def _write_cfg(tmp_path: Path, **keys: str) -> str:
    """Write a minimal TOML config under ``tmp_path`` and return its path.

    The file always contains a ``[loghunter]`` table with an empty ``root``;
    each keyword argument is then written as one ``key = "value"`` line, so
    only the named keys appear.

    Values are escaped via ``json.dumps`` so that backslashes and double
    quotes (for example a Windows ``tmp_path``) still yield a valid TOML
    basic string. Values without such characters serialize exactly as a
    plain ``"value"``.

    Args:
        tmp_path: Directory in which ``cfg.toml`` is created.
        **keys: Config keys and their string values.

    Returns:
        The path of the written ``cfg.toml`` as a string.
    """
    lines = ["[loghunter]", 'root = ""']
    lines.extend(
        # ensure_ascii=False keeps non-ASCII text literal instead of emitting
        # JSON surrogate-pair escapes, which TOML does not accept.
        f"{key} = {json.dumps(str(value), ensure_ascii=False)}"
        for key, value in keys.items()
    )
    cfg_path = tmp_path / "cfg.toml"
    cfg_path.write_text("\n".join(lines) + "\n", encoding="utf-8")
    return str(cfg_path)
73
+
74
+
75
+ # ── PRIMARY RAIL: real cli._main + --dry-run, runner.run UNMOCKED ────────────
76
+
77
+
78
def test_dns_cross_source_positionals_both_families_load(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """``loghunter dns zeek_dns.log events.log --dry-run`` lists both inputs.

    One positional holds Zeek-shaped DNS NDJSON, the other holds a Pi-hole
    dnsmasq line. The Pi-hole file is deliberately named ``events.log`` (not
    ``pihole.log``) so that only content-sniffing — never a filename match —
    can route it. Both paths must show up in the dry-run output, while the
    untouched ``syslog_dir`` and ``cloudtrail_dir`` families stay
    "not configured" (scope is the union of touched families).
    """
    # Empty the config search paths and drop LOGHUNTER_ROOT so the run relies
    # on the explicit --config below (presumably the only remaining source).
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)

    zeek_input = tmp_path / "zeek_dns.log"
    zeek_input.write_text(_ZEEK_NDJSON_DNS_LINE, encoding="utf-8")
    # Neutral filename: classification has to come from the content.
    pihole_input = tmp_path / "events.log"
    pihole_input.write_text(_PIHOLE_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path)
    argv = [
        "dns",
        str(zeek_input),
        str(pihole_input),
        f"--config={config_file}",
        "--dry-run",
    ]
    cli._main(argv)

    dry_run_text = capsys.readouterr().out
    for shown in (zeek_input, pihole_input):
        assert str(shown) in dry_run_text
    # Families no positional touched must still be reported as unconfigured.
    for label in ("syslog_dir:", "cloudtrail_dir:"):
        assert label in dry_run_text
        assert "not configured" in dry_run_text.split(label)[1].split("\n")[0]
114
+
115
+
116
def test_beacon_same_family_multi_positionals_both_files_listed(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """``loghunter beacon a.log b.log --dry-run`` lists both Zeek conn files.

    This is the command a sysadmin gets from a shell glob. Under an old
    "first wins" rule the second file would have been dropped silently, so
    both paths must appear in the dry-run output.
    """
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)

    day_one = tmp_path / "conn.day1.log"
    day_one.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    day_two = tmp_path / "conn.day2.log"
    day_two.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path)
    argv = [
        "beacon",
        str(day_one),
        str(day_two),
        f"--config={config_file}",
        "--dry-run",
    ]
    cli._main(argv)

    dry_run_text = capsys.readouterr().out
    for shown in (day_one, day_two):
        assert str(shown) in dry_run_text
142
+
143
+
144
def test_analyze_detect_all_heterogeneous_positionals_bucket_correctly(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """``loghunter conn.log dns.log syslog.log --dry-run`` (detect=all path).

    No detector name is given, so each positional is classified on its own:
    the two Zeek-shaped files must be listed in the ``zeek_dir`` section of
    the dry-run output and the syslog-shaped file in the ``syslog_dir``
    section.
    """
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)

    conn_input = tmp_path / "conn.log"
    conn_input.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    dns_input = tmp_path / "dns.log"
    dns_input.write_text(_ZEEK_NDJSON_DNS_LINE, encoding="utf-8")
    syslog_input = tmp_path / "syslog.log"
    syslog_input.write_text(_FLAT_SYSLOG_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path)
    argv = [
        str(conn_input),
        str(dns_input),
        str(syslog_input),
        f"--config={config_file}",
        "--dry-run",
    ]
    cli._main(argv)

    dry_run_text = capsys.readouterr().out
    # Every positional is mentioned somewhere in the output...
    for shown in (conn_input, dns_input, syslog_input):
        assert str(shown) in dry_run_text
    # ...and each one sits in the section of the family it was routed to.
    zeek_section = dry_run_text.split("zeek_dir:")[1].split("syslog_dir:")[0]
    for shown in (conn_input, dns_input):
        assert str(shown) in zeek_section
    syslog_section = dry_run_text.split("syslog_dir:")[1].split("pihole_dir:")[0]
    assert str(syslog_input) in syslog_section
180
+
181
+
182
def test_flag_plus_positional_different_family_both_load(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """``loghunter dns zeek.log --pihole-dir=pihole.log`` loads both inputs.

    The positional and the explicit flag target different families, and
    both must appear in the dry-run output. This mirrors the motivating
    user pattern from the BUGS entry, where the operator wanted both files.
    """
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)

    zeek_input = tmp_path / "zeek_dns.log"
    zeek_input.write_text(_ZEEK_NDJSON_DNS_LINE, encoding="utf-8")
    pihole_input = tmp_path / "events.log"
    pihole_input.write_text(_PIHOLE_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path)
    argv = [
        "dns",
        str(zeek_input),
        f"--pihole-dir={pihole_input}",
        f"--config={config_file}",
        "--dry-run",
    ]
    cli._main(argv)

    dry_run_text = capsys.readouterr().out
    for shown in (zeek_input, pihole_input):
        assert str(shown) in dry_run_text
208
+
209
+
210
def test_same_family_flag_plus_positionals_all_merge(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """``loghunter beacon a.log b.log --zeek-dir=c.log`` merges all three.

    Two positionals and the explicit ``--zeek-dir`` flag all feed
    ``zeek_dir`` (MERGE — the sanctioned rail supersession from the rev-3
    prompt; intended order is positionals first, flag appended). This test
    checks only that all three paths appear in the dry-run output.
    """
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)

    first = tmp_path / "conn.a.log"
    first.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    second = tmp_path / "conn.b.log"
    second.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    flagged = tmp_path / "conn.c.log"
    flagged.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path)
    argv = [
        "beacon",
        str(first),
        str(second),
        f"--zeek-dir={flagged}",
        f"--config={config_file}",
        "--dry-run",
    ]
    cli._main(argv)

    dry_run_text = capsys.readouterr().out
    for shown in (first, second, flagged):
        assert str(shown) in dry_run_text
239
+
240
+
241
def test_multi_positional_scope_still_suppresses_unrelated_configured_sibling(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """A configured ``zeek_dir`` must not leak into a syslog-only run.

    Both positionals are syslog-shaped while the config file names a
    ``zeek_dir``. Scope is the union of the families the positionals touch
    (just ``syslog_dir`` here), so the configured Zeek directory must be
    absent from the dry-run output and ``zeek_dir`` reported as
    "not configured" — the sibling-leak fix holds under the union shape.
    """
    monkeypatch.setattr(cfg, "SEARCH_PATHS", [])
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)

    configured_zeek = tmp_path / "configured_zeek"
    configured_zeek.mkdir()
    first = tmp_path / "flat1.log"
    first.write_text(_FLAT_SYSLOG_LINE, encoding="utf-8")
    second = tmp_path / "flat2.log"
    second.write_text(_FLAT_SYSLOG_LINE, encoding="utf-8")

    config_file = _write_cfg(tmp_path, zeek_dir=str(configured_zeek))
    argv = [
        "syslog",
        str(first),
        str(second),
        f"--config={config_file}",
        "--dry-run",
    ]
    cli._main(argv)

    dry_run_text = capsys.readouterr().out
    for shown in (first, second):
        assert str(shown) in dry_run_text
    # The configured sibling family must stay out of the scoped run.
    assert str(configured_zeek) not in dry_run_text
    assert "zeek_dir:" in dry_run_text
    zeek_line = dry_run_text.split("zeek_dir:")[1].split("\n")[0]
    assert "not configured" in zeek_line
273
+
274
+
275
+ # ── SECONDARY: scalar-vs-list programmatic contract ──────────────────────────
276
+
277
+
278
def test_runner_run_scalar_and_list_produce_identical_dry_run_output(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
    capsys: pytest.CaptureFixture[str],
) -> None:
    """``runner.run(zeek_dir="/x")`` and ``runner.run(zeek_dir=["/x"])`` MUST
    produce byte-identical dry-run output.

    The scalar caller is the degenerate one-element list under
    ``_normalize_overrides``; ~35 programmatic scalar callers + the Glenn-P2
    rail (tests/test_root_provenance.py) depend on this.

    The scalar capture is also required to be non-empty, so the equality
    check cannot pass vacuously when neither call prints anything.
    """
    monkeypatch.delenv("LOGHUNTER_ROOT", raising=False)
    f = tmp_path / "conn.log"
    f.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")

    runner.run(config={"loghunter": {"root": ""}}, zeek_dir=str(f), dry_run=True)
    scalar_out = capsys.readouterr().out

    runner.run(config={"loghunter": {"root": ""}}, zeek_dir=[str(f)], dry_run=True)
    list_out = capsys.readouterr().out

    # Guard against "" == "": an empty capture would prove nothing.
    assert scalar_out, "dry-run produced no stdout output to compare"
    assert scalar_out == list_out
299
+
300
+
301
+ # ── SECONDARY: detect=all router fallback (None-mode) ────────────────────────
302
+
303
+
304
def test_route_positional_source_none_mode_dir_falls_back_to_zeek(
    tmp_path: Path,
) -> None:
    """A directory positional with no detector module routes to ``zeek_dir``.

    This is the detect=all / unknown-selector case; the fallback preserves
    today's analyze default for unrecognized inputs.
    """
    some_dir = tmp_path / "some_dir"
    some_dir.mkdir()
    routed = sources.route_positional_source(str(some_dir), detector_module=None)
    assert routed == "zeek_dir"
312
+
313
+
314
def test_route_positional_source_none_mode_syslog_content_routes_syslog(
    tmp_path: Path,
) -> None:
    """A flat syslog file with no detector module routes to ``syslog_dir``."""
    flat_file = tmp_path / "flat.log"
    flat_file.write_text(_FLAT_SYSLOG_LINE, encoding="utf-8")
    routed = sources.route_positional_source(str(flat_file), detector_module=None)
    assert routed == "syslog_dir"
321
+
322
+
323
def test_route_positional_source_none_mode_pihole_content_routes_pihole(
    tmp_path: Path,
) -> None:
    """Pi-hole dnsmasq content with no detector module routes to ``pihole_dir``.

    The file is named ``events.log`` on purpose: a neutral filename means
    the routing decision can only come from the content.
    """
    neutral_file = tmp_path / "events.log"
    neutral_file.write_text(_PIHOLE_LINE, encoding="utf-8")
    routed = sources.route_positional_source(str(neutral_file), detector_module=None)
    assert routed == "pihole_dir"
331
+
332
+
333
def test_route_positional_source_none_mode_cloudtrail_routes_cloudtrail(
    tmp_path: Path,
) -> None:
    """CloudTrail NDJSON with no detector module routes to ``cloudtrail_dir``."""
    event_file = tmp_path / "events.json.log"
    event_file.write_text(_CLOUDTRAIL_NDJSON_LINE, encoding="utf-8")
    routed = sources.route_positional_source(str(event_file), detector_module=None)
    assert routed == "cloudtrail_dir"
342
+
343
+
344
def test_route_positional_source_none_mode_unrecognized_falls_back_to_zeek(
    tmp_path: Path,
) -> None:
    """Unclassifiable content with no detector module routes to ``zeek_dir``.

    The fallback preserves today's analyze default for inputs the sniffer
    cannot classify.
    """
    junk_file = tmp_path / "garbage.log"
    junk_text = "not log content, just words\n" * 5
    junk_file.write_text(junk_text, encoding="utf-8")
    routed = sources.route_positional_source(str(junk_file), detector_module=None)
    assert routed == "zeek_dir"
352
+
353
+
354
+ # ── SECONDARY: plan-time satisfiability lockstep ─────────────────────────────
355
+
356
+
357
def test_pihole_satisfiability_via_neutral_filename_lockstep_with_loader(
    tmp_path: Path,
) -> None:
    """Glenn req #2: plan-time pihole satisfiability uses ``_syslog_files``
    (file-or-dir, ``*.log*``), NOT ``directory.glob(pattern)``.

    A Pi-hole file with a neutral name (``events.log``) MUST be
    plan-satisfiable, matching what the loader will actually ingest. The old
    glob-on-pattern check would reject ``events.log`` (no ``pihole`` prefix)
    while the loader happily reads it — drift between plan and loader.
    """
    # Imported locally, as in the original test; the previously unused
    # ``SimpleNamespace`` import has been dropped.
    from loghunter.runner import _is_optional_satisfiable

    f = tmp_path / "events.log"
    f.write_text(_PIHOLE_LINE, encoding="utf-8")

    req = {"source": "pihole_dir", "pattern": "pihole*.log*"}
    # Single-input shape (degenerate one-element list).
    assert _is_optional_satisfiable(req, {"pihole_dir": [f]}) is True
376
+
377
+
378
+ # ── SECONDARY: union dated-window (multi-input branch of the helper) ─────────
379
+
380
+
381
def test_zeek_dated_default_window_union_across_inputs(tmp_path: Path) -> None:
    """The dated window is computed over the union of all dated inputs.

    Two input directories hold disjoint date subdirectories. With a two-day
    span the helper should pick the newest two distinct dates across both
    inputs — the rev-3 algorithm, generalizing the single-input selection
    covered by the guardrail tests.
    """
    site_a = tmp_path / "siteA"
    site_a.mkdir()
    site_b = tmp_path / "siteB"
    site_b.mkdir()
    for parent, day in (
        (site_a, "2026-01-01"),
        (site_a, "2026-01-03"),
        (site_b, "2026-01-05"),
    ):
        (parent / day).mkdir()

    # span=2d -> newest two distinct dates in the union are Jan 3 and Jan 5,
    # so the window runs Jan 3 -> Jan 5.
    window = loader._zeek_dated_window([site_a, site_b], timedelta(days=2))
    since, until = window
    assert since.date().isoformat() == "2026-01-03"
    assert until.date().isoformat() == "2026-01-05"
398
+
399
+
400
def test_zeek_dated_default_window_returns_none_when_file_alongside_dir(
    tmp_path: Path,
) -> None:
    """A plain file next to a dated directory is not purely dated -> ``None``.

    With ``None`` the runner falls to the flat post-load path (max-ts over
    the combined loaded frame). Honesty rail: never silently trim unseen
    file rows.
    """
    loose_file = tmp_path / "conn.log"
    loose_file.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    dated_dir = tmp_path / "dated"
    dated_dir.mkdir()
    (dated_dir / "2026-01-05").mkdir()

    window = loader._zeek_dated_window([loose_file, dated_dir], timedelta(days=1))
    assert window is None
415
+
416
+
417
def test_zeek_dated_default_window_returns_none_when_flat_dir_alongside_dated(
    tmp_path: Path,
) -> None:
    """A flat directory next to a dated directory is not purely dated -> ``None``.

    With ``None`` the runner falls to the flat post-load path.
    """
    flat_dir = tmp_path / "flat"
    flat_dir.mkdir()
    flat_log = flat_dir / "conn.log"
    flat_log.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    dated_dir = tmp_path / "dated"
    dated_dir.mkdir()
    (dated_dir / "2026-01-05").mkdir()

    window = loader._zeek_dated_window([flat_dir, dated_dir], timedelta(days=1))
    assert window is None
432
+
433
+
434
+ # ── SECONDARY: dedup accounting (duplicate input → no double-count) ─────────
435
+
436
+
437
def test_load_required_logs_dedupes_duplicate_inputs_no_double_count(
    tmp_path: Path,
) -> None:
    """A file passed both directly and via its directory is counted once.

    The same ``conn.log`` reaches the loader twice: as a file input and
    through the directory that contains it. The loader's ``_union_dedupe``
    (by ``.resolve()``, preserving first-seen order) is expected to run
    before size/record accounting, so the record count and byte total must
    reflect a single load.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    conn_file = zeek_dir / "conn.log"
    conn_file.write_text(_ZEEK_NDJSON_CONN_LINE, encoding="utf-8")
    single_file_bytes = conn_file.stat().st_size

    # Both the file and its parent directory are inputs; the totals must match
    # one copy of conn.log, not two.
    required = {"conn*.log*": "zeek_dir"}
    inputs = {"zeek_dir": [conn_file, zeek_dir]}
    result = loader.load_required_logs(required, inputs)

    assert result.record_counts == {"conn*.log*": 1}
    assert result.data_size_bytes == single_file_bytes