loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
tests/test_runner.py ADDED
@@ -0,0 +1,2599 @@
1
+ """Unit tests for runner helper functions: _derive_data_sources, _dns_nudge, and build_run_plan."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import gzip
6
+ import json
7
+ from datetime import datetime, timedelta, timezone
8
+ from pathlib import Path
9
+ from types import SimpleNamespace
10
+
11
+ import pandas as pd
12
+ import pytest
13
+
14
+ import loghunter.runner as runner
15
+ from loghunter.common.display import TEXT_RULE_WIDTH
16
+ from loghunter.runner import (
17
+ _DIGEST_TS_CONFIDENCE_FLOOR,
18
+ _aws_no_interactive_note,
19
+ _aws_window_note,
20
+ _check_required_logs,
21
+ _derive_data_sources,
22
+ _dns_nudge,
23
+ _is_optional_satisfiable,
24
+ _print_dry_run,
25
+ _source_overlap_notes,
26
+ _ts_confidence,
27
+ RunPlan,
28
+ build_run_plan,
29
+ )
30
+
31
+
32
+ # ── _derive_data_sources ──────────────────────────────────────────────────────
33
+
34
def test_derive_data_sources_zeek_conn_and_dns() -> None:
    """Both Zeek patterns carry records, so both Zeek labels are expected."""
    zeek_patterns = ("conn*.log*", "dns*.log*")
    needed = {pattern: "zeek_dir" for pattern in zeek_patterns}
    counts = dict(zip(zeek_patterns, (1000, 500)))
    expected = ["zeek_conn", "zeek_dns"]
    assert _derive_data_sources(needed, counts) == expected
41
+
42
+
43
def test_derive_data_sources_syslog() -> None:
    """A populated ``*.log*`` pattern sourced from syslog_dir yields ``syslog_raw``."""
    labels = _derive_data_sources({"*.log*": "syslog_dir"}, {"*.log*": 200})
    assert labels == ["syslog_raw"]
47
+
48
+
49
def test_derive_data_sources_excludes_zero_count() -> None:
    """A pattern whose record count is zero must not contribute a label."""
    needed = dict.fromkeys(("conn*.log*", "dns*.log*"), "zeek_dir")
    # Only conn has records; dns is in the plan but its count is zero.
    counts = {"conn*.log*": 50, "dns*.log*": 0}
    labels = _derive_data_sources(needed, counts)
    assert labels == ["zeek_conn"]
57
+
58
+
59
def test_derive_data_sources_pihole() -> None:
    """A populated ``*.log*`` pattern sourced from pihole_dir yields ``dnsmasq_dns``."""
    labels = _derive_data_sources({"*.log*": "pihole_dir"}, {"*.log*": 100})
    assert labels == ["dnsmasq_dns"]
63
+
64
+
65
def test_derive_data_sources_cloudtrail() -> None:
    """A populated ``*.json`` pattern sourced from cloudtrail_dir yields ``cloudtrail_raw``."""
    labels = _derive_data_sources({"*.json": "cloudtrail_dir"}, {"*.json": 75})
    assert labels == ["cloudtrail_raw"]
69
+
70
+
71
def test_derive_data_sources_unknown_pattern_skipped() -> None:
    """A counted pattern that is absent from needed_logs produces no label."""
    counts = {"conn*.log*": 10, "mystery*.log*": 5}
    labels = _derive_data_sources({"conn*.log*": "zeek_dir"}, counts)
    assert labels == ["zeek_conn"]
76
+
77
+
78
def test_derive_data_sources_empty_record_counts() -> None:
    """With no record counts at all the derived label list is empty."""
    labels = _derive_data_sources({"conn*.log*": "zeek_dir"}, {})
    assert labels == []
81
+
82
+
83
+ # ── _dns_nudge ────────────────────────────────────────────────────────────────
84
+
85
def test_dns_nudge_fires_for_dnsmasq_alone() -> None:
    """dnsmasq as the only source yields a nudge mentioning Pi-hole or dnsmasq."""
    nudge = _dns_nudge(["dnsmasq_dns"])
    assert nudge is not None
    assert any(term in nudge for term in ("Pi-hole", "dnsmasq"))
89
+
90
+
91
def test_dns_nudge_fires_for_dnsmasq_with_non_rich_zeek_source() -> None:
    """zeek_conn alongside dnsmasq does not suppress the nudge."""
    # zeek_conn is not a rich DNS source, so a nudge is still expected.
    sources = ["dnsmasq_dns", "zeek_conn"]
    assert _dns_nudge(sources) is not None
95
+
96
+
97
def test_dns_nudge_suppressed_when_zeek_dns_present() -> None:
    """zeek_dns on its own yields no nudge."""
    nudge = _dns_nudge(["zeek_dns"])
    assert nudge is None
99
+
100
+
101
def test_dns_nudge_suppressed_when_zeek_dns_and_dnsmasq_both_present() -> None:
    """zeek_dns together with dnsmasq_dns yields no nudge."""
    nudge = _dns_nudge(["dnsmasq_dns", "zeek_dns"])
    assert nudge is None
103
+
104
+
105
def test_dns_nudge_suppressed_when_no_dns_at_all() -> None:
    """Without any DNS source in the list there is no nudge."""
    nudge = _dns_nudge(["zeek_conn", "syslog_raw"])
    assert nudge is None
107
+
108
+
109
def test_dry_run_uses_shared_text_rule_width(capsys) -> None:
    """Every double-rule line printed by the dry run is TEXT_RULE_WIDTH wide."""
    absent_dirs = dict.fromkeys(
        ("zeek_dir", "syslog_dir", "pihole_dir", "cloudtrail_dir"), None
    )
    _print_dry_run(
        **absent_dirs,
        since=None,
        until=None,
        load_all=False,
        will_run=[],
        skipped={},
    )
    printed = capsys.readouterr().out
    # The dry-run banner is bracketed by DOUBLE rules (run-summary/dry-run polish).
    rule_lines = [row for row in printed.splitlines() if set(row) == {"═"}]

    assert rule_lines
    for row in rule_lines:
        assert len(row) == TEXT_RULE_WIDTH
127
+
128
+
129
def test_dry_run_lists_cloudtrail_dir(tmp_path: Path, capsys) -> None:
    """The dry-run output names cloudtrail_dir, shows its path, and says "found"."""
    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    dry_run_kwargs = {
        "zeek_dir": None,
        "syslog_dir": None,
        "pihole_dir": None,
        "cloudtrail_dir": cloudtrail_dir,
        "since": None,
        "until": None,
        "load_all": False,
        "will_run": [],
        "skipped": {},
    }
    _print_dry_run(**dry_run_kwargs)
    printed = capsys.readouterr().out
    for fragment in ("cloudtrail_dir:", str(cloudtrail_dir), "found"):
        assert fragment in printed
147
+
148
+
149
+ # ── DNS run-plan resolution — four source cases ───────────────────────────────
150
+
151
# Skip reason the stub DNS detector advertises when no optional source is usable.
_SKIP_REASON = "dns — no DNS source found (need zeek_dir dns logs or pihole_dir logs)"


def _dns_mod() -> SimpleNamespace:
    """Build a stand-in DNS detector module.

    The stub declares no required logs and two optional ones (Zeek dns logs
    and Pi-hole logs), sets REQUIRES_ONE_OF_OPTIONAL, and carries
    ``_SKIP_REASON`` as its skip message.
    """
    optional_logs = [
        {"source": source, "pattern": pattern}
        for source, pattern in (
            ("zeek_dir", "dns*.log*"),
            ("pihole_dir", "pihole*.log*"),
        )
    ]
    stub = SimpleNamespace(REQUIRED_LOGS=[], OPTIONAL_LOGS=optional_logs)
    stub.REQUIRES_ONE_OF_OPTIONAL = True
    stub.REQUIRES_ONE_OF_OPTIONAL_REASON = _SKIP_REASON
    return stub
164
+
165
+
166
def _beacon_mod() -> SimpleNamespace:
    """Build a stand-in beacon detector that requires Zeek ``conn*.log*``.

    NOTE(review): a second ``_beacon_mod`` is defined further down this
    module; that later definition rebinds the name, so calls made at test
    time resolve to it rather than to this stub.
    """
    required = {"source": "zeek_dir", "pattern": "conn*.log*"}
    stub = SimpleNamespace(REQUIRED_LOGS=[required])
    stub.OPTIONAL_LOGS = []
    return stub
171
+
172
+
173
def test_dns_plan_neither_source_skipped() -> None:
    """With no source directories the dns detector is skipped with _SKIP_REASON."""
    plan = build_run_plan(
        "all",
        zeek_dir=None,
        syslog_dir=None,
        pihole_dir=None,
        detectors={"dns": _dns_mod()},
    )
    assert plan.will_run == []
    assert plan.skipped["dns"] == _SKIP_REASON
180
+
181
+
182
def test_dns_plan_zeek_only_runs(tmp_path: Path) -> None:
    """A zeek_dir holding dns.log is enough for the dns detector to run."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    zeek_dir.joinpath("dns.log").write_text("", encoding="utf-8")

    plan = build_run_plan(
        "all",
        zeek_dir=zeek_dir,
        syslog_dir=None,
        pihole_dir=None,
        detectors={"dns": _dns_mod()},
    )

    assert "dns" in plan.will_run
    assert "dns" not in plan.skipped
    assert plan.needed_logs == {"dns*.log*": "zeek_dir"}
195
+
196
+
197
def test_dns_plan_pihole_only_runs(tmp_path: Path) -> None:
    """A pihole_dir holding pihole.log is enough for the dns detector to run."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    pihole_dir.joinpath("pihole.log").write_text("", encoding="utf-8")

    plan = build_run_plan(
        "all",
        zeek_dir=None,
        syslog_dir=None,
        pihole_dir=pihole_dir,
        detectors={"dns": _dns_mod()},
    )

    assert "dns" in plan.will_run
    assert "dns" not in plan.skipped
    assert plan.needed_logs == {"pihole*.log*": "pihole_dir"}
210
+
211
+
212
def test_dns_plan_both_sources_runs(tmp_path: Path) -> None:
    """With both sources present the plan records both patterns."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    zeek_dir.joinpath("dns.log").write_text("", encoding="utf-8")

    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    pihole_dir.joinpath("pihole.log").write_text("", encoding="utf-8")

    plan = build_run_plan(
        "all",
        zeek_dir=zeek_dir,
        syslog_dir=None,
        pihole_dir=pihole_dir,
        detectors={"dns": _dns_mod()},
    )

    assert "dns" in plan.will_run
    needed = plan.needed_logs
    assert needed.get("dns*.log*") == "zeek_dir"
    assert needed.get("pihole*.log*") == "pihole_dir"
228
+
229
+
230
def test_dns_plan_zeek_no_dns_files_pihole_satisfies(tmp_path: Path) -> None:
    """When zeek_dir has no dns*.log* but pihole_dir does, only pihole pattern is loaded."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    # conn.log only: nothing in zeek_dir matches dns*.log*.
    zeek_dir.joinpath("conn.log").write_text("", encoding="utf-8")

    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    pihole_dir.joinpath("pihole.log").write_text("", encoding="utf-8")

    plan = build_run_plan(
        "all",
        zeek_dir=zeek_dir,
        syslog_dir=None,
        pihole_dir=pihole_dir,
        detectors={"dns": _dns_mod()},
    )

    assert "dns" in plan.will_run
    assert plan.needed_logs == {"pihole*.log*": "pihole_dir"}
    assert "dns*.log*" not in plan.needed_logs
247
+
248
+
249
def test_dns_plan_beacon_regression(tmp_path: Path) -> None:
    """Adding pihole_dir=None does not affect a detector with normal REQUIRED_LOGS.

    The detector stub is built inline on purpose. ``_beacon_mod`` is defined
    twice in this module and the later definition (which imports the real
    ``loghunter.detectors.beacon``) rebinds the name, so calling
    ``_beacon_mod()`` here would plan against the real detector instead of a
    minimal stub with only REQUIRED_LOGS / OPTIONAL_LOGS.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    (zeek_dir / "conn.log").write_text("", encoding="utf-8")

    # Same shape as the SimpleNamespace beacon stub: one required Zeek conn
    # pattern and no optional logs.
    beacon_stub = SimpleNamespace(
        REQUIRED_LOGS=[{"source": "zeek_dir", "pattern": "conn*.log*"}],
        OPTIONAL_LOGS=[],
    )

    plan = build_run_plan(
        "all", zeek_dir=zeek_dir, syslog_dir=None, pihole_dir=None,
        detectors={"beacon": beacon_stub},
    )

    assert "beacon" in plan.will_run
    assert "beacon" not in plan.skipped
262
+
263
+
264
+ # ── data_sources via _derive_data_sources on plan.needed_logs ─────────────────
265
+
266
def test_data_sources_dns_zeek_only(tmp_path: Path) -> None:
    """A Zeek-only dns plan with dns records derives ``zeek_dns``."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    zeek_dir.joinpath("dns.log").write_text("", encoding="utf-8")
    plan = build_run_plan(
        "all",
        zeek_dir=zeek_dir,
        syslog_dir=None,
        pihole_dir=None,
        detectors={"dns": _dns_mod()},
    )
    labels = _derive_data_sources(plan.needed_logs, {"dns*.log*": 500})
    assert labels == ["zeek_dns"]
275
+
276
+
277
def test_data_sources_dns_pihole_only(tmp_path: Path) -> None:
    """A Pi-hole-only dns plan with pihole records derives ``dnsmasq_dns``."""
    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    pihole_dir.joinpath("pihole.log").write_text("", encoding="utf-8")
    plan = build_run_plan(
        "all",
        zeek_dir=None,
        syslog_dir=None,
        pihole_dir=pihole_dir,
        detectors={"dns": _dns_mod()},
    )
    labels = _derive_data_sources(plan.needed_logs, {"pihole*.log*": 100})
    assert labels == ["dnsmasq_dns"]
286
+
287
+
288
def test_data_sources_dns_both(tmp_path: Path) -> None:
    """With both sources populated the labels are ``dnsmasq_dns`` then ``zeek_dns``."""
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    zeek_dir.joinpath("dns.log").write_text("", encoding="utf-8")

    pihole_dir = tmp_path / "pihole"
    pihole_dir.mkdir()
    pihole_dir.joinpath("pihole.log").write_text("", encoding="utf-8")

    plan = build_run_plan(
        "all",
        zeek_dir=zeek_dir,
        syslog_dir=None,
        pihole_dir=pihole_dir,
        detectors={"dns": _dns_mod()},
    )
    record_counts = {"dns*.log*": 500, "pihole*.log*": 100}
    labels = _derive_data_sources(plan.needed_logs, record_counts)
    assert labels == ["dnsmasq_dns", "zeek_dns"]
301
+
302
+
303
def test_data_sources_dns_neither() -> None:
    """With no sources and no record counts nothing is derived."""
    plan = build_run_plan(
        "all",
        zeek_dir=None,
        syslog_dir=None,
        pihole_dir=None,
        detectors={"dns": _dns_mod()},
    )
    labels = _derive_data_sources(plan.needed_logs, {})
    assert labels == []
309
+
310
+
311
+ # ── Stage 4: pattern-aware single-file satisfiability ─────────────────────────
312
+
313
+
314
def _beacon_mod():
    """Return the real ``loghunter.detectors.beacon`` module, imported lazily.

    NOTE(review): this rebinds the ``_beacon_mod`` name that an earlier
    definition in this module gave to a SimpleNamespace stub.
    """
    import loghunter.detectors.beacon as beacon_module

    return beacon_module
317
+
318
+
319
def _dns_real_mod():
    """Return the real ``loghunter.detectors.dns`` module, imported lazily."""
    import loghunter.detectors.dns as dns_module

    return dns_module
322
+
323
+
324
def test_check_required_logs_zeek_file_matching_pattern_passes(tmp_path: Path) -> None:
    """A single conn.log passed as zeek_dir gives no skip reason for beacon."""
    conn_file = tmp_path / "conn.log"
    conn_file.write_text("", encoding="utf-8")
    reason = _check_required_logs(_beacon_mod(), {"zeek_dir": conn_file})
    assert reason is None
328
+
329
+
330
def test_check_required_logs_zeek_file_wrong_pattern_skips(tmp_path: Path) -> None:
    """beacon /path/to/dns.log → skipped (pattern conn*.log* doesn't match dns.log)."""
    dns_file = tmp_path / "dns.log"
    dns_file.write_text("", encoding="utf-8")
    reason = _check_required_logs(_beacon_mod(), {"zeek_dir": dns_file})
    assert reason is not None
    assert all(fragment in reason for fragment in ("conn*.log*", "not found"))
337
+
338
+
339
def test_is_optional_satisfiable_zeek_file_matches_dns_pattern(tmp_path: Path) -> None:
    """loghunter dns /path/to/dns.log → DNS optional path satisfied."""
    dns_file = tmp_path / "dns.log"
    dns_file.write_text("", encoding="utf-8")
    requirement = {"source": "zeek_dir", "pattern": "dns*.log*"}
    satisfiable = _is_optional_satisfiable(requirement, {"zeek_dir": dns_file})
    assert satisfiable is True
345
+
346
+
347
+ # ── CloudTrail source threading ───────────────────────────────────────────────
348
+
349
def _cloudtrail_mod() -> SimpleNamespace:
    """Fake aws-family detector requiring cloudtrail_dir for satisfiability tests."""
    stub = SimpleNamespace(DETECTOR_NAME="fakeaws", STATUS="available")
    stub.REQUIRED_LOGS = [{"source": "cloudtrail_dir", "pattern": "*.json*"}]
    stub.OPTIONAL_LOGS = []
    return stub
357
+
358
+
359
def test_check_required_logs_cloudtrail_native_nested_tree_passes(
    tmp_path: Path,
) -> None:
    """Native AWSLogs/<acct>/CloudTrail/<region>/YYYY/MM/DD/ tree resolves via
    discover_cloudtrail_files — not via raw directory.glob (which is non-recursive)."""
    day_dir = tmp_path.joinpath(
        "AWSLogs", "123456789012", "CloudTrail", "us-east-1", "2026", "06", "01"
    )
    day_dir.mkdir(parents=True)
    day_dir.joinpath("events.json.gz").write_bytes(b"placeholder")

    source_map = {"cloudtrail_dir": tmp_path}
    assert _check_required_logs(_cloudtrail_mod(), source_map) is None
374
+
375
+
376
def test_check_required_logs_cloudtrail_empty_dir_returns_reason(
    tmp_path: Path,
) -> None:
    """An empty cloudtrail_dir yields a reason that names the directory."""
    empty_dir = tmp_path / "empty-ct"
    empty_dir.mkdir()

    reason = _check_required_logs(_cloudtrail_mod(), {"cloudtrail_dir": empty_dir})
    assert reason is not None
    for fragment in ("no CloudTrail JSON logs found", str(empty_dir)):
        assert fragment in reason
386
+
387
+
388
def test_build_run_plan_threads_cloudtrail_dir_into_source_map(
    tmp_path: Path,
) -> None:
    """A cloudtrail_dir with a matching file makes the fake detector runnable."""
    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    cloudtrail_dir.joinpath("events.json.log").write_text("{}\n", encoding="utf-8")

    plan = build_run_plan(
        "all",
        zeek_dir=None,
        syslog_dir=None,
        pihole_dir=None,
        cloudtrail_dir=cloudtrail_dir,
        detectors={"fakeaws": _cloudtrail_mod()},
    )

    assert "fakeaws" in plan.will_run
    assert plan.needed_logs == {"*.json*": "cloudtrail_dir"}
404
+
405
+
406
def test_runner_cloudtrail_integration_lights_data_sources(
    tmp_path: Path, capture_summary, monkeypatch
) -> None:
    """End-to-end load contract: a detector requiring cloudtrail_dir drives the
    load through runner.run, and the resulting RunSummary.data_sources contains
    "cloudtrail_raw". This is what proves Thread A wires plan → load → context →
    data_sources before the real aws detector lands in Thread B."""
    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()

    identity = {
        "type": "IAMUser",
        "userName": "placeholder-user",
        "arn": "arn:aws:iam::123456789012:user/placeholder-user",
    }
    event = {
        "eventTime": "2026-06-01T12:00:00Z",
        "eventSource": "s3.amazonaws.com",
        "eventName": "GetObject",
        "eventID": "integration-test-event",
        "awsRegion": "us-east-1",
        "sourceIPAddress": "192.0.2.10",
        "userIdentity": identity,
        "readOnly": True,
    }
    # One event, newline-terminated, in a file matching the *.json* pattern.
    event_file = cloudtrail_dir / "events.json.log"
    event_file.write_text(json.dumps(event) + "\n", encoding="utf-8")

    seen: dict = {}

    def _fake_run(ctx):
        # Keep the context the runner hands to the detector for later checks.
        seen["ctx"] = ctx
        return []

    fakeaws = SimpleNamespace(
        DETECTOR_NAME="fakeaws",
        STATUS="available",
        REQUIRED_LOGS=[{"source": "cloudtrail_dir", "pattern": "*.json*"}],
        OPTIONAL_LOGS=[],
        DEFAULT_CONFIG={},
        run=_fake_run,
    )
    monkeypatch.setattr(runner, "discover_detectors", lambda: {"fakeaws": fakeaws})

    run_config = {"loghunter": {"detect": "fakeaws"}}
    runner.run(config=run_config, cloudtrail_dir=cloudtrail_dir)

    summary = capture_summary["summary"]
    assert summary.data_sources == ["cloudtrail_raw"]
    assert summary.record_counts.get("*.json*", 0) == 1

    frame = seen["ctx"].logs["*.json*"]
    from loghunter.common.loader import _CLOUDTRAIL_COLUMNS

    assert list(frame.columns) == _CLOUDTRAIL_COLUMNS
    assert frame.iloc[0]["event_id"] == "integration-test-event"
464
+
465
+
466
+ # ── Stage 4: integration tests — drive runner.run() end-to-end ────────────────
467
+
468
+
469
def _write_ndjson(path: Path, records: list[dict]) -> None:
    """Write *records* to *path* as UTF-8 text, one JSON document per line.

    The lines are joined with newlines and a trailing newline is always
    appended, so an empty *records* list produces a file holding one newline.
    """
    serialized = [json.dumps(record) for record in records]
    payload = "\n".join(serialized) + "\n"
    path.write_text(payload, encoding="utf-8")
474
+
475
+
476
# Epoch seconds for midnight UTC on 2026-01-01 and 2026-01-05.
_TS_JAN1, _TS_JAN5 = (
    datetime(2026, 1, day, tzinfo=timezone.utc).timestamp() for day in (1, 5)
)
478
+
479
+
480
def _conn(ts: float) -> dict:
    """Return a minimal conn-style record stamped with *ts*.

    Apart from ``ts`` every field is a fixed placeholder value.
    """
    record: dict = {"ts": ts}
    record.update(
        {
            "id.orig_h": "192.0.2.10",
            "id.resp_h": "198.51.100.20",
            "id.resp_p": 443,
            "proto": "tcp",
        }
    )
    return record
488
+
489
+
490
def _make_dated_zeek(tmp_path: Path, dates_records: dict[str, list[dict]]) -> Path:
    """Create ``tmp_path/zeek`` with one subdirectory per key of *dates_records*.

    Each subdirectory receives a ``conn.log`` written from that key's records.
    Returns the ``zeek`` directory.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    for date_name, day_records in dates_records.items():
        day_dir = zeek_dir / date_name
        day_dir.mkdir()
        _write_ndjson(day_dir / "conn.log", day_records)
    return zeek_dir
498
+
499
+
500
def _make_flat_zeek(tmp_path: Path, records: list[dict]) -> Path:
    """Create ``tmp_path/zeek`` holding a single ``conn.log`` built from *records*."""
    zeek_dir = tmp_path.joinpath("zeek")
    zeek_dir.mkdir()
    conn_log = zeek_dir.joinpath("conn.log")
    _write_ndjson(conn_log, records)
    return zeek_dir
505
+
506
+
507
@pytest.fixture
def capture_summary(monkeypatch):
    """Patch _build_output_handler to capture RunSummary instead of rendering."""
    captured: dict = {}

    class _CapHandler:
        """Output handler that records what it is given and renders nothing."""

        def begin(self, rs):
            captured["summary"] = rs

        def write(self, fs):
            captured["findings"] = fs

        def end(self):
            pass

    def _fake_build(output_format, output_dir, output_file, verbose_level, *, max_findings_per_detector=100):
        # Returns a (handler, callable) pair; the callable does nothing.
        noop = lambda: None  # noqa: E731
        return _CapHandler(), noop

    monkeypatch.setattr("loghunter.runner._build_output_handler", _fake_build)
    return captured
522
+
523
+
524
# Runner config selecting only the beacon detector with a 1d default window.
_BEACON_ONLY = {"loghunter": dict(detect="beacon", default_window="1d")}
525
+
526
+
527
def test_runner_dated_default_filters_to_newest_date(tmp_path, capture_summary, capsys):
    """Dated layout with the 1d default window: one conn record is counted and
    the default-window announcement is on stderr, not in the summary notes."""
    per_day = {
        "2026-01-01": [_conn(_TS_JAN1)],
        "2026-01-05": [_conn(_TS_JAN5)],
    }
    runner.run(config=_BEACON_ONLY, zeek_dir=_make_dated_zeek(tmp_path, per_day))
    summary = capture_summary["summary"]
    assert summary.record_counts.get("conn*.log*", 0) == 1
    # The prose default-window note moved to a pre-load stderr announcement.
    stderr_text = capsys.readouterr().err
    assert "Default window: last 1d" in stderr_text
    assert all("Default window" not in note for note in summary.notes)
538
+
539
+
540
def test_runner_dated_default_7d_with_sparse_dirs(tmp_path, capture_summary):
    """With a 7d default window both dated conn records are counted."""
    per_day = {
        "2026-01-01": [_conn(_TS_JAN1)],
        "2026-01-05": [_conn(_TS_JAN5)],
    }
    zeek_dir = _make_dated_zeek(tmp_path, per_day)
    seven_day_config = {"loghunter": {"detect": "beacon", "default_window": "7d"}}
    runner.run(config=seven_day_config, zeek_dir=zeek_dir)
    summary = capture_summary["summary"]
    assert summary.record_counts.get("conn*.log*", 0) == 2
549
+
550
+
551
def test_runner_flat_default_filters_to_last_span(tmp_path, capture_summary, capsys):
    """Flat layout with the 1d default window: one conn record is counted and
    the default-window announcement is on stderr, not in the summary notes."""
    records = [_conn(_TS_JAN1), _conn(_TS_JAN5)]
    runner.run(config=_BEACON_ONLY, zeek_dir=_make_flat_zeek(tmp_path, records))
    summary = capture_summary["summary"]
    assert summary.record_counts.get("conn*.log*", 0) == 1
    stderr_text = capsys.readouterr().err
    assert "Default window: last 1d" in stderr_text
    assert all("Default window" not in note for note in summary.notes)
558
+
559
+
560
def test_runner_bounded_single_file_loads_everything_no_note(
    tmp_path, capture_summary, capsys
):
    """A single conn.log as zeek_dir: both records are counted and no
    default-window text appears in the notes or on stderr."""
    conn_file = tmp_path / "conn.log"
    _write_ndjson(conn_file, [_conn(_TS_JAN1), _conn(_TS_JAN5)])
    runner.run(config=_BEACON_ONLY, zeek_dir=conn_file)
    summary = capture_summary["summary"]
    assert summary.record_counts.get("conn*.log*", 0) == 2
    assert all("Default window" not in note for note in summary.notes)
    stderr_text = capsys.readouterr().err
    assert "Default window" not in stderr_text
570
+
571
+
572
def test_runner_populates_detector_methods_for_will_run(tmp_path, capture_summary):
    """W1: RunSummary.detector_methods carries the MethodTag for every
    detector in plan.will_run. Beacon's tag is FFT (named=True)."""
    from loghunter.common.finding import MethodTag

    conn_file = tmp_path / "conn.log"
    _write_ndjson(conn_file, [_conn(_TS_JAN1)])
    runner.run(config=_BEACON_ONLY, zeek_dir=conn_file)
    summary = capture_summary["summary"]
    assert summary.detectors_run == ["beacon"]
    expected_tag = MethodTag("FFT", named=True)
    assert summary.detector_methods.get("beacon") == expected_tag
582
+
583
+
584
def test_runner_load_all_on_single_file_silent_noop(tmp_path, capture_summary):
    """--all on a BOUNDED single file: loads all, emits no default-window note, no error."""
    conn_file = tmp_path / "conn.log"
    _write_ndjson(conn_file, [_conn(_TS_JAN1), _conn(_TS_JAN5)])
    runner.run(config=_BEACON_ONLY, zeek_dir=conn_file, load_all=True)
    summary = capture_summary["summary"]
    assert summary.record_counts.get("conn*.log*", 0) == 2
    assert all("Default window" not in note for note in summary.notes)
592
+
593
+
594
def test_runner_explicit_since_suppresses_default_window_note(
    tmp_path, capture_summary, capsys
):
    """Explicit since/until spanning both days: both records are counted and no
    default-window text appears in the notes or on stderr."""
    per_day = {
        "2026-01-01": [_conn(_TS_JAN1)],
        "2026-01-05": [_conn(_TS_JAN5)],
    }
    zeek_dir = _make_dated_zeek(tmp_path, per_day)
    window_start = datetime(2026, 1, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 1, 5, 23, 59, 59, tzinfo=timezone.utc)
    runner.run(
        config=_BEACON_ONLY,
        zeek_dir=zeek_dir,
        since=window_start,
        until=window_end,
    )
    summary = capture_summary["summary"]
    assert summary.record_counts.get("conn*.log*", 0) == 2
    assert all("Default window" not in note for note in summary.notes)
    stderr_text = capsys.readouterr().err
    assert "Default window" not in stderr_text
611
+
612
+
613
def test_runner_load_all_overrides_default_window(tmp_path, capture_summary, capsys):
    """load_all=True on a dated layout: both records are counted and no
    default-window text appears in the notes or on stderr."""
    per_day = {
        "2026-01-01": [_conn(_TS_JAN1)],
        "2026-01-05": [_conn(_TS_JAN5)],
    }
    zeek_dir = _make_dated_zeek(tmp_path, per_day)
    runner.run(config=_BEACON_ONLY, zeek_dir=zeek_dir, load_all=True)
    summary = capture_summary["summary"]
    assert summary.record_counts.get("conn*.log*", 0) == 2
    assert all("Default window" not in note for note in summary.notes)
    stderr_text = capsys.readouterr().err
    assert "Default window" not in stderr_text
623
+
624
+
625
def test_runner_default_window_empty_string_disables(tmp_path, capture_summary):
    """An empty default_window string: both records are counted, no default-window note."""
    per_day = {
        "2026-01-01": [_conn(_TS_JAN1)],
        "2026-01-05": [_conn(_TS_JAN5)],
    }
    zeek_dir = _make_dated_zeek(tmp_path, per_day)
    no_window_config = {"loghunter": {"detect": "beacon", "default_window": ""}}
    runner.run(config=no_window_config, zeek_dir=zeek_dir)
    summary = capture_summary["summary"]
    assert summary.record_counts.get("conn*.log*", 0) == 2
    assert all("Default window" not in note for note in summary.notes)
635
+
636
+
637
+ # ── Source-coverage disclosure notes ──────────────────────────────────────────
638
+ #
639
+ # Each test drives runner.run end-to-end and inspects RunSummary.notes for the
640
+ # user-facing disclosure note. The HUMAN label (`Pi-hole` / `syslog` /
641
+ # `CloudTrail` / `Zeek <log_type>`) is asserted explicitly — and the parallel
642
+ # `data_sources` token string (`dnsmasq_dns` / `zeek_dns` / `syslog_raw` /
643
+ # `cloudtrail_raw`) is asserted ABSENT from the note text, to pin against
644
+ # internal-token leaks (Glenn B+ required).
645
+
646
+
647
def _has_coverage_note(notes, *, starts_with, forbidden_token=None):
    """Return the first note beginning with ``starts_with + ":"``.

    Before looking for a match, every note is checked against
    ``forbidden_token + ":"`` (when a token is given); a note starting with
    that prefix raises AssertionError. An AssertionError is also raised when
    no note carries the wanted prefix.
    """
    leak_prefix = None if forbidden_token is None else forbidden_token + ":"
    for note in notes:
        if leak_prefix is not None and note.startswith(leak_prefix):
            raise AssertionError(
                f"note leaked internal token {forbidden_token!r}: {note!r}"
            )

    wanted_prefix = starts_with + ":"
    matches = [note for note in notes if note.startswith(wanted_prefix)]
    assert matches, (
        f"no note starting with {starts_with!r} found in: {notes!r}"
    )
    return matches[0]
660
+
661
+
662
def test_runner_dated_zeek_outside_window_emits_bare_note(
    tmp_path, capture_summary,
):
    """Dated-Zeek date-pruned: every dated subdir falls outside the requested
    window. `discover_zeek_files` returns no files; the loader early-returns
    empty; coverage = (None, None) → runner emits the BARE note. Detector
    still RUNS (not skipped)."""
    stale_ts = datetime(2025, 1, 1, tzinfo=timezone.utc).timestamp()
    zeek_dir = _make_dated_zeek(tmp_path, {"2025-01-01": [_conn(stale_ts)]})
    # Requested window is all of 2030, far from the only dated subdir.
    runner.run(
        config=_BEACON_ONLY,
        zeek_dir=zeek_dir,
        since=datetime(2030, 1, 1, tzinfo=timezone.utc),
        until=datetime(2030, 12, 31, tzinfo=timezone.utc),
    )
    summary = capture_summary["summary"]
    note = _has_coverage_note(
        summary.notes, starts_with="Zeek conn", forbidden_token="zeek_conn"
    )
    for fragment in ("files found", "Widen with --since/--days"):
        assert fragment in note
    # Detector RAN — beacon is in detectors_run, just produced no findings
    # (because the loaded frame was empty).
    assert summary.detectors_run == ["beacon"]
685
+
686
+
687
def test_runner_flat_zeek_per_pattern_trim_emits_span_note(
    tmp_path, capture_summary,
):
    """Glenn #1: flat Zeek dir, DEFAULT window, stale dns*.log* alongside
    fresh conn*.log*. The combined-max window derived from conn's max ts
    trims dns to empty. The runner-side flat-default instrumentation
    writes per-pattern coverage; dns gets a SPAN note labelled "Zeek dns:"
    (not "zeek_dns:").

    Detector selection: "beacon,dns" so BOTH patterns are in plan.needed_logs
    (the per-pattern trim only fires when more than one Zeek pattern is in
    the subset — that is the entire shape Glenn flagged).
    """
    # FRESH window is anchored to NOW so the default 1d window keeps conn
    # alive; STALE window is well outside the 1d span so dns trims to empty.
    fresh_ts = datetime.now(timezone.utc).timestamp()
    thirty_days_seconds = 30 * 24 * 3600
    stale_ts = fresh_ts - thirty_days_seconds

    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    _write_ndjson(zeek_dir / "conn.log", [_conn(fresh_ts)])
    stale_dns_record = {
        "ts": stale_ts,
        "id.orig_h": "192.0.2.10",
        "query": "example.test",
        "qclass": 1,
    }
    _write_ndjson(zeek_dir / "dns.log", [stale_dns_record])

    both_detectors = {"loghunter": {"detect": "beacon, dns", "default_window": "1d"}}
    runner.run(config=both_detectors, zeek_dir=zeek_dir)
    summary = capture_summary["summary"]
    # conn ran in-window (used as the max-ts anchor), dns trimmed to empty.
    assert summary.record_counts.get("dns*.log*", 0) == 0
    assert summary.record_counts.get("conn*.log*", 0) == 1
    note = _has_coverage_note(
        summary.notes, starts_with="Zeek dns", forbidden_token="zeek_dns"
    )
    for fragment in ("rows loaded", "data spans", "Widen with --since/--days"):
        assert fragment in note
730
+
731
+
732
def test_runner_populated_run_emits_no_coverage_note(tmp_path, capture_summary):
    """Happy path: loading one populated Zeek conn file yields NO disclosure
    note for any source label (per the original author: the mark_kept
    short-circuit leaves LoadResult.coverage empty for populated patterns)."""
    conn_file = tmp_path / "conn.log"
    _write_ndjson(conn_file, [_conn(_TS_JAN1)])
    runner.run(config=_BEACON_ONLY, zeek_dir=conn_file)
    summary = capture_summary["summary"]
    # Every user-facing source label must be absent from the notes.
    for label in ("Zeek conn", "Zeek dns", "Pi-hole", "syslog", "CloudTrail"):
        prefix = label + ":"
        assert not any(entry.startswith(prefix) for entry in summary.notes), (
            f"unexpected coverage note for {label}: {summary.notes!r}"
        )
745
+
746
+
747
def test_runner_empty_zeek_file_no_coverage_note(tmp_path, capture_summary):
    """Glenn #2: a zero-byte Zeek file (rotation artifact) is readable but has
    no valid-ts rows. coverage = (0, None) → PARSE-GAP arm → NO note; advising
    the operator to widen the window over an empty file would mislead."""
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    empty_conn = zeek_root / "conn.log"
    empty_conn.write_text("", encoding="utf-8")
    runner.run(config=_BEACON_ONLY, zeek_dir=zeek_root)
    notes = capture_summary["summary"].notes
    # Empty file + empty frame must still produce no "Zeek conn:" note.
    assert not any(entry.startswith("Zeek conn:") for entry in notes), notes
758
+
759
+
760
+ # ── Mock-based runner tests (stale Pi-hole / parse-gap CT / multi-source) ────
761
+ #
762
+ # These cases need precise control over LoadResult.coverage shape that the
763
+ # parser layer doesn't make easy to fixture (year-guessing in dnsmasq /
764
+ # multi-pattern coverage assembly). Mocking load_required_logs lets each test
765
+ # pin one coverage scenario through runner.run + the capture_summary fixture
766
+ # while still exercising the full runner-side note assembly.
767
+
768
+
769
@pytest.fixture
def mock_load_required_logs(monkeypatch):
    """Return an installer that swaps loader.load_required_logs for a stub.

    Calling the installer with a hand-built LoadResult patches the attribute
    on ``loghunter.common.loader`` so every call, whatever its arguments,
    returns that object.
    """
    from loghunter.common import loader as _loader

    def _install(load_result):
        # NOTE(review): this patches the attribute on the loader package; it
        # only takes effect if the runner resolves the name through that
        # module at call time — confirm against runner's import style.
        monkeypatch.setattr(
            _loader,
            "load_required_logs",
            lambda *args, **kwargs: load_result,
        )

    return _install
779
+
780
+
781
def _ts_window_span():
    """Return the (start, end) UTC span used by the mocked SPAN-coverage tests:
    2025-06-01 00:00:00 through 2025-06-05 00:00:00."""
    span_start = datetime(2025, 6, 1, tzinfo=timezone.utc)
    span_end = datetime(2025, 6, 5, tzinfo=timezone.utc)
    return (span_start, span_end)
787
+
788
+
789
def test_runner_stale_pihole_emits_pihole_span_note(
    tmp_path, capture_summary, mock_load_required_logs,
):
    """Motivating bug: dns both-mode with a Pi-hole archive timestamped weeks
    ago, so the window selects nothing from it. The runner must emit a SPAN
    note labelled "Pi-hole:" and must NOT leak the internal "dnsmasq_dns:"
    token. Pi-hole stays out of data_sources (record_counts==0), so the
    Zeek-evangelization nudge does NOT fire (data_sources is byte-identical
    to a Zeek-only run)."""
    from loghunter.common.loader import LoadResult, SourceCoverage

    # Minimal on-disk Zeek dns + Pi-hole dirs so the plan selects both patterns.
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    dns_row = {
        "ts": _TS_JAN5, "id.orig_h": "192.0.2.10",
        "query": "example.test", "qclass": 1,
    }
    _write_ndjson(zeek_root / "dns.log", [dns_row])
    pihole_root = tmp_path / "pihole"
    pihole_root.mkdir()
    pihole_log = pihole_root / "pihole.log"
    pihole_log.write_text(
        "Jun 1 12:00:00 dnsmasq[1]: query[A] x.test from 192.0.2.10\n",
        encoding="utf-8",
    )

    # Hand-built load result: one Zeek dns row, an empty Pi-hole frame, and
    # SPAN coverage recorded for the Pi-hole pattern only.
    zeek_frame = pd.DataFrame({
        "ts": [_TS_JAN5], "src": ["192.0.2.10"],
        "query": ["example.test"], "qclass": [1],
    })
    pihole_frame = pd.DataFrame(columns=_PIHOLE_COLUMNS_FOR_MOCK)
    full_span = _ts_window_span()
    window_start = datetime.fromtimestamp(_TS_JAN5, tz=timezone.utc)
    window_end = datetime.fromtimestamp(_TS_JAN5, tz=timezone.utc)
    mock_load_required_logs(LoadResult(
        logs={"dns*.log*": zeek_frame, "pihole*.log*": pihole_frame},
        record_counts={"dns*.log*": 1},  # pihole = 0
        data_window=(window_start, window_end),
        warnings=[],
        data_size_bytes=0,
        coverage={"pihole*.log*": SourceCoverage(15_400_000, full_span)},
    ))

    runner.run(
        config={"loghunter": {"detect": "dns", "default_window": ""}},
        zeek_dir=zeek_root, pihole_dir=pihole_root,
        since=datetime(2026, 6, 1, tzinfo=timezone.utc),
        until=datetime(2026, 6, 3, tzinfo=timezone.utc),
    )
    summary = capture_summary["summary"]
    span_note = _has_coverage_note(summary.notes, starts_with="Pi-hole",
                                   forbidden_token="dnsmasq_dns")
    assert "15,400,000 rows loaded" in span_note
    assert "data spans" in span_note
    assert "Widen with --since/--days" in span_note
    # Pi-hole NOT in data_sources (record_counts==0) → nudge does not fire,
    # and data_sources is unchanged (only the Zeek dns label).
    assert "zeek_dns" in summary.data_sources
    assert "dnsmasq_dns" not in summary.data_sources
    assert not any("Pi-hole/dnsmasq logs" in entry for entry in summary.notes)
849
+
850
+
851
def test_runner_parse_gap_cloudtrail_emits_no_note(
    tmp_path, capture_summary, mock_load_required_logs,
):
    """A CloudTrail file whose eventTime values are all unparseable gives
    coverage = (0, None); the runner's parse-gap arm emits NO note."""
    from loghunter.common.loader import LoadResult, SourceCoverage

    trail_root = tmp_path / "ct"
    trail_root.mkdir()
    (trail_root / "events.json.log").write_text("{}", encoding="utf-8")

    # Mocked load: empty CloudTrail frame with parse-gap coverage.
    ct_frame = pd.DataFrame(columns=_CT_COLUMNS_FOR_MOCK)
    mock_load_required_logs(LoadResult(
        logs={"*.json*": ct_frame},
        record_counts={},
        data_window=None,
        warnings=[],
        data_size_bytes=0,
        coverage={"*.json*": SourceCoverage(0, None)},
    ))

    runner.run(
        config={"loghunter": {"detect": "aws"}}, cloudtrail_dir=trail_root,
    )
    summary = capture_summary["summary"]
    # Neither CloudTrail nor any other label may carry a coverage note.
    for label in ("CloudTrail", "Zeek conn", "Pi-hole", "syslog"):
        prefix = label + ":"
        assert not any(entry.startswith(prefix) for entry in summary.notes), (
            f"unexpected coverage note for {label}: {summary.notes!r}"
        )
882
+
883
+
884
def test_runner_wrong_family_syslog_skip_no_coverage_note(
    tmp_path, capture_summary, mock_load_required_logs,
):
    """A deliberately skipped wrong-family file (NDJSON in syslog_dir) goes
    through the loader's existing skip behavior; the runner MUST NOT add a
    window-disclosure note for it. At loader level the tracker writes
    SourceCoverage(None, None) (note_file_read never fired), but the runner's
    BARE-note arm is zeek_dir-only — so syslog yields no note."""
    from loghunter.common.loader import LoadResult, SourceCoverage

    syslog_root = tmp_path / "syslog"
    syslog_root.mkdir()
    # Plan-time satisfiability now content-sniffs (Item E), so the on-disk file
    # must pass the gate; the LOAD is mocked below to simulate the wrong-family
    # skip (empty frame + SourceCoverage(None, None)).
    host_log = syslog_root / "host.log"
    host_log.write_text(
        "<134>May 31 12:00:00 host-a kernel: x\n", encoding="utf-8"
    )

    syslog_columns = ["ts", "host", "program", "raw", "message"]
    skipped_frame = pd.DataFrame(columns=syslog_columns)
    # The wrong-family skip leaves the tracker with no note_file_read
    # calls; coverage(True) → SourceCoverage(None, None).
    skipped_coverage = SourceCoverage(None, None)
    mock_load_required_logs(LoadResult(
        logs={"*.log*": skipped_frame},
        record_counts={},
        data_window=None,
        warnings=[],
        data_size_bytes=0,
        coverage={"*.log*": skipped_coverage},
    ))

    runner.run(
        config={"loghunter": {"detect": "syslog"}}, syslog_dir=syslog_root,
    )
    notes = capture_summary["summary"].notes
    # No "syslog:" coverage note (BARE arm is zeek_dir-only).
    assert not any(entry.startswith("syslog:") for entry in notes), notes
924
+
925
+
926
def test_runner_unconfigured_source_no_coverage_note(
    tmp_path, capture_summary,
):
    """An unconfigured source means its pattern is never loaded; the loader
    already warns with "{source} not configured — {pattern} not loaded", so
    the disclosure note MUST NOT fire as well (it would duplicate the warning).

    Driven by detect=dns with only zeek_dir configured (no pihole_dir);
    the DNS plan needs both patterns but pihole_dir is absent.
    """
    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    dns_row = {
        "ts": _TS_JAN5, "id.orig_h": "192.0.2.10",
        "query": "example.test", "qclass": 1,
    }
    _write_ndjson(zeek_root / "dns.log", [dns_row])

    # pihole_dir is intentionally not passed at all.
    runner.run(
        config={"loghunter": {"detect": "dns", "default_window": ""}},
        zeek_dir=zeek_root,
    )
    notes = capture_summary["summary"].notes
    # The loader emits the "pihole_dir not configured" warning, but the
    # disclosure note (Pi-hole: …) does NOT fire.
    assert not any(entry.startswith("Pi-hole:") for entry in notes), notes
952
+
953
+
954
def test_runner_appends_disclosure_after_home_net_note(
    tmp_path, capture_summary, mock_load_required_logs,
):
    """Note ordering preserved: the new disclosure note appends LAST in the
    notes list, after _home_net_note (so the existing notes' relative order
    is byte-identical).

    What is actually asserted: a "CloudTrail:" note exists and sits at the
    final index of ``s.notes``. The test does not check that a home-net note
    is present; "after _home_net_note" follows only from being last.
    """
    from loghunter.common.loader import LoadResult, SourceCoverage

    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    (ct_dir / "events.json.log").write_text("{}", encoding="utf-8")

    # Mocked load: empty CloudTrail frame but SPAN coverage (42 rows over the
    # representative span), which is the shape that triggers a disclosure note.
    empty_ct = pd.DataFrame(columns=_CT_COLUMNS_FOR_MOCK)
    span = _ts_window_span()
    fake_lr = LoadResult(
        logs={"*.json*": empty_ct},
        record_counts={},
        data_window=None,
        warnings=[],
        data_size_bytes=0,
        coverage={"*.json*": SourceCoverage(42, span)},
    )
    mock_load_required_logs(fake_lr)

    runner.run(
        config={"loghunter": {"detect": "aws"}}, cloudtrail_dir=ct_dir,
    )
    s = capture_summary["summary"]
    # Index of the first note starting with "CloudTrail:", or None if absent.
    ct_note_idx = next(
        (i for i, n in enumerate(s.notes) if n.startswith("CloudTrail:")),
        None,
    )
    assert ct_note_idx is not None, f"no CloudTrail note in {s.notes!r}"
    # Invariant enforced here: the disclosure is strictly the LAST note, so no
    # other note (including any "_home_net" one) can follow it.
    assert ct_note_idx == len(s.notes) - 1, (
        f"CloudTrail note not last (idx={ct_note_idx}, len={len(s.notes)}): "
        f"{s.notes!r}"
    )
994
+
995
+
996
# Column lists needed by the mocked LoadResult fixtures above.
# They are referenced only inside test bodies, so defining them after those
# tests is fine: the names are resolved when the tests run, not at import.
# NOTE(review): presumably these mirror the real parser output schemas —
# confirm against the Pi-hole and CloudTrail loaders.

# Columns of the empty Pi-hole frame (pd.DataFrame(columns=...)).
_PIHOLE_COLUMNS_FOR_MOCK = [
    "ts", "host", "program", "client", "qtype", "query", "answer",
    "rcode", "raw", "message", "event_type",
]
# Columns of the empty CloudTrail frame (pd.DataFrame(columns=...)).
_CT_COLUMNS_FOR_MOCK = [
    "ts", "eventTime", "eventSource", "eventName", "eventID", "awsRegion",
    "sourceIPAddress", "principal", "lane", "read_write", "errorCode",
    "raw",
]
1006
+
1007
+
1008
+ # The two staging-dir tests below are the Stage 4 vs Stage 3 differential:
1009
+ # Stage 3 includes non-date child dirs in the no-window branch; Stage 4 bare runs
1010
+ # are windowed (skipping them) but --all reverts to the no-window branch.
1011
+
1012
def test_runner_default_window_skips_real_nondate_subdir(tmp_path, capture_summary):
    """Bare run over dated dirs plus a staging/ child: only the dated dir is
    loaded (windowed branch), so exactly one conn row is counted."""
    dated_rows = {"2026-01-05": [_conn(_TS_JAN5)]}
    zeek_root = _make_dated_zeek(tmp_path, dated_rows)
    staging_dir = zeek_root / "staging"
    staging_dir.mkdir()
    _write_ndjson(staging_dir / "conn.log", [_conn(_TS_JAN1)])
    runner.run(config=_BEACON_ONLY, zeek_dir=zeek_root)
    counts = capture_summary["summary"].record_counts
    assert counts.get("conn*.log*", 0) == 1
1021
+
1022
+
1023
def test_runner_load_all_includes_real_nondate_subdir(tmp_path, capture_summary):
    """--all over dated dirs plus a staging/ child: both directories load
    (no-window branch), so two conn rows are counted."""
    dated_rows = {"2026-01-05": [_conn(_TS_JAN5)]}
    zeek_root = _make_dated_zeek(tmp_path, dated_rows)
    staging_dir = zeek_root / "staging"
    staging_dir.mkdir()
    _write_ndjson(staging_dir / "conn.log", [_conn(_TS_JAN1)])
    runner.run(config=_BEACON_ONLY, zeek_dir=zeek_root, load_all=True)
    counts = capture_summary["summary"].record_counts
    assert counts.get("conn*.log*", 0) == 2
1032
+
1033
+
1034
+ # ── Stage 4 P1 regression: Zeek default window must not leak into other sources ──
1035
+
1036
def test_runner_default_window_no_leak_from_unplanned_family(
    tmp_path, capture_summary, capsys
):
    """detect=syslog with zeek_dir configured: a CONFIGURED-but-not-in-plan family
    (zeek_dir) is never loaded or windowed (no conn count). The universal default
    window engages on syslog — its OWN family — anchoring on syslog's own max-ts.

    Under the old Zeek-only rule this case proved Zeek's window did not leak into
    syslog (which loaded full). Under the universal window, syslog gets its OWN
    1d window: the Jan 1 row falls outside (Jan 5 − 1d) and is trimmed; the
    unplanned Zeek family stays out of the load entirely.
    """
    # Current year so RFC 3164 syslog parsing (which assumes current year) is recent.
    year = datetime.now(timezone.utc).year

    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    newest = zeek_dir / f"{year}-01-05"
    newest.mkdir()
    _write_ndjson(newest / "conn.log", [_conn(_TS_JAN5)])

    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    # Plain literals: these lines have no placeholders, so no f-prefix.
    (syslog_dir / "host.log").write_text(
        "Jan 1 00:00:01 host kernel: old line\n"
        "Jan 5 00:00:01 host kernel: new line\n",
        encoding="utf-8",
    )

    runner.run(
        config={"loghunter": {"detect": "syslog", "default_window": "1d"}},
        zeek_dir=zeek_dir, syslog_dir=syslog_dir,
    )

    s = capture_summary["summary"]
    assert "conn*.log*" not in s.record_counts, \
        "zeek_dir is not in the plan for detect=syslog — must not load"
    assert s.record_counts.get("*.log*", 0) == 1, \
        "syslog's OWN universal default window keeps only the in-window row"
    # The announcement goes to stderr only; it must not also appear in notes.
    assert "Default window: last 1d" in capsys.readouterr().err
    assert not any("Default window" in n for n in s.notes)
1077
+
1078
+
1079
def test_runner_default_window_applies_to_all_families_in_mixed_run(
    tmp_path, capture_summary, capsys
):
    """Mixed run (beacon + syslog) under the default window: EVERY in-plan
    family is windowed on its own anchor — Zeek conn to the newest dated dir,
    syslog to its own last-1d. (Old behavior windowed Zeek only; the window is
    now universal.)"""
    year = datetime.now(timezone.utc).year

    zeek_root = tmp_path / "zeek"
    zeek_root.mkdir()
    # Derive conn row ts from the SAME `year` as the dir names: the Zeek default
    # window is derived from the dir NAME, so hardcoded 2026 rows would be filtered
    # out (window misses them) on a 2027+ box.
    for day in (1, 5):  # old dir first, then the newest one
        dated_dir = zeek_root / f"{year}-01-{day:02d}"
        dated_dir.mkdir()
        row_ts = datetime(year, 1, day, tzinfo=timezone.utc).timestamp()
        _write_ndjson(dated_dir / "conn.log", [_conn(row_ts)])

    syslog_root = tmp_path / "syslog"
    syslog_root.mkdir()
    host_log = syslog_root / "host.log"
    host_log.write_text(
        "Jan 1 00:00:01 host kernel: old line\n"
        "Jan 5 00:00:01 host kernel: new line\n",
        encoding="utf-8",
    )

    runner.run(
        config={"loghunter": {"detect": "beacon,syslog", "default_window": "1d"}},
        zeek_dir=zeek_root, syslog_dir=syslog_root,
    )

    summary = capture_summary["summary"]
    assert "Default window: last 1d" in capsys.readouterr().err
    assert not any("Default window" in entry for entry in summary.notes)
    assert summary.record_counts.get("conn*.log*", 0) == 1, \
        "Zeek conn rows filtered to newest dated dir only"
    assert summary.record_counts.get("*.log*", 0) == 1, \
        "syslog rows windowed to its OWN last-1d (Jan 1 trimmed)"
1123
+
1124
+
1125
+ # ── universal default window: flat (syslog/pihole) + cloudtrail families ──────
1126
+
1127
+
1128
def test_runner_syslog_default_window_trims_and_keeps_nan_ts(
    tmp_path, capture_summary, capsys
):
    """The universal default window engages on a flat syslog DIRECTORY: rows older
    than (max-ts − 1d) are trimmed, the in-window row survives, AND a row with an
    unparseable timestamp (NaN ts, keep-policy) survives the trim (keep-null)."""
    # The log lines carry no year, so the parser has to guess one. The window
    # anchors on the data's own max-ts, so the test needs no year value.
    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    (syslog_dir / "host.log").write_text(
        "Jun 1 12:00:00 host kernel: old line\n"  # outside last-1d → trimmed
        "Jun 5 12:00:00 host kernel: new line\n"  # in window → kept
        "Xxx 1 12:00:00 host kernel: nan line\n",  # NaN ts → kept (keep policy)
        encoding="utf-8",
    )
    runner.run(
        config={"loghunter": {"detect": "syslog", "default_window": "1d"}},
        syslog_dir=syslog_dir,
    )
    s = capture_summary["summary"]
    assert "Default window: last 1d" in capsys.readouterr().err
    assert s.record_counts.get("*.log*", 0) == 2, \
        "in-window row + NaN-ts row survive; the old row is trimmed"
1151
+
1152
+
1153
def test_runner_flat_family_explicit_file_is_bounded_no_window(
    tmp_path, capture_summary, capsys
):
    """A flat family pointed at an explicit FILE is BOUNDED: it loads in full,
    gets no default window and no stderr announcement (boundedness generalizes
    to every family)."""
    log_file = tmp_path / "host.log"
    log_lines = (
        "Jun 1 12:00:00 host kernel: old line\n"
        "Jun 5 12:00:00 host kernel: new line\n"
    )
    log_file.write_text(log_lines, encoding="utf-8")
    runner.run(
        config={"loghunter": {"detect": "syslog", "default_window": "1d"}},
        syslog_dir=log_file,
    )
    summary = capture_summary["summary"]
    assert summary.record_counts.get("*.log*", 0) == 2, "bounded file loads full"
    assert "Default window" not in capsys.readouterr().err
1171
+
1172
+
1173
def test_runner_flat_family_mixed_file_and_dir_trims_with_bucket(
    tmp_path, capture_summary, capsys
):
    """Explicit file + directory mixed in one flat family (1E): the family is
    unbounded, so the default window covers the WHOLE load and the named
    file's out-of-window rows are trimmed WITH the bucket. The floor anchors
    on DIRECTORY candidates only — the explicit file does not drive it (else
    its old row would survive)."""
    explicit_file = tmp_path / "old.log"
    explicit_file.write_text(
        "Jun 1 12:00:00 host kernel: explicit old line\n",  # trimmed with bucket
        encoding="utf-8",
    )
    syslog_root = tmp_path / "syslog"
    syslog_root.mkdir()
    dir_log = syslog_root / "host.log"
    dir_log.write_text(
        "Jun 5 12:00:00 host kernel: dir new line\n",  # anchor + in window
        encoding="utf-8",
    )
    sources = [explicit_file, syslog_root]
    runner.run(
        config={"loghunter": {"detect": "syslog", "default_window": "1d"}},
        syslog_dir=sources,
    )
    summary = capture_summary["summary"]
    assert "Default window: last 1d" in capsys.readouterr().err
    assert summary.record_counts.get("*.log*", 0) == 1, \
        "only the dir's in-window row survives; the explicit file's old row is trimmed"
1200
+
1201
+
1202
def _ct_event(ts_iso: str, event_id: str) -> dict:
    """Build a CloudTrail-style S3 GetObject record issued by an IAM user.

    ``ts_iso`` becomes ``eventTime`` and ``event_id`` becomes ``eventID``;
    every other field is a fixed placeholder value.
    """
    identity = {
        "type": "IAMUser",
        "userName": "placeholder-user",
        "principalId": "AIDAEXAMPLE",
        "arn": "arn:aws:iam::123456789012:user/placeholder-user",
    }
    event = {
        "eventTime": ts_iso,
        "eventSource": "s3.amazonaws.com",
        "eventName": "GetObject",
        "eventID": event_id,
        "awsRegion": "us-east-1",
        "sourceIPAddress": "192.0.2.10",
        "userIdentity": identity,
        "readOnly": True,
    }
    return event
1218
+
1219
+
1220
def test_runner_cloudtrail_excluded_from_default_window_loads_full(
    tmp_path, capture_summary, capsys
):
    """CloudTrail opts OUT of the auto-default window (aws is baseline-relative):
    an UNQUALIFIED run loads the FULL archive (no trim), and — being the only
    family — emits NO "Default window" stderr line."""
    ct_dir = tmp_path / "ct"
    ct_dir.mkdir()
    # One JSON object per line, newline-terminated.
    (ct_dir / "events.json").write_text(
        "\n".join(json.dumps(e) for e in [
            _ct_event("2026-06-01T12:00:00Z", "aaaa"),  # four days apart (more than
            _ct_event("2026-06-05T12:00:00Z", "bbbb"),  # the 1d window) — both load
        ]) + "\n",
        encoding="utf-8",
    )
    # default_window is configured as "1d", yet no since/until is passed:
    # the run is unqualified.
    runner.run(
        config={"loghunter": {"detect": "aws", "default_window": "1d"}},
        cloudtrail_dir=ct_dir,
    )
    s = capture_summary["summary"]
    assert "Default window" not in capsys.readouterr().err, \
        "cloudtrail-only unqualified run engages no default window"
    assert s.record_counts.get("*.json*", 0) == 2, \
        "cloudtrail loads FULL — excluded from the auto-default window"
1244
+
1245
+
1246
def test_runner_cloudtrail_explicit_window_narrows_and_riders(
    tmp_path, capture_summary
):
    """An explicit --since DOES window cloudtrail, after which the aws window
    note carries the --all rider (cloudtrail_narrowed)."""
    trail_root = tmp_path / "ct"
    trail_root.mkdir()
    events = [
        _ct_event("2026-06-01T12:00:00Z", "aaaa"),  # before since → excluded
        _ct_event("2026-06-05T12:00:00Z", "bbbb"),  # in window
    ]
    payload = "".join(json.dumps(event) + "\n" for event in events)
    (trail_root / "events.json").write_text(payload, encoding="utf-8")
    window_start = datetime(2026, 6, 4, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 6, tzinfo=timezone.utc)
    runner.run(
        config={"loghunter": {"detect": "aws", "default_window": "1d"}},
        cloudtrail_dir=trail_root,
        since=window_start,
        until=window_end,
    )
    summary = capture_summary["summary"]
    assert summary.record_counts.get("*.json*", 0) == 1, "explicit window narrows cloudtrail"
    assert any("--all for a full-baseline" in entry for entry in summary.notes), \
        "explicit narrowing → aws window note carries the --all rider"
1270
+
1271
+
1272
def test_runner_mixed_unqualified_cloudtrail_full_no_aws_all_rider(
    tmp_path, capture_summary, capsys
):
    """Mixed unqualified run (aws + syslog): syslog is eligible, so the default
    window fires and the stderr line STILL prints, but cloudtrail loads FULL
    and the aws notes must NOT claim --all is needed (cloudtrail wasn't
    narrowed)."""
    trail_root = tmp_path / "ct"
    trail_root.mkdir()
    events = [
        _ct_event("2026-06-01T12:00:00Z", "aaaa"),
        _ct_event("2026-06-05T12:00:00Z", "bbbb"),
    ]
    payload = "".join(json.dumps(event) + "\n" for event in events)
    (trail_root / "events.json").write_text(payload, encoding="utf-8")

    syslog_root = tmp_path / "syslog"
    syslog_root.mkdir()
    host_log = syslog_root / "host.log"
    host_log.write_text(
        "Jun 1 12:00:00 host kernel: old line\n"
        "Jun 5 12:00:00 host kernel: new line\n",
        encoding="utf-8",
    )
    runner.run(
        config={"loghunter": {"detect": "aws,syslog", "default_window": "1d"}},
        cloudtrail_dir=trail_root, syslog_dir=syslog_root,
    )
    summary = capture_summary["summary"]
    assert "Default window: last 1d" in capsys.readouterr().err, \
        "syslog (eligible) still engages the default window"
    assert summary.record_counts.get("*.json*", 0) == 2, "cloudtrail loaded FULL"
    # Positive guard so the negative below proves the --all rider was SUPPRESSED,
    # not that the whole aws note silently vanished (a vacuous pass otherwise).
    assert any(entry.startswith("aws:") for entry in summary.notes), \
        "the aws first-seen note still fires"
    assert not any("--all" in entry for entry in summary.notes), \
        "cloudtrail not narrowed → no --all rider on any aws note"
1308
+
1309
+
1310
def test_apply_default_window_keep_null_and_metadata(tmp_path):
    """B/D unit: the post-load trim (relocated to loader.apply_default_window)
    keeps NaN-ts rows under keep_null and carries rotation_skips / warnings /
    data_size_bytes through dataclasses.replace unchanged (only logs /
    record_counts / data_window / coverage are rebuilt)."""
    import math
    import pandas as pd
    from loghunter.common.loader import (
        LoadResult,
        RotationSkipInfo,
        apply_default_window,
    )

    pattern = "*.log*"
    anchor_ts = datetime(2026, 6, 5, 12, 0, tzinfo=timezone.utc).timestamp()
    skips = {"*.log*": RotationSkipInfo(loaded=2, skipped=3, fallback=False)}

    def _build() -> LoadResult:
        # Three rows: one at the anchor, one five days older, one with no ts.
        rows = [
            {"ts": anchor_ts, "message": "in-window"},
            {"ts": anchor_ts - 5 * 86400, "message": "old"},  # outside 1d → trimmed
            {"ts": float("nan"), "message": "no-ts"},  # NaN → kept iff keep_null
        ]
        return LoadResult(
            logs={"*.log*": pd.DataFrame(rows)},
            record_counts={"*.log*": 3},
            data_window=None,
            warnings=["a soft warning"],
            data_size_bytes=4242,
            rotation_skips=skips,
        )

    one_day = timedelta(days=1)
    original = _build()
    trimmed = apply_default_window(original, [pattern], one_day, keep_null=True)
    kept_messages = set(trimmed.logs[pattern]["message"])
    assert kept_messages == {"in-window", "no-ts"}, "keep_null retains the NaN-ts row"
    # #4: the passed-in LoadResult.logs must NOT be mutated in place (shallow copy).
    assert len(original.logs[pattern]) == 3, "input frame untouched by the trim"
    assert trimmed.logs[pattern] is not original.logs[pattern]
    assert trimmed.record_counts[pattern] == 2
    # Metadata preserved unchanged through replace().
    assert trimmed.warnings == ["a soft warning"]
    assert trimmed.data_size_bytes == 4242
    assert trimmed.rotation_skips is skips

    strict = apply_default_window(_build(), [pattern], one_day, keep_null=False)
    strict_messages = set(strict.logs[pattern]["message"])
    assert strict_messages == {"in-window"}, "keep_null=False drops the NaN-ts row (drop policy)"
    assert not any(math.isnan(value) for value in strict.logs[pattern]["ts"])
1359
+
1360
+
1361
def test_runner_no_data_window_forces_requested_span_none(
    tmp_path, capture_summary, capsys
):
    """#2: the default window is active but the load has NO real data window
    (every row's ts is unparseable → kept by keep-policy, yet `_data_window`
    is None). The runner must force requested_span to None so the underfill
    parenthetical cannot render a confident comparison over data that doesn't
    exist.

    pihole_dir is used because it KEEPs NaN-ts rows AND is discovered by
    filename (no content gate), so an all-unparseable-ts file still loads. The
    syslog content-sniff gate (Item E) rejects such a file at discovery —
    sniff requires a parseable ts — so the scenario is unreachable via syslog
    directory discovery, and a parseable line would itself give a non-None
    data window."""
    pihole_root = tmp_path / "pihole"
    pihole_root.mkdir()
    # "Xxx" matches the outer \w{3} but strptime fails → NaN ts, kept by keep policy.
    unparseable_lines = (
        "Xxx 1 12:00:00 dnsmasq[1]: query[A] a.test from 192.0.2.1\n"
        "Xxx 2 12:00:00 dnsmasq[1]: query[A] b.test from 192.0.2.1\n"
    )
    (pihole_root / "pihole.log").write_text(unparseable_lines, encoding="utf-8")
    runner.run(
        config={"loghunter": {"detect": "dns", "default_window": "1d"}},
        pihole_dir=pihole_root,
    )
    summary = capture_summary["summary"]
    # Default window engaged (unbounded dir, no explicit window)...
    assert "Default window: last 1d" in capsys.readouterr().err
    # ...but with no real data window, requested_span is forced None (the gate the
    # renderer can't see).
    assert summary.requested_span is None
1392
+
1393
+
1394
def test_aws_window_note_cloudtrail_narrowed_rider() -> None:
    """The aws window note gains --all guidance ONLY when CloudTrail was
    actually narrowed (explicit window): the guidance rides the EXISTING note
    (no new note), and the base note is unchanged when CloudTrail loaded full."""
    aws_plan = SimpleNamespace(will_run=["aws"])

    plain_note = _aws_window_note(aws_plan, cloudtrail_narrowed=False)
    assert plain_note is not None
    assert "first-seen" in plain_note
    assert "--all" not in plain_note

    narrowed_note = _aws_window_note(aws_plan, cloudtrail_narrowed=True)
    assert narrowed_note is not None
    assert narrowed_note.startswith(plain_note)  # same note, guidance appended
    assert "--all" in narrowed_note

    # No aws → no note regardless of the flag.
    beacon_plan = SimpleNamespace(will_run=["beacon"])
    assert _aws_window_note(beacon_plan,
                            cloudtrail_narrowed=True) is None
1412
+
1413
+
1414
def _ct_service_event(ts_iso: str, event_id: str) -> dict:
    """Build a service-lane CloudTrail record (userIdentity.type=AWSService → service).

    ``ts_iso`` becomes ``eventTime`` and ``event_id`` becomes ``eventID``;
    the remaining fields are fixed.
    """
    service_identity = {"type": "AWSService", "invokedBy": "ec2.amazonaws.com"}
    event = {
        "eventTime": ts_iso,
        "eventSource": "s3.amazonaws.com",
        "eventName": "GetObject",
        "eventID": event_id,
        "awsRegion": "us-east-1",
        "sourceIPAddress": "ec2.amazonaws.com",
        "userIdentity": service_identity,
        "readOnly": True,
    }
    return event
1426
+
1427
+
1428
def test_runner_aws_no_interactive_note_unqualified_end_to_end(
    tmp_path, capture_summary
):
    """#2 end-to-end: a real all-service-lane CloudTrail load (parser lane
    assignment → runner note assembly → detector empty return) on an
    UNQUALIFIED run discloses the neutral no-interactive note (NO --all), and
    aws emits no finding."""
    trail_root = tmp_path / "ct"
    trail_root.mkdir()
    events = [
        _ct_service_event("2026-06-01T12:00:00Z", "aaaa"),
        _ct_service_event("2026-06-05T12:00:00Z", "bbbb"),
    ]
    payload = "".join(json.dumps(event) + "\n" for event in events)
    (trail_root / "events.json").write_text(payload, encoding="utf-8")
    runner.run(
        config={"loghunter": {"detect": "aws", "default_window": "1d"}},
        cloudtrail_dir=trail_root,
    )
    summary = capture_summary["summary"]
    disclosure = next(
        (entry for entry in summary.notes if "none are interactive-lane" in entry),
        None,
    )
    assert disclosure is not None, "the no-interactive disclosure must be appended"
    assert "--all" not in disclosure, "unqualified → CloudTrail loaded full, no --all"
    findings = capture_summary.get("findings", [])
    assert not any(item.detector == "aws" for item in findings), "aws scored nothing"
1454
+
1455
+
1456
def test_runner_aws_no_interactive_note_narrowed_end_to_end(
    tmp_path, capture_summary
):
    """#2 end-to-end: under an explicit window (CloudTrail narrowed) the same
    no-interactive note carries the --all suffix."""
    trail_root = tmp_path / "ct"
    trail_root.mkdir()
    events = [_ct_service_event("2026-06-05T12:00:00Z", "bbbb")]
    payload = "".join(json.dumps(event) + "\n" for event in events)
    (trail_root / "events.json").write_text(payload, encoding="utf-8")
    window_start = datetime(2026, 6, 4, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 6, tzinfo=timezone.utc)
    runner.run(
        config={"loghunter": {"detect": "aws", "default_window": "1d"}},
        cloudtrail_dir=trail_root,
        since=window_start,
        until=window_end,
    )
    summary = capture_summary["summary"]
    disclosure = next(
        (entry for entry in summary.notes if "none are interactive-lane" in entry),
        None,
    )
    assert disclosure is not None
    assert "Run with --all for full history." in disclosure
1479
+
1480
+
1481
def test_interactive_count_helper() -> None:
    """Supplementary unit test for ``interactive_count``.

    The helper must return 0 for ``None``, an empty frame, a frame without a
    ``lane`` column and an all-service frame, and must count only the rows
    whose lane is ``"interactive"`` otherwise.
    """
    import pandas as pd
    from loghunter.detectors.aws import interactive_count

    zero_cases = [
        None,
        pd.DataFrame(),
        pd.DataFrame({"x": [1, 2]}),  # missing lane
        pd.DataFrame({"lane": ["service", "service"]}),
    ]
    for frame in zero_cases:
        assert interactive_count(frame) == 0

    mixed = pd.DataFrame({"lane": ["interactive", "service", "interactive"]})
    assert interactive_count(mixed) == 2
1494
+
1495
+
1496
+ # ── large-dataset prompt: skip_confirm wiring ────────────────────────────────
1497
+
1498
+
1499
+ _TINY_WARN_CFG = {"loghunter": {"detect": "beacon", "warn_above": 1, "default_window": "all"}}
1500
+
1501
+
1502
def test_runner_skip_confirm_skips_prompt_entirely(
    tmp_path: Path, capture_summary, monkeypatch
) -> None:
    """With ``skip_confirm=True`` the prompt is bypassed: ``input()`` is never invoked."""
    from loghunter.common.errors import ExportAborted  # noqa: F401 (import resolves post-move)

    def _forbidden_input(*_args, **_kwargs):
        raise AssertionError("input() must not be called when skip_confirm=True")

    records = [_conn(_TS_JAN1), _conn(_TS_JAN5)]
    zeek_dir = _make_flat_zeek(tmp_path, records)
    monkeypatch.setattr("builtins.input", _forbidden_input)

    runner.run(config=_TINY_WARN_CFG, zeek_dir=zeek_dir, skip_confirm=True)

    # Reaching this point means the run finished without touching input().
    assert capture_summary["summary"] is not None
1517
+
1518
+
1519
def test_runner_decline_raises_export_aborted(
    tmp_path: Path, capture_summary, monkeypatch
) -> None:
    """Answering "n" at the large-dataset prompt raises ``ExportAborted`` rather than returning."""
    from loghunter.common.errors import ExportAborted

    def _answer_no(*_prompt):
        return "n"

    records = [_conn(_TS_JAN1), _conn(_TS_JAN5)]
    zeek_dir = _make_flat_zeek(tmp_path, records)
    monkeypatch.setattr("builtins.input", _answer_no)

    with pytest.raises(ExportAborted, match="aborted by user"):
        runner.run(config=_TINY_WARN_CFG, zeek_dir=zeek_dir)
1529
+
1530
+
1531
def test_runner_accept_continues_normally(
    tmp_path: Path, capture_summary, monkeypatch
) -> None:
    """Leaving ``skip_confirm`` at its default and answering "y" lets the run complete."""

    def _answer_yes(*_prompt):
        return "y"

    records = [_conn(_TS_JAN1), _conn(_TS_JAN5)]
    zeek_dir = _make_flat_zeek(tmp_path, records)
    monkeypatch.setattr("builtins.input", _answer_yes)

    runner.run(config=_TINY_WARN_CFG, zeek_dir=zeek_dir)

    assert capture_summary["summary"] is not None
1539
+
1540
+
1541
+ # ── _build_output_handler: output_file precedence and behavior ───────────────
1542
+
1543
+
1544
+ from loghunter.runner import _build_output_handler # noqa: E402
1545
+ from loghunter.common.finding import RunSummary # noqa: E402
1546
+
1547
+
1548
+ def _drive_handler(handler, close_handler) -> None:
1549
+ """Drive a handler through one no-finding lifecycle so its file is created."""
1550
+ summary = RunSummary(
1551
+ data_window=(datetime(2026, 1, 1, tzinfo=timezone.utc),
1552
+ datetime(2026, 1, 2, tzinfo=timezone.utc)),
1553
+ record_counts={},
1554
+ data_size_bytes=0,
1555
+ detectors_run=["beacon"],
1556
+ detectors_skipped={},
1557
+ )
1558
+ handler.begin(summary)
1559
+ handler.write([])
1560
+ handler.end()
1561
+ close_handler()
1562
+
1563
+
1564
def test_build_output_handler_writes_to_exact_output_file(tmp_path: Path) -> None:
    """``output_file`` is written at the EXACT path given and nothing else appears beside it."""
    target = tmp_path / "hunt.txt"
    handler, close_handler = _build_output_handler(
        output_format="text",
        output_dir=None,
        output_file=target,
        verbose_level=0,
    )

    _drive_handler(handler, close_handler)

    assert target.exists()
    # The directory listing must hold the explicit file only (no auto-named sibling).
    entries = [entry.name for entry in tmp_path.iterdir()]
    assert entries == ["hunt.txt"]
1575
+
1576
+
1577
def test_build_output_handler_creates_parent_directories(tmp_path: Path) -> None:
    """Missing parent directories of ``output_file`` exist once the handler has been driven."""
    nested_parent = tmp_path / "deep" / "nested"
    target = nested_parent / "hunt.txt"
    assert not target.parent.exists()

    handler, close_handler = _build_output_handler(
        output_format="text",
        output_dir=None,
        output_file=target,
        verbose_level=0,
    )
    _drive_handler(handler, close_handler)

    assert target.exists()
    assert target.parent.is_dir()
1587
+
1588
+
1589
def test_build_output_handler_output_file_takes_precedence_over_output_dir(
    tmp_path: Path,
) -> None:
    """With both options set, ``output_file`` wins and ``output_dir`` receives no file."""
    explicit = tmp_path / "explicit.txt"
    some_dir = tmp_path / "some_dir"
    handler, close_handler = _build_output_handler(
        output_format="text",
        output_dir=some_dir,
        output_file=explicit,
        verbose_level=0,
    )

    _drive_handler(handler, close_handler)

    assert explicit.exists()
    # Whether output_dir got created is not asserted; the invariant is only
    # that it holds no regular file.
    if some_dir.exists():
        stray_files = (p.is_file() for p in some_dir.iterdir())
        assert not any(stray_files)
1604
+
1605
+
1606
+ # ── Deliverable 0: dry-run alignment of source-dir lines ─────────────────────
1607
+
1608
def test_dry_run_source_dir_lines_align_colons_and_values(tmp_path: Path, capsys) -> None:
    """The four ``*_dir:`` lines printed by ``_print_dry_run`` must line up.

    Both the first colon and the first non-space character after it have to
    sit in the same column on every line.  Guards against the alignment bug
    where ``cloudtrail_dir:`` pushed its value out of column with the others.
    """
    source_dirs = {}
    for kwarg, leaf in (
        ("zeek_dir", "zeek"),
        ("syslog_dir", "syslog"),
        ("pihole_dir", "pihole"),
        ("cloudtrail_dir", "ct"),
    ):
        directory = tmp_path / leaf
        directory.mkdir()
        source_dirs[kwarg] = directory

    _print_dry_run(
        **source_dirs,
        since=None,
        until=None,
        load_all=False,
        will_run=[],
        skipped={},
    )
    out = capsys.readouterr().out.splitlines()

    labels = ("zeek_dir:", "syslog_dir:", "pihole_dir:", "cloudtrail_dir:")
    dir_lines = [ln for ln in out if any(label in ln for label in labels)]
    assert len(dir_lines) == 4, f"expected 4 source-dir lines, got {len(dir_lines)}"

    colon_cols = [ln.index(":") for ln in dir_lines]
    assert len(set(colon_cols)) == 1, f"colons misaligned: {colon_cols} in lines {dir_lines}"

    # Value start = index of the first non-space character after the colon
    # (or len(line) when only spaces follow).
    value_starts = []
    for ln in dir_lines:
        after_label = ln[ln.index(":") + 1:]
        value_starts.append(len(ln) - len(after_label.lstrip(" ")))
    assert len(set(value_starts)) == 1, (
        f"value starts misaligned: {value_starts} in lines {dir_lines}"
    )
1645
+
1646
+
1647
+ # ── Deliverable 3: aws RunSummary notes — pure helper tests ──────────────────
1648
+
1649
+ def _fake_aws_mod(below_floor: int = 0):
1650
+ """Tiny fake of the aws detector exposing only what the runner reads."""
1651
+ return SimpleNamespace(
1652
+ DETECTOR_NAME="aws",
1653
+ STATUS="available",
1654
+ DEFAULT_CONFIG={"min_events": 50},
1655
+ below_floor_count=lambda df, n: below_floor,
1656
+ )
1657
+
1658
+
1659
+ def _fake_plan(will_run: list[str], aws_mod=None) -> SimpleNamespace:
1660
+ detectors = {"aws": aws_mod} if aws_mod is not None else {}
1661
+ return SimpleNamespace(
1662
+ detectors=detectors,
1663
+ selected=will_run,
1664
+ will_run=will_run,
1665
+ skipped={},
1666
+ needed_logs={"*.json*": "cloudtrail_dir"},
1667
+ )
1668
+
1669
+
1670
def test_aws_below_floor_note_returns_string_with_count() -> None:
    """``_aws_below_floor_note`` returns a note naming the count and ``min_events``.

    The fake aws module reports 5 below-floor principals, so the note must
    contain that count.
    """
    import re

    from loghunter.runner import _aws_below_floor_note

    plan = _fake_plan(["aws"], _fake_aws_mod(below_floor=5))
    df = pd.DataFrame([{"lane": "interactive", "principal": "x"}])
    note = _aws_below_floor_note(plan, {"*.json*": df}, config={})
    assert note is not None
    # Require a standalone 5.  A plain substring check would also be
    # satisfied by the fake's default floor (min_events=50) if the note
    # prints that value, hiding a missing count.
    assert re.search(r"(?<!\d)5(?!\d)", note), note
    assert "min_events" in note
1678
+
1679
+
1680
def test_aws_below_floor_note_returns_none_when_aws_not_in_plan() -> None:
    """No note is produced when the plan runs only ``beacon`` and has no aws module."""
    from loghunter.runner import _aws_below_floor_note

    plan = _fake_plan(["beacon"], aws_mod=None)
    frames = {"*.json*": pd.DataFrame([{"lane": "interactive", "principal": "x"}])}
    note = _aws_below_floor_note(plan, frames, config={})
    assert note is None
1685
+
1686
+
1687
def test_aws_below_floor_note_returns_none_when_count_is_zero() -> None:
    """No note is produced when the aws module reports zero below-floor principals."""
    from loghunter.runner import _aws_below_floor_note

    plan = _fake_plan(["aws"], _fake_aws_mod(below_floor=0))
    frames = {"*.json*": pd.DataFrame([{"lane": "interactive", "principal": "x"}])}
    note = _aws_below_floor_note(plan, frames, config={})
    assert note is None
1692
+
1693
+
1694
def test_aws_below_floor_note_returns_none_when_no_frame() -> None:
    """No note is produced when the frames mapping is empty."""
    from loghunter.runner import _aws_below_floor_note

    plan = _fake_plan(["aws"], _fake_aws_mod(below_floor=5))
    no_frames = {}
    note = _aws_below_floor_note(plan, no_frames, config={})
    assert note is None
1698
+
1699
+
1700
def test_aws_window_note_fires_when_aws_runs() -> None:
    """``_aws_window_note`` returns a note mentioning "first-seen" when aws is in the plan."""
    from loghunter.runner import _aws_window_note

    aws_plan = _fake_plan(["aws"], _fake_aws_mod())
    note = _aws_window_note(aws_plan)
    assert note is not None
    assert "first-seen" in note
1706
+
1707
+
1708
def test_aws_window_note_silent_when_aws_did_not_run() -> None:
    """``_aws_window_note`` returns ``None`` for a plan that runs only ``beacon``."""
    from loghunter.runner import _aws_window_note

    beacon_plan = _fake_plan(["beacon"], aws_mod=None)
    note = _aws_window_note(beacon_plan)
    assert note is None
1712
+
1713
+
1714
+ # ── Integration: real runner.run() emits the note via the loaded frame ───────
1715
+
1716
def test_aws_below_floor_note_in_runner_run_reflects_current_frame(
    tmp_path: Path, capture_summary, monkeypatch
) -> None:
    """The below-floor note must reach ``RunSummary.notes`` through ``runner.run``.

    Glenn's catch: a helper-only test could pass while the runner's call
    ordering or wiring was broken, so this exercises the wired path with the
    real aws detector.
    """
    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()

    # Three IAM users with five events each.
    events: list[dict] = [
        {
            "eventTime": f"2026-06-01T12:0{i}:00Z",
            "eventSource": "s3.amazonaws.com",
            "eventName": "GetObject",
            "eventID": f"e-{name}-{i}",
            "awsRegion": "us-east-1",
            "sourceIPAddress": "192.0.2.10",
            "userIdentity": {
                "type": "IAMUser",
                "userName": name,
                "arn": f"arn:aws:iam::123456789012:user/{name}",
            },
            "readOnly": True,
        }
        for name in ["alice", "bob", "carol"]
        for i in range(5)
    ]
    ndjson = "\n".join(json.dumps(e) for e in events) + "\n"
    (cloudtrail_dir / "events.json.log").write_text(ndjson, encoding="utf-8")

    # Discovery is narrowed to the real aws detector so the wiring is
    # exercised end to end.
    import loghunter.detectors.aws as aws_mod

    def _only_aws():
        return {"aws": aws_mod}

    monkeypatch.setattr(runner, "discover_detectors", _only_aws)

    runner.run(
        config={"loghunter": {"detect": "aws"}},
        cloudtrail_dir=cloudtrail_dir,
    )

    s = capture_summary["summary"]
    floor_notes = [n for n in s.notes if "below the min_events floor" in n]
    assert floor_notes, f"expected below-floor note in {s.notes}"
    assert "3" in floor_notes[0]
1758
+
1759
+
1760
def test_aws_window_note_in_runner_run(
    tmp_path: Path, capture_summary, monkeypatch
) -> None:
    """A run with the aws detector must put a "first-seen" note in the summary."""
    cloudtrail_dir = tmp_path / "ct"
    cloudtrail_dir.mkdir()
    identity = {
        "type": "IAMUser",
        "userName": "placeholder",
        "arn": "arn:aws:iam::123456789012:user/placeholder",
    }
    event = {
        "eventTime": "2026-06-01T12:00:00Z",
        "eventSource": "s3.amazonaws.com",
        "eventName": "GetObject",
        "eventID": "e-1",
        "awsRegion": "us-east-1",
        "sourceIPAddress": "192.0.2.10",
        "userIdentity": identity,
        "readOnly": True,
    }
    log_file = cloudtrail_dir / "events.json.log"
    log_file.write_text(json.dumps(event) + "\n", encoding="utf-8")

    import loghunter.detectors.aws as aws_mod

    def _only_aws():
        return {"aws": aws_mod}

    monkeypatch.setattr(runner, "discover_detectors", _only_aws)

    runner.run(
        config={"loghunter": {"detect": "aws"}},
        cloudtrail_dir=cloudtrail_dir,
    )

    notes = capture_summary["summary"].notes
    assert any("first-seen" in n for n in notes)
1791
+
1792
+
1793
+ # ── _home_net_note — scan topology disclosure ────────────────────────────────
1794
+ #
1795
+ # Pure helper tests of the runner's home_net disclosure note. Provenance is
1796
+ # carried by the ``__user_set__`` sidecar attached by the config loader; tests
1797
+ # construct it explicitly to drive both default and declared paths.
1798
+
1799
+ _RFC1918_HOME_NET = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
1800
+
1801
+
1802
+ def _scan_plan(scan_in_plan: bool) -> SimpleNamespace:
1803
+ will_run = ["scan"] if scan_in_plan else ["beacon"]
1804
+ return SimpleNamespace(
1805
+ detectors={},
1806
+ selected=will_run,
1807
+ will_run=will_run,
1808
+ skipped={},
1809
+ needed_logs={},
1810
+ )
1811
+
1812
+
1813
def test_home_net_note_default_includes_parenthetical() -> None:
    """Without a ``__user_set__`` sidecar the note lists the ranges and flags them as the default."""
    from loghunter.runner import _home_net_note

    config = {"loghunter": {"home_net": _RFC1918_HOME_NET}}
    note = _home_net_note(_scan_plan(scan_in_plan=True), config)
    assert note is not None
    expected_fragments = (
        "10.0.0.0/8",
        "172.16.0.0/12",
        "192.168.0.0/16",
        "RFC1918 default",
        "set home_net in config to override",
    )
    for fragment in expected_fragments:
        assert fragment in note
1823
+
1824
+
1825
def test_home_net_note_declared_omits_parenthetical_with_custom_range() -> None:
    """A user-declared custom range is shown without the "RFC1918 default" wording."""
    from loghunter.runner import _home_net_note

    declared = {"loghunter": {"home_net"}}
    config = {
        "loghunter": {"home_net": ["192.0.2.0/24"]},
        "__user_set__": declared,
    }
    plan = _scan_plan(scan_in_plan=True)
    note = _home_net_note(plan, config)
    assert note is not None
    assert "192.0.2.0/24" in note
    assert "RFC1918 default" not in note
1835
+
1836
+
1837
def test_home_net_note_declared_omits_parenthetical_when_value_equals_default() -> None:
    """An explicitly typed RFC1918 list must read as declared, not default.

    A value-only comparison would misclassify this case; provenance comes
    from the ``__user_set__`` sidecar.
    """
    from loghunter.runner import _home_net_note

    declared = {"loghunter": {"home_net"}}
    config = {
        "loghunter": {"home_net": list(_RFC1918_HOME_NET)},
        "__user_set__": declared,
    }
    plan = _scan_plan(scan_in_plan=True)
    note = _home_net_note(plan, config)
    assert note is not None
    assert "10.0.0.0/8" in note
    for absent in ("RFC1918 default", "override"):
        assert absent not in note
1853
+
1854
+
1855
def test_home_net_note_returns_none_when_scan_not_in_plan() -> None:
    """No home_net note is produced when ``scan`` is not part of the plan."""
    from loghunter.runner import _home_net_note

    config = {"loghunter": {"home_net": _RFC1918_HOME_NET}}
    plan_without_scan = _scan_plan(scan_in_plan=False)
    note = _home_net_note(plan_without_scan, config)
    assert note is None
1859
+
1860
+
1861
+ # ── Stage 3: caller-owned TextIO seam for digest fan-out ─────────────────────
1862
+
1863
+
1864
def test_build_output_handler_caller_stream_no_open_no_close(
    tmp_path: Path,
) -> None:
    """``_build_output_handler(..., stream=<TextIO>)`` wraps the caller's stream.

    The returned close callback must leave that stream open, and writes
    through the handler must land in it.
    """
    import io as _io
    from loghunter.runner import _build_output_handler

    caller_buf = _io.StringIO()
    handler, close = _build_output_handler(
        "text",
        output_dir=None,
        output_file=None,
        verbose_level=0,
        stream=caller_buf,
    )

    close()
    assert not caller_buf.closed

    # The handler's stream is the caller's buffer, so the probe shows up there.
    handler._stream.write("probe\n")
    assert caller_buf.getvalue() == "probe\n"
1882
+
1883
+
1884
def test_run_digest_conn_writes_to_caller_stream(
    tmp_path: Path,
) -> None:
    """``run_digest(..., stream=<StringIO>)`` renders the conn card into the caller's stream.

    No additional file may appear next to the input log.
    """
    import io as _io
    from loghunter import runner as _runner

    record_line = (
        '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10", '
        '"id.resp_h": "198.51.100.20", "id.resp_p": 443, '
        '"proto": "tcp", "duration": 1.23}\n'
    )
    log_path = tmp_path / "conn.log"
    log_path.write_text(record_line, encoding="utf-8")

    buf = _io.StringIO()
    _runner.run_digest(
        config={"loghunter": {}},
        zeek_dir=log_path,
        stream=buf,
        skip_confirm=True,
        schema="conn",
    )

    rendered = buf.getvalue()
    # The card must name the source file and carry the "conn ·" identity text.
    for fragment in ("conn.log", "conn ·"):
        assert fragment in rendered
    # Only the input log exists in tmp_path.
    leftover = sorted(p.name for p in tmp_path.iterdir())
    assert leftover == ["conn.log"]
1914
+
1915
+
1916
def test_run_digest_blob_writes_to_caller_stream(
    tmp_path: Path,
) -> None:
    """Stage 3 regression: ``run_digest(schema='blob', stream=<StringIO>)`` uses the caller's stream.

    If the stream were not threaded into ``_run_digest_blob``'s
    ``_build_output_handler``, blob cards would silently bypass the shared
    --out file and the fan-out contract would break as soon as a positional
    sniffed to blob.
    """
    import io as _io
    from loghunter import runner as _runner

    blob_text = (
        "unrecognized-app-banner xyzzy 42 frobnicate\n"
        "second line with no clear schema\n"
    )
    blob = tmp_path / "weird.txt"
    blob.write_text(blob_text, encoding="utf-8")

    buf = _io.StringIO()
    _runner.run_digest(
        config={"loghunter": {}},
        blob_path=blob,
        stream=buf,
        skip_confirm=True,
        schema="blob",
    )

    rendered = buf.getvalue()
    # The card must name the source file and the "Unrecognized source" headline.
    for fragment in ("weird.txt", "Unrecognized source"):
        assert fragment in rendered
    # Nothing besides the input was materialised in tmp_path.
    leftover = sorted(p.name for p in tmp_path.iterdir())
    assert leftover == ["weird.txt"]
1948
+
1949
+
1950
+ # ── Liveness narration in the detector loop ───────────────────────────────────
1951
+
1952
+
1953
+ def _fake_detector(name: str, run_impl):
1954
+ """Build a minimal fake detector module suitable for the runner loop."""
1955
+ return SimpleNamespace(
1956
+ DETECTOR_NAME=name,
1957
+ STATUS="available",
1958
+ REQUIRED_LOGS=[],
1959
+ OPTIONAL_LOGS=[],
1960
+ DEFAULT_CONFIG={},
1961
+ run=run_impl,
1962
+ )
1963
+
1964
+
1965
def test_liveness_seals_one_record_per_non_syslog_detector(
    tmp_path: Path, capture_summary, monkeypatch, capsys
) -> None:
    """Each non-syslog detector gets one sealed liveness line on stderr.

    The detector that returned findings is sealed as 'done', the empty one
    as 'nothing'.  The seal MUST NOT carry the finding count — the W2 report
    header is the single authoritative count surface (the double-count fix
    from James's revamp CR).  Neither record may appear on stdout.
    """
    import re

    # Opaque placeholder findings; the output handler is patched by the fixture.
    first, second = SimpleNamespace(), SimpleNamespace()
    fakes = {
        "alpha": _fake_detector("alpha", lambda ctx: [first, second]),
        "beta": _fake_detector("beta", lambda ctx: []),
    }
    monkeypatch.setattr(runner, "discover_detectors", lambda: fakes)

    runner.run(config={"loghunter": {"detect": "alpha,beta"}})

    captured = capsys.readouterr()
    stderr, stdout = captured.err, captured.out
    seals = ("alpha: done", "beta: nothing")
    for seal in seals:
        assert seal in stderr
    # No count in the seal; the header carries it.
    assert re.search(r"alpha: \d+ findings", stderr) is None
    # Seals are stderr-only.
    for seal in seals:
        assert seal not in stdout
    # runner.run returns None, so the findings are checked via the patched handler.
    assert capture_summary["findings"] == [first, second]
1997
+
1998
+
1999
def test_liveness_suppresses_seal_on_detector_error(
    tmp_path: Path, capture_summary, monkeypatch, capsys
) -> None:
    """A raising detector yields the 'detector error' line and NO sealed record."""
    import re

    def _boom(ctx):
        raise RuntimeError("boom")

    fakes = {"gamma": _fake_detector("gamma", _boom)}
    monkeypatch.setattr(runner, "discover_detectors", lambda: fakes)

    runner.run(config={"loghunter": {"detect": "gamma"}})

    stderr = capsys.readouterr().err
    assert "gamma: detector error — boom" in stderr
    # No seal of any shape for the errored detector.
    for forbidden in ("gamma: nothing", "gamma: 0 findings"):
        assert forbidden not in stderr
    assert re.search(r"gamma: \d+ findings", stderr) is None
    # The run still completed: the patched handler received an empty list.
    assert capture_summary["findings"] == []
2022
+
2023
+
2024
def test_liveness_skips_outer_spinner_for_syslog(
    tmp_path: Path, capture_summary, monkeypatch, capsys
) -> None:
    """syslog gets no outer liveness wrapper.

    Its inner drain3 tqdm carries the narration for that phase, so stderr
    must contain neither the 'running syslog' label nor a 'syslog: ...' seal.
    """
    fakes = {"syslog": _fake_detector("syslog", lambda ctx: [])}
    monkeypatch.setattr(runner, "discover_detectors", lambda: fakes)

    runner.run(config={"loghunter": {"detect": "syslog"}})

    stderr = capsys.readouterr().err
    for forbidden in ("running syslog", "syslog: nothing", "syslog: 0 findings"):
        assert forbidden not in stderr
2039
+
2040
+
2041
+ # ── _ts_confidence (item 4: timestamp-confidence floor) ──────────────────────
2042
+
2043
+
2044
+ def _ts_frame(ts_values: list[float]) -> pd.DataFrame:
2045
+ """Build a minimal frame carrying only the ts column from a list of
2046
+ float values (use float("nan") for unparseable rows)."""
2047
+ return pd.DataFrame({"ts": ts_values})
2048
+
2049
+
2050
def test_ts_confidence_full_parseable_with_span_is_confident() -> None:
    """Every row parseable and a non-zero span → True."""
    frame = _ts_frame([1000.0, 1100.0, 1200.0, 1300.0])
    assert _ts_confidence(frame) is True
2053
+
2054
+
2055
def test_ts_confidence_at_floor_passes() -> None:
    """8 parseable rows out of 10 (0.80) with a non-zero span → True; the floor is inclusive."""
    parseable = [float(k) for k in range(1, 9)]
    unparseable = [float("nan")] * 2
    frame = _ts_frame(parseable + unparseable)
    assert _ts_confidence(frame) is True
    assert _DIGEST_TS_CONFIDENCE_FLOOR == 0.80
2062
+
2063
+
2064
def test_ts_confidence_just_below_floor_fails() -> None:
    """7 parseable rows out of 10 (0.70 < 0.80) → False."""
    parseable = [float(k) for k in range(1, 8)]
    unparseable = [float("nan")] * 3
    frame = _ts_frame(parseable + unparseable)
    assert _ts_confidence(frame) is False
2069
+
2070
+
2071
def test_ts_confidence_all_nan_fails() -> None:
    """Fifty NaN timestamps → False (coverage gate)."""
    all_nan = [float("nan")] * 50
    assert _ts_confidence(_ts_frame(all_nan)) is False
2074
+
2075
+
2076
def test_ts_confidence_zero_span_fails() -> None:
    """Eighty events at the same instant → False (span gate).

    The flat card renders the SAME bare "(timeline unavailable)" line for
    both the coverage gate and the span gate; the differentiated footer text
    and the sentinel reasons no longer exist.
    """
    same_instant = [42.0] * 80
    assert _ts_confidence(_ts_frame(same_instant)) is False
2084
+
2085
+
2086
def test_ts_confidence_no_ts_column_fails() -> None:
    """A frame that lacks a ``ts`` column → False (structural coverage shape)."""
    without_ts = pd.DataFrame({"src": ["192.0.2.1", "192.0.2.2"]})
    assert _ts_confidence(without_ts) is False
2090
+
2091
+
2092
def test_ts_confidence_empty_frame_fails() -> None:
    """Defensive case: a ``ts`` column with no rows → False."""
    empty = pd.DataFrame({"ts": []})
    assert _ts_confidence(empty) is False
2095
+
2096
+
2097
+ # ── Both timeline-failure modes render the same bare line (no footer) ───────
2098
+
2099
+
2100
+ def _zeek_conn_line(ts: float) -> str:
2101
+ return (
2102
+ '{"_path": "conn", "ts": ' + repr(ts) + ', "id.orig_h": "192.0.2.10",'
2103
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp"}\n'
2104
+ )
2105
+
2106
+
2107
def test_run_digest_zero_span_renders_bare_timeline_unavailable(
    tmp_path: Path, capsys,
) -> None:
    """Zero-span timestamps render the bare "(timeline unavailable)" line and NO footer block.

    The old differentiated footer text is gone with the flat card grammar.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    # Five identical records, all at the same instant.
    same_instant = _zeek_conn_line(1779750000.0)
    (zeek_dir / "conn.log").write_text(same_instant * 5, encoding="utf-8")

    runner.run_digest(
        config={"loghunter": {}},
        zeek_dir=zeek_dir,
        load_all=True,
        skip_confirm=True,
    )

    out = capsys.readouterr().out
    assert "(timeline unavailable)" in out
    # No footer / N.B. block anywhere in the flat grammar.
    for retired in ("N.B.", "timeline collapsed", "timestamp unparseable"):
        assert retired not in out
2130
+
2131
+
2132
def test_run_digest_low_coverage_renders_bare_timeline_unavailable(
    tmp_path: Path, capsys,
) -> None:
    """Low-coverage timestamps render the SAME bare line as zero-span.

    This proves the ``_ts_confidence`` collapse to a boolean predicate, not
    just sentinel deletion.
    """
    syslog_dir = tmp_path / "syslog"
    syslog_dir.mkdir()
    lines = ["<134>May 31 12:00:00 192.0.2.1 sshd[100]: real line"]
    lines += [f"garbage line {k}" for k in range(1, 5)]
    (syslog_dir / "router.log").write_text(
        "\n".join(lines) + "\n",
        encoding="utf-8",
    )

    runner.run_digest(
        config={"loghunter": {}},
        syslog_dir=syslog_dir,
        load_all=True,
        skip_confirm=True,
        schema="syslog",
    )

    out = capsys.readouterr().out
    assert "(timeline unavailable)" in out
    for retired in ("N.B.", "timestamp unparseable", "floor 80%"):
        assert retired not in out
2159
+
2160
+
2161
def test_run_digest_summariser_raise_without_fallback_path_reraises(
    tmp_path: Path, monkeypatch,
) -> None:
    """A summariser failure propagates when ``fallback_blob_path`` is left at ``None``.

    The bare-config caller has no single-file fallback, so the exception
    must reach the caller (where the CLI's existing ValueError arm can
    format the message) instead of being swallowed by the narrow wrap.
    """
    # A minimal Zeek conn file that loads fine.
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    conn_record = (
        '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
        ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp"}\n'
    )
    (zeek_dir / "conn.log").write_text(conn_record, encoding="utf-8")

    def _always_raise(*_args, **_kwargs):
        raise RuntimeError("induced summariser failure")

    def _exploding_summarizer(_schema_name: str):
        return _always_raise

    monkeypatch.setattr(
        "loghunter.digest.get_summarizer", _exploding_summarizer,
    )

    # fallback_blob_path is not passed, so the failure must re-raise.
    with pytest.raises(RuntimeError, match="induced summariser failure"):
        runner.run_digest(
            config={"loghunter": {}},
            zeek_dir=zeek_dir,
            load_all=True,
            skip_confirm=True,
        )
2194
+
2195
+
2196
+ # ── _prepare_detector_context + prep-error vs detector-error labels ─────────
2197
+ #
2198
+ # Addendum (docs/BUGS.md "Detector liveness starts too late"): the
2199
+ # per-detector prep (filter_df + DetectorContext construction) now lives
2200
+ # INSIDE the per-detector liveness block, so the spinner appears as soon
2201
+ # as the operator-visible work begins. A failure during prep must be
2202
+ # labelled "prep error" — distinct from "detector error" — because the
2203
+ # runner owns prep, not the detector (separation-of-powers).
2204
+
2205
+
2206
def _zeek_conn_dir(tmp_path: Path) -> Path:
    """Create ``tmp_path/zeek`` with a ``conn.log`` of two records and return the directory.

    The records are ``_conn(_TS_JAN5)`` and ``_conn(_TS_JAN5 + 60.0)``.
    """
    zeek_dir = tmp_path / "zeek"
    zeek_dir.mkdir()
    records = [_conn(_TS_JAN5), _conn(_TS_JAN5 + 60.0)]
    _write_ndjson(zeek_dir / "conn.log", records)
    return zeek_dir
2215
+
2216
+
2217
def test_prep_error_renders_prep_error_label_not_detector_error(
    tmp_path: Path, monkeypatch, capsys,
) -> None:
    """A raise inside ``_prepare_detector_context`` is reported as 'prep error'.

    The detector module is not at fault — the runner's own prep raised — so
    the 'detector error' label must stay absent.
    """
    zeek_dir = _zeek_conn_dir(tmp_path)

    def _exploding_prep(*_args, **_kwargs):
        raise RuntimeError("induced prep failure")

    monkeypatch.setattr(
        runner, "_prepare_detector_context", _exploding_prep,
    )

    runner.run(config=_BEACON_ONLY, zeek_dir=zeek_dir)

    stderr = capsys.readouterr().err
    assert "beacon: prep error — induced prep failure" in stderr
    # The detector-error label would mislead the operator about WHERE the
    # failure happened (separation-of-powers detail).
    assert "beacon: detector error" not in stderr
2239
+
2240
+
2241
def test_detector_error_label_preserved_byte_identical(
    tmp_path: Path, monkeypatch, capsys,
) -> None:
    """A raise inside ``mod.run(ctx)`` keeps the existing 'detector error — ...' shape exactly."""
    import loghunter.detectors.beacon as beacon_mod

    def _exploding_run(_ctx):
        raise RuntimeError("induced detector failure")

    zeek_dir = _zeek_conn_dir(tmp_path)
    monkeypatch.setattr(beacon_mod, "run", _exploding_run)

    runner.run(config=_BEACON_ONLY, zeek_dir=zeek_dir)

    stderr = capsys.readouterr().err
    assert "beacon: detector error — induced detector failure" in stderr
    # A detector-side raise must not be labelled as a prep error.
    assert "beacon: prep error" not in stderr
2261
+
2262
+
2263
def test_liveness_seal_lands_once_for_successful_run(
    tmp_path: Path, monkeypatch, capsys, capture_summary,
) -> None:
    """A run whose detector succeeds must emit exactly one non-error
    'beacon:' line on stderr.

    Guards against a double-seal regression: a second seal line would
    show up here as an extra 'beacon:' line.
    """
    zeek_dir = _zeek_conn_dir(tmp_path)

    # Stub the detector so it yields no findings; this test is about how
    # many seal lines are printed, not about detection logic.
    import loghunter.detectors.beacon as beacon_mod
    monkeypatch.setattr(beacon_mod, "run", lambda _ctx: [])

    runner.run(config=_BEACON_ONLY, zeek_dir=zeek_dir)
    stderr_text = capsys.readouterr().err

    # Collect every stderr line that starts with the detector prefix and
    # is not an error report.
    seal_lines = []
    for line in stderr_text.splitlines():
        if "error" in line:
            continue
        if line.strip().startswith("beacon:"):
            seal_lines.append(line)

    assert len(seal_lines) == 1, (
        f"expected exactly one beacon seal line, got {seal_lines!r}"
    )
2291
+
2292
+
2293
def test_prepare_detector_context_filters_per_pattern(tmp_path: Path) -> None:
    """Unit test for ``runner._prepare_detector_context``.

    Checks that ``allowlist.filter_df`` is invoked twice for a detector
    declaring one REQUIRED and one OPTIONAL pattern, that a frame under an
    undeclared pattern is handed through as the same object, and that the
    result is a ``DetectorContext`` carrying the supplied config,
    data_sources and home_net.
    """
    from loghunter.common.finding import DetectorContext as _DC

    filter_calls: list[tuple[str, str]] = []

    class _RecordingAllowlist:
        def filter_df(self, df, name):
            # Record the call and hand the frame back untouched; only the
            # fact of being called matters here.
            filter_calls.append((name, "<df>"))
            return df

    mod = SimpleNamespace(
        REQUIRED_LOGS=[{"source": "zeek_dir", "pattern": "conn*.log*"}],
        OPTIONAL_LOGS=[{"source": "zeek_dir", "pattern": "dns*.log*"}],
    )

    conn_df = pd.DataFrame({"a": [1, 2]})
    dns_df = pd.DataFrame({"b": [3]})
    other_df = pd.DataFrame({"c": [4]})
    logs = {
        "conn*.log*": conn_df,
        "dns*.log*": dns_df,
        "other*.log*": other_df,
    }

    window_edge = datetime(2026, 1, 5, tzinfo=timezone.utc)
    ctx = runner._prepare_detector_context(
        mod=mod,
        name="beacon",
        logs=logs,
        allowlist=_RecordingAllowlist(),
        det_cfg={"k": "v"},
        data_window=(window_edge, window_edge),
        data_sources=["zeek_conn"],
        home_net=["10.0.0.0/8"],
    )

    # One filter_df call per declared pattern (conn + dns), each made
    # under the detector name "beacon".
    recorded = ("beacon", "<df>")
    assert recorded in filter_calls
    assert filter_calls.count(recorded) == 2

    # The undeclared pattern is still present and is the very same frame.
    assert "other*.log*" in ctx.logs
    assert ctx.logs["other*.log*"] is other_df

    # The remaining fields are carried over onto a DetectorContext.
    assert isinstance(ctx, _DC)
    assert ctx.config == {"k": "v"}
    assert ctx.data_sources == ["zeek_conn"]
    assert ctx.home_net == ["10.0.0.0/8"]
2345
+
2346
+
2347
+ # ── Rotation-peek disclosure notes (real runner.run, syslog_dir) ───────────────
2348
+ #
2349
+ # Drive runner.run end-to-end (NOT mocked) so the loader→RunSummary note seam is
2350
+ # exercised. since/until are derived by parsing the fixture lines so the tests do
2351
+ # not depend on the machine clock year. _rotation_skip_notes is the formatter.
2352
+
2353
+ from loghunter.parsers.syslog import parse_timestamp as _parse_ts
2354
+
2355
+ _SYSLOG_ONLY = {"loghunter": {"detect": "syslog"}}
2356
+
2357
+
2358
+ def _sysrot_line(mon: str, day: int) -> str:
2359
+ return f"{mon} {day:>2} 12:00:00 host1 sshd[1]: session opened for user"
2360
+
2361
+
2362
def _write_sysrot(d: Path, base: str, ts_by_ordinal: dict[int, tuple[str, int]]) -> None:
    """Write a family of rotation files into directory ``d``.

    Ordinal 0 is written as ``base``; any other ordinal ``n`` as
    ``base.n``. Each file holds a single ``_sysrot_line`` for that
    ordinal's ``(month, day)`` pair. ``d`` is created if missing.
    """
    d.mkdir(parents=True, exist_ok=True)
    for ordinal, stamp in ts_by_ordinal.items():
        month, day_of_month = stamp
        filename = f"{base}.{ordinal}" if ordinal != 0 else base
        content = _sysrot_line(month, day_of_month) + "\n"
        (d / filename).write_text(content, encoding="utf-8")
2367
+
2368
+
2369
def test_runner_rotation_skip_note_neutral_wording(tmp_path, capture_summary):
    """With both ``since`` and ``until`` set, a rotation family that has one
    file newer than the window and one older than it must yield a single
    note using the direction-neutral 'outside' wording, counting 3 loaded
    of 5 files."""
    syslog_root = tmp_path / "syslog"
    rotation = {
        0: ("Jun", 10),  # too-new (oldest row > until) → skipped
        1: ("Jun", 8),   # in window
        2: ("Jun", 6),   # in window
        3: ("Jun", 4),   # straddle since → kept
        4: ("Jun", 2),   # too-old → skipped
    }
    _write_sysrot(syslog_root, "syslog.log", rotation)

    # Window bounds are parsed from fixture-shaped lines rather than built
    # from the current date.
    window_start = _parse_ts(_sysrot_line("Jun", 5))
    window_end = _parse_ts(_sysrot_line("Jun", 9))
    runner.run(
        config=_SYSLOG_ONLY,
        syslog_dir=syslog_root,
        since=window_start,
        until=window_end,
    )

    expected_note = (
        "syslog: loaded 3 of 5 rotation files; 2 skipped outside the selected "
        "window (by rotation order)."
    )
    summary = capture_summary["summary"]
    assert expected_note in summary.notes
2392
+
2393
+
2394
def test_runner_rotation_fallback_note_wins(tmp_path, capture_summary):
    """If one rotation family in the directory is out of order, the summary
    carries the 'not monotonic' fallback note and no skip-summary note."""
    syslog_root = tmp_path / "syslog"
    ordered_family = {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4), 3: ("Jun", 3)}
    disordered_family = {0: ("Jun", 8), 1: ("Jun", 10)}  # first-ts RISE → disorder
    _write_sysrot(syslog_root, "auth.log", ordered_family)
    _write_sysrot(syslog_root, "kern.log", disordered_family)

    window_start = _parse_ts(_sysrot_line("Jun", 5))
    runner.run(config=_SYSLOG_ONLY, syslog_dir=syslog_root, since=window_start)

    notes = capture_summary["summary"].notes
    fallback_note = (
        "syslog: rotation order not monotonic — read the full archive "
        "(windowing skipped)."
    )
    assert fallback_note in notes
    for note in notes:
        assert "skipped outside the selected window" not in note
2411
+
2412
+
2413
def test_runner_rotation_no_note_when_unwindowed(tmp_path, capture_summary):
    """Running with ``load_all=True`` and no since/until must leave the
    summary free of any note mentioning rotation."""
    syslog_root = tmp_path / "syslog"
    rotation = {0: ("Jun", 6), 1: ("Jun", 5), 2: ("Jun", 4)}
    _write_sysrot(syslog_root, "syslog.log", rotation)

    runner.run(config=_SYSLOG_ONLY, syslog_dir=syslog_root, load_all=True)

    rotation_notes = [
        note for note in capture_summary["summary"].notes
        if "rotation" in note.lower()
    ]
    assert not rotation_notes
2420
+
2421
+
2422
def test_runner_rotation_overlap_export_window_note(tmp_path, capture_summary):
    """Two exporter-style files with overlapping windows in one flat
    directory must produce the 'overlapping export windows' fallback note,
    and must not produce the 'not monotonic' one."""
    syslog_root = tmp_path / "syslog"
    syslog_root.mkdir(parents=True, exist_ok=True)
    exports = (
        ("splunk_20260601_7d.log", 1),
        ("splunk_20260605_1d.log", 5),
    )
    for filename, day in exports:
        (syslog_root / filename).write_text(
            _sysrot_line("Jun", day) + "\n", encoding="utf-8",
        )

    window_start = _parse_ts(_sysrot_line("Jun", 5))
    runner.run(config=_SYSLOG_ONLY, syslog_dir=syslog_root, since=window_start)

    notes = capture_summary["summary"].notes
    overlap_note = (
        "syslog: overlapping export windows — read the full archive "
        "(windowing skipped)."
    )
    assert overlap_note in notes
    for note in notes:
        assert "not monotonic" not in note
2440
+
2441
+
2442
def test_runner_rotation_duplicate_note(tmp_path, capture_summary):
    """A plain file alongside its ``.gz`` sibling must produce the
    'duplicate rotation files' fallback note, with neither the
    'not monotonic' nor the 'overlapping' wording present."""
    syslog_root = tmp_path / "syslog"
    syslog_root.mkdir(parents=True, exist_ok=True)

    line = _sysrot_line("Jun", 6) + "\n"
    (syslog_root / "auth.log").write_text(line, encoding="utf-8")
    with gzip.open(syslog_root / "auth.log.gz", "wt", encoding="utf-8") as gz_file:
        gz_file.write(_sysrot_line("Jun", 6) + "\n")

    window_start = _parse_ts(_sysrot_line("Jun", 5))
    runner.run(config=_SYSLOG_ONLY, syslog_dir=syslog_root, since=window_start)

    notes = capture_summary["summary"].notes
    duplicate_note = (
        "syslog: duplicate rotation files — read the full archive "
        "(windowing skipped)."
    )
    assert duplicate_note in notes
    for note in notes:
        assert "not monotonic" not in note
        assert "overlapping" not in note
2462
+
2463
+
2464
+ # ── _source_overlap_notes — plan-time source-dir overlap disclosure ───────────
2465
+
2466
+
2467
def _plan_with_needed(needed_logs: dict[str, str]) -> RunPlan:
    """Build a ``RunPlan`` that is empty apart from ``needed_logs``."""
    return RunPlan(
        detectors={},
        selected=[],
        will_run=[],
        skipped={},
        needed_logs=needed_logs,
    )
2473
+
2474
+
2475
def test_source_overlap_two_families_same_dir(tmp_path) -> None:
    """Two planned families pointing at one directory give exactly one
    note, which names both families, includes the resolved path and ends
    with the auto-segment hint."""
    shared = tmp_path / "shared"
    shared.mkdir()
    plan = _plan_with_needed(
        {"conn*.log*": "zeek_dir", "*.log*": "syslog_dir"}
    )
    source_dirs = {"zeek_dir": [shared], "syslog_dir": [shared]}

    notes = _source_overlap_notes(source_dirs, plan)

    assert len(notes) == 1, notes
    note = notes[0]
    assert note.startswith("zeek_dir, syslog_dir resolve to the same directory")
    assert str(shared.resolve()) in note
    # The tail must not hard-code an exports/<x>/ path.
    assert "global exports now auto-segment per source" in note
2490
+
2491
+
2492
def test_source_overlap_three_families_same_dir(tmp_path) -> None:
    """Three planned families sharing one directory still give a single
    note, listing all three family keys."""
    shared = tmp_path / "shared"
    shared.mkdir()
    families = ("zeek_dir", "syslog_dir", "pihole_dir")
    source_dirs = {family: [shared] for family in families}
    plan = _plan_with_needed({
        "conn*.log*": "zeek_dir",
        "*.log*": "syslog_dir",
        "pihole*.log*": "pihole_dir",
    })

    notes = _source_overlap_notes(source_dirs, plan)

    assert len(notes) == 1, notes
    expected_prefix = "zeek_dir, syslog_dir, pihole_dir resolve to the same directory"
    assert notes[0].startswith(expected_prefix)
2509
+
2510
+
2511
def test_source_overlap_in_plan_negative(tmp_path) -> None:
    """Two configured families share a directory, but only one of them is
    in the plan, so no overlap note is produced."""
    shared = tmp_path / "shared"
    shared.mkdir()
    source_dirs = {"zeek_dir": [shared], "syslog_dir": [shared]}

    # syslog_dir is configured but has no entry in needed_logs; only
    # zeek_dir is planned.
    plan = _plan_with_needed({"conn*.log*": "zeek_dir"})

    notes = _source_overlap_notes(source_dirs, plan)
    assert notes == []
2521
+
2522
+
2523
def test_source_overlap_nested_dirs_stay_silent(tmp_path) -> None:
    """A parent directory and its child are different paths, so two
    planned families pointing at them produce no overlap note.

    Both directories really exist, so the result rests on the paths
    being unequal rather than on one of them being absent.
    """
    parent_dir = tmp_path / "varlog"
    child_dir = parent_dir / "zeek"
    child_dir.mkdir(parents=True)

    source_dirs = {"syslog_dir": [parent_dir], "zeek_dir": [child_dir]}
    plan = _plan_with_needed(
        {"*.log*": "syslog_dir", "conn*.log*": "zeek_dir"}
    )

    notes = _source_overlap_notes(source_dirs, plan)
    assert notes == []
2535
+
2536
+
2537
def test_source_overlap_files_out_of_scope(tmp_path) -> None:
    """Two planned families pointing at the same regular FILE, not a
    directory, produce no overlap note."""
    shared_file = tmp_path / "shared.log"
    shared_file.write_text("x", encoding="utf-8")

    source_dirs = {"zeek_dir": [shared_file], "syslog_dir": [shared_file]}
    plan = _plan_with_needed(
        {"conn*.log*": "zeek_dir", "*.log*": "syslog_dir"}
    )

    notes = _source_overlap_notes(source_dirs, plan)
    assert notes == []
2547
+
2548
+
2549
def test_source_overlap_collapses_per_family_duplicates(tmp_path) -> None:
    """The same directory listed twice under ONE family is not an overlap,
    so no note is produced."""
    shared = tmp_path / "shared"
    shared.mkdir()

    source_dirs = {"zeek_dir": [shared, shared]}
    plan = _plan_with_needed({"conn*.log*": "zeek_dir"})

    notes = _source_overlap_notes(source_dirs, plan)
    assert notes == []
2557
+
2558
+
2559
+ # ── runner seam pin: the overlap note reaches RunSummary.notes ────────────────
2560
+
2561
+
2562
+ def test_runner_emits_source_overlap_note(
2563
+ tmp_path, capture_summary, mock_load_required_logs,
2564
+ ) -> None:
2565
+ """Seam pin (GLENN): the one-line notes.extend wiring lands the overlap note
2566
+ on the user-facing RunSummary.notes surface, not just in the pure helper.
2567
+
2568
+ zeek_dir (beacon, REQUIRED conn*.log*) and cloudtrail_dir (aws, REQUIRED
2569
+ *.json*) both point at one shared directory holding both files → both
2570
+ families are in-plan at the same resolved dir → overlap note fires."""
2571
+ from loghunter.common.loader import LoadResult, SourceCoverage
2572
+
2573
+ shared = tmp_path / "shared"
2574
+ shared.mkdir()
2575
+ _write_ndjson(shared / "conn.log", [_conn(_TS_JAN5)])
2576
+ (shared / "events.json.log").write_text("{}", encoding="utf-8")
2577
+
2578
+ fake_lr = LoadResult(
2579
+ logs={
2580
+ "conn*.log*": pd.DataFrame(columns=["ts", "src", "dst"]),
2581
+ "*.json*": pd.DataFrame(columns=_CT_COLUMNS_FOR_MOCK),
2582
+ },
2583
+ record_counts={},
2584
+ data_window=None,
2585
+ warnings=[],
2586
+ data_size_bytes=0,
2587
+ coverage={},
2588
+ )
2589
+ mock_load_required_logs(fake_lr)
2590
+
2591
+ runner.run(
2592
+ config={"loghunter": {"detect": "beacon,aws", "default_window": ""}},
2593
+ zeek_dir=shared,
2594
+ cloudtrail_dir=shared,
2595
+ )
2596
+ s = capture_summary["summary"]
2597
+ overlap = [n for n in s.notes if "resolve to the same directory" in n]
2598
+ assert len(overlap) == 1, s.notes
2599
+ assert "zeek_dir" in overlap[0] and "cloudtrail_dir" in overlap[0]