loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
loghunter/runner.py ADDED
@@ -0,0 +1,1895 @@
1
+ """Orchestrates detector execution: discovery, log loading, context assembly, and output.
2
+
3
+ Responsibilities:
4
+ - Auto-discover detectors by scanning loghunter/detectors/ for modules with DETECTOR_NAME
5
+ - Resolve the detect= selection (all, explicit list, exclusion syntax)
6
+ - Check REQUIRED_LOGS availability; skip with warning if missing
7
+ - Load logs and assemble DetectorContext for each detector
8
+ - Collect list[Finding] from each detector's run()
9
+ - Hand findings to Reporter
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import importlib
15
+ import pkgutil
16
+ import sys
17
+ from dataclasses import dataclass
18
+ from datetime import datetime, timedelta, timezone
19
+ from pathlib import Path
20
+ from typing import Any, Sequence
21
+
22
+ import pandas as pd
23
+
24
+ import loghunter.detectors as _detectors_pkg
25
+ from loghunter.common.config import get_detector_config, parse_window_span
26
+ from loghunter.common.display import (
27
+ TEXT_RULE,
28
+ TEXT_RULE_DOUBLE,
29
+ TEXT_RULE_WIDTH,
30
+ liveness,
31
+ )
32
+ from loghunter.common.errors import DigestEmpty, ExportAborted
33
+ from loghunter.common.finding import DetectorContext, Finding, RunSummary
34
+ from loghunter.common.output import OutputHandler, Reporter
35
+ from loghunter.common.sources import (
36
+ resolve_digest_source,
37
+ resolve_sources,
38
+ )
39
+
40
# Module-local aliases for the shared text-rule constants imported from
# loghunter.common.display, so this module can refer to them by short names.
_WIDTH = TEXT_RULE_WIDTH
_SEP = TEXT_RULE
_SEP_DOUBLE = TEXT_RULE_DOUBLE

# Full-fidelity DNS source labels. When any of these are in data_sources the
# Zeek evangelization nudge is suppressed. BIND9 and others join this set when
# their parsers land.
_RICH_DNS_SOURCES = {"zeek_dns"}
48
+
49
+
50
@dataclass(frozen=True)
class RunPlan:
    """Detector execution plan produced before loading any log data.

    Built by ``build_run_plan``. ``frozen=True`` prevents rebinding the
    attributes; the containers they reference are ordinary mutable
    dicts/lists.
    """

    # Every candidate detector module, keyed by detector name (discovered or
    # supplied by the caller of build_run_plan).
    detectors: dict[str, Any]
    # Detector names picked by the detect= spec, as returned by resolve_detect.
    selected: list[str]
    # Subset of `selected` whose required-logs check reported no problem.
    will_run: list[str]
    # Detector name -> reason string for each selected detector that was skipped.
    skipped: dict[str, str]
    # Log pattern -> source key (e.g. "zeek_dir") for every pattern to load.
    needed_logs: dict[str, str]
59
+
60
+
61
def run(
    config: dict[str, Any],
    detect: str | None = None,
    zeek_dir: str | Path | Sequence[str | Path] | None = None,
    syslog_dir: str | Path | Sequence[str | Path] | None = None,
    pihole_dir: str | Path | Sequence[str | Path] | None = None,
    cloudtrail_dir: str | Path | Sequence[str | Path] | None = None,
    since: datetime | None = None,
    until: datetime | None = None,
    output_format: str = "text",
    output_dir: Path | None = None,
    verbose_level: int = 0,
    dry_run: bool = False,
    export_allowlist: bool = False,
    load_all: bool = False,
    skip_confirm: bool = False,
    output_file: Path | None = None,
    scope: frozenset[str] | None = None,
) -> None:
    """Main entry point for a detection run. Called by CLI dispatch functions.

    Source-dir parameters (``zeek_dir`` / ``syslog_dir`` / ``pihole_dir`` /
    ``cloudtrail_dir``) are EXPLICIT OVERRIDES accepting either a scalar
    (``str`` / ``Path``) or a sequence of scalars (multi-positional analyze).
    ``None`` means "no override." Scalar callers are degenerate one-element
    lists downstream — byte-identical with the prior single-Path contract.
    Resolution happens inside ``loghunter.common.sources.resolve_sources``
    via the single ``_resolve_one`` site (per-element). CLI callers thread
    raw parsed strings or per-family lists; programmatic callers can pass
    already-resolved ``Path``s, lists thereof, or let ``None`` fall back to
    ``config["loghunter"][key]`` (LH_ROOT applied).

    ``scope`` is the SOLE scoping signal. ``None`` = unconstrained (every
    configured source-dir is eligible). A ``frozenset`` of source-dir keys
    restricts config-fallback to those keys — sibling source-dirs stay
    ``None`` and are NOT loaded. An override outside ``scope`` still wins
    (operator widening). The CLI sets ``scope`` from a positional PATH's
    routed source; the previous ``None``-as-scoped-out wire shape is
    retired.

    ``skip_confirm`` bypasses the advisory large-dataset prompt (controlled by
    ``[loghunter].warn_above``). Threaded from the CLI's ``--yes`` / ``-y`` flag.
    Has no effect on safety-critical actions — there are none today; advisory
    prompts only.

    ``output_file`` is the be_like_water FILE verdict — an exact file path for
    the report. When set it takes precedence over ``output_dir``; when both
    are None, the runner streams to stdout (text/json/csv) or writes
    ``loghunter-report.html`` in CWD (html). This preserves the bare-case
    behavior exactly.

    Once the output handler has been created, its close callback is always
    invoked — including when the reporter, the allowlist build, or a detector
    raises or the run is interrupted — so a report file opened for writing is
    never left dangling.

    Raises:
        ValueError: ``export_allowlist`` was requested on a non-dry run (the
            feature is not implemented yet).
        ExportAborted: the operator declined the large-dataset prompt.
    """
    cfg_lh = config.get("loghunter", {})

    # Single owner of source resolution: resolve_sources runs the four-key
    # truth table (override / scope / config fallback) and is the SOLE site
    # that converts a source-dir string to a Path. Runs BEFORE dry_run so
    # _print_dry_run sees resolved dirs (provenance rail).
    resolved = resolve_sources(
        config,
        overrides={
            "zeek_dir": zeek_dir,
            "syslog_dir": syslog_dir,
            "pihole_dir": pihole_dir,
            "cloudtrail_dir": cloudtrail_dir,
        },
        scope=scope,
    )
    zeek_dirs = resolved.zeek_dir
    syslog_dirs = resolved.syslog_dir
    pihole_dirs = resolved.pihole_dir
    cloudtrail_dirs = resolved.cloudtrail_dir

    plan = build_run_plan(
        detect_spec=detect if detect is not None else cfg_lh.get("detect", "all"),
        zeek_dir=zeek_dirs,
        syslog_dir=syslog_dirs,
        pihole_dir=pihole_dirs,
        cloudtrail_dir=cloudtrail_dirs,
    )

    if dry_run:
        _print_dry_run(
            zeek_dir=zeek_dirs,
            syslog_dir=syslog_dirs,
            pihole_dir=pihole_dirs,
            cloudtrail_dir=cloudtrail_dirs,
            since=since,
            until=until,
            load_all=load_all,
            will_run=plan.will_run,
            skipped=plan.skipped,
        )
        return

    if export_allowlist:
        raise ValueError(
            "--export-allowlist is not yet implemented — planned for a future release"
        )

    # Emit per-detector skip warnings to stderr
    for name, reason in plan.skipped.items():
        _warn_skipped(name, reason)

    if not plan.will_run:
        print(
            "No detectors could run — check required log source paths in config "
            "or CLI overrides.",
            file=sys.stderr,
        )
        return

    # ── Load logs ─────────────────────────────────────────────────────────────
    from loghunter.common import loader
    from loghunter.common.allowlist import build_matcher

    source_dirs: dict[str, list[Path]] = {}
    if zeek_dirs:
        source_dirs["zeek_dir"] = zeek_dirs
    if syslog_dirs:
        source_dirs["syslog_dir"] = syslog_dirs
    if pihole_dirs:
        source_dirs["pihole_dir"] = pihole_dirs
    if cloudtrail_dirs:
        source_dirs["cloudtrail_dir"] = cloudtrail_dirs

    # Resolve the UNIVERSAL default window. default_window governs every source
    # family (no longer Zeek-only): each family anchors on its OWN max-ts, with the
    # per-family load/trim strategy encoded in a single LoadWindow per family. The
    # loader owns the window policy now — resolve_load_windows is the SINGLE entry
    # point (shared with run_digest); each family's strategy declares its own
    # resolver. Engages only on an unqualified, non---all, unbounded, in-plan,
    # configured, eligible family.
    _default_spec: str = cfg_lh.get("default_window", "1d")
    load_windows = loader.resolve_load_windows(
        plan.needed_logs, source_dirs, _default_spec,
        load_all=load_all, since=since, until=until,
    )
    default_window_active = bool(load_windows)
    # source_windows: precise dated-Zeek window or conservative flat (floor, None);
    # families with select_window=None load full and are trimmed post-load.
    source_windows = (
        {w.source: w.select_window for w in load_windows if w.select_window is not None}
        or None
    )

    if default_window_active:
        # PLACEHOLDER VOICE (flag for the qmail error-voice pass). One pre-load
        # line above the loader's `loaded <file>` progress lines.
        print(
            f"Default window: last {_default_spec} of available data. "
            "Use --all for the full archive, or --since/--days to widen.",
            file=sys.stderr,
        )

    load_result = loader.load_required_logs(
        plan.needed_logs,
        source_dirs,
        since,
        until,
        verbose=(verbose_level >= 1),
        source_windows=source_windows,
    )

    # Post-load precise trim for every family whose default window engaged with a
    # load-full / conservative select-window (flat peek-prune, cloudtrail
    # load-full, flat/mixed Zeek). Dated-Zeek families carry trim_span=None — their
    # select_window already cut exactly at load. keep_null is wired from the
    # source's loader policy, so keep-policy families (syslog/pihole) retain
    # unparseable-ts rows through the implicit window exactly as through an
    # explicit one. Mixed file+dir trims the named file's rows WITH the bucket.
    for w in load_windows:
        if w.trim_span is None:
            continue
        family_patterns = [
            p for p, src in plan.needed_logs.items() if src == w.source
        ]
        load_result = loader.apply_default_window(
            load_result, family_patterns, w.trim_span, keep_null=w.keep_null,
        )
    logs = load_result.logs

    for warning in load_result.warnings:
        print(f"Warning: {warning}", file=sys.stderr)

    # ONE captured `now` for both the data-window fallback and requested_span,
    # so they cannot drift across separate clock reads.
    now = datetime.now(timezone.utc)
    if load_result.data_window is not None:
        data_window = load_result.data_window
    elif since or until:
        data_window = (since or now, until or now)
    else:
        data_window = (now, now)

    # The window the operator asked for, used by the data-found underfill
    # parenthetical. Default-window active → the configured spec; explicit
    # since&until → their span; since only → since→now; until-only / --all /
    # bounded full-load → None (unconstrained).
    requested_span: timedelta | None
    if default_window_active:
        requested_span = parse_window_span(_default_spec)
    elif since is not None and until is not None:
        requested_span = until - since
    elif since is not None:
        requested_span = now - since
    else:
        requested_span = None
    # No real data window (load yielded nothing the renderer can place — e.g. all
    # rows unparseable-ts under keep policy) → run() fabricated a (now, now) window.
    # Force requested_span None so the underfill parenthetical cannot render a
    # confident comparison over data that does not exist. The legitimate single-event
    # case keeps a real (ts, ts) data_window and is unaffected.
    if load_result.data_window is None:
        requested_span = None

    # Large-dataset warning. Suppressed when skip_confirm is set (--yes / -y).
    total_records = sum(load_result.record_counts.values())
    warn_above: int = cfg_lh.get("warn_above", 5_000_000)
    if total_records > warn_above and not skip_confirm:
        try:
            answer = input(
                f"{total_records:,} records found. This may take a while. Continue? [y/N] "
            )
        except (EOFError, KeyboardInterrupt):
            # No answer available (closed stdin / Ctrl-C) counts as "no".
            answer = ""
        if answer.strip().lower() not in ("y", "yes"):
            raise ExportAborted("loghunter: aborted by user")

    # Build run summary and begin output before the detector loop so the banner
    # ("Data found:", "Records:", "Detectors:") appears before analysis starts.
    data_sources = _derive_data_sources(plan.needed_logs, load_result.record_counts)
    # The default window is now announced pre-load on stderr (and the data-found
    # parenthetical carries the data-vs-requested span), so no prose default-window
    # note rides the run summary. The old "only X hours" short-window note is gone.
    notes: list[str] = []
    nudge = _dns_nudge(data_sources)
    if nudge:
        notes.append(nudge)
    aws_below_note = _aws_below_floor_note(plan, logs, config)
    if aws_below_note:
        notes.append(aws_below_note)
    # The aws --all riders key on CloudTrail ACTUALLY being narrowed (an explicit
    # window) — NOT run-level default-window activity. CloudTrail opts out of the
    # auto-default window, so a mixed unqualified run (dns/syslog windowed) loads
    # it FULL and must not be told to widen.
    cloudtrail_narrowed = since is not None or until is not None
    aws_window_note = _aws_window_note(plan, cloudtrail_narrowed=cloudtrail_narrowed)
    if aws_window_note:
        notes.append(aws_window_note)
    aws_no_interactive_note = _aws_no_interactive_note(
        plan, logs, cloudtrail_narrowed=cloudtrail_narrowed
    )
    if aws_no_interactive_note:
        notes.append(aws_no_interactive_note)
    home_net_note = _home_net_note(plan, config)
    if home_net_note:
        notes.append(home_net_note)
    # Source-dir overlap disclosure: when two IN-PLAN families resolve to the
    # same directory, flat discovery globs cross-read it (one log surfaced as
    # another's finding). Derives from already-resolved source_dirs + plan, like
    # the home_net note above. Appended before the coverage/rotation extends,
    # which are deliberately last.
    notes.extend(_source_overlap_notes(source_dirs, plan))
    # Source-coverage disclosure: for each planned source that contributed 0
    # in-window rows, append a note (SPAN / BARE / silent per the parse-gap
    # vs window-gap tri-state in CoverageTracker). Appended LAST so the
    # existing notes' relative order is preserved and the disclosure is
    # additive only. Reads the merged coverage written by the runner-side
    # flat-default block above (when fired).
    notes.extend(_zero_window_coverage_notes(load_result, plan))
    # Flat rotation-peek disclosure: one note per windowed pattern that fell back
    # to a full read or skipped out-of-window rotation files. Additive, appended last.
    notes.extend(_rotation_skip_notes(load_result, plan))
    detector_methods = {
        name: getattr(plan.detectors[name], "DETECTOR_METHOD", None)
        for name in plan.will_run
    }
    run_summary = RunSummary(
        data_window=data_window,
        record_counts=load_result.record_counts,
        data_size_bytes=load_result.data_size_bytes,
        detectors_run=plan.will_run,
        detectors_skipped=plan.skipped,
        notes=notes,
        data_sources=data_sources,
        detector_methods=detector_methods,
        requested_span=requested_span,
    )

    max_per_detector = int(cfg_lh.get("max_findings_per_detector", 100))
    handler, close_handler = _build_output_handler(
        output_format, output_dir, output_file, verbose_level,
        max_findings_per_detector=max_per_detector,
    )

    # From here on the handler may own an open report file. Everything up to
    # and including reporter.end() runs under ONE try/finally so the close
    # callback fires on every exit path (reporter failure, allowlist build
    # failure, Ctrl-C inside a detector, ...), not only after a clean write.
    try:
        reporter = Reporter([handler])
        reporter.begin(run_summary)

        # ── Run detectors ─────────────────────────────────────────────────────
        allowlist = build_matcher(config)
        home_net = list(cfg_lh.get("home_net", []))
        all_findings: list[Finding] = []

        for name in plan.will_run:
            mod = plan.detectors[name]
            det_cfg = get_detector_config(config, name, getattr(mod, "DEFAULT_CONFIG", {}))

            # Per-detector prep + run, scoped to honest error labels. Prep
            # (filter_df + DetectorContext construction) is the runner's
            # responsibility; a prep failure is "prep error", NOT "detector
            # error" — separation-of-powers detail. For the non-syslog
            # branch, both prep and run live INSIDE liveness(...) so the
            # spinner appears as soon as the operator-visible work begins
            # (the "Detector liveness starts too late" bug — see docs/BUGS.md
            # — was the prep running silently before the liveness block
            # opened).
            #
            # syslog stays outside the outer spinner branch: its inner
            # drain3 tqdm bar owns its stderr line, and an outer spinner
            # would fight for the same row. Prep moves into the syslog
            # branch too for consistency but stays outside its own
            # liveness wrapper.
            if name == "syslog":
                try:
                    ctx = _prepare_detector_context(
                        mod, name, logs, allowlist, det_cfg,
                        data_window, data_sources, home_net,
                    )
                except Exception as exc:
                    print(f"{name}: prep error — {exc}", file=sys.stderr)
                    continue
                try:
                    findings = mod.run(ctx)
                except Exception as exc:
                    print(f"{name}: detector error — {exc}", file=sys.stderr)
                    findings = []
            else:
                with liveness(f"running {name}") as _ln:
                    try:
                        ctx = _prepare_detector_context(
                            mod, name, logs, allowlist, det_cfg,
                            data_window, data_sources, home_net,
                        )
                    except Exception as exc:
                        # Prep failed BEFORE the detector even started — no
                        # seal (the "no false seal" path from
                        # tests/test_display.py:120-130); liveness's normal
                        # teardown clears the spinner line.
                        print(f"{name}: prep error — {exc}", file=sys.stderr)
                        continue
                    try:
                        findings = mod.run(ctx)
                        # The seal is a terse live completion record — "this
                        # detector finished" — NOT a tally. The report header
                        # (W2) is the single authoritative count surface
                        # (carries the H/M/L/I breakdown, survives redirect).
                        # Empty case stays informative: the detector ran and
                        # found nothing. ASCII-only per display.py's spinner
                        # discipline. Wording is a PLACEHOLDER pending the
                        # error-voice pass.
                        _ln.seal(
                            f"{name}: done"
                            if findings
                            else f"{name}: nothing"
                        )
                    except Exception as exc:
                        print(f"{name}: detector error — {exc}", file=sys.stderr)
                        findings = []

            all_findings.extend(findings)

        reporter.write(all_findings)
        reporter.end()
    finally:
        close_handler()
438
+
439
+
440
def _prepare_detector_context(
    mod: Any,
    name: str,
    logs: dict[str, Any],
    allowlist: Any,
    det_cfg: dict[str, Any],
    data_window: tuple[datetime, datetime],
    data_sources: list[str],
    home_net: list[str],
) -> DetectorContext:
    """Assemble one detector's allowlist-filtered log view and its context.

    The detector's own patterns are whatever it declares through
    ``REQUIRED_LOGS`` and ``OPTIONAL_LOGS``. Every non-empty frame stored
    under one of those patterns is passed through ``allowlist.filter_df``;
    all other frames (foreign patterns, empty frames) are handed over as-is.
    The result is a fresh dict per detector, so the shared ``logs`` mapping
    is never rebound by one detector's filtering.

    This stays in the runner rather than in detector code because
    ``allowlist.filter_df`` is suppression, and suppression belongs to the
    runner per the filter-before-analyze rail (CODE.md "Allowlist
    Architecture").

    No verbosity is threaded through (W6): the detector context carries no
    verbose flag, so the result set is verbosity-invariant by construction.
    """
    own_patterns = {
        spec["pattern"]
        for attr in ("REQUIRED_LOGS", "OPTIONAL_LOGS")
        for spec in getattr(mod, attr, [])
    }

    def _view(pattern: str, frame: Any) -> Any:
        # Suppression applies only to non-empty frames this detector declared.
        if pattern in own_patterns and not frame.empty:
            return allowlist.filter_df(frame, name)
        return frame

    per_detector_logs: dict[str, Any] = {
        pattern: _view(pattern, frame) for pattern, frame in logs.items()
    }
    return DetectorContext(
        logs=per_detector_logs,
        config=det_cfg,
        allowlist=allowlist,
        data_window=data_window,
        data_sources=data_sources,
        home_net=home_net,
    )
485
+
486
+
487
def discover_detectors() -> dict[str, Any]:
    """Return the usable detector modules under loghunter/detectors/, by name.

    Each module found in the detectors package is imported; a module whose
    import raises ``ImportError`` is silently passed over. A module counts as
    a detector when it defines ``DETECTOR_NAME`` and its ``STATUS`` (treated
    as ``"available"`` when absent) equals ``"available"``. The returned dict
    is keyed by each module's ``DETECTOR_NAME``.
    """
    found: dict[str, Any] = {}
    for module_info in pkgutil.iter_modules(_detectors_pkg.__path__):
        qualified_name = f"loghunter.detectors.{module_info.name}"
        try:
            candidate = importlib.import_module(qualified_name)
        except ImportError:
            continue
        if not hasattr(candidate, "DETECTOR_NAME"):
            continue
        if getattr(candidate, "STATUS", "available") != "available":
            continue
        found[candidate.DETECTOR_NAME] = candidate
    return found
498
+
499
+
500
+ def _as_path_list(value: Path | list[Path] | None) -> list[Path]:
501
+ """Normalize a build_run_plan / _print_dry_run source-dir param.
502
+
503
+ Accepts None (absent), a scalar Path (degenerate one-element list), or a
504
+ list of Paths (the canonical multi-input shape). Returns a list — empty
505
+ means absent. Lets callers and tests pass either form without juggling
506
+ the boundary; the internal pipeline operates on lists only. SAME
507
+ normalization shape as ``runner.run`` accepting ``str | Path | Sequence
508
+ | None`` at its outer boundary, propagated inward.
509
+ """
510
+ if value is None:
511
+ return []
512
+ if isinstance(value, Path):
513
+ return [value]
514
+ return list(value)
515
+
516
+
517
def build_run_plan(
    detect_spec: str | None,
    zeek_dir: Path | list[Path] | None = None,
    syslog_dir: Path | list[Path] | None = None,
    pihole_dir: Path | list[Path] | None = None,
    cloudtrail_dir: Path | list[Path] | None = None,
    detectors: dict[str, Any] | None = None,
) -> RunPlan:
    """Resolve detector selection, required-log skips, and log patterns to load.

    Each source-dir parameter accepts ``None`` (absent), a scalar ``Path``
    (degenerate one-element list), or a list of ``Path``s (the canonical
    multi-input shape from the resolver). Plan-time satisfiability uses the
    SAME discovery helpers the loader uses (``discover_zeek_files``,
    ``discover_cloudtrail_files``, ``_syslog_files``); plan and loader MUST
    discover the same universe.

    ``detect_spec`` falls back to ``"all"`` when it is ``None`` or empty.
    ``detectors`` replaces auto-discovery when it is not ``None``; an
    explicitly supplied EMPTY mapping is honored as "no detectors" rather
    than silently triggering discovery.

    Returns a ``RunPlan`` carrying the candidate detectors, the selected
    names, the names that will run, the skipped names with their reasons,
    and the pattern -> source map of logs to load.
    """
    # `is None`, not truthiness: `detectors or discover_detectors()` would
    # discard a caller's deliberate empty dict and scan the package instead.
    all_detectors = discover_detectors() if detectors is None else detectors
    selected = resolve_detect(str(detect_spec or "all"), sorted(all_detectors.keys()))

    # Only source families that actually have at least one path are recorded.
    candidate_sources = {
        "zeek_dir": _as_path_list(zeek_dir),
        "syslog_dir": _as_path_list(syslog_dir),
        "pihole_dir": _as_path_list(pihole_dir),
        "cloudtrail_dir": _as_path_list(cloudtrail_dir),
    }
    source_map: dict[str, list[Path]] = {
        key: paths for key, paths in candidate_sources.items() if paths
    }

    will_run: list[str] = []
    skipped: dict[str, str] = {}
    for name in selected:
        reason = _check_required_logs(all_detectors[name], source_map)
        if reason:
            skipped[name] = reason
        else:
            will_run.append(name)

    # Only include OPTIONAL_LOGS patterns that are actually satisfiable, to avoid
    # loading empty frames for optional sources that happen to be configured but have
    # no matching files (e.g. zeek_dir present but no dns*.log* when pihole satisfied).
    needed_logs: dict[str, str] = {}
    for name in will_run:
        mod = all_detectors[name]
        for req in getattr(mod, "REQUIRED_LOGS", []):
            # First detector to claim a pattern decides its source.
            if req["pattern"] not in needed_logs:
                needed_logs[req["pattern"]] = req["source"]
        for req in getattr(mod, "OPTIONAL_LOGS", []):
            if _is_optional_satisfiable(req, source_map) and req["pattern"] not in needed_logs:
                needed_logs[req["pattern"]] = req["source"]

    return RunPlan(
        detectors=all_detectors,
        selected=selected,
        will_run=will_run,
        skipped=skipped,
        needed_logs=needed_logs,
    )
580
+
581
+
582
def _build_output_handler(
    output_format: str,
    output_dir: Path | None,
    output_file: Path | None,
    verbose_level: int,
    stream: Any = None,
    *,
    max_findings_per_detector: int = 100,
) -> tuple[OutputHandler, Any]:
    """Create the requested output handler and a callback that closes any file stream.

    ``output_file`` is the be_like_water FILE verdict — an exact file path.
    When set it takes precedence over ``output_dir`` (which is the DIRECTORY
    verdict; runner auto-names inside it). With both None, text/json/csv stream
    to stdout and html writes ``loghunter-report.html`` in CWD.

    ``stream`` is the caller-owned TextIO seam used by the digest fan-out: the
    CLI resolves a shared `--out` target once and passes the open stream here
    so N cards concatenate into one file. Stream-backed formats only (text
    today); HTML/CSV/etc. carry different writer shapes and are not routed
    through this seam. Caller owns stream lifetime; close callback is a no-op.

    ``verbose_level`` is the single 0/1/2 dial. ONLY the text handler distinguishes
    all three levels; json is invariant; csv/html collapse to level >= 1 for
    description gating. ``max_findings_per_detector`` is the W5 cap (text only;
    machine formats never truncate). Constructed format-branched so json/csv/html
    receive only the cap-agnostic signature.

    Returns ``(handler, close_callback)``. The callback closes the file this
    function opened, and is a no-op for stdout, caller-owned streams and html.
    If constructing the handler fails after a report file has been opened
    here, that file is closed before the exception propagates.
    """
    from loghunter.common.output import get_handler

    handler_cls = get_handler(output_format)

    def _noop() -> None:
        """Close callback for targets this function does not own."""

    def _build(target_stream=None, output_path=None):
        if output_format == "text":
            if output_path is not None:
                # HTML uses output_path; text never does — this branch unreachable
                # for text but kept for symmetry.
                raise RuntimeError("text handler does not accept output_path")
            return handler_cls(
                stream=target_stream,
                verbose_level=verbose_level,
                max_findings_per_detector=max_findings_per_detector,
            )
        if output_format == "html":
            return handler_cls(output_path=output_path, verbose_level=verbose_level)
        return handler_cls(stream=target_stream, verbose_level=verbose_level)

    def _build_on_new_file(path: Path):
        # Open the report file and hand ownership to the returned close
        # callback; if the handler cannot be constructed, release the file
        # right away instead of leaking the descriptor.
        opened = path.open("w", encoding="utf-8", newline="")
        try:
            return _build(target_stream=opened), opened.close
        except BaseException:
            opened.close()
            raise

    if stream is not None:
        # Caller owns the stream; don't open, don't close. output_dir and
        # output_file are expected to be None at this seam (digest fan-out
        # resolves the target once in the CLI). Text format only.
        return _build(target_stream=stream), _noop

    # output_file (FILE verdict) wins — caller has decided the exact path.
    if output_file is not None:
        output_file.parent.mkdir(parents=True, exist_ok=True)
        if output_format == "html":
            return _build(output_path=output_file), _noop
        return _build_on_new_file(output_file)

    if output_format == "html":
        if output_dir is None:
            output_path = Path("loghunter-report.html")
        else:
            output_dir.mkdir(parents=True, exist_ok=True)
            output_path = output_dir / _report_filename(output_format)
        return _build(output_path=output_path), _noop

    if output_dir is not None:
        output_dir.mkdir(parents=True, exist_ok=True)
        return _build_on_new_file(output_dir / _report_filename(output_format))

    # Bare case: stream to stdout, which this function must never close.
    return _build(target_stream=sys.stdout), _noop
659
+
660
+
661
+ def _report_filename(output_format: str) -> str:
662
+ """Return a timestamped report filename used when --out (or report_dir) resolves to a directory."""
663
+ stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
664
+ suffix = "html" if output_format == "html" else output_format
665
+ return f"loghunter-{stamp}.{suffix}"
666
+
667
+
668
def resolve_detect(spec: str, available: list[str]) -> list[str]:
    """Turn a ``detect=`` spec into the ordered list of detector names to run.

    The spec is split on commas and whitespace. Each token is one of:

    - ``all``    — replaces everything collected so far with ``available``
                   (in the order ``available`` lists them);
    - ``!name``  — excludes ``name`` from the final result;
    - ``name``   — included when it appears in ``available``; names that are
                   not available are dropped without an error.

    Examples:
        "all"              → every available name
        "dns, beacon"      → ["dns", "beacon"]
        "all, !syslog"     → every available name except "syslog"
        "all,!syslog,!ssl" → every available name except syslog and ssl

    Returns the included names, first occurrence only, minus the exclusions.
    """
    wanted: list[str] = []
    dropped: set[str] = set()

    for word in spec.replace(",", " ").split():
        if word.startswith("!"):
            dropped.add(word[1:])
        elif word == "all":
            # "all" resets the inclusion list rather than extending it.
            wanted = list(available)
        elif word in available:
            wanted.append(word)

    # dict.fromkeys keeps the first occurrence of each name in order.
    return [name for name in dict.fromkeys(wanted) if name not in dropped]
701
+
702
+
703
def _any_input_yields_files(
    source: str, paths: list[Path], pattern: str,
) -> bool:
    """Report whether any configured input of one source family yields files.

    Inputs that do not exist are ignored. Each existing input is probed with
    the discovery helper for its family, imported from
    ``loghunter.common.loader``:

    - ``zeek_dir``       → ``discover_zeek_files(input, pattern)``
    - ``cloudtrail_dir`` → ``discover_cloudtrail_files(input)``
    - ``syslog_dir``     → ``_discover_syslog_files(input)``
    - ``pihole_dir``     → ``_syslog_files(input, pattern)``

    Any other source key falls back to "the input is a file, or
    ``input.glob(pattern)`` matches something", so an unrecognized family is
    not plan-skipped here.

    NOTE(review): the project's intent is that plan-time discovery uses the
    same helpers the loader uses at load time; the loader is outside this view.

    Returns True as soon as one input yields at least one file.
    """
    from loghunter.common.loader import (
        _discover_syslog_files,
        _syslog_files,
        discover_cloudtrail_files,
        discover_zeek_files,
    )

    def _yields(candidate: Path) -> bool:
        """Probe a single existing input with its family's discovery helper."""
        if source == "zeek_dir":
            return bool(discover_zeek_files(candidate, pattern))
        if source == "cloudtrail_dir":
            return bool(discover_cloudtrail_files(candidate))
        if source == "syslog_dir":
            return bool(_discover_syslog_files(candidate))
        if source == "pihole_dir":
            return bool(_syslog_files(candidate, pattern))
        # Unknown family: plain file check, then a glob over the directory.
        return candidate.is_file() or bool(list(candidate.glob(pattern)))

    return any(_yields(candidate) for candidate in paths if candidate.exists())
756
+
757
+
758
def _is_optional_satisfiable(
    req: dict[str, str],
    source_map: dict[str, Path | list[Path]],
) -> bool:
    """Tell whether one OPTIONAL_LOGS entry has at least one loadable file.

    ``req`` supplies the ``source`` family key and the ``pattern``. The entry
    is unsatisfiable when the family has no configured inputs in
    ``source_map``; otherwise the answer comes from
    ``_any_input_yields_files``.
    """
    family = req["source"]
    configured = _as_path_list(source_map.get(family))
    return bool(configured) and _any_input_yields_files(
        family, configured, req["pattern"],
    )
768
+
769
+
770
+ def _check_required_logs(
771
+ detector_module: Any,
772
+ source_map: dict[str, Path | list[Path]],
773
+ ) -> str | None:
774
+ """Return None if all REQUIRED_LOGS are available, or a human-readable reason if not."""
775
+ for req in getattr(detector_module, "REQUIRED_LOGS", []):
776
+ source = req["source"]
777
+ pattern = req["pattern"]
778
+
779
+ paths = _as_path_list(source_map.get(source))
780
+ if not paths:
781
+ return f"{source} not configured"
782
+
783
+ # Existence skip-reason mirrors single-input behavior on a one-element
784
+ # list: report the missing path. With multiple inputs, satisfiability
785
+ # is "ANY input yields files" — _any_input_yields_files handles
786
+ # per-input existence checks (skips non-existent), so we only emit a
787
+ # not-found skip when NO input yields anything.
788
+ if not _any_input_yields_files(source, paths, pattern):
789
+ if len(paths) == 1:
790
+ p = paths[0]
791
+ if not p.exists():
792
+ return f"{source} {p} not found"
793
+ if source == "cloudtrail_dir":
794
+ # Preserve the family-specific wording for the no-events
795
+ # skip path — recursive AWSLogs/<acct>/CloudTrail/<region>/
796
+ # discovery means a plain "pattern not found" reads
797
+ # confusingly.
798
+ return f"no CloudTrail JSON logs found in {p}"
799
+ return f"{pattern} not found in {p}"
800
+ # Multi-input — name the family rather than a single path.
801
+ return f"{pattern} not found in any configured {source} input"
802
+
803
+ if getattr(detector_module, "REQUIRES_ONE_OF_OPTIONAL", False):
804
+ for opt in getattr(detector_module, "OPTIONAL_LOGS", []):
805
+ if _is_optional_satisfiable(opt, source_map):
806
+ return None
807
+ return getattr(
808
+ detector_module,
809
+ "REQUIRES_ONE_OF_OPTIONAL_REASON",
810
+ f"{getattr(detector_module, 'DETECTOR_NAME', 'detector')} — no source available",
811
+ )
812
+
813
+ return None
814
+
815
+
816
+ def _warn_skipped(detector_name: str, reason: str) -> None:
817
+ """Print a skip warning to stderr in the canonical format."""
818
+ print(f"{reason} — skipping {detector_name} detection", file=sys.stderr)
819
+
820
+
821
+ def _zeek_entry_display(p: Path) -> str:
822
+ """Render one zeek_dir input for the dry-run block.
823
+
824
+ Mirrors the single-input format that has shipped: a DIRECTORY shows
825
+ ``{path} (N files, X.X MB)`` (counting only its immediate file children
826
+ — same iteration the prior helper did, NOT recursive); a FILE shows
827
+ ``{path} (X.X MB)``; a non-existent path shows ``{path} — not found``.
828
+ Single-input dry-run is byte-identical with the prior format.
829
+ """
830
+ if not p.exists():
831
+ return f"{p} — not found"
832
+ if p.is_dir():
833
+ log_files = [f for f in p.iterdir() if f.is_file()]
834
+ size_mb = sum(f.stat().st_size for f in log_files) / 1_048_576
835
+ return f"{p} ({len(log_files)} files, {size_mb:.1f} MB)"
836
+ try:
837
+ size_mb = p.stat().st_size / 1_048_576
838
+ return f"{p} ({size_mb:.1f} MB)"
839
+ except OSError:
840
+ return f"{p}"
841
+
842
+
843
+ def _status_entry_display(p: Path) -> str:
844
+ """Render one syslog/pihole/cloudtrail input for the dry-run block."""
845
+ status = "found" if p.exists() else "not found"
846
+ return f"{p} ({status})"
847
+
848
+
849
+ def _print_family_block(label: str, paths: list[Path], formatter) -> None:
850
+ """Render one source-family block in the dry-run output.
851
+
852
+ Empty list → ``{label:>15} not configured`` (byte-identical with prior
853
+ single-Path-None case).
854
+ One input → ``{label:>15} {formatter(input)}`` (byte-identical with
855
+ the prior single-input format — Glenn's preserve-byte-identical rail).
856
+ Multi-input → the first entry rides the label line, subsequent entries
857
+ indent under it at the same value column (17 chars: 15-char right-
858
+ justified label + 2-space gutter). NEVER emits a Python list repr.
859
+ """
860
+ head = f"{label + ':':>15}"
861
+ indent = " " * 15 # matches the right-justified label width
862
+ if not paths:
863
+ print(f"{head} not configured")
864
+ return
865
+ entries = [formatter(p) for p in paths]
866
+ print(f"{head} {entries[0]}")
867
+ for e in entries[1:]:
868
+ print(f"{indent} {e}")
869
+
870
+
871
def _print_dry_run(
    zeek_dir: Path | list[Path] | None,
    syslog_dir: Path | list[Path] | None,
    pihole_dir: Path | list[Path] | None,
    cloudtrail_dir: Path | list[Path] | None,
    since: datetime | None,
    until: datetime | None,
    load_all: bool,
    will_run: list[str],
    skipped: dict[str, str],
) -> None:
    """Print the dry-run summary to stdout.

    Sections, in order: banner, one block per source family (zeek, syslog,
    pihole, cloudtrail), the time window, the detectors that will run, one
    ``Skipped:`` line per distinct skip reason, and a closing hint.

    Each ``*_dir`` argument may be a single ``Path``, a list of paths, or
    ``None``; ``_as_path_list`` normalizes it before rendering. ``skipped``
    maps detector name → skip reason; detectors sharing a reason are grouped
    onto one line in first-seen order.
    """
    print("LogHunter · Threat Hunt [dry run]")
    print(_SEP_DOUBLE)

    # Zeek inputs show file counts and sizes; the other families only show
    # found / not found.
    family_rows = (
        ("zeek_dir", zeek_dir, _zeek_entry_display),
        ("syslog_dir", syslog_dir, _status_entry_display),
        ("pihole_dir", pihole_dir, _status_entry_display),
        ("cloudtrail_dir", cloudtrail_dir, _status_entry_display),
    )
    for family_label, raw_value, render in family_rows:
        _print_family_block(family_label, _as_path_list(raw_value), render)

    if load_all:
        window_text = "all available data (--all)"
    elif since or until:
        lower = since.strftime("%Y-%m-%d %H:%M UTC") if since else "beginning of data"
        upper = until.strftime("%Y-%m-%d %H:%M UTC") if until else "end of data"
        window_text = f"{lower} → {upper}"
    else:
        window_text = "all available data"
    print(f"Window: {window_text}")

    detector_text = (
        " ".join(will_run) if will_run else "(none — required logs unavailable)"
    )
    print(f"Detectors: {detector_text}")

    # Collapse detectors that share a skip reason into one line per reason.
    names_by_reason: dict[str, list[str]] = {}
    for detector, why in skipped.items():
        if why in names_by_reason:
            names_by_reason[why].append(detector)
        else:
            names_by_reason[why] = [detector]
    for why, detectors in names_by_reason.items():
        print(f"Skipped: {', '.join(detectors)} — {why}")

    print(_SEP_DOUBLE)
    print("Dry run complete. Remove --dry-run to analyze.")
923
+
924
+
925
def _derive_data_sources(
    needed_logs: dict[str, str],
    record_counts: dict[str, int],
) -> list[str]:
    """List the data-source tokens for patterns that loaded at least one record.

    ``needed_logs`` maps pattern → source family; ``record_counts`` maps
    pattern → loaded record count. Patterns with a non-positive count, or with
    no entry in ``needed_logs``, contribute nothing. Zeek patterns become
    ``zeek_<log_type>`` (skipped when ``_log_type`` returns None); the other
    families map to one fixed token each.

    Returns the distinct tokens, sorted.
    """
    from loghunter.common.loader import _log_type

    fixed_tokens = {
        "syslog_dir": "syslog_raw",
        "pihole_dir": "dnsmasq_dns",
        "cloudtrail_dir": "cloudtrail_raw",
    }

    tokens: set[str] = set()
    for pattern, count in record_counts.items():
        if count <= 0:
            continue
        family = needed_logs.get(pattern)
        if family == "zeek_dir":
            log_type = _log_type(pattern)
            if log_type is not None:
                tokens.add(f"zeek_{log_type}")
        elif family in fixed_tokens:
            tokens.add(fixed_tokens[family])
    return sorted(tokens)
950
+
951
+
952
def _pattern_human_label(source_key: str, pattern: str) -> str:
    """Return the operator-facing name for one (source_key, pattern) pair.

    Used by the note builders in this module. This is a different vocabulary
    from the internal tokens produced by ``_derive_data_sources``.

    Labels: ``Pi-hole`` / ``syslog`` / ``CloudTrail`` / ``Zeek <log_type>``
    (plain ``Zeek`` when ``_log_type`` cannot classify the pattern). An
    unrecognized ``source_key`` is returned unchanged.
    """
    from loghunter.common.loader import _log_type

    if source_key == "zeek_dir":
        log_type = _log_type(pattern)
        return "Zeek" if log_type is None else f"Zeek {log_type}"

    fixed_labels = {
        "pihole_dir": "Pi-hole",
        "syslog_dir": "syslog",
        "cloudtrail_dir": "CloudTrail",
    }
    return fixed_labels.get(source_key, source_key)
975
+
976
+
977
+ def _zero_window_coverage_notes(
978
+ load_result: "loader.LoadResult",
979
+ plan: RunPlan,
980
+ ) -> list[str]:
981
+ """Return disclosure notes for planned sources that contributed 0 in-window rows.
982
+
983
+ Honesty rail — coverage counts VALID-ts rows only:
984
+ - ``full_rows > 0`` → SPAN note (count + span + widen suggestion), or
985
+ count-only when no valid span survived (degenerate; defensive).
986
+ - ``full_rows is None`` and ``source_key == "zeek_dir"`` → BARE note
987
+ ("files found, 0 records …"). The BARE arm is **zeek_dir-only**: for
988
+ syslog/pihole/cloudtrail, "no files read" means a wrong-family skip
989
+ or an empty directory — neither is a window gap the operator can fix
990
+ with ``--since/--days``, and the existing per-source warnings already
991
+ cover it.
992
+ - ``full_rows == 0`` → NO note (parse gap; widen advice would mislead).
993
+ - Pattern not loaded at all (source unconfigured) → NO note (the
994
+ loader already warns ``"{source} not configured — {pattern} not loaded"``).
995
+ """
996
+ out: list[str] = []
997
+ for pattern, source_key in plan.needed_logs.items():
998
+ if load_result.record_counts.get(pattern, 0) != 0:
999
+ continue
1000
+ if pattern not in load_result.logs:
1001
+ # Source unconfigured for this pattern — loader already warned.
1002
+ continue
1003
+ cov = load_result.coverage.get(pattern)
1004
+ # Parse gap → silent (would otherwise tell the operator to widen
1005
+ # the window on a file with no valid timestamps — misleading).
1006
+ if cov is not None and cov.full_rows == 0:
1007
+ continue
1008
+ label = _pattern_human_label(source_key, pattern)
1009
+ if cov is None or cov.full_rows is None:
1010
+ if source_key != "zeek_dir":
1011
+ continue
1012
+ out.append(
1013
+ f"{label}: files found, 0 records in the selected window. "
1014
+ "Widen with --since/--days, or --all."
1015
+ )
1016
+ continue
1017
+ if cov.full_span is not None:
1018
+ start, end = cov.full_span
1019
+ out.append(
1020
+ f"{label}: {cov.full_rows:,} rows loaded, 0 in the selected "
1021
+ f"window — data spans {start.isoformat()} → {end.isoformat()}. "
1022
+ "Widen with --since/--days, or --all."
1023
+ )
1024
+ else:
1025
+ out.append(
1026
+ f"{label}: {cov.full_rows:,} rows loaded, 0 in the selected "
1027
+ "window. Widen with --since/--days, or --all."
1028
+ )
1029
+ return out
1030
+
1031
+
1032
+ def _rotation_skip_notes(
1033
+ load_result: "loader.LoadResult",
1034
+ plan: RunPlan,
1035
+ ) -> list[str]:
1036
+ """Return disclosure notes for flat patterns windowed by rotation-peek.
1037
+
1038
+ The loader records a ``RotationSkipInfo`` per windowed pattern; the runner
1039
+ formats the prose (the loader never imports the runner). Reuses
1040
+ ``_pattern_human_label`` for the operator-language source name.
1041
+
1042
+ - ``fallback`` → reason-aware "read the full archive (windowing skipped)."
1043
+ wording: "rotation order not monotonic" for the first-ts disorder fallback
1044
+ (reason ``None`` or that string — byte-identical to before), "overlapping
1045
+ export windows" for the Family-2 export-window conflict, and "duplicate
1046
+ rotation files" for a same-rank duplicate slot. Fallback WINS: it is
1047
+ data-true at the pattern level (``skipped == 0``), so the skip-summary cannot
1048
+ also fire.
1049
+ - else ``skipped > 0`` → "loaded L of L+S rotation files; S skipped outside
1050
+ the selected window (by rotation order)." NEUTRAL "outside" is truthful
1051
+ for both the ``--since`` older-tail skip and the ``--until`` too-new
1052
+ leading skip (a bounded run can skip both under one count).
1053
+ - else → no note.
1054
+ """
1055
+ out: list[str] = []
1056
+ for pattern, info in load_result.rotation_skips.items():
1057
+ label = _pattern_human_label(plan.needed_logs[pattern], pattern)
1058
+ if info.fallback:
1059
+ if info.fallback_reason == "overlapping export windows":
1060
+ out.append(
1061
+ f"{label}: overlapping export windows — read the full archive "
1062
+ "(windowing skipped)."
1063
+ )
1064
+ elif info.fallback_reason == "duplicate rotation files":
1065
+ out.append(
1066
+ f"{label}: duplicate rotation files — read the full archive "
1067
+ "(windowing skipped)."
1068
+ )
1069
+ else: # "rotation order not monotonic" or None → existing wording
1070
+ out.append(
1071
+ f"{label}: rotation order not monotonic — read the full archive "
1072
+ "(windowing skipped)."
1073
+ )
1074
+ elif info.skipped > 0:
1075
+ out.append(
1076
+ f"{label}: loaded {info.loaded} of {info.loaded + info.skipped} "
1077
+ f"rotation files; {info.skipped} skipped outside the selected "
1078
+ "window (by rotation order)."
1079
+ )
1080
+ return out
1081
+
1082
+
1083
+ def _dns_nudge(data_sources: list[str]) -> str | None:
1084
+ """Return the Zeek evangelization note when only low-fidelity DNS data was loaded."""
1085
+ ds = set(data_sources)
1086
+ if "dnsmasq_dns" in ds and ds.isdisjoint(_RICH_DNS_SOURCES):
1087
+ return (
1088
+ "running on Pi-hole/dnsmasq logs — RTT, TTL, and connection correlation "
1089
+ "unavailable. Add Zeek for richer DNS analysis and conn.log correlation."
1090
+ )
1091
+ return None
1092
+
1093
+
1094
+ def _aws_below_floor_note(
1095
+ plan: RunPlan,
1096
+ logs: dict[str, pd.DataFrame],
1097
+ config: dict[str, Any],
1098
+ ) -> str | None:
1099
+ """RunSummary note disclosing principals below the aws min_events floor.
1100
+
1101
+ Pure derivation from the loaded CloudTrail frame via the detector's
1102
+ public ``below_floor_count`` helper. Called BEFORE the detector loop —
1103
+ detector-side state (a module cache populated inside run()) would be
1104
+ stale at this point. Returns None when aws is not in the plan, when the
1105
+ helper is missing (defensive), when no frame is loaded, or when count == 0.
1106
+ """
1107
+ if "aws" not in plan.will_run:
1108
+ return None
1109
+ mod = plan.detectors.get("aws")
1110
+ if mod is None or not hasattr(mod, "below_floor_count"):
1111
+ return None
1112
+ df = logs.get("*.json*")
1113
+ if df is None or df.empty:
1114
+ return None
1115
+ aws_cfg = get_detector_config(config, "aws", getattr(mod, "DEFAULT_CONFIG", {}))
1116
+ default_min = getattr(mod, "DEFAULT_CONFIG", {}).get("min_events", 50)
1117
+ min_events = aws_cfg.get("min_events", default_min)
1118
+ count = mod.below_floor_count(df, min_events)
1119
+ if count <= 0:
1120
+ return None
1121
+ return (
1122
+ f"aws: {count} interactive principal(s) below the min_events floor were "
1123
+ "not scored — the quiet tail of low-volume actors was not examined."
1124
+ )
1125
+
1126
+
1127
+ def _aws_window_note(
1128
+ plan: RunPlan, *, cloudtrail_narrowed: bool = False
1129
+ ) -> str | None:
1130
+ """First-seen labels are relative to the loaded window — name the limitation.
1131
+
1132
+ Fires whenever aws ran, regardless of whether any bursts were emitted. The
1133
+ methodology limitation is worth knowing even if this run produced no
1134
+ burst findings, because the absence is itself window-dependent.
1135
+
1136
+ The ``--all`` rider is keyed to CLOUDTRAIL ACTUALLY being narrowed (an
1137
+ explicit --since/--until), NOT run-level default-window activity: CloudTrail
1138
+ opts out of the auto-default window, so on a mixed unqualified run
1139
+ (dns/syslog windowed) it loaded FULL and widening would not help. Rides the
1140
+ EXISTING note (no new note, no position change) — placeholder voice, flag
1141
+ for the qmail error-voice pass.
1142
+ """
1143
+ if "aws" not in plan.will_run:
1144
+ return None
1145
+ note = (
1146
+ "aws: first-seen actions are first-seen within this loaded window — an "
1147
+ "action that is routinely used but absent earlier in the window reads "
1148
+ "as first-seen."
1149
+ )
1150
+ if cloudtrail_narrowed:
1151
+ note += " Run with --all for a full-baseline analysis."
1152
+ return note
1153
+
1154
+
1155
+ def _aws_no_interactive_note(
1156
+ plan: RunPlan,
1157
+ logs: dict[str, pd.DataFrame],
1158
+ *,
1159
+ cloudtrail_narrowed: bool,
1160
+ ) -> str | None:
1161
+ """Disclose the silent aws "nothing" when events loaded but zero are
1162
+ interactive-lane (aws.run returns [] with no finding).
1163
+
1164
+ Pure derivation via the detector's public ``interactive_count`` helper
1165
+ (mirrors ``_aws_below_floor_note``). Fires when aws is planned, the
1166
+ ``*.json*`` frame is non-empty, and no event is interactive-lane. The
1167
+ ``--all`` suffix is conditional on ``cloudtrail_narrowed`` — widening only
1168
+ helps when an explicit window narrowed the load; on an unqualified run
1169
+ CloudTrail already loaded full, so widening cannot surface interactive
1170
+ events that do not exist. Placeholder voice (flag for the qmail pass).
1171
+ """
1172
+ if "aws" not in plan.will_run:
1173
+ return None
1174
+ mod = plan.detectors.get("aws")
1175
+ if mod is None or not hasattr(mod, "interactive_count"):
1176
+ return None
1177
+ df = logs.get("*.json*")
1178
+ if df is None or df.empty:
1179
+ return None
1180
+ if mod.interactive_count(df) != 0:
1181
+ return None
1182
+ note = (
1183
+ f"aws: {len(df)} CloudTrail events loaded but none are interactive-lane — "
1184
+ "aws scores only interactive activity, so nothing was analyzed."
1185
+ )
1186
+ if cloudtrail_narrowed:
1187
+ note += " Run with --all for full history."
1188
+ return note
1189
+
1190
+
1191
+ def _home_net_note(plan: RunPlan, config: dict[str, Any]) -> str | None:
1192
+ """RunSummary note disclosing the internal networks in effect for scan.
1193
+
1194
+ Fires only when scan is in plan.will_run. Distinguishes default-vs-declared
1195
+ by reading the ``__user_set__`` provenance sidecar attached by the config
1196
+ loader — a pure value comparison would misclassify a user who declares the
1197
+ RFC1918 list verbatim as "default". When the operator did not declare
1198
+ home_net (no config file, or config file omits the key), the parenthetical
1199
+ fires; when they did declare it, the note states their value plainly.
1200
+ """
1201
+ if "scan" not in plan.will_run:
1202
+ return None
1203
+ home_net = list(config.get("loghunter", {}).get("home_net", []))
1204
+ if not home_net:
1205
+ return None
1206
+ rendered = ", ".join(home_net)
1207
+ user_set = config.get("__user_set__", {}).get("loghunter", set())
1208
+ if "home_net" in user_set:
1209
+ return f"Internal networks: {rendered}."
1210
+ return (
1211
+ f"Internal networks: {rendered} "
1212
+ "(RFC1918 default — set home_net in config to override)."
1213
+ )
1214
+
1215
+
1216
+ def _source_overlap_notes(
1217
+ source_dirs: dict[str, list[Path]], plan: RunPlan,
1218
+ ) -> list[str]:
1219
+ """RunSummary notes when two IN-PLAN source families resolve to one directory.
1220
+
1221
+ The contamination vector: flat discovery globs overlap (``syslog`` discovers
1222
+ with the catch-all ``*.log*``), so a directory shared by two families has its
1223
+ files parsed by each front-end — one log can surface as another's finding.
1224
+ This is a plan-time disclosure, derived from already-resolved sources (same
1225
+ posture as ``_home_net_note``), not a load-time check.
1226
+
1227
+ Binding rails:
1228
+
1229
+ - **Eligibility = in-plan families only.** Derived from
1230
+ ``set(plan.needed_logs.values())``, NOT every non-empty resolved bucket.
1231
+ A family configured-and-resolved but not selected (no detector in the run
1232
+ reads it) cannot contaminate, so two dirs colliding while only one family
1233
+ is planned does NOT warn. Optional multi-source detectors add only their
1234
+ satisfiable patterns to ``needed_logs``, so the note follows what the
1235
+ loader will actually read.
1236
+ - **Directories only.** Explicit FILE inputs are out of scope — the vector
1237
+ is dir-glob overlap, not a shared named file. Per-family duplicate inputs
1238
+ collapse (a key is recorded once per directory).
1239
+ - **Equal-dir ONLY (v1).** Flat discovery is non-recursive, so the shipped
1240
+ default (``syslog_dir=/var/log`` containing ``zeek_dir=/var/log/zeek``)
1241
+ does NOT contaminate and MUST NOT warn — nesting is deliberately out of
1242
+ scope. (CloudTrail's ``rglob`` makes nested cloudtrail an acknowledged
1243
+ deferred edge.)
1244
+ - **Deterministic ordering.** ``source_dirs`` is built in canonical key order
1245
+ by ``run`` (zeek, syslog, pihole, cloudtrail); first-seen preservation here
1246
+ keeps the rendered family list deterministic. ≥3 families at one dir → one
1247
+ note listing all.
1248
+
1249
+ Placeholder voice (pending the qmail error-voice pass).
1250
+ """
1251
+ in_plan = set(plan.needed_logs.values())
1252
+ by_dir: dict[Path, list[str]] = {}
1253
+ for key, paths in source_dirs.items():
1254
+ if key not in in_plan:
1255
+ continue
1256
+ for p in paths:
1257
+ if not p.is_dir(): # explicit files out of scope
1258
+ continue
1259
+ try:
1260
+ resolved = p.resolve()
1261
+ except OSError:
1262
+ continue
1263
+ families = by_dir.setdefault(resolved, [])
1264
+ if key not in families: # collapse per-family duplicate inputs
1265
+ families.append(key)
1266
+
1267
+ notes: list[str] = []
1268
+ for resolved, families in by_dir.items():
1269
+ if len(families) >= 2:
1270
+ notes.append(
1271
+ f"{', '.join(families)} resolve to the same directory "
1272
+ f"({resolved}): files there matching more than one source's "
1273
+ "patterns are parsed by each, which can surface one log as "
1274
+ "another's finding. Point them at separate directories — global "
1275
+ "exports now auto-segment per source."
1276
+ )
1277
+ return notes
1278
+
1279
+
1280
+ # ─────────────────────────────────────────────────────────────────────────────
1281
+ # digest verb — orient-before-the-hunt
1282
+ #
1283
+ # run_digest() and the helpers below are a parallel entry point to run(). They
1284
+ # share the loader, output-handler-building, and _derive_data_sources. Default-
1285
+ # window resolution now goes through the SAME loader.resolve_load_windows +
1286
+ # loader.apply_default_window that run() uses — the digest twin engine is gone.
1287
+ # Digest default-windowing stays Zeek-ONLY (the caller-side gate below): non-Zeek
1288
+ # digest directories continue to load full, exactly as before. Pinned by the
1289
+ # Zeek-directory golden plus the programmatic non-Zeek load-full tests.
1290
+ # ─────────────────────────────────────────────────────────────────────────────
1291
+
1292
# Window spans up to this many hours are binned hourly by _compute_histogram;
# longer spans are binned daily.
_HISTOGRAM_HOURLY_THRESHOLD_HOURS = 48
# Upper bound on the number of bins _compute_histogram returns; when the raw
# bin count exceeds it, adjacent bins are summed together.
_HISTOGRAM_MAX_BINS = 60


# Timestamp-confidence floor for digest cards. When the parseable-ts fraction
# falls below this floor (or the non-NaN span is zero), the digest banner
# window dashes and the histogram line renders "(timeline unavailable)" —
# the card refuses to draw a timeline it cannot trust. Set at 80% per the
# confident-but-wrong defects gate: lower risks rendering a confident
# timeline on junk timestamps; higher would erase orientation when a small
# fraction of a syslog batch is corrupt.
_DIGEST_TS_CONFIDENCE_FLOOR: float = 0.80
1304
+
1305
+
1306
+ def _ts_confidence(frame: pd.DataFrame) -> bool:
1307
+ """True iff the frame's ``ts`` column can support an honest timeline.
1308
+
1309
+ Both conditions must hold:
1310
+
1311
+ 1. ``parsed / total >= _DIGEST_TS_CONFIDENCE_FLOOR`` (default 0.80) —
1312
+ the parseable-ts fraction is high enough that the histogram bins
1313
+ reflect the bulk of the records.
1314
+ 2. ``max(ts) - min(ts) > 0`` — the non-NaN timestamps span more than a
1315
+ single instant; otherwise the histogram collapses to one bin and
1316
+ lies about the timeline.
1317
+
1318
+ Both failure modes (low-coverage AND zero-span) render the same bare
1319
+ ``(timeline unavailable)`` line — there is no footer disclosure in the
1320
+ flat card grammar, so the differentiation that the old reason sentinels
1321
+ enabled has no consumer.
1322
+ """
1323
+ total = int(len(frame))
1324
+ if "ts" not in frame.columns or total == 0:
1325
+ return False
1326
+ ts = frame["ts"].dropna()
1327
+ parsed = int(len(ts))
1328
+ if parsed / total < _DIGEST_TS_CONFIDENCE_FLOOR:
1329
+ return False
1330
+ if parsed == 0:
1331
+ return False
1332
+ span = float(ts.max()) - float(ts.min())
1333
+ if span <= 0:
1334
+ return False
1335
+ return True
1336
+
1337
+
1338
+ def _compute_histogram(
1339
+ ts: pd.Series,
1340
+ data_window: tuple[datetime, datetime],
1341
+ ) -> tuple[list[int], str, int]:
1342
+ """Adaptive-binning temporal histogram over a timestamp series.
1343
+
1344
+ Returns ``(counts, unit, peak)``:
1345
+
1346
+ - ``counts`` is a list of per-bin event counts spanning data_window.
1347
+ - ``unit`` is ``"hr"`` for spans <= 48 hours, else ``"day"``.
1348
+ - ``peak`` is the maximum bin value (0 when there are no events).
1349
+
1350
+ Without unit-aware binning, a 30-day window with hourly bars produces
1351
+ 720 useless bars; a 1-hour window with daily bars produces one. Both
1352
+ fail to communicate shape — hence the adaptive switch.
1353
+
1354
+ The right edge is INCLUSIVE: the window is treated as ``[start, end]``
1355
+ so that an event at exactly ``data_window[1]`` (the max-ts event when
1356
+ ``data_window`` is derived from ``min(ts)/max(ts)``) lands in the
1357
+ final bin instead of being silently dropped when the span lands on an
1358
+ exact bin boundary. Callers must pass ``data_window`` such that
1359
+ ``data_window[1] >= max(ts)``; the lone production caller (run_digest)
1360
+ satisfies this by deriving ``data_window`` from the same loaded frame.
1361
+
1362
+ A zero-span window (``start == end``) with non-empty ``ts`` emits a
1363
+ single bin holding the full count — appropriate for single-record
1364
+ digests, or frames whose events all share one timestamp.
1365
+ """
1366
+ start, end = data_window
1367
+ span_seconds = (end - start).total_seconds()
1368
+
1369
+ cleaned = ts.dropna().astype(float)
1370
+ if cleaned.empty or span_seconds < 0:
1371
+ return [], "hr", 0
1372
+ if span_seconds == 0:
1373
+ # All events share a single timestamp — emit one bin holding the count.
1374
+ n = int(len(cleaned))
1375
+ return [n], "hr", n
1376
+
1377
+ span_hours = span_seconds / 3600.0
1378
+ if span_hours <= _HISTOGRAM_HOURLY_THRESHOLD_HOURS:
1379
+ unit = "hr"
1380
+ bin_seconds = 3600
1381
+ else:
1382
+ unit = "day"
1383
+ bin_seconds = 86400
1384
+
1385
+ bin_count = max(1, -(-int(span_seconds) // bin_seconds)) # ceiling division
1386
+ start_epoch = start.timestamp()
1387
+ offsets = ((cleaned - start_epoch) // bin_seconds).astype("int64")
1388
+ # Drop pre-window events, then collapse the inclusive right edge: events
1389
+ # at exactly data_window[1] yield offset == bin_count when the span is an
1390
+ # exact multiple of bin_seconds — fold those into the final bin instead
1391
+ # of filtering them out.
1392
+ offsets = offsets[offsets >= 0]
1393
+ offsets = offsets.where(offsets < bin_count, bin_count - 1)
1394
+ value_counts = offsets.value_counts().sort_index()
1395
+ counts = [int(value_counts.get(i, 0)) for i in range(bin_count)]
1396
+ if len(counts) > _HISTOGRAM_MAX_BINS:
1397
+ # Cap output width by folding adjacent bins by sum. The unit label
1398
+ # stays nominal — each glyph now spans several hr/day — but the peak
1399
+ # anchor recomputed below stays truthful to the drawn bars.
1400
+ group_size = -(-len(counts) // _HISTOGRAM_MAX_BINS)
1401
+ counts = [
1402
+ sum(counts[i:i + group_size])
1403
+ for i in range(0, len(counts), group_size)
1404
+ ]
1405
+ peak = max(counts) if counts else 0
1406
+ return counts, unit, peak
1407
+
1408
+
1409
+ _DNS_ZEEK_EMPTY_COLUMNS = [
1410
+ "ts", "src", "query", "rtt", "ttl", "rcode", "answer", "tc", "qtype",
1411
+ ]
1412
+ _DNS_PIHOLE_EMPTY_COLUMNS = [
1413
+ "ts", "src", "query", "event_type", "qtype", "dst", "answer",
1414
+ "validation", "host", "raw", "message",
1415
+ ]
1416
+ _CONN_EMPTY_COLUMNS = [
1417
+ "src", "dst", "port", "proto", "ts", "bytes", "conn_state", "local_orig",
1418
+ ]
1419
+ _SYSLOG_EMPTY_COLUMNS = ["ts", "host", "program", "raw", "message"]
1420
+ _CLOUDTRAIL_EMPTY_COLUMNS = [
1421
+ "ts", "principal", "lane", "read_write",
1422
+ "event_source", "event_name", "identity_type",
1423
+ "source_ip", "error_code", "aws_region", "event_id", "raw",
1424
+ ]
1425
+
1426
+
1427
+ # (schema, source_key) → (loader glob pattern, empty-frame column set).
1428
+ # Mechanical mapping kept inline alongside run_digest because it's runner
1429
+ # plumbing — pattern + columns are runner/loader concerns, NOT source-
1430
+ # resolution ownership (DigestSource just carries the directory + feed +
1431
+ # source_key). See plan: "_PATTERN_AND_EMPTY[(schema, source_key)] inline".
1432
+ _DIGEST_PATTERN_AND_EMPTY: dict[tuple[str, str], tuple[str, list[str]]] = {
1433
+ ("conn", "zeek_dir"): ("conn*.log*", _CONN_EMPTY_COLUMNS),
1434
+ ("dns", "zeek_dir"): ("dns*.log*", _DNS_ZEEK_EMPTY_COLUMNS),
1435
+ ("dns", "pihole_dir"): ("pihole*.log*", _DNS_PIHOLE_EMPTY_COLUMNS),
1436
+ ("syslog", "syslog_dir"): ("*.log*", _SYSLOG_EMPTY_COLUMNS),
1437
+ ("syslog", "zeek_dir"): ("syslog*.log*", _SYSLOG_EMPTY_COLUMNS),
1438
+ ("cloudtrail", "cloudtrail_dir"): ("*.json*", _CLOUDTRAIL_EMPTY_COLUMNS),
1439
+ }
1440
+
1441
+
1442
+ # Inter-card separator emitted between adjacent rendered cards on a multi-card
1443
+ # run (stdout fan-out or --out concatenation). 40 columns of U+2500 BOX
1444
+ # DRAWINGS LIGHT HORIZONTAL, flush-left, with one blank line above and one
1445
+ # blank line below. Single-card runs (one positional, or a multi-positional
1446
+ # run where only one path reaches render-commit) draw no rule at all — the
1447
+ # emit fires only when ``leading_separator=True``, which the CLI sets from
1448
+ # ``rendered > 0`` AFTER a prior card's run_digest return.
1449
+ _DIGEST_INTER_CARD_RULE: str = "─" * 40
1450
+
1451
+
1452
+ def _emit_inter_card_separator(stream: Any) -> None:
1453
+ """Emit the 40-col inter-card rule with bracketing blank lines."""
1454
+ target = stream if stream is not None else sys.stdout
1455
+ print(file=target)
1456
+ print(_DIGEST_INTER_CARD_RULE, file=target)
1457
+ print(file=target)
1458
+
1459
+
1460
def _render_blob_for_path(
    blob_path: Path,
    *,
    stream: Any = None,
    output_dir: Path | None = None,
    output_file: Path | None = None,
    verbose_level: int = 0,
    leading_separator: bool = False,
) -> None:
    """Summarise one file as a blob and render the resulting card.

    Two call sites share this helper: the regular blob branch of
    ``run_digest`` (``schema == "blob"``) and the defensive fallback taken
    when a summariser raises on a recognised schema, so that the file still
    gets a blob card instead of aborting the fan-out.

    The caller is responsible for checking that ``blob_path`` is a regular
    file. ``stream`` / ``output_dir`` / ``output_file`` / ``verbose_level``
    are forwarded to ``_build_output_handler`` unchanged, so a fallback card
    is routed exactly like the card it replaces.

    ``leading_separator`` asks for the inter-card rule to be written first.
    This function is the only place the rule is emitted for blob cards, and
    it does so right before ``render_blob`` — a rule therefore never appears
    ahead of a card that failed before reaching its render call.

    Raises:
        RuntimeError: if ``_build_output_handler`` hands back something
            other than a ``TextHandler``.
    """
    from loghunter.digest import blob as blob_digest

    blob_card = blob_digest.summarize_blob(blob_path)

    text_handler, release_handler = _build_output_handler(
        "text", output_dir, output_file, verbose_level, stream=stream,
    )
    try:
        from loghunter.outputs.text import TextHandler

        if isinstance(text_handler, TextHandler):
            if leading_separator:
                _emit_inter_card_separator(stream)
            text_handler.render_blob(blob_card)
        else:
            raise RuntimeError(
                "digest blob: _build_output_handler did not return a "
                f"TextHandler (got {type(text_handler).__name__})"
            )
    finally:
        # Always release the handler, including on the type-guard failure.
        release_handler()
1508
+
1509
+
1510
def run_digest(
    config: dict[str, Any],
    zeek_dir: str | Path | None = None,
    pihole_dir: str | Path | None = None,
    syslog_dir: str | Path | None = None,
    cloudtrail_dir: str | Path | None = None,
    blob_path: Path | None = None,
    since: datetime | None = None,
    until: datetime | None = None,
    output_format: str = "text",
    output_dir: Path | None = None,
    output_file: Path | None = None,
    stream: Any = None,
    verbose_level: int = 0,
    dry_run: bool = False,
    load_all: bool = False,
    skip_confirm: bool = False,
    schema: str = "conn",
    fallback_blob_path: Path | None = None,
    leading_separator: bool = False,
    show_progress: bool = True,
) -> None:
    """Digest entry point — orient-before-the-hunt for a single schema.

    Loads the source frame, computes spine ambient facts and a temporal
    histogram, dispatches to the schema summariser, assembles a DigestCard,
    and renders it. Does NOT build a RunPlan, does NOT run the allowlist
    loop, does NOT produce Findings.

    Pre-allowlist tap: the loaded frame is consumed BEFORE the allowlist
    seam. Allowlisted infrastructure (resolvers, pollers) is part of what's
    in the pile and stays on the sonar. This function MUST NOT call
    build_matcher or AllowlistMatcher.filter_df.

    Source-dir parameters (``zeek_dir`` / ``pihole_dir`` / ``syslog_dir`` /
    ``cloudtrail_dir``) are EXPLICIT OVERRIDES with ``None`` meaning
    "no override." Pass a string or ``Path``;
    ``loghunter.common.sources.resolve_digest_source`` owns the per-schema
    candidate ladder, wrong-key + XOR + not-configured errors (byte-preserved
    from the previous in-line strings), and is the SOLE site that converts
    a source-dir string to a Path. CLI callers thread raw parsed strings;
    programmatic callers can pass already-resolved ``Path``s or let ``None``
    fall back to ``config["loghunter"][candidate]`` (LH_ROOT applied).

    ``leading_separator`` drives the multi-card inter-card rule. The CLI
    fan-out sets it from ``rendered > 0`` after a previous card committed
    to render. Single-owner emission: run_digest emits for schema cards
    (immediately before handler.render_digest); on the summariser-failure
    fallback arm it threads the flag through to _render_blob_for_path
    (which owns blob emission) and does NOT emit itself.

    Args:
        config: Parsed configuration. The ``"loghunter"`` table is read here
            for ``default_window`` (default ``"1d"``) and ``warn_above``
            (default 5,000,000); the whole mapping is also handed to
            ``resolve_digest_source``.
        zeek_dir, pihole_dir, syslog_dir, cloudtrail_dir: Source overrides,
            see above.
        blob_path: File to profile; required when ``schema == "blob"`` and
            rejected for every other schema.
        since, until: Optional explicit window bounds passed to the loader.
        output_format: Must be ``"text"``.
        output_dir, output_file, stream, verbose_level: Forwarded to
            ``_build_output_handler`` (and to ``_render_blob_for_path`` on
            the blob paths). ``verbose_level >= 1`` also enables loader
            verbosity on the directory path and the fallback breadcrumb.
        dry_run: Print the resolved schema / source / window to stdout and
            return without loading or rendering.
        load_all: Forwarded to ``loader.resolve_load_windows`` (Zeek sources
            only) and reflected in the dry-run window line.
        skip_confirm: Skip the interactive confirmation that is otherwise
            requested when the record count exceeds ``warn_above``.
        schema: One of ``"conn"``, ``"dns"``, ``"syslog"``, ``"cloudtrail"``,
            ``"blob"``.
        fallback_blob_path: When set, a summariser or DigestCard-construction
            failure renders a blob card for this path instead of raising.
        leading_separator: See above.
        show_progress: Forwarded to the loader calls.

    Raises:
        ValueError: ``output_format`` is not ``"text"``; ``schema`` is not
            supported; the blob schema was requested without ``blob_path``
            or with a path that is not a file; ``blob_path`` was supplied
            for a non-blob schema.
        ExportAborted: The user did not answer ``y``/``yes`` to the
            large-load confirmation (EOF / Ctrl-C count as "no").
        DigestEmpty: The loaded frame for the schema is missing or empty.
        RuntimeError: ``_build_output_handler`` did not return a
            ``TextHandler``.
        Exception: Whatever the summariser or ``DigestCard`` raised, when
            ``fallback_blob_path`` is ``None``.
    """
    if output_format != "text":
        raise ValueError(
            f"digest currently supports only --output=text (got {output_format!r})"
        )
    if schema not in ("conn", "dns", "syslog", "cloudtrail", "blob"):
        raise ValueError(f"digest: unsupported schema {schema!r}")

    # The blob path is reached ONLY via the CLI sniff router, never via an
    # operator token (the `digest blob PATH` token is gone). The blob
    # terminal branch is small by design: profile the single file and hand
    # off to _render_blob_for_path, which builds + renders the card. No
    # loader, no allowlist, no histogram, no DigestCard — blob has no
    # parsed frame.
    if schema == "blob":
        if blob_path is None:
            raise ValueError(
                "digest blob: PATH not provided — pass a positional PATH"
            )
        if not blob_path.is_file():
            raise ValueError(f"digest blob: not a file: {blob_path}")

        if dry_run:
            # Dry-run output always goes to stdout, not to ``stream``.
            print("LogHunter · digest dry run")
            print(_SEP)
            print(" schema: blob")
            print(f" path: {blob_path}")
            print(" window: (none — blob extracts no fields)")
            print(_SEP)
            return

        _render_blob_for_path(
            blob_path,
            stream=stream,
            output_dir=output_dir,
            output_file=output_file,
            verbose_level=verbose_level,
            leading_separator=leading_separator,
        )
        return

    if blob_path is not None:
        raise ValueError(
            f"digest {schema}: blob_path is only valid for the blob schema"
        )

    cfg_lh = config.get("loghunter", {})

    # Single owner of digest source resolution. resolve_digest_source runs the
    # per-schema candidate ladder + wrong-key / XOR / not-configured guards
    # with byte-preserved error strings, and is the SOLE site that converts
    # a source-dir string to a Path on the digest path.
    ds = resolve_digest_source(
        config, schema,
        overrides={
            "zeek_dir": zeek_dir,
            "syslog_dir": syslog_dir,
            "pihole_dir": pihole_dir,
            "cloudtrail_dir": cloudtrail_dir,
        },
    )
    feed = ds.feed
    source_dir = ds.directory
    source_key = ds.source_key
    # Glob pattern for the loader plus the (currently unused) empty-frame
    # column set for this schema/source pairing.
    pattern, empty_columns = _DIGEST_PATTERN_AND_EMPTY[(schema, source_key)]

    from loghunter.common import loader

    # Default-window resolution is Zeek-ONLY on the digest path (CODE.md
    # boundedness rule): non-Zeek digest directories (pihole/syslog/cloudtrail)
    # load full and filter by an explicit window only. The caller-side gate below
    # IS the behavior-preservation point — digest invokes the SHARED resolver
    # (loader.resolve_load_windows) for the Zeek source alone, NOT a duplicate
    # engine. dated → precise (since, until); flat / mixed → post-load trim_span.
    dated_window: tuple[datetime, datetime] | None = None
    flat_span: timedelta | None = None
    keep_null = False
    default_note = None  # no banner on the flat digest card — never rendered
    if source_key == "zeek_dir":
        default_spec = cfg_lh.get("default_window", "1d")
        _digest_windows = loader.resolve_load_windows(
            {pattern: source_key}, {source_key: [source_dir]}, default_spec,
            since=since, until=until, load_all=load_all,
        )
        if _digest_windows:
            # Only one source was submitted, so only the first entry is used.
            w = _digest_windows[0]
            dated_window = w.select_window if w.trim_span is None else None
            flat_span = w.trim_span
            keep_null = w.keep_null

    if dry_run:
        # Dry-run output always goes to stdout, not to ``stream``.
        print("LogHunter · digest dry run")
        print(_SEP)
        print(f" schema: {schema}")
        if feed is not None:
            print(f" feed: {feed}")
        print(f" {source_key}:{' ' * max(0, 13 - len(source_key) - 1)} {source_dir}")
        if dated_window is not None:
            print(
                f" window: {dated_window[0].isoformat()} → "
                f"{dated_window[1].isoformat()} (dated default)"
            )
        elif flat_span is not None:
            print(f" window: last {cfg_lh.get('default_window', '1d')} of available data (flat default)")
        elif since is not None or until is not None:
            since_str = since.isoformat() if since else "beginning of data"
            until_str = until.isoformat() if until else "end of data"
            print(f" window: {since_str} → {until_str}")
        elif load_all:
            print(" window: all available data (--all)")
        else:
            print(" window: all available data")
        print(_SEP)
        return

    needed_logs = {pattern: source_key}
    # Digest compat: load_required_logs is list-only. Wrap [source_dir] for
    # the degenerate one-element case — card-per-file behavior unchanged,
    # the union plumbing runs as a single-element passthrough.
    source_dirs = {source_key: [source_dir]}
    source_windows = (
        {source_key: dated_window} if dated_window is not None else None
    )

    # Single-file Zeek bypass: the file was already content-identified by sniff;
    # discover_zeek_files' fnmatch(basename, pattern) gate is meaningless for an
    # explicitly-named single file and was dropping date-prefixed Zeek logs
    # (e.g. 2026-06-09.conn.log) into zero-row cards. Pi-hole, syslog, and
    # CloudTrail loaders already accept explicit files without a basename gate;
    # only the Zeek path needs the bypass. discover_zeek_files itself is
    # unchanged — the detect path still uses its single-file gate as a type
    # check.
    if source_key == "zeek_dir" and source_dir.is_file():
        # Prefer the resolved dated default window over the raw bounds.
        s_since, s_until = (
            dated_window if dated_window is not None else (since, until)
        )
        warnings: list[str] = []
        try:
            data_size_bytes = source_dir.stat().st_size
        except OSError:
            # Size is informational only; fall back to 0 if stat fails.
            data_size_bytes = 0
        df = loader.load_logs(
            source_dir.parent, pattern, s_since, s_until,
            _files=[source_dir], _warnings=warnings,
            show_progress=show_progress,
        )
        # Preserve schema-warning parity with load_required_logs so
        # malformed-but-parseable Zeek single files behave identically on the
        # bypass and directory paths.
        schema_warning = loader._schema_warning(pattern, df)
        if schema_warning:
            warnings.append(schema_warning)
        logs = {pattern: df}
        record_counts = {pattern: len(df)} if not df.empty else {}
        load_result = loader.LoadResult(
            logs=logs,
            record_counts=record_counts,
            data_window=loader._data_window(logs),
            warnings=warnings,
            data_size_bytes=data_size_bytes,
        )
    else:
        load_result = loader.load_required_logs(
            needed_logs,
            source_dirs,
            since,
            until,
            verbose=(verbose_level >= 1),
            source_windows=source_windows,
            show_progress=show_progress,
        )

    # Flat / mixed default window: trim after the load rather than before it.
    if flat_span is not None:
        load_result = loader.apply_default_window(
            load_result, [pattern], flat_span, keep_null=keep_null,
        )

    for warning in load_result.warnings:
        print(f"Warning: {warning}", file=sys.stderr)

    total_records = sum(load_result.record_counts.values())
    warn_above: int = cfg_lh.get("warn_above", 5_000_000)
    if total_records > warn_above and not skip_confirm:
        try:
            answer = input(
                f"{total_records:,} records found. This may take a while. "
                "Continue? [y/N] "
            )
        except (EOFError, KeyboardInterrupt):
            # No usable answer is treated as a refusal.
            answer = ""
        if answer.strip().lower() not in ("y", "yes"):
            raise ExportAborted("loghunter: aborted by user")

    # Window used for the card: the loader's observed window when it has
    # one, else the explicit bounds (open ends filled with "now"), else a
    # zero-length window at "now".
    if load_result.data_window is not None:
        data_window = load_result.data_window
    elif since or until:
        data_window = (
            since or datetime.now(timezone.utc),
            until or datetime.now(timezone.utc),
        )
    else:
        _now = datetime.now(timezone.utc)
        data_window = (_now, _now)

    # data_sources / notes were the RunSummary banner inputs; the flat
    # digest card has no banner so they are no longer consumed here.
    # default_note is the unbounded-source default-window note — same
    # provenance disclosure, no card surface to attach it to under the new
    # grammar. Reference both so unused-arg checkers stay quiet.
    _ = _derive_data_sources(needed_logs, load_result.record_counts)
    _ = default_note

    # Identity line 1 always carries the source's name — file or directory.
    # Directory-mode bare-config digest gets a sensible identity even though
    # the source is a multi-file load.
    source_name = source_dir.name

    # Pre-allowlist tap — pull the frame straight out of load_result.
    # NO build_matcher. NO AllowlistMatcher.filter_df. Digest is the orient
    # step; allowlisted infrastructure (resolvers, pollers) is part of
    # what's in here and stays on the sonar.
    #
    # Frame is the source of truth for whether a schema card can render.
    # Empty frame → DigestEmpty (control signal, NOT a ValueError). The
    # file was understood — it simply had no parseable records. The CLI
    # narrates this distinctly from a real per-path failure. Applies ONLY
    # to the recognized-schema path; blob has its own terminal branch
    # above and an empty FILE was already caught at sniff time as
    # state="empty" in the CLI fan-out.
    frame = load_result.logs.get(pattern)
    if frame is None or frame.empty:
        raise DigestEmpty(basename=source_dir.name, schema=schema)
    # empty_columns reserved for any future tolerant-load path; the
    # current contract is "recognized schema must have at least one row
    # to render a card", enforced by the raise above.
    _ = empty_columns

    # Timestamp-confidence gate (now boolean). Below the floor OR with a
    # zero non-NaN span, the timeline cannot be drawn honestly — dash the
    # identity-line window AND signal timeline_unavailable to the
    # renderer, which emits the bare "(timeline unavailable)" histogram
    # replacement. Both former failure modes (low coverage AND zero span)
    # render identically; the flat card has no footer surface to
    # differentiate them.
    if _ts_confidence(frame):
        histogram_counts, histogram_unit, histogram_peak = _compute_histogram(
            frame["ts"], data_window,
        )
        timeline_unavailable = False
    else:
        data_window = (None, None)
        histogram_counts = []
        histogram_unit = "hr"
        histogram_peak = 0
        timeline_unavailable = True

    from loghunter import digest
    from loghunter.common.finding import DigestCard

    # Narrow defence-in-depth wrap (item 2): summariser dispatch + body +
    # DigestCard construction. If the summariser raises on a pathological
    # frame (e.g. a duplicate `src` column producing pandas' "Grouper for
    # 'src' not 1-dimensional"), fall through to a blob card for the same
    # file rather than aborting the fan-out. Glenn's scope discipline:
    #   - DOES catch summariser raises and DigestCard-construction raises.
    #   - DOES NOT catch loader/parser errors (above this wrap).
    #   - DOES NOT catch DigestEmpty (raised above; control signal).
    #   - DOES NOT catch handler/render errors (below this wrap).
    #   - DOES NOT catch BaseException — KeyboardInterrupt / SystemExit
    #     propagate.
    # Bare-config callers (no single-file fallback path available) pass
    # fallback_blob_path=None and the exception re-raises to the caller's
    # existing ValueError arm.
    try:
        summarizer = digest.get_summarizer(schema)
        # dns and syslog summarisers additionally take the resolved feed.
        if schema in ("dns", "syslog"):
            body = summarizer(frame, feed)
        else:
            body = summarizer(frame)
        card = DigestCard(
            schema=schema,
            source_name=source_name,
            data_window=data_window,
            record_count=total_records,
            histogram_counts=histogram_counts,
            histogram_unit=histogram_unit,
            histogram_peak=histogram_peak,
            zone1_extras=body["zone1_extras"],
            insights=body["insights"],
            fields=body["fields"],
            data_size_bytes=load_result.data_size_bytes,
            timeline_unavailable=timeline_unavailable,
        )
    except Exception as exc:
        if fallback_blob_path is None:
            raise
        # One-line stderr breadcrumb — verbose-gated so the raw exception
        # text does not leak to default-mode users (the "actionable
        # messages, never raw exceptions" rail). Default runs see the
        # blob card as the whole story; --verbose retains the breadcrumb
        # for debugging. The error-voice pass will tune the phrasing;
        # voice is PLACEHOLDER.
        if verbose_level >= 1:
            print(
                f"digest: {fallback_blob_path.name}: summariser failed "
                f"({type(exc).__name__}: {exc}); falling back to blob",
                file=sys.stderr,
            )
        # Separator single-owner: _render_blob_for_path owns blob-card
        # emission. We thread the flag and do NOT emit here, or the run
        # would print two rules around the same fallback card.
        _render_blob_for_path(
            fallback_blob_path,
            stream=stream,
            output_dir=output_dir,
            output_file=output_file,
            verbose_level=verbose_level,
            leading_separator=leading_separator,
        )
        return

    handler, close_handler = _build_output_handler(
        "text", output_dir, output_file, verbose_level, stream=stream,
    )
    try:
        from loghunter.outputs.text import TextHandler
        if not isinstance(handler, TextHandler):
            raise RuntimeError(
                "digest: _build_output_handler did not return a TextHandler "
                f"(got {type(handler).__name__})"
            )
        # Emit the rule only once the card is about to render.
        if leading_separator:
            _emit_inter_card_separator(stream)
        handler.render_digest(card)
    finally:
        # Always release the handler, including on the type-guard failure.
        close_handler()