loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
loghunter/runner.py
ADDED
|
@@ -0,0 +1,1895 @@
|
|
|
1
|
+
"""Orchestrates detector execution: discovery, log loading, context assembly, and output.
|
|
2
|
+
|
|
3
|
+
Responsibilities:
|
|
4
|
+
- Auto-discover detectors by scanning loghunter/detectors/ for modules with DETECTOR_NAME
|
|
5
|
+
- Resolve the detect= selection (all, explicit list, exclusion syntax)
|
|
6
|
+
- Check REQUIRED_LOGS availability; skip with warning if missing
|
|
7
|
+
- Load logs and assemble DetectorContext for each detector
|
|
8
|
+
- Collect list[Finding] from each detector's run()
|
|
9
|
+
- Hand findings to Reporter
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import importlib
|
|
15
|
+
import pkgutil
|
|
16
|
+
import sys
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
from datetime import datetime, timedelta, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import Any, Sequence
|
|
21
|
+
|
|
22
|
+
import pandas as pd
|
|
23
|
+
|
|
24
|
+
import loghunter.detectors as _detectors_pkg
|
|
25
|
+
from loghunter.common.config import get_detector_config, parse_window_span
|
|
26
|
+
from loghunter.common.display import (
|
|
27
|
+
TEXT_RULE,
|
|
28
|
+
TEXT_RULE_DOUBLE,
|
|
29
|
+
TEXT_RULE_WIDTH,
|
|
30
|
+
liveness,
|
|
31
|
+
)
|
|
32
|
+
from loghunter.common.errors import DigestEmpty, ExportAborted
|
|
33
|
+
from loghunter.common.finding import DetectorContext, Finding, RunSummary
|
|
34
|
+
from loghunter.common.output import OutputHandler, Reporter
|
|
35
|
+
from loghunter.common.sources import (
|
|
36
|
+
resolve_digest_source,
|
|
37
|
+
resolve_sources,
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
_WIDTH = TEXT_RULE_WIDTH
|
|
41
|
+
_SEP = TEXT_RULE
|
|
42
|
+
_SEP_DOUBLE = TEXT_RULE_DOUBLE
|
|
43
|
+
|
|
44
|
+
# Full-fidelity DNS source labels. When any of these are in data_sources the
|
|
45
|
+
# Zeek evangelization nudge is suppressed. BIND9 and others join this set when
|
|
46
|
+
# their parsers land.
|
|
47
|
+
_RICH_DNS_SOURCES = {"zeek_dns"}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True)
class RunPlan:
    """Detector execution plan produced before loading any log data.

    ``frozen=True`` only blocks rebinding of the attributes; the dict/list
    values themselves remain ordinary mutable containers.
    """

    # Detector name -> detector module for every detector known to the plan.
    detectors: dict[str, Any]
    # Detector names picked by the detect= selection, before availability checks.
    selected: list[str]
    # Subset of ``selected`` whose required logs check produced no skip reason.
    will_run: list[str]
    # Detector name -> reason it was skipped.
    skipped: dict[str, str]
    # Log pattern -> source key (e.g. "zeek_dir") for every pattern to load.
    needed_logs: dict[str, str]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def run(
    config: dict[str, Any],
    detect: str | None = None,
    zeek_dir: str | Path | Sequence[str | Path] | None = None,
    syslog_dir: str | Path | Sequence[str | Path] | None = None,
    pihole_dir: str | Path | Sequence[str | Path] | None = None,
    cloudtrail_dir: str | Path | Sequence[str | Path] | None = None,
    since: datetime | None = None,
    until: datetime | None = None,
    output_format: str = "text",
    output_dir: Path | None = None,
    verbose_level: int = 0,
    dry_run: bool = False,
    export_allowlist: bool = False,
    load_all: bool = False,
    skip_confirm: bool = False,
    output_file: Path | None = None,
    scope: frozenset[str] | None = None,
) -> None:
    """Main entry point for a detection run. Called by CLI dispatch functions.

    Source-dir parameters (``zeek_dir`` / ``syslog_dir`` / ``pihole_dir`` /
    ``cloudtrail_dir``) are EXPLICIT OVERRIDES accepting either a scalar
    (``str`` / ``Path``) or a sequence of scalars (multi-positional analyze).
    ``None`` means "no override." Scalar callers are degenerate one-element
    lists downstream — byte-identical with the prior single-Path contract.
    Resolution happens inside ``loghunter.common.sources.resolve_sources``
    via the single ``_resolve_one`` site (per-element). CLI callers thread
    raw parsed strings or per-family lists; programmatic callers can pass
    already-resolved ``Path``s, lists thereof, or let ``None`` fall back to
    ``config["loghunter"][key]`` (LH_ROOT applied).

    ``scope`` is the SOLE scoping signal. ``None`` = unconstrained (every
    configured source-dir is eligible). A ``frozenset`` of source-dir keys
    restricts config-fallback to those keys — sibling source-dirs stay
    ``None`` and are NOT loaded. An override outside ``scope`` still wins
    (operator widening). The CLI sets ``scope`` from a positional PATH's
    routed source; the previous ``None``-as-scoped-out wire shape is
    retired.

    ``skip_confirm`` bypasses the advisory large-dataset prompt (controlled by
    ``[loghunter].warn_above``). Threaded from the CLI's ``--yes`` / ``-y`` flag.
    Has no effect on safety-critical actions — there are none today; advisory
    prompts only.

    ``output_file`` is the be_like_water FILE verdict — an exact file path for
    the report. When set it takes precedence over ``output_dir``; when both
    are None, the runner streams to stdout (text/json/csv) or writes
    ``loghunter-report.html`` in CWD (html). This preserves the bare-case
    behavior exactly.

    Once the output handler has been built, its close callback runs on EVERY
    exit path (normal completion, a reporter/allowlist failure, or an
    interrupt during a detector), so a file-backed stream is never left open.

    Raises:
        ValueError: ``export_allowlist`` was requested (not implemented yet).
        ExportAborted: the operator declined the large-dataset prompt.
    """
    cfg_lh = config.get("loghunter", {})

    # Single owner of source resolution: resolve_sources runs the four-key
    # truth table (override / scope / config fallback) and is the SOLE site
    # that converts a source-dir string to a Path. Runs BEFORE dry_run so
    # _print_dry_run sees resolved dirs (provenance rail).
    resolved = resolve_sources(
        config,
        overrides={
            "zeek_dir": zeek_dir,
            "syslog_dir": syslog_dir,
            "pihole_dir": pihole_dir,
            "cloudtrail_dir": cloudtrail_dir,
        },
        scope=scope,
    )
    zeek_dirs = resolved.zeek_dir
    syslog_dirs = resolved.syslog_dir
    pihole_dirs = resolved.pihole_dir
    cloudtrail_dirs = resolved.cloudtrail_dir

    plan = build_run_plan(
        detect_spec=detect if detect is not None else cfg_lh.get("detect", "all"),
        zeek_dir=zeek_dirs,
        syslog_dir=syslog_dirs,
        pihole_dir=pihole_dirs,
        cloudtrail_dir=cloudtrail_dirs,
    )

    if dry_run:
        _print_dry_run(
            zeek_dir=zeek_dirs,
            syslog_dir=syslog_dirs,
            pihole_dir=pihole_dirs,
            cloudtrail_dir=cloudtrail_dirs,
            since=since,
            until=until,
            load_all=load_all,
            will_run=plan.will_run,
            skipped=plan.skipped,
        )
        return

    if export_allowlist:
        raise ValueError(
            "--export-allowlist is not yet implemented — planned for a future release"
        )

    # Emit per-detector skip warnings to stderr
    for name, reason in plan.skipped.items():
        _warn_skipped(name, reason)

    if not plan.will_run:
        print(
            "No detectors could run — check required log source paths in config "
            "or CLI overrides.",
            file=sys.stderr,
        )
        return

    # ── Load logs ─────────────────────────────────────────────────────────────
    from loghunter.common import loader
    from loghunter.common.allowlist import build_matcher

    source_dirs: dict[str, list[Path]] = {}
    if zeek_dirs:
        source_dirs["zeek_dir"] = zeek_dirs
    if syslog_dirs:
        source_dirs["syslog_dir"] = syslog_dirs
    if pihole_dirs:
        source_dirs["pihole_dir"] = pihole_dirs
    if cloudtrail_dirs:
        source_dirs["cloudtrail_dir"] = cloudtrail_dirs

    # Resolve the UNIVERSAL default window. default_window governs every source
    # family (no longer Zeek-only): each family anchors on its OWN max-ts, with the
    # per-family load/trim strategy encoded in a single LoadWindow per family. The
    # loader owns the window policy now — resolve_load_windows is the SINGLE entry
    # point (shared with run_digest); each family's strategy declares its own
    # resolver. Engages only on an unqualified, non---all, unbounded, in-plan,
    # configured, eligible family.
    _default_spec: str = cfg_lh.get("default_window", "1d")
    load_windows = loader.resolve_load_windows(
        plan.needed_logs, source_dirs, _default_spec,
        load_all=load_all, since=since, until=until,
    )
    default_window_active = bool(load_windows)
    # source_windows: precise dated-Zeek window or conservative flat (floor, None);
    # families with select_window=None load full and are trimmed post-load.
    source_windows = (
        {w.source: w.select_window for w in load_windows if w.select_window is not None}
        or None
    )

    if default_window_active:
        # PLACEHOLDER VOICE (flag for the qmail error-voice pass). One pre-load
        # line above the loader's `loaded <file>` progress lines.
        print(
            f"Default window: last {_default_spec} of available data. "
            "Use --all for the full archive, or --since/--days to widen.",
            file=sys.stderr,
        )

    load_result = loader.load_required_logs(
        plan.needed_logs,
        source_dirs,
        since,
        until,
        verbose=(verbose_level >= 1),
        source_windows=source_windows,
    )

    # Post-load precise trim for every family whose default window engaged with a
    # load-full / conservative select-window (flat peek-prune, cloudtrail
    # load-full, flat/mixed Zeek). Dated-Zeek families carry trim_span=None — their
    # select_window already cut exactly at load. keep_null is wired from the
    # source's loader policy, so keep-policy families (syslog/pihole) retain
    # unparseable-ts rows through the implicit window exactly as through an
    # explicit one. Mixed file+dir trims the named file's rows WITH the bucket.
    for w in load_windows:
        if w.trim_span is None:
            continue
        family_patterns = [
            p for p, src in plan.needed_logs.items() if src == w.source
        ]
        load_result = loader.apply_default_window(
            load_result, family_patterns, w.trim_span, keep_null=w.keep_null,
        )
    logs = load_result.logs

    for warning in load_result.warnings:
        print(f"Warning: {warning}", file=sys.stderr)

    # ONE captured `now` for both the data-window fallback and requested_span,
    # so they cannot drift across separate clock reads.
    now = datetime.now(timezone.utc)
    if load_result.data_window is not None:
        data_window = load_result.data_window
    elif since or until:
        data_window = (since or now, until or now)
    else:
        data_window = (now, now)

    # The window the operator asked for, used by the data-found underfill
    # parenthetical. Default-window active → the configured spec; explicit
    # since&until → their span; since only → since→now; until-only / --all /
    # bounded full-load → None (unconstrained).
    requested_span: timedelta | None
    if default_window_active:
        requested_span = parse_window_span(_default_spec)
    elif since is not None and until is not None:
        requested_span = until - since
    elif since is not None:
        requested_span = now - since
    else:
        requested_span = None
    # No real data window (load yielded nothing the renderer can place — e.g. all
    # rows unparseable-ts under keep policy) → run() fabricated a (now, now) window.
    # Force requested_span None so the underfill parenthetical cannot render a
    # confident comparison over data that does not exist. The legitimate single-event
    # case keeps a real (ts, ts) data_window and is unaffected.
    if load_result.data_window is None:
        requested_span = None

    # Large-dataset warning. Suppressed when skip_confirm is set (--yes / -y).
    total_records = sum(load_result.record_counts.values())
    warn_above: int = cfg_lh.get("warn_above", 5_000_000)
    if total_records > warn_above and not skip_confirm:
        try:
            answer = input(
                f"{total_records:,} records found. This may take a while. Continue? [y/N] "
            )
        except (EOFError, KeyboardInterrupt):
            # No usable answer (closed stdin or Ctrl-C) is treated as "no".
            answer = ""
        if answer.strip().lower() not in ("y", "yes"):
            raise ExportAborted("loghunter: aborted by user")

    # Build run summary and begin output before the detector loop so the banner
    # ("Data found:", "Records:", "Detectors:") appears before analysis starts.
    data_sources = _derive_data_sources(plan.needed_logs, load_result.record_counts)
    # The default window is now announced pre-load on stderr (and the data-found
    # parenthetical carries the data-vs-requested span), so no prose default-window
    # note rides the run summary. The old "only X hours" short-window note is gone.
    notes: list[str] = []
    nudge = _dns_nudge(data_sources)
    if nudge:
        notes.append(nudge)
    aws_below_note = _aws_below_floor_note(plan, logs, config)
    if aws_below_note:
        notes.append(aws_below_note)
    # The aws --all riders key on CloudTrail ACTUALLY being narrowed (an explicit
    # window) — NOT run-level default-window activity. CloudTrail opts out of the
    # auto-default window, so a mixed unqualified run (dns/syslog windowed) loads
    # it FULL and must not be told to widen.
    cloudtrail_narrowed = since is not None or until is not None
    aws_window_note = _aws_window_note(plan, cloudtrail_narrowed=cloudtrail_narrowed)
    if aws_window_note:
        notes.append(aws_window_note)
    aws_no_interactive_note = _aws_no_interactive_note(
        plan, logs, cloudtrail_narrowed=cloudtrail_narrowed
    )
    if aws_no_interactive_note:
        notes.append(aws_no_interactive_note)
    home_net_note = _home_net_note(plan, config)
    if home_net_note:
        notes.append(home_net_note)
    # Source-dir overlap disclosure: when two IN-PLAN families resolve to the
    # same directory, flat discovery globs cross-read it (one log surfaced as
    # another's finding). Derives from already-resolved source_dirs + plan, like
    # the home_net note above. Appended before the coverage/rotation extends,
    # which are deliberately last.
    notes.extend(_source_overlap_notes(source_dirs, plan))
    # Source-coverage disclosure: for each planned source that contributed 0
    # in-window rows, append a note (SPAN / BARE / silent per the parse-gap
    # vs window-gap tri-state in CoverageTracker). Appended LAST so the
    # existing notes' relative order is preserved and the disclosure is
    # additive only. Reads the merged coverage written by the runner-side
    # flat-default block above (when fired).
    notes.extend(_zero_window_coverage_notes(load_result, plan))
    # Flat rotation-peek disclosure: one note per windowed pattern that fell back
    # to a full read or skipped out-of-window rotation files. Additive, appended last.
    notes.extend(_rotation_skip_notes(load_result, plan))
    detector_methods = {
        name: getattr(plan.detectors[name], "DETECTOR_METHOD", None)
        for name in plan.will_run
    }
    run_summary = RunSummary(
        data_window=data_window,
        record_counts=load_result.record_counts,
        data_size_bytes=load_result.data_size_bytes,
        detectors_run=plan.will_run,
        detectors_skipped=plan.skipped,
        notes=notes,
        data_sources=data_sources,
        detector_methods=detector_methods,
        requested_span=requested_span,
    )

    max_per_detector = int(cfg_lh.get("max_findings_per_detector", 100))
    handler, close_handler = _build_output_handler(
        output_format, output_dir, output_file, verbose_level,
        max_findings_per_detector=max_per_detector,
    )
    # From here on the handler may own an open file stream. Everything that
    # can fail or be interrupted (banner, allowlist build, detector loop,
    # final write) sits inside ONE try/finally so close_handler always runs.
    try:
        reporter = Reporter([handler])
        reporter.begin(run_summary)

        # ── Run detectors ─────────────────────────────────────────────────────
        allowlist = build_matcher(config)
        home_net = list(cfg_lh.get("home_net", []))
        all_findings: list[Finding] = []

        for name in plan.will_run:
            mod = plan.detectors[name]
            det_cfg = get_detector_config(config, name, getattr(mod, "DEFAULT_CONFIG", {}))

            # Per-detector prep + run, scoped to honest error labels. Prep
            # (filter_df + DetectorContext construction) is the runner's
            # responsibility; a prep failure is "prep error", NOT "detector
            # error" — separation-of-powers detail. For the non-syslog
            # branch, both prep and run live INSIDE liveness(...) so the
            # spinner appears as soon as the operator-visible work begins
            # (the "Detector liveness starts too late" bug — see docs/BUGS.md
            # — was the prep running silently before the liveness block
            # opened).
            #
            # syslog stays outside the outer spinner branch: its inner
            # drain3 tqdm bar owns its stderr line, and an outer spinner
            # would fight for the same row. Prep moves into the syslog
            # branch too for consistency but stays outside its own
            # liveness wrapper.
            if name == "syslog":
                try:
                    ctx = _prepare_detector_context(
                        mod, name, logs, allowlist, det_cfg,
                        data_window, data_sources, home_net,
                    )
                except Exception as exc:
                    print(f"{name}: prep error — {exc}", file=sys.stderr)
                    continue
                try:
                    findings = mod.run(ctx)
                except Exception as exc:
                    # One failing detector must not take down the whole run.
                    print(f"{name}: detector error — {exc}", file=sys.stderr)
                    findings = []
            else:
                with liveness(f"running {name}") as _ln:
                    try:
                        ctx = _prepare_detector_context(
                            mod, name, logs, allowlist, det_cfg,
                            data_window, data_sources, home_net,
                        )
                    except Exception as exc:
                        # Prep failed BEFORE the detector even started — no
                        # seal (the "no false seal" path from
                        # tests/test_display.py:120-130); liveness's normal
                        # teardown clears the spinner line.
                        print(f"{name}: prep error — {exc}", file=sys.stderr)
                        continue
                    try:
                        findings = mod.run(ctx)
                        # The seal is a terse live completion record — "this
                        # detector finished" — NOT a tally. The report header
                        # (W2) is the single authoritative count surface
                        # (carries the H/M/L/I breakdown, survives redirect).
                        # Empty case stays informative: the detector ran and
                        # found nothing. ASCII-only per display.py's spinner
                        # discipline. Wording is a PLACEHOLDER pending the
                        # error-voice pass.
                        _ln.seal(
                            f"{name}: done"
                            if findings
                            else f"{name}: nothing"
                        )
                    except Exception as exc:
                        print(f"{name}: detector error — {exc}", file=sys.stderr)
                        findings = []

            all_findings.extend(findings)

        reporter.write(all_findings)
        reporter.end()
    finally:
        close_handler()
|
|
438
|
+
|
|
439
|
+
|
|
440
|
+
def _prepare_detector_context(
    mod: Any,
    name: str,
    logs: dict[str, Any],
    allowlist: Any,
    det_cfg: dict[str, Any],
    data_window: tuple[datetime, datetime],
    data_sources: list[str],
    home_net: list[str],
) -> DetectorContext:
    """Assemble the allowlist-filtered log view and DetectorContext for one detector.

    Every entry of ``logs`` is carried into the context under its pattern.
    A frame is passed through ``allowlist.filter_df(frame, name)`` only when
    its pattern is declared by the detector (``REQUIRED_LOGS`` or
    ``OPTIONAL_LOGS``) and the frame is non-empty; every other frame is
    handed over as the same object. The incoming ``logs`` dict itself is
    never written to — results land in a fresh dict.

    This stays in the runner rather than in detector code because
    ``allowlist.filter_df`` is suppression, and suppression happens before
    analysis (filter-before-analyze rail, CODE.md "Allowlist Architecture").

    No verbosity is threaded through (W6): the context carries none, so a
    detector's result set cannot depend on it.
    """
    declared_specs = [
        *getattr(mod, "REQUIRED_LOGS", []),
        *getattr(mod, "OPTIONAL_LOGS", []),
    ]
    declared_patterns = {spec["pattern"] for spec in declared_specs}

    def _detector_view(pattern: str, frame: Any) -> Any:
        # Undeclared or empty frames skip suppression and pass through as-is.
        if pattern not in declared_patterns or frame.empty:
            return frame
        return allowlist.filter_df(frame, name)

    per_detector_logs: dict[str, Any] = {
        pattern: _detector_view(pattern, frame) for pattern, frame in logs.items()
    }
    return DetectorContext(
        logs=per_detector_logs,
        config=det_cfg,
        allowlist=allowlist,
        data_window=data_window,
        data_sources=data_sources,
        home_net=home_net,
    )
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def discover_detectors() -> dict[str, Any]:
    """Import every module under loghunter/detectors/ and index the usable ones.

    A module counts as a detector when it defines ``DETECTOR_NAME`` and its
    ``STATUS`` attribute (treated as ``"available"`` when absent) equals
    ``"available"``. The returned dict maps ``DETECTOR_NAME`` to the module.
    """
    registry: dict[str, Any] = {}
    for module_info in pkgutil.iter_modules(_detectors_pkg.__path__):
        try:
            candidate = importlib.import_module(
                f"loghunter.detectors.{module_info.name}"
            )
        except ImportError:
            # Modules that cannot be imported are left out of the registry.
            continue
        if not hasattr(candidate, "DETECTOR_NAME"):
            continue
        if getattr(candidate, "STATUS", "available") != "available":
            continue
        registry[candidate.DETECTOR_NAME] = candidate
    return registry
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
def _as_path_list(value: Path | list[Path] | None) -> list[Path]:
|
|
501
|
+
"""Normalize a build_run_plan / _print_dry_run source-dir param.
|
|
502
|
+
|
|
503
|
+
Accepts None (absent), a scalar Path (degenerate one-element list), or a
|
|
504
|
+
list of Paths (the canonical multi-input shape). Returns a list — empty
|
|
505
|
+
means absent. Lets callers and tests pass either form without juggling
|
|
506
|
+
the boundary; the internal pipeline operates on lists only. SAME
|
|
507
|
+
normalization shape as ``runner.run`` accepting ``str | Path | Sequence
|
|
508
|
+
| None`` at its outer boundary, propagated inward.
|
|
509
|
+
"""
|
|
510
|
+
if value is None:
|
|
511
|
+
return []
|
|
512
|
+
if isinstance(value, Path):
|
|
513
|
+
return [value]
|
|
514
|
+
return list(value)
|
|
515
|
+
|
|
516
|
+
|
|
517
|
+
def build_run_plan(
    detect_spec: str | None,
    zeek_dir: Path | list[Path] | None = None,
    syslog_dir: Path | list[Path] | None = None,
    pihole_dir: Path | list[Path] | None = None,
    cloudtrail_dir: Path | list[Path] | None = None,
    detectors: dict[str, Any] | None = None,
) -> RunPlan:
    """Resolve detector selection, required-log skips, and log patterns to load.

    Each source-dir parameter accepts ``None`` (absent), a scalar ``Path``
    (degenerate one-element list), or a list of ``Path``s (the canonical
    multi-input shape from the resolver). Plan-time satisfiability uses the
    SAME discovery helpers the loader uses (``discover_zeek_files``,
    ``discover_cloudtrail_files``, ``_syslog_files``); plan and loader MUST
    discover the same universe.

    ``detect_spec`` falls back to ``"all"`` when ``None`` or empty.
    ``detectors`` is an optional pre-built registry (name -> module); only
    ``None`` triggers auto-discovery, so an explicitly supplied empty
    registry is honoured as "no detectors" rather than silently replaced.

    Returns a ``RunPlan`` whose ``needed_logs`` maps each pattern to the
    source key of the FIRST requirement that declared it.
    """
    # `is None`, not truthiness: `{}` is a legitimate caller-supplied registry.
    all_detectors = discover_detectors() if detectors is None else detectors
    selected = resolve_detect(str(detect_spec or "all"), sorted(all_detectors.keys()))

    # Only families that actually resolved to at least one path get a key.
    source_map: dict[str, list[Path]] = {}
    for key, raw in (
        ("zeek_dir", zeek_dir),
        ("syslog_dir", syslog_dir),
        ("pihole_dir", pihole_dir),
        ("cloudtrail_dir", cloudtrail_dir),
    ):
        paths = _as_path_list(raw)
        if paths:
            source_map[key] = paths

    will_run: list[str] = []
    skipped: dict[str, str] = {}
    for name in selected:
        reason = _check_required_logs(all_detectors[name], source_map)
        if reason:
            skipped[name] = reason
        else:
            will_run.append(name)

    # Only include OPTIONAL_LOGS patterns that are actually satisfiable, to avoid
    # loading empty frames for optional sources that happen to be configured but have
    # no matching files (e.g. zeek_dir present but no dns*.log* when pihole satisfied).
    needed_logs: dict[str, str] = {}
    for name in will_run:
        mod = all_detectors[name]
        for req in getattr(mod, "REQUIRED_LOGS", []):
            if req["pattern"] not in needed_logs:
                needed_logs[req["pattern"]] = req["source"]
        for req in getattr(mod, "OPTIONAL_LOGS", []):
            if _is_optional_satisfiable(req, source_map) and req["pattern"] not in needed_logs:
                needed_logs[req["pattern"]] = req["source"]

    return RunPlan(
        detectors=all_detectors,
        selected=selected,
        will_run=will_run,
        skipped=skipped,
        needed_logs=needed_logs,
    )
|
|
580
|
+
|
|
581
|
+
|
|
582
|
+
def _build_output_handler(
    output_format: str,
    output_dir: Path | None,
    output_file: Path | None,
    verbose_level: int,
    stream: Any = None,
    *,
    max_findings_per_detector: int = 100,
) -> tuple[OutputHandler, Any]:
    """Create the requested output handler and a callback that closes any file stream.

    Target resolution, highest precedence first:

    1. ``stream`` — a caller-owned text stream (the digest fan-out passes one
       shared stream so several cards concatenate into one file). It is
       never opened or closed here; ``output_dir`` / ``output_file`` are
       expected to be None on this path. Intended for stream-backed formats
       (text today); the html handler takes a path, not a stream.
    2. ``output_file`` — an exact file path. Its parent directory is created
       if needed. html receives the path; every other format receives a
       freshly opened UTF-8 stream on it.
    3. ``output_dir`` — a directory; the file inside it is auto-named by
       ``_report_filename``.
    4. Neither — html writes ``loghunter-report.html`` in the current
       working directory; every other format writes to ``sys.stdout``.

    ``verbose_level`` is the single 0/1/2 verbosity dial and is forwarded to
    every handler. ``max_findings_per_detector`` is forwarded to the text
    handler only; the other handlers are constructed without it.

    Returns:
        ``(handler, close)`` where ``close`` is a zero-argument callable. It
        closes the file when this function opened one and is a no-op
        otherwise (stdout, caller-owned stream, or html path output).

    Raises:
        RuntimeError: if a path target is routed to the text handler
            (defensive; no branch below does this).
    """
    from loghunter.common.output import get_handler

    handler_cls = get_handler(output_format)

    def _noop() -> None:
        """Close callback for targets this function did not open."""

    def _build(target_stream=None, output_path=None):
        # Each format family has its own constructor signature, so the
        # keyword set is chosen per format rather than passed blindly.
        if output_format == "text":
            if output_path is not None:
                # Only html takes a path; kept as a guard for symmetry.
                raise RuntimeError("text handler does not accept output_path")
            return handler_cls(
                stream=target_stream,
                verbose_level=verbose_level,
                max_findings_per_detector=max_findings_per_detector,
            )
        if output_format == "html":
            return handler_cls(output_path=output_path, verbose_level=verbose_level)
        return handler_cls(stream=target_stream, verbose_level=verbose_level)

    def _build_on_file(path: Path):
        # Open the report file and hand it to the handler. If the handler
        # constructor fails, nobody else holds the close callback, so the
        # file must be closed here or the descriptor leaks.
        opened = path.open("w", encoding="utf-8", newline="")
        try:
            handler = _build(target_stream=opened)
        except BaseException:
            opened.close()
            raise
        return handler, opened.close

    if stream is not None:
        # Caller owns the stream: do not open, do not close.
        return _build(target_stream=stream), _noop

    # An exact file path wins over a directory.
    if output_file is not None:
        output_file.parent.mkdir(parents=True, exist_ok=True)
        if output_format == "html":
            return _build(output_path=output_file), _noop
        return _build_on_file(output_file)

    if output_format == "html":
        if output_dir is None:
            output_path = Path("loghunter-report.html")
        else:
            output_dir.mkdir(parents=True, exist_ok=True)
            output_path = output_dir / _report_filename(output_format)
        return _build(output_path=output_path), _noop

    if output_dir is None:
        return _build(target_stream=sys.stdout), _noop

    output_dir.mkdir(parents=True, exist_ok=True)
    return _build_on_file(output_dir / _report_filename(output_format))
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
def _report_filename(output_format: str) -> str:
|
|
662
|
+
"""Return a timestamped report filename used when --out (or report_dir) resolves to a directory."""
|
|
663
|
+
stamp = datetime.now(timezone.utc).strftime("%Y%m%d-%H%M%S")
|
|
664
|
+
suffix = "html" if output_format == "html" else output_format
|
|
665
|
+
return f"loghunter-{stamp}.{suffix}"
|
|
666
|
+
|
|
667
|
+
|
|
668
|
+
def resolve_detect(spec: str, available: list[str]) -> list[str]:
    """Resolve a detect= spec (all, list, exclusions) against available detector names.

    Tokens are separated by commas and/or whitespace:

    - ``all`` replaces whatever has been selected so far with every name in
      ``available`` (in the order given).
    - ``!name`` excludes ``name``; exclusions apply after all inclusions,
      wherever they appear in the spec.
    - a name present in ``available`` is selected.
    - any other token is ignored silently.

    Examples:
        "all"               → all available names
        "dns, beacon"       → ["dns", "beacon"]
        "all, !syslog"      → all except "syslog"
        "all,!syslog,!ssl"  → all except syslog and ssl

    Returns the selected names, de-duplicated, in first-seen order.
    """
    wanted: list[str] = []
    banned: set[str] = set()

    # str.split() with no argument already drops empty pieces and
    # surrounding whitespace, so commas only need to become spaces.
    for word in spec.replace(",", " ").split():
        if word[:1] == "!":
            banned.add(word[1:])
            continue
        if word == "all":
            wanted = list(available)
            continue
        if word in available:
            wanted.append(word)

    # dict.fromkeys drops repeats while keeping first-seen order.
    return [name for name in dict.fromkeys(wanted) if name not in banned]
|
|
701
|
+
|
|
702
|
+
|
|
703
|
+
def _any_input_yields_files(
    source: str, paths: list[Path], pattern: str,
) -> bool:
    """Report whether any input of one source family yields at least one file.

    Discovery is delegated to the loader's own per-family functions so that
    plan-time satisfiability uses the same discovery calls as load time:

    - ``zeek_dir``       → ``discover_zeek_files(input, pattern)``
    - ``cloudtrail_dir`` → ``discover_cloudtrail_files(input)``
    - ``syslog_dir``     → ``_discover_syslog_files(input)`` (no pattern is
      passed for this family)
    - ``pihole_dir``     → ``_syslog_files(input, pattern)``
    - any other key      → defensive fallback: an existing file counts as a
      hit, a directory is globbed with ``pattern``. This keeps an
      unrecognized family from being plan-skipped silently.

    Inputs that do not exist are ignored. Returns True as soon as one input
    yields something, False when none does.
    """
    from loghunter.common.loader import (
        _discover_syslog_files,
        _syslog_files,
        discover_cloudtrail_files,
        discover_zeek_files,
    )

    def _yields(candidate: Path) -> bool:
        # One discovery call per known family; truthiness of the returned
        # collection is the only thing that matters here.
        if source == "zeek_dir":
            return bool(discover_zeek_files(candidate, pattern))
        if source == "cloudtrail_dir":
            return bool(discover_cloudtrail_files(candidate))
        if source == "syslog_dir":
            return bool(_discover_syslog_files(candidate))
        if source == "pihole_dir":
            return bool(_syslog_files(candidate, pattern))
        # Unknown source key: plain file-or-glob check.
        return candidate.is_file() or any(candidate.glob(pattern))

    return any(_yields(entry) for entry in paths if entry.exists())
|
|
756
|
+
|
|
757
|
+
|
|
758
|
+
def _is_optional_satisfiable(
    req: dict[str, str],
    source_map: dict[str, Path | list[Path]],
) -> bool:
    """Return True if an OPTIONAL_LOGS entry has files available to load.

    ``req`` carries a ``"source"`` key (the family looked up in
    ``source_map``) and a ``"pattern"`` key. An unconfigured or empty family
    is never satisfiable; otherwise the answer comes from
    ``_any_input_yields_files``.
    """
    family = req["source"]
    candidates = _as_path_list(source_map.get(family))
    # Short-circuit keeps the discovery call off the unconfigured path.
    return bool(candidates) and _any_input_yields_files(
        family, candidates, req["pattern"],
    )
|
|
768
|
+
|
|
769
|
+
|
|
770
|
+
def _check_required_logs(
    detector_module: Any,
    source_map: dict[str, Path | list[Path]],
) -> str | None:
    """Return None if all REQUIRED_LOGS are available, or a human-readable reason if not.

    Each ``REQUIRED_LOGS`` entry is checked in order and the first
    unsatisfied one produces the reason:

    - family absent from ``source_map`` → ``"<source> not configured"``
    - single input that does not exist → ``"<source> <path> not found"``
    - single existing input with nothing discoverable → a pattern-not-found
      message (CloudTrail gets its own wording)
    - several inputs, none yielding files → a family-level message

    When the module sets ``REQUIRES_ONE_OF_OPTIONAL``, at least one
    ``OPTIONAL_LOGS`` entry must also be satisfiable; otherwise the module's
    ``REQUIRES_ONE_OF_OPTIONAL_REASON`` (or a generic fallback naming
    ``DETECTOR_NAME``) is returned.
    """

    def _missing_reason(family: str, glob_pat: str, inputs: list[Path]) -> str:
        # Explain why no input of this family yielded files.
        if len(inputs) != 1:
            # Multi-input: name the family rather than a single path.
            return f"{glob_pat} not found in any configured {family} input"
        only = inputs[0]
        if not only.exists():
            return f"{family} {only} not found"
        if family == "cloudtrail_dir":
            # Family-specific wording for the no-events case.
            return f"no CloudTrail JSON logs found in {only}"
        return f"{glob_pat} not found in {only}"

    for entry in getattr(detector_module, "REQUIRED_LOGS", []):
        family, glob_pat = entry["source"], entry["pattern"]

        inputs = _as_path_list(source_map.get(family))
        if not inputs:
            return f"{family} not configured"

        # Satisfiability is "ANY input yields files"; the helper skips
        # non-existent inputs itself.
        if _any_input_yields_files(family, inputs, glob_pat):
            continue
        return _missing_reason(family, glob_pat, inputs)

    if not getattr(detector_module, "REQUIRES_ONE_OF_OPTIONAL", False):
        return None

    optional_entries = getattr(detector_module, "OPTIONAL_LOGS", [])
    if any(_is_optional_satisfiable(opt, source_map) for opt in optional_entries):
        return None

    generic = f"{getattr(detector_module, 'DETECTOR_NAME', 'detector')} — no source available"
    return getattr(detector_module, "REQUIRES_ONE_OF_OPTIONAL_REASON", generic)
|
|
814
|
+
|
|
815
|
+
|
|
816
|
+
def _warn_skipped(detector_name: str, reason: str) -> None:
|
|
817
|
+
"""Print a skip warning to stderr in the canonical format."""
|
|
818
|
+
print(f"{reason} — skipping {detector_name} detection", file=sys.stderr)
|
|
819
|
+
|
|
820
|
+
|
|
821
|
+
def _zeek_entry_display(p: Path) -> str:
|
|
822
|
+
"""Render one zeek_dir input for the dry-run block.
|
|
823
|
+
|
|
824
|
+
Mirrors the single-input format that has shipped: a DIRECTORY shows
|
|
825
|
+
``{path} (N files, X.X MB)`` (counting only its immediate file children
|
|
826
|
+
— same iteration the prior helper did, NOT recursive); a FILE shows
|
|
827
|
+
``{path} (X.X MB)``; a non-existent path shows ``{path} — not found``.
|
|
828
|
+
Single-input dry-run is byte-identical with the prior format.
|
|
829
|
+
"""
|
|
830
|
+
if not p.exists():
|
|
831
|
+
return f"{p} — not found"
|
|
832
|
+
if p.is_dir():
|
|
833
|
+
log_files = [f for f in p.iterdir() if f.is_file()]
|
|
834
|
+
size_mb = sum(f.stat().st_size for f in log_files) / 1_048_576
|
|
835
|
+
return f"{p} ({len(log_files)} files, {size_mb:.1f} MB)"
|
|
836
|
+
try:
|
|
837
|
+
size_mb = p.stat().st_size / 1_048_576
|
|
838
|
+
return f"{p} ({size_mb:.1f} MB)"
|
|
839
|
+
except OSError:
|
|
840
|
+
return f"{p}"
|
|
841
|
+
|
|
842
|
+
|
|
843
|
+
def _status_entry_display(p: Path) -> str:
|
|
844
|
+
"""Render one syslog/pihole/cloudtrail input for the dry-run block."""
|
|
845
|
+
status = "found" if p.exists() else "not found"
|
|
846
|
+
return f"{p} ({status})"
|
|
847
|
+
|
|
848
|
+
|
|
849
|
+
def _print_family_block(label: str, paths: list[Path], formatter) -> None:
|
|
850
|
+
"""Render one source-family block in the dry-run output.
|
|
851
|
+
|
|
852
|
+
Empty list → ``{label:>15} not configured`` (byte-identical with prior
|
|
853
|
+
single-Path-None case).
|
|
854
|
+
One input → ``{label:>15} {formatter(input)}`` (byte-identical with
|
|
855
|
+
the prior single-input format — Glenn's preserve-byte-identical rail).
|
|
856
|
+
Multi-input → the first entry rides the label line, subsequent entries
|
|
857
|
+
indent under it at the same value column (17 chars: 15-char right-
|
|
858
|
+
justified label + 2-space gutter). NEVER emits a Python list repr.
|
|
859
|
+
"""
|
|
860
|
+
head = f"{label + ':':>15}"
|
|
861
|
+
indent = " " * 15 # matches the right-justified label width
|
|
862
|
+
if not paths:
|
|
863
|
+
print(f"{head} not configured")
|
|
864
|
+
return
|
|
865
|
+
entries = [formatter(p) for p in paths]
|
|
866
|
+
print(f"{head} {entries[0]}")
|
|
867
|
+
for e in entries[1:]:
|
|
868
|
+
print(f"{indent} {e}")
|
|
869
|
+
|
|
870
|
+
|
|
871
|
+
def _print_dry_run(
    zeek_dir: Path | list[Path] | None,
    syslog_dir: Path | list[Path] | None,
    pihole_dir: Path | list[Path] | None,
    cloudtrail_dir: Path | list[Path] | None,
    since: datetime | None,
    until: datetime | None,
    load_all: bool,
    will_run: list[str],
    skipped: dict[str, str],
) -> None:
    """Print the dry-run report to stdout.

    Sections, in order: banner and separator, one block per source family
    (zeek, syslog, pihole, cloudtrail), the time window, the detectors that
    would run, one ``Skipped`` line per distinct skip reason, a closing
    separator and the completion hint.

    Each ``*_dir`` argument may be a single Path, a list of Paths or None;
    it is normalized with ``_as_path_list`` before rendering. ``skipped``
    maps detector name → skip reason.
    """
    print("LogHunter · Threat Hunt [dry run]")
    print(_SEP_DOUBLE)

    # Canonical family order. Only zeek inputs get the size-aware renderer;
    # the others show found / not found.
    families = (
        ("zeek_dir", zeek_dir, _zeek_entry_display),
        ("syslog_dir", syslog_dir, _status_entry_display),
        ("pihole_dir", pihole_dir, _status_entry_display),
        ("cloudtrail_dir", cloudtrail_dir, _status_entry_display),
    )
    for family_label, raw_inputs, render in families:
        _print_family_block(family_label, _as_path_list(raw_inputs), render)

    if load_all:
        window_line = "Window: all available data (--all)"
    elif since or until:
        lower = since.strftime("%Y-%m-%d %H:%M UTC") if since else "beginning of data"
        upper = until.strftime("%Y-%m-%d %H:%M UTC") if until else "end of data"
        window_line = f"Window: {lower} → {upper}"
    else:
        window_line = "Window: all available data"
    print(window_line)

    if will_run:
        print(f"Detectors: {' '.join(will_run)}")
    else:
        print("Detectors: (none — required logs unavailable)")

    # Collapse detectors that share a skip reason onto one line; reasons
    # appear in the order they are first seen.
    names_by_reason: dict[str, list[str]] = {}
    for detector, why in skipped.items():
        if why not in names_by_reason:
            names_by_reason[why] = []
        names_by_reason[why].append(detector)

    for why, detectors in names_by_reason.items():
        print(f"Skipped: {', '.join(detectors)} — {why}")

    print(_SEP_DOUBLE)
    print("Dry run complete. Remove --dry-run to analyze.")
|
|
923
|
+
|
|
924
|
+
|
|
925
|
+
def _derive_data_sources(
    needed_logs: dict[str, str],
    record_counts: dict[str, int],
) -> list[str]:
    """Return sorted data_source labels for patterns that produced non-empty data.

    ``needed_logs`` maps pattern → source family; ``record_counts`` maps
    pattern → number of records. A pattern contributes a label only when its
    count is positive and its family is known:

    - ``zeek_dir``       → ``zeek_<log_type>`` (skipped when ``_log_type``
      returns None for the pattern)
    - ``syslog_dir``     → ``syslog_raw``
    - ``pihole_dir``     → ``dnsmasq_dns``
    - ``cloudtrail_dir`` → ``cloudtrail_raw``
    """
    from loghunter.common.loader import _log_type

    flat_labels = {
        "syslog_dir": "syslog_raw",
        "pihole_dir": "dnsmasq_dns",
        "cloudtrail_dir": "cloudtrail_raw",
    }

    found: set[str] = set()
    for pattern, count in record_counts.items():
        if count <= 0:
            continue
        family = needed_logs.get(pattern)
        if family == "zeek_dir":
            log_type = _log_type(pattern)
            if log_type is not None:
                found.add(f"zeek_{log_type}")
        elif family in flat_labels:
            # Patterns with no family (None) or an unknown one add nothing.
            found.add(flat_labels[family])
    return sorted(found)
|
|
950
|
+
|
|
951
|
+
|
|
952
|
+
def _pattern_human_label(source_key: str, pattern: str) -> str:
    """Operator-language label for one (source_key, pattern) tuple.

    Used for human-readable disclosure notes. This is distinct from
    ``_derive_data_sources``, which emits internal ``data_sources`` tokens
    (``"zeek_dns"`` / ``"dnsmasq_dns"`` / ``"syslog_raw"`` /
    ``"cloudtrail_raw"``).

    Labels: ``Pi-hole`` / ``syslog`` / ``CloudTrail`` / ``Zeek <log_type>``
    (plain ``Zeek`` when ``_log_type`` returns None for the pattern). An
    unrecognized ``source_key`` is returned unchanged.
    """
    from loghunter.common.loader import _log_type

    fixed_labels = {
        "pihole_dir": "Pi-hole",
        "syslog_dir": "syslog",
        "cloudtrail_dir": "CloudTrail",
    }
    if source_key in fixed_labels:
        return fixed_labels[source_key]
    if source_key != "zeek_dir":
        return source_key

    log_type = _log_type(pattern)
    if log_type is None:
        return "Zeek"
    return f"Zeek {log_type}"
|
|
975
|
+
|
|
976
|
+
|
|
977
|
+
def _zero_window_coverage_notes(
    load_result: "loader.LoadResult",
    plan: RunPlan,
) -> list[str]:
    """Return disclosure notes for planned sources that contributed 0 in-window rows.

    For every pattern in ``plan.needed_logs`` whose record count is zero and
    which is present in ``load_result.logs``, the pattern's coverage entry
    decides the outcome:

    - ``full_rows == 0`` → no note (nothing parsed; advice to widen the
      window would mislead).
    - no coverage entry, or ``full_rows is None`` → a bare "files found,
      0 records" note, emitted for ``zeek_dir`` only; other families stay
      silent.
    - otherwise → a note with the loaded row count, plus the data span when
      ``full_span`` is available.

    Patterns missing from ``load_result.logs`` are skipped (the source was
    not loaded for that pattern).
    """
    widen_hint = "Widen with --since/--days, or --all."
    notes: list[str] = []

    for pattern, source_key in plan.needed_logs.items():
        if load_result.record_counts.get(pattern, 0) != 0:
            continue
        if pattern not in load_result.logs:
            # Not loaded for this pattern: nothing to disclose here.
            continue

        cov = load_result.coverage.get(pattern)
        rows = None if cov is None else cov.full_rows
        if rows == 0:
            # Parse gap: stay silent.
            continue

        label = _pattern_human_label(source_key, pattern)

        if rows is None:
            # Bare note is zeek-only.
            if source_key == "zeek_dir":
                notes.append(
                    f"{label}: files found, 0 records in the selected window. {widen_hint}"
                )
            continue

        span = cov.full_span
        if span is None:
            notes.append(
                f"{label}: {rows:,} rows loaded, 0 in the selected window. {widen_hint}"
            )
            continue

        first_ts, last_ts = span
        notes.append(
            f"{label}: {rows:,} rows loaded, 0 in the selected "
            f"window — data spans {first_ts.isoformat()} → {last_ts.isoformat()}. "
            f"{widen_hint}"
        )
    return notes
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
def _rotation_skip_notes(
    load_result: "loader.LoadResult",
    plan: RunPlan,
) -> list[str]:
    """Return disclosure notes for patterns the loader windowed by rotation order.

    ``load_result.rotation_skips`` maps pattern → an info record with
    ``fallback``, ``fallback_reason``, ``loaded`` and ``skipped``. For each:

    - ``fallback`` set → "<reason> — read the full archive (windowing
      skipped)." The reason text is ``"overlapping export windows"`` or
      ``"duplicate rotation files"`` when ``fallback_reason`` is exactly
      that string, and ``"rotation order not monotonic"`` for anything else
      (including None). Fallback takes priority over the skip summary.
    - else ``skipped > 0`` → "loaded L of L+S rotation files; S skipped
      outside the selected window (by rotation order)."
    - else → no note.

    The source label comes from ``_pattern_human_label``.
    """
    named_reasons = ("overlapping export windows", "duplicate rotation files")
    notes: list[str] = []

    for pattern, info in load_result.rotation_skips.items():
        label = _pattern_human_label(plan.needed_logs[pattern], pattern)

        if info.fallback:
            reason = info.fallback_reason
            if reason not in named_reasons:
                reason = "rotation order not monotonic"
            notes.append(
                f"{label}: {reason} — read the full archive (windowing skipped)."
            )
            continue

        if info.skipped > 0:
            total = info.loaded + info.skipped
            notes.append(
                f"{label}: loaded {info.loaded} of {total} "
                f"rotation files; {info.skipped} skipped outside the selected "
                "window (by rotation order)."
            )
    return notes
|
|
1081
|
+
|
|
1082
|
+
|
|
1083
|
+
def _dns_nudge(data_sources: list[str]) -> str | None:
    """Return the add-Zeek note when only Pi-hole/dnsmasq DNS data was loaded.

    The note is produced when ``"dnsmasq_dns"`` is among ``data_sources``
    and none of the sources in ``_RICH_DNS_SOURCES`` is; otherwise None.
    """
    loaded = set(data_sources)
    if "dnsmasq_dns" not in loaded or not loaded.isdisjoint(_RICH_DNS_SOURCES):
        return None
    return (
        "running on Pi-hole/dnsmasq logs — RTT, TTL, and connection correlation "
        "unavailable. Add Zeek for richer DNS analysis and conn.log correlation."
    )
|
|
1092
|
+
|
|
1093
|
+
|
|
1094
|
+
def _aws_below_floor_note(
    plan: RunPlan,
    logs: dict[str, pd.DataFrame],
    config: dict[str, Any],
) -> str | None:
    """RunSummary note disclosing principals below the aws min_events floor.

    The count comes from the aws detector module's ``below_floor_count(df,
    min_events)`` helper applied to the ``"*.json*"`` frame in ``logs``.
    ``min_events`` is read from the resolved aws detector config, falling
    back to the module's ``DEFAULT_CONFIG`` value and then to 50.

    Returns None when aws is not in ``plan.will_run``, when the module or
    its helper is missing, when the frame is absent or empty, or when the
    count is not positive.
    """
    if "aws" not in plan.will_run:
        return None

    aws_mod = plan.detectors.get("aws")
    if aws_mod is None or not hasattr(aws_mod, "below_floor_count"):
        return None

    frame = logs.get("*.json*")
    if frame is None or frame.empty:
        return None

    resolved = get_detector_config(config, "aws", getattr(aws_mod, "DEFAULT_CONFIG", {}))
    fallback_floor = getattr(aws_mod, "DEFAULT_CONFIG", {}).get("min_events", 50)
    floor = resolved.get("min_events", fallback_floor)

    below = aws_mod.below_floor_count(frame, floor)
    if below <= 0:
        return None
    return (
        f"aws: {below} interactive principal(s) below the min_events floor were "
        "not scored — the quiet tail of low-volume actors was not examined."
    )
|
|
1125
|
+
|
|
1126
|
+
|
|
1127
|
+
def _aws_window_note(
|
|
1128
|
+
plan: RunPlan, *, cloudtrail_narrowed: bool = False
|
|
1129
|
+
) -> str | None:
|
|
1130
|
+
"""First-seen labels are relative to the loaded window — name the limitation.
|
|
1131
|
+
|
|
1132
|
+
Fires whenever aws ran, regardless of whether any bursts were emitted. The
|
|
1133
|
+
methodology limitation is worth knowing even if this run produced no
|
|
1134
|
+
burst findings, because the absence is itself window-dependent.
|
|
1135
|
+
|
|
1136
|
+
The ``--all`` rider is keyed to CLOUDTRAIL ACTUALLY being narrowed (an
|
|
1137
|
+
explicit --since/--until), NOT run-level default-window activity: CloudTrail
|
|
1138
|
+
opts out of the auto-default window, so on a mixed unqualified run
|
|
1139
|
+
(dns/syslog windowed) it loaded FULL and widening would not help. Rides the
|
|
1140
|
+
EXISTING note (no new note, no position change) — placeholder voice, flag
|
|
1141
|
+
for the qmail error-voice pass.
|
|
1142
|
+
"""
|
|
1143
|
+
if "aws" not in plan.will_run:
|
|
1144
|
+
return None
|
|
1145
|
+
note = (
|
|
1146
|
+
"aws: first-seen actions are first-seen within this loaded window — an "
|
|
1147
|
+
"action that is routinely used but absent earlier in the window reads "
|
|
1148
|
+
"as first-seen."
|
|
1149
|
+
)
|
|
1150
|
+
if cloudtrail_narrowed:
|
|
1151
|
+
note += " Run with --all for a full-baseline analysis."
|
|
1152
|
+
return note
|
|
1153
|
+
|
|
1154
|
+
|
|
1155
|
+
def _aws_no_interactive_note(
|
|
1156
|
+
plan: RunPlan,
|
|
1157
|
+
logs: dict[str, pd.DataFrame],
|
|
1158
|
+
*,
|
|
1159
|
+
cloudtrail_narrowed: bool,
|
|
1160
|
+
) -> str | None:
|
|
1161
|
+
"""Disclose the silent aws "nothing" when events loaded but zero are
|
|
1162
|
+
interactive-lane (aws.run returns [] with no finding).
|
|
1163
|
+
|
|
1164
|
+
Pure derivation via the detector's public ``interactive_count`` helper
|
|
1165
|
+
(mirrors ``_aws_below_floor_note``). Fires when aws is planned, the
|
|
1166
|
+
``*.json*`` frame is non-empty, and no event is interactive-lane. The
|
|
1167
|
+
``--all`` suffix is conditional on ``cloudtrail_narrowed`` — widening only
|
|
1168
|
+
helps when an explicit window narrowed the load; on an unqualified run
|
|
1169
|
+
CloudTrail already loaded full, so widening cannot surface interactive
|
|
1170
|
+
events that do not exist. Placeholder voice (flag for the qmail pass).
|
|
1171
|
+
"""
|
|
1172
|
+
if "aws" not in plan.will_run:
|
|
1173
|
+
return None
|
|
1174
|
+
mod = plan.detectors.get("aws")
|
|
1175
|
+
if mod is None or not hasattr(mod, "interactive_count"):
|
|
1176
|
+
return None
|
|
1177
|
+
df = logs.get("*.json*")
|
|
1178
|
+
if df is None or df.empty:
|
|
1179
|
+
return None
|
|
1180
|
+
if mod.interactive_count(df) != 0:
|
|
1181
|
+
return None
|
|
1182
|
+
note = (
|
|
1183
|
+
f"aws: {len(df)} CloudTrail events loaded but none are interactive-lane — "
|
|
1184
|
+
"aws scores only interactive activity, so nothing was analyzed."
|
|
1185
|
+
)
|
|
1186
|
+
if cloudtrail_narrowed:
|
|
1187
|
+
note += " Run with --all for full history."
|
|
1188
|
+
return note
|
|
1189
|
+
|
|
1190
|
+
|
|
1191
|
+
def _home_net_note(plan: RunPlan, config: dict[str, Any]) -> str | None:
|
|
1192
|
+
"""RunSummary note disclosing the internal networks in effect for scan.
|
|
1193
|
+
|
|
1194
|
+
Fires only when scan is in plan.will_run. Distinguishes default-vs-declared
|
|
1195
|
+
by reading the ``__user_set__`` provenance sidecar attached by the config
|
|
1196
|
+
loader — a pure value comparison would misclassify a user who declares the
|
|
1197
|
+
RFC1918 list verbatim as "default". When the operator did not declare
|
|
1198
|
+
home_net (no config file, or config file omits the key), the parenthetical
|
|
1199
|
+
fires; when they did declare it, the note states their value plainly.
|
|
1200
|
+
"""
|
|
1201
|
+
if "scan" not in plan.will_run:
|
|
1202
|
+
return None
|
|
1203
|
+
home_net = list(config.get("loghunter", {}).get("home_net", []))
|
|
1204
|
+
if not home_net:
|
|
1205
|
+
return None
|
|
1206
|
+
rendered = ", ".join(home_net)
|
|
1207
|
+
user_set = config.get("__user_set__", {}).get("loghunter", set())
|
|
1208
|
+
if "home_net" in user_set:
|
|
1209
|
+
return f"Internal networks: {rendered}."
|
|
1210
|
+
return (
|
|
1211
|
+
f"Internal networks: {rendered} "
|
|
1212
|
+
"(RFC1918 default — set home_net in config to override)."
|
|
1213
|
+
)
|
|
1214
|
+
|
|
1215
|
+
|
|
1216
|
+
def _source_overlap_notes(
|
|
1217
|
+
source_dirs: dict[str, list[Path]], plan: RunPlan,
|
|
1218
|
+
) -> list[str]:
|
|
1219
|
+
"""RunSummary notes when two IN-PLAN source families resolve to one directory.
|
|
1220
|
+
|
|
1221
|
+
The contamination vector: flat discovery globs overlap (``syslog`` discovers
|
|
1222
|
+
with the catch-all ``*.log*``), so a directory shared by two families has its
|
|
1223
|
+
files parsed by each front-end — one log can surface as another's finding.
|
|
1224
|
+
This is a plan-time disclosure, derived from already-resolved sources (same
|
|
1225
|
+
posture as ``_home_net_note``), not a load-time check.
|
|
1226
|
+
|
|
1227
|
+
Binding rails:
|
|
1228
|
+
|
|
1229
|
+
- **Eligibility = in-plan families only.** Derived from
|
|
1230
|
+
``set(plan.needed_logs.values())``, NOT every non-empty resolved bucket.
|
|
1231
|
+
A family configured-and-resolved but not selected (no detector in the run
|
|
1232
|
+
reads it) cannot contaminate, so two dirs colliding while only one family
|
|
1233
|
+
is planned does NOT warn. Optional multi-source detectors add only their
|
|
1234
|
+
satisfiable patterns to ``needed_logs``, so the note follows what the
|
|
1235
|
+
loader will actually read.
|
|
1236
|
+
- **Directories only.** Explicit FILE inputs are out of scope — the vector
|
|
1237
|
+
is dir-glob overlap, not a shared named file. Per-family duplicate inputs
|
|
1238
|
+
collapse (a key is recorded once per directory).
|
|
1239
|
+
- **Equal-dir ONLY (v1).** Flat discovery is non-recursive, so the shipped
|
|
1240
|
+
default (``syslog_dir=/var/log`` containing ``zeek_dir=/var/log/zeek``)
|
|
1241
|
+
does NOT contaminate and MUST NOT warn — nesting is deliberately out of
|
|
1242
|
+
scope. (CloudTrail's ``rglob`` makes nested cloudtrail an acknowledged
|
|
1243
|
+
deferred edge.)
|
|
1244
|
+
- **Deterministic ordering.** ``source_dirs`` is built in canonical key order
|
|
1245
|
+
by ``run`` (zeek, syslog, pihole, cloudtrail); first-seen preservation here
|
|
1246
|
+
keeps the rendered family list deterministic. ≥3 families at one dir → one
|
|
1247
|
+
note listing all.
|
|
1248
|
+
|
|
1249
|
+
Placeholder voice (pending the qmail error-voice pass).
|
|
1250
|
+
"""
|
|
1251
|
+
in_plan = set(plan.needed_logs.values())
|
|
1252
|
+
by_dir: dict[Path, list[str]] = {}
|
|
1253
|
+
for key, paths in source_dirs.items():
|
|
1254
|
+
if key not in in_plan:
|
|
1255
|
+
continue
|
|
1256
|
+
for p in paths:
|
|
1257
|
+
if not p.is_dir(): # explicit files out of scope
|
|
1258
|
+
continue
|
|
1259
|
+
try:
|
|
1260
|
+
resolved = p.resolve()
|
|
1261
|
+
except OSError:
|
|
1262
|
+
continue
|
|
1263
|
+
families = by_dir.setdefault(resolved, [])
|
|
1264
|
+
if key not in families: # collapse per-family duplicate inputs
|
|
1265
|
+
families.append(key)
|
|
1266
|
+
|
|
1267
|
+
notes: list[str] = []
|
|
1268
|
+
for resolved, families in by_dir.items():
|
|
1269
|
+
if len(families) >= 2:
|
|
1270
|
+
notes.append(
|
|
1271
|
+
f"{', '.join(families)} resolve to the same directory "
|
|
1272
|
+
f"({resolved}): files there matching more than one source's "
|
|
1273
|
+
"patterns are parsed by each, which can surface one log as "
|
|
1274
|
+
"another's finding. Point them at separate directories — global "
|
|
1275
|
+
"exports now auto-segment per source."
|
|
1276
|
+
)
|
|
1277
|
+
return notes
|
|
1278
|
+
|
|
1279
|
+
|
|
1280
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
1281
|
+
# digest verb — orient-before-the-hunt
|
|
1282
|
+
#
|
|
1283
|
+
# run_digest() and the helpers below are a parallel entry point to run(). They
|
|
1284
|
+
# share the loader, output-handler-building, and _derive_data_sources. Default-
|
|
1285
|
+
# window resolution now goes through the SAME loader.resolve_load_windows +
|
|
1286
|
+
# loader.apply_default_window that run() uses — the digest twin engine is gone.
|
|
1287
|
+
# Digest default-windowing stays Zeek-ONLY (the caller-side gate below): non-Zeek
|
|
1288
|
+
# digest directories continue to load full, exactly as before. Pinned by the
|
|
1289
|
+
# Zeek-directory golden plus the programmatic non-Zeek load-full tests.
|
|
1290
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
1291
|
+
|
|
1292
|
+
_HISTOGRAM_HOURLY_THRESHOLD_HOURS = 48
|
|
1293
|
+
_HISTOGRAM_MAX_BINS = 60
|
|
1294
|
+
|
|
1295
|
+
|
|
1296
|
+
# Timestamp-confidence floor for digest cards. When the parseable-ts fraction
|
|
1297
|
+
# falls below this floor (or the non-NaN span is zero), the digest banner
|
|
1298
|
+
# window dashes and the histogram line renders "(timeline unavailable)" —
|
|
1299
|
+
# the card refuses to draw a timeline it cannot trust. Set at 80% per the
|
|
1300
|
+
# confident-but-wrong defects gate: lower risks rendering a confident
|
|
1301
|
+
# timeline on junk timestamps; higher would erase orientation when a small
|
|
1302
|
+
# fraction of a syslog batch is corrupt.
|
|
1303
|
+
_DIGEST_TS_CONFIDENCE_FLOOR: float = 0.80
|
|
1304
|
+
|
|
1305
|
+
|
|
1306
|
+
def _ts_confidence(frame: pd.DataFrame) -> bool:
|
|
1307
|
+
"""True iff the frame's ``ts`` column can support an honest timeline.
|
|
1308
|
+
|
|
1309
|
+
Both conditions must hold:
|
|
1310
|
+
|
|
1311
|
+
1. ``parsed / total >= _DIGEST_TS_CONFIDENCE_FLOOR`` (default 0.80) —
|
|
1312
|
+
the parseable-ts fraction is high enough that the histogram bins
|
|
1313
|
+
reflect the bulk of the records.
|
|
1314
|
+
2. ``max(ts) - min(ts) > 0`` — the non-NaN timestamps span more than a
|
|
1315
|
+
single instant; otherwise the histogram collapses to one bin and
|
|
1316
|
+
lies about the timeline.
|
|
1317
|
+
|
|
1318
|
+
Both failure modes (low-coverage AND zero-span) render the same bare
|
|
1319
|
+
``(timeline unavailable)`` line — there is no footer disclosure in the
|
|
1320
|
+
flat card grammar, so the differentiation that the old reason sentinels
|
|
1321
|
+
enabled has no consumer.
|
|
1322
|
+
"""
|
|
1323
|
+
total = int(len(frame))
|
|
1324
|
+
if "ts" not in frame.columns or total == 0:
|
|
1325
|
+
return False
|
|
1326
|
+
ts = frame["ts"].dropna()
|
|
1327
|
+
parsed = int(len(ts))
|
|
1328
|
+
if parsed / total < _DIGEST_TS_CONFIDENCE_FLOOR:
|
|
1329
|
+
return False
|
|
1330
|
+
if parsed == 0:
|
|
1331
|
+
return False
|
|
1332
|
+
span = float(ts.max()) - float(ts.min())
|
|
1333
|
+
if span <= 0:
|
|
1334
|
+
return False
|
|
1335
|
+
return True
|
|
1336
|
+
|
|
1337
|
+
|
|
1338
|
+
def _compute_histogram(
    ts: pd.Series,
    data_window: tuple[datetime, datetime],
) -> tuple[list[int], str, int]:
    """Bin a timestamp series into an adaptive hourly or daily histogram.

    Args:
        ts: Event timestamps; null entries are ignored and the rest are cast
            to float and compared against ``data_window[0].timestamp()``.
        data_window: ``(start, end)`` of the window to bin over.

    Returns:
        ``(counts, unit, peak)`` where

        - ``counts`` holds one event count per bin across the window,
        - ``unit`` is ``"hr"`` when the window spans at most
          ``_HISTOGRAM_HOURLY_THRESHOLD_HOURS`` hours and ``"day"`` otherwise,
        - ``peak`` is the largest value in ``counts`` (0 without events).

    Why adaptive: hourly bars over a month give hundreds of unreadable bars,
    and daily bars over one hour give a single bar; neither shows shape.

    Edge handling:

    - No usable timestamps, or ``end`` before ``start``: ``([], "hr", 0)``.
    - ``start == end`` with events present: a single bin holding every event.
    - Events before ``start`` are dropped.
    - The right edge is inclusive. Any event whose bin index would fall past
      the last bin (notably one at exactly ``end`` when the span is a whole
      number of bins) is counted in the final bin rather than discarded.
      Callers are expected to pass a window with ``end >= max(ts)``.
    - More than ``_HISTOGRAM_MAX_BINS`` bins are folded by summing runs of
      adjacent bins. ``unit`` stays nominal in that case, while ``peak`` is
      taken from the folded bars so it matches what is drawn.
    """
    window_start, window_end = data_window
    span_seconds = (window_end - window_start).total_seconds()

    epochs = ts.dropna().astype(float)
    if span_seconds < 0 or epochs.empty:
        return [], "hr", 0
    if span_seconds == 0:
        # Every event shares one instant: one bin carries the whole count.
        event_total = int(len(epochs))
        return [event_total], "hr", event_total

    if span_seconds / 3600.0 <= _HISTOGRAM_HOURLY_THRESHOLD_HOURS:
        unit, bin_seconds = "hr", 3600
    else:
        unit, bin_seconds = "day", 86400

    # Ceiling of (whole seconds / bin width), never fewer than one bin.
    whole_bins, leftover = divmod(int(span_seconds), bin_seconds)
    bin_count = max(1, whole_bins + (1 if leftover else 0))

    bin_index = (
        (epochs - window_start.timestamp()) // bin_seconds
    ).astype("int64")
    # Negative indices are pre-window events; indices past the end are pulled
    # back into the final bin (inclusive right edge).
    bin_index = bin_index[bin_index >= 0].clip(upper=bin_count - 1)
    per_bin = bin_index.value_counts()
    counts = [int(per_bin.get(slot, 0)) for slot in range(bin_count)]

    if len(counts) > _HISTOGRAM_MAX_BINS:
        # Cap the rendered width by summing fixed-size runs of adjacent bins.
        run_length = -(-len(counts) // _HISTOGRAM_MAX_BINS)
        counts = [
            sum(counts[lo:lo + run_length])
            for lo in range(0, len(counts), run_length)
        ]

    peak = max(counts) if counts else 0
    return counts, unit, peak
|
|
1407
|
+
|
|
1408
|
+
|
|
1409
|
+
_DNS_ZEEK_EMPTY_COLUMNS = [
|
|
1410
|
+
"ts", "src", "query", "rtt", "ttl", "rcode", "answer", "tc", "qtype",
|
|
1411
|
+
]
|
|
1412
|
+
_DNS_PIHOLE_EMPTY_COLUMNS = [
|
|
1413
|
+
"ts", "src", "query", "event_type", "qtype", "dst", "answer",
|
|
1414
|
+
"validation", "host", "raw", "message",
|
|
1415
|
+
]
|
|
1416
|
+
_CONN_EMPTY_COLUMNS = [
|
|
1417
|
+
"src", "dst", "port", "proto", "ts", "bytes", "conn_state", "local_orig",
|
|
1418
|
+
]
|
|
1419
|
+
_SYSLOG_EMPTY_COLUMNS = ["ts", "host", "program", "raw", "message"]
|
|
1420
|
+
_CLOUDTRAIL_EMPTY_COLUMNS = [
|
|
1421
|
+
"ts", "principal", "lane", "read_write",
|
|
1422
|
+
"event_source", "event_name", "identity_type",
|
|
1423
|
+
"source_ip", "error_code", "aws_region", "event_id", "raw",
|
|
1424
|
+
]
|
|
1425
|
+
|
|
1426
|
+
|
|
1427
|
+
# (schema, source_key) → (loader glob pattern, empty-frame column set).
|
|
1428
|
+
# Mechanical mapping kept inline alongside run_digest because it's runner
|
|
1429
|
+
# plumbing — pattern + columns are runner/loader concerns, NOT source-
|
|
1430
|
+
# resolution ownership (DigestSource just carries the directory + feed +
|
|
1431
|
+
# source_key). See plan: "_PATTERN_AND_EMPTY[(schema, source_key)] inline".
|
|
1432
|
+
_DIGEST_PATTERN_AND_EMPTY: dict[tuple[str, str], tuple[str, list[str]]] = {
|
|
1433
|
+
("conn", "zeek_dir"): ("conn*.log*", _CONN_EMPTY_COLUMNS),
|
|
1434
|
+
("dns", "zeek_dir"): ("dns*.log*", _DNS_ZEEK_EMPTY_COLUMNS),
|
|
1435
|
+
("dns", "pihole_dir"): ("pihole*.log*", _DNS_PIHOLE_EMPTY_COLUMNS),
|
|
1436
|
+
("syslog", "syslog_dir"): ("*.log*", _SYSLOG_EMPTY_COLUMNS),
|
|
1437
|
+
("syslog", "zeek_dir"): ("syslog*.log*", _SYSLOG_EMPTY_COLUMNS),
|
|
1438
|
+
("cloudtrail", "cloudtrail_dir"): ("*.json*", _CLOUDTRAIL_EMPTY_COLUMNS),
|
|
1439
|
+
}
|
|
1440
|
+
|
|
1441
|
+
|
|
1442
|
+
# Inter-card separator emitted between adjacent rendered cards on a multi-card
|
|
1443
|
+
# run (stdout fan-out or --out concatenation). 40 columns of U+2500 BOX
|
|
1444
|
+
# DRAWINGS LIGHT HORIZONTAL, flush-left, with one blank line above and one
|
|
1445
|
+
# blank line below. Single-card runs (one positional, or a multi-positional
|
|
1446
|
+
# run where only one path reaches render-commit) draw no rule at all — the
|
|
1447
|
+
# emit fires only when ``leading_separator=True``, which the CLI sets from
|
|
1448
|
+
# ``rendered > 0`` AFTER a prior card's run_digest return.
|
|
1449
|
+
_DIGEST_INTER_CARD_RULE: str = "─" * 40
|
|
1450
|
+
|
|
1451
|
+
|
|
1452
|
+
def _emit_inter_card_separator(stream: Any) -> None:
|
|
1453
|
+
"""Emit the 40-col inter-card rule with bracketing blank lines."""
|
|
1454
|
+
target = stream if stream is not None else sys.stdout
|
|
1455
|
+
print(file=target)
|
|
1456
|
+
print(_DIGEST_INTER_CARD_RULE, file=target)
|
|
1457
|
+
print(file=target)
|
|
1458
|
+
|
|
1459
|
+
|
|
1460
|
+
def _render_blob_for_path(
    blob_path: Path,
    *,
    stream: Any = None,
    output_dir: Path | None = None,
    output_file: Path | None = None,
    verbose_level: int = 0,
    leading_separator: bool = False,
) -> None:
    """Summarise one file through the blob digest path and render its card.

    Two callers share this helper: the canonical blob branch
    (``schema == "blob"``) and the recognised-schema fallback, where a
    summariser failure degrades to a blob card for the same file instead of
    aborting the fan-out.

    The caller is responsible for checking that ``blob_path`` is a regular
    file. The output-routing arguments mirror ``run_digest`` so a fallback
    card lands on the same stream / ``--out`` target the original card would
    have used.

    Args:
        blob_path: File to profile.
        stream: Output stream handed to the handler builder and used for the
            separator.
        output_dir: Output directory handed to the handler builder.
        output_file: Output file handed to the handler builder.
        verbose_level: Verbosity handed to the handler builder.
        leading_separator: When true, emit the inter-card rule immediately
            before rendering. This function is the single owner of separator
            emission for blob cards, so the rule only ever precedes a card
            that reaches its render call.

    Raises:
        RuntimeError: If the output handler that was built is not a
            ``TextHandler``.
    """
    from loghunter.digest import blob as blob_digest

    blob_card = blob_digest.summarize_blob(blob_path)

    text_handler, release_handler = _build_output_handler(
        "text", output_dir, output_file, verbose_level, stream=stream,
    )
    try:
        from loghunter.outputs.text import TextHandler

        if isinstance(text_handler, TextHandler):
            if leading_separator:
                _emit_inter_card_separator(stream)
            text_handler.render_blob(blob_card)
        else:
            raise RuntimeError(
                "digest blob: _build_output_handler did not return a "
                f"TextHandler (got {type(text_handler).__name__})"
            )
    finally:
        # The handler is released on every path, including the type guard.
        release_handler()
|
|
1508
|
+
|
|
1509
|
+
|
|
1510
|
+
def run_digest(
|
|
1511
|
+
config: dict[str, Any],
|
|
1512
|
+
zeek_dir: str | Path | None = None,
|
|
1513
|
+
pihole_dir: str | Path | None = None,
|
|
1514
|
+
syslog_dir: str | Path | None = None,
|
|
1515
|
+
cloudtrail_dir: str | Path | None = None,
|
|
1516
|
+
blob_path: Path | None = None,
|
|
1517
|
+
since: datetime | None = None,
|
|
1518
|
+
until: datetime | None = None,
|
|
1519
|
+
output_format: str = "text",
|
|
1520
|
+
output_dir: Path | None = None,
|
|
1521
|
+
output_file: Path | None = None,
|
|
1522
|
+
stream: Any = None,
|
|
1523
|
+
verbose_level: int = 0,
|
|
1524
|
+
dry_run: bool = False,
|
|
1525
|
+
load_all: bool = False,
|
|
1526
|
+
skip_confirm: bool = False,
|
|
1527
|
+
schema: str = "conn",
|
|
1528
|
+
fallback_blob_path: Path | None = None,
|
|
1529
|
+
leading_separator: bool = False,
|
|
1530
|
+
show_progress: bool = True,
|
|
1531
|
+
) -> None:
|
|
1532
|
+
"""Digest entry point — orient-before-the-hunt for a single schema.
|
|
1533
|
+
|
|
1534
|
+
Loads the source frame, computes spine ambient facts and a temporal
|
|
1535
|
+
histogram, dispatches to the schema summariser, assembles a DigestCard,
|
|
1536
|
+
and renders it. Does NOT build a RunPlan, does NOT run the allowlist
|
|
1537
|
+
loop, does NOT produce Findings.
|
|
1538
|
+
|
|
1539
|
+
Pre-allowlist tap: the loaded frame is consumed BEFORE the allowlist
|
|
1540
|
+
seam. Allowlisted infrastructure (resolvers, pollers) is part of what's
|
|
1541
|
+
in the pile and stays on the sonar. This function MUST NOT call
|
|
1542
|
+
build_matcher or AllowlistMatcher.filter_df.
|
|
1543
|
+
|
|
1544
|
+
Source-dir parameters (``zeek_dir`` / ``pihole_dir`` / ``syslog_dir`` /
|
|
1545
|
+
``cloudtrail_dir``) are EXPLICIT OVERRIDES with ``None`` meaning
|
|
1546
|
+
"no override." Pass a string or ``Path``;
|
|
1547
|
+
``loghunter.common.sources.resolve_digest_source`` owns the per-schema
|
|
1548
|
+
candidate ladder, wrong-key + XOR + not-configured errors (byte-preserved
|
|
1549
|
+
from the previous in-line strings), and is the SOLE site that converts
|
|
1550
|
+
a source-dir string to a Path. CLI callers thread raw parsed strings;
|
|
1551
|
+
programmatic callers can pass already-resolved ``Path``s or let ``None``
|
|
1552
|
+
fall back to ``config["loghunter"][candidate]`` (LH_ROOT applied).
|
|
1553
|
+
|
|
1554
|
+
``leading_separator`` drives the multi-card inter-card rule. The CLI
|
|
1555
|
+
fan-out sets it from ``rendered > 0`` after a previous card committed
|
|
1556
|
+
to render. Single-owner emission: run_digest emits for schema cards
|
|
1557
|
+
(immediately before handler.render_digest); on the summariser-failure
|
|
1558
|
+
fallback arm it threads the flag through to _render_blob_for_path
|
|
1559
|
+
(which owns blob emission) and does NOT emit itself.
|
|
1560
|
+
"""
|
|
1561
|
+
if output_format != "text":
|
|
1562
|
+
raise ValueError(
|
|
1563
|
+
f"digest currently supports only --output=text (got {output_format!r})"
|
|
1564
|
+
)
|
|
1565
|
+
if schema not in ("conn", "dns", "syslog", "cloudtrail", "blob"):
|
|
1566
|
+
raise ValueError(f"digest: unsupported schema {schema!r}")
|
|
1567
|
+
|
|
1568
|
+
# The blob path is reached ONLY via the CLI sniff router, never via an
|
|
1569
|
+
# operator token (the `digest blob PATH` token is gone). The blob
|
|
1570
|
+
# terminal branch is small by design: profile the single file and hand
|
|
1571
|
+
# off to _render_blob_for_path, which builds + renders the card. No
|
|
1572
|
+
# loader, no allowlist, no histogram, no DigestCard — blob has no
|
|
1573
|
+
# parsed frame.
|
|
1574
|
+
if schema == "blob":
|
|
1575
|
+
if blob_path is None:
|
|
1576
|
+
raise ValueError(
|
|
1577
|
+
"digest blob: PATH not provided — pass a positional PATH"
|
|
1578
|
+
)
|
|
1579
|
+
if not blob_path.is_file():
|
|
1580
|
+
raise ValueError(f"digest blob: not a file: {blob_path}")
|
|
1581
|
+
|
|
1582
|
+
if dry_run:
|
|
1583
|
+
print("LogHunter · digest dry run")
|
|
1584
|
+
print(_SEP)
|
|
1585
|
+
print(" schema: blob")
|
|
1586
|
+
print(f" path: {blob_path}")
|
|
1587
|
+
print(" window: (none — blob extracts no fields)")
|
|
1588
|
+
print(_SEP)
|
|
1589
|
+
return
|
|
1590
|
+
|
|
1591
|
+
_render_blob_for_path(
|
|
1592
|
+
blob_path,
|
|
1593
|
+
stream=stream,
|
|
1594
|
+
output_dir=output_dir,
|
|
1595
|
+
output_file=output_file,
|
|
1596
|
+
verbose_level=verbose_level,
|
|
1597
|
+
leading_separator=leading_separator,
|
|
1598
|
+
)
|
|
1599
|
+
return
|
|
1600
|
+
|
|
1601
|
+
if blob_path is not None:
|
|
1602
|
+
raise ValueError(
|
|
1603
|
+
f"digest {schema}: blob_path is only valid for the blob schema"
|
|
1604
|
+
)
|
|
1605
|
+
|
|
1606
|
+
cfg_lh = config.get("loghunter", {})
|
|
1607
|
+
|
|
1608
|
+
# Single owner of digest source resolution. resolve_digest_source runs the
|
|
1609
|
+
# per-schema candidate ladder + wrong-key / XOR / not-configured guards
|
|
1610
|
+
# with byte-preserved error strings, and is the SOLE site that converts
|
|
1611
|
+
# a source-dir string to a Path on the digest path.
|
|
1612
|
+
ds = resolve_digest_source(
|
|
1613
|
+
config, schema,
|
|
1614
|
+
overrides={
|
|
1615
|
+
"zeek_dir": zeek_dir,
|
|
1616
|
+
"syslog_dir": syslog_dir,
|
|
1617
|
+
"pihole_dir": pihole_dir,
|
|
1618
|
+
"cloudtrail_dir": cloudtrail_dir,
|
|
1619
|
+
},
|
|
1620
|
+
)
|
|
1621
|
+
feed = ds.feed
|
|
1622
|
+
source_dir = ds.directory
|
|
1623
|
+
source_key = ds.source_key
|
|
1624
|
+
pattern, empty_columns = _DIGEST_PATTERN_AND_EMPTY[(schema, source_key)]
|
|
1625
|
+
|
|
1626
|
+
from loghunter.common import loader
|
|
1627
|
+
|
|
1628
|
+
# Default-window resolution is Zeek-ONLY on the digest path (CODE.md
|
|
1629
|
+
# boundedness rule): non-Zeek digest directories (pihole/syslog/cloudtrail)
|
|
1630
|
+
# load full and filter by an explicit window only. The caller-side gate below
|
|
1631
|
+
# IS the behavior-preservation point — digest invokes the SHARED resolver
|
|
1632
|
+
# (loader.resolve_load_windows) for the Zeek source alone, NOT a duplicate
|
|
1633
|
+
# engine. dated → precise (since, until); flat / mixed → post-load trim_span.
|
|
1634
|
+
dated_window: tuple[datetime, datetime] | None = None
|
|
1635
|
+
flat_span: timedelta | None = None
|
|
1636
|
+
keep_null = False
|
|
1637
|
+
default_note = None # no banner on the flat digest card — never rendered
|
|
1638
|
+
if source_key == "zeek_dir":
|
|
1639
|
+
default_spec = cfg_lh.get("default_window", "1d")
|
|
1640
|
+
_digest_windows = loader.resolve_load_windows(
|
|
1641
|
+
{pattern: source_key}, {source_key: [source_dir]}, default_spec,
|
|
1642
|
+
since=since, until=until, load_all=load_all,
|
|
1643
|
+
)
|
|
1644
|
+
if _digest_windows:
|
|
1645
|
+
w = _digest_windows[0]
|
|
1646
|
+
dated_window = w.select_window if w.trim_span is None else None
|
|
1647
|
+
flat_span = w.trim_span
|
|
1648
|
+
keep_null = w.keep_null
|
|
1649
|
+
|
|
1650
|
+
if dry_run:
|
|
1651
|
+
print("LogHunter · digest dry run")
|
|
1652
|
+
print(_SEP)
|
|
1653
|
+
print(f" schema: {schema}")
|
|
1654
|
+
if feed is not None:
|
|
1655
|
+
print(f" feed: {feed}")
|
|
1656
|
+
print(f" {source_key}:{' ' * max(0, 13 - len(source_key) - 1)} {source_dir}")
|
|
1657
|
+
if dated_window is not None:
|
|
1658
|
+
print(
|
|
1659
|
+
f" window: {dated_window[0].isoformat()} → "
|
|
1660
|
+
f"{dated_window[1].isoformat()} (dated default)"
|
|
1661
|
+
)
|
|
1662
|
+
elif flat_span is not None:
|
|
1663
|
+
print(f" window: last {cfg_lh.get('default_window', '1d')} of available data (flat default)")
|
|
1664
|
+
elif since is not None or until is not None:
|
|
1665
|
+
since_str = since.isoformat() if since else "beginning of data"
|
|
1666
|
+
until_str = until.isoformat() if until else "end of data"
|
|
1667
|
+
print(f" window: {since_str} → {until_str}")
|
|
1668
|
+
elif load_all:
|
|
1669
|
+
print(" window: all available data (--all)")
|
|
1670
|
+
else:
|
|
1671
|
+
print(" window: all available data")
|
|
1672
|
+
print(_SEP)
|
|
1673
|
+
return
|
|
1674
|
+
|
|
1675
|
+
needed_logs = {pattern: source_key}
|
|
1676
|
+
# Digest compat: load_required_logs is list-only. Wrap [source_dir] for
|
|
1677
|
+
# the degenerate one-element case — card-per-file behavior unchanged,
|
|
1678
|
+
# the union plumbing runs as a single-element passthrough.
|
|
1679
|
+
source_dirs = {source_key: [source_dir]}
|
|
1680
|
+
source_windows = (
|
|
1681
|
+
{source_key: dated_window} if dated_window is not None else None
|
|
1682
|
+
)
|
|
1683
|
+
|
|
1684
|
+
# Single-file Zeek bypass: the file was already content-identified by sniff;
|
|
1685
|
+
# discover_zeek_files' fnmatch(basename, pattern) gate is meaningless for an
|
|
1686
|
+
# explicitly-named single file and was dropping date-prefixed Zeek logs
|
|
1687
|
+
# (e.g. 2026-06-09.conn.log) into zero-row cards. Pi-hole, syslog, and
|
|
1688
|
+
# CloudTrail loaders already accept explicit files without a basename gate;
|
|
1689
|
+
# only the Zeek path needs the bypass. discover_zeek_files itself is
|
|
1690
|
+
# unchanged — the detect path still uses its single-file gate as a type
|
|
1691
|
+
# check.
|
|
1692
|
+
if source_key == "zeek_dir" and source_dir.is_file():
|
|
1693
|
+
s_since, s_until = (
|
|
1694
|
+
dated_window if dated_window is not None else (since, until)
|
|
1695
|
+
)
|
|
1696
|
+
warnings: list[str] = []
|
|
1697
|
+
try:
|
|
1698
|
+
data_size_bytes = source_dir.stat().st_size
|
|
1699
|
+
except OSError:
|
|
1700
|
+
data_size_bytes = 0
|
|
1701
|
+
df = loader.load_logs(
|
|
1702
|
+
source_dir.parent, pattern, s_since, s_until,
|
|
1703
|
+
_files=[source_dir], _warnings=warnings,
|
|
1704
|
+
show_progress=show_progress,
|
|
1705
|
+
)
|
|
1706
|
+
# Preserve schema-warning parity with load_required_logs so
|
|
1707
|
+
# malformed-but-parseable Zeek single files behave identically on the
|
|
1708
|
+
# bypass and directory paths.
|
|
1709
|
+
schema_warning = loader._schema_warning(pattern, df)
|
|
1710
|
+
if schema_warning:
|
|
1711
|
+
warnings.append(schema_warning)
|
|
1712
|
+
logs = {pattern: df}
|
|
1713
|
+
record_counts = {pattern: len(df)} if not df.empty else {}
|
|
1714
|
+
load_result = loader.LoadResult(
|
|
1715
|
+
logs=logs,
|
|
1716
|
+
record_counts=record_counts,
|
|
1717
|
+
data_window=loader._data_window(logs),
|
|
1718
|
+
warnings=warnings,
|
|
1719
|
+
data_size_bytes=data_size_bytes,
|
|
1720
|
+
)
|
|
1721
|
+
else:
|
|
1722
|
+
load_result = loader.load_required_logs(
|
|
1723
|
+
needed_logs,
|
|
1724
|
+
source_dirs,
|
|
1725
|
+
since,
|
|
1726
|
+
until,
|
|
1727
|
+
verbose=(verbose_level >= 1),
|
|
1728
|
+
source_windows=source_windows,
|
|
1729
|
+
show_progress=show_progress,
|
|
1730
|
+
)
|
|
1731
|
+
|
|
1732
|
+
if flat_span is not None:
|
|
1733
|
+
load_result = loader.apply_default_window(
|
|
1734
|
+
load_result, [pattern], flat_span, keep_null=keep_null,
|
|
1735
|
+
)
|
|
1736
|
+
|
|
1737
|
+
for warning in load_result.warnings:
|
|
1738
|
+
print(f"Warning: {warning}", file=sys.stderr)
|
|
1739
|
+
|
|
1740
|
+
total_records = sum(load_result.record_counts.values())
|
|
1741
|
+
warn_above: int = cfg_lh.get("warn_above", 5_000_000)
|
|
1742
|
+
if total_records > warn_above and not skip_confirm:
|
|
1743
|
+
try:
|
|
1744
|
+
answer = input(
|
|
1745
|
+
f"{total_records:,} records found. This may take a while. "
|
|
1746
|
+
"Continue? [y/N] "
|
|
1747
|
+
)
|
|
1748
|
+
except (EOFError, KeyboardInterrupt):
|
|
1749
|
+
answer = ""
|
|
1750
|
+
if answer.strip().lower() not in ("y", "yes"):
|
|
1751
|
+
raise ExportAborted("loghunter: aborted by user")
|
|
1752
|
+
|
|
1753
|
+
if load_result.data_window is not None:
|
|
1754
|
+
data_window = load_result.data_window
|
|
1755
|
+
elif since or until:
|
|
1756
|
+
data_window = (
|
|
1757
|
+
since or datetime.now(timezone.utc),
|
|
1758
|
+
until or datetime.now(timezone.utc),
|
|
1759
|
+
)
|
|
1760
|
+
else:
|
|
1761
|
+
_now = datetime.now(timezone.utc)
|
|
1762
|
+
data_window = (_now, _now)
|
|
1763
|
+
|
|
1764
|
+
# data_sources / notes were the RunSummary banner inputs; the flat
|
|
1765
|
+
# digest card has no banner so they are no longer consumed here.
|
|
1766
|
+
# default_note is the unbounded-source default-window note — same
|
|
1767
|
+
# provenance disclosure, no card surface to attach it to under the new
|
|
1768
|
+
# grammar. Reference both so unused-arg checkers stay quiet.
|
|
1769
|
+
_ = _derive_data_sources(needed_logs, load_result.record_counts)
|
|
1770
|
+
_ = default_note
|
|
1771
|
+
|
|
1772
|
+
# Identity line 1 always carries the source's name — file or directory.
|
|
1773
|
+
# Directory-mode bare-config digest gets a sensible identity even though
|
|
1774
|
+
# the source is a multi-file load.
|
|
1775
|
+
source_name = source_dir.name
|
|
1776
|
+
|
|
1777
|
+
# Pre-allowlist tap — pull the frame straight out of load_result.
|
|
1778
|
+
# NO build_matcher. NO AllowlistMatcher.filter_df. Digest is the orient
|
|
1779
|
+
# step; allowlisted infrastructure (resolvers, pollers) is part of
|
|
1780
|
+
# what's in here and stays on the sonar.
|
|
1781
|
+
#
|
|
1782
|
+
# Frame is the source of truth for whether a schema card can render.
|
|
1783
|
+
# Empty frame → DigestEmpty (control signal, NOT a ValueError). The
|
|
1784
|
+
# file was understood — it simply had no parseable records. The CLI
|
|
1785
|
+
# narrates this distinctly from a real per-path failure. Applies ONLY
|
|
1786
|
+
# to the recognized-schema path; blob has its own terminal branch
|
|
1787
|
+
# above and an empty FILE was already caught at sniff time as
|
|
1788
|
+
# state="empty" in the CLI fan-out.
|
|
1789
|
+
frame = load_result.logs.get(pattern)
|
|
1790
|
+
if frame is None or frame.empty:
|
|
1791
|
+
raise DigestEmpty(basename=source_dir.name, schema=schema)
|
|
1792
|
+
# empty_columns reserved for any future tolerant-load path; the
|
|
1793
|
+
# current contract is "recognized schema must have at least one row
|
|
1794
|
+
# to render a card", enforced by the raise above.
|
|
1795
|
+
_ = empty_columns
|
|
1796
|
+
|
|
1797
|
+
# Timestamp-confidence gate (now boolean). Below the floor OR with a
|
|
1798
|
+
# zero non-NaN span, the timeline cannot be drawn honestly — dash the
|
|
1799
|
+
# identity-line window AND signal timeline_unavailable to the
|
|
1800
|
+
# renderer, which emits the bare "(timeline unavailable)" histogram
|
|
1801
|
+
# replacement. Both former failure modes (low coverage AND zero span)
|
|
1802
|
+
# render identically; the flat card has no footer surface to
|
|
1803
|
+
# differentiate them.
|
|
1804
|
+
if _ts_confidence(frame):
|
|
1805
|
+
histogram_counts, histogram_unit, histogram_peak = _compute_histogram(
|
|
1806
|
+
frame["ts"], data_window,
|
|
1807
|
+
)
|
|
1808
|
+
timeline_unavailable = False
|
|
1809
|
+
else:
|
|
1810
|
+
data_window = (None, None)
|
|
1811
|
+
histogram_counts = []
|
|
1812
|
+
histogram_unit = "hr"
|
|
1813
|
+
histogram_peak = 0
|
|
1814
|
+
timeline_unavailable = True
|
|
1815
|
+
|
|
1816
|
+
from loghunter import digest
|
|
1817
|
+
from loghunter.common.finding import DigestCard
|
|
1818
|
+
|
|
1819
|
+
# Narrow defence-in-depth wrap (item 2): summariser dispatch + body +
|
|
1820
|
+
# DigestCard construction. If the summariser raises on a pathological
|
|
1821
|
+
# frame (e.g. a duplicate `src` column producing pandas' "Grouper for
|
|
1822
|
+
# 'src' not 1-dimensional"), fall through to a blob card for the same
|
|
1823
|
+
# file rather than aborting the fan-out. Glenn's scope discipline:
|
|
1824
|
+
# - DOES catch summariser raises and DigestCard-construction raises.
|
|
1825
|
+
# - DOES NOT catch loader/parser errors (above this wrap).
|
|
1826
|
+
# - DOES NOT catch DigestEmpty (raised above; control signal).
|
|
1827
|
+
# - DOES NOT catch handler/render errors (below this wrap).
|
|
1828
|
+
# - DOES NOT catch BaseException — KeyboardInterrupt / SystemExit
|
|
1829
|
+
# propagate.
|
|
1830
|
+
# Bare-config callers (no single-file fallback path available) pass
|
|
1831
|
+
# fallback_blob_path=None and the exception re-raises to the caller's
|
|
1832
|
+
# existing ValueError arm.
|
|
1833
|
+
try:
|
|
1834
|
+
summarizer = digest.get_summarizer(schema)
|
|
1835
|
+
if schema in ("dns", "syslog"):
|
|
1836
|
+
body = summarizer(frame, feed)
|
|
1837
|
+
else:
|
|
1838
|
+
body = summarizer(frame)
|
|
1839
|
+
card = DigestCard(
|
|
1840
|
+
schema=schema,
|
|
1841
|
+
source_name=source_name,
|
|
1842
|
+
data_window=data_window,
|
|
1843
|
+
record_count=total_records,
|
|
1844
|
+
histogram_counts=histogram_counts,
|
|
1845
|
+
histogram_unit=histogram_unit,
|
|
1846
|
+
histogram_peak=histogram_peak,
|
|
1847
|
+
zone1_extras=body["zone1_extras"],
|
|
1848
|
+
insights=body["insights"],
|
|
1849
|
+
fields=body["fields"],
|
|
1850
|
+
data_size_bytes=load_result.data_size_bytes,
|
|
1851
|
+
timeline_unavailable=timeline_unavailable,
|
|
1852
|
+
)
|
|
1853
|
+
except Exception as exc:
|
|
1854
|
+
if fallback_blob_path is None:
|
|
1855
|
+
raise
|
|
1856
|
+
# One-line stderr breadcrumb — verbose-gated so the raw exception
|
|
1857
|
+
# text does not leak to default-mode users (the "actionable
|
|
1858
|
+
# messages, never raw exceptions" rail). Default runs see the
|
|
1859
|
+
# blob card as the whole story; --verbose retains the breadcrumb
|
|
1860
|
+
# for debugging. The error-voice pass will tune the phrasing;
|
|
1861
|
+
# voice is PLACEHOLDER.
|
|
1862
|
+
if verbose_level >= 1:
|
|
1863
|
+
print(
|
|
1864
|
+
f"digest: {fallback_blob_path.name}: summariser failed "
|
|
1865
|
+
f"({type(exc).__name__}: {exc}); falling back to blob",
|
|
1866
|
+
file=sys.stderr,
|
|
1867
|
+
)
|
|
1868
|
+
# Separator single-owner: _render_blob_for_path owns blob-card
|
|
1869
|
+
# emission. We thread the flag and do NOT emit here, or the run
|
|
1870
|
+
# would print two rules around the same fallback card.
|
|
1871
|
+
_render_blob_for_path(
|
|
1872
|
+
fallback_blob_path,
|
|
1873
|
+
stream=stream,
|
|
1874
|
+
output_dir=output_dir,
|
|
1875
|
+
output_file=output_file,
|
|
1876
|
+
verbose_level=verbose_level,
|
|
1877
|
+
leading_separator=leading_separator,
|
|
1878
|
+
)
|
|
1879
|
+
return
|
|
1880
|
+
|
|
1881
|
+
handler, close_handler = _build_output_handler(
|
|
1882
|
+
"text", output_dir, output_file, verbose_level, stream=stream,
|
|
1883
|
+
)
|
|
1884
|
+
try:
|
|
1885
|
+
from loghunter.outputs.text import TextHandler
|
|
1886
|
+
if not isinstance(handler, TextHandler):
|
|
1887
|
+
raise RuntimeError(
|
|
1888
|
+
"digest: _build_output_handler did not return a TextHandler "
|
|
1889
|
+
f"(got {type(handler).__name__})"
|
|
1890
|
+
)
|
|
1891
|
+
if leading_separator:
|
|
1892
|
+
_emit_inter_card_separator(stream)
|
|
1893
|
+
handler.render_digest(card)
|
|
1894
|
+
finally:
|
|
1895
|
+
close_handler()
|