loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,1010 @@
|
|
|
1
|
+
"""The uniform load pipeline — ``run_load`` + the ``_SOURCE_LOADERS`` registry.
|
|
2
|
+
|
|
3
|
+
The protected core: every detector source-family load flows through ``run_load``
|
|
4
|
+
(progress wrap, coverage tracking, default-window filtering, verbose-gated
|
|
5
|
+
wrong-family skips, read-corruption handling — written ONCE). A new format = one
|
|
6
|
+
``SourceLoader`` in ``_SOURCE_LOADERS`` → it inherits the treatment by
|
|
7
|
+
construction and cannot diverge by happenstance. ``_open_log`` / ``progress`` are
|
|
8
|
+
reached through the package facade so test monkeypatches take effect here.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import gzip
|
|
14
|
+
import itertools
|
|
15
|
+
import json
|
|
16
|
+
import lzma
|
|
17
|
+
import sys
|
|
18
|
+
from dataclasses import dataclass
|
|
19
|
+
from datetime import datetime, timedelta
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any, Callable
|
|
22
|
+
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
import loghunter.common.loader as _loader # facade: _open_log / progress patch-through (call-time only)
|
|
26
|
+
from loghunter.common.config import parse_window_span
|
|
27
|
+
from loghunter.common.loader.diagnostics import (
|
|
28
|
+
_cloudtrail_parse_warning,
|
|
29
|
+
_log_type,
|
|
30
|
+
_schema_warning,
|
|
31
|
+
_zeek_file_read_warning,
|
|
32
|
+
)
|
|
33
|
+
from loghunter.common.loader.discovery import (
|
|
34
|
+
_default_resolve_window,
|
|
35
|
+
_dir_has_regular_files,
|
|
36
|
+
_discover_syslog_files,
|
|
37
|
+
_flat_resolve_window,
|
|
38
|
+
_stem_hostname,
|
|
39
|
+
_syslog_files,
|
|
40
|
+
_zeek_resolve_window,
|
|
41
|
+
discover_cloudtrail_files,
|
|
42
|
+
discover_zeek_files,
|
|
43
|
+
)
|
|
44
|
+
from loghunter.common.loader.io import _safe_resolve, _union_dedupe
|
|
45
|
+
from loghunter.common.loader.sniff import _is_ndjson
|
|
46
|
+
from loghunter.common.loader.types import (
|
|
47
|
+
_CLOUDTRAIL_COLUMNS,
|
|
48
|
+
_PIHOLE_COLUMNS,
|
|
49
|
+
_SYSLOG_COLUMNS,
|
|
50
|
+
CoverageTracker,
|
|
51
|
+
LoadResult,
|
|
52
|
+
RotationSkipInfo,
|
|
53
|
+
SourceCoverage,
|
|
54
|
+
_data_window,
|
|
55
|
+
)
|
|
56
|
+
from loghunter.common.loader.windowing import (
|
|
57
|
+
LoadWindow,
|
|
58
|
+
_apply_ts_filter,
|
|
59
|
+
_missing_ts,
|
|
60
|
+
_rotation_windowed_files,
|
|
61
|
+
is_bounded,
|
|
62
|
+
)
|
|
63
|
+
from loghunter.parsers.cloudtrail import parse_event as _parse_cloudtrail_event
|
|
64
|
+
from loghunter.parsers.dnsmasq import parse_line as _parse_dnsmasq_line
|
|
65
|
+
from loghunter.parsers.syslog import parse_line as _parse_syslog_line
|
|
66
|
+
from loghunter.parsers.zeek import (
|
|
67
|
+
_normalize_conn_df,
|
|
68
|
+
_normalize_dns_df,
|
|
69
|
+
_normalize_zeek_syslog_df,
|
|
70
|
+
)
|
|
71
|
+
from loghunter.parsers.zeek_tsv import parse_tsv_log as _parse_tsv_log
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
@dataclass(frozen=True)
class SourceLoader:
    """Immutable description of how one source family is loaded.

    ``run_load`` owns everything that is the same for every family
    (progress, coverage, windowing, corruption handling, the verbose-gated
    wrong-family skip). A ``SourceLoader`` supplies only the parts that
    differ, so registering one in ``_SOURCE_LOADERS`` is all a new format
    needs in order to receive the shared treatment.

    Attributes:
        discover: Window-aware file discovery for a single input path;
            called as ``discover(path, pattern, since, until)``.
        mode: ``"stream"`` when ``parse`` yields canonical row dicts,
            ``"frame"`` when it returns a pre-filter DataFrame (Zeek,
            which is normalised later).
        parse: Format-specific decoder, called as
            ``parse(line_iter, *, path, warnings)``. ``line_iter`` is the
            progress-wrapped line iterator, ``path`` gives per-file context
            (hostname stems / file identity) and ``warnings`` is the sink
            for content-parse warnings.
        ts_policy: ``"keep"`` lets rows with a NaN timestamp bypass the
            window; ``"drop"`` discards them before windowing.
        columns: Column list that keeps an empty stream-mode frame stable.
            ``None`` for frame mode, where an empty result stays a bare
            ``pd.DataFrame()`` because Zeek's columns come from parse +
            normalize rather than a static list.
        should_skip: Optional wrong-family guard. Returns a skip message
            (printed to stderr only under ``verbose=True``) or ``None`` to
            keep the file. A value of ``None`` means "never skip".
        normalize: Optional post-assembly hook called as
            ``normalize(df, pattern)`` (the ``_NORMALIZER_MAP`` dispatch for
            Zeek); ``None`` for the flat loaders.
        unit: Unit label shown on the progress bar.
        window_select: Optional ordinal-rotation peek-prune used by flat
            syslog / pihole, called as
            ``window_select(files, since, until, *, verbose)`` and returning
            ``(selected, RotationSkipInfo)``. ``None`` (Zeek / CloudTrail)
            leaves the discovered candidates untouched.
        default_window_eligible: Whether the automatic default window
            applies to this family.
        resolve_window: Optional per-family default-window resolver; see the
            comment on the field.
    """

    discover: Callable[[Path, str, datetime | None, datetime | None], list[Path]]
    mode: str  # "stream" | "frame"
    parse: Callable[..., Any]
    ts_policy: str  # "keep" | "drop"
    columns: list[str] | None
    should_skip: Callable[[Path], str | None] | None
    normalize: Callable[[pd.DataFrame, str], pd.DataFrame] | None
    unit: str = " lines"
    # Typed as ``Callable[...]`` because the real callable takes a
    # keyword-only ``verbose`` argument, which the parameter-list form of
    # ``Callable`` cannot express. Defaulted so that non-flat registry
    # entries and programmatic constructions need not mention it.
    window_select: Callable[..., tuple[list[Path], RotationSkipInfo]] | None = None
    # True for zeek / syslog / pihole. CloudTrail opts out: the aws detector
    # is baseline-relative (novelty needs full history), so a recent-slice
    # auto-window would defeat it. Explicit windows still apply because they
    # are resolved before the default.
    default_window_eligible: bool = True
    # Called as ``(strategy, dirs, pattern, span)`` and returns
    # ``(select_window, trim_span)``. ``None`` selects the universal default
    # (``_default_resolve_window``: load everything, trim after loading).
    # Families with special temporal semantics (dated Zeek dirs, flat
    # rotation-peek) declare their resolver on the registry entry so the
    # runner needs no edits; the owning strategy is passed in so the
    # resolver can reach ``strategy.discover`` without importing the
    # registry.
    resolve_window: Callable[..., tuple[Any, timedelta | None]] | None = None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _zeek_records_from_lines(line_iter: Any) -> list[dict[str, Any]]:
|
|
141
|
+
"""Iterate ``line_iter`` and return Zeek NDJSON records.
|
|
142
|
+
|
|
143
|
+
Skips blank and ``#``-comment lines, drops records with ``ts is None`` or
|
|
144
|
+
malformed JSON. Shared by ``_parse_ndjson_file`` (path-driven NDJSON
|
|
145
|
+
parse) and the Zeek strategy's NDJSON branch (line-iter-driven).
|
|
146
|
+
"""
|
|
147
|
+
records: list[dict[str, Any]] = []
|
|
148
|
+
for line in line_iter:
|
|
149
|
+
line = line.strip()
|
|
150
|
+
if not line or line.startswith("#"):
|
|
151
|
+
continue
|
|
152
|
+
try:
|
|
153
|
+
record = json.loads(line)
|
|
154
|
+
except json.JSONDecodeError:
|
|
155
|
+
continue
|
|
156
|
+
if record.get("ts") is None:
|
|
157
|
+
continue
|
|
158
|
+
records.append(record)
|
|
159
|
+
return records
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _zeek_parse_from_lines(line_iter: Any) -> pd.DataFrame:
|
|
163
|
+
"""Prefix-preserving NDJSON-vs-TSV dispatch for a Zeek line iterator.
|
|
164
|
+
|
|
165
|
+
Glenn rev-3 fix: a one-line peek would discard ``#separator`` / ``#fields``
|
|
166
|
+
/ ``#types`` directives that ``parse_tsv_log`` requires. This helper
|
|
167
|
+
accumulates a ``prefix`` list of every consumed line while scanning, so the
|
|
168
|
+
parser sees the full header block.
|
|
169
|
+
|
|
170
|
+
Decision rule:
|
|
171
|
+
- NDJSON when the FIRST non-blank, non-comment line starts with ``{``.
|
|
172
|
+
- TSV when ``#separator`` appears anywhere in the scanned ``prefix``.
|
|
173
|
+
- Bare empty ``DataFrame`` otherwise (header-only / empty / non-Zeek
|
|
174
|
+
stub) — preserves today's bare-frame shape for date-pruned / empty /
|
|
175
|
+
all-filtered Zeek paths.
|
|
176
|
+
|
|
177
|
+
Parse runs over ``itertools.chain(prefix, line_iter)`` so EVERY consumed
|
|
178
|
+
line — header directives included — reaches the parser. Header-only TSV
|
|
179
|
+
files retain their header block; ``parse_tsv_log`` produces whatever it
|
|
180
|
+
makes of header-only input today.
|
|
181
|
+
"""
|
|
182
|
+
prefix: list[str] = []
|
|
183
|
+
is_ndjson: bool | None = None
|
|
184
|
+
has_separator = False
|
|
185
|
+
for line in line_iter:
|
|
186
|
+
prefix.append(line)
|
|
187
|
+
stripped = line.strip()
|
|
188
|
+
if not stripped:
|
|
189
|
+
continue
|
|
190
|
+
if stripped.startswith("#"):
|
|
191
|
+
if stripped.startswith("#separator"):
|
|
192
|
+
has_separator = True
|
|
193
|
+
continue
|
|
194
|
+
# First non-blank, non-comment line decides NDJSON.
|
|
195
|
+
is_ndjson = stripped.startswith("{")
|
|
196
|
+
break
|
|
197
|
+
rest = itertools.chain(prefix, line_iter)
|
|
198
|
+
if is_ndjson:
|
|
199
|
+
return pd.DataFrame(_zeek_records_from_lines(rest))
|
|
200
|
+
if has_separator:
|
|
201
|
+
return _parse_tsv_log(rest)
|
|
202
|
+
# Header-only / empty / non-Zeek stub — bare empty frame (preserved).
|
|
203
|
+
return pd.DataFrame()
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def _parse_ndjson_file(path: Path, show_progress: bool = True) -> pd.DataFrame:
    """Read one Zeek NDJSON file into an unfiltered, Zeek-native DataFrame.

    The file is opened and progress-wrapped through the ``_loader`` package
    facade (``_open_log`` / ``progress``) so that test monkeypatches of
    those names take effect here.

    Args:
        path: Log file to read.
        show_progress: Forwarded to the progress wrapper.

    Returns:
        One row per record accepted by ``_zeek_records_from_lines``; no
        time-window filtering is applied.
    """
    with _loader._open_log(path) as handle:
        wrapped = _loader.progress(
            handle,
            desc=f"loaded {path.name}",
            show_progress=show_progress,
            unit=" lines",
        )
        # Must be consumed while the handle is still open.
        rows = _zeek_records_from_lines(wrapped)
    return pd.DataFrame(rows)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def _parse_lines(lines: list[str]) -> list[dict[str, Any]]:
|
|
220
|
+
"""Parse NDJSON lines, skipping blanks and Zeek comment headers."""
|
|
221
|
+
result: list[dict[str, Any]] = []
|
|
222
|
+
for line in lines:
|
|
223
|
+
line = line.strip()
|
|
224
|
+
if not line or line.startswith("#"):
|
|
225
|
+
continue
|
|
226
|
+
try:
|
|
227
|
+
result.append(json.loads(line))
|
|
228
|
+
except json.JSONDecodeError:
|
|
229
|
+
pass
|
|
230
|
+
return result
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def load_zeek_log(
    path: Path,
    since: datetime | None = None,
    until: datetime | None = None,
    show_progress: bool = True,
) -> pd.DataFrame:
    """Load one Zeek NDJSON log file and restrict it to a time window.

    Reading is delegated to ``_parse_ndjson_file``, which opens the file
    through ``_open_log`` (per the original docstring this covers plain and
    gzip-compressed files). The window is applied by ``_apply_ts_filter``
    on the ``ts`` field.

    Args:
        path: Log file to read.
        since: Lower bound of the window, or ``None`` for no lower bound.
        until: Upper bound of the window, or ``None`` for no upper bound.
        show_progress: Forwarded to the progress wrapper.

    Returns:
        The windowed DataFrame.
    """
    unfiltered = _parse_ndjson_file(path, show_progress=show_progress)
    return _apply_ts_filter(unfiltered, since, until)
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _events_from_whole_document(
|
|
250
|
+
text: str,
|
|
251
|
+
path: Path,
|
|
252
|
+
_warnings: list[str] | None,
|
|
253
|
+
) -> list[dict]:
|
|
254
|
+
"""Parse ``text`` as a single JSON document and extract its event list.
|
|
255
|
+
|
|
256
|
+
Accepts three shapes: ``{"Records": [...]}`` envelope, a bare ``[...]`` list,
|
|
257
|
+
or a bare ``{...}`` single event. Total parse failure or any other shape
|
|
258
|
+
appends a warning and returns an empty list.
|
|
259
|
+
"""
|
|
260
|
+
try:
|
|
261
|
+
doc = json.loads(text)
|
|
262
|
+
except json.JSONDecodeError:
|
|
263
|
+
if _warnings is not None:
|
|
264
|
+
_warnings.append(_cloudtrail_parse_warning(path))
|
|
265
|
+
return []
|
|
266
|
+
|
|
267
|
+
if isinstance(doc, dict):
|
|
268
|
+
records = doc.get("Records")
|
|
269
|
+
if isinstance(records, list):
|
|
270
|
+
return [e for e in records if isinstance(e, dict)]
|
|
271
|
+
# Bare single-event dict.
|
|
272
|
+
return [doc]
|
|
273
|
+
if isinstance(doc, list):
|
|
274
|
+
return [e for e in doc if isinstance(e, dict)]
|
|
275
|
+
|
|
276
|
+
if _warnings is not None:
|
|
277
|
+
_warnings.append(_cloudtrail_parse_warning(path))
|
|
278
|
+
return []
|
|
279
|
+
|
|
280
|
+
|
|
281
|
+
# Zeek normalization lives in loghunter.parsers.zeek; loader keeps dispatch here.
|
|
282
|
+
# Covers Zeek NDJSON formats only. Syslog is handled by load_syslog() — see parsers/syslog.py.
|
|
283
|
+
|
|
284
|
+
# Map from log type → normalizer function. Add an entry here (alongside a new
|
|
285
|
+
# _normalize_*_df function) when implementing each new Zeek log source.
|
|
286
|
+
# Dispatch table used by ``_zeek_normalize``: the key is the log type derived
# from the load pattern, the value is the normalizer applied to that frame.
# Log types without an entry are returned un-normalized.
_NORMALIZER_MAP: dict[str, Callable[[pd.DataFrame], pd.DataFrame]] = {
    "conn": _normalize_conn_df,
    "dns": _normalize_dns_df,
    "syslog": _normalize_zeek_syslog_df,
}
|
|
291
|
+
|
|
292
|
+
|
|
293
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
294
|
+
# run_load — the uniform load pipeline + per-source SourceLoader strategies
|
|
295
|
+
#
|
|
296
|
+
# Every detector source-family load flows through ``run_load``: progress
|
|
297
|
+
# wrapping, coverage tracking, default-window filtering, verbose-gated
|
|
298
|
+
# wrong-family skips, and read-corruption handling are written ONCE. A new
|
|
299
|
+
# format = one ``SourceLoader`` in ``_SOURCE_LOADERS`` — it inherits the
|
|
300
|
+
# treatment by construction and cannot diverge by happenstance.
|
|
301
|
+
#
|
|
302
|
+
# Stream strategies (syslog / pihole / cloudtrail) yield canonical row dicts;
|
|
303
|
+
# frame strategies (zeek) return a pre-window DataFrame that the pipeline
|
|
304
|
+
# windows + normalises. NaN-ts policy is declared per strategy:
|
|
305
|
+
# ``ts_policy="drop"`` (zeek + cloudtrail — unparseable timestamps are not
|
|
306
|
+
# trustworthy data) vs ``ts_policy="keep"`` (syslog + pihole — RFC 3164's
|
|
307
|
+
# year-guess can lose a timestamp without making the LINE less useful).
|
|
308
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
def _zeek_strategy_parse(line_iter, *, path, warnings):  # noqa: ARG001 - uniform contract
    """Frame-mode ``parse`` hook for the Zeek strategy.

    Hands the line iterator to ``_zeek_parse_from_lines``, which performs
    the prefix-preserving NDJSON-vs-TSV dispatch.

    ``path`` and ``warnings`` exist only to satisfy the uniform strategy
    signature and are not consulted: content this parser cannot make sense
    of comes back as a bare DataFrame rather than as a warning.
    """
    frame = _zeek_parse_from_lines(line_iter)
    return frame
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _zeek_normalize(df: pd.DataFrame, pattern: str) -> pd.DataFrame:
    """Run the per-log-type Zeek normalizer registered for ``pattern``.

    Args:
        df: Assembled Zeek frame.
        pattern: Load pattern; ``_log_type`` maps it to a log type.

    Returns:
        The normalizer's output, or ``df`` unchanged when
        ``_NORMALIZER_MAP`` has no entry for the log type.
    """
    normalizer = _NORMALIZER_MAP.get(_log_type(pattern))
    if normalizer is None:
        return df
    return normalizer(df)
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def _syslog_strategy_parse(line_iter, *, path, warnings):  # noqa: ARG001
    """Stream-mode ``parse`` hook for syslog: yield canonical row dicts.

    Host selection: the host parsed from the line is used unless the parser
    reported ``"unknown"``, in which case the filename stem
    (``_stem_hostname``) stands in. This keeps per-host files working while
    preferring in-content hosts everywhere else.

    ``ts`` is emitted as float epoch seconds, or ``float('nan')`` when the
    line carried no parseable timestamp. Such rows are still yielded; the
    KEEP policy is applied by the pipeline.

    Args:
        line_iter: Progress-wrapped iterator of raw lines.
        path: Source file; only its name is used (for the fallback host).
        warnings: Unused; part of the uniform strategy contract.

    Yields:
        Dicts with the keys ``ts``, ``host``, ``program``, ``raw`` and
        ``message``.
    """
    fallback_host = _stem_hostname(path.name)
    for raw_line in line_iter:
        parsed = _parse_syslog_line(raw_line.rstrip("\n"))
        if parsed is None:
            # The parser rejected the line outright.
            continue
        content_host = parsed["host"]
        when = parsed["ts"]
        yield {
            "ts": float("nan") if when is None else when.timestamp(),
            "host": fallback_host if content_host == "unknown" else content_host,
            "program": parsed["program"],
            "raw": parsed["raw"],
            "message": parsed["message"],
        }
|
|
356
|
+
|
|
357
|
+
|
|
358
|
+
def _pihole_strategy_parse(line_iter, *, path, warnings):  # noqa: ARG001
    """Stream-mode ``parse`` hook for Pi-hole (dnsmasq) logs.

    Every row's ``host`` is overwritten with the filename stem, since
    Pi-hole logs are per-host. ``ts`` is converted to float epoch seconds,
    or ``float('nan')`` when the parser produced no timestamp; such rows
    are still yielded (KEEP policy, applied by the pipeline).

    Args:
        line_iter: Progress-wrapped iterator of raw lines.
        path: Source file; only its name is used (for the host).
        warnings: Unused; part of the uniform strategy contract.

    Yields:
        The parser's record dict, updated in place with ``host`` and the
        float ``ts``.
    """
    host = _stem_hostname(path.name)
    # Lazy: each line is parsed only when the consumer asks for the next row.
    parsed_lines = (_parse_dnsmasq_line(raw.rstrip("\n")) for raw in line_iter)
    for entry in parsed_lines:
        if entry is None:
            continue
        entry["host"] = host
        when = entry["ts"]
        entry["ts"] = float("nan") if when is None else when.timestamp()
        yield entry
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
def _cloudtrail_strategy_parse(line_iter, *, path, warnings):
|
|
376
|
+
"""CloudTrail stream parse: SINGLE-iterator sniff + dispatch yielding rows.
|
|
377
|
+
|
|
378
|
+
The single-iterator invariant (preserved when the per-file CloudTrail reader
|
|
379
|
+
folded into this strategy): the pipeline wraps ``fh`` → ``line_iter`` once;
|
|
380
|
+
EVERY branch (first-line sniff, NDJSON stream, envelope/pretty multi-line,
|
|
381
|
+
bare-list) consumes from the SAME wrapped iterator so the progress bar's
|
|
382
|
+
line count reflects actual INPUT lines, never re-reading ``fh``.
|
|
383
|
+
|
|
384
|
+
Content-parse failures (malformed JSON) append
|
|
385
|
+
``_cloudtrail_parse_warning(path)`` to the ``warnings`` sink — the
|
|
386
|
+
content-parse-vs-read-corruption split preserved (read-corruption stays
|
|
387
|
+
on the pipeline's ``_zeek_file_read_warning`` rail).
|
|
388
|
+
"""
|
|
389
|
+
first_line = None
|
|
390
|
+
for line in line_iter:
|
|
391
|
+
if line.strip():
|
|
392
|
+
first_line = line
|
|
393
|
+
break
|
|
394
|
+
if first_line is None:
|
|
395
|
+
return
|
|
396
|
+
|
|
397
|
+
try:
|
|
398
|
+
first_value = json.loads(first_line)
|
|
399
|
+
except json.JSONDecodeError:
|
|
400
|
+
# First line is a fragment of a pretty-printed multi-line document.
|
|
401
|
+
full_text = first_line + "".join(line_iter)
|
|
402
|
+
for event in _events_from_whole_document(full_text, path, warnings):
|
|
403
|
+
row = _parse_cloudtrail_event(event)
|
|
404
|
+
if row is not None:
|
|
405
|
+
yield row
|
|
406
|
+
return
|
|
407
|
+
|
|
408
|
+
if isinstance(first_value, dict):
|
|
409
|
+
if "Records" in first_value:
|
|
410
|
+
# Envelope: accumulate rest from the same wrapped iterator.
|
|
411
|
+
full_text = first_line + "".join(line_iter)
|
|
412
|
+
for event in _events_from_whole_document(full_text, path, warnings):
|
|
413
|
+
row = _parse_cloudtrail_event(event)
|
|
414
|
+
if row is not None:
|
|
415
|
+
yield row
|
|
416
|
+
return
|
|
417
|
+
# NDJSON: seed events with this first dict (do NOT drop it), then
|
|
418
|
+
# stream the rest, silently skipping undecodable lines.
|
|
419
|
+
row = _parse_cloudtrail_event(first_value)
|
|
420
|
+
if row is not None:
|
|
421
|
+
yield row
|
|
422
|
+
for line in line_iter:
|
|
423
|
+
line = line.strip()
|
|
424
|
+
if not line:
|
|
425
|
+
continue
|
|
426
|
+
try:
|
|
427
|
+
evt = json.loads(line)
|
|
428
|
+
except json.JSONDecodeError:
|
|
429
|
+
continue
|
|
430
|
+
if isinstance(evt, dict):
|
|
431
|
+
row = _parse_cloudtrail_event(evt)
|
|
432
|
+
if row is not None:
|
|
433
|
+
yield row
|
|
434
|
+
return
|
|
435
|
+
|
|
436
|
+
if isinstance(first_value, list):
|
|
437
|
+
# Bare-list one-line document. Any trailing content is malformed; the
|
|
438
|
+
# JSON value is the document.
|
|
439
|
+
for e in first_value:
|
|
440
|
+
if isinstance(e, dict):
|
|
441
|
+
row = _parse_cloudtrail_event(e)
|
|
442
|
+
if row is not None:
|
|
443
|
+
yield row
|
|
444
|
+
return
|
|
445
|
+
|
|
446
|
+
# First-line is a JSON primitive — not a valid CloudTrail event shape.
|
|
447
|
+
# Treat as a parse failure.
|
|
448
|
+
if warnings is not None:
|
|
449
|
+
warnings.append(_cloudtrail_parse_warning(path))
|
|
450
|
+
return
|
|
451
|
+
|
|
452
|
+
|
|
453
|
+
def _syslog_should_skip(path: Path) -> str | None:
    """Wrong-family guard for ``syslog_dir``: reject NDJSON and Zeek TSV.

    Syslog deliberately guards against both shapes. A Zeek NDJSON
    ``syslog.log`` dropped here would be garbled by the RFC 3164 parser,
    and a ``#separator`` directive among the first lines is a strong signal
    of Zeek TSV.

    Args:
        path: Candidate file.

    Returns:
        A skip message (the pipeline prints it only under
        ``verbose=True``), or ``None`` when the file should be loaded.
    """
    if _is_ndjson(path):
        return f"load_syslog: skipping {path.name} — looks like NDJSON, not syslog"

    peek_limit = 8  # the directive is only looked for in the first lines
    with _loader._open_log(path) as fh:
        head = list(itertools.islice(fh, peek_limit))

    for ln in head:
        if ln.startswith("#separator"):
            return (
                f"load_syslog: skipping {path.name} — looks like Zeek TSV, "
                "not flat syslog (Zeek logs belong in zeek_dir)"
            )
    return None
|
|
471
|
+
|
|
472
|
+
|
|
473
|
+
def _pihole_should_skip(path: Path) -> str | None:
    """Wrong-family guard for ``pihole_dir``: reject NDJSON only.

    Unlike the syslog guard, this one does not check for Zeek TSV. The
    asymmetry is intentional: no real-world case of a Zeek TSV landing in a
    ``pihole_dir`` is known, so adding a TSV skip would change behaviour.

    Args:
        path: Candidate file.

    Returns:
        A skip message when the file looks like NDJSON, otherwise ``None``.
    """
    if not _is_ndjson(path):
        return None
    return f"load_pihole: skipping {path.name} — looks like NDJSON, not dnsmasq"
|
|
483
|
+
|
|
484
|
+
|
|
485
|
+
def run_load(
    strategy: SourceLoader,
    files: list[Path],
    pattern: str,
    since: datetime | None,
    until: datetime | None,
    *,
    show_progress: bool = True,
    verbose: bool = False,
    _warnings: list[str] | None = None,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """The uniform load pipeline. Owns progress wrap, coverage tracking,
    windowing, corruption rail, verbose-gated wrong-family skip.

    Does NOT own byte accounting — ``load_required_logs`` sums ``stat`` over
    the deduped ``files`` in its uniform loop.

    Stream mode (syslog / pihole / cloudtrail):
      Strategy ``parse`` yields canonical row dicts; the pipeline applies the
      ts policy + window per row and assembles a column-stable DataFrame
      (``strategy.columns``) on the way out.

    Frame mode (zeek):
      Strategy ``parse`` returns a pre-filter DataFrame; the pipeline
      observes the pre-filter frame, windows via ``_apply_ts_filter`` (which
      drops NaN-ts then trims — that IS the drop policy), and optionally
      normalises post-concat. Empty paths return bare ``pd.DataFrame()`` —
      Zeek's empty shape is preserved exactly (no forced columns).

    Coverage: ``_coverage["coverage"]`` is written iff the tracker returns a
    non-``None`` value for the final frame's emptiness. A populated load
    short-circuits via ``mark_kept`` and writes nothing.

    Args:
        strategy: the source-family strategy driving parse / skip / normalise.
        files: already-discovered files to read, in load order.
        pattern: detector glob, forwarded to ``strategy.normalize`` only.
        since: inclusive lower window bound, or ``None`` for unbounded.
        until: inclusive upper window bound, or ``None`` for unbounded.
        show_progress: forwarded to the progress wrapper.
        verbose: when true, wrong-family skip messages go to stderr.
        _warnings: optional sink for per-file read / parse warnings.
        _coverage: optional out-param dict receiving the ``"coverage"`` key.

    Returns:
        The loaded DataFrame (column-stable in stream mode, bare in frame
        mode when empty).
    """
    # Function-scope import: zlib is needed only to name its error class on
    # the corruption rail. Invalid gzip data can raise zlib.error, which is
    # NOT an OSError subclass and would otherwise escape as a raw traceback.
    import zlib

    tracker = CoverageTracker()

    def _publish_coverage(frame_empty: bool) -> None:
        # Write the coverage disclosure only when a sink was supplied and the
        # tracker actually has something to report.
        if _coverage is None:
            return
        sc = tracker.coverage(frame_empty)
        if sc is not None:
            _coverage["coverage"] = sc

    if not files:
        _publish_coverage(True)
        if strategy.mode == "stream":
            return pd.DataFrame(columns=strategy.columns)
        return pd.DataFrame()

    since_ts = since.timestamp() if since else None
    until_ts = until.timestamp() if until else None

    rows: list[dict] = []
    frames: list[pd.DataFrame] = []

    for path in files:
        file_rows: list[dict] = []
        try:
            # should_skip is inside the try so a corrupt compressed file
            # caught during its head sniff lands on the read-corruption rail,
            # not a raw traceback.
            if strategy.should_skip is not None:
                skip_msg = strategy.should_skip(path)
                if skip_msg is not None:
                    # Quiet default — print only under verbose.
                    # ``note_file_read`` is NOT fired for a skipped file so
                    # the coverage disclosure doesn't mislead.
                    if verbose:
                        print(skip_msg, file=sys.stderr)
                    continue
            tracker.note_file_read()
            with _loader._open_log(path) as fh:
                line_iter = _loader.progress(
                    fh,
                    desc=f"loaded {path.name}",
                    show_progress=show_progress,
                    unit=strategy.unit,
                )
                if strategy.mode == "stream":
                    for row in strategy.parse(
                        line_iter, path=path, warnings=_warnings
                    ):
                        ts = row["ts"]
                        if _missing_ts(ts):
                            tracker.observe(None)
                            if strategy.ts_policy == "drop":
                                continue
                            # keep policy — NaN-ts row bypasses the window
                            # (an unfilterable line stays in the frame).
                        else:
                            tracker.observe(ts)
                            if since_ts is not None and ts < since_ts:
                                continue
                            if until_ts is not None and ts > until_ts:
                                continue
                        # Buffer only; the row is not "kept" until the whole
                        # file has been read without corruption (see below).
                        file_rows.append(row)
                else:  # frame mode
                    pre = strategy.parse(
                        line_iter, path=path, warnings=_warnings
                    )
                    tracker.observe_frame(pre)
                    post = _apply_ts_filter(pre, since, until)
                    if not post.empty:
                        frames.append(post)
                        tracker.mark_kept()
        except (
            EOFError,
            gzip.BadGzipFile,
            lzma.LZMAError,
            zlib.error,
            OSError,
        ) as exc:
            # ``_open_log`` returns a lazy reader; corruption may surface only
            # at the trailer after many valid-looking lines. Discard the
            # per-file buffer so the warning is honest (a "skipped with
            # warning" file MUST contribute zero rows), and skip with the
            # standard read-warning. Distinct from CloudTrail's content-parse
            # warning rail (``_cloudtrail_parse_warning``).
            if _warnings is not None:
                _warnings.append(_zeek_file_read_warning(path, exc))
            continue
        if strategy.mode == "stream" and file_rows:
            rows.extend(file_rows)
            # Mark the load as populated only once this file's rows are
            # committed, so a file discarded by the corruption rail cannot
            # suppress the empty-frame coverage disclosure.
            tracker.mark_kept()

    if strategy.mode == "stream":
        if not rows:
            _publish_coverage(True)
            return pd.DataFrame(columns=strategy.columns)
        _publish_coverage(False)
        return pd.DataFrame(rows, columns=strategy.columns)

    # Frame mode (Zeek): bare empty frame, no forced columns. Zeek's non-empty
    # columns come from parse + normalize.
    result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if strategy.normalize is not None and not result.empty:
        result = strategy.normalize(result, pattern)
    _publish_coverage(result.empty)
    return result
|
|
624
|
+
|
|
625
|
+
|
|
626
|
+
# Registry of per-family load strategies, keyed by source-family name. Adding
# a new format means adding ONE entry here; the entry then inherits the whole
# run_load pipeline (progress, coverage, windowing, corruption handling,
# verbose-gated skip) by construction.
#
# Why each family picks its ts_policy:
#   drop (zeek, cloudtrail) — an unparseable Zeek timestamp / CloudTrail
#       eventTime is not trustworthy data, so the row goes before windowing.
#   keep (syslog, pihole)   — RFC 3164's year-guess can lose a timestamp
#       without making the LINE less useful (e.g. for drain3 templating /
#       reboot detection), so the row stays and bypasses the window.
_SOURCE_LOADERS: dict[str, SourceLoader] = {
    "zeek_dir": SourceLoader(
        mode="frame",
        ts_policy="drop",
        discover=discover_zeek_files,
        parse=_zeek_strategy_parse,
        normalize=_zeek_normalize,
        should_skip=None,
        columns=None,
        # Dated directories give a precise window with no trim; flat or mixed
        # layouts load in full and are trimmed afterwards.
        resolve_window=_zeek_resolve_window,
    ),
    "syslog_dir": SourceLoader(
        mode="stream",
        ts_policy="keep",
        # Discovery is content-gated. The lambda exists only to adapt the
        # four-argument strategy signature; _discover_syslog_files remains
        # the single discovery body.
        discover=lambda p, pattern, since, until: _discover_syslog_files(p),
        parse=_syslog_strategy_parse,
        normalize=None,
        should_skip=_syslog_should_skip,
        columns=_SYSLOG_COLUMNS,
        window_select=_rotation_windowed_files,
        # Rotation candidates are peeked to get a conservative (floor, None)
        # window, followed by a post-load trim.
        resolve_window=_flat_resolve_window,
    ),
    "pihole_dir": SourceLoader(
        mode="stream",
        ts_policy="keep",
        discover=lambda p, pattern, since, until: _syslog_files(p, pattern),
        parse=_pihole_strategy_parse,
        normalize=None,
        should_skip=_pihole_should_skip,
        columns=_PIHOLE_COLUMNS,
        window_select=_rotation_windowed_files,
        resolve_window=_flat_resolve_window,
    ),
    "cloudtrail_dir": SourceLoader(
        mode="stream",
        ts_policy="drop",
        discover=lambda p, pattern, since, until: discover_cloudtrail_files(p),
        parse=_cloudtrail_strategy_parse,
        normalize=None,
        should_skip=None,
        columns=_CLOUDTRAIL_COLUMNS,
        # aws is baseline-relative, so CloudTrail opts OUT of the auto-default
        # window; an explicit --since/--until still narrows it.
        default_window_eligible=False,
    ),
}
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
def load_logs(
    directory: Path,
    pattern: str,
    since: datetime | None = None,
    until: datetime | None = None,
    _files: list[Path] | None = None,
    _warnings: list[str] | None = None,
    show_progress: bool = True,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """Load every matching Zeek log under ``directory`` into one DataFrame.

    A thin adapter that hands the ``zeek_dir`` strategy to ``run_load``.

    Args:
        directory: where ``discover_zeek_files`` searches when ``_files`` is
            not supplied.
        pattern: Zeek log glob, used for discovery and passed to ``run_load``.
        since: lower window bound, or ``None``.
        until: upper window bound, or ``None``.
        _files: pre-resolved file list. When given (even if empty) discovery
            is skipped entirely; the digest single-file bypass and the
            multi-positional dedupe rely on this.
        _warnings: optional sink for per-file operational read failures.
        show_progress: forwarded to ``run_load``.
        _coverage: optional out-param; ``run_load`` may write a
            ``"coverage"`` entry describing the pre-window read when the
            returned frame is empty.

    The signature is kept stable for existing callers. Wrong-family skip
    output is always quiet here (``verbose=False``).
    """
    zeek_strategy = _SOURCE_LOADERS["zeek_dir"]
    if _files is None:
        targets = discover_zeek_files(directory, pattern, since, until)
    else:
        targets = _files
    return run_load(
        zeek_strategy,
        targets,
        pattern,
        since,
        until,
        verbose=False,
        show_progress=show_progress,
        _coverage=_coverage,
        _warnings=_warnings,
    )
|
|
722
|
+
|
|
723
|
+
|
|
724
|
+
def load_syslog(
    directory: Path,
    since: datetime | None = None,
    until: datetime | None = None,
    verbose: bool = False,
    _files: list[Path] | None = None,
    _warnings: list[str] | None = None,
    show_progress: bool = True,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """Load syslog files into a column-stable DataFrame.

    A thin adapter that hands the ``syslog_dir`` strategy to ``run_load``.
    ``directory`` may be a directory (per-host files / flat file) or a single
    file. When ``_files`` is supplied, discovery is skipped and that list is
    loaded as given.

    Wrong-family files (NDJSON, Zeek TSV) are skipped through the strategy's
    ``should_skip``; the skip message reaches stderr ONLY when
    ``verbose=True``. Rows with a missing timestamp are KEPT and bypass the
    window. When no rows survive, the result is an empty frame carrying
    ``_SYSLOG_COLUMNS``.
    """
    syslog_strategy = _SOURCE_LOADERS["syslog_dir"]
    if _files is None:
        targets = _discover_syslog_files(directory)
    else:
        targets = _files
    # The syslog family has no detector glob, hence the empty pattern.
    return run_load(
        syslog_strategy,
        targets,
        "",
        since,
        until,
        verbose=verbose,
        show_progress=show_progress,
        _coverage=_coverage,
        _warnings=_warnings,
    )
|
|
750
|
+
|
|
751
|
+
|
|
752
|
+
def load_pihole(
    directory: Path,
    since: datetime | None = None,
    until: datetime | None = None,
    verbose: bool = False,
    _files: list[Path] | None = None,
    _warnings: list[str] | None = None,
    show_progress: bool = True,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """Load dnsmasq/Pi-hole log files into a column-stable DataFrame.

    A thin adapter that hands the ``pihole_dir`` strategy to ``run_load``.
    Without ``_files``, candidates are found with the ``pihole*.log*`` glob
    under ``directory``; with ``_files``, that list is loaded as given.

    Wrong-family NDJSON files are skipped; Zeek TSV is NOT (the Pi-hole
    guard's asymmetry is preserved). Rows with a missing timestamp are KEPT
    and bypass the window. When no rows survive, the result is an empty frame
    carrying ``_PIHOLE_COLUMNS``.
    """
    pihole_strategy = _SOURCE_LOADERS["pihole_dir"]
    if _files is None:
        targets = _syslog_files(directory, "pihole*.log*")
    else:
        targets = _files
    return run_load(
        pihole_strategy,
        targets,
        "",
        since,
        until,
        verbose=verbose,
        show_progress=show_progress,
        _coverage=_coverage,
        _warnings=_warnings,
    )
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
def load_cloudtrail(
    path: Path,
    since: datetime | None = None,
    until: datetime | None = None,
    verbose: bool = False,
    _files: list[Path] | None = None,
    _warnings: list[str] | None = None,
    show_progress: bool = True,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """Load CloudTrail event files into a canonical-schema DataFrame.

    A thin adapter that hands the ``cloudtrail_dir`` strategy to
    ``run_load``. Without ``_files``, ``discover_cloudtrail_files(path)``
    supplies the candidates.

    The wire-shape sniff (NDJSON / envelope / bare-list) lives in the
    strategy's ``parse``. Events whose ``eventTime`` cannot be parsed are
    DROPPED before windowing. Two separate warning rails feed ``_warnings``:
    read corruption (``_zeek_file_read_warning``; the file is skipped) and
    malformed-JSON content (``_cloudtrail_parse_warning``). When no rows
    survive, the result is an empty frame carrying ``_CLOUDTRAIL_COLUMNS``.

    Note: ``verbose`` exists for signature compatibility. It is forwarded but
    has no effect because this strategy declares no ``should_skip``;
    CloudTrail's per-file content warnings go to ``_warnings``, not stderr.
    """
    cloudtrail_strategy = _SOURCE_LOADERS["cloudtrail_dir"]
    if _files is None:
        targets = discover_cloudtrail_files(path)
    else:
        targets = _files
    return run_load(
        cloudtrail_strategy,
        targets,
        "",
        since,
        until,
        verbose=verbose,
        show_progress=show_progress,
        _coverage=_coverage,
        _warnings=_warnings,
    )
|
|
811
|
+
|
|
812
|
+
|
|
813
|
+
def load_required_logs(
    needed_logs: dict[str, str],
    source_dirs: dict[str, list[Path]],
    since: datetime | None = None,
    until: datetime | None = None,
    verbose: bool = False,
    source_windows: dict[str, tuple[datetime | None, datetime | None]] | None = None,
    show_progress: bool = True,
) -> LoadResult:
    """Load every pattern a run plan needs; return the data plus metadata.

    Args:
        needed_logs: pattern → source-family map from the run plan.
        source_dirs: source family (``zeek_dir`` / ``syslog_dir`` /
            ``pihole_dir`` / ``cloudtrail_dir``) → LIST of inputs. Each input
            is a directory or an explicit file, contributed by positionals,
            the ``--<family>-dir`` flag, and config fallback.
        since: default lower window bound.
        until: default upper window bound.
        verbose: forwarded to window selection and to ``run_load``.
        source_windows: per-source ``(since, until)`` override. Lets the
            runner window Zeek loads only while leaving syslog/pihole
            unwindowed when no explicit timeframe was given.
        show_progress: forwarded to ``run_load``.

    For each family the existing per-input discovery runs over its inputs,
    results are unioned and deduped via ``_union_dedupe``, and the union is
    loaded. A family with no configured inputs yields a warning instead of a
    load.

    Raises:
        ValueError: a pattern names a source key with no registered loader.
    """
    loaded: dict[str, pd.DataFrame] = {}
    counts: dict[str, int] = {}
    notes: list[str] = []
    total_bytes = 0
    coverage_by_pattern: dict[str, SourceCoverage] = {}
    skips_by_pattern: dict[str, RotationSkipInfo] = {}
    overrides = source_windows if source_windows else {}

    for pattern, source in needed_logs.items():
        inputs = source_dirs.get(source) or []
        if not inputs:
            notes.append(f"{source} not configured — {pattern} not loaded")
            continue

        strategy = _SOURCE_LOADERS.get(source)
        if strategy is None:
            raise ValueError(
                f"unknown source key {source!r} for pattern {pattern!r} — "
                "no loader is registered for it"
            )

        s_since, s_until = overrides.get(source, (since, until))
        rotation_info: RotationSkipInfo | None = None

        if strategy.window_select is None:
            # Zeek / CloudTrail: discovery runs over EVERY input, file or
            # directory. The per-file pattern match inside discovery is what
            # routes multi-positional Zeek inputs to the right pattern, so it
            # must not be bypassed.
            discovered = [
                strategy.discover(p, pattern, s_since, s_until) for p in inputs
            ]
            files = _union_dedupe(discovered)
        else:
            # Flat families (syslog / pihole): only directory-discovered
            # candidates go through the rotation peek-prune. Files the
            # operator named explicitly are set aside — kept out of both the
            # windowing input and the skip count — and are always loaded.
            named_files = [p for p in inputs if p.is_file()]
            dirs = [p for p in inputs if p.is_dir()]
            per_dir = [
                strategy.discover(d, pattern, s_since, s_until) for d in dirs
            ]
            candidates = _union_dedupe(per_dir)

            # Silent-miss disclosure, which differs by source. Syslog
            # discovery is content-gated, so an empty result from a directory
            # that holds files means nothing read as RFC 3164; for pihole it
            # means a filename-pattern mismatch. Neither may pass silently.
            came_up_empty = bool(dirs) and not candidates
            if came_up_empty and source == "syslog_dir":
                # Presence check only (no sniff, no `*.log*` test) so a
                # directory of extensionless files is still disclosed. The
                # warning lists directory paths, never per-file names.
                nonempty_dirs = [d for d in dirs if _dir_has_regular_files(d)]
                if nonempty_dirs:
                    listed = ", ".join(map(str, nonempty_dirs))
                    notes.append(
                        f"syslog_dir: nothing in {listed} looks like syslog "
                        f"(RFC 3164) — nothing loaded (check the path)."
                    )
            elif came_up_empty:
                # Stop at the first directory that has any .log file.
                for d in dirs:
                    if _syslog_files(d, "*.log*"):
                        notes.append(
                            f"{source}: directory has .log files but none match "
                            f"{pattern!r} — not loaded (check the log file naming)."
                        )
                        break

            named_resolved = {_safe_resolve(p) for p in named_files}
            windowable = [
                c for c in candidates if _safe_resolve(c) not in named_resolved
            ]
            if windowable and (s_since or s_until):
                chosen, rotation_info = strategy.window_select(
                    windowable, s_since, s_until, verbose=verbose
                )
            else:
                chosen = windowable
            files = _union_dedupe([named_files, chosen])

        # Byte accounting lives here, not in run_load.
        total_bytes += sum(f.stat().st_size for f in files if f.is_file())

        cov_out: dict = {}
        frame = run_load(
            strategy,
            files,
            pattern,
            s_since,
            s_until,
            show_progress=show_progress,
            verbose=verbose,
            _warnings=notes,
            _coverage=cov_out,
        )

        if rotation_info is not None:
            skips_by_pattern[pattern] = rotation_info

        loaded[pattern] = frame
        if not frame.empty:
            counts[pattern] = len(frame)

        if "coverage" in cov_out:
            coverage_by_pattern[pattern] = cov_out["coverage"]

        schema_note = _schema_warning(pattern, frame)
        if schema_note:
            notes.append(schema_note)

    return LoadResult(
        logs=loaded,
        record_counts=counts,
        data_window=_data_window(loaded),
        warnings=notes,
        data_size_bytes=total_bytes,
        coverage=coverage_by_pattern,
        rotation_skips=skips_by_pattern,
    )
|
|
947
|
+
|
|
948
|
+
|
|
949
|
+
def resolve_load_windows(
    needed_sources: dict[str, str],
    source_dirs: dict[str, list[Path]],
    default_spec: str,
    *,
    since: datetime | None,
    until: datetime | None,
    load_all: bool,
) -> list[LoadWindow]:
    """Turn the universal default window into ONE ``LoadWindow`` per family.

    This is the single window-policy entry point shared by ``run()`` and
    ``run_digest()``.

    Returns ``[]`` — no default window engaged anywhere — when ``load_all`` is
    set, when either ``since`` or ``until`` was given explicitly, or when
    ``parse_window_span(default_spec)`` yields ``None``.

    Otherwise a :class:`LoadWindow` is produced for each source family that:

    * appears in ``needed_sources`` (the plan's pattern→source map),
    * has inputs in ``source_dirs`` that ``is_bounded`` does not reject,
    * is registered and has ``default_window_eligible`` set (CloudTrail opts
      out because it is baseline-relative).

    The ``(select_window, trim_span)`` pair comes from the family's own
    ``resolve_window``, falling back to :func:`_default_resolve_window` when
    none is declared. ``keep_null`` is true exactly when the strategy's
    ``ts_policy`` is ``"keep"``.

    The resolver receives the FIRST pattern that maps to the family, in
    ``needed_sources`` order, so no source-name branch is needed here.
    """
    if load_all or since is not None or until is not None:
        return []

    span = parse_window_span(default_spec)
    if span is None:
        return []

    # One pass yields both the stable, deduped family order and the first
    # pattern seen for each family.
    first_pattern: dict[str, str] = {}
    for plan_pattern, plan_source in needed_sources.items():
        first_pattern.setdefault(plan_source, plan_pattern)

    resolved: list[LoadWindow] = []
    for source, pattern in first_pattern.items():
        dirs = source_dirs.get(source)
        if not dirs or is_bounded(dirs):
            continue

        strategy = _SOURCE_LOADERS.get(source)
        # Unregistered families and declared opt-outs get no default window
        # and therefore load in full on unqualified runs. Explicit windows
        # never reach this point (see the early return above).
        if strategy is None or not strategy.default_window_eligible:
            continue

        resolver = strategy.resolve_window
        if not resolver:
            resolver = _default_resolve_window
        select_window, trim_span = resolver(strategy, dirs, pattern, span)

        resolved.append(
            LoadWindow(
                source,
                select_window,
                trim_span,
                strategy.ts_policy == "keep",
            )
        )
    return resolved
|