loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Log discovery, decompression, parsing, and timeframe filtering.
|
|
2
|
+
|
|
3
|
+
All file I/O for log data flows through this package. Detectors never open files
|
|
4
|
+
directly. This ``__init__`` re-exports the FULL public + private symbol surface
|
|
5
|
+
that previously lived in the single ``common/loader.py`` module, so every name
|
|
6
|
+
importable or monkeypatchable at ``loghunter.common.loader.<name>`` continues to
|
|
7
|
+
resolve (and stays settable) after the package split.
|
|
8
|
+
|
|
9
|
+
Submodule layout (acyclic; ``io``/``types``/``diagnostics`` are leaves):
|
|
10
|
+
- ``io`` — ``_open_log`` + path-normalization primitives.
|
|
11
|
+
- ``types`` — ``LoadResult`` / coverage / rotation-skip dataclasses, the
|
|
12
|
+
cross-frame window helper, column constants.
|
|
13
|
+
- ``diagnostics`` — log-type + warning/wording helpers.
|
|
14
|
+
- ``sniff`` — the digest recognizer cascade + the syslog content gate.
|
|
15
|
+
- ``windowing`` — ts filter, boundedness, the rotation-peek subsystem.
|
|
16
|
+
- ``discovery`` — per-family file discovery + dated-Zeek default window.
|
|
17
|
+
- ``pipeline`` — ``run_load`` + the ``_SOURCE_LOADERS`` registry + the
|
|
18
|
+
public ``load_*`` shims + registry-policy accessors.
|
|
19
|
+
|
|
20
|
+
Canonical connection record schema
|
|
21
|
+
───────────────────────────────────
|
|
22
|
+
All conn log DataFrames returned by this package use these column names.
|
|
23
|
+
Detectors, runner, and matcher never reference Zeek-specific column names.
|
|
24
|
+
|
|
25
|
+
src — source IP (str)
|
|
26
|
+
dst — destination IP (str)
|
|
27
|
+
port — destination port (int)
|
|
28
|
+
proto — protocol: tcp / udp / icmp (str)
|
|
29
|
+
ts — unix epoch timestamp (float)
|
|
30
|
+
duration — connection duration in seconds (float, nullable)
|
|
31
|
+
bytes — originator bytes (int, nullable)
|
|
32
|
+
conn_state — connection state (str, nullable)
|
|
33
|
+
local_orig — bool (nullable)
|
|
34
|
+
"""
|
|
35
|
+
|
|
36
|
+
from __future__ import annotations
|
|
37
|
+
|
|
38
|
+
# progress is re-exported so loghunter.common.loader.progress both RESOLVES and
|
|
39
|
+
# is SETTABLE — the load pipeline reads it through this package attribute (the
|
|
40
|
+
# call-time facade), so monkeypatching it here takes effect.
|
|
41
|
+
from loghunter.common.display import progress
|
|
42
|
+
|
|
43
|
+
from loghunter.common.loader.io import (
|
|
44
|
+
_open_log,
|
|
45
|
+
_safe_resolve,
|
|
46
|
+
_union_dedupe,
|
|
47
|
+
)
|
|
48
|
+
from loghunter.common.loader.types import (
|
|
49
|
+
_CLOUDTRAIL_COLUMNS,
|
|
50
|
+
_LOG_SUFFIXES,
|
|
51
|
+
_PIHOLE_COLUMNS,
|
|
52
|
+
_SYSLOG_COLUMNS,
|
|
53
|
+
CoverageTracker,
|
|
54
|
+
LoadResult,
|
|
55
|
+
RotationSkipInfo,
|
|
56
|
+
SourceCoverage,
|
|
57
|
+
_data_window,
|
|
58
|
+
)
|
|
59
|
+
from loghunter.common.loader.diagnostics import (
|
|
60
|
+
_cloudtrail_parse_warning,
|
|
61
|
+
_log_type,
|
|
62
|
+
_schema_warning,
|
|
63
|
+
_zeek_file_read_warning,
|
|
64
|
+
)
|
|
65
|
+
from loghunter.common.loader.sniff import (
|
|
66
|
+
_SNIFF_MAX_PEEK,
|
|
67
|
+
_SNIFF_ORIGIN,
|
|
68
|
+
_SNIFF_RECOGNIZERS,
|
|
69
|
+
_SYSLOG_SNIFF_BYTES,
|
|
70
|
+
SniffResult,
|
|
71
|
+
_is_ndjson,
|
|
72
|
+
_looks_like_syslog,
|
|
73
|
+
sniff_format,
|
|
74
|
+
sniff_format_detailed,
|
|
75
|
+
)
|
|
76
|
+
from loghunter.common.loader.windowing import (
|
|
77
|
+
_COMPRESSION_EXTS,
|
|
78
|
+
_DATE_RANK_BASE,
|
|
79
|
+
_EXPORT_WINDOW_RE,
|
|
80
|
+
_ROTATION_NUM_RE,
|
|
81
|
+
LoadWindow,
|
|
82
|
+
_apply_ts_filter,
|
|
83
|
+
_classify_rotation_name,
|
|
84
|
+
_group_order_conflict,
|
|
85
|
+
_missing_ts,
|
|
86
|
+
_peek_first_ts,
|
|
87
|
+
_rotation_base_and_index,
|
|
88
|
+
_rotation_windowed_files,
|
|
89
|
+
_select_group,
|
|
90
|
+
_strip_compression_ext,
|
|
91
|
+
apply_default_window,
|
|
92
|
+
is_bounded,
|
|
93
|
+
is_zeek_bounded,
|
|
94
|
+
)
|
|
95
|
+
from loghunter.common.loader.discovery import (
|
|
96
|
+
_DATE_DIR_RE,
|
|
97
|
+
_default_resolve_window,
|
|
98
|
+
_dir_has_regular_files,
|
|
99
|
+
_discover_syslog_files,
|
|
100
|
+
_file_matches_pattern,
|
|
101
|
+
_flat_default_floor,
|
|
102
|
+
_flat_resolve_window,
|
|
103
|
+
_stem_hostname,
|
|
104
|
+
_syslog_files,
|
|
105
|
+
_zeek_date_subdirs,
|
|
106
|
+
_zeek_dated_window,
|
|
107
|
+
_zeek_resolve_window,
|
|
108
|
+
discover_cloudtrail_files,
|
|
109
|
+
discover_files,
|
|
110
|
+
discover_zeek_files,
|
|
111
|
+
)
|
|
112
|
+
from loghunter.common.loader.pipeline import (
|
|
113
|
+
_NORMALIZER_MAP,
|
|
114
|
+
_SOURCE_LOADERS,
|
|
115
|
+
_cloudtrail_strategy_parse,
|
|
116
|
+
_events_from_whole_document,
|
|
117
|
+
_parse_lines,
|
|
118
|
+
_parse_ndjson_file,
|
|
119
|
+
_pihole_should_skip,
|
|
120
|
+
_pihole_strategy_parse,
|
|
121
|
+
_syslog_should_skip,
|
|
122
|
+
_syslog_strategy_parse,
|
|
123
|
+
_zeek_normalize,
|
|
124
|
+
_zeek_parse_from_lines,
|
|
125
|
+
_zeek_records_from_lines,
|
|
126
|
+
_zeek_strategy_parse,
|
|
127
|
+
SourceLoader,
|
|
128
|
+
load_cloudtrail,
|
|
129
|
+
load_logs,
|
|
130
|
+
load_pihole,
|
|
131
|
+
load_required_logs,
|
|
132
|
+
load_syslog,
|
|
133
|
+
load_zeek_log,
|
|
134
|
+
resolve_load_windows,
|
|
135
|
+
run_load,
|
|
136
|
+
)
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Loader warning/wording + log-type helpers (leaf utility module).
|
|
2
|
+
|
|
3
|
+
The non-dataclass glue factored out of ``types``: ``_log_type`` (glob →
|
|
4
|
+
canonical log type), ``_schema_warning`` (actionable missing-field message),
|
|
5
|
+
``_zeek_file_read_warning`` / ``_cloudtrail_parse_warning`` (privacy-safe
|
|
6
|
+
per-file failure wording). Imports the parser schema constants only.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import gzip
|
|
12
|
+
import lzma
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from loghunter.parsers.zeek import (
|
|
18
|
+
_OPTIONAL_COLUMNS,
|
|
19
|
+
_REQUIRED_COLUMNS,
|
|
20
|
+
)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _log_type(pattern: str) -> str | None:
|
|
24
|
+
"""Return the canonical log type inferred from a loader glob pattern."""
|
|
25
|
+
if pattern.startswith("conn"):
|
|
26
|
+
return "conn"
|
|
27
|
+
if pattern.startswith("dns"):
|
|
28
|
+
return "dns"
|
|
29
|
+
if pattern.startswith("ssl"):
|
|
30
|
+
return "ssl"
|
|
31
|
+
if pattern.startswith("weird"):
|
|
32
|
+
return "weird"
|
|
33
|
+
if pattern.startswith("notice"):
|
|
34
|
+
return "notice"
|
|
35
|
+
if pattern.startswith("auth"):
|
|
36
|
+
return "auth"
|
|
37
|
+
if pattern.startswith("files"):
|
|
38
|
+
return "files"
|
|
39
|
+
if pattern.startswith("pihole"):
|
|
40
|
+
return "pihole"
|
|
41
|
+
if pattern.startswith("syslog"):
|
|
42
|
+
return "syslog"
|
|
43
|
+
return None
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _schema_warning(pattern: str, df: pd.DataFrame) -> str | None:
|
|
47
|
+
"""Return an actionable warning when a loaded DataFrame lacks canonical fields."""
|
|
48
|
+
if df.empty:
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
log_type = _log_type(pattern)
|
|
52
|
+
if log_type is None or log_type not in _REQUIRED_COLUMNS:
|
|
53
|
+
return None
|
|
54
|
+
|
|
55
|
+
opt = _OPTIONAL_COLUMNS.get(log_type, set())
|
|
56
|
+
missing = sorted((_REQUIRED_COLUMNS[log_type] - opt) - set(df.columns))
|
|
57
|
+
if not missing:
|
|
58
|
+
return None
|
|
59
|
+
|
|
60
|
+
if log_type == "conn":
|
|
61
|
+
return (
|
|
62
|
+
f"conn.log fields not found: {', '.join(missing)} — "
|
|
63
|
+
"is this a Zeek conn.log?"
|
|
64
|
+
)
|
|
65
|
+
if log_type == "dns":
|
|
66
|
+
return (
|
|
67
|
+
f"dns.log fields not found: {', '.join(missing)} — "
|
|
68
|
+
"is this a Zeek dns.log?"
|
|
69
|
+
)
|
|
70
|
+
if log_type == "syslog":
|
|
71
|
+
return (
|
|
72
|
+
f"syslog.log fields not found: {', '.join(missing)} — "
|
|
73
|
+
"is this a Zeek syslog.log?"
|
|
74
|
+
)
|
|
75
|
+
return None
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _zeek_file_read_warning(path: Path, exc: BaseException) -> str:
    """Return a privacy-safe warning for a Zeek file that could not be read.

    Only the file's basename and the exception's class name appear in the
    message; the full path and the exception text are never included.

    Args:
        path: The file that failed to load.
        exc: The exception raised while reading it.

    Returns:
        A one-line warning. Exceptions that indicate a truncated or corrupt
        compressed stream get the "incomplete or corrupt" wording; anything
        else is reported by exception class name.
    """
    # Function-scope import: zlib is not among this module's top-level imports.
    import zlib

    # gzip reports a damaged stream as BadGzipFile, EOFError or zlib.error,
    # depending on where the damage sits, so all of them mean "corrupt".
    corrupt_stream_errors = (
        EOFError,
        gzip.BadGzipFile,
        lzma.LZMAError,
        zlib.error,
    )
    if isinstance(exc, corrupt_stream_errors):
        reason = "compressed file is incomplete or corrupt"
    else:
        reason = f"could not be read ({exc.__class__.__name__})"
    return f"{path.name} could not be read — {reason}; skipping"
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _cloudtrail_parse_warning(path: Path) -> str:
    """Return a privacy-safe warning for a CloudTrail file that is not valid JSON.

    Counterpart of ``_zeek_file_read_warning`` for the parse-failure case: the
    file was readable but its contents could not be used as JSON. Only the
    basename of ``path`` is included in the message.
    """
    basename = path.name
    return f"{basename} could not be read — not valid JSON; skipping"
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
"""File discovery for each source family + the dated-Zeek default window.
|
|
2
|
+
|
|
3
|
+
Per-family discovery (Zeek flat/dated, syslog content-gated, pihole glob,
|
|
4
|
+
CloudTrail recursive), the hostname stem helper, and the per-strategy
|
|
5
|
+
default-window resolvers (``_zeek_resolve_window`` / ``_flat_resolve_window`` /
|
|
6
|
+
``_default_resolve_window`` — the ``SourceLoader.resolve_window`` bodies; the
|
|
7
|
+
dated-layout selection ``_zeek_dated_window`` reads ``_zeek_date_subdirs``).
|
|
8
|
+
Imports the rotation sort key + first-ts peek from ``windowing``, the union dedupe
|
|
9
|
+
from ``io``, and the syslog content gate from ``sniff``; none import discovery, so
|
|
10
|
+
the package stays acyclic.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import fnmatch
|
|
16
|
+
import math
|
|
17
|
+
import re
|
|
18
|
+
from datetime import date, datetime, timedelta, timezone
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
from loghunter.common.loader.io import _union_dedupe
|
|
22
|
+
from loghunter.common.loader.sniff import _looks_like_syslog
|
|
23
|
+
from loghunter.common.loader.types import _LOG_SUFFIXES
|
|
24
|
+
from loghunter.common.loader.windowing import _peek_first_ts, _rotation_base_and_index
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def discover_files(directory: Path, pattern: str) -> list[Path]:
    """Glob ``pattern`` inside ``directory`` and return the matches in sorted order."""
    matches = list(directory.glob(pattern))
    matches.sort()
    return matches
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Matches YYYY-MM-DD at the start of a Zeek log-rotation directory name.
# Suffix (e.g. -TSVPRE) is ignored for date extraction; see discover_zeek_files.
_DATE_DIR_RE = re.compile(r"^\d{4}-\d{2}-\d{2}")


def _zeek_date_subdirs(directory: Path) -> list[Path]:
    """List the immediate child directories named like ``YYYY-MM-DD...``.

    Only direct children are inspected (no recursion). Regular files are
    ignored even when their name has the date shape. The result is ordered by
    directory name.
    """
    dated = [
        entry
        for entry in directory.iterdir()
        if entry.is_dir() and _DATE_DIR_RE.match(entry.name)
    ]
    dated.sort(key=lambda entry: entry.name)
    return dated
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _file_matches_pattern(path: Path, pattern: str) -> bool:
    """Tell whether the basename of ``path`` matches ``pattern`` (single-file Zeek mode).

    Parent directories play no part in the comparison; only ``path.name`` is
    tested against the glob.
    """
    basename = path.name
    return fnmatch.fnmatch(basename, pattern)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _zeek_dated_window(
    paths: list[Path], span: timedelta
) -> tuple[datetime, datetime] | None:
    """Compute the default analysis window for a union of Zeek inputs.

    PURELY-DATED predicate: every input is a directory AND every directory has
    at least one ``YYYY-MM-DD``-named subdir. When it holds, every date subdir
    across all inputs is gathered, deduplicated by date, sorted, and the newest
    ``N = ceil(span_days)`` (min 1) distinct dates are selected.

    Sparse archives keep their gaps: subdirs ``[2026-01-01, 2026-01-05]`` with
    a 2-day span select BOTH, giving a Jan 1 → Jan 5 window. A date carried by
    several inputs counts once toward N.

    A subdir whose name has the date shape but is not a real calendar date
    (e.g. ``2026-13-40``) is ignored instead of raising ``ValueError``.

    Args:
        paths: Zeek inputs (files or directories).
        span: Requested window length; converted to a count of distinct dates.

    Returns:
        ``(since, until)`` where ``since`` is 00:00:00 UTC of the earliest
        selected date and ``until`` is 23:59:59 UTC of the newest, or ``None``
        when ``paths`` is empty, any input is not a directory, any directory
        has no date-named subdirs, or no date-named subdir parses as a date.
    """
    if not paths:
        return None

    distinct_dates: set[date] = set()
    for p in paths:
        if not p.is_dir():
            return None
        date_dirs = _zeek_date_subdirs(p)
        if not date_dirs:
            return None
        for d in date_dirs:
            try:
                distinct_dates.add(date.fromisoformat(d.name[:10]))
            except ValueError:
                # Date-shaped name that is not a real date: nothing to anchor
                # a window on, so it takes no part in the selection.
                continue

    if not distinct_dates:
        return None

    # The set already collapsed duplicates, so N counts DISTINCT dates.
    ordered = sorted(distinct_dates)
    n = max(1, math.ceil(span.total_seconds() / 86400))
    selected = ordered[-n:]
    earliest_date = selected[0]
    newest_date = selected[-1]
    since = datetime(earliest_date.year, earliest_date.month, earliest_date.day,
                     0, 0, 0, tzinfo=timezone.utc)
    until = datetime(newest_date.year, newest_date.month, newest_date.day,
                     23, 59, 59, tzinfo=timezone.utc)
    return since, until
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
109
|
+
# Per-strategy default-window resolvers — the SourceLoader.resolve_window bodies.
|
|
110
|
+
#
|
|
111
|
+
# Signature is uniform: (strategy, dirs, pattern, span) -> (select_window,
|
|
112
|
+
# trim_span). resolve_load_windows (pipeline) loops eligible/unbounded/in-play
|
|
113
|
+
# families and calls each strategy's resolver (or _default_resolve_window when a
|
|
114
|
+
# strategy declares none). The owning ``strategy`` is passed so the flat resolver
|
|
115
|
+
# can reach ``strategy.discover`` for its candidate universe WITHOUT a source-name
|
|
116
|
+
# ladder or a registry import — that is why this hook takes ``strategy`` and the
|
|
117
|
+
# other strategy callables (parse/discover/should_skip) do not.
|
|
118
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _default_resolve_window(
    strategy, dirs: list[Path], pattern: str, span: timedelta
) -> tuple[None, timedelta]:
    """Fallback resolver for a source that declares no ``resolve_window``.

    No load-time window is selected (the family loads in full) and ``span`` is
    handed back unchanged as the post-load trim span. ``strategy``, ``dirs``
    and ``pattern`` are accepted only to keep the resolver signature uniform;
    they are not used here.
    """
    select_window = None
    trim_span = span
    return select_window, trim_span
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def _zeek_resolve_window(
    strategy, dirs: list[Path], pattern: str, span: timedelta
) -> tuple[tuple[datetime, datetime] | None, timedelta | None]:
    """Zeek strategy resolver.

    When ``_zeek_dated_window`` yields a window (dated layout), that
    ``(since, until)`` pair is returned as the select window with no trim span.
    Otherwise (flat / mixed / file input) the result is ``(None, span)``: load
    in full and trim after loading.
    """
    window = _zeek_dated_window(dirs, span)
    if window is None:
        return None, span
    return window, None
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def _flat_default_floor(
    strategy, dir_paths: list[Path], pattern: str, span: timedelta
) -> tuple[datetime, None] | None:
    """Conservative default-window floor for a flat family (syslog / pihole).

    Candidates come from ``strategy.discover(d, pattern, None, None)`` for each
    input that is a directory (explicit files are left out), merged through
    ``_union_dedupe``. Each candidate's first timestamp is read with
    ``_peek_first_ts``; candidates that yield ``None`` are dropped.

    Returns:
        ``(f_max - span, None)`` where ``f_max`` is the largest peeked first
        timestamp (the upper bound is left open), or ``None`` when no
        candidate produced a timestamp.
    """
    per_directory = []
    for d in dir_paths:
        if d.is_dir():
            per_directory.append(strategy.discover(d, pattern, None, None))
    candidates = _union_dedupe(per_directory)

    first_stamps = []
    for candidate in candidates:
        stamp = _peek_first_ts(candidate)
        if stamp is not None:
            first_stamps.append(stamp)

    if not first_stamps:
        return None
    newest_first = max(first_stamps)
    return (newest_first - span, None)
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _flat_resolve_window(
    strategy, dirs: list[Path], pattern: str, span: timedelta
) -> tuple[tuple[datetime, None] | None, timedelta]:
    """Flat strategy resolver (syslog / pihole).

    The select window is whatever ``_flat_default_floor`` returns: a
    ``(floor, None)`` pair, or ``None`` when nothing could be peeked. ``span``
    is always returned as the post-load trim span.
    """
    select_window = _flat_default_floor(strategy, dirs, pattern, span)
    return select_window, span
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def discover_zeek_files(
    directory: Path,
    pattern: str,
    since: datetime | None = None,
    until: datetime | None = None,
) -> list[Path]:
    """Return Zeek log files matching pattern from a flat or dated-layout directory.

    Single-file mode (``directory.is_file()``): returns ``[directory]`` when its
    basename matches the pattern, else ``[]``.

    Flat layout (no YYYY-MM-DD immediate children): ``sorted(directory.glob(pattern))``.

    Dated layout: globs the pattern inside the selected subdirectories.
      * With a window (``since`` and/or ``until``): only date subdirs whose
        date lies within the window's dates are used; non-date children are
        skipped, and so is a subdir whose name has the date shape but is not a
        real calendar date (previously a ``ValueError``).
      * Without a window: all date subdirs plus any non-date child
        directories, deduplicated by resolved path so a child that resolves to
        an already-listed directory is not loaded twice.

    Mixed-root policy: if any immediate child is a YYYY-MM-DD directory, the
    dated layout is used and root-level files are not included.

    Args:
        directory: A Zeek log file, or a flat / dated directory of logs.
        pattern: Filename glob (e.g. ``"conn*.log*"``).
        since: Optional lower bound; only its date is compared.
        until: Optional upper bound; only its date is compared.

    Returns:
        Matching files: sorted within each directory, directories in name order.
    """
    if directory.is_file():
        return [directory] if _file_matches_pattern(directory, pattern) else []

    date_dirs = _zeek_date_subdirs(directory)

    if not date_dirs:
        # Flat layout — identical to discover_files behavior.
        return sorted(directory.glob(pattern))

    # Dated layout.
    included: list[Path] = []
    if since is not None or until is not None:
        # Prune by directory date; skip non-date children entirely.
        since_date = since.date() if since else None
        until_date = until.date() if until else None
        for d in date_dirs:
            try:
                dir_date = date.fromisoformat(d.name[:10])
            except ValueError:
                # Date-shaped but not a real date (e.g. 2026-13-40): it cannot
                # be placed relative to the window, so treat it like any other
                # non-date child and skip it rather than abort discovery.
                continue
            if since_date is not None and dir_date < since_date:
                continue
            if until_date is not None and dir_date > until_date:
                continue
            included.append(d)
    else:
        # No window — include all date dirs and any non-date dirs.
        candidates: list[Path] = list(date_dirs)
        candidates.extend(
            child
            for child in directory.iterdir()
            if child.is_dir() and not _DATE_DIR_RE.match(child.name)
        )
        # Dedup by realpath — non-date children are often symlinks pointing to
        # date dirs; resolving prevents double-loading regardless of
        # iteration order.
        seen: set[Path] = set()
        for d in sorted(candidates, key=lambda p: p.name):
            rp = d.resolve()
            if rp not in seen:
                seen.add(rp)
                included.append(d)

    files: list[Path] = []
    for d in included:
        files.extend(sorted(d.glob(pattern)))
    return files
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
def _syslog_files(path: Path, pattern: str = "*.log*") -> list[Path]:
    """Return flat-source files to process, selected by FILENAME glob.

    An explicitly named FILE is returned as ``[path]`` untouched: the pattern,
    the AppleDouble filter and the ordering are NOT applied to it.

    For a DIRECTORY, files matching ``pattern`` are discovered via
    ``discover_files``, ``._``-prefixed AppleDouble sidecars are dropped, and
    the remainder is ordered by the ``_rotation_base_and_index`` key of each
    filename.

    Args:
        path: A single log file or a directory of logs.
        pattern: Glob used for directory discovery only.
    """
    if path.is_file():
        return [path]

    kept = []
    for found in discover_files(path, pattern):
        if found.name.startswith("._"):
            continue
        kept.append(found)
    kept.sort(key=lambda found: _rotation_base_and_index(found.name))
    return kept
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _discover_syslog_files(path: Path) -> list[Path]:
    """Content-gated syslog discovery.

    FILE input → ``[path]`` with no gate applied: an explicitly named file is
    always returned regardless of its name or contents.

    DIRECTORY input → every immediate child that is a regular file, is not a
    ``._``-prefixed AppleDouble sidecar, and passes ``_looks_like_syslog``,
    ordered by the ``_rotation_base_and_index`` key of its filename. The scan
    is non-recursive, so subdirectories are never descended into.
    """
    if path.is_file():
        return [path]

    accepted = []
    for entry in path.iterdir():
        if not entry.is_file():
            continue
        if entry.name.startswith("._"):
            continue
        if _looks_like_syslog(entry):
            accepted.append(entry)
    accepted.sort(key=lambda entry: _rotation_base_and_index(entry.name))
    return accepted
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _dir_has_regular_files(path: Path) -> bool:
    """True iff ``path`` holds at least one regular, non-AppleDouble file.

    A presence check over the immediate children only: no content sniffing and
    no filename-pattern test, so extensionless files count. Any ``OSError``
    raised while listing (missing directory, not a directory, ...) is reported
    as ``False``.
    """
    try:
        for entry in path.iterdir():
            if entry.is_file() and not entry.name.startswith("._"):
                return True
    except OSError:
        return False
    return False
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
def discover_cloudtrail_files(path: Path) -> list[Path]:
    """Discover CloudTrail event files under ``path``.

    * File input → ``[path]``.
    * Neither file nor directory → ``[]``.
    * Directory → recursive ``*.json*`` matches in sorted order, keeping only
      regular files and dropping any whose path has a ``CloudTrail-Digest``
      component.

    The ``*.json*`` glob also matches names that continue after ``.json``
    (such as ``.json.gz`` or ``.json.log``), and the recursive search lets
    ``path`` point at any level of a nested log tree.
    """
    if path.is_file():
        return [path]
    if not path.is_dir():
        return []
    return [
        hit
        for hit in sorted(path.rglob("*.json*"))
        if hit.is_file() and "CloudTrail-Digest" not in hit.parts
    ]
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _stem_hostname(name: str) -> str:
    """Strip log-file suffixes from a filename to derive a hostname stem.

    Repeatedly removes the trailing suffix while it is either a member of
    ``_LOG_SUFFIXES`` or purely numeric (rotation suffixes such as ``.1`` or
    ``.42``). Dotted hostnames are preserved:
    ``host1.example.com.log`` → ``host1.example.com``.

    Stripping also stops as soon as the remaining stem is an IPv4 dotted quad.
    Without that guard an IP-named file such as ``192.168.1.10.log`` would
    have its octets peeled off as if they were rotation numbers, leaving
    ``192``.

    Args:
        name: A log filename (basename, not a full path).

    Returns:
        The filename with its log / rotation suffixes removed.
    """
    # Function-scope import: ipaddress is not among this module's imports.
    import ipaddress

    stem = name
    while True:
        try:
            ipaddress.IPv4Address(stem)
        except ValueError:
            pass
        else:
            # The stem is a complete IPv4 address; its octets are not
            # rotation suffixes.
            return stem

        suffix = Path(stem).suffix
        if (suffix and suffix[1:].isdigit()) or suffix in _LOG_SUFFIXES:
            stem = Path(stem).stem
        else:
            return stem
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""Low-level filesystem primitives for the loader package (leaf module).
|
|
2
|
+
|
|
3
|
+
Decompression-transparent file opening plus the path-normalization helpers
|
|
4
|
+
(``_safe_resolve`` / ``_union_dedupe``). These are the lowest leaf: every other
|
|
5
|
+
loader submodule may import from here, and this module imports nothing from the
|
|
6
|
+
package. ``_open_log`` is the SINGLE chokepoint every source flows through.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import bz2
|
|
12
|
+
import gzip
|
|
13
|
+
import lzma
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _open_log(path: Path):
    """Open a plain, gzip-, bzip2-, or xz-compressed log file for text reading.

    The opener is chosen from the final filename suffix alone (``.gz``,
    ``.bz2``, ``.xz``); the file's contents are not inspected. Any other
    suffix is opened as plain text. In every case the stream is decoded as
    UTF-8 with undecodable bytes replaced rather than raising.

    Returns:
        An open text-mode file object; the caller is responsible for closing it.
    """
    # Looked up at call time so each module's current ``open`` is used.
    compressed_openers = {
        ".gz": gzip.open,
        ".bz2": bz2.open,
        ".xz": lzma.open,
    }
    opener = compressed_openers.get(path.suffix)
    if opener is None:
        return path.open("r", encoding="utf-8", errors="replace")
    return opener(path, "rt", encoding="utf-8", errors="replace")
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _safe_resolve(p: Path) -> Path:
    """``p.resolve()``, falling back to ``p`` itself when resolution fails.

    The single realpath-normalization primitive the loader uses for dedupe,
    the rotation-windowing explicit-file partition, and rotation grouping —
    one consistent notion of "same path" across all three.

    Both ``OSError`` and ``RuntimeError`` trigger the fallback: before
    Python 3.13, ``Path.resolve()`` reports a symlink loop as ``RuntimeError``
    rather than ``OSError``, and a looping link must not abort the whole load.
    """
    try:
        return p.resolve()
    except (OSError, RuntimeError):
        return p
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _union_dedupe(per_input_files: list[list[Path]]) -> list[Path]:
    """Concatenate per-input discovery results, dropping realpath duplicates.

    Paths are visited input by input, in order. Each is keyed by
    ``_safe_resolve``; the first path seen for a key is kept (as originally
    spelled, not resolved) and later paths with the same key are discarded.
    This collapses a file reached through two inputs, or through a symlink and
    its target, into one entry.

    Args:
        per_input_files: One list of discovered files per source input.

    Returns:
        The merged list in first-seen order, free of realpath duplicates.
    """
    # dict preserves insertion order, so the values come out first-seen.
    first_by_realpath: dict[Path, Path] = {}
    for batch in per_input_files:
        for candidate in batch:
            first_by_realpath.setdefault(_safe_resolve(candidate), candidate)
    return list(first_by_realpath.values())
|