loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Be-like-water target resolution shared by CLI, runner, and exporters.
|
|
2
|
+
|
|
3
|
+
One function (``be_like_water``) decides whether a user-supplied target string
|
|
4
|
+
points to a FILE or a DIRECTORY, via a gated ladder. The trailing-slash gate is
|
|
5
|
+
evaluated BEFORE any disk check so an explicit trailing slash can never be
|
|
6
|
+
overridden by what happens to exist on disk.
|
|
7
|
+
|
|
8
|
+
A second helper (``resolve_path``) resolves a config-supplied path string
|
|
9
|
+
against the LH_ROOT base. ``effective_root`` reads the active root from the ``LOGHUNTER_ROOT`` env var or
|
|
10
|
+
config. CLI-supplied paths never get root applied; only config-file values do.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import os
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Any, NamedTuple
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class ResolvedTarget(NamedTuple):
    """Outcome of ``be_like_water``: a destination path plus its kind.

    Fields:
        path: The exact file to write when ``is_file`` is True; otherwise the
            directory inside which the caller picks a file name.
        is_file: ``True`` means FILE mode, ``False`` means DIRECTORY mode.
    """

    # Destination path (a file, or a directory to auto-name inside).
    path: Path
    # FILE (True) vs DIRECTORY (False) verdict.
    is_file: bool
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def be_like_water(target: str) -> ResolvedTarget:
    """Classify a raw target string as a FILE or a DIRECTORY destination.

    The ladder below is walked top to bottom; the first matching rung wins:

      0. ``target`` ends with ``/``        -> DIRECTORY. Disk is never
         consulted, so explicit directory intent beats whatever exists on
         disk under that name.
      1. path is an existing regular file  -> FILE (returned as-is).
      2. path is an existing directory     -> DIRECTORY.
      3. anything else                     -> FILE. The basename is taken
         verbatim as the file name; no suffix inspection is performed.

    Objects that are neither a regular file nor a directory (dangling
    symlinks, FIFOs, devices, ...) are not special-cased and land on rung 3;
    any resulting error surfaces later when the caller opens the path.

    This function only reads disk state (``is_file`` / ``is_dir``). It never
    creates directories; creating them is left to the caller.

    Args:
        target: The RAW path string. It must not be pre-converted to a
            ``Path``, because ``Path`` drops a trailing slash and the
            directory-intent signal would be lost.

    Returns:
        ``ResolvedTarget(path, is_file)`` where ``path`` has ``~`` expanded.
    """
    expanded = Path(target).expanduser()

    # Rung 0: explicit trailing slash, decided before touching the disk.
    if target.endswith("/"):
        return ResolvedTarget(path=expanded, is_file=False)

    # Rungs 1-3 folded: is_file() is asked first; is_dir() is only asked when
    # the path is not a regular file. Everything that is not an existing
    # directory is treated as a file.
    treat_as_file = expanded.is_file() or not expanded.is_dir()
    return ResolvedTarget(path=expanded, is_file=treat_as_file)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def resolve_path(value: str | None, root: str) -> str | None:
|
|
76
|
+
"""Resolve a config-supplied path value against the LH_ROOT base.
|
|
77
|
+
|
|
78
|
+
Returns a STRING (trailing slash preserved) or None — never a Path, so
|
|
79
|
+
output-dir callers can still hand the result to ``be_like_water`` without
|
|
80
|
+
Path() stripping the directory-intent slash.
|
|
81
|
+
|
|
82
|
+
None / "" -> None (key unset)
|
|
83
|
+
"/var/log/zeek" -> as-is (absolute: root ignored)
|
|
84
|
+
"~/x/exports" -> expanduser(value) (~-anchored: root ignored)
|
|
85
|
+
"exports" -> join(expanduser(root), value) if root else value
|
|
86
|
+
|
|
87
|
+
Pure path helper — no validation, no URL handling, no suffix sniffing.
|
|
88
|
+
Apply to CONFIG-supplied paths only; CLI-supplied paths take ``root=""``
|
|
89
|
+
so they get ``~``-expansion but resolve relative to CWD as shell semantics
|
|
90
|
+
demand.
|
|
91
|
+
"""
|
|
92
|
+
if not value:
|
|
93
|
+
return None
|
|
94
|
+
if os.path.isabs(value):
|
|
95
|
+
return value
|
|
96
|
+
if value.startswith("~"):
|
|
97
|
+
return os.path.expanduser(value)
|
|
98
|
+
if root:
|
|
99
|
+
return os.path.join(os.path.expanduser(root), value)
|
|
100
|
+
return value
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def effective_root(config: dict[str, Any]) -> str:
    """Return the active LH_ROOT: environment first, then config, then ``""``.

    Precedence:
      1. A non-empty ``LOGHUNTER_ROOT`` environment variable.
      2. ``config["loghunter"]["root"]`` when it is set to a truthy value.
      3. The empty string (no root configured).

    A missing or ``None`` ``loghunter`` section and a falsy ``root`` entry
    (e.g. ``None``) are both normalized to ``""`` so the declared ``str``
    return type always holds.

    Args:
        config: Parsed configuration mapping.

    Returns:
        The root directory string, or ``""`` when none is configured.
    """
    env_root = os.environ.get("LOGHUNTER_ROOT")
    if env_root:
        return env_root
    # ``or {}`` tolerates an explicit ``None`` section as well as a missing one.
    section = config.get("loghunter") or {}
    # ``or ""`` keeps the return value a string even if ``root`` is None.
    return section.get("root") or ""
|
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
"""Single-ownership source resolution for LogHunter.
|
|
2
|
+
|
|
3
|
+
Replaces a four-instance bug class:
|
|
4
|
+
|
|
5
|
+
- ``runner.run`` + ``run_digest`` used to overload ``None`` to mean both
|
|
6
|
+
"no override" and "scoped out — don't load this." Result: CLI scoping was
|
|
7
|
+
silently undone when the runner config-filled a None back, so
|
|
8
|
+
``loghunter syslog ./flat.log`` also loaded configured Zeek
|
|
9
|
+
``syslog*.log*`` on a default install.
|
|
10
|
+
- Analyze CLI, digest CLI, runner, and ``run_digest`` each carried their
|
|
11
|
+
own per-key config-fallback ladder, drifting in error strings and
|
|
12
|
+
semantics.
|
|
13
|
+
- The detect-path positional→source router was implemented three times
|
|
14
|
+
(analyze, single-detector, and a "wrong-source" hint scold), with a
|
|
15
|
+
``detector_name == "syslog"`` content-sniff special case while DNS
|
|
16
|
+
routed by filename ``fnmatch``.
|
|
17
|
+
|
|
18
|
+
After this module:
|
|
19
|
+
|
|
20
|
+
- One owner of source resolution. ``resolve_sources`` is the analyze
|
|
21
|
+
resolver; ``resolve_digest_source`` is the digest resolver.
|
|
22
|
+
``_resolve_one`` is the ONLY site where a source-dir string becomes a
|
|
23
|
+
resolved ``Path`` — CLI seams pass raw strings (or ``None``) and the
|
|
24
|
+
runner threads them straight in.
|
|
25
|
+
- ``None`` everywhere means strictly "no override." Scope is the only
|
|
26
|
+
scoping signal.
|
|
27
|
+
- One generic content-sniff router, ``route_positional_source``.
|
|
28
|
+
|
|
29
|
+
Layering: this module imports ``common.paths`` and ``common.loader``
|
|
30
|
+
(content sniffing). It MUST NOT import from ``loghunter.detectors`` —
|
|
31
|
+
``route_positional_source`` takes an already-imported detector module
|
|
32
|
+
as a parameter; the CLI does the ``importlib`` work.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
from dataclasses import dataclass
|
|
38
|
+
from pathlib import Path
|
|
39
|
+
from typing import Any, Sequence
|
|
40
|
+
|
|
41
|
+
from loghunter.common.loader import sniff_format_detailed
|
|
42
|
+
from loghunter.common.paths import effective_root, resolve_path
|
|
43
|
+
|
|
44
|
+
_ALL_KEYS: tuple[str, ...] = (
|
|
45
|
+
"zeek_dir", "syslog_dir", "pihole_dir", "cloudtrail_dir",
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _present(value: object) -> bool:
|
|
50
|
+
"""An override counts only when it carries a real value.
|
|
51
|
+
|
|
52
|
+
The CLI parser stores a bare ``--zeek-dir=`` (no value after the ``=``) as
|
|
53
|
+
the EMPTY STRING — not None, not rejected. ``None``-vs-``""`` is the same
|
|
54
|
+
falsy-vs-None ambiguity class the single-ownership refactor exists to kill:
|
|
55
|
+
treating ``""`` as "present" makes ``_resolve_one("", …)`` return None and
|
|
56
|
+
silently suppresses config fallback, so a configured ``[loghunter].zeek_dir``
|
|
57
|
+
is ignored when the operator passes a bare flag. Match the pre-refactor
|
|
58
|
+
truthiness semantics (``if cli_val:``) by promoting the boundary check
|
|
59
|
+
here: any falsy override (None, "", empty Path string) is "no override."
|
|
60
|
+
|
|
61
|
+
Used by the digest resolver, which stays scalar-shaped (digest is
|
|
62
|
+
card-per-file; multi-input union does not apply). The analyze resolver
|
|
63
|
+
uses ``_normalize_overrides`` instead, which handles scalar / list /
|
|
64
|
+
None uniformly under the same falsy-is-absent rule.
|
|
65
|
+
"""
|
|
66
|
+
return bool(value)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _normalize_overrides(
|
|
70
|
+
value: str | Path | Sequence[str | Path] | None,
|
|
71
|
+
) -> list[str | Path]:
|
|
72
|
+
"""Normalize an override value to a list of truthy scalar inputs.
|
|
73
|
+
|
|
74
|
+
The widened contract for ``runner.run``'s four source-dir kwargs is
|
|
75
|
+
``str | Path | Sequence[str | Path] | None``. This function is the SINGLE
|
|
76
|
+
rule:
|
|
77
|
+
|
|
78
|
+
- ``None`` → ``[]`` (absent — signal config fallback within scope)
|
|
79
|
+
- scalar truthy ``str`` / ``Path`` → ``[scalar]`` (one-element list — the
|
|
80
|
+
degenerate case that keeps programmatic scalar callers byte-identical)
|
|
81
|
+
- scalar falsy (``""`` / empty Path string) → ``[]`` (absent — same
|
|
82
|
+
``_present`` semantics, just expressed at the list boundary)
|
|
83
|
+
- sequence → ``[v for v in value if v]`` — drop falsy members FIRST so
|
|
84
|
+
``["", "/x"]`` and ``["/x"]`` are equivalent, PRESERVE order
|
|
85
|
+
|
|
86
|
+
Dedup is intentionally NOT here. Cross-input dedup by ``.resolve()``
|
|
87
|
+
happens at the loader file-union site (``_union_dedupe``), not at the
|
|
88
|
+
string layer; doing it here would collapse two CLI inputs whose strings
|
|
89
|
+
differ but resolve to the same file BEFORE the user sees them rendered
|
|
90
|
+
in ``_print_dry_run``.
|
|
91
|
+
"""
|
|
92
|
+
if value is None:
|
|
93
|
+
return []
|
|
94
|
+
if isinstance(value, (str, Path)):
|
|
95
|
+
return [value] if value else []
|
|
96
|
+
return [v for v in value if v]
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass(frozen=True)
class ResolvedSources:
    """Result of ``resolve_sources``: one list of paths per source family.

    Every field holds the resolved ``Path`` inputs for that family, gathered
    from explicit overrides (CLI positionals and ``--<family>-dir`` flags)
    or, failing that, from the config fallback when the family is in scope.

    An empty list means there is nothing to load for the family: it was
    neither overridden nor configured, or it was scoped out of the run.

    A scalar caller such as ``runner.run(zeek_dir="/x")`` ends up here as the
    one-element list ``[Path("/x")]``.
    """

    # Zeek log inputs.
    zeek_dir: list[Path]
    # Flat syslog inputs.
    syslog_dir: list[Path]
    # Pi-hole inputs.
    pihole_dir: list[Path]
    # CloudTrail inputs.
    cloudtrail_dir: list[Path]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@dataclass(frozen=True)
|
|
122
|
+
class DigestSource:
|
|
123
|
+
"""The single source chosen by ``resolve_digest_source`` for a digest schema.
|
|
124
|
+
|
|
125
|
+
Attributes:
|
|
126
|
+
source_key: One of ``zeek_dir`` / ``syslog_dir`` / ``pihole_dir`` /
|
|
127
|
+
``cloudtrail_dir`` — the key ``run_digest`` looks up its
|
|
128
|
+
(pattern, empty_columns) mapping against.
|
|
129
|
+
directory: Resolved directory ``Path`` to load from.
|
|
130
|
+
feed: Schema-specific feed identifier — ``"zeek"`` / ``"pihole"`` /
|
|
131
|
+
``"syslog"`` for the fidelity-aware schemas (dns, syslog), or
|
|
132
|
+
``None`` for the single-source schemas (conn, cloudtrail).
|
|
133
|
+
"""
|
|
134
|
+
|
|
135
|
+
source_key: str
|
|
136
|
+
directory: Path
|
|
137
|
+
feed: str | None
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _resolve_one(
    override: str | Path | None,
    cfg_value: Any,
    root: str,
) -> Path | None:
    """Resolve one source-dir key to a ``Path``; the only string->Path site.

    Two cases:

    - ``override`` is ``None``: fall back to ``cfg_value`` and resolve it
      against ``root`` via ``resolve_path(cfg_value, root)``.
    - ``override`` is anything else: treat it as an explicit (CLI-style)
      value and resolve it with an empty root, i.e. ``~`` expansion only and
      no LH_ROOT prefix, so relative paths stay relative to the working
      directory. The override is passed through ``str()`` first so ``Path``
      and ``str`` overrides are handled the same way.

    Returns:
        ``Path`` of the resolved string, or ``None`` when the resolved string
        is empty / ``None``.
    """
    if override is None:
        text = resolve_path(cfg_value, root)
    else:
        text = resolve_path(str(override), "")
    if not text:
        return None
    return Path(text)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def resolve_sources(
    config: dict[str, Any],
    *,
    overrides: dict[str, str | Path | Sequence[str | Path] | None],
    scope: frozenset[str] | None,
) -> ResolvedSources:
    """Resolve the four source-dir buckets for an analyze run.

    Each key in ``_ALL_KEYS`` is handled independently. Its override is
    first normalized to a list by ``_normalize_overrides``, then:

    ==================  =========================  ===============================
    override list       scope                      bucket
    ==================  =========================  ===============================
    non-empty           any                        every override, resolved
    empty               ``None`` or contains key   the config value, resolved
    empty               does not contain key       ``[]`` (never config-filled)
    ==================  =========================  ===============================

    An override for a key outside ``scope`` is still honored: the operator
    asked for it explicitly. Entries that resolve to ``None`` are dropped, so
    an unconfigured key yields an empty bucket.

    The config fallback reads one string per key; list-valued config entries
    are not handled here.

    Args:
        config: Parsed configuration mapping.
        overrides: Per-key explicit values (scalar, sequence or ``None``).
        scope: Keys eligible for config fallback; ``None`` means all keys.

    Returns:
        A ``ResolvedSources`` with one list of ``Path`` objects per key.
    """
    section = config.get("loghunter", {})
    root = effective_root(config)

    buckets: dict[str, list[Path]] = {}
    for key in _ALL_KEYS:
        explicit = _normalize_overrides(overrides.get(key))
        if explicit:
            # Explicit values win regardless of scope.
            found = [_resolve_one(item, None, root) for item in explicit]
        elif scope is None or key in scope:
            # In scope and not overridden: fall back to the config entry.
            found = [_resolve_one(None, section.get(key), root)]
        else:
            # Scoped out: stays empty, config is deliberately not consulted.
            found = []
        buckets[key] = [path for path in found if path is not None]
    return ResolvedSources(**buckets)
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
# Per-schema candidate ladder + feed mapping for the digest resolver.
|
|
217
|
+
# Order = preference: first non-None config value wins on fallback.
|
|
218
|
+
_DIGEST_CANDIDATES: dict[str, tuple[str, ...]] = {
|
|
219
|
+
"conn": ("zeek_dir",),
|
|
220
|
+
"dns": ("zeek_dir", "pihole_dir"),
|
|
221
|
+
"syslog": ("syslog_dir", "zeek_dir"),
|
|
222
|
+
"cloudtrail": ("cloudtrail_dir",),
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
_DIGEST_FEED: dict[tuple[str, str], str | None] = {
|
|
226
|
+
("conn", "zeek_dir"): None,
|
|
227
|
+
("dns", "zeek_dir"): "zeek",
|
|
228
|
+
("dns", "pihole_dir"): "pihole",
|
|
229
|
+
("syslog", "syslog_dir"): "syslog",
|
|
230
|
+
("syslog", "zeek_dir"): "zeek",
|
|
231
|
+
("cloudtrail", "cloudtrail_dir"): None,
|
|
232
|
+
}
|
|
233
|
+
|
|
234
|
+
# BYTE-PRESERVED error strings, lifted from the previous run_digest ladder
|
|
235
|
+
# (runner.py:1302-1413 pre-refactor). The wrong-key message is templated;
|
|
236
|
+
# the XOR and not-configured messages are static per schema.
|
|
237
|
+
_DIGEST_XOR_MSG: dict[str, str] = {
|
|
238
|
+
"dns": "digest dns: cannot use both --zeek-dir and --pihole-dir",
|
|
239
|
+
"syslog": "digest syslog: cannot use both zeek_dir and syslog_dir",
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
_DIGEST_NOT_CONFIGURED_MSG: dict[str, str] = {
|
|
243
|
+
"conn": (
|
|
244
|
+
"digest: zeek_dir not configured — pass a PATH or set "
|
|
245
|
+
"[loghunter].zeek_dir in your config"
|
|
246
|
+
),
|
|
247
|
+
"dns": (
|
|
248
|
+
"digest dns: zeek_dir or pihole_dir not configured — "
|
|
249
|
+
"pass a PATH, --zeek-dir/--pihole-dir, or set one in config"
|
|
250
|
+
),
|
|
251
|
+
"syslog": (
|
|
252
|
+
"digest syslog: no syslog source configured — pass a PATH, "
|
|
253
|
+
"--zeek-dir, or set [loghunter].syslog_dir / "
|
|
254
|
+
"[loghunter].zeek_dir in your config"
|
|
255
|
+
),
|
|
256
|
+
"cloudtrail": (
|
|
257
|
+
"digest cloudtrail: cloudtrail_dir not configured — pass a PATH, "
|
|
258
|
+
"--cloudtrail-dir, or set [loghunter].cloudtrail_dir in your config"
|
|
259
|
+
),
|
|
260
|
+
}
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _wrong_key_msg(schema: str, key: str) -> str:
|
|
264
|
+
"""Templated wrong-source error message — byte-equal to the prior text."""
|
|
265
|
+
return f"digest {schema}: {key} is not valid for the {schema} schema"
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
def resolve_digest_source(
    config: dict[str, Any],
    schema: str,
    *,
    overrides: dict[str, str | Path | None],
) -> DigestSource:
    """Pick the single source a digest of ``schema`` should load from.

    An override counts as supplied when ``_present`` accepts it (i.e. it is
    truthy). Selection proceeds as follows:

    1. Any supplied override for a key the schema cannot use is rejected.
    2. Supplying more than one of the schema's candidate keys is rejected.
    3. A single supplied candidate override is used as the source.
    4. Otherwise the schema's candidates are tried against the config in
       preference order and the first configured one wins.

    Args:
        config: Parsed configuration mapping.
        schema: Digest schema name; must be a key of ``_DIGEST_CANDIDATES``.
        overrides: Per-key explicit values (scalar or ``None``).

    Returns:
        The chosen ``DigestSource`` (key, resolved directory, feed).

    Raises:
        ValueError: wrong-key override, conflicting overrides, or no source
            resolved. The message texts are the preserved legacy strings.
        KeyError: ``schema`` is not a known digest schema.
    """
    candidates = _DIGEST_CANDIDATES[schema]
    section = config.get("loghunter", {})
    root = effective_root(config)

    # Rule 1: overrides aimed at a source this schema never reads.
    for key in _ALL_KEYS:
        if key not in candidates and _present(overrides.get(key)):
            raise ValueError(_wrong_key_msg(schema, key))

    # Rule 2: at most one candidate override may be supplied.
    supplied = [key for key in candidates if _present(overrides.get(key))]
    if len(supplied) > 1:
        raise ValueError(_DIGEST_XOR_MSG[schema])

    chosen: str | None
    directory: Path | None
    if supplied:
        # Rule 3: the lone explicit override.
        chosen = supplied[0]
        directory = _resolve_one(overrides[chosen], None, root)
    else:
        # Rule 4: first configured candidate, evaluated lazily so lookup
        # stops at the first hit.
        configured = (
            (key, _resolve_one(None, section.get(key), root))
            for key in candidates
        )
        chosen, directory = next(
            ((key, path) for key, path in configured if path is not None),
            (None, None),
        )

    if chosen is None or directory is None:
        raise ValueError(_DIGEST_NOT_CONFIGURED_MSG[schema])

    return DigestSource(
        source_key=chosen,
        directory=directory,
        feed=_DIGEST_FEED[(schema, chosen)],
    )
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def route_positional_source(
|
|
327
|
+
path: str | Path,
|
|
328
|
+
*,
|
|
329
|
+
detector_module: Any | None,
|
|
330
|
+
) -> str:
|
|
331
|
+
"""Decide which source-dir key a positional PATH routes to.
|
|
332
|
+
|
|
333
|
+
Generic — no detector-name special cases.
|
|
334
|
+
|
|
335
|
+
**Named-module mode** (``detector_module`` is an imported detector module):
|
|
336
|
+
``REQUIRED_LOGS`` carriers (beacon, scan, duration, aws, …) route to
|
|
337
|
+
``REQUIRED_LOGS[0]["source"]``. Two-source detectors (dns, syslog)
|
|
338
|
+
content-sniff the file and route to the matching ``OPTIONAL_LOGS`` source;
|
|
339
|
+
on miss, on a directory positional, or on a sniff ``OSError``, they fall
|
|
340
|
+
back to ``OPTIONAL_LOGS[0]["source"]`` — matching today's "directory
|
|
341
|
+
defaults to flat" / "unreadable degrades silently" conventions.
|
|
342
|
+
``OPTIONAL_LOGS[0]`` reproduces both current defaults:
|
|
343
|
+
``dns → zeek_dir`` and ``syslog → syslog_dir``.
|
|
344
|
+
|
|
345
|
+
**None mode** (``detector_module is None``): for detect=all / unknown
|
|
346
|
+
selectors. Content-sniff the positional and map ``origin → {origin}_dir``
|
|
347
|
+
(cloudtrail → cloudtrail_dir, syslog → syslog_dir, zeek → zeek_dir,
|
|
348
|
+
pihole → pihole_dir). Falls back to ``zeek_dir`` on a directory
|
|
349
|
+
positional, an unrecognized sniff, or a sniff ``OSError`` — preserving
|
|
350
|
+
today's analyze default for unrecognized inputs (the old hardcoded
|
|
351
|
+
``routed_source = "zeek_dir"`` in cli.py). NOTE: ``common/sources.py``
|
|
352
|
+
MUST NOT import ``detectors/`` — the named-module branch still receives
|
|
353
|
+
the imported module from the CLI.
|
|
354
|
+
"""
|
|
355
|
+
path_obj = Path(path).expanduser()
|
|
356
|
+
|
|
357
|
+
if detector_module is None:
|
|
358
|
+
if path_obj.is_dir():
|
|
359
|
+
return "zeek_dir"
|
|
360
|
+
try:
|
|
361
|
+
result = sniff_format_detailed(path_obj)
|
|
362
|
+
except OSError:
|
|
363
|
+
return "zeek_dir"
|
|
364
|
+
origin = result.origin
|
|
365
|
+
candidate = f"{origin}_dir" if origin else None
|
|
366
|
+
return candidate if candidate in _ALL_KEYS else "zeek_dir"
|
|
367
|
+
|
|
368
|
+
required = getattr(detector_module, "REQUIRED_LOGS", [])
|
|
369
|
+
if required:
|
|
370
|
+
# ``.get("source", "zeek_dir")`` instead of ``["source"]`` — defensive
|
|
371
|
+
# against a third-party / new detector whose REQUIRED_LOGS[0] omits
|
|
372
|
+
# the source key. The error-boundary rail says lower layers raise
|
|
373
|
+
# actionable exceptions, not bare KeyErrors. None of the six shipped
|
|
374
|
+
# detectors trip this, but the default keeps the router callable
|
|
375
|
+
# against malformed metadata.
|
|
376
|
+
return required[0].get("source", "zeek_dir")
|
|
377
|
+
optional = [
|
|
378
|
+
o.get("source", "zeek_dir")
|
|
379
|
+
for o in getattr(detector_module, "OPTIONAL_LOGS", [])
|
|
380
|
+
]
|
|
381
|
+
default = optional[0] if optional else "zeek_dir"
|
|
382
|
+
|
|
383
|
+
if path_obj.is_dir():
|
|
384
|
+
return default
|
|
385
|
+
try:
|
|
386
|
+
result = sniff_format_detailed(path_obj)
|
|
387
|
+
except OSError:
|
|
388
|
+
return default
|
|
389
|
+
|
|
390
|
+
origin = result.origin
|
|
391
|
+
candidate = f"{origin}_dir" if origin else None
|
|
392
|
+
return candidate if candidate in optional else default
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# LogHunter — Flat Numeric Connection Allowlist
|
|
2
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
3
|
+
#
|
|
4
|
+
# Format: one rule per line, whitespace-separated tokens. Order doesn't matter.
|
|
5
|
+
# # starts a comment (inline or full-line). Blank lines are ignored.
|
|
6
|
+
#
|
|
7
|
+
# Token types:
|
|
8
|
+
# IP address exact host: 192.0.2.10
|
|
9
|
+
# CIDR range subnet: 192.0.2.0/24
|
|
10
|
+
# Wildcard any host: *
|
|
11
|
+
# Port/proto leading colon: :443 :123/udp :*/tcp
|
|
12
|
+
#
|
|
13
|
+
# A rule may contain zero, one, or two IP/CIDR/wildcard fields plus an
|
|
14
|
+
# optional port/proto token. Rules with two IP fields match in either
|
|
15
|
+
# direction — src→dst and dst→src are both covered by a single rule.
|
|
16
|
+
#
|
|
17
|
+
# Missing fields match anything: omission is permission.
|
|
18
|
+
#
|
|
19
|
+
# !! BARE IP WARNING !!
|
|
20
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
21
|
+
# A rule containing only an IP address with no port token suppresses ALL
|
|
22
|
+
# traffic involving that host across every detector. This is intentional and
|
|
23
|
+
# powerful — one bare IP rule silently drops every flow involving that host
|
|
24
|
+
# from all findings. Use scoped rules (with a port/proto token) unless you
|
|
25
|
+
# explicitly mean to exclude a host entirely.
|
|
26
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
27
|
+
#
|
|
28
|
+
# Syntax examples (documentation only — not active rules):
|
|
29
|
+
#
|
|
30
|
+
# Two specific hosts, port 22, TCP only:
|
|
31
|
+
# 192.0.2.10 198.51.100.1 :22/tcp
|
|
32
|
+
#
|
|
33
|
+
# Any flow involving 192.0.2.10 on port 22, any protocol:
|
|
34
|
+
# 192.0.2.10 :22
|
|
35
|
+
#
|
|
36
|
+
# Entire subnet on port 443, any protocol:
|
|
37
|
+
# 192.0.2.0/24 :443
|
|
38
|
+
#
|
|
39
|
+
# Any host, UDP port 123 (NTP):
|
|
40
|
+
# * :123/udp
|
|
41
|
+
#
|
|
42
|
+
# Port only — suppress this port for every host:
|
|
43
|
+
# :6556
|
|
44
|
+
#
|
|
45
|
+
# Bare IP — suppresses ALL traffic involving this host (see warning above):
|
|
46
|
+
# 192.0.2.33
|
|
47
|
+
#
|
|
48
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
49
|
+
# Add your rules below this line.
|
|
50
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# Universal domain allowlist — curated patterns for infrastructure seen universally
|
|
2
|
+
# across home lab and self-hosted environments.
|
|
3
|
+
#
|
|
4
|
+
# Format: one pattern per line. Comments start with "#" and may be inline or full-line. Blank lines are ignored.
|
|
5
|
+
# Patterns prefixed with "re:" are treated as Python regex (re.search, case-insensitive).
|
|
6
|
+
# Patterns without the prefix are matched as fnmatch globs.
|
|
7
|
+
#
|
|
8
|
+
# This file ships with LogHunter. It covers reverse-DNS, NTP, CDN, cloud platforms,
|
|
9
|
+
# public nameserver infrastructure, and common SaaS endpoints that appear in virtually
|
|
10
|
+
# every environment.
|
|
11
|
+
#
|
|
12
|
+
# Site-specific known-good domains (your own infrastructure, local devices, internal
|
|
13
|
+
# services) belong in ~/.loghunter/allowlist.d/domains_user.txt — not here.
|
|
14
|
+
|
|
15
|
+
# Reverse DNS
|
|
16
|
+
re:\.in-addr\.arpa$ # reverse_dns
|
|
17
|
+
re:\.ip6\.arpa$ # ipv6_arpa
|
|
18
|
+
|
|
19
|
+
# mDNS / link-local
|
|
20
|
+
re:\.local$ # mdns_local
|
|
21
|
+
re:^_ # mdns_service
|
|
22
|
+
|
|
23
|
+
# UUID labels (e.g. device beacons)
|
|
24
|
+
re:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12} # uuid
|
|
25
|
+
|
|
26
|
+
# NTP
|
|
27
|
+
re:pool\.ntp\.org$|\.ntp\.org$ # ntp
|
|
28
|
+
|
|
29
|
+
# Akamai CDN
|
|
30
|
+
re:\.akamai\.net$|\.akamaiedge\.net$|\.akamai\.com$|\.akamaihd\.net$|\.akadns\.net$|\.akamaized\.net$|\.akamaitechnologies\.com$ # akamai
|
|
31
|
+
re:\.akam\.net$|\.edgekey\.net$ # akamai_delegation
|
|
32
|
+
|
|
33
|
+
# Apple / iCloud
|
|
34
|
+
re:\.apple\.com$|\.icloud\.com$|\.aaplimg\.com$|\.apple-dns\.net$ # apple_cdn
|
|
35
|
+
|
|
36
|
+
# Amazon Web Services
|
|
37
|
+
re:\.amazonaws\.com$|\.awsglobalaccelerator\.com$|\.cloudfront\.net$ # aws
|
|
38
|
+
|
|
39
|
+
# Google
|
|
40
|
+
re:\.googlevideo\.com$|\.googleapis\.com$|\.gstatic\.com$|\.googleusercontent\.com$|\.googledomains\.com$|\.google\.com$ # google
|
|
41
|
+
|
|
42
|
+
# Microsoft Azure
|
|
43
|
+
re:\.azurefd\.net$|\.azureedge\.net$|\.cloudapp\.azure\.com$|\.azurewebsites\.net$|\.trafficmanager\.net$|\.windows\.net$ # azure
|
|
44
|
+
re:\.azure-dns\.com$ # azure_dns
|
|
45
|
+
|
|
46
|
+
# Sonos (connection-indexed hostnames)
|
|
47
|
+
re:conn-i-[0-9a-f]+\..*\.sonos\.com$ # sonos_ws
|
|
48
|
+
|
|
49
|
+
# Amazon / Alexa
|
|
50
|
+
re:\.amazonvideo\.com$|\.amazon\.com$|\.amazonalexa\.com$|\.a2z\.com$ # amazon_video
|
|
51
|
+
|
|
52
|
+
# Oracle Cloud
|
|
53
|
+
re:\.oraclecloud\.com$|\.oracle\.com$ # oracle_idcs
|
|
54
|
+
|
|
55
|
+
# Sonos (all subdomains; a superset of the connection-indexed pattern above)
|
|
56
|
+
re:\.sonos\.com$ # sonos
|
|
57
|
+
|
|
58
|
+
# Dropbox
|
|
59
|
+
re:\.dropbox\.com$|\.dropbox-dns\.com$ # dropbox
|
|
60
|
+
|
|
61
|
+
# Zoom
|
|
62
|
+
re:\.zoom\.us$ # zoom
|
|
63
|
+
|
|
64
|
+
# Mozilla
|
|
65
|
+
re:\.mozilla\.net$|\.mozilla\.org$|\.mozgcp\.net$ # mozilla
|
|
66
|
+
|
|
67
|
+
# Microsoft 365 / Windows
|
|
68
|
+
re:\.microsoft\.com$|\.office\.com$|\.live\.com$|\.skype\.com$|\.msidentity\.com$ # microsoft
|
|
69
|
+
re:\.windowsupdate\.com$ # windows_update
|
|
70
|
+
|
|
71
|
+
# Fastly CDN
|
|
72
|
+
re:\.fastly\.net$|\.fastly-edge\.com$ # fastly
|
|
73
|
+
|
|
74
|
+
# Piano / TinyPass (paywall SDK)
|
|
75
|
+
re:\.tinypass\.com$ # tinypass
|
|
76
|
+
|
|
77
|
+
# Atlassian / Jira / Confluence
|
|
78
|
+
re:\.atlassian\.com$|\.atlassian-dev\.net$|\.atl-paas\.net$ # atlassian
|
|
79
|
+
|
|
80
|
+
# AWS Route 53 nameservers
|
|
81
|
+
re:(^|\.)awsdns-\d+\.\w+(\.\w+)?$ # awsdns
|
|
82
|
+
re:ns-\d+\.awsdns # aws_ns
|
|
83
|
+
|
|
84
|
+
# AWS WAF
|
|
85
|
+
re:(^|\.)awswaf\.com$ # awswaf
|
|
86
|
+
|
|
87
|
+
# OVH nameservers
|
|
88
|
+
re:ns\d+\.ovh\.net$|dns\d+\.ovh\.net$ # ovh_ns
|
|
89
|
+
|
|
90
|
+
# UltraDNS
|
|
91
|
+
re:\.ultradns\.(net|com|org|info|co\.uk)$ # ultradns
|
|
92
|
+
|
|
93
|
+
# NS1 nameservers
|
|
94
|
+
re:\.nsone\.net$ # nsone
|
|
95
|
+
|
|
96
|
+
# Azure DNS nameservers
|
|
97
|
+
re:ns\d+-\d+\.azure-dns\.(com|net|org|info)$ # azure_ns
|
|
98
|
+
|
|
99
|
+
# Backblaze B2
|
|
100
|
+
re:pod-\d+-\d+-\d+\.backblaze\.com$|pod-\d{3}-\d{4}-\d{2}\.backblaze\.com$|ca\d+\.backblaze\.com$ # backblaze
|
|
101
|
+
|
|
102
|
+
# Microsoft Edge CDN
|
|
103
|
+
re:\.t-msedge\.net$|\.fb-t-msedge\.net$ # msedge
|
|
104
|
+
re:\.(ax|bx|ln)-\d+\.(ax|bx|ln)(-dc)?-msedge\.net$ # msedge_cdn
|
|
105
|
+
|
|
106
|
+
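# Generic nameserver hostname patterns: names starting with ns., ns1., ns1-, etc., or containing .awsdns-, .ultradns., .cloudns., and similar (matched under any parent domain)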
|
|
107
|
+
re:^ns\d*[-\.]|\.awsdns-|\.ultradns\.|\.cloudns\.|\.constellix\.|\.digicertdns\.|\.domaincontrol\. # nameservers
|
|
108
|
+
|
|
109
|
+
# AWS networking diagnostic infrastructure
|
|
110
|
+
re:\.prod\.diagnostic\.networking\.aws\.dev$ # diagnostic_dns
|
|
111
|
+
|
|
112
|
+
# Oracle DNS infrastructure
|
|
113
|
+
re:\.dns\.oraclecloud\.net$ # oracledns
|
|
114
|
+
|
|
115
|
+
# SentinelOne EDR
|
|
116
|
+
re:\.sentinelone\.net$ # sentinelone
|
|
117
|
+
|
|
118
|
+
# hCaptcha
|
|
119
|
+
re:\.hcaptcha\.com$ # hcaptcha
|
|
120
|
+
|
|
121
|
+
# Sentry error tracking
|
|
122
|
+
re:\.sentry\.io$ # sentry
|
|
123
|
+
|
|
124
|
+
# AT&T local
|
|
125
|
+
re:\.attlocal\.net$ # attlocal
|