loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,105 @@
1
+ """Be-like-water target resolution shared by CLI, runner, and exporters.
2
+
3
+ One function (``be_like_water``) decides whether a user-supplied target string
4
+ points to a FILE or a DIRECTORY, via a gated ladder. The trailing-slash gate is
5
+ evaluated BEFORE any disk check so an explicit trailing slash can never be
6
+ overridden by what happens to exist on disk.
7
+
8
+ A second helper (``resolve_path``) resolves a config-supplied path string
9
+ against the LH_ROOT base. ``effective_root`` reads the active root from env or
10
+ config. CLI-supplied paths never get root applied; only config-file values do.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import os
16
+ from pathlib import Path
17
+ from typing import Any, NamedTuple
18
+
19
+
20
class ResolvedTarget(NamedTuple):
    """Outcome of ``be_like_water``: a location plus its FILE/DIRECTORY verdict.

    Fields:
        path: In FILE mode this is the exact file to write. In DIRECTORY
            mode it is the directory, and the caller chooses an automatic
            file name inside it.
        is_file: ``True`` selects FILE mode, ``False`` selects DIRECTORY mode.
    """

    path: Path
    is_file: bool
31
+
32
+
33
def be_like_water(target: str) -> ResolvedTarget:
    """Resolve a target string to a (path, is_file) verdict via a gated ladder.

    The gates run in order and the first one that matches decides:

    Step 0 (gate): trailing path separator -> DIRECTORY, without consulting
        the disk. Explicit user intent beats whatever happens to exist
        under that name. ``/`` always counts; the platform's own
        separators (``os.sep`` / ``os.altsep``) count too, so a trailing
        backslash expresses the same intent where backslash is a
        separator. On platforms where ``/`` is the only separator nothing
        changes.

    Targets without a trailing separator conform to the disk:

    Step 1: existing regular file -> FILE (used as-is).
    Step 2: existing directory    -> DIRECTORY (caller auto-names inside).
    Step 3: anything else         -> FILE. The basename is taken as the file
        name whatever it looks like; no suffix inspection is done.

    Objects that are neither a regular file nor a directory (dangling
    symlinks, FIFOs, devices) are not special-cased: both probes fail for
    them, so they land in step 3 and any error surfaces when the caller
    actually opens the path.

    This function only reads the disk (``is_file`` / ``is_dir``); it never
    creates directories. Callers are expected to mkdir at write time.

    Args:
        target: Raw path string, NOT a ``Path``. ``Path`` drops trailing
            separators during normalisation, so the raw string has to be
            inspected before conversion.

    Returns:
        ``ResolvedTarget(path, is_file)`` with ``path`` already
        ``expanduser``'d.
    """
    # Collect every spelling of "separator" valid on this platform; "/" is
    # always included so behaviour on POSIX is exactly the historical one.
    separators = tuple(sep for sep in ("/", os.sep, os.altsep) if sep)

    path = Path(target).expanduser()

    # Gate 0 must look at the raw string: ``path`` has already lost the
    # trailing separator.
    if target.endswith(separators):
        return ResolvedTarget(path, is_file=False)
    if path.is_file():
        return ResolvedTarget(path, is_file=True)
    if path.is_dir():
        return ResolvedTarget(path, is_file=False)
    # Nonexistent or exotic object: treat as a file path to be created.
    return ResolvedTarget(path, is_file=True)
73
+
74
+
75
def resolve_path(value: str | None, root: str) -> str | None:
    """Anchor a config-supplied path string against the root base directory.

    The result is always a plain string (or ``None``), never a ``Path``:
    a trailing slash in ``value`` survives, so the result can still be
    handed to ``be_like_water`` with its directory intent intact.

    Resolution table::

        None / ""        -> None                      (key unset)
        absolute path    -> value unchanged           (root ignored)
        starts with "~"  -> os.path.expanduser(value) (root ignored)
        relative path    -> join(expanduser(root), value) when root is
                            non-empty, otherwise value unchanged

    No validation, URL handling or suffix inspection happens here. Callers
    with CLI-supplied paths pass ``root=""`` so they get ``~`` expansion
    while relative paths are left as given.
    """
    if not value:
        return None

    if os.path.isabs(value):
        resolved = value
    elif value.startswith("~"):
        resolved = os.path.expanduser(value)
    elif root:
        base = os.path.expanduser(root)
        resolved = os.path.join(base, value)
    else:
        resolved = value
    return resolved
101
+
102
+
103
def effective_root(config: dict[str, Any]) -> str:
    """Return the active root used to anchor config-supplied relative paths.

    Precedence:

    1. a non-empty ``LOGHUNTER_ROOT`` environment variable;
    2. a non-empty ``root`` value in the config's ``loghunter`` table;
    3. the empty string (no root).

    A ``loghunter`` table that is missing or ``None``, and a ``root`` value
    that is missing or ``None``, all yield ``""``, so the declared ``str``
    return type holds for callers that test the result with ``if root:`` or
    pass it to ``os.path`` functions.

    Args:
        config: Parsed configuration mapping.

    Returns:
        The root string, or ``""`` when none is set.
    """
    env_root = os.environ.get("LOGHUNTER_ROOT")
    if env_root:
        return env_root
    # ``or {}`` covers both "key absent" and "key present but None".
    section = config.get("loghunter") or {}
    return section.get("root") or ""
@@ -0,0 +1,392 @@
1
+ """Single-ownership source resolution for LogHunter.
2
+
3
+ Replaces a four-instance bug class:
4
+
5
+ - ``runner.run`` + ``run_digest`` used to overload ``None`` to mean both
6
+ "no override" and "scoped out — don't load this." Result: CLI scoping was
7
+ silently undone when the runner config-filled a None back, so
8
+ ``loghunter syslog ./flat.log`` also loaded configured Zeek
9
+ ``syslog*.log*`` on a default install.
10
+ - Analyze CLI, digest CLI, runner, and ``run_digest`` each carried their
11
+ own per-key config-fallback ladder, drifting in error strings and
12
+ semantics.
13
+ - The detect-path positional→source router was implemented three times
14
+ (analyze, single-detector, and a "wrong-source" hint scold), with a
15
+ ``detector_name == "syslog"`` content-sniff special case while DNS
16
+ routed by filename ``fnmatch``.
17
+
18
+ After this module:
19
+
20
+ - One owner of source resolution. ``resolve_sources`` is the analyze
21
+ resolver; ``resolve_digest_source`` is the digest resolver.
22
+ ``_resolve_one`` is the ONLY site where a source-dir string becomes a
23
+ resolved ``Path`` — CLI seams pass raw strings (or ``None``) and the
24
+ runner threads them straight in.
25
+ - ``None`` everywhere means strictly "no override." Scope is the only
26
+ scoping signal.
27
+ - One generic content-sniff router, ``route_positional_source``.
28
+
29
+ Layering: this module imports ``common.paths`` and ``common.loader``
30
+ (content sniffing). It MUST NOT import from ``loghunter.detectors`` —
31
+ ``route_positional_source`` takes an already-imported detector module
32
+ as a parameter; the CLI does the ``importlib`` work.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ from dataclasses import dataclass
38
+ from pathlib import Path
39
+ from typing import Any, Sequence
40
+
41
+ from loghunter.common.loader import sniff_format_detailed
42
+ from loghunter.common.paths import effective_root, resolve_path
43
+
44
# Every source-dir key the resolvers iterate over, in this fixed order.
_ALL_KEYS: tuple[str, ...] = (
    "zeek_dir",
    "syslog_dir",
    "pihole_dir",
    "cloudtrail_dir",
)
47
+
48
+
49
def _present(value: object) -> bool:
    """Tell whether an override value counts as "supplied".

    The rule is plain truthiness: ``None`` and the empty string are both
    "no override". The empty-string case matters because a bare
    ``--zeek-dir=`` reaches this layer as ``""``; counting it as present
    would suppress config fallback for that key even though the operator
    gave no usable value.

    This is the scalar check used by the digest resolver. The analyze
    resolver applies the same falsy-is-absent rule through
    ``_normalize_overrides``, which also accepts sequences.
    """
    if not value:
        return False
    return True
67
+
68
+
69
def _normalize_overrides(
    value: str | Path | Sequence[str | Path] | None,
) -> list[str | Path]:
    """Coerce an override argument into a list of its truthy members.

    Accepted shapes and their results:

    - ``None``                 -> ``[]`` (absent)
    - truthy ``str`` / ``Path`` -> ``[value]`` (one-element list)
    - falsy scalar (``""``)     -> ``[]`` (absent)
    - sequence                 -> its truthy members, original order kept,
      so ``["", "/x"]`` and ``["/x"]`` normalise to the same list

    No de-duplication is done here; inputs whose strings differ are kept
    as separate entries even if they would resolve to the same file.
    """
    if value is None:
        return []
    # Wrap a scalar so the scalar and sequence cases share one filter.
    members = [value] if isinstance(value, (str, Path)) else value
    return [member for member in members if member]
97
+
98
+
99
@dataclass(frozen=True)
class ResolvedSources:
    """Immutable result of ``resolve_sources``: one path list per source family.

    Every field holds the resolved ``Path`` inputs for that family. A
    bucket is filled either from explicit overrides or, when no override
    was given and the key is in scope, from the single configured value.
    An empty list means the family has no override and no usable config
    value, or was left out by scope.

    A scalar override ends up here as a one-element list.
    """

    zeek_dir: list[Path]
    syslog_dir: list[Path]
    pihole_dir: list[Path]
    cloudtrail_dir: list[Path]
119
+
120
+
121
@dataclass(frozen=True)
class DigestSource:
    """Immutable result of ``resolve_digest_source``: the one source to digest.

    Fields:
        source_key: Which source-dir key was chosen (``zeek_dir``,
            ``syslog_dir``, ``pihole_dir`` or ``cloudtrail_dir``).
        directory: The resolved ``Path`` to load from.
        feed: Feed identifier looked up from the (schema, source_key) pair:
            ``"zeek"``, ``"pihole"`` or ``"syslog"`` for the dns and syslog
            schemas, ``None`` for conn and cloudtrail.
    """

    source_key: str
    directory: Path
    feed: str | None
138
+
139
+
140
def _resolve_one(
    override: str | Path | None,
    cfg_value: Any,
    root: str,
) -> Path | None:
    """Turn one source-dir input into a ``Path``, or ``None`` when there is none.

    Two branches, selected by whether ``override`` is ``None``:

    - ``override`` given: it is treated as an explicit value and resolved
      with an empty root, i.e. ``~`` is expanded but no root prefix is
      applied. ``str()`` is applied first so ``Path`` overrides take the
      same route as strings.
    - ``override`` is ``None``: ``cfg_value`` is resolved against ``root``.

    If the chosen branch produces ``None`` or an empty string, the result
    is ``None``.
    """
    if override is None:
        text = resolve_path(cfg_value, root)
    else:
        text = resolve_path(str(override), "")

    if not text:
        return None
    return Path(text)
163
+
164
+
165
def resolve_sources(
    config: dict[str, Any],
    *,
    overrides: dict[str, str | Path | Sequence[str | Path] | None],
    scope: frozenset[str] | None,
) -> ResolvedSources:
    """Resolve the four source-dir buckets for an analyze run.

    Each key in ``_ALL_KEYS`` is decided independently, after its override
    has been normalised to a list:

    - override list non-empty: every member is resolved as an explicit
      value; members that resolve to nothing are dropped. Scope is not
      consulted, so an override outside ``scope`` still applies.
    - override list empty and (``scope is None`` or key in ``scope``): the
      single configured value for the key is resolved against the
      effective root, giving a one-element list, or an empty list when
      nothing usable is configured.
    - override list empty and key not in ``scope``: empty list; config is
      never consulted.

    Args:
        config: Parsed configuration mapping; the ``loghunter`` table
            supplies the fallback values.
        overrides: Per-key explicit values (scalar, sequence or ``None``).
            Missing keys are treated like ``None``.
        scope: Keys eligible for config fallback, or ``None`` for all.

    Returns:
        A ``ResolvedSources`` with one list per key.
    """
    loghunter_cfg = config.get("loghunter", {})
    root = effective_root(config)

    def bucket_for(key: str) -> list[Path]:
        explicit = _normalize_overrides(overrides.get(key))
        if explicit:
            candidates = (_resolve_one(item, None, root) for item in explicit)
            return [path for path in candidates if path is not None]
        if scope is not None and key not in scope:
            # Scoped out and not overridden: never config-filled.
            return []
        fallback = _resolve_one(None, loghunter_cfg.get(key), root)
        return [] if fallback is None else [fallback]

    return ResolvedSources(**{key: bucket_for(key) for key in _ALL_KEYS})
214
+
215
+
216
# Digest resolver lookup tables.
#
# _DIGEST_CANDIDATES: for each schema, the source keys it may read from.
# Tuple order is the fallback preference: the first key whose configured
# value resolves wins.
_DIGEST_CANDIDATES: dict[str, tuple[str, ...]] = {
    "conn": ("zeek_dir",),
    "dns": ("zeek_dir", "pihole_dir"),
    "syslog": ("syslog_dir", "zeek_dir"),
    "cloudtrail": ("cloudtrail_dir",),
}

# _DIGEST_FEED: feed identifier for each valid (schema, source key) pair.
_DIGEST_FEED: dict[tuple[str, str], str | None] = {
    ("conn", "zeek_dir"): None,
    ("dns", "zeek_dir"): "zeek",
    ("dns", "pihole_dir"): "pihole",
    ("syslog", "syslog_dir"): "syslog",
    ("syslog", "zeek_dir"): "zeek",
    ("cloudtrail", "cloudtrail_dir"): None,
}

# User-facing error strings. They are kept byte-for-byte as-is so the
# wording does not drift; do not "tidy" them. Only schemas with two
# candidates need a both-given (XOR) message.
_DIGEST_XOR_MSG: dict[str, str] = {
    "dns": "digest dns: cannot use both --zeek-dir and --pihole-dir",
    "syslog": "digest syslog: cannot use both zeek_dir and syslog_dir",
}

_DIGEST_NOT_CONFIGURED_MSG: dict[str, str] = {
    "conn": (
        "digest: zeek_dir not configured — "
        "pass a PATH or set [loghunter].zeek_dir in your config"
    ),
    "dns": (
        "digest dns: zeek_dir or pihole_dir not configured — "
        "pass a PATH, --zeek-dir/--pihole-dir, or set one in config"
    ),
    "syslog": (
        "digest syslog: no syslog source configured — "
        "pass a PATH, --zeek-dir, or set [loghunter].syslog_dir / "
        "[loghunter].zeek_dir in your config"
    ),
    "cloudtrail": (
        "digest cloudtrail: cloudtrail_dir not configured — "
        "pass a PATH, --cloudtrail-dir, or set "
        "[loghunter].cloudtrail_dir in your config"
    ),
}
261
+
262
+
263
def _wrong_key_msg(schema: str, key: str) -> str:
    """Build the error text for an override key the schema does not accept."""
    template = "digest {0}: {1} is not valid for the {0} schema"
    return template.format(schema, key)
266
+
267
+
268
def resolve_digest_source(
    config: dict[str, Any],
    schema: str,
    *,
    overrides: dict[str, str | Path | None],
) -> DigestSource:
    """Pick the single source a digest schema should read from.

    An override counts as supplied only when ``_present`` accepts it.
    Checks run in this order:

    1. wrong key: a supplied override for a key the schema does not list
       as a candidate raises ``ValueError`` (first offender in
       ``_ALL_KEYS`` order);
    2. XOR: more than one supplied override among the schema's candidates
       raises ``ValueError``;
    3. resolution: the supplied candidate override if there is one,
       otherwise the first candidate whose configured value resolves;
    4. not configured: nothing resolved raises ``ValueError``.

    Args:
        config: Parsed configuration mapping.
        schema: Digest schema name; must be a key of ``_DIGEST_CANDIDATES``
            (an unknown name fails with ``KeyError`` on the table lookup).
        overrides: Per-key explicit values; missing keys count as absent.

    Returns:
        The chosen key, its resolved directory and the matching feed.

    Raises:
        ValueError: For the wrong-key, XOR and not-configured cases, using
            the fixed message tables.
    """
    ladder = _DIGEST_CANDIDATES[schema]
    loghunter_cfg = config.get("loghunter", {})
    root = effective_root(config)

    # 1. Reject overrides aimed at a source this schema cannot use.
    stray = next(
        (
            key
            for key in _ALL_KEYS
            if key not in ladder and _present(overrides.get(key))
        ),
        None,
    )
    if stray is not None:
        raise ValueError(_wrong_key_msg(schema, stray))

    # 2. At most one candidate may be overridden.
    explicit = [key for key in ladder if _present(overrides.get(key))]
    if len(explicit) > 1:
        raise ValueError(_DIGEST_XOR_MSG[schema])

    # 3. Explicit override wins; otherwise walk the config ladder in order.
    chosen: str | None = None
    directory: Path | None = None
    if explicit:
        chosen = explicit[0]
        directory = _resolve_one(overrides[chosen], None, root)
    else:
        for key in ladder:
            found = _resolve_one(None, loghunter_cfg.get(key), root)
            if found is not None:
                chosen, directory = key, found
                break

    # 4. Nothing usable anywhere.
    if chosen is None or directory is None:
        raise ValueError(_DIGEST_NOT_CONFIGURED_MSG[schema])

    return DigestSource(
        source_key=chosen,
        directory=directory,
        feed=_DIGEST_FEED[(schema, chosen)],
    )
324
+
325
+
326
def route_positional_source(
    path: str | Path,
    *,
    detector_module: Any | None,
) -> str:
    """Choose the source-dir key a positional PATH should be routed to.

    The decision never looks at a detector's name, only at its metadata.

    With a detector module:
        - If the module has a non-empty ``REQUIRED_LOGS``, the answer is
          the ``"source"`` of its first entry (``"zeek_dir"`` when that
          entry has no ``"source"`` key). The path is not inspected.
        - Otherwise the accepted keys are the ``"source"`` values of
          ``OPTIONAL_LOGS`` (each defaulting to ``"zeek_dir"``), and the
          fallback is the first of them, or ``"zeek_dir"`` when the list
          is empty.

    Without a detector module (``None``):
        The accepted keys are ``_ALL_KEYS`` and the fallback is
        ``"zeek_dir"``.

    In both sniffing cases the path is then examined: a directory yields
    the fallback; an ``OSError`` from the content sniff yields the
    fallback; otherwise the sniffed origin is mapped to ``"<origin>_dir"``
    and returned if it is an accepted key, else the fallback is returned.

    Args:
        path: The positional argument; ``~`` is expanded before use.
        detector_module: An already-imported detector module, or ``None``.
            This module does not import detectors itself.

    Returns:
        A source-dir key string.
    """
    path_obj = Path(path).expanduser()

    # Work out which keys the sniff may select and what to fall back to.
    if detector_module is None:
        accepted: Sequence[str] = _ALL_KEYS
        fallback = "zeek_dir"
    else:
        required = getattr(detector_module, "REQUIRED_LOGS", [])
        if required:
            # ``.get`` rather than indexing: metadata lacking "source"
            # should not surface as a bare KeyError from the router.
            return required[0].get("source", "zeek_dir")
        accepted = [
            entry.get("source", "zeek_dir")
            for entry in getattr(detector_module, "OPTIONAL_LOGS", [])
        ]
        fallback = accepted[0] if accepted else "zeek_dir"

    # Directories are never sniffed.
    if path_obj.is_dir():
        return fallback

    try:
        sniffed = sniff_format_detailed(path_obj)
    except OSError:
        # Unreadable input degrades to the fallback.
        return fallback

    origin = sniffed.origin
    candidate = f"{origin}_dir" if origin else None
    if candidate in accepted:
        return candidate
    return fallback
@@ -0,0 +1,50 @@
1
+ # LogHunter — Flat Numeric Connection Allowlist
2
+ # ─────────────────────────────────────────────────────────────────────────────
3
+ #
4
+ # Format: one rule per line, whitespace-separated tokens. Order doesn't matter.
5
+ # # starts a comment (inline or full-line). Blank lines are ignored.
6
+ #
7
+ # Token types:
8
+ # IP address exact host: 192.0.2.10
9
+ # CIDR range subnet: 192.0.2.0/24
10
+ # Wildcard any host: *
11
+ # Port/proto leading colon: :443 :123/udp :*/tcp
12
+ #
13
+ # A rule may contain zero, one, or two IP/CIDR/wildcard fields plus an
14
+ # optional port/proto token. Rules with two IP fields match in either
15
+ # direction — src→dst and dst→src are both covered by a single rule.
16
+ #
17
+ # Missing fields match anything: omission is permission.
18
+ #
19
+ # !! BARE IP WARNING !!
20
+ # ─────────────────────────────────────────────────────────────────────────────
21
+ # A rule containing only an IP address with no port token suppresses ALL
22
+ # traffic involving that host across every detector. This is intentional and
23
+ # powerful — one bare IP rule silently drops every flow involving that host
24
+ # from all findings. Use scoped rules (with a port/proto token) unless you
25
+ # explicitly mean to exclude a host entirely.
26
+ # ─────────────────────────────────────────────────────────────────────────────
27
+ #
28
+ # Syntax examples (documentation only — not active rules):
29
+ #
30
+ # Two specific hosts, port 22, TCP only:
31
+ # 192.0.2.10 198.51.100.1 :22/tcp
32
+ #
33
+ # Any flow involving 192.0.2.10 on port 22, any protocol:
34
+ # 192.0.2.10 :22
35
+ #
36
+ # Entire subnet on port 443, any protocol:
37
+ # 192.0.2.0/24 :443
38
+ #
39
+ # Any host, UDP port 123 (NTP):
40
+ # * :123/udp
41
+ #
42
+ # Port only — suppress this port for every host:
43
+ # :6556
44
+ #
45
+ # Bare IP — suppresses ALL traffic involving this host (see warning above):
46
+ # 192.0.2.33
47
+ #
48
+ # ─────────────────────────────────────────────────────────────────────────────
49
+ # Add your rules below this line.
50
+ # ─────────────────────────────────────────────────────────────────────────────
@@ -0,0 +1,5 @@
1
+ # Device domain allowlist — Apple TV, Hue, smart home, gaming consoles, etc.
2
+ # One glob pattern per line. # for comments. Blank lines ignored.
3
+ # Regex also supported: prefix with "re:"
4
+ #
5
+ # This file covers consumer device infrastructure seen in home networks.
@@ -0,0 +1,5 @@
1
+ # Home lab domain allowlist — checkmk, Splunk, Pi-hole, Unbound, MRTG, etc.
2
+ # One glob pattern per line. # for comments. Blank lines ignored.
3
+ # Regex also supported: prefix with "re:"
4
+ #
5
+ # This file covers infrastructure commonly found in self-hosted environments.
@@ -0,0 +1,125 @@
1
+ # Universal domain allowlist — curated patterns for infrastructure seen universally
2
+ # across home lab and self-hosted environments.
3
+ #
4
+ # Format: one pattern per line. # inline or full-line comments. Blank lines ignored.
5
+ # Patterns prefixed with "re:" are treated as Python regex (re.search, case-insensitive).
6
+ # Patterns without the prefix are matched as fnmatch globs.
7
+ #
8
+ # This file ships with LogHunter. It covers reverse-DNS, NTP, CDN, cloud platforms,
9
+ # public nameserver infrastructure, and common SaaS endpoints that appear in virtually
10
+ # every environment.
11
+ #
12
+ # Site-specific known-good domains (your own infrastructure, local devices, internal
13
+ # services) belong in ~/.loghunter/allowlist.d/domains_user.txt — not here.
14
+
15
+ # Reverse DNS
16
+ re:\.in-addr\.arpa$ # reverse_dns
17
+ re:\.ip6\.arpa$ # ipv6_arpa
18
+
19
+ # mDNS / link-local
20
+ re:\.local$ # mdns_local
21
+ re:^_ # mdns_service
22
+
23
+ # UUID labels (e.g. device beacons)
24
+ re:[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12} # uuid
25
+
26
+ # NTP
27
+ re:pool\.ntp\.org$|\.ntp\.org$ # ntp
28
+
29
+ # Akamai CDN
30
+ re:\.akamai\.net$|\.akamaiedge\.net$|\.akamai\.com$|\.akamaihd\.net$|\.akadns\.net$|\.akamaized\.net$|\.akamaitechnologies\.com$ # akamai
31
+ re:\.akam\.net$|\.edgekey\.net$ # akamai_delegation
32
+
33
+ # Apple / iCloud
34
+ re:\.apple\.com$|\.icloud\.com$|\.aaplimg\.com$|\.apple-dns\.net$ # apple_cdn
35
+
36
+ # Amazon Web Services
37
+ re:\.amazonaws\.com$|\.awsglobalaccelerator\.com$|\.cloudfront\.net$ # aws
38
+
39
+ # Google
40
+ re:\.googlevideo\.com$|\.googleapis\.com$|\.gstatic\.com$|\.googleusercontent\.com$|\.googledomains\.com$|\.google\.com$ # google
41
+
42
+ # Microsoft Azure
43
+ re:\.azurefd\.net$|\.azureedge\.net$|\.cloudapp\.azure\.com$|\.azurewebsites\.net$|\.trafficmanager\.net$|\.windows\.net$ # azure
44
+ re:\.azure-dns\.com$ # azure_dns
45
+
46
+ # Sonos (connection-indexed hostnames)
47
+ re:conn-i-[0-9a-f]+\..*\.sonos\.com$ # sonos_ws
48
+
49
+ # Amazon / Alexa
50
+ re:\.amazonvideo\.com$|\.amazon\.com$|\.amazonalexa\.com$|\.a2z\.com$ # amazon_video
51
+
52
+ # Oracle Cloud
53
+ re:\.oraclecloud\.com$|\.oracle\.com$ # oracle_idcs
54
+
55
+ # Sonos
56
+ re:\.sonos\.com$ # sonos
57
+
58
+ # Dropbox
59
+ re:\.dropbox\.com$|\.dropbox-dns\.com$ # dropbox
60
+
61
+ # Zoom
62
+ re:\.zoom\.us$ # zoom
63
+
64
+ # Mozilla
65
+ re:\.mozilla\.net$|\.mozilla\.org$|\.mozgcp\.net$ # mozilla
66
+
67
+ # Microsoft 365 / Windows
68
+ re:\.microsoft\.com$|\.office\.com$|\.live\.com$|\.skype\.com$|\.msidentity\.com$ # microsoft
69
+ re:\.windowsupdate\.com$ # windows_update
70
+
71
+ # Fastly CDN
72
+ re:\.fastly\.net$|\.fastly-edge\.com$ # fastly
73
+
74
+ # Piano / TinyPass (paywall SDK)
75
+ re:\.tinypass\.com$ # tinypass
76
+
77
+ # Atlassian / Jira / Confluence
78
+ re:\.atlassian\.com$|\.atlassian-dev\.net$|\.atl-paas\.net$ # atlassian
79
+
80
+ # AWS Route 53 nameservers
81
+ re:(^|\.)awsdns-\d+\.\w+(\.\w+)?$ # awsdns
82
+ re:ns-\d+\.awsdns # aws_ns
83
+
84
+ # AWS WAF
85
+ re:(^|\.)awswaf\.com$ # awswaf
86
+
87
+ # OVH nameservers
88
+ re:ns\d+\.ovh\.net$|dns\d+\.ovh\.net$ # ovh_ns
89
+
90
+ # UltraDNS
91
+ re:\.ultradns\.(net|com|org|info|co\.uk)$ # ultradns
92
+
93
+ # NS1 nameservers
94
+ re:\.nsone\.net$ # nsone
95
+
96
+ # Azure DNS nameservers
97
+ re:ns\d+-\d+\.azure-dns\.(com|net|org|info)$ # azure_ns
98
+
99
+ # Backblaze B2
100
+ re:pod-\d+-\d+-\d+\.backblaze\.com$|pod-\d{3}-\d{4}-\d{2}\.backblaze\.com$|ca\d+\.backblaze\.com$ # backblaze
101
+
102
+ # Microsoft Edge CDN
103
+ re:\.t-msedge\.net$|\.fb-t-msedge\.net$ # msedge
104
+ re:\.(ax|bx|ln)-\d+\.(ax|bx|ln)(-dc)?-msedge\.net$ # msedge_cdn
105
+
106
+ # Generic nameserver hostname patterns (ns1., ns.*, awsdns-, etc.)
107
+ re:^ns\d*[-\.]|\.awsdns-|\.ultradns\.|\.cloudns\.|\.constellix\.|\.digicertdns\.|\.domaincontrol\. # nameservers
108
+
109
+ # AWS networking diagnostic infrastructure
110
+ re:\.prod\.diagnostic\.networking\.aws\.dev$ # diagnostic_dns
111
+
112
+ # Oracle DNS infrastructure
113
+ re:\.dns\.oraclecloud\.net$ # oracledns
114
+
115
+ # SentinelOne EDR
116
+ re:\.sentinelone\.net$ # sentinelone
117
+
118
+ # hCaptcha
119
+ re:\.hcaptcha\.com$ # hcaptcha
120
+
121
+ # Sentry error tracking
122
+ re:\.sentry\.io$ # sentry
123
+
124
+ # AT&T local
125
+ re:\.attlocal\.net$ # attlocal