loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,136 @@
1
+ """Log discovery, decompression, parsing, and timeframe filtering.
2
+
3
+ All file I/O for log data flows through this package. Detectors never open files
4
+ directly. This ``__init__`` re-exports the FULL public + private symbol surface
5
+ that previously lived in the single ``common/loader.py`` module, so every name
6
+ importable or monkeypatchable at ``loghunter.common.loader.<name>`` continues to
7
+ resolve (and stays settable) after the package split.
8
+
9
+ Submodule layout (acyclic; ``io``/``types``/``diagnostics`` are leaves):
10
+ - ``io`` — ``_open_log`` + path-normalization primitives.
11
+ - ``types`` — ``LoadResult`` / coverage / rotation-skip dataclasses, the
12
+ cross-frame window helper, column constants.
13
+ - ``diagnostics`` — log-type + warning/wording helpers.
14
+ - ``sniff`` — the digest recognizer cascade + the syslog content gate.
15
+ - ``windowing`` — ts filter, boundedness, the rotation-peek subsystem.
16
+ - ``discovery`` — per-family file discovery + dated-Zeek default window.
17
+ - ``pipeline`` — ``run_load`` + the ``_SOURCE_LOADERS`` registry + the
18
+ public ``load_*`` shims + registry-policy accessors.
19
+
20
+ Canonical connection record schema
21
+ ───────────────────────────────────
22
+ All conn log DataFrames returned by this package use these column names.
23
+ Detectors, runner, and matcher never reference Zeek-specific column names.
24
+
25
+ src — source IP (str)
26
+ dst — destination IP (str)
27
+ port — destination port (int)
28
+ proto — protocol: tcp / udp / icmp (str)
29
+ ts — unix epoch timestamp (float)
30
+ duration — connection duration in seconds (float, nullable)
31
+ bytes — originator bytes (int, nullable)
32
+ conn_state — connection state (str, nullable)
33
+ local_orig — bool (nullable)
34
+ """
35
+
36
+ from __future__ import annotations
37
+
38
+ # progress is re-exported so loghunter.common.loader.progress both RESOLVES and
39
+ # is SETTABLE — the load pipeline reads it through this package attribute (the
40
+ # call-time facade), so monkeypatching it here takes effect.
41
+ from loghunter.common.display import progress
42
+
43
+ from loghunter.common.loader.io import (
44
+ _open_log,
45
+ _safe_resolve,
46
+ _union_dedupe,
47
+ )
48
+ from loghunter.common.loader.types import (
49
+ _CLOUDTRAIL_COLUMNS,
50
+ _LOG_SUFFIXES,
51
+ _PIHOLE_COLUMNS,
52
+ _SYSLOG_COLUMNS,
53
+ CoverageTracker,
54
+ LoadResult,
55
+ RotationSkipInfo,
56
+ SourceCoverage,
57
+ _data_window,
58
+ )
59
+ from loghunter.common.loader.diagnostics import (
60
+ _cloudtrail_parse_warning,
61
+ _log_type,
62
+ _schema_warning,
63
+ _zeek_file_read_warning,
64
+ )
65
+ from loghunter.common.loader.sniff import (
66
+ _SNIFF_MAX_PEEK,
67
+ _SNIFF_ORIGIN,
68
+ _SNIFF_RECOGNIZERS,
69
+ _SYSLOG_SNIFF_BYTES,
70
+ SniffResult,
71
+ _is_ndjson,
72
+ _looks_like_syslog,
73
+ sniff_format,
74
+ sniff_format_detailed,
75
+ )
76
+ from loghunter.common.loader.windowing import (
77
+ _COMPRESSION_EXTS,
78
+ _DATE_RANK_BASE,
79
+ _EXPORT_WINDOW_RE,
80
+ _ROTATION_NUM_RE,
81
+ LoadWindow,
82
+ _apply_ts_filter,
83
+ _classify_rotation_name,
84
+ _group_order_conflict,
85
+ _missing_ts,
86
+ _peek_first_ts,
87
+ _rotation_base_and_index,
88
+ _rotation_windowed_files,
89
+ _select_group,
90
+ _strip_compression_ext,
91
+ apply_default_window,
92
+ is_bounded,
93
+ is_zeek_bounded,
94
+ )
95
+ from loghunter.common.loader.discovery import (
96
+ _DATE_DIR_RE,
97
+ _default_resolve_window,
98
+ _dir_has_regular_files,
99
+ _discover_syslog_files,
100
+ _file_matches_pattern,
101
+ _flat_default_floor,
102
+ _flat_resolve_window,
103
+ _stem_hostname,
104
+ _syslog_files,
105
+ _zeek_date_subdirs,
106
+ _zeek_dated_window,
107
+ _zeek_resolve_window,
108
+ discover_cloudtrail_files,
109
+ discover_files,
110
+ discover_zeek_files,
111
+ )
112
+ from loghunter.common.loader.pipeline import (
113
+ _NORMALIZER_MAP,
114
+ _SOURCE_LOADERS,
115
+ _cloudtrail_strategy_parse,
116
+ _events_from_whole_document,
117
+ _parse_lines,
118
+ _parse_ndjson_file,
119
+ _pihole_should_skip,
120
+ _pihole_strategy_parse,
121
+ _syslog_should_skip,
122
+ _syslog_strategy_parse,
123
+ _zeek_normalize,
124
+ _zeek_parse_from_lines,
125
+ _zeek_records_from_lines,
126
+ _zeek_strategy_parse,
127
+ SourceLoader,
128
+ load_cloudtrail,
129
+ load_logs,
130
+ load_pihole,
131
+ load_required_logs,
132
+ load_syslog,
133
+ load_zeek_log,
134
+ resolve_load_windows,
135
+ run_load,
136
+ )
@@ -0,0 +1,94 @@
1
+ """Loader warning/wording + log-type helpers (leaf utility module).
2
+
3
+ The non-dataclass glue James flagged out of ``types``: ``_log_type`` (glob →
4
+ canonical log type), ``_schema_warning`` (actionable missing-field message),
5
+ ``_zeek_file_read_warning`` / ``_cloudtrail_parse_warning`` (privacy-safe
6
+ per-file failure wording). Imports the parser schema constants only.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import gzip
12
+ import lzma
13
+ from pathlib import Path
14
+
15
+ import pandas as pd
16
+
17
+ from loghunter.parsers.zeek import (
18
+ _OPTIONAL_COLUMNS,
19
+ _REQUIRED_COLUMNS,
20
+ )
21
+
22
+
23
+ def _log_type(pattern: str) -> str | None:
24
+ """Return the canonical log type inferred from a loader glob pattern."""
25
+ if pattern.startswith("conn"):
26
+ return "conn"
27
+ if pattern.startswith("dns"):
28
+ return "dns"
29
+ if pattern.startswith("ssl"):
30
+ return "ssl"
31
+ if pattern.startswith("weird"):
32
+ return "weird"
33
+ if pattern.startswith("notice"):
34
+ return "notice"
35
+ if pattern.startswith("auth"):
36
+ return "auth"
37
+ if pattern.startswith("files"):
38
+ return "files"
39
+ if pattern.startswith("pihole"):
40
+ return "pihole"
41
+ if pattern.startswith("syslog"):
42
+ return "syslog"
43
+ return None
44
+
45
+
46
+ def _schema_warning(pattern: str, df: pd.DataFrame) -> str | None:
47
+ """Return an actionable warning when a loaded DataFrame lacks canonical fields."""
48
+ if df.empty:
49
+ return None
50
+
51
+ log_type = _log_type(pattern)
52
+ if log_type is None or log_type not in _REQUIRED_COLUMNS:
53
+ return None
54
+
55
+ opt = _OPTIONAL_COLUMNS.get(log_type, set())
56
+ missing = sorted((_REQUIRED_COLUMNS[log_type] - opt) - set(df.columns))
57
+ if not missing:
58
+ return None
59
+
60
+ if log_type == "conn":
61
+ return (
62
+ f"conn.log fields not found: {', '.join(missing)} — "
63
+ "is this a Zeek conn.log?"
64
+ )
65
+ if log_type == "dns":
66
+ return (
67
+ f"dns.log fields not found: {', '.join(missing)} — "
68
+ "is this a Zeek dns.log?"
69
+ )
70
+ if log_type == "syslog":
71
+ return (
72
+ f"syslog.log fields not found: {', '.join(missing)} — "
73
+ "is this a Zeek syslog.log?"
74
+ )
75
+ return None
76
+
77
+
78
+ def _zeek_file_read_warning(path: Path, exc: BaseException) -> str:
79
+ """Return a privacy-safe warning for a Zeek file that could not be read."""
80
+ if isinstance(exc, (EOFError, gzip.BadGzipFile, lzma.LZMAError)):
81
+ reason = "compressed file is incomplete or corrupt"
82
+ else:
83
+ reason = f"could not be read ({exc.__class__.__name__})"
84
+ return f"{path.name} could not be read — {reason}; skipping"
85
+
86
+
87
+ def _cloudtrail_parse_warning(path: Path) -> str:
88
+ """Return a privacy-safe warning for a CloudTrail file with malformed JSON.
89
+
90
+ Parallels ``_zeek_file_read_warning``'s tone for I/O failures; this variant
91
+ covers the parse-failure case (file readable, but the contents are not JSON
92
+ we can use).
93
+ """
94
+ return f"{path.name} could not be read — not valid JSON; skipping"
@@ -0,0 +1,335 @@
1
+ """File discovery for each source family + the dated-Zeek default window.
2
+
3
+ Per-family discovery (Zeek flat/dated, syslog content-gated, pihole glob,
4
+ CloudTrail recursive), the hostname stem helper, and the per-strategy
5
+ default-window resolvers (``_zeek_resolve_window`` / ``_flat_resolve_window`` /
6
+ ``_default_resolve_window`` — the ``SourceLoader.resolve_window`` bodies; the
7
+ dated-layout selection ``_zeek_dated_window`` reads ``_zeek_date_subdirs``).
8
+ Imports the rotation sort key + first-ts peek from ``windowing``, the union dedupe
9
+ from ``io``, and the syslog content gate from ``sniff``; none import discovery, so
10
+ the package stays acyclic.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import fnmatch
16
+ import math
17
+ import re
18
+ from datetime import date, datetime, timedelta, timezone
19
+ from pathlib import Path
20
+
21
+ from loghunter.common.loader.io import _union_dedupe
22
+ from loghunter.common.loader.sniff import _looks_like_syslog
23
+ from loghunter.common.loader.types import _LOG_SUFFIXES
24
+ from loghunter.common.loader.windowing import _peek_first_ts, _rotation_base_and_index
25
+
26
+
27
def discover_files(directory: Path, pattern: str) -> list[Path]:
    """Glob ``pattern`` inside ``directory`` and return the matches in sorted order."""
    matches = list(directory.glob(pattern))
    matches.sort()
    return matches
30
+
31
+
32
+ # Matches YYYY-MM-DD at the start of a Zeek log-rotation directory name.
33
+ # Suffix (e.g. -TSVPRE) is ignored for date extraction; see discover_zeek_files.
34
+ _DATE_DIR_RE = re.compile(r"^\d{4}-\d{2}-\d{2}")
35
+
36
+
37
+ def _zeek_date_subdirs(directory: Path) -> list[Path]:
38
+ """Return immediate child directories whose names begin with YYYY-MM-DD, sorted."""
39
+ result = []
40
+ for child in directory.iterdir():
41
+ if child.is_dir() and _DATE_DIR_RE.match(child.name):
42
+ result.append(child)
43
+ return sorted(result, key=lambda p: p.name)
44
+
45
+
46
+ def _file_matches_pattern(path: Path, pattern: str) -> bool:
47
+ """Return True if path's basename matches the glob pattern (single-file Zeek mode)."""
48
+ return fnmatch.fnmatch(path.name, pattern)
49
+
50
+
51
+ def _zeek_dated_window(
52
+ paths: list[Path], span: timedelta
53
+ ) -> tuple[datetime, datetime] | None:
54
+ """Compute the default analysis window for a union of Zeek inputs.
55
+
56
+ PURELY-DATED predicate: every input is a directory AND every directory has
57
+ non-empty YYYY-MM-DD subdirs. When that predicate holds, generalizes the
58
+ single-input selection across the union — gather every discovered date
59
+ subdir across all inputs, dedupe by date prefix, sort by date, select the
60
+ newest ``N = ceil(span_days)`` (min 1), and return ``00:00:00`` UTC of the
61
+ earliest selected → ``23:59:59`` UTC of the newest selected.
62
+
63
+ Returns ``None`` when ANY input is a file (file + dated-dir mix) or ANY
64
+ directory is flat (mixed/flat present) — the caller falls through to the
65
+ flat-layout default path, which computes the window post-load from the
66
+ COMBINED loaded Zeek frame's max-ts.
67
+
68
+ Single-input behavior is BYTE-IDENTICAL with the prior scalar helper: a
69
+ one-element list of a single dated dir runs the same selection
70
+ (``_zeek_date_subdirs(input)``, newest N, earliest-midnight →
71
+ newest-23:59:59).
72
+
73
+ Sparse archives behave correctly: subdirs ``[2026-01-01, 2026-01-05]``
74
+ with span=2d → BOTH selected, window Jan 1 → Jan 5. Cross-input
75
+ duplicate dates count once toward N (dedup by date prefix).
76
+ """
77
+ if not paths:
78
+ return None
79
+ all_date_dirs: list[Path] = []
80
+ for p in paths:
81
+ if not p.is_dir():
82
+ return None
83
+ date_dirs = _zeek_date_subdirs(p)
84
+ if not date_dirs:
85
+ return None
86
+ all_date_dirs.extend(date_dirs)
87
+ # Dedup by date prefix so N counts DISTINCT dates across the union, not
88
+ # duplicates contributed by multiple inputs carrying the same day.
89
+ seen: set[str] = set()
90
+ unique_by_date: list[Path] = []
91
+ for d in sorted(all_date_dirs, key=lambda p: p.name[:10]):
92
+ prefix = d.name[:10]
93
+ if prefix in seen:
94
+ continue
95
+ seen.add(prefix)
96
+ unique_by_date.append(d)
97
+ n = max(1, math.ceil(span.total_seconds() / 86400))
98
+ selected = unique_by_date[-n:]
99
+ earliest_date = date.fromisoformat(selected[0].name[:10])
100
+ newest_date = date.fromisoformat(selected[-1].name[:10])
101
+ since = datetime(earliest_date.year, earliest_date.month, earliest_date.day,
102
+ 0, 0, 0, tzinfo=timezone.utc)
103
+ until = datetime(newest_date.year, newest_date.month, newest_date.day,
104
+ 23, 59, 59, tzinfo=timezone.utc)
105
+ return since, until
106
+
107
+
108
+ # ─────────────────────────────────────────────────────────────────────────────
109
+ # Per-strategy default-window resolvers — the SourceLoader.resolve_window bodies.
110
+ #
111
+ # Signature is uniform: (strategy, dirs, pattern, span) -> (select_window,
112
+ # trim_span). resolve_load_windows (pipeline) loops eligible/unbounded/in-play
113
+ # families and calls each strategy's resolver (or _default_resolve_window when a
114
+ # strategy declares none). The owning ``strategy`` is passed so the flat resolver
115
+ # can reach ``strategy.discover`` for its candidate universe WITHOUT a source-name
116
+ # ladder or a registry import — that is why this hook takes ``strategy`` and the
117
+ # other strategy callables (parse/discover/should_skip) do not.
118
+ # ─────────────────────────────────────────────────────────────────────────────
119
+
120
+
121
+ def _default_resolve_window(
122
+ strategy, dirs: list[Path], pattern: str, span: timedelta
123
+ ) -> tuple[None, timedelta]:
124
+ """Universal default for a source that declares no ``resolve_window``: load the
125
+ family full and trim post-load to its own last-``span`` window. A new flat
126
+ source inherits this with zero runner edits — mirroring today's
127
+ ``default_window_eligible=True`` / ``window_select=None`` behavior.
128
+ """
129
+ return None, span
130
+
131
+
132
def _zeek_resolve_window(
    strategy, dirs: list[Path], pattern: str, span: timedelta
) -> tuple[tuple[datetime, datetime] | None, timedelta | None]:
    """Resolve the default window for the Zeek strategy.

    When ``_zeek_dated_window`` yields a ``(since, until)`` pair, that pair is
    the ``select_window`` and no post-load trim is requested (``None``).
    Otherwise the answer is ``(None, span)``: load everything, trim afterwards.
    ``strategy`` and ``pattern`` are part of the uniform resolver signature and
    are not used here.
    """
    window = _zeek_dated_window(dirs, span)
    if window is None:
        return None, span
    return window, None
143
+
144
+
145
def _flat_default_floor(
    strategy, dir_paths: list[Path], pattern: str, span: timedelta
) -> tuple[datetime, None] | None:
    """Conservative default-window floor for a flat family (syslog / pihole).

    Steps:
      1. For every input that ``is_dir()``, ask ``strategy.discover`` for its
         candidates (no window passed); explicit-file inputs are left out.
      2. Merge the per-directory lists through ``_union_dedupe``.
      3. Peek each candidate's first timestamp with ``_peek_first_ts``,
         ignoring candidates that yield ``None``.

    Returns ``(f_max - span, None)`` where ``f_max`` is the largest peeked
    first-timestamp (open-ended ``until``), or ``None`` when nothing could be
    peeked.
    """
    per_dir_candidates = []
    for d in dir_paths:
        if d.is_dir():
            per_dir_candidates.append(strategy.discover(d, pattern, None, None))

    first_stamps = []
    for candidate in _union_dedupe(per_dir_candidates):
        stamp = _peek_first_ts(candidate)
        if stamp is not None:
            first_stamps.append(stamp)

    if not first_stamps:
        return None
    floor = max(first_stamps) - span
    return (floor, None)
166
+
167
+
168
def _flat_resolve_window(
    strategy, dirs: list[Path], pattern: str, span: timedelta
) -> tuple[tuple[datetime, None] | None, timedelta]:
    """Resolve the default window for a flat strategy (syslog / pihole).

    The ``select_window`` is whatever ``_flat_default_floor`` returns — a
    conservative ``(floor, None)`` pair, or ``None`` when nothing was
    peekable. The ``trim_span`` is always ``span``, so the precise cut happens
    post-load in either case.
    """
    floor_window = _flat_default_floor(strategy, dirs, pattern, span)
    return floor_window, span
176
+
177
+
178
def discover_zeek_files(
    directory: Path,
    pattern: str,
    since: datetime | None = None,
    until: datetime | None = None,
) -> list[Path]:
    """Return Zeek log files matching pattern from a flat or dated-layout directory.

    Single-file mode (``directory.is_file()`` True): returns ``[directory]`` when
    its basename matches the pattern, else ``[]``.

    Flat layout (no YYYY-MM-DD immediate children): ``sorted(directory.glob(pattern))``.

    Dated layout: globs ``pattern`` within the selected subdirectories.
      * With a window (``since`` and/or ``until``): only YYYY-MM-DD subdirs whose
        date falls inside the window (compared by calendar date) are searched;
        non-date children are skipped entirely. A subdir whose name has the
        YYYY-MM-DD *shape* but is not a real calendar date (e.g. ``2026-13-45``)
        cannot be placed in a window and is skipped rather than raising
        ``ValueError``.
      * Without a window: all date subdirs plus all non-date subdirs are
        searched, deduplicated by resolved path so a symlink to a date dir is
        not loaded twice.

    Mixed-root policy: if any immediate child is a YYYY-MM-DD directory, dated
    layout is used and root-level files are not included.
    """
    if directory.is_file():
        return [directory] if _file_matches_pattern(directory, pattern) else []

    date_dirs = _zeek_date_subdirs(directory)

    if not date_dirs:
        # Flat layout — same result as discover_files.
        return sorted(directory.glob(pattern))

    # Dated layout.
    if since is not None or until is not None:
        # Prune by directory date; non-date children are never considered here.
        since_date = since.date() if since else None
        until_date = until.date() if until else None
        included: list[Path] = []
        for d in date_dirs:
            try:
                dir_date = date.fromisoformat(d.name[:10])
            except ValueError:
                # The name regex checks shape only; an impossible date cannot be
                # compared against the window, so treat it like a non-date child.
                continue
            if since_date is not None and dir_date < since_date:
                continue
            if until_date is not None and dir_date > until_date:
                continue
            included.append(d)
    else:
        # No window — include all date dirs and any non-date dirs.
        candidates: list[Path] = list(date_dirs)
        for child in directory.iterdir():
            if child.is_dir() and not _DATE_DIR_RE.match(child.name):
                candidates.append(child)
        seen: set[Path] = set()
        included = []
        # Dedup by realpath — non-date children are often symlinks pointing to
        # date dirs; resolving prevents double-loading regardless of iteration
        # order.
        for d in sorted(candidates, key=lambda p: p.name):
            rp = d.resolve()
            if rp not in seen:
                seen.add(rp)
                included.append(d)

    files: list[Path] = []
    for d in included:
        files.extend(sorted(d.glob(pattern)))
    return files
241
+
242
+
243
+ def _syslog_files(path: Path, pattern: str = "*.log*") -> list[Path]:
244
+ """Return flat-source files to process by FILENAME glob: ``[path]`` if a file,
245
+ else files matching ``pattern`` in the directory — ``._``-prefixed AppleDouble
246
+ sidecars dropped, numeric rotation order.
247
+
248
+ Pi-hole's discovery helper (and the Pi-hole pattern-mismatch presence check).
249
+ Syslog discovery is NOT this — it content-sniffs via ``_discover_syslog_files``
250
+ (RHEL/Fedora streams carry no ``.log`` suffix, and ``dnf.log`` etc. would be
251
+ mis-claimed by a filename glob). The ``pattern`` applies to DIRECTORY discovery
252
+ ONLY (pihole passes ``pihole*.log*`` to avoid grabbing non-pihole files in a
253
+ shared dir). An explicitly-named FILE always loads as named — the pattern is
254
+ NOT applied to it, so a content-routed Pi-hole file like ``events.log`` still
255
+ loads. The AppleDouble filter and numeric ordering apply to DIRECTORY discovery
256
+ only: the junk filter targets glob noise, not operator intent.
257
+ """
258
+ if path.is_file():
259
+ return [path]
260
+ files = [p for p in discover_files(path, pattern) if not p.name.startswith("._")]
261
+ return sorted(files, key=lambda p: _rotation_base_and_index(p.name))
262
+
263
+
264
+ def _discover_syslog_files(path: Path) -> list[Path]:
265
+ """The SINGLE syslog discovery universe — content-gated.
266
+
267
+ FILE input → ``[path]`` UNGATED (the explicit-named-file rail: an explicitly
268
+ named file always loads regardless of name; ``_syslog_should_skip`` still
269
+ guards a named NDJSON/Zeek-TSV at load). DIRECTORY input → all regular,
270
+ non-AppleDouble files that pass ``_looks_like_syslog``, in rotation order.
271
+ Non-recursive ``iterdir`` correctly skips the binary subdirs (``journal/``,
272
+ ``audit/``, ``sa/``) — they are not regular files.
273
+ """
274
+ if path.is_file():
275
+ return [path]
276
+ files = [
277
+ p for p in path.iterdir()
278
+ if p.is_file() and not p.name.startswith("._") and _looks_like_syslog(p)
279
+ ]
280
+ return sorted(files, key=lambda p: _rotation_base_and_index(p.name))
281
+
282
+
283
+ def _dir_has_regular_files(path: Path) -> bool:
284
+ """True iff ``path`` holds >=1 regular, non-AppleDouble file.
285
+
286
+ Cheap ``iterdir`` presence check for the syslog zero-accepted disclosure —
287
+ NO sniff, NO ``*.log*`` test, so an extensionless-only dir does not fall
288
+ through the disclosure silently.
289
+ """
290
+ try:
291
+ return any(p.is_file() and not p.name.startswith("._") for p in path.iterdir())
292
+ except OSError:
293
+ return False
294
+
295
+
296
def discover_cloudtrail_files(path: Path) -> list[Path]:
    """Discover CloudTrail event files under ``path``.

    * File → ``[path]``.
    * Neither file nor directory → ``[]``.
    * Directory → every regular file matched by a recursive ``*.json*`` glob,
      in sorted order, except files with a path component named exactly
      ``CloudTrail-Digest`` (per the original notes: integrity manifests, not
      events).

    The ``*.json*`` glob matches ``.json``, ``.json.gz``, ``.json.log`` and
    similar suffixed names at any depth below ``path``.
    """
    if path.is_file():
        return [path]
    if not path.is_dir():
        return []
    return [
        hit
        for hit in sorted(path.rglob("*.json*"))
        if hit.is_file() and "CloudTrail-Digest" not in hit.parts
    ]
320
+
321
+
322
+ def _stem_hostname(name: str) -> str:
323
+ """Strip log-file suffixes from a filename to derive a hostname stem.
324
+
325
+ Strips .log, .gz, and numeric rotation suffixes (.1, .42).
326
+ Dotted hostnames are preserved: host1.example.com.log → host1.example.com.
327
+ """
328
+ stem = name
329
+ while True:
330
+ suffix = Path(stem).suffix
331
+ if suffix in _LOG_SUFFIXES or (suffix and suffix[1:].isdigit()):
332
+ stem = Path(stem).stem
333
+ else:
334
+ break
335
+ return stem
@@ -0,0 +1,76 @@
1
+ """Low-level filesystem primitives for the loader package (leaf module).
2
+
3
+ Decompression-transparent file opening plus the path-normalization helpers
4
+ (``_safe_resolve`` / ``_union_dedupe``). These are the lowest leaf: every other
5
+ loader submodule may import from here, and this module imports nothing from the
6
+ package. ``_open_log`` is the SINGLE chokepoint every source flows through.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import bz2
12
+ import gzip
13
+ import lzma
14
+ from pathlib import Path
15
+
16
+
17
+ def _open_log(path: Path):
18
+ """Open a plain, gzip-, bzip2-, or xz-compressed log file for reading.
19
+
20
+ Suffix-gated (NOT magic-authoritative — the blob profiler is the magic-sniff
21
+ context; the loader routes by suffix, keeping the two contexts distinct).
22
+ `_open_log` is the SINGLE chokepoint every source flows through, so adding a
23
+ new format here closes the gap across conn/dns/syslog/pihole/cloudtrail/sniff
24
+ in one place.
25
+ """
26
+ if path.suffix == ".gz":
27
+ return gzip.open(path, "rt", encoding="utf-8", errors="replace")
28
+ if path.suffix == ".bz2":
29
+ return bz2.open(path, "rt", encoding="utf-8", errors="replace")
30
+ if path.suffix == ".xz":
31
+ return lzma.open(path, "rt", encoding="utf-8", errors="replace")
32
+ return path.open("r", encoding="utf-8", errors="replace")
33
+
34
+
35
+ def _safe_resolve(p: Path) -> Path:
36
+ """``p.resolve()``, falling back to ``p`` on ``OSError``.
37
+
38
+ The single realpath-normalization primitive the loader uses for dedupe,
39
+ the rotation-windowing explicit-file partition, and rotation grouping —
40
+ one consistent notion of "same path" across all three.
41
+ """
42
+ try:
43
+ return p.resolve()
44
+ except OSError:
45
+ return p
46
+
47
+
48
+ def _union_dedupe(per_input_files: list[list[Path]]) -> list[Path]:
49
+ """Concat per-input discovery results; dedupe by ``.resolve()`` preserving
50
+ first-seen order.
51
+
52
+ Single-ownership union point — the loader is the only place file lists
53
+ from multiple source-dir inputs are concatenated under one family. Dedup
54
+ by realpath catches:
55
+
56
+ - the same file appearing in two inputs (positional pointing at a file
57
+ that's ALSO inside a positional directory);
58
+ - symlink farms (a non-date child of a Zeek dated dir that resolves to a
59
+ date dir already in the list).
60
+
61
+ First-seen order preservation keeps user-visible file ordering predictable
62
+ (positionals before flag-supplied dirs, mirrors CLI bucket order).
63
+ Returns the deduped list; downstream accounting (``data_size_bytes`` sums,
64
+ warnings, ``load_*`` iteration) runs over this list so duplicates never
65
+ double-count.
66
+ """
67
+ seen: set[Path] = set()
68
+ out: list[Path] = []
69
+ for files in per_input_files:
70
+ for p in files:
71
+ key = _safe_resolve(p)
72
+ if key in seen:
73
+ continue
74
+ seen.add(key)
75
+ out.append(p)
76
+ return out