loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,1010 @@
1
+ """The uniform load pipeline — ``run_load`` + the ``_SOURCE_LOADERS`` registry.
2
+
3
+ The protected core: every detector source-family load flows through ``run_load``
4
+ (progress wrap, coverage tracking, default-window filtering, verbose-gated
5
+ wrong-family skips, read-corruption handling — written ONCE). A new format = one
6
+ ``SourceLoader`` in ``_SOURCE_LOADERS`` → it inherits the treatment by
7
+ construction and cannot diverge by happenstance. ``_open_log`` / ``progress`` are
8
+ reached through the package facade so test monkeypatches take effect here.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import gzip
14
+ import itertools
15
+ import json
16
+ import lzma
17
+ import sys
18
+ from dataclasses import dataclass
19
+ from datetime import datetime, timedelta
20
+ from pathlib import Path
21
+ from typing import Any, Callable
22
+
23
+ import pandas as pd
24
+
25
+ import loghunter.common.loader as _loader # facade: _open_log / progress patch-through (call-time only)
26
+ from loghunter.common.config import parse_window_span
27
+ from loghunter.common.loader.diagnostics import (
28
+ _cloudtrail_parse_warning,
29
+ _log_type,
30
+ _schema_warning,
31
+ _zeek_file_read_warning,
32
+ )
33
+ from loghunter.common.loader.discovery import (
34
+ _default_resolve_window,
35
+ _dir_has_regular_files,
36
+ _discover_syslog_files,
37
+ _flat_resolve_window,
38
+ _stem_hostname,
39
+ _syslog_files,
40
+ _zeek_resolve_window,
41
+ discover_cloudtrail_files,
42
+ discover_zeek_files,
43
+ )
44
+ from loghunter.common.loader.io import _safe_resolve, _union_dedupe
45
+ from loghunter.common.loader.sniff import _is_ndjson
46
+ from loghunter.common.loader.types import (
47
+ _CLOUDTRAIL_COLUMNS,
48
+ _PIHOLE_COLUMNS,
49
+ _SYSLOG_COLUMNS,
50
+ CoverageTracker,
51
+ LoadResult,
52
+ RotationSkipInfo,
53
+ SourceCoverage,
54
+ _data_window,
55
+ )
56
+ from loghunter.common.loader.windowing import (
57
+ LoadWindow,
58
+ _apply_ts_filter,
59
+ _missing_ts,
60
+ _rotation_windowed_files,
61
+ is_bounded,
62
+ )
63
+ from loghunter.parsers.cloudtrail import parse_event as _parse_cloudtrail_event
64
+ from loghunter.parsers.dnsmasq import parse_line as _parse_dnsmasq_line
65
+ from loghunter.parsers.syslog import parse_line as _parse_syslog_line
66
+ from loghunter.parsers.zeek import (
67
+ _normalize_conn_df,
68
+ _normalize_dns_df,
69
+ _normalize_zeek_syslog_df,
70
+ )
71
+ from loghunter.parsers.zeek_tsv import parse_tsv_log as _parse_tsv_log
72
+
73
+
74
@dataclass(frozen=True)
class SourceLoader:
    """Immutable per-source-family load strategy consumed by ``run_load``.

    The strategy carries only the thin per-family description; the uniform
    behavior (progress, coverage, windowing, corruption handling, verbose-
    gated wrong-family skip) lives in ``run_load``. Adding a format means
    adding one ``SourceLoader`` to ``_SOURCE_LOADERS``; it then inherits that
    treatment by construction.

    Fields:
    - ``discover``: window-aware file discovery for a single input path,
      called as ``discover(path, pattern, since, until)``.
    - ``mode``: ``"stream"`` means ``parse`` yields canonical row dicts;
      ``"frame"`` means ``parse`` returns a pre-filter DataFrame (Zeek,
      normalized later).
    - ``parse(line_iter, *, path, warnings)``: format-specific decode given
      the progress-wrapped line iterator AND the per-file context
      (``path`` for hostname stems / file identity; ``warnings`` as the
      content-parse warning sink).
    - ``ts_policy``: ``"keep"`` (NaN-ts rows bypass the window) or
      ``"drop"`` (NaN-ts rows discarded before windowing). Each registry
      entry carries a rationale comment at registration.
    - ``columns``: stream-mode column list used to keep empty frames
      column-stable; ``None`` for frame-mode. Frame-mode (Zeek) keeps a bare
      ``pd.DataFrame()`` on date-pruned / empty / all-filtered input, because
      Zeek's non-empty columns come from parse + normalize, not a static list.
    - ``should_skip(path)``: wrong-family guard returning a skip message
      (printed to stderr only under ``verbose=True``) or ``None`` to keep the
      file. The field itself may be ``None``, meaning never skip.
    - ``normalize(df, pattern)``: post-assembly normalize hook
      (``_NORMALIZER_MAP`` dispatch for Zeek; ``None`` for the flat loaders).
    - ``unit``: progress bar unit label.
    - ``window_select(files, since, until, *, verbose)``: OPTIONAL ordinal-
      rotation peek-prune (flat syslog / pihole) returning
      ``(selected, RotationSkipInfo)``. ``None`` (Zeek / CloudTrail) means
      discovered candidates are not windowed. Defaulted so non-flat registry
      entries and programmatic constructions do not churn.
    - ``default_window_eligible``: whether the auto-default window applies to
      this family.
    - ``resolve_window``: optional per-family default-window resolver.
    """

    # (path, pattern, since, until) -> files to load for that input path.
    discover: Callable[[Path, str, datetime | None, datetime | None], list[Path]]
    mode: str  # "stream" | "frame"
    # Called by run_load as parse(line_iter, path=..., warnings=...).
    parse: Callable[..., Any]
    ts_policy: str  # "keep" | "drop"
    # Column list for stream-mode frames; None for frame-mode strategies.
    columns: list[str] | None
    # Returns a skip message for a wrong-family file, or None to load it.
    should_skip: Callable[[Path], str | None] | None
    # Post-concat hook; run_load only calls it on a non-empty frame.
    normalize: Callable[[pd.DataFrame, str], pd.DataFrame] | None
    unit: str = " lines"
    # Signature: (files, since, until, *, verbose) -> (selected, RotationSkipInfo).
    # `Callable[...]` because the real callable has a keyword-only `verbose` arg
    # that the parameter-list form cannot express.
    window_select: Callable[..., tuple[list[Path], RotationSkipInfo]] | None = None
    # Whether the auto-default window applies to this family. Default True
    # (zeek/syslog/pihole). CloudTrail opts OUT — aws is baseline-relative
    # (novelty/weirdness needs full history), so the recent-slice auto-window
    # defeats it; explicit windows still apply (resolved before the default).
    default_window_eligible: bool = True
    # How this family resolves its default window: (strategy, dirs, pattern, span)
    # -> (select_window, trim_span). ``None`` = the universal default
    # (_default_resolve_window: load full + post-load trim). A source with special
    # temporal semantics (dated Zeek dirs, flat rotation-peek) declares its resolver
    # HERE, on the entry — zero runner edits. The owning strategy is passed in so a
    # resolver can reach ``strategy.discover`` without a registry import.
    resolve_window: Callable[..., tuple[Any, timedelta | None]] | None = None
138
+
139
+
140
def _zeek_records_from_lines(line_iter: Any) -> list[dict[str, Any]]:
    """Collect Zeek NDJSON records from an iterable of text lines.

    Shared by ``_parse_ndjson_file`` (path-driven NDJSON parse) and the Zeek
    strategy's NDJSON branch (line-iterator-driven).

    Args:
        line_iter: Any iterable yielding ``str`` lines (trailing newlines and
            surrounding whitespace are stripped here).

    Returns:
        The decoded JSON objects, in input order. A line is skipped when it is
        blank, starts with ``#``, is not valid JSON, decodes to something other
        than a JSON object, or has a missing / ``null`` ``ts`` field.
    """
    records: list[dict[str, Any]] = []
    for raw in line_iter:
        text = raw.strip()
        if not text or text.startswith("#"):
            continue
        try:
            record = json.loads(text)
        except json.JSONDecodeError:
            continue
        # A line can be valid JSON without being an object (``[1]``, ``42``,
        # ``null``). Such a value has no ``ts`` to read, so treat it like any
        # other unusable line instead of failing on ``.get``.
        if not isinstance(record, dict):
            continue
        if record.get("ts") is None:
            continue
        records.append(record)
    return records
160
+
161
+
162
def _zeek_parse_from_lines(line_iter: Any) -> pd.DataFrame:
    """Prefix-preserving NDJSON-vs-TSV dispatch for a Zeek line iterable.

    A one-line peek would discard the ``#separator`` / ``#fields`` / ``#types``
    directives that ``parse_tsv_log`` requires, so every line consumed while
    sniffing is kept in ``prefix`` and replayed in front of the remaining
    lines when the parser runs.

    Decision rule:
    - NDJSON when the FIRST non-blank, non-comment line starts with ``{``.
    - TSV when a ``#separator`` line was seen while scanning.
    - Bare empty ``DataFrame`` otherwise (header-only without ``#separator`` /
      empty / non-Zeek stub), keeping the bare-frame shape for those inputs.

    Args:
        line_iter: Iterable of text lines. It may be a one-shot iterator (file
            handle, generator) or a re-iterable (list, progress wrapper).

    Returns:
        The parsed, unfiltered DataFrame, or a bare ``pd.DataFrame()``.
    """
    # Pin ONE iterator up front. The sniff loop below and the later
    # ``itertools.chain`` each start iterating ``line_iter``; for a re-iterable
    # that would restart from the first line and feed the parser every
    # sniffed line twice. With a single iterator the chain resumes exactly
    # where the sniff stopped.
    line_iter = iter(line_iter)

    prefix: list[str] = []
    is_ndjson: bool | None = None
    has_separator = False
    for line in line_iter:
        prefix.append(line)
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("#"):
            if stripped.startswith("#separator"):
                has_separator = True
            continue
        # First non-blank, non-comment line decides NDJSON.
        is_ndjson = stripped.startswith("{")
        break

    # Sniffed lines first, then whatever the iterator has not yielded yet.
    rest = itertools.chain(prefix, line_iter)
    if is_ndjson:
        return pd.DataFrame(_zeek_records_from_lines(rest))
    if has_separator:
        return _parse_tsv_log(rest)
    # Header-only / empty / non-Zeek stub — bare empty frame (preserved).
    return pd.DataFrame()
204
+
205
+
206
def _parse_ndjson_file(path: Path, show_progress: bool = True) -> pd.DataFrame:
    """Read one Zeek NDJSON log and return its records as an unfiltered DataFrame.

    The file is opened through the loader facade (``_loader._open_log``) and
    its lines are wrapped by ``_loader.progress`` before being decoded by
    ``_zeek_records_from_lines``. No time-window filtering happens here.
    """
    progress_options = {
        "desc": f"loaded {path.name}",
        "show_progress": show_progress,
        "unit": " lines",
    }
    with _loader._open_log(path) as handle:
        decoded = _zeek_records_from_lines(_loader.progress(handle, **progress_options))
    return pd.DataFrame(decoded)
217
+
218
+
219
def _parse_lines(lines: list[str]) -> list[dict[str, Any]]:
    """Decode NDJSON lines, ignoring blanks, ``#`` header lines and bad JSON.

    Every successfully decoded value is kept as-is, in input order; lines that
    fail to decode are dropped without a warning.
    """
    parsed: list[dict[str, Any]] = []
    for raw in lines:
        text = raw.strip()
        if text and not text.startswith("#"):
            try:
                value = json.loads(text)
            except json.JSONDecodeError:
                continue
            parsed.append(value)
    return parsed
231
+
232
+
233
def load_zeek_log(
    path: Path,
    since: datetime | None = None,
    until: datetime | None = None,
    show_progress: bool = True,
) -> pd.DataFrame:
    """Load a single Zeek NDJSON log file into a time-filtered DataFrame.

    The file is parsed by ``_parse_ndjson_file`` and the result is passed
    through ``_apply_ts_filter`` together with ``since`` / ``until``.
    Decompression, if any, is whatever ``_open_log`` provides for the path.
    """
    unfiltered = _parse_ndjson_file(path, show_progress=show_progress)
    return _apply_ts_filter(unfiltered, since, until)
247
+
248
+
249
def _events_from_whole_document(
    text: str,
    path: Path,
    _warnings: list[str] | None,
) -> list[dict]:
    """Decode ``text`` as ONE JSON document and return the events it holds.

    Recognised shapes:
    - ``{"Records": [...]}`` envelope: the dict members of the list;
    - bare ``[...]`` list: its dict members;
    - any other ``{...}`` object: treated as a single event.

    Undecodable text, or a document that is neither an object nor a list,
    yields ``[]`` and (when a sink is supplied) appends
    ``_cloudtrail_parse_warning(path)`` to ``_warnings``.
    """

    def _reject() -> list[dict]:
        # Single place for the "not a usable document" outcome.
        if _warnings is not None:
            _warnings.append(_cloudtrail_parse_warning(path))
        return []

    try:
        document = json.loads(text)
    except json.JSONDecodeError:
        return _reject()

    if isinstance(document, list):
        candidates = document
    elif isinstance(document, dict):
        envelope = document.get("Records")
        if not isinstance(envelope, list):
            # Bare single-event dict.
            return [document]
        candidates = envelope
    else:
        return _reject()

    return [event for event in candidates if isinstance(event, dict)]
279
+
280
+
281
+ # Zeek normalization lives in loghunter.parsers.zeek; loader keeps dispatch here.
282
+ # Covers Zeek NDJSON formats only. Syslog is handled by load_syslog() — see parsers/syslog.py.
283
+
284
+ # Map from log type → normalizer function. Add an entry here (alongside a new
285
+ # _normalize_*_df function) when implementing each new Zeek log source.
286
# Keys are the log-type strings that ``_zeek_normalize`` obtains from
# ``_log_type(pattern)``; a log type with no entry is returned un-normalized.
_NORMALIZER_MAP: dict[str, Callable[[pd.DataFrame], pd.DataFrame]] = {
    "conn": _normalize_conn_df,
    "dns": _normalize_dns_df,
    "syslog": _normalize_zeek_syslog_df,
}
291
+
292
+
293
+ # ─────────────────────────────────────────────────────────────────────────────
294
+ # run_load — the uniform load pipeline + per-source SourceLoader strategies
295
+ #
296
+ # Every detector source-family load flows through ``run_load``: progress
297
+ # wrapping, coverage tracking, default-window filtering, verbose-gated
298
+ # wrong-family skips, and read-corruption handling are written ONCE. A new
299
+ # format = one ``SourceLoader`` in ``_SOURCE_LOADERS`` — it inherits the
300
+ # treatment by construction and cannot diverge by happenstance.
301
+ #
302
+ # Stream strategies (syslog / pihole / cloudtrail) yield canonical row dicts;
303
+ # frame strategies (zeek) return a pre-window DataFrame that the pipeline
304
+ # windows + normalises. NaN-ts policy is declared per strategy:
305
+ # ``ts_policy="drop"`` (zeek + cloudtrail — unparseable timestamps are not
306
+ # trustworthy data) vs ``ts_policy="keep"`` (syslog + pihole — RFC 3164's
307
+ # year-guess can lose a timestamp without making the LINE less useful).
308
+ # ─────────────────────────────────────────────────────────────────────────────
309
+
310
+
311
def _zeek_strategy_parse(line_iter, *, path, warnings):  # noqa: ARG001 - uniform contract
    """Frame-mode ``parse`` hook for Zeek.

    Delegates to ``_zeek_parse_from_lines`` (prefix-preserving NDJSON-vs-TSV
    dispatch). ``path`` and ``warnings`` exist only to satisfy the uniform
    strategy signature and are never read: unparseable content comes back as
    a bare DataFrame rather than as a warning.
    """
    frame = _zeek_parse_from_lines(line_iter)
    return frame
319
+
320
+
321
def _zeek_normalize(df: pd.DataFrame, pattern: str) -> pd.DataFrame:
    """Run the normaliser registered for ``pattern``'s log type, if there is one.

    The log type comes from ``_log_type(pattern)``; when ``_NORMALIZER_MAP``
    has no entry for it, ``df`` is returned untouched.
    """
    normalizer = _NORMALIZER_MAP.get(_log_type(pattern))
    if normalizer is None:
        return df
    return normalizer(df)
327
+
328
+
329
def _syslog_strategy_parse(line_iter, *, path, warnings):  # noqa: ARG001
    """Stream-mode ``parse`` hook for flat syslog: yield canonical row dicts.

    Each line (minus trailing newlines) goes through ``_parse_syslog_line``;
    lines it rejects (``None``) are skipped.

    Host: the parser's ``host`` value is used unless it is the literal
    ``"unknown"``, in which case the filename-derived host
    (``_stem_hostname(path.name)``) is substituted.

    Timestamp: ``ts`` becomes float seconds via ``.timestamp()``, or
    ``float('nan')`` when the parser returned no timestamp. The keep/drop
    decision for NaN rows is made by the pipeline, not here.

    ``warnings`` is part of the uniform contract and is not used.
    """
    fallback_host = _stem_hostname(path.name)
    for raw_line in line_iter:
        parsed = _parse_syslog_line(raw_line.rstrip("\n"))
        if parsed is None:
            continue
        content_host = parsed["host"]
        when = parsed["ts"]
        yield {
            "ts": float("nan") if when is None else when.timestamp(),
            "host": fallback_host if content_host == "unknown" else content_host,
            "program": parsed["program"],
            "raw": parsed["raw"],
            "message": parsed["message"],
        }
356
+
357
+
358
def _pihole_strategy_parse(line_iter, *, path, warnings):  # noqa: ARG001
    """Stream-mode ``parse`` hook for Pi-hole (dnsmasq) logs.

    Each line (minus trailing newlines) goes through ``_parse_dnsmasq_line``;
    rejected lines (``None``) are skipped. The parser's record dict is updated
    in place and yielded: ``host`` is always overwritten with the
    filename-derived host, and ``ts`` is replaced by float seconds or
    ``float('nan')`` when no timestamp was parsed.

    ``warnings`` is part of the uniform contract and is not used.
    """
    file_host = _stem_hostname(path.name)
    for raw_line in line_iter:
        entry = _parse_dnsmasq_line(raw_line.rstrip("\n"))
        if entry is None:
            continue
        entry["host"] = file_host
        when = entry["ts"]
        if when is None:
            entry["ts"] = float("nan")
        else:
            entry["ts"] = when.timestamp()
        yield entry
373
+
374
+
375
def _cloudtrail_strategy_parse(line_iter, *, path, warnings):
    """Stream-mode ``parse`` hook for CloudTrail: sniff the first line, then dispatch.

    All branches read from the SAME ``line_iter`` the pipeline handed in (the
    file handle is never re-read), so a wrapping progress bar counts real
    input lines.

    Dispatch on the first non-blank line:
    - not valid JSON on its own: treated as the start of a pretty-printed
      multi-line document; the rest of the input is joined and decoded whole;
    - a JSON object containing ``"Records"``: envelope; likewise decoded whole;
    - any other JSON object: NDJSON; that object is the first event and
      each following non-blank line is decoded on its own (undecodable lines
      and non-object values are skipped silently);
    - a JSON list: a one-line bare list; its object members are the events
      and anything after the first line is ignored;
    - any other JSON value: a parse failure, recorded via
      ``_cloudtrail_parse_warning(path)`` when a ``warnings`` sink exists.

    Every event goes through ``_parse_cloudtrail_event``; ``None`` results are
    not yielded. Whole-document decode failures are reported by
    ``_events_from_whole_document`` through the same ``warnings`` sink.
    """

    def _rows(events):
        # Convert events to rows lazily, dropping the ones the parser rejects.
        for event in events:
            parsed = _parse_cloudtrail_event(event)
            if parsed is not None:
                yield parsed

    first_line = None
    for candidate in line_iter:
        if candidate.strip():
            first_line = candidate
            break
    if first_line is None:
        # Empty or all-blank input: nothing to yield.
        return

    try:
        head = json.loads(first_line)
    except json.JSONDecodeError:
        # First line is a fragment of a pretty-printed multi-line document.
        whole_document = True
    else:
        # Envelope: the remainder belongs to the same document.
        whole_document = isinstance(head, dict) and "Records" in head

    if whole_document:
        full_text = first_line + "".join(line_iter)
        yield from _rows(_events_from_whole_document(full_text, path, warnings))
        return

    if isinstance(head, dict):
        # NDJSON: the sniffed object is itself the first event (do NOT drop
        # it); then stream the rest, silently skipping undecodable lines.
        yield from _rows([head])
        for raw in line_iter:
            text = raw.strip()
            if not text:
                continue
            try:
                value = json.loads(text)
            except json.JSONDecodeError:
                continue
            if isinstance(value, dict):
                yield from _rows([value])
        return

    if isinstance(head, list):
        # Bare-list one-line document. Any trailing content is malformed; the
        # JSON value is the document.
        yield from _rows(item for item in head if isinstance(item, dict))
        return

    # First line is a JSON primitive — not a valid CloudTrail event shape.
    # Treat as a parse failure.
    if warnings is not None:
        warnings.append(_cloudtrail_parse_warning(path))
    return
451
+
452
+
453
def _syslog_should_skip(path: Path) -> str | None:
    """Wrong-family guard for ``syslog_dir`` files.

    Returns a skip message when the file looks like NDJSON (``_is_ndjson``)
    or when any of its first 8 lines starts with the Zeek TSV ``#separator``
    directive; returns ``None`` when the file should be loaded. Whether the
    message is printed is decided by the pipeline (``verbose=True`` only).
    """
    if _is_ndjson(path):
        return f"load_syslog: skipping {path.name} — looks like NDJSON, not syslog"

    # Only the head of the file is inspected; at most 8 lines are read.
    with _loader._open_log(path) as handle:
        leading = [line for _, line in zip(range(8), handle)]

    for line in leading:
        if line.startswith("#separator"):
            return (
                f"load_syslog: skipping {path.name} — looks like Zeek TSV, "
                "not flat syslog (Zeek logs belong in zeek_dir)"
            )
    return None
471
+
472
+
473
def _pihole_should_skip(path: Path) -> str | None:
    """Wrong-family guard for ``pihole_dir`` files: NDJSON only.

    Returns a skip message when ``_is_ndjson(path)`` is true, otherwise
    ``None``. Unlike the syslog guard there is deliberately no Zeek-TSV
    check here; adding one would change behavior.
    """
    looks_like_ndjson = _is_ndjson(path)
    return (
        f"load_pihole: skipping {path.name} — looks like NDJSON, not dnsmasq"
        if looks_like_ndjson
        else None
    )
483
+
484
+
485
def _write_coverage(tracker: Any, sink: dict | None, is_empty: bool) -> None:
    """Store ``tracker.coverage(is_empty)`` under ``sink["coverage"]`` when both exist."""
    if sink is None:
        return
    summary = tracker.coverage(is_empty)
    if summary is not None:
        sink["coverage"] = summary


def run_load(
    strategy: SourceLoader,
    files: list[Path],
    pattern: str,
    since: datetime | None,
    until: datetime | None,
    *,
    show_progress: bool = True,
    verbose: bool = False,
    _warnings: list[str] | None = None,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """The uniform load pipeline. Owns progress wrap, coverage tracking,
    windowing, corruption rail, verbose-gated wrong-family skip.

    Does NOT own byte accounting — ``load_required_logs`` sums ``stat`` over
    the deduped ``files`` in its uniform loop.

    Args:
        strategy: Per-family description (mode, parse, ts policy, columns,
            optional skip guard and normalize hook).
        files: Files to read, in order. An empty list returns immediately.
        pattern: Passed to ``strategy.normalize`` (frame mode only).
        since, until: Optional window bounds. Stream mode compares row ``ts``
            against their ``.timestamp()`` values and keeps rows ON a bound;
            frame mode hands them to ``_apply_ts_filter``.
        show_progress: Forwarded to the progress wrapper.
        verbose: When true, wrong-family skip messages are printed to stderr.
        _warnings: Optional sink for read-corruption warnings (also passed to
            ``strategy.parse`` as its content-parse warning sink).
        _coverage: Optional dict; ``_coverage["coverage"]`` is set when the
            tracker returns a non-``None`` coverage value.

    Returns:
        Stream mode (syslog / pihole / cloudtrail): a DataFrame built from the
        kept row dicts with ``strategy.columns``; column-stable when empty.
        Frame mode (zeek): the concatenation of the non-empty windowed frames,
        normalised when a hook exists and the result is non-empty; a bare
        ``pd.DataFrame()`` when nothing survived.

    Read corruption (``EOFError``, ``gzip.BadGzipFile``, ``zlib.error``,
    ``lzma.LZMAError``, ``OSError``) raised while sniffing, opening or reading
    a file is not propagated: the file contributes zero rows and a
    ``_zeek_file_read_warning`` is appended to ``_warnings`` when supplied.
    """
    # Function-scope import: only the corruption rail below needs it.
    import zlib

    tracker = CoverageTracker()
    stream_mode = strategy.mode == "stream"

    if not files:
        _write_coverage(tracker, _coverage, True)
        if stream_mode:
            return pd.DataFrame(columns=strategy.columns)
        return pd.DataFrame()

    since_ts = since.timestamp() if since else None
    until_ts = until.timestamp() if until else None

    rows: list[dict] = []
    frames: list[pd.DataFrame] = []

    for path in files:
        file_rows: list[dict] = []
        try:
            # should_skip is inside the try so a corrupt compressed file
            # caught during its head sniff lands on the read-corruption rail,
            # not a raw traceback.
            if strategy.should_skip is not None:
                skip_msg = strategy.should_skip(path)
                if skip_msg is not None:
                    # Quiet default — print only under verbose.
                    # ``note_file_read`` is NOT fired for a skipped file so
                    # the coverage disclosure doesn't mislead.
                    if verbose:
                        print(skip_msg, file=sys.stderr)
                    continue
            tracker.note_file_read()
            with _loader._open_log(path) as fh:
                line_iter = _loader.progress(
                    fh,
                    desc=f"loaded {path.name}",
                    show_progress=show_progress,
                    unit=strategy.unit,
                )
                if stream_mode:
                    for row in strategy.parse(
                        line_iter, path=path, warnings=_warnings
                    ):
                        ts = row["ts"]
                        if _missing_ts(ts):
                            tracker.observe(None)
                            if strategy.ts_policy == "drop":
                                continue
                            # keep policy — NaN-ts row bypasses the window
                            # (an unfilterable line stays in the frame).
                        else:
                            tracker.observe(ts)
                            if since_ts is not None and ts < since_ts:
                                continue
                            if until_ts is not None and ts > until_ts:
                                continue
                        file_rows.append(row)
                        tracker.mark_kept()
                else:  # frame mode
                    pre = strategy.parse(
                        line_iter, path=path, warnings=_warnings
                    )
                    tracker.observe_frame(pre)
                    post = _apply_ts_filter(pre, since, until)
                    if not post.empty:
                        frames.append(post)
                        tracker.mark_kept()
        except (
            EOFError,
            gzip.BadGzipFile,
            zlib.error,
            lzma.LZMAError,
            OSError,
        ) as exc:
            # ``_open_log`` returns a lazy reader; corruption may surface only
            # at the trailer after many valid-looking lines. Discard the
            # per-file buffer so the warning is honest (a "skipped with
            # warning" file MUST contribute zero rows), and skip with the
            # standard read-warning. ``zlib.error`` is listed explicitly: a
            # damaged deflate stream raises it and it is not an ``OSError``.
            # Distinct from CloudTrail's content-parse warning rail
            # (``_cloudtrail_parse_warning``).
            # NOTE(review): tracker observations made for this file before the
            # failure are not rolled back — confirm that is intended.
            if _warnings is not None:
                _warnings.append(_zeek_file_read_warning(path, exc))
            continue
        if stream_mode:
            # Only a file that was read to the end contributes its rows.
            rows.extend(file_rows)

    if stream_mode:
        if not rows:
            _write_coverage(tracker, _coverage, True)
            return pd.DataFrame(columns=strategy.columns)
        _write_coverage(tracker, _coverage, False)
        return pd.DataFrame(rows, columns=strategy.columns)

    # Frame mode (Zeek): bare empty frame when nothing survived, no forced
    # columns. Zeek's non-empty columns come from parse + normalize.
    result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    if strategy.normalize is not None and not result.empty:
        result = strategy.normalize(result, pattern)
    _write_coverage(tracker, _coverage, result.empty)
    return result
624
+
625
+
626
+ # Source-family strategy registry. A new format = one entry here → inherits
627
+ # the run_load pipeline (progress, coverage, windowing, corruption handling,
628
+ # verbose-gated skip) by construction.
629
+ #
630
+ # ts_policy rationale per family:
631
+ # zeek + cloudtrail = drop. An unparseable Zeek timestamp / CloudTrail
632
+ # eventTime is not trustworthy data; drop before windowing.
633
+ # syslog + pihole = keep. RFC 3164's year-guess can lose a timestamp
634
+ # without making the LINE less useful (e.g. for drain3 templating /
635
+ # reboot detection); keep + bypass the window.
636
# Registry mapping a source-family key to its load strategy. Each value is
# the ``strategy`` argument that ``run_load`` consumes.
_SOURCE_LOADERS: dict[str, SourceLoader] = {
    # Zeek: frame mode — parse returns a DataFrame that run_load windows and
    # then normalises through ``_zeek_normalize``.
    "zeek_dir": SourceLoader(
        discover=discover_zeek_files,
        mode="frame",
        parse=_zeek_strategy_parse,
        ts_policy="drop",
        columns=None,
        should_skip=None,
        normalize=_zeek_normalize,
        # Dated dirs → precise window, no trim; flat / mixed → load full + trim.
        resolve_window=_zeek_resolve_window,
    ),
    # Flat syslog: stream mode; NaN-ts rows are kept.
    "syslog_dir": SourceLoader(
        # Content-gated discovery — the strategy lambda only adapts the
        # signature; _discover_syslog_files is the single discovery body.
        discover=lambda p, pattern, since, until: _discover_syslog_files(p),
        mode="stream",
        parse=_syslog_strategy_parse,
        ts_policy="keep",
        columns=_SYSLOG_COLUMNS,
        should_skip=_syslog_should_skip,
        normalize=None,
        window_select=_rotation_windowed_files,
        # Peek rotation candidates → conservative (floor, None) + post-load trim.
        resolve_window=_flat_resolve_window,
    ),
    # Pi-hole (dnsmasq): stream mode; NaN-ts rows are kept.
    "pihole_dir": SourceLoader(
        # The lambda adapts the signature: ``since`` / ``until`` are not
        # forwarded to ``_syslog_files``.
        discover=lambda p, pattern, since, until: _syslog_files(p, pattern),
        mode="stream",
        parse=_pihole_strategy_parse,
        ts_policy="keep",
        columns=_PIHOLE_COLUMNS,
        should_skip=_pihole_should_skip,
        normalize=None,
        window_select=_rotation_windowed_files,
        resolve_window=_flat_resolve_window,
    ),
    # CloudTrail: stream mode; NaN-ts rows are dropped; no wrong-family guard.
    "cloudtrail_dir": SourceLoader(
        # The lambda adapts the signature: only the path is forwarded.
        discover=lambda p, pattern, since, until: discover_cloudtrail_files(p),
        mode="stream",
        parse=_cloudtrail_strategy_parse,
        ts_policy="drop",
        columns=_CLOUDTRAIL_COLUMNS,
        should_skip=None,
        normalize=None,
        # aws is baseline-relative — opt CloudTrail OUT of the auto-default window
        # (an explicit --since/--until still narrows it).
        default_window_eligible=False,
    ),
}
686
+
687
+
688
def load_logs(
    directory: Path,
    pattern: str,
    since: datetime | None = None,
    until: datetime | None = None,
    _files: list[Path] | None = None,
    _warnings: list[str] | None = None,
    show_progress: bool = True,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """Load the Zeek logs matching ``pattern`` into one DataFrame.

    A thin wrapper that hands the ``zeek_dir`` strategy to ``run_load``. The
    signature is kept stable for existing callers.

    Args:
        directory: Where to discover Zeek files; only consulted when
            ``_files`` is ``None``.
        pattern: Zeek log pattern, used for discovery and forwarded to
            ``run_load``.
        since: Optional lower bound, forwarded to discovery and ``run_load``.
        until: Optional upper bound, forwarded to discovery and ``run_load``.
        _files: Pre-resolved file list. When given (even if empty) discovery
            is skipped entirely; the digest single-file bypass and the
            multi-positional dedupe rely on this.
        _warnings: Optional sink for per-file operational read failures.
        show_progress: Forwarded to ``run_load``.
        _coverage: Optional out-param dict. ``run_load`` stores a coverage
            record under the ``"coverage"`` key when its tracker yields one.

    Returns:
        The frame produced by ``run_load``. Verbose output is always off for
        this entry point.
    """
    loader = _SOURCE_LOADERS["zeek_dir"]
    if _files is None:
        candidates = discover_zeek_files(directory, pattern, since, until)
    else:
        # Caller supplied the exact file set — do not rediscover.
        candidates = _files
    return run_load(
        loader,
        candidates,
        pattern,
        since,
        until,
        show_progress=show_progress,
        verbose=False,
        _warnings=_warnings,
        _coverage=_coverage,
    )
722
+
723
+
724
def load_syslog(
    directory: Path,
    since: datetime | None = None,
    until: datetime | None = None,
    verbose: bool = False,
    _files: list[Path] | None = None,
    _warnings: list[str] | None = None,
    show_progress: bool = True,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """Load syslog input into a column-stable DataFrame.

    A thin wrapper that hands the ``syslog_dir`` strategy to ``run_load``.
    ``directory`` may be a directory (per-host files / flat file) or a single
    file. Files from another family (NDJSON, Zeek TSV) are rejected by the
    strategy's ``should_skip``, and the skip notice goes to stderr ONLY when
    ``verbose=True``. Rows without a parseable timestamp are KEPT and bypass
    the window. If nothing survives, the result is an empty frame that still
    carries ``_SYSLOG_COLUMNS``.

    Args:
        directory: Input to discover from; only consulted when ``_files`` is
            ``None``.
        since: Optional lower bound forwarded to ``run_load``.
        until: Optional upper bound forwarded to ``run_load``.
        verbose: Forwarded to ``run_load``; gates the skip notices.
        _files: Pre-resolved file list; when given, discovery is skipped.
        _warnings: Optional warning sink forwarded to ``run_load``.
        show_progress: Forwarded to ``run_load``.
        _coverage: Optional out-param dict forwarded to ``run_load``.

    Returns:
        The frame produced by ``run_load``.
    """
    loader = _SOURCE_LOADERS["syslog_dir"]
    if _files is None:
        candidates = _discover_syslog_files(directory)
    else:
        candidates = _files
    # Syslog has no detector glob, so the pattern slot is passed empty.
    return run_load(
        loader,
        candidates,
        "",
        since,
        until,
        show_progress=show_progress,
        verbose=verbose,
        _warnings=_warnings,
        _coverage=_coverage,
    )
750
+
751
+
752
def load_pihole(
    directory: Path,
    since: datetime | None = None,
    until: datetime | None = None,
    verbose: bool = False,
    _files: list[Path] | None = None,
    _warnings: list[str] | None = None,
    show_progress: bool = True,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """Load dnsmasq/Pi-hole logs into a column-stable DataFrame.

    A thin wrapper that hands the ``pihole_dir`` strategy to ``run_load``.
    Wrong-family NDJSON files are skipped; Zeek TSV is NOT (Pi-hole's
    wrong-family asymmetry is intentionally preserved). Rows without a
    parseable timestamp are KEPT and bypass the window. If nothing survives,
    the result is an empty frame that still carries ``_PIHOLE_COLUMNS``.

    Args:
        directory: Directory searched for ``pihole*.log*`` files; only
            consulted when ``_files`` is ``None``.
        since: Optional lower bound forwarded to ``run_load``.
        until: Optional upper bound forwarded to ``run_load``.
        verbose: Forwarded to ``run_load``.
        _files: Pre-resolved file list; when given, discovery is skipped.
        _warnings: Optional warning sink forwarded to ``run_load``.
        show_progress: Forwarded to ``run_load``.
        _coverage: Optional out-param dict forwarded to ``run_load``.

    Returns:
        The frame produced by ``run_load``.
    """
    loader = _SOURCE_LOADERS["pihole_dir"]
    if _files is None:
        candidates = _syslog_files(directory, "pihole*.log*")
    else:
        candidates = _files
    return run_load(
        loader,
        candidates,
        "",
        since,
        until,
        show_progress=show_progress,
        verbose=verbose,
        _warnings=_warnings,
        _coverage=_coverage,
    )
777
+
778
+
779
def load_cloudtrail(
    path: Path,
    since: datetime | None = None,
    until: datetime | None = None,
    verbose: bool = False,
    _files: list[Path] | None = None,
    _warnings: list[str] | None = None,
    show_progress: bool = True,
    _coverage: dict | None = None,
) -> pd.DataFrame:
    """Load CloudTrail event files into a canonical-schema DataFrame.

    A thin wrapper that hands the ``cloudtrail_dir`` strategy to ``run_load``.
    The strategy's ``parse`` keeps the single-iterator wire-shape sniff
    (NDJSON / envelope / bare-list). Events whose ``eventTime`` cannot be
    parsed are DROPPED before windowing. Bad files (compressed corruption)
    warn and are skipped; malformed-JSON content failures append
    ``_cloudtrail_parse_warning`` to ``_warnings``, a separate rail from the
    read-corruption ``_zeek_file_read_warning``. If nothing survives, the
    result is an empty frame that still carries ``_CLOUDTRAIL_COLUMNS``.

    Args:
        path: Input to discover from; only consulted when ``_files`` is
            ``None``.
        since: Optional lower bound forwarded to ``run_load``.
        until: Optional upper bound forwarded to ``run_load``.
        verbose: Kept for signature compatibility and forwarded to
            ``run_load``; the strategy declares no ``should_skip``, and
            CloudTrail's per-file content warnings travel through
            ``_warnings``, not stderr.
        _files: Pre-resolved file list; when given, discovery is skipped.
        _warnings: Optional warning sink forwarded to ``run_load``.
        show_progress: Forwarded to ``run_load``.
        _coverage: Optional out-param dict forwarded to ``run_load``.

    Returns:
        The frame produced by ``run_load``.
    """
    loader = _SOURCE_LOADERS["cloudtrail_dir"]
    if _files is None:
        candidates = discover_cloudtrail_files(path)
    else:
        candidates = _files
    return run_load(
        loader,
        candidates,
        "",
        since,
        until,
        show_progress=show_progress,
        verbose=verbose,
        _warnings=_warnings,
        _coverage=_coverage,
    )
811
+
812
+
813
def load_required_logs(
    needed_logs: dict[str, str],
    source_dirs: dict[str, list[Path]],
    since: datetime | None = None,
    until: datetime | None = None,
    verbose: bool = False,
    source_windows: dict[str, tuple[datetime | None, datetime | None]] | None = None,
    show_progress: bool = True,
) -> LoadResult:
    """Load every pattern a run plan needs and bundle data with load metadata.

    Args:
        needed_logs: Pattern → source-family map from the run plan.
        source_dirs: Source family (``zeek_dir`` / ``syslog_dir`` /
            ``pihole_dir`` / ``cloudtrail_dir``) → LIST of inputs, each a
            directory or an explicit file, gathered from positionals, the
            ``--<family>-dir`` flag, and config fallback. Discovery runs per
            input; the results are unioned and deduped via ``_union_dedupe``
            before loading.
        since: Default lower bound for every source.
        until: Default upper bound for every source.
        verbose: Forwarded to the window selector and to ``run_load``.
        source_windows: Per-source ``(since, until)`` overrides. This lets the
            runner window one family (e.g. Zeek) while leaving syslog/pihole
            unwindowed when the operator gave no explicit timeframe.
        show_progress: Forwarded to ``run_load``.

    Returns:
        A ``LoadResult`` holding the per-pattern frames, non-empty record
        counts, the data window, accumulated warnings, total bytes of the
        selected files, per-pattern coverage, and per-pattern rotation skips.

    Raises:
        ValueError: A pattern maps to a configured source family that has no
            registered loader.
    """
    per_source_windows = source_windows or {}
    frames: dict[str, pd.DataFrame] = {}
    row_counts: dict[str, int] = {}
    notes: list[str] = []
    coverage_by_pattern: dict[str, SourceCoverage] = {}
    skips_by_pattern: dict[str, RotationSkipInfo] = {}
    total_bytes = 0

    for pattern, source in needed_logs.items():
        inputs = source_dirs.get(source) or []
        if not inputs:
            notes.append(f"{source} not configured — {pattern} not loaded")
            continue

        loader = _SOURCE_LOADERS.get(source)
        if loader is None:
            raise ValueError(
                f"unknown source key {source!r} for pattern {pattern!r} — "
                "no loader is registered for it"
            )

        win_since, win_until = per_source_windows.get(source, (since, until))
        rotation_skip: RotationSkipInfo | None = None

        if loader.window_select is not None:
            # Flat families (syslog / pihole): directory-discovered candidates
            # go through the ordinal-rotation peek-prune. Files the operator
            # named explicitly are partitioned out first — they are kept away
            # from both the windowing input and the skip count, and are always
            # loaded.
            explicit_files = [p for p in inputs if p.is_file()]
            directories = [p for p in inputs if p.is_dir()]
            discovered = _union_dedupe([
                loader.discover(d, pattern, win_since, win_until)
                for d in directories
            ])

            # Silent-miss disclosure, which differs by source. Syslog
            # discovery is content-gated, so an empty result from a directory
            # that does hold files means nothing read as RFC 3164; for pihole
            # it means a filename-pattern mismatch. A security tool must not
            # swallow either case. Explicit files never reach this check.
            if directories and not discovered:
                if source == "syslog_dir":
                    # Cheap presence check only (no sniff, no `*.log*` test),
                    # so an extensionless-only directory is still disclosed.
                    # Report directory paths, never a per-file name list.
                    with_files = [
                        d for d in directories if _dir_has_regular_files(d)
                    ]
                    if with_files:
                        names = ", ".join(str(d) for d in with_files)
                        notes.append(
                            f"syslog_dir: nothing in {names} looks like syslog "
                            f"(RFC 3164) — nothing loaded (check the path)."
                        )
                elif any(_syslog_files(d, "*.log*") for d in directories):
                    notes.append(
                        f"{source}: directory has .log files but none match "
                        f"{pattern!r} — not loaded (check the log file naming)."
                    )

            # A candidate that is also an explicitly named file must not be
            # subject to window pruning.
            named = {_safe_resolve(p) for p in explicit_files}
            windowable = [
                p for p in discovered if _safe_resolve(p) not in named
            ]
            if (win_since or win_until) and windowable:
                kept, rotation_skip = loader.window_select(
                    windowable, win_since, win_until, verbose=verbose
                )
            else:
                kept = windowable
            files = _union_dedupe([explicit_files, kept])
        else:
            # Zeek / CloudTrail: discover over EVERY input, file or directory.
            # The per-file pattern match inside discover_zeek_files is what
            # routes multi-positional Zeek inputs to the right pattern, so it
            # must NOT be bypassed.
            files = _union_dedupe([
                loader.discover(p, pattern, win_since, win_until)
                for p in inputs
            ])

        for candidate in files:
            if candidate.is_file():
                total_bytes += candidate.stat().st_size

        coverage_slot: dict = {}
        frame = run_load(
            loader,
            files,
            pattern,
            win_since,
            win_until,
            show_progress=show_progress,
            verbose=verbose,
            _warnings=notes,
            _coverage=coverage_slot,
        )

        if rotation_skip is not None:
            skips_by_pattern[pattern] = rotation_skip

        frames[pattern] = frame
        if not frame.empty:
            row_counts[pattern] = len(frame)

        if "coverage" in coverage_slot:
            coverage_by_pattern[pattern] = coverage_slot["coverage"]

        schema_note = _schema_warning(pattern, frame)
        if schema_note:
            notes.append(schema_note)

    return LoadResult(
        logs=frames,
        record_counts=row_counts,
        data_window=_data_window(frames),
        warnings=notes,
        data_size_bytes=total_bytes,
        coverage=coverage_by_pattern,
        rotation_skips=skips_by_pattern,
    )
947
+
948
+
949
def resolve_load_windows(
    needed_sources: dict[str, str],
    source_dirs: dict[str, list[Path]],
    default_spec: str,
    *,
    since: datetime | None,
    until: datetime | None,
    load_all: bool,
) -> list[LoadWindow]:
    """Turn the universal default window into ONE ``LoadWindow`` per family.

    This is the single window-policy entry point shared by ``run()`` and
    ``run_digest()``.

    Args:
        needed_sources: The plan's pattern → source map. It also supplies the
            detector glob per family: the FIRST pattern mapped to a family is
            the one handed to that family's resolver.
        source_dirs: Source family → configured inputs.
        default_spec: The ``default_window`` setting, parsed by
            ``parse_window_span``.
        since: Explicit lower bound from the operator, if any.
        until: Explicit upper bound from the operator, if any.
        load_all: True when the operator asked for everything (``--all``).

    Returns:
        ``[]`` when no default window applies: the operator gave an explicit
        bound, passed ``--all``, or ``parse_window_span`` returned ``None``
        for ``default_spec``. Otherwise one ``LoadWindow`` per planned family,
        in first-seen plan order, for each family that is configured in
        ``source_dirs``, is not bounded according to ``is_bounded``, has a
        registered strategy, and is ``default_window_eligible``. The
        ``(select_window, trim_span)`` pair comes from the strategy's own
        ``resolve_window`` (``_default_resolve_window`` when it declares
        none), and ``keep_null`` mirrors ``strategy.ts_policy == "keep"``.
    """
    explicit_window = since is not None or until is not None
    if load_all or explicit_window:
        return []

    span = parse_window_span(default_spec)
    if span is None:
        return []

    # dict.fromkeys dedupes while preserving first-seen order, giving the
    # families present in the plan in a stable order.
    planned_sources = list(dict.fromkeys(needed_sources.values()))

    resolved: list[LoadWindow] = []
    for source in planned_sources:
        dirs = source_dirs.get(source)
        if not dirs or is_bounded(dirs):
            continue

        strategy = _SOURCE_LOADERS.get(source)
        # Declared opt-out: a family that is not default_window_eligible
        # (CloudTrail, baseline-relative) gets NO default window and so loads
        # in full on unqualified runs. Explicit windows never reach this
        # point — the guard at the top already returned.
        if strategy is None or not strategy.default_window_eligible:
            continue

        # First pattern per family; the flat resolver anchors its conservative
        # floor from DIRECTORY candidates only.
        pattern = next(
            (pat for pat, src in needed_sources.items() if src == source),
            "*.log*",
        )
        resolver = strategy.resolve_window or _default_resolve_window
        select_window, trim_span = resolver(strategy, dirs, pattern, span)
        resolved.append(
            LoadWindow(
                source,
                select_window,
                trim_span,
                strategy.ts_policy == "keep",
            )
        )
    return resolved