loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,184 @@
1
+ """Content sniffing — the digest schema cascade and the syslog content gate.
2
+
3
+ Two deliberately separate sniff heads (CODE.md: do NOT unify — dnsmasq IS RFC
4
+ 3164): ``sniff_format`` / ``sniff_format_detailed`` (the digest recognizer
5
+ cascade) and ``_looks_like_syslog`` (the syslog discovery content gate).
6
+ ``_open_log`` is reached through the package facade so test monkeypatches of
7
+ ``loghunter.common.loader._open_log`` take effect here.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import gzip
13
+ import itertools
14
+ import lzma
15
+ from dataclasses import dataclass
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import loghunter.common.loader as _loader # facade: _open_log patch-through (call-time only)
20
+ from loghunter.parsers import (
21
+ cloudtrail as _cloudtrail_parser,
22
+ dnsmasq as _dnsmasq_parser,
23
+ syslog as _syslog_parser,
24
+ zeek as _zeek_parser,
25
+ zeek_tsv as _zeek_tsv_parser,
26
+ )
27
+
28
+
29
def _is_ndjson(path: Path) -> bool:
    """Report whether the first content line of ``path`` opens with '{' (NDJSON).

    Blank lines and lines whose stripped text starts with '#' are skipped
    while searching for the first content line. A file that has no content
    line at all yields False.
    """
    with _loader._open_log(path) as handle:
        stripped = (raw.strip() for raw in handle)
        # Lazy search: reading stops at the first non-blank, non-comment line.
        first_content = next(
            (text for text in stripped if text and not text.startswith("#")),
            None,
        )
    return first_content is not None and first_content.startswith("{")
37
+
38
+
39
# Size cap for the prefix that the syslog content-sniff gate reads. The cap is
# deliberate and load-bearing: peeking a fixed number of LINES would read a
# newline-sparse binary (wtmp/btmp/lastlog) all the way to EOF, whereas a
# size-capped read cannot (blob's hard-bounded-window rail).
_SYSLOG_SNIFF_BYTES = 8 * 1024
44
+
45
+
46
def _looks_like_syslog(path: Path) -> bool:
    """Content gate: does a BOUNDED decompressed prefix of ``path`` read as
    RFC 3164 syslog?

    The gate reads at most ``_SYSLOG_SNIFF_BYTES`` from the opened log, rejects
    the file if that prefix contains a NUL, and otherwise hands the first
    ``SNIFF_PEEK_LINES`` lines straight to the syslog recognizer. It does NOT
    run the full ``sniff_format`` cascade: dnsmasq lines ARE RFC 3164 and
    ``dnsmasq.sniff`` is strict, so the cascade would route a
    dnsmasq-query-first ``messages`` file to "dns". The syslog recognizer
    claims any real RFC-3164 header (dnsmasq's included) and cleanly rejects
    ISO-timestamped ``dnf``/``hawkey`` logs, systemd ``boot.log``, and
    binaries.

    A read error is a conservative INCLUDE: True is returned so the file is
    deferred to ``run_load``'s disclosed corruption rail
    (``_zeek_file_read_warning``) instead of being dropped silently. Because a
    gzip rotation is decompressed by ``_open_log``, the NUL check sees decoded
    text, never raw compressed bytes.
    """
    read_failures = (EOFError, gzip.BadGzipFile, lzma.LZMAError, OSError)
    try:
        with _loader._open_log(path) as handle:
            prefix = handle.read(_SYSLOG_SNIFF_BYTES)
    except read_failures:
        # Unreadable / corrupt: keep the file so the loader can disclose it.
        return True

    if "\x00" in prefix:
        # A NUL in the decoded prefix marks the file as binary, not syslog.
        return False

    head = prefix.splitlines()[: _syslog_parser.SNIFF_PEEK_LINES]
    verdict = _syslog_parser.sniff(head)
    return verdict is not None
72
+
73
+
74
# Recognizer modules paired with their line budgets, listed in fixed precedence
# (most specific first). The orchestrator tries them in this order and the
# first non-None target wins, so the order IS the ambiguity policy:
#   - zeek_tsv ahead of cloudtrail: the TSV header is the strongest signal;
#   - cloudtrail ahead of zeek (NDJSON): CloudTrail events must not be claimed
#     by the looser Zeek key-set test;
#   - dnsmasq ahead of syslog: dnsmasq IS RFC 3164 and would otherwise be
#     claimed as generic syslog.
_SNIFF_RECOGNIZERS: tuple[tuple[Any, int], ...] = tuple(
    (parser, parser.SNIFF_PEEK_LINES)
    for parser in (
        _zeek_tsv_parser,
        _cloudtrail_parser,
        _zeek_parser,
        _dnsmasq_parser,
        _syslog_parser,
    )
)

# Largest line budget any recognizer asks for; one read of this many lines
# satisfies every recognizer.
_SNIFF_MAX_PEEK: int = max(peek for _parser, peek in _SNIFF_RECOGNIZERS)

# Maps the winning recognizer module to its source-family origin. The CLI uses
# the origin to tell Zeek-dns from Pi-hole-dns without re-reading the file.
_SNIFF_ORIGIN: dict[Any, str] = {
    _zeek_tsv_parser: "zeek",
    _zeek_parser: "zeek",
    _cloudtrail_parser: "cloudtrail",
    _dnsmasq_parser: "pihole",
    _syslog_parser: "syslog",
}
100
+
101
+
102
def sniff_format(path: Path) -> str:
    """Pick the digest schema for a log file from a sample of its head.

    ``path`` is opened through ``_open_log`` (gzip-transparent) and at most
    ``_SNIFF_MAX_PEEK`` lines are read, once. The per-parser recognizers then
    run in fixed precedence (zeek_tsv → cloudtrail → zeek → dnsmasq → syslog),
    each seeing only the prefix it requested via ``SNIFF_PEEK_LINES``.

    The result is one of "conn" | "dns" | "syslog" | "cloudtrail" | "blob";
    "blob" is the floor for empty files and for content no recognizer claims.

    Only content is classified here. How the CLI treats empty inputs is decided
    in a later stage and is not pre-empted by this function.
    """
    with _loader._open_log(path) as handle:
        head = list(itertools.islice(handle, _SNIFF_MAX_PEEK))

    if not head:
        # Nothing to show the recognizers: straight to the blob floor.
        return "blob"

    # Lazy cascade: recognizers are consulted in order and evaluation stops at
    # the first one that returns a non-None target.
    claims = (
        recognizer.sniff(head[:peek]) for recognizer, peek in _SNIFF_RECOGNIZERS
    )
    return next((schema for schema in claims if schema is not None), "blob")
126
+
127
+
128
@dataclass(frozen=True)
class SniffResult:
    """Outcome of a detailed sniff: the schema plus the source-family origin.

    ``state`` is either "empty" or "classified".

    - "empty": ``schema`` and ``origin`` are both None.
    - "classified": ``schema`` is one of {conn, dns, syslog, cloudtrail, blob}.
      ``origin`` names the winning recognizer's source family
      ({zeek, pihole, syslog, cloudtrail}) when a recognizer claimed the
      sample, and is None when the result is the blob floor.
    """

    # "empty" or "classified".
    state: str
    # Digest schema name; None when state is "empty".
    schema: str | None
    # Source family of the winning recognizer; None on "empty" and on blob.
    origin: str | None
142
+
143
+
144
def sniff_format_detailed(path: Path) -> SniffResult:
    """Classify a log file, also reporting its origin and whether it is empty.

    Companion to ``sniff_format``. Performs a single bounded read of
    ``_SNIFF_MAX_PEEK`` lines followed by a one-line EOF probe. The CLI uses
    the result to short-circuit truly-empty files and to separate Zeek-dns
    from Pi-hole-dns by origin.

    Emptiness is EOF-sensitive (leading whitespace that continues beyond the
    peek is NOT treated as empty):

    1. Zero-byte file → state="empty", decided from ``stat`` without opening.
    2. No lines sampled → state="empty".
    3. All sampled lines are whitespace-only AND the bounded read reached
       EOF → state="empty".
    4. All sampled lines are whitespace-only but more content follows the
       peek → run the recognizer cascade; the blob floor catches it.

    In every other case the precedence matches ``sniff_format``, and the
    origin is looked up from the winning recognizer module in
    ``_SNIFF_ORIGIN``. The blob floor yields ``schema="blob"``,
    ``origin=None``.
    """
    empty = SniffResult(state="empty", schema=None, origin=None)

    if not path.stat().st_size:
        return empty

    with _loader._open_log(path) as handle:
        head = list(itertools.islice(handle, _SNIFF_MAX_PEEK))
        # EOF probe: one extra line at most, so the total read stays bounded
        # at _SNIFF_MAX_PEEK + 1 lines.
        hit_eof = next(handle, None) is None

    if not head:
        return empty
    if hit_eof and not any(text.strip() for text in head):
        # Whole file seen and it is nothing but whitespace.
        return empty

    for recognizer, peek in _SNIFF_RECOGNIZERS:
        schema = recognizer.sniff(head[:peek])
        if schema is None:
            continue
        return SniffResult(
            state="classified",
            schema=schema,
            origin=_SNIFF_ORIGIN[recognizer],
        )

    return SniffResult(state="classified", schema="blob", origin=None)
@@ -0,0 +1,207 @@
1
+ """Loader metadata types and the cross-frame window helper (leaf module).
2
+
3
+ The dataclasses the loader returns to the runner (``LoadResult`` and the
4
+ disclosure records ``SourceCoverage`` / ``RotationSkipInfo``), the incremental
5
+ ``CoverageTracker``, ``_data_window`` (pure ``logs dict → window``), and the
6
+ stream-mode empty-frame column constants. Imports stdlib + pandas only.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import math
12
+ from dataclasses import dataclass, field
13
+ from datetime import datetime, timezone
14
+
15
+ import pandas as pd
16
+
17
+
18
# Named log/compression suffixes that are stripped when a hostname is derived
# from a filename. All-numeric rotation suffixes (.1, .10, .42, ...) are
# stripped as well. Only log-related suffixes are removed, so a dotted hostname
# such as host1.example.com.log.gz keeps its dots.
_LOG_SUFFIXES = frozenset((".gz", ".log"))

_PIHOLE_COLUMNS = [
    "ts",
    "src",
    "query",
    "event_type",
    "qtype",
    "dst",
    "answer",
    "validation",
    "host",
    "raw",
    "message",
]

# Canonical CloudTrail row schema. The aws detector (Thread B) consumes frames
# carrying these columns in this order; parsers/cloudtrail.py is the single
# source of truth for the meaning of each column.
_CLOUDTRAIL_COLUMNS = [
    "ts",
    "principal",
    "lane",
    "read_write",
    "event_source",
    "event_name",
    "identity_type",
    "source_ip",
    "error_code",
    "aws_region",
    "event_id",
    "raw",
]

# Columns of the stream-mode empty frame. Kept at module level so the strategy
# table reads cleanly; the values match the per-loader empty shape that existed
# before the refactor.
_SYSLOG_COLUMNS = [
    "ts",
    "host",
    "program",
    "raw",
    "message",
]
40
+
41
+
42
@dataclass(frozen=True)
class SourceCoverage:
    """Pre-window coverage of a single loaded pattern.

    Feeds the runner's "planned source contributed zero in-window rows"
    disclosure note.

    ``full_rows`` has three meaningful states, and the distinction is
    load-bearing:

    - ``None``: NO files were read for the pattern (date-pruned dated Zeek).
      Produces the BARE note ("files found, 0 records in the selected
      window. Widen…").
    - ``0``: files were read, yet ZERO valid-ts rows survived parsing (empty /
      header-only / unparseable timestamps). That is a PARSE gap rather than
      a window gap, so NO note is produced; advising the operator to widen
      the window for an empty file would mislead.
    - ``>0``: N valid-ts rows existed and the window excluded them. Produces
      the SPAN note (count + span + widen suggestion).

    ``full_span`` is None whenever ``full_rows`` is None or 0.
    """

    # Tri-state valid-row count: None / 0 / positive.
    full_rows: int | None
    # (earliest, latest) timestamp of the excluded rows, when there are any.
    full_span: tuple[datetime, datetime] | None
63
+
64
+
65
class CoverageTracker:
    """Builds a SourceCoverage incrementally as a loader reads a pattern.

    One mechanism serves BOTH the streaming loaders (syslog / pihole /
    cloudtrail — one ``observe`` call per row timestamp) and the frame loader
    (Zeek — one ``observe_frame`` call per parsed pre-filter frame). The
    runner's flat-Zeek default-window block also uses this tracker.

    Lifecycle (one tracker per pattern load):
      - ``note_file_read()`` per file OPENED. Separates "no files read"
        (date-pruned) → ``full_rows = None`` from "files read, no valid-ts
        rows" → ``full_rows = 0``.
      - Either ``observe(ts)`` per row before the window check (streaming) OR
        ``observe_frame(pre_df)`` per file before ``_apply_ts_filter`` (Zeek).
        Both count VALID-ts rows only. A timestamp is valid when it is neither
        missing nor non-finite (NaN, +inf, -inf).
      - ``mark_kept()`` on row append (streaming) or on a non-empty
        post-window per-file frame (Zeek). It latches, so later ``observe`` /
        ``observe_frame`` calls return immediately.
      - ``coverage(frame_empty)`` returns a SourceCoverage or None.

    The tracker keeps only running counts and min/max, never a reference to
    the observed data, so it is safe to retain across the load.
    """

    def __init__(self) -> None:
        self._files_read = False
        self._kept = False
        self._valid_rows = 0
        self._min_ts: float | None = None
        self._max_ts: float | None = None

    def note_file_read(self) -> None:
        """Record that at least one file was opened for this pattern."""
        self._files_read = True

    def _widen(self, low: float, high: float) -> None:
        """Extend the running [min, max] span so that it covers [low, high]."""
        if self._min_ts is None or low < self._min_ts:
            self._min_ts = low
        if self._max_ts is None or high > self._max_ts:
            self._max_ts = high

    def observe(self, ts: float | None) -> None:
        """Count one row timestamp (streaming loaders).

        No-op once ``mark_kept`` has latched, and for ``None`` or non-finite
        values.
        """
        if self._kept or ts is None:
            return
        # NaN would poison min/max, and +/-inf would make coverage() raise
        # OverflowError in datetime.fromtimestamp; neither is a usable
        # timestamp, so neither counts as a valid row.
        if isinstance(ts, float) and not math.isfinite(ts):
            return
        self._valid_rows += 1
        self._widen(ts, ts)

    def observe_frame(self, pre_df: pd.DataFrame) -> None:
        """Count the valid timestamps of one parsed pre-filter frame (Zeek).

        No-op once ``mark_kept`` has latched, and for a missing/empty frame or
        one without a ``ts`` column.
        """
        if self._kept:
            return
        if pre_df is None or pre_df.empty or "ts" not in pre_df.columns:
            return
        valid = pre_df["ts"].dropna()
        # dropna() keeps +/-inf; drop them so the frame path applies the same
        # validity rule as observe().
        valid = valid[~valid.isin([math.inf, -math.inf])]
        if valid.empty:
            return
        self._valid_rows += int(len(valid))
        self._widen(float(valid.min()), float(valid.max()))

    def mark_kept(self) -> None:
        """Latch: data survived the window, so no disclosure will be needed."""
        self._kept = True

    def coverage(self, frame_empty: bool) -> SourceCoverage | None:
        """Return a SourceCoverage when disclosure is warranted; else None.

        - data survived (frame non-empty OR mark_kept fired) → None.
        - no files read → (None, None).
        - files read but zero valid-ts rows → (0, None).
        - valid rows seen, all excluded by window → (valid, span).
        """
        if not frame_empty or self._kept:
            return None
        if not self._files_read:
            return SourceCoverage(None, None)
        if self._valid_rows == 0:
            return SourceCoverage(0, None)
        span: tuple[datetime, datetime] | None = None
        if self._min_ts is not None and self._max_ts is not None:
            span = (
                datetime.fromtimestamp(self._min_ts, tz=timezone.utc),
                datetime.fromtimestamp(self._max_ts, tz=timezone.utc),
            )
        return SourceCoverage(self._valid_rows, span)
153
+
154
+
155
@dataclass(frozen=True)
class RotationSkipInfo:
    """Outcome, per pattern, of flat-log rotation-peek windowing (syslog / pihole).

    This is STRUCTURED metadata only. The loader records it and the runner
    turns it into prose (``_rotation_skip_notes``), so the loader never has to
    import the runner.

    ``fallback`` is truthful at the PATTERN level. If the first-ts order of any
    rotation group is non-monotonic, ``_rotation_windowed_files`` turns pruning
    off for the WHOLE pattern and returns every candidate file
    (``fallback=True``, ``skipped=0``, ``loaded=len(files)``). The runner's
    "read the full archive" note therefore stays honest: a fallback can never
    coexist with a sibling group that was silently pruned.

    ``skipped_files`` holds ``(name, oldest_ts_or_None)`` pairs for verbose
    per-file lines. The older tail cut off by the early stop is never peeked,
    so its ts is ``None`` — the performance win is real and no timestamp is
    invented.
    """

    # Number of files loaded for the pattern.
    loaded: int
    # Number of files skipped for the pattern.
    skipped: int
    # True when pruning was disabled for the whole pattern.
    fallback: bool
    # Optional reason text accompanying a fallback.
    fallback_reason: str | None = None
    # (file name, oldest timestamp or None) for each skipped file.
    skipped_files: list[tuple[str, datetime | None]] = field(default_factory=list)
179
+
180
+
181
@dataclass
class LoadResult:
    """Bundle of loaded log frames plus the metadata the runner needs.

    ``logs`` and ``record_counts`` are required. Every other field is optional
    and defaults to an empty / zero / None value; each instance gets its own
    fresh container for the list and dict defaults.
    """

    # Loaded frames, keyed by string.
    logs: dict[str, pd.DataFrame]
    # Integer counts, keyed by string.
    record_counts: dict[str, int]
    # (start, end) datetimes, or None when no window is recorded.
    data_window: tuple[datetime, datetime] | None = None
    # Warning messages accumulated for this load.
    warnings: list[str] = field(default_factory=list)
    # Size of the data in bytes.
    data_size_bytes: int = 0
    # SourceCoverage disclosure records, keyed by string.
    coverage: dict[str, SourceCoverage] = field(default_factory=dict)
    # RotationSkipInfo disclosure records, keyed by string.
    rotation_skips: dict[str, RotationSkipInfo] = field(default_factory=dict)
192
+
193
+
194
def _data_window(logs: dict[str, pd.DataFrame]) -> tuple[datetime, datetime] | None:
    """Compute the min/max timestamp window across loaded DataFrames.

    Frames that are empty or have no ``ts`` column are ignored. Missing and
    non-finite (+/-inf) timestamps are ignored too, because
    ``datetime.fromtimestamp`` cannot represent them.

    Args:
        logs: Loaded frames keyed by log name.

    Returns:
        ``(earliest, latest)`` as timezone-aware UTC datetimes, or ``None``
        when no frame contributes a usable timestamp.
    """
    earliest: float | None = None
    latest: float | None = None
    for df in logs.values():
        if df.empty or "ts" not in df.columns:
            continue
        valid = df["ts"].dropna()
        # dropna() keeps +/-inf; an infinite bound would raise OverflowError
        # in datetime.fromtimestamp below.
        valid = valid[~valid.isin([math.inf, -math.inf])]
        if valid.empty:
            continue
        # Running per-frame min/max: no need to copy every timestamp of every
        # frame into one Python list just to find the two extremes.
        frame_min = float(valid.min())
        frame_max = float(valid.max())
        if earliest is None or frame_min < earliest:
            earliest = frame_min
        if latest is None or frame_max > latest:
            latest = frame_max

    if earliest is None or latest is None:
        return None

    return (
        datetime.fromtimestamp(earliest, tz=timezone.utc),
        datetime.fromtimestamp(latest, tz=timezone.utc),
    )