loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,523 @@
1
+ """Timeframe filtering, boundedness, and the flat-source rotation-peek subsystem.
2
+
3
+ The ts predicate (``_missing_ts``) and frame filter (``_apply_ts_filter``), the
4
+ path-shape boundedness predicates (``is_bounded`` / ``is_zeek_bounded``), and the
5
+ whole rotation-peek windowing subsystem (filename classifier + per-group peek/
6
+ prune). ``_open_log`` is reached through the package facade for monkeypatch
7
+ parity; ``_safe_resolve`` is imported directly from ``io`` (not a patch seam).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import math
13
+ import re
14
+ import sys
15
+ from dataclasses import dataclass, replace
16
+ from datetime import datetime, timedelta
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+ import pandas as pd
21
+
22
+ import loghunter.common.loader as _loader # facade: _open_log patch-through (call-time only)
23
+ from loghunter.common.loader.io import _safe_resolve
24
+ from loghunter.common.loader.types import (
25
+ CoverageTracker,
26
+ LoadResult,
27
+ RotationSkipInfo,
28
+ _data_window,
29
+ )
30
+ from loghunter.parsers.syslog import parse_timestamp as _parse_syslog_ts
31
+
32
+
33
@dataclass(frozen=True)
class LoadWindow:
    """Immutable default-window decision for a single source family.

    One instance is produced per family by
    :func:`loghunter.common.loader.resolve_load_windows`, and both ``run()`` and
    ``run_digest()`` read every window-related choice from it (the
    ``source_windows`` load override, the pre-load stderr line,
    ``requested_span``, the post-load trim, the aws rider), so a family is
    announced, windowed, trimmed and disclosed consistently.

    Attributes:
        source: Directory key of the family this window belongs to
            (e.g. ``"syslog_dir"``).
        select_window: Value placed in ``source_windows[source]``. A precise
            ``(since, until)`` pair for a dated-Zeek layout; an open-ended
            ``(floor, None)`` pair for a peekable flat family; ``None`` when
            the family must be loaded in full (flat/mixed Zeek, unpeekable flat
            fallback, or a declarationless source).
        trim_span: Span of the precise post-load trim, anchored on the family's
            own max-ts. ``None`` only on the dated-Zeek path, where
            ``select_window`` already cut exactly at load time.
        keep_null: Timestamp policy of the source. When true (syslog / pihole),
            rows with an unparseable ts survive the implicit trim exactly as
            they survive an explicit window.
    """

    source: str
    select_window: tuple[datetime, datetime | None] | None
    trim_span: timedelta | None
    keep_null: bool
63
+
64
+
65
+ def _missing_ts(ts: Any) -> bool:
66
+ """Return True for the canonical "missing timestamp" shapes the loader
67
+ recognises: ``None`` and ``float('nan')``.
68
+
69
+ The ONE place that defines "missing" — used by ``run_load`` to dispatch
70
+ the stream-mode keep/drop policy, by ``CoverageTracker.observe`` (which
71
+ silently ignores missing-ts inputs), and indirectly by ``_apply_ts_filter``
72
+ (which calls ``ts.notna()`` over a pandas Series — same predicate).
73
+ """
74
+ return ts is None or (isinstance(ts, float) and math.isnan(ts))
75
+
76
+
77
+ def _apply_ts_filter(
78
+ df: pd.DataFrame,
79
+ since: datetime | None,
80
+ until: datetime | None,
81
+ *,
82
+ keep_null: bool = False,
83
+ ) -> pd.DataFrame:
84
+ """Drop null-ts rows and filter to [since, until] window.
85
+
86
+ Default (``keep_null=False``): drops rows with NaN/None ts unconditionally —
87
+ matches NDJSON behavior where records without ts are always skipped. This is
88
+ the drop-policy behavior the frame-mode load path sees, byte-for-byte.
89
+
90
+ ``keep_null=True``: retain rows where ts is NaN OR ts ∈ window. Used by the
91
+ analyze post-load default-window trim for keep-policy families (syslog /
92
+ pihole), whose stream loader deliberately retains unparseable-ts rows
93
+ (an unfilterable line is a real event and must survive the implicit window,
94
+ just as it bypasses the load-time window). Returns an empty DataFrame if the
95
+ ts column is absent.
96
+ """
97
+ if df.empty:
98
+ return df
99
+ if "ts" not in df.columns:
100
+ return pd.DataFrame(columns=df.columns)
101
+ null_mask = df["ts"].isna()
102
+ if not keep_null:
103
+ df = df[~null_mask]
104
+ null_mask = pd.Series(False, index=df.index)
105
+ if df.empty or (since is None and until is None):
106
+ return df
107
+ since_ts = since.timestamp() if since else None
108
+ until_ts = until.timestamp() if until else None
109
+ window_mask = pd.Series(True, index=df.index)
110
+ if since_ts is not None:
111
+ window_mask &= df["ts"] >= since_ts
112
+ if until_ts is not None:
113
+ window_mask &= df["ts"] <= until_ts
114
+ return df[window_mask | null_mask]
115
+
116
+
117
def is_bounded(paths: list[Path]) -> bool:
    """Report whether a source-input bucket is BOUNDED (no auto-window applies).

    A bucket is bounded only when it is non-empty and every entry is a regular
    file. An empty bucket, or one containing any entry that is not a regular
    file (for example a directory), is unbounded. The check is pure path-shape
    and does not depend on the source family.

    Args:
        paths: The inputs supplied for one source family.

    Returns:
        ``True`` for a non-empty, all-files bucket; ``False`` otherwise.
    """
    if not paths:
        return False
    return all(entry.is_file() for entry in paths)
135
+
136
+
137
def is_zeek_bounded(paths: list[Path]) -> bool:
    """Zeek-named public alias of :func:`is_bounded`, kept for API stability.

    Delegates unchanged: the result is whatever :func:`is_bounded` returns for
    the same ``paths``.
    """
    return is_bounded(paths)
144
+
145
+
146
# Compression extensions stripped before reading a rotation ordinal, and the
# trailing ``.<digits>`` rotation-number matcher. ``pihole.log.2.gz`` → strip
# ``.gz`` → match ``.2`` → base ``pihole.log``, index 2. The active file
# (``pihole.log``) and a non-numeric tail (``server.log``) carry index 0.
_COMPRESSION_EXTS = (".gz", ".bz2", ".xz")
_ROTATION_NUM_RE = re.compile(r"\.(\d+)$")

# Date-aware ordering anchor: a newer date → a SMALLER age_rank, and any dated
# file (rank ~8e7) sorts AFTER the live/undated head (rank 0). Keeps ascending
# age_rank == strictly newest→oldest across numeric AND date-based rotations.
_DATE_RANK_BASE = 99_999_999

# Family 2 — loghunter's own exporter output (``exporters/__init__.py``
# ``_auto_filename``): an INFIX ``_{YYYYMMDD}`` start date followed by either
# ``_{N}d`` or ``_to_{YYYYMMDD}_{HH}h``, then the (optional) extension. A bare
# ``_{YYYYMMDD}`` with no window token, or a ``_partNN`` infix, fails this regex
# and falls to the singleton floor (loaded-not-pruned — safe).
# Named groups: ``base`` (lazy, everything before the start date), ``start``,
# then either ``days`` or the ``end`` + ``hours`` pair, and an optional ``ext``.
_EXPORT_WINDOW_RE = re.compile(
    r"^(?P<base>.+?)_(?P<start>\d{8})_"
    r"(?:(?P<days>\d+)d|to_(?P<end>\d{8})_(?P<hours>\d{2})h)"
    r"(?P<ext>\..*)?$"
)
168
+
169
+
170
def _strip_compression_ext(name: str) -> str:
    """Remove at most ONE trailing compression extension from ``name``.

    The extensions considered are those in ``_COMPRESSION_EXTS``. A name that
    ends in none of them is returned unchanged. Rotation classification and
    duplicate-slot detection both use this: the same logical file stored in two
    compressions (``x.log`` and ``x.log.gz``) strips to one name, whereas
    genuinely distinct files (``auth.log`` vs ``auth.log.0``) do not.
    """
    return next(
        (name[: -len(ext)] for ext in _COMPRESSION_EXTS if name.endswith(ext)),
        name,
    )
184
+
185
+
186
def _classify_rotation_name(
    name: str,
) -> tuple[str, int, tuple[datetime, datetime] | None]:
    """Classify a rotation filename as ``(group_base, age_rank, declared_window)``.

    One trailing compression extension is stripped first. The stripped name is
    then matched against the recognised forms, in this order:

    - **numeric ordinal**: a trailing ``.<digits>`` that is not a valid 8-digit
      calendar date → ``(base, N, None)``.
    - **dateext**: a trailing ``.<8 digits>`` that parses as ``%Y%m%d`` →
      ``(base, _DATE_RANK_BASE - int(digits), None)``. No declared window.
    - **export window**: an infix ``_{YYYYMMDD}`` followed by ``_{N}d`` or
      ``_to_{YYYYMMDD}_{HH}h`` → ``(base, _DATE_RANK_BASE - int(start),
      (start, end))`` with naive datetimes. The window is only meant for the
      intra-group overlap guard, never for prune gating.
    - **floor**: nothing matched, or the export window was malformed →
      ``(stripped_name, 0, None)``.

    Age-rank contract: within a homogeneous group, sorting by ascending
    ``age_rank`` yields strictly newest→oldest. This function never raises for
    unparseable dates or overflowing day counts; those fall to the floor.
    """
    stem = _strip_compression_ext(name)

    ordinal = _ROTATION_NUM_RE.search(stem)
    if ordinal is not None:
        token = ordinal.group(1)
        group_base = stem[: ordinal.start()]
        looks_like_date = False
        if len(token) == 8:
            try:
                datetime.strptime(token, "%Y%m%d")
            except ValueError:
                # 8 digits but not a calendar date (e.g. month 13): treat the
                # token as a plain numeric ordinal.
                looks_like_date = False
            else:
                looks_like_date = True
        if looks_like_date:
            return group_base, _DATE_RANK_BASE - int(token), None
        return group_base, int(token), None

    export = _EXPORT_WINDOW_RE.match(stem)
    if export is None:
        return stem, 0, None

    try:
        start = datetime.strptime(export.group("start"), "%Y%m%d")
        day_count = export.group("days")
        if day_count is not None:
            # The day count is an unbounded ``\d+``; an enormous value makes
            # the date addition raise OverflowError, handled below → floor.
            end = start + timedelta(days=int(day_count))
        else:
            # The filename keeps only the hour of ``until`` (minutes/seconds
            # are lost), so round UP to the next hour. The reconstructed end is
            # then always >= the real one and the overlap guard cannot miss.
            end = datetime.strptime(export.group("end"), "%Y%m%d") + timedelta(
                hours=int(export.group("hours")) + 1
            )
    except (ValueError, OverflowError):
        # Unparseable date(s) or overflowing arithmetic → floor, never raise.
        return stem, 0, None

    # A non-positive window (empty ``_0d`` or an inverted ``_to_``) is
    # malformed: an empty ``[start, start)`` would read as DISJOINT under the
    # half-open overlap test and could silently skip a same-start sibling.
    # Floor it so it carries no window at all.
    if end > start:
        return (
            export.group("base"),
            _DATE_RANK_BASE - int(export.group("start")),
            (start, end),
        )
    return stem, 0, None
260
+
261
+
262
def _rotation_base_and_index(name: str) -> tuple[str, int]:
    """Return ``(base, age_rank)`` for a rotation filename.

    A thin projection of ``_classify_rotation_name`` that drops the declared
    window, which stays internal to the classifier. Examples:
    ``pihole.log.2.gz`` → ``("pihole.log", 2)``; ``router.log.1`` →
    ``("router.log", 1)``; ``server.log`` → ``("server.log", 0)``. Date-based
    forms carry the date-aware age_rank.
    """
    return _classify_rotation_name(name)[:2]
275
+
276
+
277
def _peek_first_ts(path: Path) -> datetime | None:
    """Parse the timestamp of the first usable line in ``path``.

    A line is "usable" when, after stripping whitespace, it is non-empty and
    does not start with ``#``. The file is opened through the loader facade's
    ``_open_log`` and the line is parsed with the shared syslog timestamp
    parser. Reading stops at the first usable line.

    Returns:
        Whatever the timestamp parser returns for that first usable line, or
        ``None`` when the file contains no usable line at all.
    """
    with _loader._open_log(path) as handle:
        for raw in handle:
            candidate = raw.strip()
            if not candidate or candidate.startswith("#"):
                continue
            # Only the first usable line is ever inspected.
            return _parse_syslog_ts(candidate)
    return None
294
+
295
+
296
def _select_group(
    group_sorted: list[Path],
    since: datetime | None,
    until: datetime | None,
) -> tuple[list[Path], list[tuple[str, datetime | None]], bool]:
    """Pick the in-window files of ONE rotation family.

    Args:
        group_sorted: The family's files, already ordered newest→oldest.
        since: Lower window bound, or ``None``.
        until: Upper window bound, or ``None``.

    Returns:
        ``(selected, skipped, fell_back)``. ``skipped`` holds
        ``(filename, first_ts)`` pairs; ``first_ts`` is ``None`` for tail files
        that were never peeked. ``fell_back`` is ``True`` when a first-ts RISE
        was seen against the newest→oldest order. In that case this function
        only reports the disorder; the caller decides on the full read.

    A file whose first timestamp cannot be obtained (empty, unparseable, or the
    peek raised) is always included and left out of the monotonicity chain.
    """
    kept: list[Path] = []
    dropped: list[tuple[str, datetime | None]] = []
    newer_ts: datetime | None = None

    for pos, member in enumerate(group_sorted):  # newest → oldest
        try:
            first_ts = _peek_first_ts(member)
        except Exception:
            # E.g. a corrupt compressed file: never abort discovery here, just
            # include the file conservatively and let the read path deal with it.
            first_ts = None

        if first_ts is None:
            kept.append(member)
            continue

        if newer_ts is not None and first_ts > newer_ts:
            # An older rotation starts LATER than a newer one: gross disorder.
            return kept, dropped, True
        newer_ts = first_ts

        if until is not None and first_ts > until:
            # The file's oldest row is already past the window.
            dropped.append((member.name, first_ts))
            continue

        kept.append(member)
        if since is not None and first_ts < since:
            # This file straddles the lower bound (its newer tail may still be
            # in window), so it is kept; everything older is wholly out of
            # window and is skipped without being peeked (ts stays None).
            dropped.extend((older.name, None) for older in group_sorted[pos + 1:])
            break

    return kept, dropped, False
343
+
344
+
345
+ def _group_order_conflict(
346
+ classified: list[tuple[str, int, tuple[datetime, datetime] | None]],
347
+ stripped_names: list[str],
348
+ ) -> str | None:
349
+ """Return the fallback REASON when a rotation group cannot be cleanly ordered,
350
+ or ``None`` when the group is safe to peek-prune. ``classified`` is the group's
351
+ ``(base, age_rank, window | None)`` tuples in group order; ``stripped_names``
352
+ is the parallel list of compression-stripped filenames.
353
+
354
+ Two un-orderable shapes the lower-bound straddle + strict-``>`` monotonicity
355
+ check would otherwise silently mishandle — both fall back whole-pattern:
356
+
357
+ - **Duplicate rotation slots (non-export schemes):** the SAME logical file in
358
+ two compressions (``pihole.log.2`` + ``.2.gz``; ``auth.log.20240101`` + ``.gz``;
359
+ a live ``.log`` + its ``.log.gz``) collapses to one ambiguous slot — equal
360
+ first-ts doesn't trip the strict monotonicity check, so one would be
361
+ straddle-kept and its in-window sibling silently skipped as the "older tail".
362
+ Detected by a SHARED compression-stripped NAME — NOT an age_rank tie, which is
363
+ overloaded (the live/floor head and a 0-indexed ``.0`` both rank 0; ``.02`` and
364
+ ``.2`` both int-rank 2 — distinct files, not duplicates). Reason
365
+ ``"duplicate rotation files"``.
366
+ - **Family-2 export windows that overlap / duplicate / mix with unwindowed
367
+ members under one base:** loghunter authored the names, so each declares its
368
+ full ``[start, end)`` — checkable from filenames alone, ZERO extra reads. The
369
+ half-open overlap test (``a0 < b1 and b0 < a1``, TRUE for equal windows)
370
+ catches export ``.log`` + ``.log.gz`` duplicates too, so they NEVER reach the
371
+ stripped-name branch. Reason ``"overlapping export windows"``.
372
+
373
+ The declared window is used ONLY here; it is NEVER a prune gate (the peek's
374
+ first-ts vs since/until stays the sole data gate, so the filename date's tz is
375
+ irrelevant — windows are only compared to each other).
376
+ """
377
+ windows = [c[2] for c in classified]
378
+ if all(w is None for w in windows):
379
+ # numeric / dateext / floor: un-orderable iff two members are the same
380
+ # logical file in two compressions → a shared stripped name.
381
+ return (
382
+ "duplicate rotation files"
383
+ if len(set(stripped_names)) != len(stripped_names)
384
+ else None
385
+ )
386
+ if len(classified) == 1:
387
+ return None # singleton declared window — the peek handles it
388
+ if any(w is None for w in windows):
389
+ return "overlapping export windows" # mixed windowed + unwindowed
390
+ for i in range(len(windows)): # all members windowed → require pairwise disjoint
391
+ for j in range(i + 1, len(windows)):
392
+ (a0, a1), (b0, b1) = windows[i], windows[j]
393
+ if a0 < b1 and b0 < a1: # half-open; equal → overlap (duplicate)
394
+ return "overlapping export windows"
395
+ return None
396
+
397
+
398
def _rotation_windowed_files(
    files: list[Path],
    since: datetime | None,
    until: datetime | None,
    *,
    verbose: bool = False,
) -> tuple[list[Path], RotationSkipInfo]:
    """Prune a flat rotation candidate list to the files a window can touch.

    Files are grouped by ``(resolved parent directory, rotation base)``. Each
    group is ordered by ascending age_rank (newest first), checked for
    ordering conflicts, and then peek-pruned against ``since`` / ``until``.

    Fallback is whole-pattern: as soon as any group has an ordering conflict
    or a non-monotonic first-ts order, every candidate is returned with
    ``fallback=True`` and ``skipped=0``, so no sibling group is silently
    pruned.

    Args:
        files: All candidate files discovered for one pattern.
        since: Lower window bound, or ``None``.
        until: Upper window bound, or ``None``.
        verbose: When true, print one ``rotation-peek: skipped ...`` line per
            skipped file to stderr.

    Returns:
        ``(selected_files, RotationSkipInfo)``.
    """
    # Classify and compression-strip every file exactly once; grouping, the
    # sort key and the conflict check all read from these two maps.
    by_file = {f: _classify_rotation_name(f.name) for f in files}
    bare_name = {f: _strip_compression_ext(f.name) for f in files}

    families: dict[tuple[Path, str], list[Path]] = {}
    for f in files:
        family_key = (_safe_resolve(f).parent, by_file[f][0])
        families.setdefault(family_key, []).append(f)

    def _full_read(reason: str) -> tuple[list[Path], RotationSkipInfo]:
        # Whole-pattern fallback: load everything, report nothing as skipped.
        return list(files), RotationSkipInfo(
            loaded=len(files),
            skipped=0,
            fallback=True,
            fallback_reason=reason,
            skipped_files=[],
        )

    kept_all: list[Path] = []
    pruned_all: list[tuple[str, datetime | None]] = []
    for family_key in sorted(families, key=lambda k: (str(k[0]), k[1])):
        # Ascending age_rank = newest (rank 0) → oldest.
        ordered = sorted(families[family_key], key=lambda f: by_file[f][1])

        conflict = _group_order_conflict(
            [by_file[f] for f in ordered],
            [bare_name[f] for f in ordered],
        )
        if conflict is not None:
            # Duplicate slot or overlapping export windows.
            return _full_read(conflict)

        kept, pruned, disordered = _select_group(ordered, since, until)
        if disordered:
            return _full_read("rotation order not monotonic")

        kept_all.extend(kept)
        pruned_all.extend(pruned)

    if verbose:
        for fname, first_ts in pruned_all:
            detail = f" (oldest {first_ts.isoformat()})" if first_ts is not None else ""
            print(f"rotation-peek: skipped {fname}{detail}", file=sys.stderr)

    return kept_all, RotationSkipInfo(
        loaded=len(kept_all),
        skipped=len(pruned_all),
        fallback=False,
        skipped_files=pruned_all,
    )
476
+
477
+
478
def apply_default_window(
    load_result: LoadResult,
    family_patterns: list[str],
    span: timedelta,
    *,
    keep_null: bool,
) -> LoadResult:
    """Trim one family's loaded frames to its own trailing ``span`` window.

    The window's upper bound is the family's own max-ts, taken across all of
    its non-empty patterns; the lower bound is ``span`` earlier. Each of those
    patterns is filtered with ``keep_null`` forwarded to the ts filter. For a
    pattern that becomes empty, a fresh coverage entry is built from the
    pre-trim frame. The result is rebuilt with ``dataclasses.replace``, so
    fields not passed here carry over unchanged.

    Args:
        load_result: The load result to trim. Its ``logs`` dict is not mutated.
        family_patterns: Patterns that belong to the family being trimmed.
        span: Length of the window to keep.
        keep_null: Whether rows with a null ts survive the trim.

    Returns:
        A new ``LoadResult``, or ``load_result`` itself when the family has no
        data window (e.g. none of its patterns has rows).
    """
    # Work on a shallow copy so per-pattern reassignment never touches the
    # dict owned by the caller's LoadResult.
    trimmed_logs = dict(load_result.logs)

    family_frames = {}
    for pattern in family_patterns:
        if pattern in trimmed_logs and not trimmed_logs[pattern].empty:
            family_frames[pattern] = trimmed_logs[pattern]

    family_window = _data_window(family_frames)
    if family_window is None:
        return load_result

    upper = family_window[1]
    lower = upper - span
    coverage = dict(load_result.coverage)

    for pattern, before in family_frames.items():
        after = _apply_ts_filter(before, lower, upper, keep_null=keep_null)
        trimmed_logs[pattern] = after
        if before.empty or not after.empty:
            continue
        # The pattern went non-empty → empty: rebuild its coverage from the
        # pre-trim frame. No rows were kept, so mark_kept is deliberately not
        # called on the tracker.
        tracker = CoverageTracker()
        tracker.note_file_read()
        tracker.observe_frame(before)
        rebuilt = tracker.coverage(True)
        if rebuilt is not None:
            coverage[pattern] = rebuilt

    return replace(
        load_result,
        logs=trimmed_logs,
        record_counts={
            pattern: len(frame)
            for pattern, frame in trimmed_logs.items()
            if not frame.empty
        },
        data_window=_data_window(trimmed_logs),
        coverage=coverage,
    )
@@ -0,0 +1,93 @@
1
+ """Reporter, OutputHandler base class, and handler registry.
2
+
3
+ Findings flow: list[Finding] → Reporter → one or more OutputHandler instances.
4
+
5
+ Detectors never know how output is handled. Adding a new output format means
6
+ implementing one OutputHandler subclass in loghunter/outputs/. Nothing else changes.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import importlib
12
+ from abc import ABC, abstractmethod
13
+
14
+ from loghunter.common.finding import Finding, RunSummary
15
+
16
+
17
class OutputHandler(ABC):
    """Abstract contract every output-format handler must fulfil.

    Subclasses provide ``begin()``, ``write()`` and ``end()``. The intended
    call order is ``begin`` → ``write`` (once per detector group) → ``end``.
    The class cannot be instantiated until all three are implemented.
    """

    @abstractmethod
    def begin(self, run_summary: RunSummary) -> None:
        """Start the output; invoked once, before any findings are written.

        This is where the run summary should be rendered.
        """
        raise NotImplementedError

    @abstractmethod
    def write(self, findings: list[Finding]) -> None:
        """Emit a batch of findings coming from one or more detectors."""
        raise NotImplementedError

    @abstractmethod
    def end(self) -> None:
        """Finish the output; invoked once, after all findings are written.

        This is where resources should be flushed and closed.
        """
        raise NotImplementedError
38
+
39
+
40
class Reporter:
    """Fan each lifecycle call out to a list of OutputHandler instances.

    Handlers are called in the order they were supplied. The list passed to
    the constructor is stored as-is (not copied).
    """

    def __init__(self, handlers: list[OutputHandler]) -> None:
        self._handlers = handlers

    def begin(self, run_summary: RunSummary) -> None:
        """Forward ``begin(run_summary)`` to every handler, in order."""
        for sink in self._handlers:
            sink.begin(run_summary)

    def write(self, findings: list[Finding]) -> None:
        """Forward ``write(findings)`` to every handler, in order."""
        for sink in self._handlers:
            sink.write(findings)

    def end(self) -> None:
        """Forward ``end()`` to every handler, in order."""
        for sink in self._handlers:
            sink.end()

    def run(self, findings: list[Finding], run_summary: RunSummary) -> None:
        """Run the whole lifecycle in one call: begin, then write, then end."""
        self.begin(run_summary)
        self.write(findings)
        self.end()
66
+
67
+
68
# Maps an output format name (e.g. "text", "json") to its handler class.
_HANDLER_REGISTRY: dict[str, type[OutputHandler]] = {}


def register_handler(name: str, cls: type[OutputHandler]) -> None:
    """Associate the format name ``name`` with the handler class ``cls``.

    Registering a name that is already present replaces the earlier entry.
    """
    _HANDLER_REGISTRY[name] = cls
74
+
75
+
76
def register_builtin_handlers() -> None:
    """Import each built-in output module under ``loghunter.outputs``.

    The modules imported are ``text``, ``json``, ``csv`` and ``html``; the
    import is what lets their handlers register themselves.
    """
    builtin_modules = ("text", "json", "csv", "html")
    for short_name in builtin_modules:
        importlib.import_module(f"loghunter.outputs.{short_name}")
80
+
81
+
82
def get_handler(name: str) -> type[OutputHandler]:
    """Look up the OutputHandler class registered for a format name.

    The built-in handlers are registered first, so they are always available
    to the lookup.

    Args:
        name: Output format name, e.g. ``"text"`` or ``"json"``.

    Returns:
        The handler class registered under ``name``.

    Raises:
        ValueError: If ``name`` is not registered. The message lists the
            available format names in sorted order.
    """
    register_builtin_handlers()
    if name in _HANDLER_REGISTRY:
        return _HANDLER_REGISTRY[name]
    available = ", ".join(sorted(_HANDLER_REGISTRY))
    raise ValueError(
        f"Unknown output format '{name}'. Available formats: {available}"
    )