PyPI - loghunter-cli - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

loghunter/__init__.py +3 -0
loghunter/cli.py +1108 -0
loghunter/cli_init.py +567 -0
loghunter/common/__init__.py +1 -0
loghunter/common/allowlist.py +436 -0
loghunter/common/clustering.py +326 -0
loghunter/common/config.py +221 -0
loghunter/common/display.py +323 -0
loghunter/common/errors.py +45 -0
loghunter/common/finding.py +239 -0
loghunter/common/loader/__init__.py +136 -0
loghunter/common/loader/diagnostics.py +94 -0
loghunter/common/loader/discovery.py +335 -0
loghunter/common/loader/io.py +76 -0
loghunter/common/loader/pipeline.py +1010 -0
loghunter/common/loader/sniff.py +184 -0
loghunter/common/loader/types.py +207 -0
loghunter/common/loader/windowing.py +523 -0
loghunter/common/output.py +93 -0
loghunter/common/paths.py +105 -0
loghunter/common/sources.py +392 -0
loghunter/data/allowlist/connections.txt +50 -0
loghunter/data/allowlist/domains_devices.txt +5 -0
loghunter/data/allowlist/domains_homelab.txt +5 -0
loghunter/data/allowlist/domains_universal.txt +125 -0
loghunter/data/config_example.toml +144 -0
loghunter/detectors/__init__.py +5 -0
loghunter/detectors/auth.py +27 -0
loghunter/detectors/aws.py +671 -0
loghunter/detectors/beacon.py +258 -0
loghunter/detectors/dns.py +778 -0
loghunter/detectors/dnsblock.py +29 -0
loghunter/detectors/duration.py +178 -0
loghunter/detectors/protocol.py +26 -0
loghunter/detectors/scan.py +735 -0
loghunter/detectors/ssl.py +25 -0
loghunter/detectors/syslog.py +266 -0
loghunter/detectors/weird.py +27 -0
loghunter/digest/__init__.py +43 -0
loghunter/digest/_stats.py +182 -0
loghunter/digest/blob.py +698 -0
loghunter/digest/cloudtrail.py +341 -0
loghunter/digest/conn.py +367 -0
loghunter/digest/dns.py +364 -0
loghunter/digest/syslog.py +269 -0
loghunter/exporters/__init__.py +534 -0
loghunter/exporters/cloudtrail.py +499 -0
loghunter/exporters/splunk.py +222 -0
loghunter/outputs/__init__.py +1 -0
loghunter/outputs/allowlist.py +75 -0
loghunter/outputs/csv.py +70 -0
loghunter/outputs/email.py +44 -0
loghunter/outputs/html.py +99 -0
loghunter/outputs/json.py +77 -0
loghunter/outputs/text.py +1422 -0
loghunter/parsers/__init__.py +1 -0
loghunter/parsers/cloudtrail.py +287 -0
loghunter/parsers/dnsmasq.py +331 -0
loghunter/parsers/syslog.py +150 -0
loghunter/parsers/zeek.py +294 -0
loghunter/parsers/zeek_tsv.py +310 -0
loghunter/runner.py +1895 -0
loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
migrations/cloudtrail_parquet.py +59 -0
migrations/conn_fft.py +550 -0
migrations/conn_scan.py +1097 -0
migrations/dns_dbscan.py +520 -0
migrations/get_syslog.py +402 -0
migrations/syslog_drain3.py +479 -0
scratch/junk/parquet.py +59 -0
tests/__init__.py +1 -0
tests/_cloudtrail_fakes.py +116 -0
tests/conftest.py +17 -0
tests/test_allowlist_defaults_accessor.py +90 -0
tests/test_architecture_spine.py +302 -0
tests/test_aws_detector.py +504 -0
tests/test_be_like_water.py +106 -0
tests/test_cli_help.py +342 -0
tests/test_cli_multi_positional.py +458 -0
tests/test_cloudtrail_exporter.py +631 -0
tests/test_cloudtrail_exporter_botocore.py +207 -0
tests/test_cloudtrail_parser.py +393 -0
tests/test_clustering.py +85 -0
tests/test_clustering_interruptible.py +404 -0
tests/test_config_cli.py +1006 -0
tests/test_config_example_drift.py +164 -0
tests/test_digest_blob.py +1237 -0
tests/test_digest_cli.py +1040 -0
tests/test_digest_cloudtrail.py +980 -0
tests/test_digest_conn.py +1189 -0
tests/test_digest_dns.py +770 -0
tests/test_digest_stats.py +282 -0
tests/test_digest_syslog.py +724 -0
tests/test_display.py +370 -0
tests/test_dns_detector.py +1010 -0
tests/test_dnsmasq_parser.py +467 -0
tests/test_duration_detector.py +491 -0
tests/test_export_orchestrator_shape.py +153 -0
tests/test_init_wizard.py +707 -0
tests/test_loader.py +3639 -0
tests/test_loader_package_surface.py +115 -0
tests/test_loader_window_model.py +215 -0
tests/test_output_path_cascade.py +575 -0
tests/test_resolve_path.py +111 -0
tests/test_root_provenance.py +212 -0
tests/test_runner.py +2599 -0
tests/test_scan_detector.py +455 -0
tests/test_search_paths.py +50 -0
tests/test_sniff_orchestrator.py +373 -0
tests/test_sniff_recognizers.py +573 -0
tests/test_source_resolution_seam.py +471 -0
tests/test_sources.py +648 -0
tests/test_splunk_exporter.py +351 -0
tests/test_syslog_detector.py +458 -0
tests/test_syslog_parser.py +582 -0
tests/test_text_output.py +1225 -0
tests/test_zeek_tsv_parser.py +580 -0

loghunter/common/loader/sniff.py ADDED Viewed

@@ -0,0 +1,184 @@
+"""Content sniffing — the digest schema cascade and the syslog content gate.
+Two deliberately separate sniff heads (CODE.md: do NOT unify — dnsmasq IS RFC
+3164): ``sniff_format`` / ``sniff_format_detailed`` (the digest recognizer
+cascade) and ``_looks_like_syslog`` (the syslog discovery content gate).
+``_open_log`` is reached through the package facade so test monkeypatches of
+``loghunter.common.loader._open_log`` take effect here.
+"""
+from __future__ import annotations
+import gzip
+import itertools
+import lzma
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any
+import loghunter.common.loader as _loader  # facade: _open_log patch-through (call-time only)
+from loghunter.parsers import (
+    cloudtrail as _cloudtrail_parser,
+    dnsmasq as _dnsmasq_parser,
+    syslog as _syslog_parser,
+    zeek as _zeek_parser,
+    zeek_tsv as _zeek_tsv_parser,
+)
+def _is_ndjson(path: Path) -> bool:
+    """Return True if the file's first content line starts with '{' (NDJSON)."""
+    with _loader._open_log(path) as fh:
+        for line in fh:
+            s = line.strip()
+            if s and not s.startswith("#"):
+                return s.startswith("{")
+    return False
+# Size limit of the prefix read by the syslog content-sniff gate
+# (``fh.read(_SYSLOG_SNIFF_BYTES)`` in ``_looks_like_syslog``). The bound is
+# load-bearing: a line-bounded peek would read a newline-sparse binary
+# (wtmp/btmp/lastlog) to EOF; a size-bounded read cannot (blob's
+# hard-bounded-window rail).
+# NOTE(review): the gate tests the chunk against a ``str`` NUL, so the handle
+# presumably yields decoded text and the limit counts characters of decoded
+# text rather than raw on-disk bytes — confirm against ``_open_log``.
+_SYSLOG_SNIFF_BYTES = 8192
+def _looks_like_syslog(path: Path) -> bool:
+    """Content-sniff gate: True iff a BOUNDED decompressed prefix of ``path``
+    reads as RFC 3164 syslog.
+    Byte-bounded printable gate FIRST, then the syslog recognizer DIRECTLY — NOT
+    the full ``sniff_format`` cascade. Rationale: dnsmasq lines ARE RFC 3164 and
+    ``dnsmasq.sniff`` is strict, so the cascade would route a dnsmasq-query-first
+    ``messages`` to "dns"; the syslog recognizer claims any real RFC-3164 header
+    (incl. dnsmasq's) and cleanly rejects ISO-timestamped ``dnf``/``hawkey``,
+    systemd ``boot.log``, and binaries.
+    Conservative-include on a read error: return True so the file defers to
+    ``run_load``'s disclosed corruption rail (``_zeek_file_read_warning``) rather
+    than being silently dropped. A gzip rotation decompresses CLEAN through
+    ``_open_log``, so the NUL test runs on decoded text — never on raw compressed
+    bytes.
+    """
+    try:
+        with _loader._open_log(path) as fh:
+            chunk = fh.read(_SYSLOG_SNIFF_BYTES)
+    except (EOFError, gzip.BadGzipFile, lzma.LZMAError, OSError):
+        return True
+    if "\x00" in chunk:
+        return False
+    lines = chunk.splitlines()
+    return _syslog_parser.sniff(lines[: _syslog_parser.SNIFF_PEEK_LINES]) is not None
+# Per-parser recognizers in fixed precedence — most-specific-first. Each entry
+# is ``(parser module, peek budget)``; the budget is that module's own
+# ``SNIFF_PEEK_LINES``. The orchestrator runs each in turn; first non-None
+# target wins. Precedence is the ambiguity policy: zeek_tsv before cloudtrail
+# because the TSV header is the strongest signal; cloudtrail before zeek
+# (NDJSON) so CloudTrail events are not claimed by the looser Zeek key-set
+# test; dnsmasq before syslog because dnsmasq IS RFC 3164 and would otherwise
+# be claimed as generic syslog.
+_SNIFF_RECOGNIZERS: tuple[tuple[Any, int], ...] = (
+    (_zeek_tsv_parser, _zeek_tsv_parser.SNIFF_PEEK_LINES),
+    (_cloudtrail_parser, _cloudtrail_parser.SNIFF_PEEK_LINES),
+    (_zeek_parser, _zeek_parser.SNIFF_PEEK_LINES),
+    (_dnsmasq_parser, _dnsmasq_parser.SNIFF_PEEK_LINES),
+    (_syslog_parser, _syslog_parser.SNIFF_PEEK_LINES),
+)
+# Largest budget any recognizer asks for. The sniffers read this many lines
+# once and give each recognizer only its own ``sample[:budget]`` prefix.
+_SNIFF_MAX_PEEK: int = max(b for _, b in _SNIFF_RECOGNIZERS)
+# Winning-recognizer module → source-family origin. The CLI uses origin to
+# split Zeek-dns from Pi-hole-dns without re-reading the file. Every module
+# listed in ``_SNIFF_RECOGNIZERS`` has an entry here, so the
+# ``_SNIFF_ORIGIN[mod]`` lookup in ``sniff_format_detailed`` cannot miss.
+_SNIFF_ORIGIN: dict[Any, str] = {
+    _zeek_tsv_parser: "zeek",
+    _zeek_parser: "zeek",
+    _cloudtrail_parser: "cloudtrail",
+    _dnsmasq_parser: "pihole",
+    _syslog_parser: "syslog",
+}
+def sniff_format(path: Path) -> str:
+    """Classify a log file into a digest schema by sampling its head.
+    Opens ``path`` via ``_open_log`` (gzip-transparent), reads at most
+    ``_SNIFF_MAX_PEEK`` lines once, and runs the per-parser recognizers in
+    fixed precedence (zeek_tsv → cloudtrail → zeek → dnsmasq → syslog).
+    Each recognizer sees only the prefix it asked for via ``SNIFF_PEEK_LINES``.
+    Returns one of "conn" | "dns" | "syslog" | "cloudtrail" | "blob". The
+    "blob" floor covers empty files and any content no recognizer claims.
+    This function classifies content only — the CLI-level decision of how
+    to handle empty inputs is layered on top in a later stage and is not
+    pre-empted here.
+    """
+    with _loader._open_log(path) as fh:
+        sample = list(itertools.islice(fh, _SNIFF_MAX_PEEK))
+    if not sample:
+        return "blob"
+    for mod, budget in _SNIFF_RECOGNIZERS:
+        target = mod.sniff(sample[:budget])
+        if target is not None:
+            return target
+    return "blob"
+@dataclass(frozen=True)
+class SniffResult:
+    """Detailed sniff outcome — schema plus source-family origin.
+    Immutable value returned by ``sniff_format_detailed``.
+    ``state`` is "empty" or "classified". On "empty", ``schema`` and ``origin``
+    are both None. On "classified", ``schema`` is one of
+    {conn, dns, syslog, cloudtrail, blob}; ``origin`` is the winning
+    recognizer's source family ({zeek, pihole, syslog, cloudtrail}) when a
+    recognizer claimed the sample, or None on the blob floor.
+    """
+    # "empty" or "classified".
+    state: str
+    # Digest schema name; None only when state is "empty".
+    schema: str | None
+    # Source family from ``_SNIFF_ORIGIN``; None when empty or on the blob floor.
+    origin: str | None
+def sniff_format_detailed(path: Path) -> SniffResult:
+    """Classify a log file and expose origin + empty-state.
+    Sibling to ``sniff_format``. Single bounded read (``_SNIFF_MAX_PEEK`` lines
+    plus a one-line EOF probe). The CLI uses the result to short-circuit
+    truly-empty files and to split Zeek-dns vs Pi-hole-dns by origin.
+    Empty-detection contract is EOF-sensitive (leading whitespace beyond the
+    peek does not classify as empty):
+    1. Zero-byte file → state="empty" without opening.
+    2. Sample length zero → state="empty".
+    3. Every sampled line is whitespace-only AND EOF was reached within the
+       bounded read → state="empty".
+    4. Every sampled line is whitespace-only AND EOF was NOT reached (file
+       has more content beyond the peek) → fall through to the recognizer
+       cascade; the blob floor catches it.
+    Otherwise the same precedence as ``sniff_format``; origin is mapped from
+    the winning recognizer module via ``_SNIFF_ORIGIN``. Blob floor returns
+    ``schema="blob"``, ``origin=None``.
+    """
+    if path.stat().st_size == 0:
+        return SniffResult(state="empty", schema=None, origin=None)
+    with _loader._open_log(path) as fh:
+        sample = list(itertools.islice(fh, _SNIFF_MAX_PEEK))
+        # One-line EOF probe — at most _SNIFF_MAX_PEEK + 1 lines read total.
+        eof_reached = next(fh, None) is None
+    if not sample:
+        return SniffResult(state="empty", schema=None, origin=None)
+    if eof_reached and all(not line.strip() for line in sample):
+        return SniffResult(state="empty", schema=None, origin=None)
+    for mod, budget in _SNIFF_RECOGNIZERS:
+        target = mod.sniff(sample[:budget])
+        if target is not None:
+            return SniffResult(
+                state="classified",
+                schema=target,
+                origin=_SNIFF_ORIGIN[mod],
+            )
+    return SniffResult(state="classified", schema="blob", origin=None)

loghunter/common/loader/types.py ADDED Viewed

@@ -0,0 +1,207 @@
+"""Loader metadata types and the cross-frame window helper (leaf module).
+The dataclasses the loader returns to the runner (``LoadResult`` and the
+disclosure records ``SourceCoverage`` / ``RotationSkipInfo``), the incremental
+``CoverageTracker``, ``_data_window`` (pure ``logs dict → window``), and the
+stream-mode empty-frame column constants. Imports stdlib + pandas only.
+"""
+from __future__ import annotations
+import math
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+import pandas as pd
+# Named log/compression suffixes stripped when deriving hostname from filename.
+# All-numeric rotation suffixes (.1, .10, .42, etc.) are also stripped (that
+# logic lives with the consumer of this set, not in this module).
+# Only log-related suffixes are removed so dotted hostnames (host1.example.com.log.gz) are preserved.
+_LOG_SUFFIXES = frozenset({".gz", ".log"})
+# Column order for Pi-hole frames.
+# NOTE(review): presumably the empty-frame shape for the pihole stream loader,
+# like the other column lists here — confirm against the loader strategy table.
+_PIHOLE_COLUMNS = [
+    "ts", "src", "query", "event_type", "qtype",
+    "dst", "answer", "validation", "host", "raw", "message",
+]
+# CloudTrail canonical row schema. The aws detector (Thread B) consumes frames with
+# these columns in this order. parsers/cloudtrail.py is the single source of truth
+# for what each column means.
+_CLOUDTRAIL_COLUMNS = [
+    "ts", "principal", "lane", "read_write",
+    "event_source", "event_name", "identity_type",
+    "source_ip", "error_code", "aws_region", "event_id", "raw",
+]
+# Stream-mode empty-frame columns. Module-level so the strategy table reads
+# clean; values match the per-loader empty-shape that existed pre-refactor.
+_SYSLOG_COLUMNS = ["ts", "host", "program", "raw", "message"]
+@dataclass(frozen=True)
+class SourceCoverage:
+    """Pre-window coverage for one loaded pattern. Drives the runner's
+    "planned source contributed zero in-window rows" disclosure note.
+    Instances are produced by ``CoverageTracker.coverage``.
+    ``full_rows`` is tri-state and load-bearing:
+      - ``None`` — NO files were read for this pattern (date-pruned dated
+        Zeek). Drives the BARE note ("files found, 0 records in the selected
+        window. Widen…").
+      - ``0``    — files were read but ZERO valid-ts rows survived parsing
+        (empty / header-only / unparseable timestamps — a PARSE gap, not a
+        window gap). Drives NO note: telling the operator to widen the
+        window on an empty file would mislead.
+      - ``>0``   — N valid-ts rows the window excluded. Drives the SPAN
+        note (count + span + widen suggestion).
+    ``full_span`` is None when ``full_rows`` is None or 0.
+    """
+    # Count of valid-ts rows seen before windowing; None / 0 / >0 as described above.
+    full_rows: int | None
+    # (earliest, latest) of those rows; CoverageTracker builds them as UTC datetimes.
+    full_span: tuple[datetime, datetime] | None
+class CoverageTracker:
+    """Builds a SourceCoverage incrementally as a loader reads a pattern.
+    Single mechanism covering BOTH the streaming loaders (syslog / pihole /
+    cloudtrail — observe ts per row) and the frame loader (Zeek — observe
+    the parsed pre-filter frame per file). The runner's flat-Zeek
+    default-window block also uses this tracker.
+    Lifecycle (a single tracker per (pattern) load):
+      - ``note_file_read()`` per file OPENED. Distinguishes
+        "no files read"        (date-pruned)           → ``full_rows = None``
+        from "files read, no valid-ts rows"            → ``full_rows = 0``.
+      - Either ``observe(ts)`` per row pre-window-check (streaming) OR
+        ``observe_frame(pre_df)`` per file pre-``_apply_ts_filter`` (Zeek).
+        Both count VALID-ts rows only.
+      - ``mark_kept()`` on row append (streaming) or non-empty post-window
+        per-file frame (Zeek). Latches so subsequent ``observe`` /
+        ``observe_frame`` calls short-circuit — ZERO normal-path cost.
+      - ``coverage(frame_empty)`` returns a SourceCoverage or None.
+    The tracker holds no references to the data it observed beyond running
+    counts and min/max — safe to retain across the load.
+    """
+    def __init__(self) -> None:
+        self._files_read = False
+        self._kept = False
+        self._valid_rows = 0
+        self._min_ts: float | None = None
+        self._max_ts: float | None = None
+    def note_file_read(self) -> None:
+        self._files_read = True
+    def observe(self, ts: float | None) -> None:
+        if self._kept:
+            return
+        if ts is None:
+            return
+        # NaN-safe: math.isnan rejects NaN before it pollutes min/max.
+        if isinstance(ts, float) and math.isnan(ts):
+            return
+        self._valid_rows += 1
+        if self._min_ts is None or ts < self._min_ts:
+            self._min_ts = ts
+        if self._max_ts is None or ts > self._max_ts:
+            self._max_ts = ts
+    def observe_frame(self, pre_df: pd.DataFrame) -> None:
+        if self._kept:
+            return
+        if pre_df is None or pre_df.empty or "ts" not in pre_df.columns:
+            return
+        valid = pre_df["ts"].dropna()
+        if valid.empty:
+            return
+        self._valid_rows += int(len(valid))
+        frame_min = float(valid.min())
+        frame_max = float(valid.max())
+        if self._min_ts is None or frame_min < self._min_ts:
+            self._min_ts = frame_min
+        if self._max_ts is None or frame_max > self._max_ts:
+            self._max_ts = frame_max
+    def mark_kept(self) -> None:
+        self._kept = True
+    def coverage(self, frame_empty: bool) -> SourceCoverage | None:
+        """Return a SourceCoverage when disclosure is warranted; else None.
+        - data survived (frame non-empty OR mark_kept fired) → None.
+        - no files read                                       → (None, None).
+        - files read but zero valid-ts rows                   → (0, None).
+        - valid rows seen, all excluded by window             → (valid, span).
+        """
+        if not frame_empty or self._kept:
+            return None
+        if not self._files_read:
+            return SourceCoverage(None, None)
+        if self._valid_rows == 0:
+            return SourceCoverage(0, None)
+        span: tuple[datetime, datetime] | None = None
+        if self._min_ts is not None and self._max_ts is not None:
+            span = (
+                datetime.fromtimestamp(self._min_ts, tz=timezone.utc),
+                datetime.fromtimestamp(self._max_ts, tz=timezone.utc),
+            )
+        return SourceCoverage(self._valid_rows, span)
+@dataclass(frozen=True)
+class RotationSkipInfo:
+    """Per-pattern result of flat-log rotation-peek windowing (syslog / pihole).
+    The loader records this STRUCTURED metadata; the runner formats the prose
+    note (``_rotation_skip_notes``) — the loader never imports the runner.
+    ``fallback`` is data-true at the PATTERN level: when any rotation group's
+    first-ts order is non-monotonic, ``_rotation_windowed_files`` disables
+    pruning for the WHOLE pattern and returns every candidate file
+    (``fallback=True``, ``skipped=0``, ``loaded=len(files)``). That keeps the
+    runner's "read the full archive" note honest — a fallback can never coexist
+    with a sibling group that was silently pruned.
+    ``skipped_files`` carries ``(name, oldest_ts_or_None)`` for verbose
+    per-file lines. The early-stopped older tail is never peeked, so its ts is
+    ``None`` — the perf win is real and no timestamp is fabricated.
+    """
+    # Number of files kept for loading.
+    loaded: int
+    # Number of files pruned by the rotation peek (0 on fallback).
+    skipped: int
+    # True when pruning was disabled for the whole pattern.
+    fallback: bool
+    # NOTE(review): presumably a short explanation set alongside fallback=True — confirm.
+    fallback_reason: str | None = None
+    # (file name, oldest ts or None) per skipped file. The dataclass is frozen,
+    # but the list object itself remains mutable.
+    skipped_files: list[tuple[str, datetime | None]] = field(default_factory=list)
+@dataclass
+class LoadResult:
+    """Loaded log data and metadata needed by the runner.
+    Mutable (not frozen); every field after ``record_counts`` has a default,
+    and the container defaults use ``default_factory`` so instances never share
+    a list or dict.
+    """
+    # Loaded frames by name. NOTE(review): key is presumably the log type — confirm.
+    logs: dict[str, pd.DataFrame]
+    # Row counts by name. NOTE(review): presumably the same keys as ``logs`` — confirm.
+    record_counts: dict[str, int]
+    # (earliest, latest) timestamp pair, or None. NOTE(review): presumably the
+    # value computed by ``_data_window`` — confirm at the construction site.
+    data_window: tuple[datetime, datetime] | None = None
+    # Warning strings collected during the load.
+    warnings: list[str] = field(default_factory=list)
+    # Size of the loaded data in bytes (per the field name; units not shown here).
+    data_size_bytes: int = 0
+    # Disclosure records; see SourceCoverage.
+    coverage: dict[str, SourceCoverage] = field(default_factory=dict)
+    # Rotation-peek results; see RotationSkipInfo.
+    rotation_skips: dict[str, RotationSkipInfo] = field(default_factory=dict)
+def _data_window(logs: dict[str, pd.DataFrame]) -> tuple[datetime, datetime] | None:
+    """Compute the min/max timestamp window across loaded DataFrames."""
+    all_ts: list[float] = []
+    for df in logs.values():
+        if not df.empty and "ts" in df.columns:
+            all_ts.extend(df["ts"].dropna().tolist())
+    if not all_ts:
+        return None
+    return (
+        datetime.fromtimestamp(min(all_ts), tz=timezone.utc),
+        datetime.fromtimestamp(max(all_ts), tz=timezone.utc),
+    )