PyPI - loghunter-cli - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

loghunter/__init__.py +3 -0
loghunter/cli.py +1108 -0
loghunter/cli_init.py +567 -0
loghunter/common/__init__.py +1 -0
loghunter/common/allowlist.py +436 -0
loghunter/common/clustering.py +326 -0
loghunter/common/config.py +221 -0
loghunter/common/display.py +323 -0
loghunter/common/errors.py +45 -0
loghunter/common/finding.py +239 -0
loghunter/common/loader/__init__.py +136 -0
loghunter/common/loader/diagnostics.py +94 -0
loghunter/common/loader/discovery.py +335 -0
loghunter/common/loader/io.py +76 -0
loghunter/common/loader/pipeline.py +1010 -0
loghunter/common/loader/sniff.py +184 -0
loghunter/common/loader/types.py +207 -0
loghunter/common/loader/windowing.py +523 -0
loghunter/common/output.py +93 -0
loghunter/common/paths.py +105 -0
loghunter/common/sources.py +392 -0
loghunter/data/allowlist/connections.txt +50 -0
loghunter/data/allowlist/domains_devices.txt +5 -0
loghunter/data/allowlist/domains_homelab.txt +5 -0
loghunter/data/allowlist/domains_universal.txt +125 -0
loghunter/data/config_example.toml +144 -0
loghunter/detectors/__init__.py +5 -0
loghunter/detectors/auth.py +27 -0
loghunter/detectors/aws.py +671 -0
loghunter/detectors/beacon.py +258 -0
loghunter/detectors/dns.py +778 -0
loghunter/detectors/dnsblock.py +29 -0
loghunter/detectors/duration.py +178 -0
loghunter/detectors/protocol.py +26 -0
loghunter/detectors/scan.py +735 -0
loghunter/detectors/ssl.py +25 -0
loghunter/detectors/syslog.py +266 -0
loghunter/detectors/weird.py +27 -0
loghunter/digest/__init__.py +43 -0
loghunter/digest/_stats.py +182 -0
loghunter/digest/blob.py +698 -0
loghunter/digest/cloudtrail.py +341 -0
loghunter/digest/conn.py +367 -0
loghunter/digest/dns.py +364 -0
loghunter/digest/syslog.py +269 -0
loghunter/exporters/__init__.py +534 -0
loghunter/exporters/cloudtrail.py +499 -0
loghunter/exporters/splunk.py +222 -0
loghunter/outputs/__init__.py +1 -0
loghunter/outputs/allowlist.py +75 -0
loghunter/outputs/csv.py +70 -0
loghunter/outputs/email.py +44 -0
loghunter/outputs/html.py +99 -0
loghunter/outputs/json.py +77 -0
loghunter/outputs/text.py +1422 -0
loghunter/parsers/__init__.py +1 -0
loghunter/parsers/cloudtrail.py +287 -0
loghunter/parsers/dnsmasq.py +331 -0
loghunter/parsers/syslog.py +150 -0
loghunter/parsers/zeek.py +294 -0
loghunter/parsers/zeek_tsv.py +310 -0
loghunter/runner.py +1895 -0
loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
migrations/cloudtrail_parquet.py +59 -0
migrations/conn_fft.py +550 -0
migrations/conn_scan.py +1097 -0
migrations/dns_dbscan.py +520 -0
migrations/get_syslog.py +402 -0
migrations/syslog_drain3.py +479 -0
scratch/junk/parquet.py +59 -0
tests/__init__.py +1 -0
tests/_cloudtrail_fakes.py +116 -0
tests/conftest.py +17 -0
tests/test_allowlist_defaults_accessor.py +90 -0
tests/test_architecture_spine.py +302 -0
tests/test_aws_detector.py +504 -0
tests/test_be_like_water.py +106 -0
tests/test_cli_help.py +342 -0
tests/test_cli_multi_positional.py +458 -0
tests/test_cloudtrail_exporter.py +631 -0
tests/test_cloudtrail_exporter_botocore.py +207 -0
tests/test_cloudtrail_parser.py +393 -0
tests/test_clustering.py +85 -0
tests/test_clustering_interruptible.py +404 -0
tests/test_config_cli.py +1006 -0
tests/test_config_example_drift.py +164 -0
tests/test_digest_blob.py +1237 -0
tests/test_digest_cli.py +1040 -0
tests/test_digest_cloudtrail.py +980 -0
tests/test_digest_conn.py +1189 -0
tests/test_digest_dns.py +770 -0
tests/test_digest_stats.py +282 -0
tests/test_digest_syslog.py +724 -0
tests/test_display.py +370 -0
tests/test_dns_detector.py +1010 -0
tests/test_dnsmasq_parser.py +467 -0
tests/test_duration_detector.py +491 -0
tests/test_export_orchestrator_shape.py +153 -0
tests/test_init_wizard.py +707 -0
tests/test_loader.py +3639 -0
tests/test_loader_package_surface.py +115 -0
tests/test_loader_window_model.py +215 -0
tests/test_output_path_cascade.py +575 -0
tests/test_resolve_path.py +111 -0
tests/test_root_provenance.py +212 -0
tests/test_runner.py +2599 -0
tests/test_scan_detector.py +455 -0
tests/test_search_paths.py +50 -0
tests/test_sniff_orchestrator.py +373 -0
tests/test_sniff_recognizers.py +573 -0
tests/test_source_resolution_seam.py +471 -0
tests/test_sources.py +648 -0
tests/test_splunk_exporter.py +351 -0
tests/test_syslog_detector.py +458 -0
tests/test_syslog_parser.py +582 -0
tests/test_text_output.py +1225 -0
tests/test_zeek_tsv_parser.py +580 -0

loghunter/parsers/zeek_tsv.py ADDED Viewed

@@ -0,0 +1,310 @@
+"""Zeek TSV log parser — header-block parsing and type coercion.
+This module is the TSV front-end for Zeek log parsing. It produces a pre-normalization
+DataFrame with Zeek-native column names and Python-typed values, ready for consumption
+by the normalizers in parsers/zeek.py (_normalize_conn_df, _normalize_dns_df).
+Architecture: one normalizer, two front-ends. The NDJSON front-end (common/loader.py)
+and this TSV front-end both produce the same intermediate DataFrame shape. Normalizers
+are never aware of which format was loaded.
+File I/O and decompression are the caller's responsibility (common/loader.py, stage 2).
+"""
+from __future__ import annotations
+import re
+from dataclasses import dataclass, field
+from typing import Any, Iterable, Iterator
+import pandas as pd
# Marker object handed back by _coerce for a field that holds the unset token.
# Callers compare against it by identity and leave the key out of the record
# dict instead of storing an explicit None. Leaving the key absent keeps this
# path aligned with NDJSON loading: pd.DataFrame(records) fills keys that are
# missing from a record with NaN, which is what an absent NDJSON field yields.
_UNSET = object()
@dataclass
class _TSVHeader:
    """Directive values collected from a Zeek TSV header block.

    Every attribute starts at the default shown here; the header parser
    overwrites it when the matching ``#`` directive is read.
    """

    # Column delimiter declared by #separator.
    separator: str = "\t"
    # Delimiter between elements of set[...]/vector[...] values (Zeek spec default).
    set_separator: str = ","
    # Token that stands for an empty value (Zeek spec default).
    empty_field: str = "(empty)"
    # Token that stands for an unset value (Zeek spec default).
    unset_field: str = "-"
    # Value of the #path directive.
    path: str = ""
    # Column names from #fields, and the Zeek type of each column from #types.
    fields: list[str] = field(default_factory=list)
    types: list[str] = field(default_factory=list)
    # Set to True once #separator has actually been declared (it is required).
    # Excluded from the generated repr.
    _separator_seen: bool = field(default=False, repr=False)
def _unescape_separator(raw: str) -> str:
    """Expand ``\\xHH`` escapes in a #separator value into literal characters.

    Every backslash-x followed by exactly two hex digits (for example
    ``\\x09``) is replaced by the character with that code point. Any other
    text in *raw* is returned unchanged.
    """
    def _decode(match: re.Match) -> str:
        # Group 1 holds the two hex digits that follow the "\x" prefix.
        return chr(int(match.group(1), 16))

    hex_escape = re.compile(r"\\x([0-9a-fA-F]{2})")
    return hex_escape.sub(_decode, raw)
# NOTE(review): presumably the number of leading lines callers are expected to
# hand to sniff(); nothing in this module reads it — confirm against the loader.
SNIFF_PEEK_LINES: int = 16


def sniff(sample: list[str]) -> str | None:
    """Recognize a Zeek TSV header and return its digester target.

    Returns "conn", "dns", or "syslog" when the sample carries a header block
    that declares #separator, #fields, and a #path with one of those three
    values. Returns None for every other shape, including:

    * text that merely contains a "#path" substring without a real header,
    * Zeek TSV logs whose #path is something else (notice/analyzer/etc.),
    * a #separator directive that carries no value (malformed header).

    Pure: takes already-decoded lines and performs no I/O. Blank lines are
    skipped; scanning stops at the first line that does not start with "#".
    Directives that appear before #separator are ignored because they cannot
    be split without it.
    """
    separator: str | None = None
    path: str | None = None
    fields_seen = False

    for raw_line in sample:
        line = raw_line.rstrip("\r\n")
        if not line:
            continue
        if not line.startswith("#"):
            # First data (or unrelated text) line: the header block is over.
            break

        if line.startswith(("#separator ", "#separator\t")):
            # #separator is delimited from its value by plain whitespace.
            pieces = line.split(None, 1)
            if len(pieces) < 2:
                # Directive present but valueless: not a well-formed header.
                # Indexing pieces[1] here would raise IndexError.
                return None
            separator = _unescape_separator(pieces[1].strip())
            continue

        if separator is None:
            # Other directives use the parsed separator; without #separator
            # we cannot split them. Skip — #separator may yet appear.
            continue

        key, *values = line[1:].split(separator)
        if key == "path":
            path = values[0] if values else ""
        elif key == "fields":
            fields_seen = True

    # A non-None separator implies at least one directive line was seen.
    if separator is None or not fields_seen:
        return None
    return path if path in ("conn", "dns", "syslog") else None
def _parse_header(lines: Iterator[str]) -> tuple[_TSVHeader, list[str]]:
    """Parse the Zeek TSV header block and return (header, buffered_data_lines).

    Reads #-prefixed directive lines until the first non-# line or #close.
    The first non-# line is the first data row and is included in the returned
    data lines. The rest of the iterator is then drained into the data lines,
    stopping at the next #close. Blank lines are skipped throughout, and every
    returned line has its trailing CR/LF removed.

    #separator is split from its value on whitespace; every other directive is
    split on the separator declared so far (tab until #separator is seen).

    Raises:
        ValueError: if #separator carries no value, if a data row appears
            before #separator was declared, if #fields or #types is missing,
            or if #fields and #types have different lengths.
    """
    hdr = _TSVHeader()
    data_lines: list[str] = []

    for raw_line in lines:
        line = raw_line.rstrip("\r\n")
        if not line:
            continue

        if line.startswith(("#separator ", "#separator\t")):
            # #separator uses plain whitespace as its own delimiter.
            pieces = line.split(None, 1)
            if len(pieces) < 2:
                # Without this guard pieces[1] raises IndexError, which
                # callers expecting ValueError for bad headers do not catch.
                raise ValueError("Zeek TSV header has empty #separator")
            hdr.separator = _unescape_separator(pieces[1].strip())
            hdr._separator_seen = True
            continue

        if line.startswith("#close"):
            # NOTE(review): the drain loop below still runs after this break,
            # so lines following a header-level #close are buffered as data.
            break

        if line.startswith("#"):
            # All other directives use the declared separator.
            key, *values = line[1:].split(hdr.separator)
            if key == "set_separator":
                hdr.set_separator = values[0] if values else ","
            elif key == "empty_field":
                hdr.empty_field = values[0] if values else "(empty)"
            elif key == "unset_field":
                hdr.unset_field = values[0] if values else "-"
            elif key == "path":
                hdr.path = values[0] if values else ""
            elif key == "fields":
                hdr.fields = values
            elif key == "types":
                hdr.types = values
            # #open and other directives are silently ignored.
            continue

        # First non-# line: data row.
        if not hdr._separator_seen:
            raise ValueError("Zeek TSV header missing #separator")
        data_lines.append(line)
        break

    # Drain remaining lines. Non-#close comment lines are kept here; the row
    # parser is responsible for skipping them.
    for raw_line in lines:
        line = raw_line.rstrip("\r\n")
        if line.startswith("#close"):
            break
        if line:
            data_lines.append(line)

    # Validate required directives.
    if not hdr.fields:
        raise ValueError("Zeek TSV header missing #fields")
    if not hdr.types:
        raise ValueError("Zeek TSV header missing #types")
    if len(hdr.fields) != len(hdr.types):
        raise ValueError(
            f"Zeek TSV #fields has {len(hdr.fields)} columns but "
            f"#types has {len(hdr.types)} — header is malformed"
        )
    return hdr, data_lines
# Container-type prefix regex for set[…] and vector[…]; group 1 is the
# element type.
_CONTAINER_RE = re.compile(r"^(?:set|vector)\[(.+)\]$")

# Scalar Zeek types grouped by the Python type _coerce produces for them.
_FLOAT_TYPES = frozenset({"time", "interval", "double"})
_INT_TYPES = frozenset({"count", "int", "port"})
_TEXT_TYPES = frozenset({"addr", "string", "enum"})

# Known scalar Zeek types. Anything not in this set or not a container raises.
_SCALAR_TYPES = _FLOAT_TYPES | _INT_TYPES | _TEXT_TYPES | frozenset({"bool"})


def _coerce(
    raw: str,
    zeek_type: str,
    set_sep: str,
    empty_field: str,
    unset_field: str,
) -> Any:
    """Coerce a raw TSV field value to its Python equivalent for the given Zeek type.

    Results by type:
        time / interval / double -> float
        count / int / port       -> int
        bool                     -> True for "T", False for "F"
        addr / string / enum     -> str ("" for the empty token)
        set[T] / vector[T]       -> list of coerced T ([] for the empty token)

    Returns _UNSET when the value is the unset token — the caller must omit the
    key from the record dict rather than inserting None. The unset check runs
    before the type is examined, so an unset value never raises.

    Raises:
        ValueError: for unknown types, invalid bool tokens, the empty token in
            a numeric or bool field, text that does not parse as the declared
            numeric type, and the unset token inside a collection element.
    """
    if raw == unset_field:
        return _UNSET

    # Container types: set[inner] and vector[inner].
    container = _CONTAINER_RE.match(zeek_type)
    if container:
        if raw == empty_field:
            return []
        inner_type = container.group(1)
        result = []
        for element in raw.split(set_sep):
            coerced = _coerce(element, inner_type, set_sep, empty_field, unset_field)
            if coerced is _UNSET:
                raise ValueError(
                    f"Zeek TSV: unset token found inside collection element "
                    f"(type {zeek_type!r}); individual elements cannot be unset"
                )
            result.append(coerced)
        return result

    # Scalar types: _SCALAR_TYPES is the single source of truth.
    if zeek_type not in _SCALAR_TYPES:
        raise ValueError(f"Zeek TSV: unsupported Zeek type {zeek_type!r}")

    if zeek_type in _TEXT_TYPES:
        return "" if raw == empty_field else raw

    if zeek_type == "bool":
        if raw == empty_field:
            raise ValueError("Zeek TSV: empty token in bool field")
        if raw == "T":
            return True
        if raw == "F":
            return False
        raise ValueError(
            f"Zeek TSV: invalid bool token {raw!r} — expected 'T' or 'F'"
        )

    # Only the numeric types remain.
    if raw == empty_field:
        raise ValueError(
            f"Zeek TSV: empty token in numeric field (type {zeek_type!r})"
        )
    convert = float if zeek_type in _FLOAT_TYPES else int
    try:
        return convert(raw)
    except ValueError as err:
        # Re-raise with the Zeek type so the failure is attributable to the log
        # rather than surfacing as a bare int()/float() message.
        raise ValueError(
            f"Zeek TSV: cannot parse {raw!r} as Zeek type {zeek_type!r}"
        ) from err
def parse_tsv_log(source: Iterable[str]) -> pd.DataFrame:
    """Turn one Zeek TSV log stream into a pre-normalization DataFrame.

    *source* is any iterable of text lines: an open text stream, or for
    example the output of str.splitlines(keepends=True).

    Columns keep their Zeek-native names (id.orig_h, id.resp_p, TTLs, answers,
    and so on). Cell values are plain Python objects chosen to match what
    json.loads yields on the NDJSON path: float for time/interval/double, int
    for count/int/port, bool for bool, list for set[…]/vector[…]. A field
    holding the unset token is left out of its record altogether.

    The result is meant to be handed unchanged to _normalize_conn_df or
    _normalize_dns_df in loghunter.parsers.zeek.

    When no data rows are present, an empty frame whose columns are the
    declared #fields is returned.

    Raises ValueError for malformed headers, ragged rows, invalid coercions,
    or unknown Zeek types.
    """
    header, buffered = _parse_header(iter(source))
    expected = len(header.fields)
    schema = list(zip(header.fields, header.types))

    rows: list[dict[str, Any]] = []
    for position, buffered_line in enumerate(buffered, start=1):
        # Defensive: drop any line ending the header parser did not remove.
        text = buffered_line.rstrip("\r\n")
        if text == "" or text.startswith("#"):
            continue

        cells = text.split(header.separator)
        if len(cells) != expected:
            raise ValueError(
                f"Zeek TSV: line {position} has {len(cells)} fields, "
                f"expected {expected}"
            )

        row: dict[str, Any] = {}
        for (column, zeek_type), cell in zip(schema, cells):
            coerced = _coerce(
                cell,
                zeek_type,
                header.set_separator,
                header.empty_field,
                header.unset_field,
            )
            if coerced is _UNSET:
                # Unset fields are represented by an absent key.
                continue
            row[column] = coerced
        rows.append(row)

    if rows:
        return pd.DataFrame(rows)
    return pd.DataFrame(columns=header.fields)