PyPI - loghunter-cli - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

loghunter/__init__.py +3 -0
loghunter/cli.py +1108 -0
loghunter/cli_init.py +567 -0
loghunter/common/__init__.py +1 -0
loghunter/common/allowlist.py +436 -0
loghunter/common/clustering.py +326 -0
loghunter/common/config.py +221 -0
loghunter/common/display.py +323 -0
loghunter/common/errors.py +45 -0
loghunter/common/finding.py +239 -0
loghunter/common/loader/__init__.py +136 -0
loghunter/common/loader/diagnostics.py +94 -0
loghunter/common/loader/discovery.py +335 -0
loghunter/common/loader/io.py +76 -0
loghunter/common/loader/pipeline.py +1010 -0
loghunter/common/loader/sniff.py +184 -0
loghunter/common/loader/types.py +207 -0
loghunter/common/loader/windowing.py +523 -0
loghunter/common/output.py +93 -0
loghunter/common/paths.py +105 -0
loghunter/common/sources.py +392 -0
loghunter/data/allowlist/connections.txt +50 -0
loghunter/data/allowlist/domains_devices.txt +5 -0
loghunter/data/allowlist/domains_homelab.txt +5 -0
loghunter/data/allowlist/domains_universal.txt +125 -0
loghunter/data/config_example.toml +144 -0
loghunter/detectors/__init__.py +5 -0
loghunter/detectors/auth.py +27 -0
loghunter/detectors/aws.py +671 -0
loghunter/detectors/beacon.py +258 -0
loghunter/detectors/dns.py +778 -0
loghunter/detectors/dnsblock.py +29 -0
loghunter/detectors/duration.py +178 -0
loghunter/detectors/protocol.py +26 -0
loghunter/detectors/scan.py +735 -0
loghunter/detectors/ssl.py +25 -0
loghunter/detectors/syslog.py +266 -0
loghunter/detectors/weird.py +27 -0
loghunter/digest/__init__.py +43 -0
loghunter/digest/_stats.py +182 -0
loghunter/digest/blob.py +698 -0
loghunter/digest/cloudtrail.py +341 -0
loghunter/digest/conn.py +367 -0
loghunter/digest/dns.py +364 -0
loghunter/digest/syslog.py +269 -0
loghunter/exporters/__init__.py +534 -0
loghunter/exporters/cloudtrail.py +499 -0
loghunter/exporters/splunk.py +222 -0
loghunter/outputs/__init__.py +1 -0
loghunter/outputs/allowlist.py +75 -0
loghunter/outputs/csv.py +70 -0
loghunter/outputs/email.py +44 -0
loghunter/outputs/html.py +99 -0
loghunter/outputs/json.py +77 -0
loghunter/outputs/text.py +1422 -0
loghunter/parsers/__init__.py +1 -0
loghunter/parsers/cloudtrail.py +287 -0
loghunter/parsers/dnsmasq.py +331 -0
loghunter/parsers/syslog.py +150 -0
loghunter/parsers/zeek.py +294 -0
loghunter/parsers/zeek_tsv.py +310 -0
loghunter/runner.py +1895 -0
loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
migrations/cloudtrail_parquet.py +59 -0
migrations/conn_fft.py +550 -0
migrations/conn_scan.py +1097 -0
migrations/dns_dbscan.py +520 -0
migrations/get_syslog.py +402 -0
migrations/syslog_drain3.py +479 -0
scratch/junk/parquet.py +59 -0
tests/__init__.py +1 -0
tests/_cloudtrail_fakes.py +116 -0
tests/conftest.py +17 -0
tests/test_allowlist_defaults_accessor.py +90 -0
tests/test_architecture_spine.py +302 -0
tests/test_aws_detector.py +504 -0
tests/test_be_like_water.py +106 -0
tests/test_cli_help.py +342 -0
tests/test_cli_multi_positional.py +458 -0
tests/test_cloudtrail_exporter.py +631 -0
tests/test_cloudtrail_exporter_botocore.py +207 -0
tests/test_cloudtrail_parser.py +393 -0
tests/test_clustering.py +85 -0
tests/test_clustering_interruptible.py +404 -0
tests/test_config_cli.py +1006 -0
tests/test_config_example_drift.py +164 -0
tests/test_digest_blob.py +1237 -0
tests/test_digest_cli.py +1040 -0
tests/test_digest_cloudtrail.py +980 -0
tests/test_digest_conn.py +1189 -0
tests/test_digest_dns.py +770 -0
tests/test_digest_stats.py +282 -0
tests/test_digest_syslog.py +724 -0
tests/test_display.py +370 -0
tests/test_dns_detector.py +1010 -0
tests/test_dnsmasq_parser.py +467 -0
tests/test_duration_detector.py +491 -0
tests/test_export_orchestrator_shape.py +153 -0
tests/test_init_wizard.py +707 -0
tests/test_loader.py +3639 -0
tests/test_loader_package_surface.py +115 -0
tests/test_loader_window_model.py +215 -0
tests/test_output_path_cascade.py +575 -0
tests/test_resolve_path.py +111 -0
tests/test_root_provenance.py +212 -0
tests/test_runner.py +2599 -0
tests/test_scan_detector.py +455 -0
tests/test_search_paths.py +50 -0
tests/test_sniff_orchestrator.py +373 -0
tests/test_sniff_recognizers.py +573 -0
tests/test_source_resolution_seam.py +471 -0
tests/test_sources.py +648 -0
tests/test_splunk_exporter.py +351 -0
tests/test_syslog_detector.py +458 -0
tests/test_syslog_parser.py +582 -0
tests/test_text_output.py +1225 -0
tests/test_zeek_tsv_parser.py +580 -0

loghunter/digest/conn.py ADDED Viewed

@@ -0,0 +1,367 @@
+"""conn summariser — orient-before-the-hunt for Zeek conn data.
+Reads a normalised conn frame (canonical columns ``src, dst, port, proto, ts,
+bytes, conn_state, local_orig``) and returns the schema-specific body of a
+DigestCard: ``zone1_extras`` (the ambient label/value block), ``insights``
+(prose sentences mechanically derived from speaking gated slots), and
+``fields`` (the display-ready, already-filtered speaking non-insight slots).
+All four conn slots use the ``cliff`` statistic: rank1 / rank2 over the sorted
+entity counts. A slot is non-speaking when the population is below
+``POPULATION_FLOOR`` or when the ratio is below ``CLIFF_GATE``; non-speaking
+slots are filtered out of ``fields`` by ``select_insights_and_fields`` and
+never reach the renderer.
+Internal/external classification is computed locally; the scan detector's
+home_net is intentionally not imported.
+"""
+from __future__ import annotations
+import ipaddress
+from typing import Any
+import pandas as pd
+from loghunter.common.finding import DigestSlot
+# ── Calibration constants — provisional, tunable in one place ────────────────
+# Minimum rank1/rank2 ratio a cliff slot must reach to speak (see _cliff).
+CLIFF_GATE = 2.0
+# Minimum number of ranked entities a cliff slot needs to speak (see _cliff).
+POPULATION_FLOOR = 5
+# Rendering ceiling for cliff ratios. Past this point "625000.0x" and "60x"
+# say the same thing to the reader (one entity utterly dominates), so the
+# extra digits are noise. Only the RENDERED text is capped (">50x" in table
+# cells, "more than 50x" in prose); slot.ratio still carries the true float
+# so lede sort ordering respects the real value.
+CLIFF_DISPLAY_CAP = 50.0
+# The three RFC1918 private IPv4 blocks used for internal/external checks.
+_RFC1918_NETWORKS = tuple(
+    ipaddress.ip_network(cidr)
+    for cidr in ("10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16")
+)
+# ── Internal/external classifier ─────────────────────────────────────────────
+def _is_internal(ip: object) -> bool:
+    """Tell whether ``ip`` is a string naming an address inside an RFC1918 block.
+    Non-strings, empty strings and text that ``ipaddress.ip_address`` rejects
+    all yield False rather than raising.
+    """
+    if isinstance(ip, str) and ip:
+        try:
+            parsed = ipaddress.ip_address(ip)
+        except ValueError:
+            return False
+        for network in _RFC1918_NETWORKS:
+            if parsed in network:
+                return True
+    return False
+def _origin_internal_series(frame: pd.DataFrame) -> pd.Series:
+    """Classify each row's originator as internal (True) or external (False).
+    Rule B: a non-null ``local_orig`` value decides the row directly. Rows
+    whose ``local_orig`` is null — or every row, when the column is absent —
+    fall back to Rule A applied to ``src`` (RFC1918 membership).
+    The result is always cast to bool.
+    """
+    rfc1918_src = frame["src"].map(_is_internal)
+    if "local_orig" in frame.columns:
+        flag = frame["local_orig"]
+        # mask() swaps in the RFC1918 verdict only where the flag is null.
+        return flag.mask(flag.isna(), rfc1918_src).astype(bool)
+    return rfc1918_src.astype(bool)
+# ── Cliff ratio display formatting ───────────────────────────────────────────
+def _format_ratio_cell(ratio: float) -> str:
+    """Render a cliff ratio for the compact Zone-3 table cell.
+    Ratios at or above CLIFF_DISPLAY_CAP collapse to the capped ``">50x"``
+    form; anything lower is shown with one decimal place.
+    """
+    capped = ratio >= CLIFF_DISPLAY_CAP
+    return f">{int(CLIFF_DISPLAY_CAP)}x" if capped else f"{ratio:.1f}x"
+def _format_ratio_lede(ratio: float) -> str:
+    """Render a cliff ratio as the comparator phrase of a Zone-2 lede.
+    Only the comparator is produced (e.g. ``"3.7x"`` or ``"more than 50x"``);
+    the words that follow it ("the next destination", "its nearest peer", …)
+    are supplied by the per-slot lede formatter. Ratios at or above
+    CLIFF_DISPLAY_CAP use the capped wording.
+    """
+    capped = ratio >= CLIFF_DISPLAY_CAP
+    return f"more than {int(CLIFF_DISPLAY_CAP)}x" if capped else f"{ratio:.1f}x"
+# ── Cliff statistic ──────────────────────────────────────────────────────────
+def _cliff(sorted_counts: pd.Series) -> tuple[Any, float, float] | None:
+    """Evaluate the cliff slot over a descending series of entity magnitudes.
+    The caller is expected to pass the series already sorted in descending
+    order; position 0 is read as rank1 and position 1 as rank2.
+    Returns ``(rank1_entity, rank1_magnitude, ratio)`` when the slot speaks,
+    where ``rank1_entity`` is the index label of the top row and ``ratio`` is
+    rank1 / rank2. Returns None (the slot dashes) when the population is
+    below POPULATION_FLOOR, when rank2 is zero/NaN, or when the ratio is
+    below CLIFF_GATE.
+    """
+    # rank2 must exist before it is read: require at least two entries even
+    # if POPULATION_FLOOR is ever tuned below 2, so a tiny population dashes
+    # instead of raising IndexError on iloc[1].
+    if len(sorted_counts) < max(POPULATION_FLOOR, 2):
+        return None
+    rank1 = sorted_counts.iloc[0]
+    rank2 = sorted_counts.iloc[1]
+    # A zero or missing runner-up makes the ratio undefined.
+    if pd.isna(rank2) or rank2 == 0:
+        return None
+    ratio = float(rank1) / float(rank2)
+    if ratio < CLIFF_GATE:
+        return None
+    return sorted_counts.index[0], float(rank1), ratio
+# ── Slot computations ────────────────────────────────────────────────────────
+def _slot_conn_share(frame: pd.DataFrame) -> DigestSlot:
+    """conn-share: the host involved in the largest share of connections.
+    A host is "involved" in a row when it is that row's src or dst, so a row
+    normally credits two hosts; a row whose src equals its dst credits that
+    host once. "Share of connections" is therefore read as endpoint
+    involvement, not source-only. The slot speaks only when ``_cliff``
+    accepts the ranked involvement counts; otherwise a bare slot with just
+    label and statistic is returned.
+    """
+    label = "conn-share"
+    if frame.empty:
+        return DigestSlot(label=label, statistic="cliff")
+    src_col = frame["src"]
+    dst_col = frame["dst"]
+    as_src = src_col.value_counts(dropna=False)
+    as_dst = dst_col.value_counts(dropna=False)
+    # Self-connections show up in both tallies; take one copy back out.
+    self_loops = src_col[src_col == dst_col].value_counts(dropna=False)
+    ranked = (
+        as_src.add(as_dst, fill_value=0)
+        .sub(self_loops, fill_value=0)
+        .sort_values(ascending=False)
+    )
+    verdict = _cliff(ranked)
+    if verdict is None:
+        return DigestSlot(label=label, statistic="cliff")
+    top_host, involved_rows, ratio = verdict
+    row_total = len(frame)
+    if row_total > 0:
+        share_pct = involved_rows / row_total * 100.0
+    else:
+        share_pct = 0.0
+    host = str(top_host)
+    return DigestSlot(
+        label=label,
+        statistic="cliff",
+        cells=[host, f"{share_pct:.0f}%", _format_ratio_cell(ratio)],
+        entity=host,
+        magnitude=share_pct,
+        ratio=ratio,
+    )
+def _slot_densest_tuple(frame: pd.DataFrame) -> DigestSlot:
+    """densest-tuple: the single busiest (src, dst, port) flow.
+    Rows are grouped on src, dst and port only — proto is deliberately left
+    out of the key. The winning flow is rendered as ``src → dst:port``, with
+    ``?`` standing in for a missing port. Returns a bare slot (label and
+    statistic only) when the frame is empty or ``_cliff`` declines to speak.
+    """
+    label = "densest-tuple"
+    if frame.empty:
+        return DigestSlot(label=label, statistic="cliff")
+    flow_counts = frame.groupby(["src", "dst", "port"], dropna=False).size()
+    ranked = flow_counts.sort_values(ascending=False)
+    verdict = _cliff(ranked)
+    if verdict is None:
+        return DigestSlot(label=label, statistic="cliff")
+    flow_key, hits, ratio = verdict
+    src, dst, port = flow_key
+    if pd.notna(port):
+        port_token = str(int(port))
+    else:
+        port_token = "?"
+    flow = f"{src} → {dst}:{port_token}"
+    return DigestSlot(
+        label=label,
+        statistic="cliff",
+        cells=[flow, str(int(hits)), _format_ratio_cell(ratio)],
+        entity=flow,
+        magnitude=hits,
+        ratio=ratio,
+    )
+def _slot_fan_out(frame: pd.DataFrame) -> DigestSlot:
+    """fan-out: the src:port pair that reaches the most distinct destinations.
+    Rows are grouped on (src, port) and the distinct ``dst`` values in each
+    group are counted. The winner is rendered as ``src:port``, with ``?`` for
+    a missing port. Returns a bare slot (label and statistic only) when the
+    frame is empty or ``_cliff`` declines to speak.
+    """
+    label = "fan-out"
+    if frame.empty:
+        return DigestSlot(label=label, statistic="cliff")
+    dst_per_source = frame.groupby(["src", "port"], dropna=False)["dst"].nunique()
+    ranked = dst_per_source.sort_values(ascending=False)
+    verdict = _cliff(ranked)
+    if verdict is None:
+        return DigestSlot(label=label, statistic="cliff")
+    source_key, reach, ratio = verdict
+    src, port = source_key
+    if pd.notna(port):
+        port_token = str(int(port))
+    else:
+        port_token = "?"
+    src_port = f"{src}:{port_token}"
+    return DigestSlot(
+        label=label,
+        statistic="cliff",
+        cells=[src_port, f"{int(reach)} dsts", _format_ratio_cell(ratio)],
+        entity=src_port,
+        magnitude=reach,
+        ratio=ratio,
+    )
+def _slot_byte_direction(frame: pd.DataFrame) -> DigestSlot:
+    """byte-direction: the external dst receiving the largest share of outbound bytes.
+    A row counts as outbound only when BOTH hold: its originator is internal
+    (Rule B, via ``_origin_internal_series``) and its dst is not RFC1918
+    (Rule A). Missing byte values are treated as 0. Returns a bare slot
+    (label and statistic only) when the frame is empty, has no ``bytes``
+    column, has no outbound rows, or ``_cliff`` declines to speak.
+    """
+    label = "byte-direction"
+    if frame.empty:
+        return DigestSlot(label=label, statistic="cliff")
+    if "bytes" not in frame.columns:
+        return DigestSlot(label=label, statistic="cliff")
+    from_inside = _origin_internal_series(frame)
+    to_outside = ~frame["dst"].map(_is_internal)
+    is_outbound = from_inside & to_outside
+    if not is_outbound.any():
+        return DigestSlot(label=label, statistic="cliff")
+    outbound_rows = frame.loc[is_outbound]
+    sent = outbound_rows["bytes"].fillna(0)
+    per_dst = sent.groupby(outbound_rows["dst"]).sum()
+    ranked = per_dst.sort_values(ascending=False)
+    verdict = _cliff(ranked)
+    if verdict is None:
+        return DigestSlot(label=label, statistic="cliff")
+    top_dst, top_bytes, ratio = verdict
+    # The percentage is taken against ALL outbound bytes, not just the top few.
+    outbound_total = float(sent.sum())
+    if outbound_total > 0:
+        pct = top_bytes / outbound_total * 100.0
+    else:
+        pct = 0.0
+    entity = str(top_dst)
+    return DigestSlot(
+        label=label,
+        statistic="cliff",
+        cells=[entity, f"{pct:.0f}%", _format_ratio_cell(ratio)],
+        entity=entity,
+        magnitude=pct,
+        ratio=ratio,
+    )
+# ── Zone-1 extras ────────────────────────────────────────────────────────────
+def _format_bytes(n: float) -> str:
+    """Render a byte count for the Zone-1 descriptive line.
+    Values under 1024 are shown as whole bytes (``"512 B"``). Larger values
+    are scaled by powers of 1024 and shown with one decimal in the first unit
+    of KB, MB, GB whose ceiling they stay under; anything at or beyond
+    1024**4 is reported in TB.
+    """
+    if n < 1024:
+        return f"{int(n)} B"
+    for power, unit in enumerate(("KB", "MB", "GB"), start=1):
+        if n < 1024 ** (power + 1):
+            return f"{n / (1024 ** power):.1f} {unit}"
+    return f"{n / (1024 ** 4):.1f} TB"
+def _zone1_extras(frame: pd.DataFrame) -> list[tuple[str, str]]:
+    """Build the ambient label/value rows printed on the conn card.
+    Covers the four pieces the brief lists — host count, internal/external
+    split, outbound bytes, inbound bytes — as three rows: the split is the
+    parenthetical of the host-count row, followed by one row each for
+    outbound and inbound bytes. An empty frame yields a plain ``"0"`` host
+    row and zero-byte rows.
+    Outbound means internal originator to non-RFC1918 dst; inbound means
+    external originator to RFC1918 dst. A missing ``bytes`` column counts
+    as all zeros, as do missing byte values.
+    """
+    if frame.empty:
+        zero = _format_bytes(0)
+        return [
+            ("hosts", "0"),
+            ("outbound bytes", zero),
+            ("inbound bytes", zero),
+        ]
+    # Distinct non-empty string endpoints seen as either src or dst.
+    hosts: set[str] = {
+        value
+        for col in ("src", "dst")
+        for value in frame[col].dropna().tolist()
+        if isinstance(value, str) and value
+    }
+    internal_count = sum(_is_internal(host) for host in hosts)
+    external_count = len(hosts) - internal_count
+    dst_internal = frame["dst"].map(_is_internal)
+    src_internal = _origin_internal_series(frame)
+    outbound_mask = src_internal & ~dst_internal
+    inbound_mask = ~src_internal & dst_internal
+    if "bytes" not in frame.columns:
+        bytes_series = pd.Series(0, index=frame.index)
+    else:
+        bytes_series = frame["bytes"].fillna(0)
+    outbound_bytes = float(bytes_series[outbound_mask].sum())
+    inbound_bytes = float(bytes_series[inbound_mask].sum())
+    host_line = f"{len(hosts)} ({internal_count} internal, {external_count} external)"
+    return [
+        ("hosts", host_line),
+        ("outbound bytes", _format_bytes(outbound_bytes)),
+        ("inbound bytes", _format_bytes(inbound_bytes)),
+    ]
+# ── Lede formatters ──────────────────────────────────────────────────────────
+def _lede_conn_share(slot: DigestSlot) -> str:
+    """Turn a speaking conn-share slot into its one-sentence lede.
+    ``slot.magnitude`` is printed as a whole-number percentage and
+    ``slot.ratio`` as the (possibly capped) comparator phrase.
+    """
+    opening = f"{slot.entity} is in {slot.magnitude:.0f}% of connections, "
+    comparator = _format_ratio_lede(slot.ratio)
+    return f"{opening}{comparator} its nearest peer."
+def _lede_densest_tuple(slot: DigestSlot) -> str:
+    """Turn a speaking densest-tuple slot into its one-sentence lede.
+    ``slot.magnitude`` is printed as a whole connection count and
+    ``slot.ratio`` as the (possibly capped) comparator phrase.
+    """
+    opening = f"{slot.entity} is the densest flow at {int(slot.magnitude)} connections, "
+    comparator = _format_ratio_lede(slot.ratio)
+    return f"{opening}{comparator} the next."
+def _lede_fan_out(slot: DigestSlot) -> str:
+    """Turn a speaking fan-out slot into its one-sentence lede.
+    ``slot.magnitude`` is printed as a whole destination count and
+    ``slot.ratio`` as the (possibly capped) comparator phrase.
+    """
+    opening = f"{slot.entity} reaches {int(slot.magnitude)} distinct destinations, "
+    comparator = _format_ratio_lede(slot.ratio)
+    return f"{opening}{comparator} the next-broadest source."
+def _lede_byte_direction(slot: DigestSlot) -> str:
+    """Turn a speaking byte-direction slot into its one-sentence lede.
+    ``slot.magnitude`` is printed as a whole-number percentage and
+    ``slot.ratio`` as the (possibly capped) comparator phrase.
+    """
+    opening = f"{slot.entity} receives {slot.magnitude:.0f}% of outbound bytes, "
+    comparator = _format_ratio_lede(slot.ratio)
+    return f"{opening}{comparator} the next destination."
+# Slot label → function that renders a speaking slot as its prose sentence.
+# summarize() hands this table to select_insights_and_fields.
+_INSIGHT_FORMATTERS = {
+    "conn-share": _lede_conn_share,
+    "densest-tuple": _lede_densest_tuple,
+    "fan-out": _lede_fan_out,
+    "byte-direction": _lede_byte_direction,
+}
+# ── Public entry point ──────────────────────────────────────────────────────
+def summarize(frame: pd.DataFrame) -> dict:
+    """Build the schema-specific body of a conn DigestCard.
+    Every conn slot is computed in declared order (conn-share, densest-tuple,
+    fan-out, byte-direction) and handed, with the lede formatter table, to
+    ``select_insights_and_fields``, which splits them into insights and fields.
+    Returned keys:
+      zone1_extras — list[(label, value)] in render order
+      insights     — list[str], 0..3 prose sentences
+      fields       — list[DigestSlot] speaking-and-not-promoted, in declared order
+    """
+    # Function-scope import, kept as in the original.
+    # NOTE(review): presumably avoids an import cycle — confirm.
+    from loghunter.digest._stats import select_insights_and_fields
+    slot_builders = (
+        _slot_conn_share,
+        _slot_densest_tuple,
+        _slot_fan_out,
+        _slot_byte_direction,
+    )
+    slots = [build(frame) for build in slot_builders]
+    insights, fields = select_insights_and_fields(slots, _INSIGHT_FORMATTERS)
+    card_body = {"zone1_extras": _zone1_extras(frame)}
+    card_body["insights"] = insights
+    card_body["fields"] = fields
+    return card_body