PyPI - loghunter-cli - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

loghunter/__init__.py +3 -0
loghunter/cli.py +1108 -0
loghunter/cli_init.py +567 -0
loghunter/common/__init__.py +1 -0
loghunter/common/allowlist.py +436 -0
loghunter/common/clustering.py +326 -0
loghunter/common/config.py +221 -0
loghunter/common/display.py +323 -0
loghunter/common/errors.py +45 -0
loghunter/common/finding.py +239 -0
loghunter/common/loader/__init__.py +136 -0
loghunter/common/loader/diagnostics.py +94 -0
loghunter/common/loader/discovery.py +335 -0
loghunter/common/loader/io.py +76 -0
loghunter/common/loader/pipeline.py +1010 -0
loghunter/common/loader/sniff.py +184 -0
loghunter/common/loader/types.py +207 -0
loghunter/common/loader/windowing.py +523 -0
loghunter/common/output.py +93 -0
loghunter/common/paths.py +105 -0
loghunter/common/sources.py +392 -0
loghunter/data/allowlist/connections.txt +50 -0
loghunter/data/allowlist/domains_devices.txt +5 -0
loghunter/data/allowlist/domains_homelab.txt +5 -0
loghunter/data/allowlist/domains_universal.txt +125 -0
loghunter/data/config_example.toml +144 -0
loghunter/detectors/__init__.py +5 -0
loghunter/detectors/auth.py +27 -0
loghunter/detectors/aws.py +671 -0
loghunter/detectors/beacon.py +258 -0
loghunter/detectors/dns.py +778 -0
loghunter/detectors/dnsblock.py +29 -0
loghunter/detectors/duration.py +178 -0
loghunter/detectors/protocol.py +26 -0
loghunter/detectors/scan.py +735 -0
loghunter/detectors/ssl.py +25 -0
loghunter/detectors/syslog.py +266 -0
loghunter/detectors/weird.py +27 -0
loghunter/digest/__init__.py +43 -0
loghunter/digest/_stats.py +182 -0
loghunter/digest/blob.py +698 -0
loghunter/digest/cloudtrail.py +341 -0
loghunter/digest/conn.py +367 -0
loghunter/digest/dns.py +364 -0
loghunter/digest/syslog.py +269 -0
loghunter/exporters/__init__.py +534 -0
loghunter/exporters/cloudtrail.py +499 -0
loghunter/exporters/splunk.py +222 -0
loghunter/outputs/__init__.py +1 -0
loghunter/outputs/allowlist.py +75 -0
loghunter/outputs/csv.py +70 -0
loghunter/outputs/email.py +44 -0
loghunter/outputs/html.py +99 -0
loghunter/outputs/json.py +77 -0
loghunter/outputs/text.py +1422 -0
loghunter/parsers/__init__.py +1 -0
loghunter/parsers/cloudtrail.py +287 -0
loghunter/parsers/dnsmasq.py +331 -0
loghunter/parsers/syslog.py +150 -0
loghunter/parsers/zeek.py +294 -0
loghunter/parsers/zeek_tsv.py +310 -0
loghunter/runner.py +1895 -0
loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
migrations/cloudtrail_parquet.py +59 -0
migrations/conn_fft.py +550 -0
migrations/conn_scan.py +1097 -0
migrations/dns_dbscan.py +520 -0
migrations/get_syslog.py +402 -0
migrations/syslog_drain3.py +479 -0
scratch/junk/parquet.py +59 -0
tests/__init__.py +1 -0
tests/_cloudtrail_fakes.py +116 -0
tests/conftest.py +17 -0
tests/test_allowlist_defaults_accessor.py +90 -0
tests/test_architecture_spine.py +302 -0
tests/test_aws_detector.py +504 -0
tests/test_be_like_water.py +106 -0
tests/test_cli_help.py +342 -0
tests/test_cli_multi_positional.py +458 -0
tests/test_cloudtrail_exporter.py +631 -0
tests/test_cloudtrail_exporter_botocore.py +207 -0
tests/test_cloudtrail_parser.py +393 -0
tests/test_clustering.py +85 -0
tests/test_clustering_interruptible.py +404 -0
tests/test_config_cli.py +1006 -0
tests/test_config_example_drift.py +164 -0
tests/test_digest_blob.py +1237 -0
tests/test_digest_cli.py +1040 -0
tests/test_digest_cloudtrail.py +980 -0
tests/test_digest_conn.py +1189 -0
tests/test_digest_dns.py +770 -0
tests/test_digest_stats.py +282 -0
tests/test_digest_syslog.py +724 -0
tests/test_display.py +370 -0
tests/test_dns_detector.py +1010 -0
tests/test_dnsmasq_parser.py +467 -0
tests/test_duration_detector.py +491 -0
tests/test_export_orchestrator_shape.py +153 -0
tests/test_init_wizard.py +707 -0
tests/test_loader.py +3639 -0
tests/test_loader_package_surface.py +115 -0
tests/test_loader_window_model.py +215 -0
tests/test_output_path_cascade.py +575 -0
tests/test_resolve_path.py +111 -0
tests/test_root_provenance.py +212 -0
tests/test_runner.py +2599 -0
tests/test_scan_detector.py +455 -0
tests/test_search_paths.py +50 -0
tests/test_sniff_orchestrator.py +373 -0
tests/test_sniff_recognizers.py +573 -0
tests/test_source_resolution_seam.py +471 -0
tests/test_sources.py +648 -0
tests/test_splunk_exporter.py +351 -0
tests/test_syslog_detector.py +458 -0
tests/test_syslog_parser.py +582 -0
tests/test_text_output.py +1225 -0
tests/test_zeek_tsv_parser.py +580 -0

loghunter/digest/cloudtrail.py ADDED Viewed

@@ -0,0 +1,341 @@
+"""cloudtrail summariser — orient-before-the-hunt for CloudTrail data.
+Six fixed slots, two of which are scoped to the interactive lane only:
+  - lane-split    — dist  — interactive vs service share of the WHOLE pile
+                            (HEADLINE orient; renders first; never produces
+                            an insight)
+  - principal-vol — cliff — INTERACTIVE ONLY: largest share of interactive events
+  - event-source  — cliff — busiest AWS service across the whole pile
+  - source-ip     — share — INTERACTIVE ONLY: concentration of one source IP
+                            against interactive total. NO population floor —
+                            single-IP-dominates is the SIGNAL this slot was
+                            introduced to surface, and that case inherently
+                            has few distinct IPs. Gated at SHARE_GATE only.
+  - region        — dist  — top-3 aws_region share across the whole pile
+                            (always shows; never produces an insight)
+  - error-rate    — rate  — fraction of events that errored
+                            (error_code non-null); names the top error CODE
+Lane scoping is the one structural wrinkle on this card. principal-vol and
+source-ip read the interactive subset only; lane-split, event-source, region,
+and error-rate read the whole frame. The aws detector takes the same
+interactive-first discipline; we read aws.py for understanding but do NOT
+import from it — same no-cross-import rail dns/syslog follow with their
+detectors.
+Cliff machinery is imported from conn so the cards cannot drift on gate /
+floor / display-cap behaviour. Rate (and its RATE_FLOOR) and share (and its
+SHARE_GATE) live in ``loghunter.digest._stats`` — the shared stats module
+factored once three cards needed an identical rate (dns + syslog +
+cloudtrail) and once a second statistic without a sibling needed its
+canonical home.
+Dist slots (lane-split, region) never contribute an insight — ambient
+orientation, not a standout, same rule as dns's qtype-mix. On a quiet
+account every gating slot stays non-speaking and vanishes from ``fields``;
+the card carries only the two dist slots — that IS the honest digest of a
+quiet pile.
+"""
+from __future__ import annotations
+import pandas as pd
+from loghunter.common.finding import DigestSlot
+from loghunter.digest._stats import RATE_FLOOR, SHARE_GATE, _rate, _share
+from loghunter.digest.conn import (
+    CLIFF_DISPLAY_CAP,  # noqa: F401 — re-exported for downstream symmetry
+    CLIFF_GATE,         # noqa: F401 — re-exported for downstream symmetry
+    POPULATION_FLOOR,   # noqa: F401 — cliff slots in this card use it via _cliff
+    _cliff,
+    _format_ratio_cell,
+    _format_ratio_lede,
+)
+# ── dist helpers — local, no shared base ────────────────────────────────────
+def _lane_split_dist(lane_series: pd.Series | None) -> str:
+    """Render the lane-split binary share for the lane-split dist slot.
+    Two distinct fallbacks (consistency with dns.qtype-mix):
+      - Missing column (lane_series is None) → "(no lane)" (schema-presence fact)
+      - Empty / all-NaN series → "(no events)" (data-shape fact)
+    Otherwise: ``"interactive N% / service M%"``. Any non-interactive label
+    counts toward the service share — the parser's derivation is "default
+    interactive, escalate to service when service-marked," and any unknown
+    label is closer to service than to interactive.
+    """
+    if lane_series is None:
+        return "(no lane)"
+    labels = lane_series.dropna()
+    if labels.empty:
+        return "(no events)"
+    total = int(len(labels))
+    interactive_count = int((labels == "interactive").sum())
+    service_count = total - interactive_count
+    interactive_pct = int(round(interactive_count / total * 100))
+    service_pct = int(round(service_count / total * 100))
+    return f"interactive {interactive_pct}% / service {service_pct}%"
+def _region_dist(region_series: pd.Series | None) -> str:
+    """Render top-3 region share string for the region dist slot.
+    Two distinct fallbacks (consistency with dns.qtype-mix):
+      - Missing column (region_series is None) → "(no region)" (schema-presence fact)
+      - Empty / all-NaN series → "(no events)" (data-shape fact)
+    Single-region pile → "us-east-1 100%". Mix → top-3 joined by " · ".
+    """
+    if region_series is None:
+        return "(no region)"
+    labels = region_series.dropna().astype(str)
+    if labels.empty:
+        return "(no events)"
+    counts = labels.value_counts()
+    total = int(counts.sum())
+    top_three = counts.head(3)
+    parts = [
+        f"{label} {int(round(count / total * 100))}%"
+        for label, count in top_three.items()
+    ]
+    return " · ".join(parts)
+# ── Slot computers ──────────────────────────────────────────────────────────
def _slot_lane_split(frame: pd.DataFrame) -> DigestSlot:
    """Build the lane-split dist slot from the whole frame.

    The slot always carries exactly one cell: the string produced by
    ``_lane_split_dist``. A frame without a ``lane`` column is passed on as
    ``None`` so the helper can render its missing-column fallback.
    """
    lane_column = None
    if "lane" in frame.columns:
        lane_column = frame["lane"]
    return DigestSlot(
        label="lane-split",
        statistic="dist",
        cells=[_lane_split_dist(lane_column)],
    )
def _slot_principal_vol(frame_interactive: pd.DataFrame) -> DigestSlot:
    """Build the principal-vol cliff slot from the interactive-lane rows.

    Args:
        frame_interactive: Rows already restricted to the interactive lane.

    Returns:
        A bare ``DigestSlot`` (label and statistic only) when the frame is
        empty, has no ``principal`` column, or ``_cliff`` returns ``None``.
        Otherwise a slot whose cells are the leading principal, its share of
        interactive rows as a whole percent, and the formatted ratio.

    The share denominator is the number of interactive rows, not the whole
    pile. Null principals are left out of the per-principal counts but still
    count toward that denominator.
    """
    slot_label = "principal-vol"

    cliff = None
    if not frame_interactive.empty and "principal" in frame_interactive.columns:
        per_principal = frame_interactive["principal"].value_counts(dropna=True)
        cliff = _cliff(per_principal.sort_values(ascending=False))

    if cliff is None:
        # Nothing to say: the slot stays non-speaking.
        return DigestSlot(label=slot_label, statistic="cliff")

    leader, leader_count, lead_ratio = cliff
    n_interactive = int(len(frame_interactive))
    if n_interactive > 0:
        leader_pct = leader_count / n_interactive * 100.0
    else:
        leader_pct = 0.0
    leader_name = str(leader)

    return DigestSlot(
        label=slot_label,
        statistic="cliff",
        cells=[leader_name, f"{leader_pct:.0f}%", _format_ratio_cell(lead_ratio)],
        entity=leader_name,
        magnitude=leader_pct,
        ratio=lead_ratio,
    )
def _slot_event_source(frame: pd.DataFrame) -> DigestSlot:
    """Build the event-source cliff slot from the whole frame.

    Returns a bare ``DigestSlot`` (label and statistic only) when the frame
    is empty, has no ``event_source`` column, or ``_cliff`` returns ``None``.
    Otherwise the cells are the leading service, its raw event count, and the
    formatted ratio. Unlike principal-vol, the magnitude here is the count
    itself rather than a percentage.
    """
    slot_label = "event-source"

    cliff = None
    if not frame.empty and "event_source" in frame.columns:
        per_service = frame["event_source"].value_counts(dropna=True)
        cliff = _cliff(per_service.sort_values(ascending=False))

    if cliff is None:
        # Nothing to say: the slot stays non-speaking.
        return DigestSlot(label=slot_label, statistic="cliff")

    leader, leader_count, lead_ratio = cliff
    leader_name = str(leader)
    return DigestSlot(
        label=slot_label,
        statistic="cliff",
        cells=[leader_name, f"{int(leader_count)}", _format_ratio_cell(lead_ratio)],
        entity=leader_name,
        magnitude=leader_count,
        ratio=lead_ratio,
    )
def _slot_source_ip(frame_interactive: pd.DataFrame) -> DigestSlot:
    """Build the source-ip share slot from the interactive-lane rows.

    This slot asks whether interactive traffic is concentrated in a single
    source, so it measures share-of-total rather than a rank-1/rank-2 ratio.
    The case it is meant to surface (one dominant address) naturally has very
    few distinct IPs, which is why the share statistic is used instead of the
    cliff machinery with its population floor.

    It reads only interactive rows because a service-lane ``source_ip`` is
    often a service hostname (e.g. ``"s3.amazonaws.com"``) that would swamp a
    whole-pile share and produce a meaningless standout.

    Args:
        frame_interactive: Rows already restricted to the interactive lane.

    Returns:
        A bare ``DigestSlot`` (label and statistic only) when the frame is
        empty, has no ``source_ip`` column, or ``_share`` returns ``None``.
        Otherwise a two-cell slot: the fixed text ``"1 IP"`` and the share of
        interactive rows. The address itself is carried in ``entity`` so the
        lede can name it; there is no ratio because a share statistic has no
        runner-up comparison.
    """
    slot_label = "source-ip"

    top = None
    if not frame_interactive.empty and "source_ip" in frame_interactive.columns:
        per_ip = (
            frame_interactive["source_ip"]
            .value_counts(dropna=True)
            .sort_values(ascending=False)
        )
        # Denominator is every interactive row, including null source_ip.
        n_interactive = int(len(frame_interactive))
        top = _share(per_ip, n_interactive)

    if top is None:
        # Nothing to say: the slot stays non-speaking.
        return DigestSlot(label=slot_label, statistic="share")

    address, fraction = top
    pct = fraction * 100.0
    return DigestSlot(
        label=slot_label,
        statistic="share",
        cells=["1 IP", f"{pct:.0f}% of interactive"],
        entity=str(address),
        magnitude=pct,
    )
def _slot_region(frame: pd.DataFrame) -> DigestSlot:
    """Build the region dist slot from the whole frame.

    The slot always carries exactly one cell: the string produced by
    ``_region_dist``. A frame without an ``aws_region`` column is passed on
    as ``None`` so the helper can render its missing-column fallback.
    """
    region_column = None
    if "aws_region" in frame.columns:
        region_column = frame["aws_region"]
    return DigestSlot(
        label="region",
        statistic="dist",
        cells=[_region_dist(region_column)],
    )
def _slot_error_rate(frame: pd.DataFrame) -> DigestSlot:
    """Build the error-rate slot from the whole frame.

    An event counts as errored when its ``error_code`` is non-null. That is a
    literal ``notna()`` test: ``None``/NaN rows are clean, and any other
    value, including an empty string, is treated as an error. The top
    contributor reported is an error code, not a principal.

    Returns:
        A bare ``DigestSlot`` (label and statistic only) when the frame is
        empty, has no ``error_code`` column, or ``_rate`` returns ``None``.
        Otherwise a two-cell slot: the errored percentage and the top value
        handed back by ``_rate``.
    """
    slot_label = "error-rate"

    outcome = None
    if not frame.empty and "error_code" in frame.columns:
        codes = frame["error_code"]
        # Mask of errored rows, plus the column the top code is drawn from.
        outcome = _rate(codes.notna(), codes)

    if outcome is None:
        # Nothing to say: the slot stays non-speaking.
        return DigestSlot(label=slot_label, statistic="rate")

    errored_fraction, top_code = outcome
    errored_pct = errored_fraction * 100.0
    return DigestSlot(
        label=slot_label,
        statistic="rate",
        cells=[f"{errored_pct:.0f}%", top_code],
        entity=top_code,
        magnitude=errored_pct,
    )
+# ── Lede formatters ─────────────────────────────────────────────────────────
def _lede_principal_vol(slot: DigestSlot) -> str:
    """Render the principal-vol lede: who led, their share, and the ratio."""
    share = f"{slot.magnitude:.0f}%"
    lead = _format_ratio_lede(slot.ratio)
    return f"{slot.entity} drove {share} of interactive events, {lead} the next principal."
def _lede_event_source(slot: DigestSlot) -> str:
    """Render the event-source lede: busiest service, its count, and the ratio."""
    event_count = int(slot.magnitude)
    lead = _format_ratio_lede(slot.ratio)
    return f"{slot.entity} accounted for {event_count} events, {lead} the next service."
+def _lede_source_ip(slot: DigestSlot) -> str:
+    # Share statistic — no rank-2 ratio, so no "Nx the next" clause.
+    return (
+        f"{slot.entity} is the source of {slot.magnitude:.0f}% of "
+        f"interactive events."
+    )
+def _lede_error_rate(slot: DigestSlot) -> str:
+    return (
+        f"{slot.magnitude:.0f}% of events errored, "
+        f"most commonly {slot.entity}."
+    )
# Maps a slot label to the function that renders that slot's lede sentence.
# Only the four gating slots appear here; the dist slots (lane-split, region)
# have no entry. summarize() passes this mapping to
# select_insights_and_fields together with the computed slots.
_INSIGHT_FORMATTERS = {
    "principal-vol": _lede_principal_vol,
    "event-source":  _lede_event_source,
    "source-ip":     _lede_source_ip,
    "error-rate":    _lede_error_rate,
}
+# ── Zone 1 extras ───────────────────────────────────────────────────────────
+def _zone1_extras(frame: pd.DataFrame) -> list[tuple[str, str]]:
+    """Two lines, brief-pinned: distinct principals + distinct event sources."""
+    if frame.empty:
+        return [("principals", "0"), ("event sources", "0")]
+    distinct_principals = (
+        int(frame["principal"].nunique(dropna=True))
+        if "principal" in frame.columns else 0
+    )
+    distinct_sources = (
+        int(frame["event_source"].nunique(dropna=True))
+        if "event_source" in frame.columns else 0
+    )
+    return [
+        ("principals", str(distinct_principals)),
+        ("event sources", str(distinct_sources)),
+    ]
+# ── Public entry point ─────────────────────────────────────────────────────
def summarize(frame: pd.DataFrame) -> dict:
    """Compute the cloudtrail-specific body of a digest card.

    Args:
        frame: The parsed CloudTrail events.

    Returns:
        A dict with three keys: ``"zone1_extras"`` (the header lines from
        ``_zone1_extras``), and ``"insights"`` and ``"fields"`` as returned
        by ``select_insights_and_fields`` for the six slots.

    The interactive subset is cut once up front so that principal-vol and
    source-ip both look at the same rows. When the frame has no ``lane``
    column, that subset is an empty slice of the frame.
    """
    from loghunter.digest._stats import select_insights_and_fields

    interactive = (
        frame[frame["lane"] == "interactive"]
        if "lane" in frame.columns
        else frame.iloc[0:0]
    )

    # Slot order is the card's fixed layout; each computer is paired with the
    # view of the data it reads (whole pile vs interactive lane only).
    plan = (
        (_slot_lane_split, frame),
        (_slot_principal_vol, interactive),
        (_slot_event_source, frame),
        (_slot_source_ip, interactive),
        (_slot_region, frame),
        (_slot_error_rate, frame),
    )
    slots = [compute(view) for compute, view in plan]

    insights, fields = select_insights_and_fields(slots, _INSIGHT_FORMATTERS)
    return {
        "zone1_extras": _zone1_extras(frame),
        "insights": insights,
        "fields": fields,
    }