PyPI - loghunter-cli - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

loghunter/__init__.py +3 -0
loghunter/cli.py +1108 -0
loghunter/cli_init.py +567 -0
loghunter/common/__init__.py +1 -0
loghunter/common/allowlist.py +436 -0
loghunter/common/clustering.py +326 -0
loghunter/common/config.py +221 -0
loghunter/common/display.py +323 -0
loghunter/common/errors.py +45 -0
loghunter/common/finding.py +239 -0
loghunter/common/loader/__init__.py +136 -0
loghunter/common/loader/diagnostics.py +94 -0
loghunter/common/loader/discovery.py +335 -0
loghunter/common/loader/io.py +76 -0
loghunter/common/loader/pipeline.py +1010 -0
loghunter/common/loader/sniff.py +184 -0
loghunter/common/loader/types.py +207 -0
loghunter/common/loader/windowing.py +523 -0
loghunter/common/output.py +93 -0
loghunter/common/paths.py +105 -0
loghunter/common/sources.py +392 -0
loghunter/data/allowlist/connections.txt +50 -0
loghunter/data/allowlist/domains_devices.txt +5 -0
loghunter/data/allowlist/domains_homelab.txt +5 -0
loghunter/data/allowlist/domains_universal.txt +125 -0
loghunter/data/config_example.toml +144 -0
loghunter/detectors/__init__.py +5 -0
loghunter/detectors/auth.py +27 -0
loghunter/detectors/aws.py +671 -0
loghunter/detectors/beacon.py +258 -0
loghunter/detectors/dns.py +778 -0
loghunter/detectors/dnsblock.py +29 -0
loghunter/detectors/duration.py +178 -0
loghunter/detectors/protocol.py +26 -0
loghunter/detectors/scan.py +735 -0
loghunter/detectors/ssl.py +25 -0
loghunter/detectors/syslog.py +266 -0
loghunter/detectors/weird.py +27 -0
loghunter/digest/__init__.py +43 -0
loghunter/digest/_stats.py +182 -0
loghunter/digest/blob.py +698 -0
loghunter/digest/cloudtrail.py +341 -0
loghunter/digest/conn.py +367 -0
loghunter/digest/dns.py +364 -0
loghunter/digest/syslog.py +269 -0
loghunter/exporters/__init__.py +534 -0
loghunter/exporters/cloudtrail.py +499 -0
loghunter/exporters/splunk.py +222 -0
loghunter/outputs/__init__.py +1 -0
loghunter/outputs/allowlist.py +75 -0
loghunter/outputs/csv.py +70 -0
loghunter/outputs/email.py +44 -0
loghunter/outputs/html.py +99 -0
loghunter/outputs/json.py +77 -0
loghunter/outputs/text.py +1422 -0
loghunter/parsers/__init__.py +1 -0
loghunter/parsers/cloudtrail.py +287 -0
loghunter/parsers/dnsmasq.py +331 -0
loghunter/parsers/syslog.py +150 -0
loghunter/parsers/zeek.py +294 -0
loghunter/parsers/zeek_tsv.py +310 -0
loghunter/runner.py +1895 -0
loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
migrations/cloudtrail_parquet.py +59 -0
migrations/conn_fft.py +550 -0
migrations/conn_scan.py +1097 -0
migrations/dns_dbscan.py +520 -0
migrations/get_syslog.py +402 -0
migrations/syslog_drain3.py +479 -0
scratch/junk/parquet.py +59 -0
tests/__init__.py +1 -0
tests/_cloudtrail_fakes.py +116 -0
tests/conftest.py +17 -0
tests/test_allowlist_defaults_accessor.py +90 -0
tests/test_architecture_spine.py +302 -0
tests/test_aws_detector.py +504 -0
tests/test_be_like_water.py +106 -0
tests/test_cli_help.py +342 -0
tests/test_cli_multi_positional.py +458 -0
tests/test_cloudtrail_exporter.py +631 -0
tests/test_cloudtrail_exporter_botocore.py +207 -0
tests/test_cloudtrail_parser.py +393 -0
tests/test_clustering.py +85 -0
tests/test_clustering_interruptible.py +404 -0
tests/test_config_cli.py +1006 -0
tests/test_config_example_drift.py +164 -0
tests/test_digest_blob.py +1237 -0
tests/test_digest_cli.py +1040 -0
tests/test_digest_cloudtrail.py +980 -0
tests/test_digest_conn.py +1189 -0
tests/test_digest_dns.py +770 -0
tests/test_digest_stats.py +282 -0
tests/test_digest_syslog.py +724 -0
tests/test_display.py +370 -0
tests/test_dns_detector.py +1010 -0
tests/test_dnsmasq_parser.py +467 -0
tests/test_duration_detector.py +491 -0
tests/test_export_orchestrator_shape.py +153 -0
tests/test_init_wizard.py +707 -0
tests/test_loader.py +3639 -0
tests/test_loader_package_surface.py +115 -0
tests/test_loader_window_model.py +215 -0
tests/test_output_path_cascade.py +575 -0
tests/test_resolve_path.py +111 -0
tests/test_root_provenance.py +212 -0
tests/test_runner.py +2599 -0
tests/test_scan_detector.py +455 -0
tests/test_search_paths.py +50 -0
tests/test_sniff_orchestrator.py +373 -0
tests/test_sniff_recognizers.py +573 -0
tests/test_source_resolution_seam.py +471 -0
tests/test_sources.py +648 -0
tests/test_splunk_exporter.py +351 -0
tests/test_syslog_detector.py +458 -0
tests/test_syslog_parser.py +582 -0
tests/test_text_output.py +1225 -0
tests/test_zeek_tsv_parser.py +580 -0

tests/test_digest_stats.py ADDED Viewed

@@ -0,0 +1,282 @@
+"""Tests for the shared digest stats module.
+Two purposes:
+  1. Lock the seam — `_rate` and `_share` behave correctly at their gates
+     and floors, and the constants live where they should.
+  2. Prevent regressions of the factoring — `_rate` has a single source
+     of truth (function identity across all three importing cards), and
+     `RATE_FLOOR` resolves to the same numeric value everywhere.
+The existing tests/test_digest_{dns,syslog,cloudtrail}.py suites continuing
+to pass UNCHANGED is the load-bearing proof that Fix 2 was
+behavior-preserving. These tests layer additional invariants at the
+boundary.
+"""
+from __future__ import annotations
+import pandas as pd
+from loghunter.digest import _stats
+from loghunter.digest import cloudtrail as ct
+from loghunter.digest import dns
+from loghunter.digest import syslog
+# ─── Sharing invariants ─────────────────────────────────────────────────────
+def test_rate_identity_across_cards() -> None:
+    """All three cards reference the same `_rate` function object — no
+    shadowing copies. Function identity is meaningful here: any future
+    re-introduction of a local copy would break `is`."""
+    assert ct._rate is _stats._rate
+    assert dns._rate is _stats._rate
+    assert syslog._rate is _stats._rate
+def test_rate_floor_value_across_cards() -> None:
+    """RATE_FLOOR is an immutable float; check by equality (per James'
+    note), not `is`. Identity on a float is brittle and misleading."""
+    assert (
+        ct.RATE_FLOOR
+        == dns.RATE_FLOOR
+        == syslog.RATE_FLOOR
+        == _stats.RATE_FLOOR
+        == 0.01
+    )
+def test_share_gate_value() -> None:
+    """SHARE_GATE lives in _stats and is the canonical 0.80 threshold."""
+    assert _stats.SHARE_GATE == 0.80
+    assert ct.SHARE_GATE is _stats.SHARE_GATE  # constant re-import, same float
+# ─── _rate behavior ─────────────────────────────────────────────────────────
+def test_rate_dashes_below_population_floor() -> None:
+    """POPULATION_FLOOR is 5 — a 4-event mask returns None regardless of
+    fraction."""
+    mask = pd.Series([True, True, True, True])
+    contributor = pd.Series(["x", "x", "x", "x"])
+    assert _stats._rate(mask, contributor) is None
+def test_rate_dashes_when_kind_count_is_zero() -> None:
+    """Above floor but no matching events — return None even though the
+    population is fine."""
+    mask = pd.Series([False] * 20)
+    contributor = pd.Series(["x"] * 20)
+    assert _stats._rate(mask, contributor) is None
+def test_rate_dashes_below_rate_floor() -> None:
+    """200 events with 1 hit = 0.5% < RATE_FLOOR (1%) → dashes."""
+    mask = pd.Series([False] * 199 + [True])
+    contributor = pd.Series(["x"] * 199 + ["badcode"])
+    assert _stats._rate(mask, contributor) is None
+def test_rate_speaks_with_top_contributor() -> None:
+    """50 events, 10 errored (20%), contributor "AccessDenied" is the mode
+    among the errored subset — returns (0.20, "AccessDenied")."""
+    mask = pd.Series([False] * 40 + [True] * 10)
+    contributor = pd.Series(
+        ["clean"] * 40 + ["AccessDenied"] * 7 + ["ValidationException"] * 3
+    )
+    result = _stats._rate(mask, contributor)
+    assert result is not None
+    fraction, top = result
+    assert fraction == 0.20
+    assert top == "AccessDenied"
+def test_rate_drops_nan_contributors_in_mode() -> None:
+    """Top contributor lookup ignores NaN values among matching rows —
+    matches the dns/syslog/cloudtrail contract before factoring."""
+    mask = pd.Series([True] * 10 + [False] * 90)
+    contributor = pd.Series(
+        ["alice"] * 5 + [float("nan")] * 5 + ["x"] * 90
+    )
+    result = _stats._rate(mask, contributor)
+    assert result is not None
+    fraction, top = result
+    assert top == "alice"
+    assert fraction == 0.10
+# ─── _share behavior ───────────────────────────────────────────────────────
+def test_share_speaks_on_single_distinct_value_at_100_percent() -> None:
+    """One distinct entity at 100% → speaks. Critically, NO population
+    floor — the share statistic exists to surface concentration, and
+    low cardinality is the signal, not noise."""
+    counts = pd.Series([10], index=["203.0.113.99"])
+    result = _stats._share(counts, total=10)
+    assert result is not None
+    entity, top_share = result
+    assert entity == "203.0.113.99"
+    assert top_share == 1.0
+def test_share_speaks_on_two_distinct_values_with_dominant() -> None:
+    """99/100 = 99% concentration on 2 distinct entities → speaks. The
+    OLD cliff floor would suppress this; the NEW share statistic does not."""
+    counts = pd.Series([99, 1], index=["203.0.113.99", "203.0.113.10"])
+    result = _stats._share(counts, total=100)
+    assert result is not None
+    entity, top_share = result
+    assert entity == "203.0.113.99"
+    assert top_share == 0.99
+def test_share_speaks_exactly_at_gate() -> None:
+    """80% at SHARE_GATE = 0.80 → speaks (>=, not >)."""
+    counts = pd.Series([80, 20], index=["a", "b"])
+    result = _stats._share(counts, total=100)
+    assert result is not None
+    entity, top_share = result
+    assert entity == "a"
+    assert top_share == 0.80
+def test_share_dashes_just_below_gate() -> None:
+    """79.9% just below SHARE_GATE → dashes."""
+    counts = pd.Series([799, 201], index=["a", "b"])
+    assert _stats._share(counts, total=1000) is None
+def test_share_dashes_on_diffuse_distribution() -> None:
+    """No single entity above the gate → dashes."""
+    counts = pd.Series([30, 25, 20, 15, 10],
+                       index=["a", "b", "c", "d", "e"])
+    assert _stats._share(counts, total=100) is None
+def test_share_defensive_returns_on_empty_or_zero_total() -> None:
+    assert _stats._share(pd.Series([], dtype=int), total=0) is None
+    assert _stats._share(pd.Series([], dtype=int), total=100) is None
+    assert _stats._share(pd.Series([5], index=["a"]), total=0) is None
+def test_share_defensive_return_on_nan_rank1() -> None:
+    """A NaN top count is meaningless — return None rather than crashing
+    or returning a NaN-share."""
+    counts = pd.Series([float("nan")], index=["a"])
+    assert _stats._share(counts, total=10) is None
+# ─── select_insights_and_fields behavior ────────────────────────────────────
+#
+# The shared selection helper that the four schema summarisers all use.
+# Covers Glenn's precision ask: only suppress from fields when an insight
+# actually ran (formatter present AND used). Missing formatter keeps the
+# slot in fields, preserving "each fact appears exactly once."
+from loghunter.common.finding import DigestSlot
+def _cliff_slot(label: str, *, ratio: float, magnitude: float = 1.0) -> DigestSlot:
+    return DigestSlot(
+        label=label, statistic="cliff",
+        cells=["entity-a", f"{int(magnitude)}", f"{ratio:.1f}x"],
+        entity="entity-a", magnitude=magnitude, ratio=ratio,
+    )
+def _dist_slot(label: str, cells_text: str) -> DigestSlot:
+    return DigestSlot(label=label, statistic="dist", cells=[cells_text])
+def _nonspeaking(label: str, statistic: str = "cliff") -> DigestSlot:
+    return DigestSlot(label=label, statistic=statistic)
+def test_select_promotes_top_three_by_salience() -> None:
+    """Speaking cliff slots sort by ratio desc; top-3 with a formatter
+    become insights. Non-promoted cliff slot stays in fields."""
+    slots = [
+        _cliff_slot("a", ratio=5.0),
+        _cliff_slot("b", ratio=10.0),
+        _cliff_slot("c", ratio=2.0),
+        _cliff_slot("d", ratio=20.0),
+    ]
+    formatters = {label: (lambda s, l=label: f"{l}-insight") for label in "abcd"}
+    insights, fields = _stats.select_insights_and_fields(slots, formatters)
+    # Top 3 by ratio desc: d (20), b (10), a (5). c is not promoted.
+    assert insights == ["d-insight", "b-insight", "a-insight"]
+    assert [f.label for f in fields] == ["c"]
+def test_select_dist_slots_pass_through_unfiltered() -> None:
+    """Dist slots never produce insights; they always pass through to
+    fields when they have cells."""
+    slots = [
+        _dist_slot("qtype-mix", "A 50% · AAAA 30%"),
+        _cliff_slot("client-volume", ratio=5.0),
+    ]
+    formatters = {"client-volume": lambda s: "client-volume-insight"}
+    insights, fields = _stats.select_insights_and_fields(slots, formatters)
+    assert insights == ["client-volume-insight"]
+    # qtype-mix not promoted; client-volume promoted → suppressed.
+    assert [f.label for f in fields] == ["qtype-mix"]
+def test_select_missing_formatter_keeps_slot_as_field() -> None:
+    """Glenn's precision: a gating slot whose label has no formatter
+    falls through to fields instead of vanishing. 'Each fact appears
+    exactly once' must not lose facts to a missing formatter."""
+    slots = [
+        _cliff_slot("with-fmt", ratio=10.0),
+        _cliff_slot("no-fmt", ratio=20.0),  # higher salience but no fmt
+    ]
+    formatters = {"with-fmt": lambda s: "with-fmt-insight"}
+    insights, fields = _stats.select_insights_and_fields(slots, formatters)
+    # no-fmt ranks first by salience but cannot become an insight; it
+    # falls through to fields. with-fmt is the only promoted slot.
+    assert insights == ["with-fmt-insight"]
+    assert [f.label for f in fields] == ["no-fmt"]
+def test_select_non_speaking_slots_omitted_from_both() -> None:
+    """A slot with cells=None vanishes from BOTH insights and fields —
+    the renderer never sees the non-speaking state."""
+    slots = [
+        _cliff_slot("speaks", ratio=10.0),
+        _nonspeaking("silent"),
+    ]
+    formatters = {"speaks": lambda s: "speaks-insight"}
+    insights, fields = _stats.select_insights_and_fields(slots, formatters)
+    assert insights == ["speaks-insight"]
+    assert [f.label for f in fields] == []
+def test_select_all_speaking_promoted_yields_empty_fields() -> None:
+    """The syslog mock case: every speaking slot becomes an insight, so
+    the fields block is empty. Card ends on the last insight."""
+    slots = [
+        _cliff_slot("a", ratio=5.0),
+        _cliff_slot("b", ratio=10.0),
+        _cliff_slot("c", ratio=2.0),
+    ]
+    formatters = {label: (lambda s, l=label: f"{l}-insight") for label in "abc"}
+    insights, fields = _stats.select_insights_and_fields(slots, formatters)
+    assert len(insights) == 3
+    assert fields == []
+def test_select_share_and_rate_salience_share_bypasses_population_floor() -> None:
+    """share salience uses raw percentage; rate salience uses fraction /
+    RATE_FLOOR. A heavily concentrated share (90%) outranks a modest
+    cliff (5x)."""
+    share = DigestSlot(
+        label="source-ip", statistic="share",
+        cells=["x", "90%"], entity="x", magnitude=90.0, ratio=None,
+    )
+    cliff = _cliff_slot("event-source", ratio=5.0)
+    formatters = {
+        "source-ip": lambda s: f"share-{s.magnitude:.0f}",
+        "event-source": lambda s: f"cliff-{s.ratio:.0f}",
+    }
+    insights, _ = _stats.select_insights_and_fields([share, cliff], formatters)
+    assert insights == ["share-90", "cliff-5"]