PyPI - loghunter-cli - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

loghunter/__init__.py +3 -0
loghunter/cli.py +1108 -0
loghunter/cli_init.py +567 -0
loghunter/common/__init__.py +1 -0
loghunter/common/allowlist.py +436 -0
loghunter/common/clustering.py +326 -0
loghunter/common/config.py +221 -0
loghunter/common/display.py +323 -0
loghunter/common/errors.py +45 -0
loghunter/common/finding.py +239 -0
loghunter/common/loader/__init__.py +136 -0
loghunter/common/loader/diagnostics.py +94 -0
loghunter/common/loader/discovery.py +335 -0
loghunter/common/loader/io.py +76 -0
loghunter/common/loader/pipeline.py +1010 -0
loghunter/common/loader/sniff.py +184 -0
loghunter/common/loader/types.py +207 -0
loghunter/common/loader/windowing.py +523 -0
loghunter/common/output.py +93 -0
loghunter/common/paths.py +105 -0
loghunter/common/sources.py +392 -0
loghunter/data/allowlist/connections.txt +50 -0
loghunter/data/allowlist/domains_devices.txt +5 -0
loghunter/data/allowlist/domains_homelab.txt +5 -0
loghunter/data/allowlist/domains_universal.txt +125 -0
loghunter/data/config_example.toml +144 -0
loghunter/detectors/__init__.py +5 -0
loghunter/detectors/auth.py +27 -0
loghunter/detectors/aws.py +671 -0
loghunter/detectors/beacon.py +258 -0
loghunter/detectors/dns.py +778 -0
loghunter/detectors/dnsblock.py +29 -0
loghunter/detectors/duration.py +178 -0
loghunter/detectors/protocol.py +26 -0
loghunter/detectors/scan.py +735 -0
loghunter/detectors/ssl.py +25 -0
loghunter/detectors/syslog.py +266 -0
loghunter/detectors/weird.py +27 -0
loghunter/digest/__init__.py +43 -0
loghunter/digest/_stats.py +182 -0
loghunter/digest/blob.py +698 -0
loghunter/digest/cloudtrail.py +341 -0
loghunter/digest/conn.py +367 -0
loghunter/digest/dns.py +364 -0
loghunter/digest/syslog.py +269 -0
loghunter/exporters/__init__.py +534 -0
loghunter/exporters/cloudtrail.py +499 -0
loghunter/exporters/splunk.py +222 -0
loghunter/outputs/__init__.py +1 -0
loghunter/outputs/allowlist.py +75 -0
loghunter/outputs/csv.py +70 -0
loghunter/outputs/email.py +44 -0
loghunter/outputs/html.py +99 -0
loghunter/outputs/json.py +77 -0
loghunter/outputs/text.py +1422 -0
loghunter/parsers/__init__.py +1 -0
loghunter/parsers/cloudtrail.py +287 -0
loghunter/parsers/dnsmasq.py +331 -0
loghunter/parsers/syslog.py +150 -0
loghunter/parsers/zeek.py +294 -0
loghunter/parsers/zeek_tsv.py +310 -0
loghunter/runner.py +1895 -0
loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
migrations/cloudtrail_parquet.py +59 -0
migrations/conn_fft.py +550 -0
migrations/conn_scan.py +1097 -0
migrations/dns_dbscan.py +520 -0
migrations/get_syslog.py +402 -0
migrations/syslog_drain3.py +479 -0
scratch/junk/parquet.py +59 -0
tests/__init__.py +1 -0
tests/_cloudtrail_fakes.py +116 -0
tests/conftest.py +17 -0
tests/test_allowlist_defaults_accessor.py +90 -0
tests/test_architecture_spine.py +302 -0
tests/test_aws_detector.py +504 -0
tests/test_be_like_water.py +106 -0
tests/test_cli_help.py +342 -0
tests/test_cli_multi_positional.py +458 -0
tests/test_cloudtrail_exporter.py +631 -0
tests/test_cloudtrail_exporter_botocore.py +207 -0
tests/test_cloudtrail_parser.py +393 -0
tests/test_clustering.py +85 -0
tests/test_clustering_interruptible.py +404 -0
tests/test_config_cli.py +1006 -0
tests/test_config_example_drift.py +164 -0
tests/test_digest_blob.py +1237 -0
tests/test_digest_cli.py +1040 -0
tests/test_digest_cloudtrail.py +980 -0
tests/test_digest_conn.py +1189 -0
tests/test_digest_dns.py +770 -0
tests/test_digest_stats.py +282 -0
tests/test_digest_syslog.py +724 -0
tests/test_display.py +370 -0
tests/test_dns_detector.py +1010 -0
tests/test_dnsmasq_parser.py +467 -0
tests/test_duration_detector.py +491 -0
tests/test_export_orchestrator_shape.py +153 -0
tests/test_init_wizard.py +707 -0
tests/test_loader.py +3639 -0
tests/test_loader_package_surface.py +115 -0
tests/test_loader_window_model.py +215 -0
tests/test_output_path_cascade.py +575 -0
tests/test_resolve_path.py +111 -0
tests/test_root_provenance.py +212 -0
tests/test_runner.py +2599 -0
tests/test_scan_detector.py +455 -0
tests/test_search_paths.py +50 -0
tests/test_sniff_orchestrator.py +373 -0
tests/test_sniff_recognizers.py +573 -0
tests/test_source_resolution_seam.py +471 -0
tests/test_sources.py +648 -0
tests/test_splunk_exporter.py +351 -0
tests/test_syslog_detector.py +458 -0
tests/test_syslog_parser.py +582 -0
tests/test_text_output.py +1225 -0
tests/test_zeek_tsv_parser.py +580 -0

tests/test_aws_detector.py ADDED Viewed

@@ -0,0 +1,504 @@
+"""Unit tests for the aws detector — per-principal CloudTrail behavioral surfacing.
+All fixtures are synthetic per the privacy rail: RFC 5737 IPs only, AWS
+documentation account 123456789012, obvious-placeholder principal / role names.
+Each test states the property under test and exercises the smallest synthetic
+frame that proves it.
+"""
+from __future__ import annotations
+from datetime import datetime, timezone
+from types import SimpleNamespace
+import pandas as pd
+from loghunter.common.finding import DetectorContext, Severity
+from loghunter.common.loader import _CLOUDTRAIL_COLUMNS
+from loghunter.detectors.aws import (
+    DEFAULT_CONFIG,
+    _aggregate_per_principal,
+    _compute_bursts,
+    _compute_rarity,
+    _compute_weirdness,
+    below_floor_count,
+    run,
+)
+# AWS documentation placeholder account id named in the module docstring.
+_DOCS_ACCT = "123456789012"
+# (start, end) pair of UTC datetimes handed to DetectorContext as data_window.
+_WINDOW = (
+    datetime(2026, 6, 1, tzinfo=timezone.utc),
+    datetime(2026, 6, 8, tzinfo=timezone.utc),
+)
+# Epoch seconds for 2026-06-01 12:00:00 UTC; the default "ts" of synthetic events.
+_BASE_TS = datetime(2026, 6, 1, 12, 0, 0, tzinfo=timezone.utc).timestamp()
+# ── Fixture helpers ──────────────────────────────────────────────────────────
+def _event(**overrides) -> dict:
+    """Build a minimal canonical CloudTrail per-event row (12 fields)."""
+    base: dict = {
+        "ts":            _BASE_TS,
+        "principal":     "placeholder-user",
+        "lane":          "interactive",
+        "read_write":    "read",
+        "event_source":  "s3.amazonaws.com",
+        "event_name":    "GetObject",
+        "identity_type": "IAMUser",
+        "source_ip":     "192.0.2.10",
+        "error_code":    None,
+        "aws_region":    "us-east-1",
+        "event_id":      "11111111-1111-1111-1111-111111111111",
+        "raw":           {},
+    }
+    base.update(overrides)
+    return base
+def _df(events: list[dict]) -> pd.DataFrame:
+    """Build a DataFrame matching parsers/cloudtrail.py's 12-column output."""
+    if not events:
+        return pd.DataFrame(columns=_CLOUDTRAIL_COLUMNS)
+    rows = [_event(**e) for e in events]
+    return pd.DataFrame(rows, columns=_CLOUDTRAIL_COLUMNS)
+def _ctx(df: pd.DataFrame, **kwargs) -> DetectorContext:
+    """DetectorContext for driving run() in tests.
+    No verbose kwarg under W6 — the result set is verbosity-invariant. Any
+    leftover ``verbose=`` kwarg is silently dropped to keep legacy call sites
+    quiet during the migration.
+    """
+    cfg = kwargs.pop("config", {})
+    kwargs.pop("verbose", None)
+    return DetectorContext(
+        logs={"*.json*": df},
+        config=cfg,
+        allowlist=SimpleNamespace(filter_df=lambda d, name: d),
+        data_window=_WINDOW,
+        data_sources=["cloudtrail_raw"],
+    )
+# ── Aggregation: principal collapses across sessions ─────────────────────────
+def test_aggregate_per_principal_collapses_sessions_of_same_role() -> None:
+    """The parser's principal key already collapses an AssumedRole's sessions; the
+    detector aggregates by that key, so two events with different session names
+    but the same parser-derived principal aggregate as one row."""
+    events = [_event(principal="role:placeholder-role", event_id=f"e{i}") for i in range(20)]
+    df = _df(events)
+    per = _aggregate_per_principal(df)
+    assert len(per) == 1
+    assert per.iloc[0]["principal"] == "role:placeholder-role"
+    assert per.iloc[0]["event_count"] == 20
+def test_aggregate_features_match_known_distribution() -> None:
+    """Spot-check features against a small hand-constructed event mix."""
+    events = (
+        # 5 GetObject (read, success), 5 PutObject (write, 1 errored), all one IP, one region
+        [_event(event_name="GetObject", read_write="read") for _ in range(5)]
+        + [_event(event_name="PutObject", read_write="write") for _ in range(4)]
+        + [_event(event_name="PutObject", read_write="write", error_code="AccessDenied")]
+    )
+    df = _df(events)
+    per = _aggregate_per_principal(df)
+    assert len(per) == 1
+    row = per.iloc[0]
+    assert row["event_count"] == 10
+    assert abs(row["error_rate"] - 0.1) < 1e-9
+    assert row["distinct_event_name"] == 2          # GetObject, PutObject
+    assert row["distinct_source_ip"] == 1
+    assert row["distinct_event_source"] == 1
+    assert abs(row["read_ratio"] - 0.5) < 1e-9
+# ── Lane split: service principals are excluded ──────────────────────────────
+def test_lane_split_service_principals_yield_no_findings() -> None:
+    """A frame containing only service-lane events returns []."""
+    events = [
+        _event(lane="service", principal="ec2.amazonaws.com", event_name=f"Action{i}")
+        for i in range(100)
+    ]
+    df = _df(events)
+    findings = run(_ctx(df, config={"min_events": 10}))
+    assert findings == []
+def test_lane_split_service_events_excluded_from_aggregation() -> None:
+    """A mixed frame with one interactive and one service-lane principal aggregates
+    only the interactive one."""
+    events = (
+        [_event(principal="alice") for _ in range(5)]
+        + [_event(principal="ec2.amazonaws.com", lane="service") for _ in range(50)]
+    )
+    df = _df(events)
+    from loghunter.detectors.aws import _filter_interactive
+    per = _aggregate_per_principal(_filter_interactive(df))
+    assert list(per["principal"]) == ["alice"]
+# ── Signal 1: rarity ─────────────────────────────────────────────────────────
+def test_rarity_log10_n_over_count() -> None:
+    """For 100 events with three actions in 70/20/10 proportions, rarity is
+    log10(N/count) per action."""
+    import math
+    events = (
+        [_event(event_name="GetObject") for _ in range(70)]
+        + [_event(event_name="ListBuckets") for _ in range(20)]
+        + [_event(event_name="DeleteBucket") for _ in range(10)]
+    )
+    rarity = _compute_rarity(_df(events))
+    assert abs(rarity["GetObject"]   - math.log10(100 / 70)) < 1e-9
+    assert abs(rarity["ListBuckets"] - math.log10(100 / 20)) < 1e-9
+    assert abs(rarity["DeleteBucket"] - math.log10(100 / 10)) < 1e-9
+def test_rarity_empty_frame_returns_empty_dict() -> None:
+    assert _compute_rarity(_df([])) == {}
+# ── Signal 2: weirdness composite ────────────────────────────────────────────
+def test_weirdness_composite_ranks_outlier_first() -> None:
+    """Five principals; one is unambiguously the standout in error rate and
+    distinct source-IP count. It must rank first by composite_z."""
+    # Build N events each for 5 principals; principal 'outlier' has high error
+    # rate AND many distinct source IPs. Others are bland and similar.
+    events: list[dict] = []
+    for name in ["alice", "bob", "carol", "dave"]:
+        events.extend(_event(principal=name, source_ip="192.0.2.10",
+                             event_name="GetObject", error_code=None)
+                      for _ in range(60))
+    for i in range(60):
+        events.append(_event(
+            principal="outlier",
+            source_ip=f"198.51.100.{i % 30}",
+            event_name=f"Action{i % 20}",
+            error_code="AccessDenied" if i % 2 == 0 else None,
+        ))
+    df = _df(events)
+    findings = run(_ctx(df, config={
+        "min_events": 50,
+        "composite_medium_threshold": 1.5,
+        "composite_low_threshold": 0.5,
+    }))
+    ranked = [f for f in findings if f.evidence.get("tier") == "ranked"]
+    assert ranked, "expected at least one ranked finding"
+    assert ranked[0].evidence["principal"] == "outlier"
+def test_weirdness_composite_degenerate_population_yields_zero_z() -> None:
+    """A single scorable principal produces std == 0 across all features; all
+    z-scores collapse to 0, composite is 0, and the synthetic ranked_summary
+    is emitted instead of a per-principal finding."""
+    events = [_event(principal="only-one") for _ in range(60)]
+    df = _df(events)
+    findings = run(_ctx(df, config={"min_events": 50}))
+    ranked = [f for f in findings if f.evidence.get("tier") == "ranked"]
+    summary = [f for f in findings if f.evidence.get("tier") == "ranked_summary"]
+    assert ranked == []
+    assert len(summary) == 1
+    assert summary[0].severity == Severity.INFO
+    assert summary[0].evidence["scorable_count"] == 1
+    assert summary[0].evidence["top_composite_z"] == 0.0
+# ── Signal 3: burst aggregation ──────────────────────────────────────────────
+def _enum_sweep(principal: str, n_firsts: int, gap: float, start_ts: float,
+                error_rate: float = 0.0, n_services: int = 1) -> list[dict]:
+    """Construct an enumeration-sweep event sequence:
+      1. one seed event (so principal isn't all-new)
+      2. n_firsts events with distinct event_names spaced ``gap`` seconds apart
+    """
+    events = [_event(principal=principal, ts=start_ts, event_name="SeedAction")]
+    for i in range(n_firsts):
+        events.append(_event(
+            principal=principal,
+            ts=start_ts + (i + 1) * gap,
+            event_name=f"NewAction{i:03d}",
+            event_source=f"svc{i % n_services}.amazonaws.com",
+            error_code="AccessDenied" if i / n_firsts < error_rate else None,
+        ))
+    return events
+def test_burst_collapses_enumeration_sweep_to_one_finding() -> None:
+    """N first-seen actions within burst_gap_seconds collapse to ONE burst Finding."""
+    events = _enum_sweep("attacker", n_firsts=10, gap=30.0, start_ts=_BASE_TS)
+    df = _df(events)
+    findings = run(_ctx(df, config={
+        "min_events": 1000,            # nobody scorable; only burst tier exposed
+        "burst_gap_seconds": 300,
+        "burst_min_firsts": 3,
+    }))
+    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
+    assert len(bursts) == 1
+    assert bursts[0].evidence["new_action_count"] == 10
+def test_burst_negative_gap_too_wide_produces_no_finding() -> None:
+    """First-seen actions spread wider than burst_gap_seconds produce no burst."""
+    # Gap of 600s with burst_gap_seconds=300 → each first-seen event starts a fresh
+    # singleton burst that never reaches burst_min_firsts.
+    events = _enum_sweep("explorer", n_firsts=10, gap=600.0, start_ts=_BASE_TS)
+    df = _df(events)
+    findings = run(_ctx(df, config={
+        "min_events": 1000,
+        "burst_gap_seconds": 300,
+        "burst_min_firsts": 3,
+    }))
+    assert [f for f in findings if f.evidence.get("tier") == "burst"] == []
+def test_burst_negative_too_few_firsts() -> None:
+    """Fewer than burst_min_firsts first-seen actions produce no burst finding."""
+    events = _enum_sweep("explorer", n_firsts=2, gap=30.0, start_ts=_BASE_TS)
+    df = _df(events)
+    findings = run(_ctx(df, config={
+        "min_events": 1000,
+        "burst_gap_seconds": 300,
+        "burst_min_firsts": 3,
+    }))
+    assert [f for f in findings if f.evidence.get("tier") == "burst"] == []
+def test_burst_skips_principal_very_first_event() -> None:
+    """A principal's first event must NOT count as first-seen (all-new is
+    uninformative — handled by the seed step in _compute_bursts)."""
+    # A principal whose entire footprint is N events of distinct names, with
+    # NO seed: first event seeds, next (N-1) are first-seen.
+    events = [
+        _event(principal="alpha", ts=_BASE_TS + i * 30.0, event_name=f"Action{i:03d}")
+        for i in range(5)
+    ]
+    df = _df(events)
+    findings = run(_ctx(df, config={
+        "min_events": 1000,
+        "burst_gap_seconds": 300,
+        "burst_min_firsts": 3,
+    }))
+    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
+    # 5 events, first is seed, 4 are first-seen → burst of 4 (>= burst_min_firsts=3)
+    assert len(bursts) == 1
+    assert bursts[0].evidence["new_action_count"] == 4
+# ── Severity gates ────────────────────────────────────────────────────────────
+def test_burst_default_severity_is_medium_on_clean_burst() -> None:
+    """A bare large burst with no errors and one service is MEDIUM."""
+    events = _enum_sweep("attacker", n_firsts=10, gap=30.0, start_ts=_BASE_TS,
+                         error_rate=0.0, n_services=1)
+    findings = run(_ctx(_df(events), config={"min_events": 1000}))
+    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
+    assert bursts[0].severity == Severity.MEDIUM
+def test_burst_escalates_to_high_on_error_rate() -> None:
+    """burst error_rate >= burst_high_error_rate → HIGH."""
+    events = _enum_sweep("attacker", n_firsts=10, gap=30.0, start_ts=_BASE_TS,
+                         error_rate=1.0, n_services=1)
+    findings = run(_ctx(_df(events), config={
+        "min_events": 1000,
+        "burst_high_error_rate": 0.5,
+        "burst_high_service_count": 10,   # disable the service gate
+    }))
+    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
+    assert bursts[0].severity == Severity.HIGH
+def test_burst_escalates_to_high_on_service_spread() -> None:
+    """new_service_count >= burst_high_service_count → HIGH."""
+    events = _enum_sweep("attacker", n_firsts=10, gap=30.0, start_ts=_BASE_TS,
+                         error_rate=0.0, n_services=5)
+    findings = run(_ctx(_df(events), config={
+        "min_events": 1000,
+        "burst_high_error_rate": 1.5,    # disable the error gate
+        "burst_high_service_count": 3,
+    }))
+    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
+    assert bursts[0].severity == Severity.HIGH
+def test_burst_never_auto_high_on_size_alone() -> None:
+    """Even a very large clean burst stays MEDIUM — size alone never escalates."""
+    events = _enum_sweep("walker", n_firsts=100, gap=10.0, start_ts=_BASE_TS,
+                         error_rate=0.0, n_services=1)
+    findings = run(_ctx(_df(events), config={
+        "min_events": 1000,
+        "burst_high_error_rate": 0.5,
+        "burst_high_service_count": 3,
+    }))
+    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
+    assert bursts[0].severity == Severity.MEDIUM
+# ── Two clean-corpus cases — Glenn's watch item ──────────────────────────────
+def test_clean_corpus_below_floor_emits_no_ranked_findings() -> None:
+    """When all principals are below min_events, no ranked tier at all — the
+    runner's RunSummary note is what discloses this case, not a detector
+    Finding."""
+    # 3 principals, each with 5 events (default min_events=50).
+    events: list[dict] = []
+    for name in ["alice", "bob", "carol"]:
+        events.extend(_event(principal=name, event_name="GetObject") for _ in range(5))
+    df = _df(events)
+    findings = run(_ctx(df))
+    ranked = [f for f in findings if f.evidence.get("tier") == "ranked"]
+    summary = [f for f in findings if f.evidence.get("tier") == "ranked_summary"]
+    assert ranked == []
+    assert summary == []  # no synthetic summary either
+    assert below_floor_count(df, DEFAULT_CONFIG["min_events"]) == 3
+def test_clean_corpus_scorable_but_below_low_band_emits_one_summary() -> None:
+    """When scorable principals exist but none clears the LOW band, the synthetic
+    ranked_summary INFO finding is emitted (one, not per-principal)."""
+    # 4 principals, identical footprint — z-scores collapse to 0 < LOW.
+    events: list[dict] = []
+    for name in ["alice", "bob", "carol", "dave"]:
+        events.extend(_event(principal=name) for _ in range(60))
+    df = _df(events)
+    findings = run(_ctx(df, config={"min_events": 50}))
+    ranked = [f for f in findings if f.evidence.get("tier") == "ranked"]
+    summary = [f for f in findings if f.evidence.get("tier") == "ranked_summary"]
+    assert ranked == []
+    assert len(summary) == 1
+    assert summary[0].severity == Severity.INFO
+def test_clean_corpus_summary_evidence_carries_scorable_count_and_top() -> None:
+    """The synthetic summary surfaces scorable_count and top_principal (the
+    least-unremarkable actor) as analyst pivot — not just an empty 'quiet' line."""
+    events: list[dict] = []
+    for name in ["alice", "bob"]:
+        events.extend(_event(principal=name) for _ in range(60))
+    df = _df(events)
+    findings = run(_ctx(df, config={"min_events": 50}))
+    summary = [f for f in findings if f.evidence.get("tier") == "ranked_summary"][0]
+    assert summary.evidence["scorable_count"] == 2
+    assert summary.evidence["top_principal"] in {"alice", "bob"}
+    assert "top_composite_z" in summary.evidence
+# ── below_floor_count helper ─────────────────────────────────────────────────
+def test_below_floor_count_pure_helper_counts_correctly() -> None:
+    events: list[dict] = []
+    # 2 below-floor principals (5 events each)
+    for name in ["alice", "bob"]:
+        events.extend(_event(principal=name) for _ in range(5))
+    # 1 at-or-above principal (50 events)
+    events.extend(_event(principal="carol") for _ in range(50))
+    df = _df(events)
+    assert below_floor_count(df, 50) == 2
+def test_below_floor_count_none_returns_zero() -> None:
+    assert below_floor_count(None, 50) == 0
+def test_below_floor_count_empty_returns_zero() -> None:
+    assert below_floor_count(_df([]), 50) == 0
+def test_below_floor_count_ignores_service_lane_principals() -> None:
+    """Service-lane principals aren't candidates for scoring; they don't
+    contribute to below-floor regardless of event count."""
+    events = [_event(principal="ec2.amazonaws.com", lane="service") for _ in range(5)]
+    assert below_floor_count(_df(events), 50) == 0
+def test_below_floor_count_matches_detector_internal_count() -> None:
+    """Same helper, same answer — analysis and disclosure never drift."""
+    events: list[dict] = []
+    for name in ["alice", "bob"]:
+        events.extend(_event(principal=name) for _ in range(5))
+    events.extend(_event(principal="carol") for _ in range(60))
+    df = _df(events)
+    n_via_helper = below_floor_count(df, 50)
+    # And via the detector's actual aggregation: count interactive principals
+    # with event_count < 50 in the per-principal frame.
+    from loghunter.detectors.aws import _filter_interactive
+    per = _aggregate_per_principal(_filter_interactive(df))
+    n_internal = int((per["event_count"] < 50).sum())
+    assert n_via_helper == n_internal == 2
+# ── Output ordering & defensive contracts ────────────────────────────────────
+def test_burst_findings_precede_ranked_findings() -> None:
+    """Two-tier ordering: bursts first, then ranked. Mixed Findings list order."""
+    events = (
+        _enum_sweep("attacker", n_firsts=5, gap=30.0, start_ts=_BASE_TS)
+        + [_event(principal=f"bland{i}",
+                  source_ip=f"192.0.2.{i}",
+                  event_name=f"Bland{j:02d}")
+           for i in range(4) for j in range(60)]
+    )
+    df = _df(events)
+    findings = run(_ctx(df, config={"min_events": 50}))
+    tiers = [f.evidence["tier"] for f in findings]
+    # No "ranked" tier finding may appear before a "burst" tier finding.
+    last_burst_idx = max((i for i, t in enumerate(tiers) if t == "burst"), default=-1)
+    first_other_idx = min(
+        (i for i, t in enumerate(tiers) if t in {"ranked", "ranked_summary"}),
+        default=len(tiers),
+    )
+    assert last_burst_idx < first_other_idx
+def test_empty_frame_returns_empty_list() -> None:
+    df = _df([])
+    assert run(_ctx(df)) == []
+def test_absent_pattern_returns_empty_list() -> None:
+    """context.logs has no entry for *.json* — run() returns [] without raising."""
+    ctx = DetectorContext(
+        logs={},
+        config={},
+        allowlist=SimpleNamespace(filter_df=lambda d, name: d),
+        data_window=_WINDOW,
+        data_sources=[],
+    )
+    assert run(ctx) == []
+def test_low_band_findings_emitted_without_verbose() -> None:
+    """LOW ranked findings are NOT gated on context.verbose; the analyst is
+    asking for the detector by selecting it."""
+    # Make one principal a mild standout — composite ~ 1.0..1.5 range — so it
+    # lands in LOW band with the default thresholds (1.0 → LOW, 2.0 → MEDIUM).
+    events: list[dict] = []
+    for name in ["alice", "bob", "carol", "dave"]:
+        events.extend(_event(principal=name, source_ip="192.0.2.10",
+                             event_name="GetObject")
+                      for _ in range(60))
+    # mild outlier: 2 distinct event names instead of 1
+    for i in range(60):
+        events.append(_event(
+            principal="standout",
+            source_ip="192.0.2.10",
+            event_name="GetObject" if i % 2 == 0 else "ListBuckets",
+        ))
+    df = _df(events)
+    findings_default = run(_ctx(df, config={"min_events": 50}))
+    findings_verbose = run(_ctx(df, config={"min_events": 50}, verbose=True))
+    # Whatever the severity is, the counts must match (no verbose gating).
+    assert (
+        sum(1 for f in findings_default if f.evidence.get("tier") == "ranked")
+        == sum(1 for f in findings_verbose if f.evidence.get("tier") == "ranked")
+    )

tests/test_be_like_water.py ADDED Viewed

@@ -0,0 +1,106 @@
+"""Unit tests for the be_like_water target resolver.
+Gated ladder, evaluated in order — a winning gate decides without falling
+through:
+  Step 0 (gate): trailing slash -> DIRECTORY. No disk consult.
+  Step 1: exists and is_file()  -> FILE.
+  Step 2: exists and is_dir()   -> DIRECTORY.
+  Step 3: does not exist        -> FILE (basename is the filename; parent
+                                  will be mkdir-p'd at write).
+"""
+from __future__ import annotations
+from pathlib import Path
+import pytest
+from loghunter.common.paths import ResolvedTarget, be_like_water
+def test_trailing_slash_gate_wins_over_existing_file(tmp_path: Path) -> None:
+    """Step 0: a target with a trailing slash is DIRECTORY even when a file
+    by that exact name exists on disk. The gate runs before disk reads."""
+    f = tmp_path / "X"
+    f.write_text("preexisting file content", encoding="utf-8")
+    assert f.is_file()  # confirm the file exists
+    result = be_like_water(f"{f}/")   # trailing slash forces directory verdict
+    assert result == ResolvedTarget(Path(f"{f}/").expanduser(), is_file=False)
+    # User intent (trailing slash) wins over disk state.
+def test_existing_file_resolves_to_file(tmp_path: Path) -> None:
+    """Step 1: an existing file with no trailing slash -> FILE at that path."""
+    f = tmp_path / "events.log"
+    f.write_text("data", encoding="utf-8")
+    result = be_like_water(str(f))
+    assert result.is_file is True
+    assert result.path == f
+def test_existing_directory_resolves_to_directory(tmp_path: Path) -> None:
+    """Step 2: an existing directory with no trailing slash -> DIRECTORY."""
+    d = tmp_path / "reports"
+    d.mkdir()
+    result = be_like_water(str(d))
+    assert result.is_file is False
+    assert result.path == d
+def test_not_exists_resolves_to_file(tmp_path: Path) -> None:
+    """Step 3: a path that does not exist -> FILE named by the last segment."""
+    target = tmp_path / "missing" / "leaf"
+    assert not target.exists()
+    result = be_like_water(str(target))
+    assert result.is_file is True
+    assert result.path == target
+    # Verify NO directory was created during resolution — that's a write-time concern.
+    assert not target.parent.exists()
+def test_trailing_slash_on_nonexistent_resolves_to_directory(tmp_path: Path) -> None:
+    """Step 0 (gate): trailing slash on a non-existent path -> DIRECTORY."""
+    target = tmp_path / "a" / "b" / "c"
+    assert not target.exists()
+    result = be_like_water(f"{target}/")
+    assert result.is_file is False
+    # Note: Path() normalizes trailing slashes, so result.path equals the
+    # unsuffixed equivalent — but the verdict is still DIRECTORY.
+    assert result.path == target
+    # Resolver did not create anything.
+    assert not target.exists()
+def test_tilde_reports_consequence(monkeypatch, tmp_path: Path) -> None:
+    """Explicit consequence: only trailing slash, or an already-existing directory,
+    yields directory behavior. `--out=~/reports` (no trailing slash, not exists)
+    creates a FILE named "reports" (after mkdir -p of the parent at write time).
+    This is the surprising-but-consistent behavior we lock down.
+    """
+    # Force ~ to expand to a tmp location so the test does not touch the real HOME.
+    monkeypatch.setenv("HOME", str(tmp_path))
+    result = be_like_water("~/reports")
+    assert result.is_file is True   # NOT a directory verdict
+    assert result.path == tmp_path / "reports"
+def test_expanduser_applied_to_both_branches(monkeypatch, tmp_path: Path) -> None:
+    """expanduser is applied for both the trailing-slash gate and the disk-conform paths."""
+    monkeypatch.setenv("HOME", str(tmp_path))
+    # Trailing slash:
+    dir_result = be_like_water("~/foo/")
+    assert dir_result.path == tmp_path / "foo"
+    assert dir_result.is_file is False
+    # No trailing slash, not exists:
+    file_result = be_like_water("~/foo")
+    assert file_result.path == tmp_path / "foo"
+    assert file_result.is_file is True
+def test_resolved_target_path_is_pathlib_path(tmp_path: Path) -> None:
+    """ResolvedTarget.path is a Path object, not a str — callers depend on it."""
+    result = be_like_water(str(tmp_path))   # tmp_path exists, is dir
+    assert isinstance(result.path, Path)
+    assert isinstance(result.is_file, bool)