PyPI - loghunter-cli - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

loghunter/__init__.py +3 -0
loghunter/cli.py +1108 -0
loghunter/cli_init.py +567 -0
loghunter/common/__init__.py +1 -0
loghunter/common/allowlist.py +436 -0
loghunter/common/clustering.py +326 -0
loghunter/common/config.py +221 -0
loghunter/common/display.py +323 -0
loghunter/common/errors.py +45 -0
loghunter/common/finding.py +239 -0
loghunter/common/loader/__init__.py +136 -0
loghunter/common/loader/diagnostics.py +94 -0
loghunter/common/loader/discovery.py +335 -0
loghunter/common/loader/io.py +76 -0
loghunter/common/loader/pipeline.py +1010 -0
loghunter/common/loader/sniff.py +184 -0
loghunter/common/loader/types.py +207 -0
loghunter/common/loader/windowing.py +523 -0
loghunter/common/output.py +93 -0
loghunter/common/paths.py +105 -0
loghunter/common/sources.py +392 -0
loghunter/data/allowlist/connections.txt +50 -0
loghunter/data/allowlist/domains_devices.txt +5 -0
loghunter/data/allowlist/domains_homelab.txt +5 -0
loghunter/data/allowlist/domains_universal.txt +125 -0
loghunter/data/config_example.toml +144 -0
loghunter/detectors/__init__.py +5 -0
loghunter/detectors/auth.py +27 -0
loghunter/detectors/aws.py +671 -0
loghunter/detectors/beacon.py +258 -0
loghunter/detectors/dns.py +778 -0
loghunter/detectors/dnsblock.py +29 -0
loghunter/detectors/duration.py +178 -0
loghunter/detectors/protocol.py +26 -0
loghunter/detectors/scan.py +735 -0
loghunter/detectors/ssl.py +25 -0
loghunter/detectors/syslog.py +266 -0
loghunter/detectors/weird.py +27 -0
loghunter/digest/__init__.py +43 -0
loghunter/digest/_stats.py +182 -0
loghunter/digest/blob.py +698 -0
loghunter/digest/cloudtrail.py +341 -0
loghunter/digest/conn.py +367 -0
loghunter/digest/dns.py +364 -0
loghunter/digest/syslog.py +269 -0
loghunter/exporters/__init__.py +534 -0
loghunter/exporters/cloudtrail.py +499 -0
loghunter/exporters/splunk.py +222 -0
loghunter/outputs/__init__.py +1 -0
loghunter/outputs/allowlist.py +75 -0
loghunter/outputs/csv.py +70 -0
loghunter/outputs/email.py +44 -0
loghunter/outputs/html.py +99 -0
loghunter/outputs/json.py +77 -0
loghunter/outputs/text.py +1422 -0
loghunter/parsers/__init__.py +1 -0
loghunter/parsers/cloudtrail.py +287 -0
loghunter/parsers/dnsmasq.py +331 -0
loghunter/parsers/syslog.py +150 -0
loghunter/parsers/zeek.py +294 -0
loghunter/parsers/zeek_tsv.py +310 -0
loghunter/runner.py +1895 -0
loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
migrations/cloudtrail_parquet.py +59 -0
migrations/conn_fft.py +550 -0
migrations/conn_scan.py +1097 -0
migrations/dns_dbscan.py +520 -0
migrations/get_syslog.py +402 -0
migrations/syslog_drain3.py +479 -0
scratch/junk/parquet.py +59 -0
tests/__init__.py +1 -0
tests/_cloudtrail_fakes.py +116 -0
tests/conftest.py +17 -0
tests/test_allowlist_defaults_accessor.py +90 -0
tests/test_architecture_spine.py +302 -0
tests/test_aws_detector.py +504 -0
tests/test_be_like_water.py +106 -0
tests/test_cli_help.py +342 -0
tests/test_cli_multi_positional.py +458 -0
tests/test_cloudtrail_exporter.py +631 -0
tests/test_cloudtrail_exporter_botocore.py +207 -0
tests/test_cloudtrail_parser.py +393 -0
tests/test_clustering.py +85 -0
tests/test_clustering_interruptible.py +404 -0
tests/test_config_cli.py +1006 -0
tests/test_config_example_drift.py +164 -0
tests/test_digest_blob.py +1237 -0
tests/test_digest_cli.py +1040 -0
tests/test_digest_cloudtrail.py +980 -0
tests/test_digest_conn.py +1189 -0
tests/test_digest_dns.py +770 -0
tests/test_digest_stats.py +282 -0
tests/test_digest_syslog.py +724 -0
tests/test_display.py +370 -0
tests/test_dns_detector.py +1010 -0
tests/test_dnsmasq_parser.py +467 -0
tests/test_duration_detector.py +491 -0
tests/test_export_orchestrator_shape.py +153 -0
tests/test_init_wizard.py +707 -0
tests/test_loader.py +3639 -0
tests/test_loader_package_surface.py +115 -0
tests/test_loader_window_model.py +215 -0
tests/test_output_path_cascade.py +575 -0
tests/test_resolve_path.py +111 -0
tests/test_root_provenance.py +212 -0
tests/test_runner.py +2599 -0
tests/test_scan_detector.py +455 -0
tests/test_search_paths.py +50 -0
tests/test_sniff_orchestrator.py +373 -0
tests/test_sniff_recognizers.py +573 -0
tests/test_source_resolution_seam.py +471 -0
tests/test_sources.py +648 -0
tests/test_splunk_exporter.py +351 -0
tests/test_syslog_detector.py +458 -0
tests/test_syslog_parser.py +582 -0
tests/test_text_output.py +1225 -0
tests/test_zeek_tsv_parser.py +580 -0

tests/test_cloudtrail_exporter_botocore.py ADDED Viewed

@@ -0,0 +1,207 @@
+"""Botocore-gated tests for the CloudTrail S3 exporter framework.
+These tests construct real botocore exception classes (ClientError,
+NoCredentialsError, MissingDependencyException, EndpointConnectionError) to
+exercise the centralized boto-error translation rail in
+loghunter.exporters.cloudtrail._translate_boto_errors. They are split off
+behind a module-level importorskip so the bulk mock-only suite in
+tests/test_cloudtrail_exporter.py runs on a base checkout without botocore.
+The FakeS3Client / _gz_envelope helpers are shared via tests._cloudtrail_fakes
+(no botocore in that module). All bucket names and account IDs are obviously
+fake.
+"""
+from __future__ import annotations
+from datetime import datetime, timezone
+from pathlib import Path
+import pytest
+# Gate the whole module: skip on a base checkout without botocore.
+botocore_exc = pytest.importorskip("botocore.exceptions")
+from loghunter.exporters import cloudtrail as ct
+from tests._cloudtrail_fakes import FakeS3Client, _gz_envelope
+def _make_client_error(code: str) -> botocore_exc.ClientError:
+    return botocore_exc.ClientError(
+        {"Error": {"Code": code, "Message": code}}, "Op"
+    )
+def test_access_denied_in_sibling_branch_does_not_abort_pull(monkeypatch) -> None:
+    """P2: an AccessDenied on a non-CloudTrail sibling must NOT abort the run.
+    Common bucket-policy pattern: CloudTrail/ is readable to the analyst, ELB/
+    is restricted. Pre-fix, the walker descended into elasticloadbalancing/,
+    triggered AccessDenied, and the entire pull aborted with the auth-error
+    ValueError. With pruning, the sibling is never listed.
+    """
+    client = FakeS3Client()
+    ct_base = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
+    client.add_object(ct_base + "obj1.json.gz", _gz_envelope([
+        {"eventTime": "2026-06-01T01:00:00Z", "eventName": "Good"},
+    ]))
+    # Booby-trap: listing inside the ELB branch raises AccessDenied.
+    client.set_list_error_for_prefix(
+        "AWSLogs/000000000000/elasticloadbalancing/",
+        _make_client_error("AccessDenied"),
+    )
+    # Make sure the ELB prefix actually appears as a CommonPrefix when listing
+    # the account level — we need a key under it for the fake to surface it.
+    client.add_object(
+        "AWSLogs/000000000000/elasticloadbalancing/marker", b"",
+    )
+    monkeypatch.setattr(ct.boto3, "client", lambda _svc: client)
+    cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
+    since = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
+    until = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)
+    # Must complete cleanly — the sibling AccessDenied must never fire.
+    events, meta = ct.fetch({}, cfg, since, until, verbose=False)
+    assert meta["units"] == 1
+    assert events[0]["eventName"] == "Good"
+# ── auth errors take priority over bad-object handling ──────────────────────
+def test_auth_error_from_list_path(monkeypatch) -> None:
+    client = FakeS3Client()
+    client.set_list_error(_make_client_error("AccessDenied"))
+    monkeypatch.setattr(ct.boto3, "client", lambda _svc: client)
+    cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
+    since = datetime(2026, 6, 1, tzinfo=timezone.utc)
+    until = datetime(2026, 6, 2, tzinfo=timezone.utc)
+    with pytest.raises(ValueError, match="AWS credentials not found or expired"):
+        ct.fetch({}, cfg, since, until, verbose=False)
+def test_auth_error_from_get_path_aborts_run(monkeypatch) -> None:
+    """A denied get_object must NOT be downgraded to bad-object skip-and-warn."""
+    client = FakeS3Client()
+    base = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
+    client.add_object(base + "obj1.json.gz", _gz_envelope([
+        {"eventTime": "2026-06-01T01:00:00Z", "eventName": "x"},
+    ]))
+    client.set_get_object_error(base + "obj1.json.gz", _make_client_error("ExpiredToken"))
+    monkeypatch.setattr(ct.boto3, "client", lambda _svc: client)
+    cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
+    since = datetime(2026, 6, 1, tzinfo=timezone.utc)
+    until = datetime(2026, 6, 2, tzinfo=timezone.utc)
+    with pytest.raises(ValueError, match="AWS credentials not found or expired"):
+        ct.fetch({}, cfg, since, until, verbose=False)
+def test_no_credentials_error_handled(monkeypatch) -> None:
+    client = FakeS3Client()
+    client.set_list_error(botocore_exc.NoCredentialsError())
+    monkeypatch.setattr(ct.boto3, "client", lambda _svc: client)
+    cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
+    since = datetime(2026, 6, 1, tzinfo=timezone.utc)
+    until = datetime(2026, 6, 2, tzinfo=timezone.utc)
+    with pytest.raises(ValueError, match="AWS credentials not found or expired"):
+        ct.fetch({}, cfg, since, until, verbose=False)
+# ── MissingDependencyException + BotoCoreError sweep (actionable-error rail) ──
+def _missing_dep_exc() -> botocore_exc.MissingDependencyException:
+    """Mirror the real-world SSO/login-provider message."""
+    return botocore_exc.MissingDependencyException(
+        msg="Using the login credential provider requires an additional dependency. "
+            "Please install with `pip install 'botocore[crt]'`"
+    )
+def test_missing_dependency_at_client_construction_maps_to_actionable_error(
+    monkeypatch,
+) -> None:
+    """MissingDependencyException at boto3.client() must become an actionable ValueError."""
+    def _raise(_svc):
+        raise _missing_dep_exc()
+    monkeypatch.setattr(ct.boto3, "client", _raise)
+    cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
+    since = datetime(2026, 6, 1, tzinfo=timezone.utc)
+    until = datetime(2026, 6, 2, tzinfo=timezone.utc)
+    with pytest.raises(ValueError) as exc_info:
+        ct.fetch({}, cfg, since, until, verbose=False)
+    msg = str(exc_info.value)
+    assert "botocore[crt]" in msg
+    assert "credential provider" in msg
+    # The original botocore detail must be embedded so the user sees the exact
+    # missing piece (it varies — login vs SSO vs other providers).
+    assert "login credential provider" in msg
+def test_missing_dependency_at_list_call_maps_to_actionable_error(
+    monkeypatch,
+) -> None:
+    """A list-call MissingDependencyException must map the same way (not propagate raw)."""
+    client = FakeS3Client()
+    client.set_list_error(_missing_dep_exc())
+    monkeypatch.setattr(ct.boto3, "client", lambda _svc: client)
+    cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
+    since = datetime(2026, 6, 1, tzinfo=timezone.utc)
+    until = datetime(2026, 6, 2, tzinfo=timezone.utc)
+    with pytest.raises(ValueError, match="botocore\\[crt\\]"):
+        ct.fetch({}, cfg, since, until, verbose=False)
+def test_missing_dependency_at_get_call_maps_to_actionable_error(
+    monkeypatch,
+) -> None:
+    """A get_object MissingDependencyException must abort with the actionable message,
+    NOT be downgraded to the corrupt-object skip-and-warn path."""
+    client = FakeS3Client()
+    base = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
+    client.add_object(base + "obj1.json.gz", _gz_envelope([
+        {"eventTime": "2026-06-01T01:00:00Z", "eventName": "x"},
+    ]))
+    client.set_get_object_error(base + "obj1.json.gz", _missing_dep_exc())
+    monkeypatch.setattr(ct.boto3, "client", lambda _svc: client)
+    cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
+    since = datetime(2026, 6, 1, tzinfo=timezone.utc)
+    until = datetime(2026, 6, 2, tzinfo=timezone.utc)
+    with pytest.raises(ValueError, match="botocore\\[crt\\]"):
+        ct.fetch({}, cfg, since, until, verbose=False)
+def test_generic_botocore_error_is_wrapped(monkeypatch) -> None:
+    """A long-tail BotoCoreError subclass must become 'AWS error during CloudTrail export'."""
+    client = FakeS3Client()
+    client.set_list_error(
+        botocore_exc.EndpointConnectionError(endpoint_url="https://example.invalid")
+    )
+    monkeypatch.setattr(ct.boto3, "client", lambda _svc: client)
+    cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
+    since = datetime(2026, 6, 1, tzinfo=timezone.utc)
+    until = datetime(2026, 6, 2, tzinfo=timezone.utc)
+    with pytest.raises(ValueError) as exc_info:
+        ct.fetch({}, cfg, since, until, verbose=False)
+    msg = str(exc_info.value)
+    assert "AWS error during CloudTrail export" in msg
+    # Original detail embedded for diagnosis
+    assert "example.invalid" in msg
+def test_non_auth_client_error_is_wrapped(monkeypatch) -> None:
+    """A non-auth ClientError (e.g. NoSuchBucket) must be wrapped, not propagated raw."""
+    client = FakeS3Client()
+    client.set_list_error(_make_client_error("NoSuchBucket"))
+    monkeypatch.setattr(ct.boto3, "client", lambda _svc: client)
+    cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
+    since = datetime(2026, 6, 1, tzinfo=timezone.utc)
+    until = datetime(2026, 6, 2, tzinfo=timezone.utc)
+    with pytest.raises(ValueError) as exc_info:
+        ct.fetch({}, cfg, since, until, verbose=False)
+    msg = str(exc_info.value)
+    assert "AWS error during CloudTrail export" in msg
+    assert "NoSuchBucket" in msg

tests/test_cloudtrail_parser.py ADDED Viewed

@@ -0,0 +1,393 @@
+"""Unit tests for loghunter.parsers.cloudtrail.parse_event.
+Pure-function tests: no I/O, no DataFrames, no fixtures on disk. Every event is
+built with the ``_event()`` helper so each test states only the field(s) it is
+about. All values are synthetic per the privacy rail — RFC 5737 IPs only, AWS
+documentation account ``123456789012``, and obvious-placeholder names.
+"""
+from __future__ import annotations
+from loghunter.parsers.cloudtrail import parse_event
+_DOCS_ACCOUNT = "123456789012"  # AWS documentation account id
+def _event(**overrides) -> dict:
+    """Build a minimal valid CloudTrail event dict with field overrides.
+    Defaults model a single benign IAMUser GetObject call. Override anything you
+    want to vary; everything else stays sane. Pass ``userIdentity={...}`` to
+    replace the whole identity stanza, or use a nested key like
+    ``userIdentity_type="AssumedRole"`` for shorthand isn't supported — give the
+    full dict.
+    """
+    base: dict = {
+        "eventTime":       "2026-06-01T12:00:00Z",
+        "eventSource":     "s3.amazonaws.com",
+        "eventName":       "GetObject",
+        "eventID":         "11111111-1111-1111-1111-111111111111",
+        "awsRegion":       "us-east-1",
+        "sourceIPAddress": "192.0.2.10",
+        "userIdentity": {
+            "type":        "IAMUser",
+            "userName":    "placeholder-user",
+            "principalId": "AIDAEXAMPLE",
+            "arn":         f"arn:aws:iam::{_DOCS_ACCOUNT}:user/placeholder-user",
+        },
+        "readOnly": True,
+    }
+    base.update(overrides)
+    return base
+# ── principal derivation ──────────────────────────────────────────────────────
+def test_principal_assumed_role_uses_session_issuer_user_name() -> None:
+    event = _event(userIdentity={
+        "type":        "AssumedRole",
+        "principalId": "AROAEXAMPLE:session-alpha",
+        "arn":         f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/placeholder-role/session-alpha",
+        "sessionContext": {
+            "sessionIssuer": {
+                "type":        "Role",
+                "principalId": "AROAEXAMPLE",
+                "userName":    "placeholder-role",
+                "arn":         f"arn:aws:iam::{_DOCS_ACCOUNT}:role/placeholder-role",
+            },
+        },
+    })
+    assert parse_event(event)["principal"] == "placeholder-role"
+def test_principal_assumed_role_falls_back_to_arn_last_segment_when_no_username() -> None:
+    event = _event(userIdentity={
+        "type":        "AssumedRole",
+        "principalId": "AROAEXAMPLE:session-alpha",
+        "sessionContext": {
+            "sessionIssuer": {
+                "type":        "Role",
+                "principalId": "AROAEXAMPLE",
+                "arn":         f"arn:aws:iam::{_DOCS_ACCOUNT}:role/placeholder-role",
+                # userName intentionally omitted
+            },
+        },
+    })
+    assert parse_event(event)["principal"] == "placeholder-role"
+def test_principal_assumed_role_is_stable_across_sessions_of_same_role() -> None:
+    """Load-bearing: two events from different sessions of one role aggregate together."""
+    issuer = {
+        "type":        "Role",
+        "principalId": "AROAEXAMPLE",
+        "userName":    "placeholder-role",
+        "arn":         f"arn:aws:iam::{_DOCS_ACCOUNT}:role/placeholder-role",
+    }
+    session_one = _event(userIdentity={
+        "type":           "AssumedRole",
+        "principalId":    "AROAEXAMPLE:session-alpha",
+        "arn":            f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/placeholder-role/session-alpha",
+        "sessionContext": {"sessionIssuer": issuer},
+    })
+    session_two = _event(userIdentity={
+        "type":           "AssumedRole",
+        "principalId":    "AROAEXAMPLE:session-beta",
+        "arn":            f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/placeholder-role/session-beta",
+        "sessionContext": {"sessionIssuer": issuer},
+    })
+    p1 = parse_event(session_one)["principal"]
+    p2 = parse_event(session_two)["principal"]
+    assert p1 == p2 == "placeholder-role"
+    # Session name must never become the key.
+    assert "session-alpha" not in p1 and "session-beta" not in p2
+def test_principal_iam_user_uses_user_name() -> None:
+    event = _event(userIdentity={
+        "type":        "IAMUser",
+        "userName":    "placeholder-user",
+        "principalId": "AIDAEXAMPLE",
+        "arn":         f"arn:aws:iam::{_DOCS_ACCOUNT}:user/placeholder-user",
+    })
+    assert parse_event(event)["principal"] == "placeholder-user"
+def test_principal_iam_user_falls_back_to_arn_last_segment() -> None:
+    event = _event(userIdentity={
+        "type":        "IAMUser",
+        "principalId": "AIDAEXAMPLE",
+        "arn":         f"arn:aws:iam::{_DOCS_ACCOUNT}:user/arn-derived-name",
+        # userName intentionally omitted
+    })
+    assert parse_event(event)["principal"] == "arn-derived-name"
+def test_principal_aws_service_uses_invoked_by() -> None:
+    event = _event(userIdentity={
+        "type":      "AWSService",
+        "invokedBy": "ec2.amazonaws.com",
+    })
+    assert parse_event(event)["principal"] == "ec2.amazonaws.com"
+def test_principal_root_returns_root_literal() -> None:
+    event = _event(userIdentity={
+        "type":        "Root",
+        "principalId": _DOCS_ACCOUNT,
+        "arn":         f"arn:aws:iam::{_DOCS_ACCOUNT}:root",
+    })
+    assert parse_event(event)["principal"] == "root"
+def test_principal_federated_user_falls_back_to_principal_id() -> None:
+    event = _event(userIdentity={
+        "type":        "FederatedUser",
+        "principalId": f"{_DOCS_ACCOUNT}:placeholder-federated",
+    })
+    assert parse_event(event)["principal"] == f"{_DOCS_ACCOUNT}:placeholder-federated"
+def test_principal_saml_user_falls_back_to_principal_id() -> None:
+    event = _event(userIdentity={
+        "type":        "SAMLUser",
+        "principalId": "SAMLEXAMPLE:placeholder-saml",
+    })
+    assert parse_event(event)["principal"] == "SAMLEXAMPLE:placeholder-saml"
+def test_principal_missing_user_identity_returns_unknown_without_raising() -> None:
+    event = _event()
+    event.pop("userIdentity")
+    assert parse_event(event)["principal"] == "unknown"
+def test_principal_non_dict_user_identity_returns_unknown_without_raising() -> None:
+    event = _event(userIdentity="not-a-dict")
+    assert parse_event(event)["principal"] == "unknown"
+def test_principal_distinct_principal_ids_under_unknown_type_stay_distinct() -> None:
+    event_a = _event(userIdentity={"type": "FutureUnknownType", "principalId": "AAA-EXAMPLE"})
+    event_b = _event(userIdentity={"type": "FutureUnknownType", "principalId": "BBB-EXAMPLE"})
+    assert parse_event(event_a)["principal"] == "AAA-EXAMPLE"
+    assert parse_event(event_b)["principal"] == "BBB-EXAMPLE"
+    assert parse_event(event_a)["principal"] != parse_event(event_b)["principal"]
+# ── lane derivation ───────────────────────────────────────────────────────────
+def test_lane_aws_service_type_is_service() -> None:
+    event = _event(userIdentity={"type": "AWSService", "invokedBy": "lambda.amazonaws.com"})
+    assert parse_event(event)["lane"] == "service"
+def test_lane_aws_account_type_is_service() -> None:
+    event = _event(userIdentity={"type": "AWSAccount", "principalId": "EXAMPLEACCT"})
+    assert parse_event(event)["lane"] == "service"
+def test_lane_invoked_by_amazonaws_com_is_service() -> None:
+    event = _event(userIdentity={
+        "type":      "AssumedRole",
+        "invokedBy": "config.amazonaws.com",
+        "sessionContext": {"sessionIssuer": {
+            "type":     "Role",
+            "userName": "placeholder-role",
+        }},
+    })
+    assert parse_event(event)["lane"] == "service"
+def test_lane_service_role_in_arn_is_service() -> None:
+    event = _event(userIdentity={
+        "type":        "AssumedRole",
+        "principalId": "AROAEXAMPLE:session-x",
+        "arn": (
+            f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/"
+            "AWSServiceRoleForPlaceholder/session-x"
+        ),
+    })
+    assert parse_event(event)["lane"] == "service"
+def test_lane_service_role_in_session_issuer_arn_is_service() -> None:
+    event = _event(userIdentity={
+        "type":        "AssumedRole",
+        "principalId": "AROAEXAMPLE:session-x",
+        "arn":         f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/innocuous/session-x",
+        "sessionContext": {"sessionIssuer": {
+            "type": "Role",
+            "arn":  f"arn:aws:iam::{_DOCS_ACCOUNT}:role/aws-service-role/AWSServiceRoleForExample",
+        }},
+    })
+    assert parse_event(event)["lane"] == "service"
+def test_lane_plain_iam_user_is_interactive() -> None:
+    assert parse_event(_event())["lane"] == "interactive"
+def test_lane_human_assumed_role_is_interactive() -> None:
+    event = _event(userIdentity={
+        "type":        "AssumedRole",
+        "principalId": "AROAEXAMPLE:session-x",
+        "arn":         f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/placeholder-role/session-x",
+        "sessionContext": {"sessionIssuer": {
+            "type":     "Role",
+            "userName": "placeholder-role",
+            "arn":      f"arn:aws:iam::{_DOCS_ACCOUNT}:role/placeholder-role",
+        }},
+    })
+    assert parse_event(event)["lane"] == "interactive"
+def test_lane_root_is_interactive() -> None:
+    event = _event(userIdentity={
+        "type":        "Root",
+        "principalId": _DOCS_ACCOUNT,
+        "arn":         f"arn:aws:iam::{_DOCS_ACCOUNT}:root",
+    })
+    assert parse_event(event)["lane"] == "interactive"
+# ── read_write derivation ─────────────────────────────────────────────────────
+def test_read_write_boolean_true_is_read() -> None:
+    assert parse_event(_event(readOnly=True))["read_write"] == "read"
+def test_read_write_boolean_false_is_write() -> None:
+    assert parse_event(_event(readOnly=False))["read_write"] == "write"
+def test_read_write_string_true_is_read() -> None:
+    assert parse_event(_event(readOnly="true"))["read_write"] == "read"
+def test_read_write_string_false_is_write() -> None:
+    assert parse_event(_event(readOnly="false"))["read_write"] == "write"
+def test_read_write_absent_get_verb_is_read() -> None:
+    event = _event(eventName="GetCallerIdentity")
+    event.pop("readOnly")
+    assert parse_event(event)["read_write"] == "read"
+def test_read_write_absent_list_verb_is_read() -> None:
+    event = _event(eventName="ListBuckets")
+    event.pop("readOnly")
+    assert parse_event(event)["read_write"] == "read"
+def test_read_write_absent_put_verb_is_write() -> None:
+    event = _event(eventName="PutObject")
+    event.pop("readOnly")
+    assert parse_event(event)["read_write"] == "write"
+def test_read_write_absent_delete_verb_is_write() -> None:
+    event = _event(eventName="DeleteBucket")
+    event.pop("readOnly")
+    assert parse_event(event)["read_write"] == "write"
+def test_read_write_absent_run_instances_is_write() -> None:
+    event = _event(eventName="RunInstances")
+    event.pop("readOnly")
+    assert parse_event(event)["read_write"] == "write"
+def test_read_write_absent_empty_event_name_is_write() -> None:
+    event = _event(eventName="")
+    event.pop("readOnly")
+    assert parse_event(event)["read_write"] == "write"
+# ── ts derivation ─────────────────────────────────────────────────────────────
+def test_ts_valid_event_time_parses_to_epoch_float() -> None:
+    event = _event(eventTime="2026-06-01T12:00:00Z")
+    ts = parse_event(event)["ts"]
+    assert isinstance(ts, float)
+    # 2026-06-01T12:00:00Z is well past the unix epoch; specific value documented
+    # via fromisoformat reproducibility, not magic-numbered here.
+    from datetime import datetime
+    expected = datetime.fromisoformat("2026-06-01T12:00:00+00:00").timestamp()
+    assert ts == expected
+def test_ts_missing_event_time_is_none() -> None:
+    event = _event()
+    event.pop("eventTime")
+    assert parse_event(event)["ts"] is None
+def test_ts_garbage_event_time_is_none() -> None:
+    event = _event(eventTime="not-a-timestamp")
+    assert parse_event(event)["ts"] is None
+# ── Carried fields ────────────────────────────────────────────────────────────
+_ALL_KEYS = {
+    "ts", "principal", "lane", "read_write",
+    "event_source", "event_name", "identity_type",
+    "source_ip", "error_code", "aws_region", "event_id", "raw",
+}
+def test_every_row_has_all_twelve_canonical_keys() -> None:
+    row = parse_event(_event())
+    assert set(row.keys()) == _ALL_KEYS
+def test_error_code_is_none_on_success_events() -> None:
+    # Default fixture has no errorCode key — success path.
+    assert parse_event(_event())["error_code"] is None
+def test_error_code_carried_when_present() -> None:
+    assert parse_event(_event(errorCode="AccessDenied"))["error_code"] == "AccessDenied"
+def test_event_source_carried_verbatim_no_suffix_strip() -> None:
+    # The full suffix is part of the analyst's pivot — never strip "amazonaws.com".
+    assert parse_event(_event(eventSource="s3.amazonaws.com"))["event_source"] == "s3.amazonaws.com"
+def test_carried_fields_pass_through_unchanged() -> None:
+    row = parse_event(_event())
+    assert row["event_name"]    == "GetObject"
+    assert row["identity_type"] == "IAMUser"
+    assert row["source_ip"]     == "192.0.2.10"
+    assert row["aws_region"]    == "us-east-1"
+    assert row["event_id"]      == "11111111-1111-1111-1111-111111111111"
+def test_raw_holds_original_event_dict() -> None:
+    event = _event(extraField="future-detector-fodder")
+    row = parse_event(event)
+    assert row["raw"] is event
+    assert row["raw"]["extraField"] == "future-detector-fodder"
+def test_identity_type_none_when_user_identity_missing() -> None:
+    event = _event()
+    event.pop("userIdentity")
+    assert parse_event(event)["identity_type"] is None
+def test_identity_type_none_when_user_identity_not_dict() -> None:
+    assert parse_event(_event(userIdentity=42))["identity_type"] is None
+# ── Defensive non-dict input ──────────────────────────────────────────────────
+def test_parse_event_returns_none_for_non_dict_input() -> None:
+    assert parse_event(None) is None
+    assert parse_event("string") is None
+    assert parse_event([{"eventName": "GetObject"}]) is None
+    assert parse_event(42) is None

tests/test_clustering.py ADDED Viewed

@@ -0,0 +1,85 @@
+"""Tests for the HDBSCAN backend shim — resolution and exposure only.
+Contract under test (loghunter.common.clustering):
+1. ``HDBSCAN`` is exposed as a class at module level, constructable with the
+   standard ``min_cluster_size=`` and ``min_samples=`` kwargs and exposing
+   ``fit_predict``.
+2. ``ACTIVE_BACKEND`` is one of ``{"fast_hdbscan", "hdbscan"}`` and matches
+   whichever backend is actually importable in the current environment, in
+   the same priority order the shim itself uses.
+3. When ``fast_hdbscan`` is force-blocked at import time, the shim falls
+   back to stock ``hdbscan`` and reports ``ACTIVE_BACKEND == "hdbscan"``.
+Clustering numerics and equivalence between the two backends are out of
+scope — that lives with the dns detector tests.
+"""
+from __future__ import annotations
+import importlib
+import sys
+import pytest
+from loghunter.common import clustering
+def _expected_backend_in_env() -> str:
+    """Resolve the expected backend in the same priority order as the shim.
+    Mirrors the shim's logic exactly so the assertion fails clearly if the
+    base-dependency expectation (stock ``hdbscan`` always present) is
+    violated by the test environment.
+    """
+    try:
+        import fast_hdbscan  # noqa: F401
+        return "fast_hdbscan"
+    except ImportError:
+        try:
+            import hdbscan  # noqa: F401
+            return "hdbscan"
+        except ImportError as e:
+            pytest.fail(
+                "Neither fast_hdbscan nor hdbscan is importable in the test "
+                "environment. hdbscan is the base dependency of loghunt and "
+                "must be present for the shim to resolve. Original error: "
+                f"{e!r}"
+            )
+def test_shim_exposes_constructable_hdbscan_class():
+    cls = clustering.HDBSCAN
+    assert isinstance(cls, type), "HDBSCAN must be exposed as a class, not a factory"
+    instance = cls(min_cluster_size=5, min_samples=2)
+    assert hasattr(instance, "fit_predict"), "HDBSCAN instance must expose fit_predict"
+def test_active_backend_is_one_of_expected_strings():
+    assert clustering.ACTIVE_BACKEND in {"fast_hdbscan", "hdbscan"}
+def test_active_backend_matches_environment():
+    assert clustering.ACTIVE_BACKEND == _expected_backend_in_env()
+def test_fallback_resolves_to_hdbscan_when_fast_hdbscan_blocked(monkeypatch):
+    """Force-block fast_hdbscan and reload; the shim must fall through to hdbscan.
+    Uses the standard ``sys.modules[name] = None`` sentinel pattern: when the
+    import machinery sees None in ``sys.modules`` for a name, it raises
+    ``ModuleNotFoundError`` rather than attempting to resolve the module.
+    That gives us deterministic fallback coverage regardless of whether
+    fast_hdbscan is actually installed on disk.
+    """
+    monkeypatch.setitem(sys.modules, "fast_hdbscan", None)
+    try:
+        importlib.reload(clustering)
+        assert clustering.ACTIVE_BACKEND == "hdbscan"
+        cls = clustering.HDBSCAN
+        assert isinstance(cls, type)
+        instance = cls(min_cluster_size=5, min_samples=2)
+        assert hasattr(instance, "fit_predict")
+    finally:
+        sys.modules.pop("fast_hdbscan", None)
+        importlib.reload(clustering)