loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,207 @@
1
+ """Botocore-gated tests for the CloudTrail S3 exporter framework.
2
+
3
+ These tests construct real botocore exception classes (ClientError,
4
+ NoCredentialsError, MissingDependencyException, EndpointConnectionError) to
5
+ exercise the centralized boto-error translation rail in
6
+ loghunter.exporters.cloudtrail._translate_boto_errors. They are split off
7
+ behind a module-level importorskip so the bulk mock-only suite in
8
+ tests/test_cloudtrail_exporter.py runs on a base checkout without botocore.
9
+
10
+ The FakeS3Client / _gz_envelope helpers are shared via tests._cloudtrail_fakes
11
+ (no botocore in that module). All bucket names and account IDs are obviously
12
+ fake.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from datetime import datetime, timezone
18
+ from pathlib import Path
19
+
20
+ import pytest
21
+
22
+ # Gate the whole module: skip on a base checkout without botocore.
23
+ botocore_exc = pytest.importorskip("botocore.exceptions")
24
+
25
+ from loghunter.exporters import cloudtrail as ct
26
+
27
+ from tests._cloudtrail_fakes import FakeS3Client, _gz_envelope
28
+
29
+
30
def _make_client_error(code: str) -> botocore_exc.ClientError:
    """Build a botocore ``ClientError`` whose code and message are both *code*.

    The operation name is the fixed placeholder ``"Op"``.
    """
    error_response = {"Error": {"Code": code, "Message": code}}
    return botocore_exc.ClientError(error_response, "Op")
34
+
35
+
36
def test_access_denied_in_sibling_branch_does_not_abort_pull(monkeypatch) -> None:
    """P2: AccessDenied on a non-CloudTrail sibling prefix must not abort the pull.

    Typical bucket policy: the analyst may read CloudTrail/ but not ELB/.
    Before the fix the walker listed elasticloadbalancing/, hit AccessDenied,
    and the whole pull died with the auth-error ValueError. With pruning the
    sibling prefix is never listed at all.
    """
    trail_prefix = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
    good_events = [{"eventTime": "2026-06-01T01:00:00Z", "eventName": "Good"}]

    fake_s3 = FakeS3Client()
    fake_s3.add_object(trail_prefix + "obj1.json.gz", _gz_envelope(good_events))
    # Booby-trap: any listing inside the ELB branch raises AccessDenied.
    fake_s3.set_list_error_for_prefix(
        "AWSLogs/000000000000/elasticloadbalancing/",
        _make_client_error("AccessDenied"),
    )
    # The fake only surfaces a CommonPrefix at the account level when a key
    # exists beneath it, so plant an empty marker under the ELB prefix.
    fake_s3.add_object("AWSLogs/000000000000/elasticloadbalancing/marker", b"")

    def _fake_boto_client(_svc):
        return fake_s3

    monkeypatch.setattr(ct.boto3, "client", _fake_boto_client)

    export_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)

    # Must finish cleanly: the sibling's AccessDenied must never be triggered.
    events, meta = ct.fetch({}, export_cfg, window_start, window_end, verbose=False)
    assert meta["units"] == 1
    first_event = events[0]
    assert first_event["eventName"] == "Good"
69
+
70
+
71
+ # ── auth errors take priority over bad-object handling ──────────────────────
72
+
73
+
74
def test_auth_error_from_list_path(monkeypatch) -> None:
    """An AccessDenied raised while listing surfaces as the credentials ValueError."""
    fake_s3 = FakeS3Client()
    fake_s3.set_list_error(_make_client_error("AccessDenied"))

    def _fake_boto_client(_svc):
        return fake_s3

    monkeypatch.setattr(ct.boto3, "client", _fake_boto_client)

    export_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)

    with pytest.raises(ValueError, match="AWS credentials not found or expired"):
        ct.fetch({}, export_cfg, window_start, window_end, verbose=False)
83
+
84
+
85
def test_auth_error_from_get_path_aborts_run(monkeypatch) -> None:
    """A denied get_object must abort the run, not fall into bad-object skip-and-warn."""
    object_key = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/" + "obj1.json.gz"
    payload = _gz_envelope([
        {"eventTime": "2026-06-01T01:00:00Z", "eventName": "x"},
    ])

    fake_s3 = FakeS3Client()
    fake_s3.add_object(object_key, payload)
    fake_s3.set_get_object_error(object_key, _make_client_error("ExpiredToken"))

    def _fake_boto_client(_svc):
        return fake_s3

    monkeypatch.setattr(ct.boto3, "client", _fake_boto_client)

    export_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)

    with pytest.raises(ValueError, match="AWS credentials not found or expired"):
        ct.fetch({}, export_cfg, window_start, window_end, verbose=False)
99
+
100
+
101
def test_no_credentials_error_handled(monkeypatch) -> None:
    """A botocore NoCredentialsError during listing becomes the credentials ValueError."""
    fake_s3 = FakeS3Client()
    fake_s3.set_list_error(botocore_exc.NoCredentialsError())

    def _fake_boto_client(_svc):
        return fake_s3

    monkeypatch.setattr(ct.boto3, "client", _fake_boto_client)

    export_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)

    with pytest.raises(ValueError, match="AWS credentials not found or expired"):
        ct.fetch({}, export_cfg, window_start, window_end, verbose=False)
110
+
111
+
112
+ # ── MissingDependencyException + BotoCoreError sweep (actionable-error rail) ──
113
+
114
+
115
def _missing_dep_exc() -> botocore_exc.MissingDependencyException:
    """Return a MissingDependencyException mirroring the real SSO/login-provider message."""
    detail = (
        "Using the login credential provider requires an additional dependency. "
        "Please install with `pip install 'botocore[crt]'`"
    )
    return botocore_exc.MissingDependencyException(msg=detail)
121
+
122
+
123
def test_missing_dependency_at_client_construction_maps_to_actionable_error(
    monkeypatch,
) -> None:
    """MissingDependencyException from boto3.client() must become an actionable ValueError."""
    def _client_factory_that_fails(_svc):
        raise _missing_dep_exc()

    monkeypatch.setattr(ct.boto3, "client", _client_factory_that_fails)

    export_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)

    with pytest.raises(ValueError) as caught:
        ct.fetch({}, export_cfg, window_start, window_end, verbose=False)

    text = str(caught.value)
    assert "botocore[crt]" in text
    assert "credential provider" in text
    # The original botocore detail has to be embedded so the user sees exactly
    # which piece is missing (it varies: login vs SSO vs other providers).
    assert "login credential provider" in text
142
+
143
+
144
def test_missing_dependency_at_list_call_maps_to_actionable_error(
    monkeypatch,
) -> None:
    """MissingDependencyException from a list call maps the same way; it must not escape raw."""
    fake_s3 = FakeS3Client()
    fake_s3.set_list_error(_missing_dep_exc())

    def _fake_boto_client(_svc):
        return fake_s3

    monkeypatch.setattr(ct.boto3, "client", _fake_boto_client)

    export_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)

    with pytest.raises(ValueError, match=r"botocore\[crt\]"):
        ct.fetch({}, export_cfg, window_start, window_end, verbose=False)
156
+
157
+
158
def test_missing_dependency_at_get_call_maps_to_actionable_error(
    monkeypatch,
) -> None:
    """MissingDependencyException from get_object must abort with the actionable message.

    It must NOT be downgraded to the corrupt-object skip-and-warn path.
    """
    object_key = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/" + "obj1.json.gz"
    payload = _gz_envelope([
        {"eventTime": "2026-06-01T01:00:00Z", "eventName": "x"},
    ])

    fake_s3 = FakeS3Client()
    fake_s3.add_object(object_key, payload)
    fake_s3.set_get_object_error(object_key, _missing_dep_exc())

    def _fake_boto_client(_svc):
        return fake_s3

    monkeypatch.setattr(ct.boto3, "client", _fake_boto_client)

    export_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)

    with pytest.raises(ValueError, match=r"botocore\[crt\]"):
        ct.fetch({}, export_cfg, window_start, window_end, verbose=False)
175
+
176
+
177
def test_generic_botocore_error_is_wrapped(monkeypatch) -> None:
    """A long-tail BotoCoreError subclass must become 'AWS error during CloudTrail export'."""
    connection_failure = botocore_exc.EndpointConnectionError(
        endpoint_url="https://example.invalid"
    )
    fake_s3 = FakeS3Client()
    fake_s3.set_list_error(connection_failure)

    def _fake_boto_client(_svc):
        return fake_s3

    monkeypatch.setattr(ct.boto3, "client", _fake_boto_client)

    export_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)

    with pytest.raises(ValueError) as caught:
        ct.fetch({}, export_cfg, window_start, window_end, verbose=False)

    text = str(caught.value)
    assert "AWS error during CloudTrail export" in text
    # The original detail is embedded for diagnosis.
    assert "example.invalid" in text
193
+
194
+
195
def test_non_auth_client_error_is_wrapped(monkeypatch) -> None:
    """A non-auth ClientError (e.g. NoSuchBucket) must be wrapped, not propagated raw."""
    fake_s3 = FakeS3Client()
    fake_s3.set_list_error(_make_client_error("NoSuchBucket"))

    def _fake_boto_client(_svc):
        return fake_s3

    monkeypatch.setattr(ct.boto3, "client", _fake_boto_client)

    export_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)

    with pytest.raises(ValueError) as caught:
        ct.fetch({}, export_cfg, window_start, window_end, verbose=False)

    text = str(caught.value)
    assert "AWS error during CloudTrail export" in text
    assert "NoSuchBucket" in text
@@ -0,0 +1,393 @@
1
+ """Unit tests for loghunter.parsers.cloudtrail.parse_event.
2
+
3
+ Pure-function tests: no I/O, no DataFrames, no fixtures on disk. Every event is
4
+ built with the ``_event()`` helper so each test states only the field(s) it is
5
+ about. All values are synthetic per the privacy rail — RFC 5737 IPs only, AWS
6
+ documentation account ``123456789012``, and obvious-placeholder names.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from loghunter.parsers.cloudtrail import parse_event
12
+
13
+
14
+ _DOCS_ACCOUNT = "123456789012" # AWS documentation account id
15
+
16
+
17
def _event(**overrides) -> dict:
    """Return a minimal CloudTrail event dict with *overrides* applied on top.

    The defaults describe a single benign IAMUser ``GetObject`` call. Each
    override replaces a top-level key wholesale, so to vary the identity pass
    a complete ``userIdentity={...}`` dict. Nested shorthand such as
    ``userIdentity_type="AssumedRole"`` is not supported.
    """
    default_identity = {
        "type": "IAMUser",
        "userName": "placeholder-user",
        "principalId": "AIDAEXAMPLE",
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:user/placeholder-user",
    }
    defaults: dict = {
        "eventTime": "2026-06-01T12:00:00Z",
        "eventSource": "s3.amazonaws.com",
        "eventName": "GetObject",
        "eventID": "11111111-1111-1111-1111-111111111111",
        "awsRegion": "us-east-1",
        "sourceIPAddress": "192.0.2.10",
        "userIdentity": default_identity,
        "readOnly": True,
    }
    # Overrides win over defaults; unknown keys are simply appended.
    return {**defaults, **overrides}
43
+
44
+
45
+ # ── principal derivation ──────────────────────────────────────────────────────
46
+
47
def test_principal_assumed_role_uses_session_issuer_user_name() -> None:
    """For an AssumedRole identity the principal is the session issuer's ``userName``."""
    issuer = {
        "type": "Role",
        "principalId": "AROAEXAMPLE",
        "userName": "placeholder-role",
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:role/placeholder-role",
    }
    identity = {
        "type": "AssumedRole",
        "principalId": "AROAEXAMPLE:session-alpha",
        "arn": f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/placeholder-role/session-alpha",
        "sessionContext": {"sessionIssuer": issuer},
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["principal"] == "placeholder-role"
62
+
63
+
64
def test_principal_assumed_role_falls_back_to_arn_last_segment_when_no_username() -> None:
    """With no issuer ``userName`` the principal is the last segment of the issuer ARN."""
    # userName is deliberately left out of the issuer.
    issuer = {
        "type": "Role",
        "principalId": "AROAEXAMPLE",
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:role/placeholder-role",
    }
    identity = {
        "type": "AssumedRole",
        "principalId": "AROAEXAMPLE:session-alpha",
        "sessionContext": {"sessionIssuer": issuer},
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["principal"] == "placeholder-role"
78
+
79
+
80
def test_principal_assumed_role_is_stable_across_sessions_of_same_role() -> None:
    """Load-bearing: two events from different sessions of one role aggregate together."""
    shared_issuer = {
        "type": "Role",
        "principalId": "AROAEXAMPLE",
        "userName": "placeholder-role",
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:role/placeholder-role",
    }
    alpha_identity = {
        "type": "AssumedRole",
        "principalId": "AROAEXAMPLE:session-alpha",
        "arn": f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/placeholder-role/session-alpha",
        "sessionContext": {"sessionIssuer": shared_issuer},
    }
    beta_identity = {
        "type": "AssumedRole",
        "principalId": "AROAEXAMPLE:session-beta",
        "arn": f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/placeholder-role/session-beta",
        "sessionContext": {"sessionIssuer": shared_issuer},
    }
    alpha_principal = parse_event(_event(userIdentity=alpha_identity))["principal"]
    beta_principal = parse_event(_event(userIdentity=beta_identity))["principal"]
    assert alpha_principal == beta_principal == "placeholder-role"
    # The session name must never end up as the aggregation key.
    assert "session-alpha" not in alpha_principal and "session-beta" not in beta_principal
105
+
106
+
107
def test_principal_iam_user_uses_user_name() -> None:
    """For an IAMUser identity the principal is the ``userName`` field."""
    identity = {
        "type": "IAMUser",
        "userName": "placeholder-user",
        "principalId": "AIDAEXAMPLE",
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:user/placeholder-user",
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["principal"] == "placeholder-user"
115
+
116
+
117
def test_principal_iam_user_falls_back_to_arn_last_segment() -> None:
    """An IAMUser without ``userName`` takes its principal from the ARN's last segment."""
    # userName is deliberately left out.
    identity = {
        "type": "IAMUser",
        "principalId": "AIDAEXAMPLE",
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:user/arn-derived-name",
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["principal"] == "arn-derived-name"
125
+
126
+
127
def test_principal_aws_service_uses_invoked_by() -> None:
    """For an AWSService identity the principal is the ``invokedBy`` value."""
    identity = {"type": "AWSService", "invokedBy": "ec2.amazonaws.com"}
    row = parse_event(_event(userIdentity=identity))
    assert row["principal"] == "ec2.amazonaws.com"
133
+
134
+
135
def test_principal_root_returns_root_literal() -> None:
    """A Root identity maps to the literal principal ``"root"``."""
    identity = {
        "type": "Root",
        "principalId": _DOCS_ACCOUNT,
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:root",
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["principal"] == "root"
142
+
143
+
144
def test_principal_federated_user_falls_back_to_principal_id() -> None:
    """A FederatedUser carrying only ``principalId`` uses that id as the principal."""
    federated_id = f"{_DOCS_ACCOUNT}:placeholder-federated"
    identity = {"type": "FederatedUser", "principalId": federated_id}
    row = parse_event(_event(userIdentity=identity))
    assert row["principal"] == federated_id
150
+
151
+
152
def test_principal_saml_user_falls_back_to_principal_id() -> None:
    """A SAMLUser carrying only ``principalId`` uses that id as the principal."""
    identity = {"type": "SAMLUser", "principalId": "SAMLEXAMPLE:placeholder-saml"}
    row = parse_event(_event(userIdentity=identity))
    assert row["principal"] == "SAMLEXAMPLE:placeholder-saml"
158
+
159
+
160
def test_principal_missing_user_identity_returns_unknown_without_raising() -> None:
    """An event with no ``userIdentity`` key parses to principal ``"unknown"``."""
    stripped = _event()
    del stripped["userIdentity"]
    row = parse_event(stripped)
    assert row["principal"] == "unknown"
164
+
165
+
166
def test_principal_non_dict_user_identity_returns_unknown_without_raising() -> None:
    """A ``userIdentity`` that is a string rather than a dict parses to ``"unknown"``."""
    row = parse_event(_event(userIdentity="not-a-dict"))
    assert row["principal"] == "unknown"
169
+
170
+
171
def test_principal_distinct_principal_ids_under_unknown_type_stay_distinct() -> None:
    """Two principals under an unrecognised identity type keep their own ``principalId``."""
    first = _event(userIdentity={"type": "FutureUnknownType", "principalId": "AAA-EXAMPLE"})
    second = _event(userIdentity={"type": "FutureUnknownType", "principalId": "BBB-EXAMPLE"})
    first_principal = parse_event(first)["principal"]
    second_principal = parse_event(second)["principal"]
    assert first_principal == "AAA-EXAMPLE"
    assert second_principal == "BBB-EXAMPLE"
    # Parse again for the inequality check so it does not lean on the locals above.
    assert parse_event(first)["principal"] != parse_event(second)["principal"]
177
+
178
+
179
+ # ── lane derivation ───────────────────────────────────────────────────────────
180
+
181
def test_lane_aws_service_type_is_service() -> None:
    """An AWSService identity is placed in the ``service`` lane."""
    identity = {"type": "AWSService", "invokedBy": "lambda.amazonaws.com"}
    row = parse_event(_event(userIdentity=identity))
    assert row["lane"] == "service"
184
+
185
+
186
def test_lane_aws_account_type_is_service() -> None:
    """An AWSAccount identity is placed in the ``service`` lane."""
    identity = {"type": "AWSAccount", "principalId": "EXAMPLEACCT"}
    row = parse_event(_event(userIdentity=identity))
    assert row["lane"] == "service"
189
+
190
+
191
def test_lane_invoked_by_amazonaws_com_is_service() -> None:
    """An AssumedRole event with ``invokedBy`` = config.amazonaws.com lands in ``service``."""
    issuer = {"type": "Role", "userName": "placeholder-role"}
    identity = {
        "type": "AssumedRole",
        "invokedBy": "config.amazonaws.com",
        "sessionContext": {"sessionIssuer": issuer},
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["lane"] == "service"
201
+
202
+
203
def test_lane_service_role_in_arn_is_service() -> None:
    """An identity ARN naming an ``AWSServiceRoleFor...`` role lands in ``service``."""
    role_arn = (
        f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/"
        "AWSServiceRoleForPlaceholder/session-x"
    )
    identity = {
        "type": "AssumedRole",
        "principalId": "AROAEXAMPLE:session-x",
        "arn": role_arn,
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["lane"] == "service"
213
+
214
+
215
def test_lane_service_role_in_session_issuer_arn_is_service() -> None:
    """A service-role ARN on the session issuer alone is enough for the ``service`` lane."""
    issuer = {
        "type": "Role",
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:role/aws-service-role/AWSServiceRoleForExample",
    }
    identity = {
        "type": "AssumedRole",
        "principalId": "AROAEXAMPLE:session-x",
        # The top-level ARN looks harmless; only the issuer ARN names a service role.
        "arn": f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/innocuous/session-x",
        "sessionContext": {"sessionIssuer": issuer},
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["lane"] == "service"
226
+
227
+
228
def test_lane_plain_iam_user_is_interactive() -> None:
    """The default fixture (a plain IAMUser) is placed in the ``interactive`` lane."""
    row = parse_event(_event())
    assert row["lane"] == "interactive"
230
+
231
+
232
def test_lane_human_assumed_role_is_interactive() -> None:
    """An AssumedRole with an ordinary (non-service) role lands in ``interactive``."""
    issuer = {
        "type": "Role",
        "userName": "placeholder-role",
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:role/placeholder-role",
    }
    identity = {
        "type": "AssumedRole",
        "principalId": "AROAEXAMPLE:session-x",
        "arn": f"arn:aws:sts::{_DOCS_ACCOUNT}:assumed-role/placeholder-role/session-x",
        "sessionContext": {"sessionIssuer": issuer},
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["lane"] == "interactive"
244
+
245
+
246
def test_lane_root_is_interactive() -> None:
    """A Root identity is placed in the ``interactive`` lane."""
    identity = {
        "type": "Root",
        "principalId": _DOCS_ACCOUNT,
        "arn": f"arn:aws:iam::{_DOCS_ACCOUNT}:root",
    }
    row = parse_event(_event(userIdentity=identity))
    assert row["lane"] == "interactive"
253
+
254
+
255
+ # ── read_write derivation ─────────────────────────────────────────────────────
256
+
257
def test_read_write_boolean_true_is_read() -> None:
    """Boolean ``readOnly=True`` maps to ``"read"``."""
    row = parse_event(_event(readOnly=True))
    assert row["read_write"] == "read"
259
+
260
+
261
def test_read_write_boolean_false_is_write() -> None:
    """Boolean ``readOnly=False`` maps to ``"write"``."""
    row = parse_event(_event(readOnly=False))
    assert row["read_write"] == "write"
263
+
264
+
265
def test_read_write_string_true_is_read() -> None:
    """String ``readOnly="true"`` maps to ``"read"``."""
    row = parse_event(_event(readOnly="true"))
    assert row["read_write"] == "read"
267
+
268
+
269
def test_read_write_string_false_is_write() -> None:
    """String ``readOnly="false"`` maps to ``"write"``."""
    row = parse_event(_event(readOnly="false"))
    assert row["read_write"] == "write"
271
+
272
+
273
def test_read_write_absent_get_verb_is_read() -> None:
    """Without ``readOnly``, the event name ``GetCallerIdentity`` classifies as ``"read"``."""
    no_flag = _event(eventName="GetCallerIdentity")
    del no_flag["readOnly"]
    row = parse_event(no_flag)
    assert row["read_write"] == "read"
277
+
278
+
279
def test_read_write_absent_list_verb_is_read() -> None:
    """Without ``readOnly``, the event name ``ListBuckets`` classifies as ``"read"``."""
    no_flag = _event(eventName="ListBuckets")
    del no_flag["readOnly"]
    row = parse_event(no_flag)
    assert row["read_write"] == "read"
283
+
284
+
285
def test_read_write_absent_put_verb_is_write() -> None:
    """Without ``readOnly``, the event name ``PutObject`` classifies as ``"write"``."""
    no_flag = _event(eventName="PutObject")
    del no_flag["readOnly"]
    row = parse_event(no_flag)
    assert row["read_write"] == "write"
289
+
290
+
291
def test_read_write_absent_delete_verb_is_write() -> None:
    """Without ``readOnly``, the event name ``DeleteBucket`` classifies as ``"write"``."""
    no_flag = _event(eventName="DeleteBucket")
    del no_flag["readOnly"]
    row = parse_event(no_flag)
    assert row["read_write"] == "write"
295
+
296
+
297
def test_read_write_absent_run_instances_is_write() -> None:
    """Without ``readOnly``, the event name ``RunInstances`` classifies as ``"write"``."""
    no_flag = _event(eventName="RunInstances")
    del no_flag["readOnly"]
    row = parse_event(no_flag)
    assert row["read_write"] == "write"
301
+
302
+
303
def test_read_write_absent_empty_event_name_is_write() -> None:
    """Without ``readOnly`` and with an empty event name, the result is ``"write"``."""
    no_flag = _event(eventName="")
    del no_flag["readOnly"]
    row = parse_event(no_flag)
    assert row["read_write"] == "write"
307
+
308
+
309
+ # ── ts derivation ─────────────────────────────────────────────────────────────
310
+
311
def test_ts_valid_event_time_parses_to_epoch_float() -> None:
    """A well-formed ``eventTime`` yields ``ts`` as a float of epoch seconds."""
    from datetime import datetime

    # Derive the expected epoch from the same instant via fromisoformat so
    # the test carries no hard-coded magic number.
    expected = datetime.fromisoformat("2026-06-01T12:00:00+00:00").timestamp()

    ts = parse_event(_event(eventTime="2026-06-01T12:00:00Z"))["ts"]
    assert isinstance(ts, float)
    assert ts == expected
320
+
321
+
322
def test_ts_missing_event_time_is_none() -> None:
    """An event with no ``eventTime`` key parses with ``ts`` set to None."""
    untimed = _event()
    del untimed["eventTime"]
    row = parse_event(untimed)
    assert row["ts"] is None
326
+
327
+
328
def test_ts_garbage_event_time_is_none() -> None:
    """An unparseable ``eventTime`` string parses with ``ts`` set to None."""
    row = parse_event(_event(eventTime="not-a-timestamp"))
    assert row["ts"] is None
331
+
332
+
333
+ # ── Carried fields ────────────────────────────────────────────────────────────
334
+
335
# The twelve canonical keys every parsed row is expected to carry.
_ALL_KEYS = {
    # derived fields
    "ts",
    "principal",
    "lane",
    "read_write",
    # carried fields
    "event_source",
    "event_name",
    "identity_type",
    "source_ip",
    "error_code",
    "aws_region",
    "event_id",
    # the untouched original event
    "raw",
}


def test_every_row_has_all_twelve_canonical_keys() -> None:
    """A parsed default event exposes exactly the keys in ``_ALL_KEYS``."""
    produced_keys = set(parse_event(_event()).keys())
    assert produced_keys == _ALL_KEYS
345
+
346
+
347
def test_error_code_is_none_on_success_events() -> None:
    """With no ``errorCode`` key on the event, ``error_code`` is None."""
    # The default fixture has no errorCode key: that is the success path.
    row = parse_event(_event())
    assert row["error_code"] is None
350
+
351
+
352
def test_error_code_carried_when_present() -> None:
    """An ``errorCode`` on the event is carried through as ``error_code``."""
    row = parse_event(_event(errorCode="AccessDenied"))
    assert row["error_code"] == "AccessDenied"
354
+
355
+
356
def test_event_source_carried_verbatim_no_suffix_strip() -> None:
    """``eventSource`` is carried as-is, suffix included."""
    # The full suffix is part of the analyst's pivot: never strip "amazonaws.com".
    row = parse_event(_event(eventSource="s3.amazonaws.com"))
    assert row["event_source"] == "s3.amazonaws.com"
359
+
360
+
361
def test_carried_fields_pass_through_unchanged() -> None:
    """The default fixture's plain fields show up unchanged under their row keys."""
    parsed = parse_event(_event())
    expected_by_key = {
        "event_name": "GetObject",
        "identity_type": "IAMUser",
        "source_ip": "192.0.2.10",
        "aws_region": "us-east-1",
        "event_id": "11111111-1111-1111-1111-111111111111",
    }
    for key, expected in expected_by_key.items():
        assert parsed[key] == expected
368
+
369
+
370
def test_raw_holds_original_event_dict() -> None:
    """``raw`` is the very same dict object that was passed in, extra keys included."""
    source_event = _event(extraField="future-detector-fodder")
    raw = parse_event(source_event)["raw"]
    assert raw is source_event
    assert raw["extraField"] == "future-detector-fodder"
375
+
376
+
377
def test_identity_type_none_when_user_identity_missing() -> None:
    """With no ``userIdentity`` key, ``identity_type`` is None."""
    stripped = _event()
    del stripped["userIdentity"]
    row = parse_event(stripped)
    assert row["identity_type"] is None
381
+
382
+
383
def test_identity_type_none_when_user_identity_not_dict() -> None:
    """With a non-dict ``userIdentity`` (here the int 42), ``identity_type`` is None."""
    row = parse_event(_event(userIdentity=42))
    assert row["identity_type"] is None
385
+
386
+
387
+ # ── Defensive non-dict input ──────────────────────────────────────────────────
388
+
389
def test_parse_event_returns_none_for_non_dict_input() -> None:
    """``parse_event`` returns None for None, a str, a list, and an int."""
    non_dict_inputs = (None, "string", [{"eventName": "GetObject"}], 42)
    for bogus in non_dict_inputs:
        assert parse_event(bogus) is None
@@ -0,0 +1,85 @@
1
+ """Tests for the HDBSCAN backend shim — resolution and exposure only.
2
+
3
+ Contract under test (loghunter.common.clustering):
4
+
5
+ 1. ``HDBSCAN`` is exposed as a class at module level, constructable with the
6
+ standard ``min_cluster_size=`` and ``min_samples=`` kwargs and exposing
7
+ ``fit_predict``.
8
+ 2. ``ACTIVE_BACKEND`` is one of ``{"fast_hdbscan", "hdbscan"}`` and matches
9
+ whichever backend is actually importable in the current environment, in
10
+ the same priority order the shim itself uses.
11
+ 3. When ``fast_hdbscan`` is force-blocked at import time, the shim falls
12
+ back to stock ``hdbscan`` and reports ``ACTIVE_BACKEND == "hdbscan"``.
13
+
14
+ Clustering numerics and equivalence between the two backends are out of
15
+ scope — that lives with the dns detector tests.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import importlib
21
+ import sys
22
+
23
+ import pytest
24
+
25
+ from loghunter.common import clustering
26
+
27
+
28
def _expected_backend_in_env() -> str:
    """Work out which backend name the test environment should resolve to.

    Tries ``fast_hdbscan`` first and stock ``hdbscan`` second, returning the
    name of the first one that imports. If neither imports, the calling test
    is failed via ``pytest.fail`` with a message stating that the base
    dependency is missing.
    """
    try:
        import fast_hdbscan  # noqa: F401
    except ImportError:
        pass
    else:
        return "fast_hdbscan"

    try:
        import hdbscan  # noqa: F401
    except ImportError as e:
        pytest.fail(
            "Neither fast_hdbscan nor hdbscan is importable in the test "
            "environment. hdbscan is the base dependency of loghunt and "
            "must be present for the shim to resolve. Original error: "
            f"{e!r}"
        )
    return "hdbscan"
49
+
50
+
51
def test_shim_exposes_constructable_hdbscan_class():
    """``clustering.HDBSCAN`` is a real class whose instances offer ``fit_predict``."""
    hdbscan_cls = clustering.HDBSCAN
    assert isinstance(hdbscan_cls, type), "HDBSCAN must be exposed as a class, not a factory"

    clusterer = hdbscan_cls(min_cluster_size=5, min_samples=2)
    assert hasattr(clusterer, "fit_predict"), "HDBSCAN instance must expose fit_predict"
56
+
57
+
58
def test_active_backend_is_one_of_expected_strings():
    """``ACTIVE_BACKEND`` is one of the two supported backend names."""
    allowed_backends = {"fast_hdbscan", "hdbscan"}
    assert clustering.ACTIVE_BACKEND in allowed_backends
60
+
61
+
62
def test_active_backend_matches_environment():
    """``ACTIVE_BACKEND`` equals the backend this environment can actually import."""
    expected = _expected_backend_in_env()
    assert clustering.ACTIVE_BACKEND == expected
64
+
65
+
66
def test_fallback_resolves_to_hdbscan_when_fast_hdbscan_blocked(monkeypatch):
    """Force-block fast_hdbscan and reload; the shim must fall through to hdbscan.

    Uses the standard ``sys.modules[name] = None`` sentinel pattern: when the
    import machinery sees None in ``sys.modules`` for a name, it raises
    ``ModuleNotFoundError`` rather than attempting to resolve the module.
    That gives deterministic fallback coverage regardless of whether
    fast_hdbscan is actually installed on disk.

    The sentinel lives inside ``monkeypatch.context()`` so the original
    ``sys.modules`` entry is back in place *before* the final reload. Popping
    the key by hand and reloading would import a fresh ``fast_hdbscan`` module
    object, which the fixture teardown would then replace with the original,
    leaving ``clustering`` bound to a module that is no longer the one in
    ``sys.modules``.
    """
    try:
        with monkeypatch.context() as blocked:
            blocked.setitem(sys.modules, "fast_hdbscan", None)
            importlib.reload(clustering)
            assert clustering.ACTIVE_BACKEND == "hdbscan"
            cls = clustering.HDBSCAN
            assert isinstance(cls, type)
            instance = cls(min_cluster_size=5, min_samples=2)
            assert hasattr(instance, "fit_predict")
    finally:
        # The context manager has already undone the sentinel (on success or
        # failure), so this reload re-resolves the backend against the real
        # environment for the tests that run afterwards.
        importlib.reload(clustering)