loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,631 @@
1
+ """Tests for the CloudTrail S3 exporter framework.
2
+
3
+ No live S3 connection — boto3 is mocked via a hand-rolled FakeS3Client.
4
+ All bucket names and account IDs are obviously fake.
5
+
6
+ botocore-dependent tests (those constructing real ClientError / credential /
7
+ MissingDependency classes) live in tests/test_cloudtrail_exporter_botocore.py
8
+ behind an importorskip. This file is intentionally botocore-free so it runs
9
+ on a base checkout — see CODE.md "Architecture tests cover boundaries".
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import io
15
+ import json
16
+ import sys
17
+ from datetime import datetime, timedelta, timezone
18
+ from pathlib import Path
19
+ from typing import Any
20
+
21
+ import pytest
22
+
23
+ from loghunter.common.errors import ExportAborted
24
+ from loghunter.exporters import _resolve_output_path
25
+ from loghunter.exporters import cloudtrail as ct
26
+
27
+ from tests._cloudtrail_fakes import FakeS3Client, _gz_envelope
28
+
29
+
30
+ # ── module-level contract ─────────────────────────────────────────────────────
31
+
32
+
33
def test_is_configured() -> None:
    """is_configured() accepts a populated S3 path and rejects absent or blank ones."""
    assert ct.is_configured({"path": "s3://example-trail-bucket/AWSLogs/"})
    # Missing key, empty string, and whitespace-only all count as unconfigured.
    for backend_cfg in ({}, {"path": ""}, {"path": " "}):
        assert not ct.is_configured(backend_cfg)
38
+
39
+
40
def test_summary_descriptor() -> None:
    """summary_descriptor() echoes the configured path, or "" when no path is set."""
    trail_path = "s3://example-trail-bucket/AWSLogs/"
    assert ct.summary_descriptor({"path": trail_path}) == trail_path
    assert ct.summary_descriptor({}) == ""
44
+
45
+
46
def test_implicit_default_query_and_extension() -> None:
    """Pin the implicit default query's basename and the module's output extension."""
    # The filename fix relies on the basename being explicit, so check it directly.
    default_query = ct.implicit_default_query()
    assert default_query == {"output_basename": "cloudtrail"}
    assert ct.OUTPUT_EXTENSION == ".json.log"
50
+
51
+
52
def test_filename_json_log(tmp_path: Path) -> None:
    """With extension=".json.log" the resolved name is cloudtrail_<date>_<N>d.json.log."""
    window_start = datetime(2026, 6, 1, 0, 0, 0)
    window_end = datetime(2026, 6, 8, 0, 0, 0)
    resolved = _resolve_output_path(
        {"output_basename": "cloudtrail"},
        str(tmp_path),
        window_start,
        window_end,
        "default",
        extension=".json.log",
        backend_config={},
        loghunter_config={},
    )
    assert resolved.name == "cloudtrail_20260601_7d.json.log"
63
+
64
+
65
+ # ── _parse_s3_path ────────────────────────────────────────────────────────────
66
+
67
+
68
def test_parse_s3_path_with_prefix() -> None:
    """A bucket-plus-prefix URL is split into its bucket and prefix parts."""
    bucket, prefix = ct._parse_s3_path("s3://example-trail-bucket/AWSLogs/")
    assert (bucket, prefix) == ("example-trail-bucket", "AWSLogs/")
72
+
73
+
74
def test_parse_s3_path_root_only() -> None:
    """A bare bucket URL yields the bucket name and an empty prefix."""
    bucket, prefix = ct._parse_s3_path("s3://example-trail-bucket")
    assert (bucket, prefix) == ("example-trail-bucket", "")
78
+
79
+
80
def test_parse_s3_path_appends_trailing_slash() -> None:
    """A prefix supplied without a trailing slash comes back with one."""
    parsed = ct._parse_s3_path("s3://example-trail-bucket/AWSLogs")
    _, prefix = parsed
    assert prefix == "AWSLogs/"
83
+
84
+
85
def test_parse_s3_path_bad_scheme() -> None:
    """A non-s3:// URL raises ValueError with the scheme requirement in the message."""
    wrong_scheme_url = "https://example-trail-bucket/"
    with pytest.raises(ValueError, match="must start with s3://"):
        ct._parse_s3_path(wrong_scheme_url)
88
+
89
+
90
+ # ── _enumerate_days (day-range overlap) ───────────────────────────────────────
91
+
92
+
93
def test_day_range_excludes_midnight_upper_bound() -> None:
    """A window ending exactly at midnight does not pull in the following day."""
    # tz-aware UTC bounds keep the assertion independent of the runner's
    # local timezone (S3 partitions are UTC-keyed).
    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)
    # [2026-06-01 00:00 UTC, 2026-06-02 00:00 UTC) overlaps June 1 only.
    assert ct._enumerate_days(window_start, window_end) == [(2026, 6, 1)]
101
+
102
+
103
def test_day_range_includes_second_day_when_window_spills_in() -> None:
    """An upper bound one second past midnight brings the next day into range."""
    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    one_second_into_next_day = datetime(2026, 6, 2, 0, 0, 1, tzinfo=timezone.utc)
    expected_days = [(2026, 6, 1), (2026, 6, 2)]
    assert ct._enumerate_days(window_start, one_second_into_next_day) == expected_days
109
+
110
+
111
def test_day_range_local_window_includes_utc_spillover_day() -> None:
    """P1 regression: a non-UTC local window is enumerated as UTC days.

    The window 2026-06-01 00:00 -0500 → 2026-06-02 00:00 -0500 equals
    2026-06-01 05:00 UTC → 2026-06-02 05:00 UTC, so it touches two UTC date
    partitions. Before the fix only [(2026, 6, 1)] came back and events
    stored under 2026/06/02/ were missed.
    """
    utc_minus_5 = timezone(timedelta(hours=-5))
    local_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=utc_minus_5)
    local_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=utc_minus_5)
    enumerated = ct._enumerate_days(local_start, local_end)
    assert enumerated == [(2026, 6, 1), (2026, 6, 2)]
122
+
123
+
124
def test_day_range_empty_for_zero_or_inverted_window() -> None:
    """Zero-length and inverted windows enumerate no days at all."""
    noon = datetime(2026, 6, 1, 12, 0, 0, tzinfo=timezone.utc)
    an_hour_earlier = noon.replace(hour=11)
    assert ct._enumerate_days(noon, noon) == []
    assert ct._enumerate_days(noon, an_hour_earlier) == []
128
+
129
+
130
+ # ── _has_cloudtrail_segment ───────────────────────────────────────────────────
131
+
132
+
133
def test_cloudtrail_segment_detection() -> None:
    """_has_cloudtrail_segment() accepts CloudTrail paths, rejects ELB and Digest ones."""
    matching = (
        "AWSLogs/000000000000/CloudTrail/us-east-1/",
        "CloudTrail/us-east-1/",
    )
    non_matching = (
        "AWSLogs/000000000000/elasticloadbalancing/us-east-1/",
        # CloudTrail-Digest is a different segment and must not count as a match.
        "AWSLogs/000000000000/CloudTrail-Digest/us-east-1/",
    )
    for prefix in matching:
        assert ct._has_cloudtrail_segment(prefix)
    for prefix in non_matching:
        assert not ct._has_cloudtrail_segment(prefix)
139
+
140
+
141
+ # ── _split_name ───────────────────────────────────────────────────────────────
142
+
143
+
144
def test_split_name_inserts_part_before_double_suffix() -> None:
    """_split_name() places a two-digit _partNN ahead of the .json.log suffix."""
    base = Path("/tmp/cloudtrail_20260601_7d.json.log")
    expected_by_part = {
        1: "cloudtrail_20260601_7d_part01.json.log",
        12: "cloudtrail_20260601_7d_part12.json.log",
    }
    for part_number, expected_name in expected_by_part.items():
        assert ct._split_name(base, part_number).name == expected_name
150
+
151
+
152
+ # ── prefix construction / year-invariant walk ────────────────────────────────
153
+
154
+
155
def _build_classic_bucket() -> FakeS3Client:
    """Return a fake bucket holding one object in the classic CloudTrail layout.

    Layout: s3://example-trail-bucket/AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/...
    """
    bucket = FakeS3Client()
    day_prefix = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
    payload = _gz_envelope(
        [{"eventTime": "2026-06-01T12:00:00Z", "eventName": "RunInstances"}]
    )
    bucket.add_object(day_prefix + "obj1.json.gz", payload)
    return bucket
162
+
163
+
164
def test_prefix_construction_classic(monkeypatch) -> None:
    """fetch() returns the single event stored in the classic bucket layout."""
    fake_s3 = _build_classic_bucket()
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)
    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    fetched, meta = ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    assert meta == {"units": 1, "unit_label": "objects"}
    assert len(fetched) == 1
    only_event = fetched[0]
    assert only_event["eventName"] == "RunInstances"
176
+
177
+
178
def test_prefix_construction_org_layout(monkeypatch) -> None:
    """Org trails put an o-xxxx segment ahead of the account id; the walk still finds years."""
    org_day_prefix = "AWSLogs/o-aaaa1111/000000000000/CloudTrail/us-east-1/2026/06/01/"
    seeded = [{"eventTime": "2026-06-01T01:23:45Z", "eventName": "AssumeRole"}]
    fake_s3 = FakeS3Client()
    fake_s3.add_object(org_day_prefix + "obj1.json.gz", _gz_envelope(seeded))
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)
    fetched, meta = ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    assert meta["units"] == 1
    assert fetched[0]["eventName"] == "AssumeRole"
192
+
193
+
194
def test_cloudtrail_digest_branch_skipped(monkeypatch) -> None:
    """Objects under a sibling CloudTrail-Digest tree never reach the result."""
    fake_s3 = _build_classic_bucket()
    # Plant a Digest sibling that mirrors the year tree and carries a marker event.
    digest_prefix = "AWSLogs/000000000000/CloudTrail-Digest/us-east-1/2026/06/01/"
    digest_payload = _gz_envelope([
        {"eventTime": "2026-06-01T12:00:00Z", "eventName": "DIGEST_MARKER"},
    ])
    fake_s3.add_object(digest_prefix + "manifest.json.gz", digest_payload)
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)
    fetched, meta = ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    # One object counted (the CloudTrail event tree); the digest marker is absent.
    assert meta["units"] == 1
    assert not any(event["eventName"] == "DIGEST_MARKER" for event in fetched)
211
+
212
+
213
def test_non_cloudtrail_year_tree_rejected(monkeypatch) -> None:
    """An ELB tree with the same YYYY/MM/DD layout is not treated as CloudTrail data."""
    elb_prefix = "AWSLogs/000000000000/elasticloadbalancing/us-east-1/2026/06/01/"
    elb_payload = _gz_envelope([
        {"eventTime": "2026-06-01T12:00:00Z", "eventName": "ELB_EVENT"},
    ])
    fake_s3 = FakeS3Client()
    fake_s3.add_object(elb_prefix + "obj1.json.gz", elb_payload)
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)
    # With only the ELB tree present, fetch() reports that nothing was found.
    with pytest.raises(ValueError, match="no CloudTrail objects found"):
        ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)
227
+
228
+
229
def test_is_cloudtrail_ancestor_segment() -> None:
    """The ancestor heuristic accepts structural parents of CloudTrail/ and nothing else."""
    # Segments that can sit above CloudTrail/ in standard AWS layouts.
    accepted = (
        "AWSLogs/",
        "CloudTrail/",
        "000000000000/",  # 12-digit account
        "12345/",  # lenient on digit count
        "o-aaaa1111/",  # AWS organization id
    )
    # Sibling AWS service trees and anything else.
    rejected = (
        "elasticloadbalancing/",
        "RDS/",
        "vpc-flow-logs/",
        # Digest is also blocked by the explicit digest check, but the
        # heuristic has to reject it on its own too.
        "CloudTrail-Digest/",
    )
    for segment in accepted:
        assert ct._is_cloudtrail_ancestor_segment(segment)
    for segment in rejected:
        assert not ct._is_cloudtrail_ancestor_segment(segment)
244
+
245
+
246
def test_walk_does_not_descend_non_cloudtrail_service_branches(monkeypatch) -> None:
    """P2: the walker never lists inside elasticloadbalancing/ and similar branches.

    One account holds a real CloudTrail tree (with events) next to a sibling
    ELB tree. Only CloudTrail/ may be descended, which is checked by scanning
    every recorded list-call prefix for 'elasticloadbalancing'.
    """
    cloudtrail_prefix = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
    elb_prefix = "AWSLogs/000000000000/elasticloadbalancing/us-east-1/2026/06/01/"
    fake_s3 = FakeS3Client()
    fake_s3.add_object(
        cloudtrail_prefix + "obj1.json.gz",
        _gz_envelope([{"eventTime": "2026-06-01T01:00:00Z", "eventName": "Good"}]),
    )
    fake_s3.add_object(elb_prefix + "elb.log", b"some content")
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=timezone.utc)
    fetched, meta = ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    assert meta["units"] == 1
    assert fetched[0]["eventName"] == "Good"
    # No recorded list call may point into the ELB branch.
    for listed_prefix in fake_s3.list_prefix_log:
        if "elasticloadbalancing" in listed_prefix:
            raise AssertionError(
                f"walker listed inside non-CloudTrail branch: {listed_prefix!r}"
            )
274
+
275
+
276
def test_fetch_includes_utc_spillover_day_under_local_window(monkeypatch) -> None:
    """P1 end-to-end: a UTC-5 local window also fetches events from the next UTC day.

    The window 2026-06-01 00:00 -0500 → 2026-06-02 00:00 -0500 runs from
    05:00 UTC to 05:00 UTC of the following UTC day. An event stamped
    2026-06-02T03:00:00Z therefore lives in the 2026/06/02/ partition and is
    still inside the precise window. Before the fix, _enumerate_days listed
    only 2026/06/01/ and this event was silently dropped.
    """
    day_one = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
    day_two = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/02/"
    fake_s3 = FakeS3Client()
    fake_s3.add_object(day_one + "a.json.gz", _gz_envelope([
        {"eventTime": "2026-06-01T12:00:00Z", "eventName": "InDay1"},
    ]))
    fake_s3.add_object(day_two + "b.json.gz", _gz_envelope([
        {"eventTime": "2026-06-02T03:00:00Z", "eventName": "InUtcSpillover"},
        {"eventTime": "2026-06-02T10:00:00Z", "eventName": "PastWindow"},
    ]))
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    utc_minus_5 = timezone(timedelta(hours=-5))
    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    local_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=utc_minus_5)
    local_end = datetime(2026, 6, 2, 0, 0, 0, tzinfo=utc_minus_5)
    fetched, meta = ct.fetch({}, backend_cfg, local_start, local_end, verbose=False)

    # Both day partitions were listed.
    assert meta["units"] == 2
    # The spillover event survives; the event past the window is trimmed.
    seen_names = [event["eventName"] for event in fetched]
    for expected_name in ("InDay1", "InUtcSpillover"):
        assert expected_name in seen_names
    assert "PastWindow" not in seen_names
310
+
311
+
312
def test_empty_result_raises(monkeypatch) -> None:
    """fetch() against a completely empty bucket raises ValueError."""
    empty_bucket = FakeS3Client()
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: empty_bucket)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    with pytest.raises(ValueError, match="no CloudTrail objects found"):
        ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)
320
+
321
+
322
+ # ── bad-object handling ──────────────────────────────────────────────────────
323
+
324
+
325
def test_bad_object_skipped_with_warning(monkeypatch, capsys) -> None:
    """A non-gzip object is counted, skipped, and reported on stderr."""
    day_prefix = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
    readable_events = [{"eventTime": "2026-06-01T12:00:00Z", "eventName": "Good"}]
    fake_s3 = FakeS3Client()
    fake_s3.add_object(day_prefix + "good.json.gz", _gz_envelope(readable_events))
    fake_s3.add_object(day_prefix + "corrupt.json.gz", b"this is not gzip")
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    fetched, meta = ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    # Both objects are counted as units, but only the readable one yields an event.
    assert meta["units"] == 2
    assert len(fetched) == 1
    assert fetched[0]["eventName"] == "Good"

    stderr_text = capsys.readouterr().err
    for expected_fragment in ("skipped unreadable object:", "corrupt.json.gz"):
        assert expected_fragment in stderr_text
344
+
345
+
346
+ # ── egress guard ─────────────────────────────────────────────────────────────
347
+
348
+
349
def _bucket_with_large_object(monkeypatch, body_size: int) -> FakeS3Client:
    """Install and return a fake bucket whose only object reports `body_size` bytes."""
    fake_s3 = FakeS3Client()
    day_prefix = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
    # The real payload stays tiny so parsing works; only the reported size is inflated.
    tiny_payload = _gz_envelope(
        [{"eventTime": "2026-06-01T01:00:00Z", "eventName": "x"}]
    )
    fake_s3.add_object(day_prefix + "obj1.json.gz", tiny_payload, size=body_size)
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)
    return fake_s3
359
+
360
+
361
def test_egress_guard_fires_and_user_declines(monkeypatch) -> None:
    """Answering "n" to the egress prompt on an over-threshold listing aborts the export."""
    ten_gb = 10 * 10**9
    _bucket_with_large_object(monkeypatch, body_size=ten_gb)
    monkeypatch.setattr("builtins.input", lambda *_: "n")

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 5.0}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    with pytest.raises(ExportAborted, match="aborted by user"):
        ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)
369
+
370
+
371
def test_egress_guard_user_accepts(monkeypatch) -> None:
    """Answering "y" to the egress prompt lets the fetch run to completion."""
    ten_gb = 10 * 10**9
    _bucket_with_large_object(monkeypatch, body_size=ten_gb)
    monkeypatch.setattr("builtins.input", lambda *_: "y")

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 5.0}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    fetched, meta = ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    assert meta["units"] == 1
    assert len(fetched) == 1
380
+
381
+
382
def test_egress_guard_bypassed_by_skip_confirm(monkeypatch) -> None:
    """With skip_confirm=True the egress prompt is never shown and the fetch succeeds."""
    _bucket_with_large_object(monkeypatch, body_size=10 * 10**9)

    # Every call to input() is recorded; the list must stay empty.
    prompt_calls: list[bool] = []

    def _recording_input(*_args, **_kw):
        prompt_calls.append(True)
        return "n"

    monkeypatch.setattr("builtins.input", _recording_input)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 5.0}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    fetched, meta = ct.fetch(
        {}, backend_cfg, window_start, window_end, verbose=False, skip_confirm=True,
    )

    assert prompt_calls == []
    assert meta["units"] == 1
    assert len(fetched) == 1
400
+
401
+
402
def test_egress_guard_below_threshold_does_not_prompt(monkeypatch) -> None:
    """A listing far below egress_warn_gb completes without calling input()."""
    # 1000 bytes is well under the 5 GB threshold configured below.
    _bucket_with_large_object(monkeypatch, body_size=1000)

    def _no_input(*_a, **_kw):
        raise AssertionError("egress prompt must not fire below threshold")

    monkeypatch.setattr("builtins.input", _no_input)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 5.0}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    fetched, _meta = ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)
    assert len(fetched) == 1
414
+
415
+
416
+ # ── eventTime sort + window trim ─────────────────────────────────────────────
417
+
418
+
419
def test_event_time_sort_and_trim(monkeypatch) -> None:
    """Out-of-order events spread over two days come back sorted and window-trimmed."""
    day_one = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
    day_two = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/02/"
    # One object per day; eventTime order inside each object is deliberately scrambled.
    day_one_events = [
        {"eventTime": "2026-06-01T18:00:00Z", "eventName": "E_LATE"},
        {"eventTime": "2026-06-01T08:00:00Z", "eventName": "E_EARLY"},
        {"eventTime": "2026-05-31T23:00:00Z", "eventName": "E_BEFORE_WINDOW"},
    ]
    day_two_events = [
        # Later than the precise upper bound, so it has to be trimmed.
        {"eventTime": "2026-06-02T05:00:00Z", "eventName": "E_AFTER_WINDOW"},
        # Still inside the window given the upper bound chosen below.
        {"eventTime": "2026-06-02T00:00:00Z", "eventName": "E_BOUNDARY"},
    ]
    fake_s3 = FakeS3Client()
    fake_s3.add_object(day_one + "a.json.gz", _gz_envelope(day_one_events))
    fake_s3.add_object(day_two + "b.json.gz", _gz_envelope(day_two_events))
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, 0, 0, 0, tzinfo=timezone.utc)
    # 01:00 UTC on June 2 keeps E_BOUNDARY and excludes E_AFTER_WINDOW.
    window_end = datetime(2026, 6, 2, 1, 0, 0, tzinfo=timezone.utc)
    fetched, _ = ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    seen_names = [event["eventName"] for event in fetched]
    # Events outside the window on either side are filtered out.
    for trimmed_name in ("E_BEFORE_WINDOW", "E_AFTER_WINDOW"):
        assert trimmed_name not in seen_names
    # What remains is in ascending eventTime order.
    assert seen_names == ["E_EARLY", "E_LATE", "E_BOUNDARY"]
450
+
451
+
452
+ # ── write(): 2 GB split with _partNN-only-on-split ────────────────────────────
453
+
454
+
455
def test_write_no_split(tmp_path: Path) -> None:
    """A single small event is written to the bare output path with no _part files."""
    target = tmp_path / "cloudtrail_20260601_1d.json.log"
    single_event = [{"eventTime": "2026-06-01T01:00:00Z", "eventName": "x"}]
    written, _ = ct.write(single_event, target, verbose=False)

    assert written == 1
    assert target.exists()
    # The directory holds the bare file and nothing else (no sibling _part* files).
    produced = sorted(entry.name for entry in tmp_path.iterdir())
    assert produced == ["cloudtrail_20260601_1d.json.log"]
463
+
464
+
465
def test_write_splits_into_part_files(tmp_path: Path, monkeypatch) -> None:
    """With a tiny split threshold, write() emits only _partNN files holding whole rows."""
    # A 200-byte threshold forces splitting without having to write 2 GB.
    monkeypatch.setattr(ct, "_PART_SPLIT_BYTES", 200)
    # Each line is roughly 60-80 bytes, so 10 events spill across several parts.
    events = []
    for i in range(10):
        events.append(
            {"eventTime": f"2026-06-01T01:00:{i:02d}Z", "eventName": "x", "i": i}
        )
    target = tmp_path / "cloudtrail_20260601_1d.json.log"
    written, _ = ct.write(events, target, verbose=False)

    assert written == 10
    # The bare file must be gone: the first split renames it to _part01.
    assert not target.exists()
    part_files = sorted(tmp_path.glob("cloudtrail_20260601_1d_part*.json.log"))
    assert len(part_files) >= 2  # at least one split occurred

    # Every line parses as a complete JSON object and the line counts add up.
    rows_per_part = [
        part.read_text(encoding="utf-8").splitlines() for part in part_files
    ]
    for rows in rows_per_part:
        for row in rows:
            json.loads(row)
    assert sum(len(rows) for rows in rows_per_part) == 10
490
+
491
+
492
def test_write_split_threshold_just_below_does_not_split(tmp_path: Path, monkeypatch) -> None:
    """When the split threshold exceeds the whole payload, only the bare file is written."""
    # 10_000 bytes is larger than the three small events combined.
    monkeypatch.setattr(ct, "_PART_SPLIT_BYTES", 10_000)
    events = []
    for i in range(3):
        events.append(
            {"eventTime": f"2026-06-01T01:00:{i:02d}Z", "eventName": "x", "i": i}
        )
    target = tmp_path / "cloudtrail_20260601_1d.json.log"
    ct.write(events, target, verbose=False)

    assert target.exists()
    produced = sorted(entry.name for entry in tmp_path.iterdir())
    assert produced == ["cloudtrail_20260601_1d.json.log"]
503
+
504
+
505
+ # ── CLI clean-abort path ──────────────────────────────────────────────────────
506
+
507
+
508
def test_cli_export_aborted_exits_cleanly(monkeypatch, capsys) -> None:
    """cli.main() turns ExportAborted from run_export into a quiet exit code 0."""
    from loghunter import cli
    from loghunter.common import config as cfg_mod

    def _fake_load(_path=None):
        # Stand-in config so nothing is read from the user's filesystem.
        splunk_cfg = {
            "host": "192.0.2.20",
            "port": 8089,
            "query": {"default": {"spl": "x"}},
        }
        return {"export": {"splunk": splunk_cfg}}

    def _fake_run_export(*_args, **_kwargs):
        raise ExportAborted("loghunter export: aborted by user")

    monkeypatch.setattr(cfg_mod, "load", _fake_load)
    # _run_export() rebinds via `from loghunter.exporters import run_export`,
    # so the symbol on the exporters package is the one to patch.
    monkeypatch.setattr("loghunter.exporters.run_export", _fake_run_export)

    with pytest.raises(SystemExit) as exit_info:
        cli.main(["export", "splunk"])
    assert exit_info.value.code == 0

    streams = capsys.readouterr()  # drain once
    assert "aborted by user" in streams.out
    # stderr carries neither the ValueError prefix nor the usage nudge.
    assert "loghunter:" not in streams.err
    assert "Run 'loghunter --help'" not in streams.err
535
+
536
+
537
+ # ── liveness narration (gate: stderr seals, prompt never spanned) ────────────
538
+
539
+
540
+ from tests.test_display import _FakeStream # noqa: E402 reuse non-tty mock
541
+
542
+
543
def test_listing_seals_on_populated_path(monkeypatch) -> None:
    """A populated listing writes 'listed <N> objects (<GB> GB)' to stderr."""
    day_prefix = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
    payload = _gz_envelope([
        {"eventTime": "2026-06-01T12:00:00Z", "eventName": "x"},
    ])
    fake_s3 = FakeS3Client()
    fake_s3.add_object(day_prefix + "a.json.gz", payload)
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    # Swap stderr for a non-tty fake so the sealed text can be inspected.
    captured_stderr = _FakeStream(tty=False)
    monkeypatch.setattr(sys, "stderr", captured_stderr)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    for expected_fragment in ("listed 1 objects (", " GB)"):
        assert expected_fragment in captured_stderr.output
562
+
563
+
564
def test_listing_seals_on_zero_objects_path(monkeypatch) -> None:
    """An empty listing still seals 'listed 0 objects (0.0 GB)' before the ValueError."""
    empty_bucket = FakeS3Client()
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: empty_bucket)

    captured_stderr = _FakeStream(tty=False)
    monkeypatch.setattr(sys, "stderr", captured_stderr)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    with pytest.raises(ValueError, match="no CloudTrail objects found"):
        ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    # The seal has to be present even though the ValueError follows immediately.
    assert "listed 0 objects (0.0 GB)" in captured_stderr.output
580
+
581
+
582
def test_listing_seal_lands_before_input_prompt(monkeypatch) -> None:
    """The listing seal is committed before input() is called for the egress prompt.

    builtins.input is replaced with a function that inspects the fake stderr
    at call time. Finding the listing seal already in the buffer at that
    moment means no spinner block spans the prompt. The stub then declines,
    so the test finishes through the abort path.
    """
    _bucket_with_large_object(monkeypatch, body_size=10 * 10**9)  # 10 GB

    captured_stderr = _FakeStream(tty=False)
    monkeypatch.setattr(sys, "stderr", captured_stderr)

    def _inspecting_input(*_args, **_kw):
        # The seal must already be in the buffer when the prompt fires.
        seal_present = "listed 1 objects (" in captured_stderr.output
        assert seal_present, (
            "listing liveness was still active when input() fired — "
            "a spinner block is spanning the egress prompt"
        )
        return "n"  # decline, which leads to ExportAborted

    monkeypatch.setattr("builtins.input", _inspecting_input)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 5.0}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    with pytest.raises(ExportAborted, match="aborted by user"):
        ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)
610
+
611
+
612
def test_sort_and_trim_seals_record(monkeypatch) -> None:
    """The sort+trim step seals 'sorted and trimmed to <N> events in window' on stderr."""
    day_prefix = "AWSLogs/000000000000/CloudTrail/us-east-1/2026/06/01/"
    # Two events fall inside the window; the third predates it.
    payload = _gz_envelope([
        {"eventTime": "2026-06-01T08:00:00Z", "eventName": "InWindow1"},
        {"eventTime": "2026-06-01T18:00:00Z", "eventName": "InWindow2"},
        {"eventTime": "2026-05-31T20:00:00Z", "eventName": "BeforeWindow"},
    ])
    fake_s3 = FakeS3Client()
    fake_s3.add_object(day_prefix + "a.json.gz", payload)
    monkeypatch.setattr(ct.boto3, "client", lambda _svc: fake_s3)

    captured_stderr = _FakeStream(tty=False)
    monkeypatch.setattr(sys, "stderr", captured_stderr)

    backend_cfg = {"path": "s3://example-trail-bucket/AWSLogs/", "egress_warn_gb": 100}
    window_start = datetime(2026, 6, 1, tzinfo=timezone.utc)
    window_end = datetime(2026, 6, 2, tzinfo=timezone.utc)
    ct.fetch({}, backend_cfg, window_start, window_end, verbose=False)

    assert "sorted and trimmed to 2 events in window" in captured_stderr.output