loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,980 @@
1
+ """Tests for the cloudtrail digest card (six fixed slots, lane-scoped pair).
2
+
3
+ Covers:
4
+ - lane-split as a dist slot (always shows; both shares; never produces a lede)
5
+ - principal-vol interactive-scoping (service-lane dominant principal must
6
+ not bleed into the interactive cliff; floor + gate both proved to dash)
7
+ - event-source cliff over the WHOLE pile (interactive + service together)
8
+ - source-ip interactive-scoping (one IP dominating interactive speaks;
9
+ service-lane source_ip hostnames like "s3.amazonaws.com" must NOT count)
10
+ - region dist (single-region → "us-east-1 100%"; multi-region → top-3)
11
+ - error-rate (kind = error_code.notna(); top contributor is the error CODE,
12
+ not a principal; literal notna semantics; RATE_FLOOR gates real piles)
13
+ - ledes from gating slots only — neither lane-split nor region prose may
14
+ leak into a lede
15
+ - sleepy whole-pile card: quiet-honest; mostly dashes; zero ledes
16
+ - attack-shaped whole-pile card: multiple gating slots fire
17
+ - CLI dispatch and runner-boundary plumbing for cloudtrail_dir
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ import io
23
+ import json
24
+ from datetime import datetime, timedelta, timezone
25
+ from pathlib import Path
26
+ from typing import Any
27
+
28
+ import pandas as pd
29
+ import pytest
30
+
31
+ import loghunter.cli as cli
32
+ import loghunter.runner as runner
33
+ from loghunter.common.finding import DigestCard, RunSummary
34
+ from loghunter.digest import cloudtrail as ct_digest
35
+ from loghunter.outputs.text import TextHandler
36
+
37
+
38
+ # ─── Fixtures ────────────────────────────────────────────────────────────────
39
+
40
+ _NOW = datetime(2026, 6, 11, 12, 0, tzinfo=timezone.utc)
41
+ _BASE_TS = _NOW.timestamp()
42
+
43
+ _CT_COLUMNS = [
44
+ "ts", "principal", "lane", "read_write",
45
+ "event_source", "event_name", "identity_type",
46
+ "source_ip", "error_code", "aws_region", "event_id", "raw",
47
+ ]
48
+
49
+
50
def _ct_row(
    principal: str = "arn:aws:iam::111111111111:user/alice",
    lane: str = "interactive",
    event_source: str = "iam.amazonaws.com",
    event_name: str = "ListUsers",
    source_ip: str = "203.0.113.10",
    aws_region: str = "us-east-1",
    error_code: object = None,
    identity_type: str = "IAMUser",
    read_write: str = "read",
    event_id: str = "evt-0001",
    ts: float = _BASE_TS,
) -> dict:
    """Return a single CloudTrail row as a dict of placeholder values.

    With no arguments the row is an interactive IAM ``ListUsers`` read by an
    example user with no error code. Callers pass only the columns a test
    cares about; every other column keeps its sample default, so each row
    carries all twelve keys listed in ``_CT_COLUMNS``. ``raw`` is always a
    fresh empty dict.
    """
    return dict(
        ts=ts,
        principal=principal,
        lane=lane,
        read_write=read_write,
        event_source=event_source,
        event_name=event_name,
        identity_type=identity_type,
        source_ip=source_ip,
        error_code=error_code,
        aws_region=aws_region,
        event_id=event_id,
        raw={},
    )
83
+
84
+
85
def _ct_df(rows: list[dict]) -> pd.DataFrame:
    """Turn row dicts into a frame with the ``_CT_COLUMNS`` column order.

    An empty ``rows`` list yields an empty frame that still has every column.
    """
    if rows:
        return pd.DataFrame(rows, columns=_CT_COLUMNS)
    return pd.DataFrame(columns=_CT_COLUMNS)
89
+
90
+
91
+ def _slot_by_label(slots_or_frame, label):
92
+ """Look up a computed slot by label.
93
+
94
+ Accepts either a pre-built list of DigestSlot (legacy) or a frame
95
+ (new — re-derives via _compute_slots). The legacy form lets the older
96
+ body["slots"] callers keep their shape after a global rename to
97
+ body["fields"] (which is the post-selection display set, not what
98
+ these tests want); the new form is preferred for new tests.
99
+ """
100
+ if isinstance(slots_or_frame, pd.DataFrame):
101
+ slots = _compute_slots(slots_or_frame)
102
+ else:
103
+ slots = slots_or_frame
104
+ for s in slots:
105
+ if s.label == label:
106
+ return s
107
+ raise AssertionError(f"no slot with label {label!r}")
108
+
109
+
110
def _run_summary(
    window: tuple[datetime, datetime] = (_NOW - timedelta(days=1), _NOW),
) -> RunSummary:
    """Build a RunSummary stub for a cloudtrail run over *window*.

    The default window is the 24 hours ending at ``_NOW``. Every other field
    is a fixed placeholder: 100 records under the "*.json*" key, zero bytes,
    no detectors run or skipped, no notes, and "cloudtrail" as the only
    data source.
    """
    summary = RunSummary(
        data_window=window,
        record_counts={"*.json*": 100},
        data_size_bytes=0,
        detectors_run=[],
        detectors_skipped={},
        notes=[],
        data_sources=["cloudtrail"],
    )
    return summary
122
+
123
+
124
def _card_from_body(body: dict) -> DigestCard:
    """Wrap a summariser body dict in a DigestCard with fixed envelope values.

    Only ``zone1_extras``, ``insights`` and ``fields`` come from *body*; the
    schema, source name, 24-hour window ending at ``_NOW``, record count and
    histogram are constant placeholders.
    """
    window_end = _NOW
    window_start = window_end - timedelta(days=1)
    return DigestCard(
        schema="cloudtrail",
        source_name="cloudtrail.json.log",
        data_window=(window_start, window_end),
        record_count=100,
        histogram_counts=[1, 2, 3, 5, 8, 5, 3, 2, 1],
        histogram_unit="hr",
        histogram_peak=8,
        zone1_extras=body["zone1_extras"],
        insights=body["insights"],
        fields=body["fields"],
    )
137
+
138
+
139
def _render(card: DigestCard) -> str:
    """Return the text TextHandler writes when asked to render *card*."""
    sink = io.StringIO()
    TextHandler(stream=sink, verbose_level=0).render_digest(card)
    return sink.getvalue()
145
+
146
+
147
def _compute_slots(frame: pd.DataFrame) -> list:
    """Recompute the six cloudtrail slots for *frame*, in declared order.

    The summariser body used to expose the full pre-filter slot list as
    body["slots"]; under the flat grammar it only returns the post-selection
    display state (`fields`). Tests that need one slot's computed state
    rebuild the list here by calling the six slot computers directly.

    principal-vol and source-ip receive only the rows whose ``lane`` is
    "interactive" (an empty slice when the frame has no ``lane`` column);
    the other four slots receive the whole frame.
    """
    has_lane = "lane" in frame.columns
    interactive_only = (
        frame[frame["lane"] == "interactive"] if has_lane else frame.iloc[0:0]
    )

    lane_split = ct_digest._slot_lane_split(frame)
    principal_vol = ct_digest._slot_principal_vol(interactive_only)
    event_source = ct_digest._slot_event_source(frame)
    source_ip = ct_digest._slot_source_ip(interactive_only)
    region = ct_digest._slot_region(frame)
    error_rate = ct_digest._slot_error_rate(frame)
    return [lane_split, principal_vol, event_source, source_ip, region, error_rate]
168
+
169
+
170
+ # ─── lane-split (dist; whole pile; always shows) ────────────────────────────
171
+
172
def test_lane_split_renders_both_shares() -> None:
    """lane-split is a dist slot whose single cell shows both lane shares."""
    frame = _ct_df(
        [_ct_row(lane="interactive") for _ in range(3)]
        + [_ct_row(lane="service") for _ in range(7)]
    )
    # Smoke check only: the summariser must accept this pile; its return
    # value is not inspected here.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "lane-split")
    assert slot.statistic == "dist"
    assert slot.cells == ["interactive 30% / service 70%"]
    # dist never carries entity / ratio / magnitude
    assert slot.entity is None and slot.ratio is None and slot.magnitude is None
183
+
184
+
185
def test_lane_split_all_interactive_renders_zero_service() -> None:
    """An all-interactive pile still prints the service share, as 0%."""
    frame = _ct_df([_ct_row(lane="interactive") for _ in range(5)])
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "lane-split")
    assert slot.cells == ["interactive 100% / service 0%"]
190
+
191
+
192
def test_lane_split_empty_frame_renders_no_events_placeholder() -> None:
    """An empty frame renders the "(no events)" placeholder for lane-split."""
    frame = _ct_df([])
    # Smoke check only: the summariser must accept an empty pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "lane-split")
    assert slot.cells == ["(no events)"]
196
+
197
+
198
def test_lane_split_missing_lane_column_renders_no_lane_placeholder() -> None:
    """A frame without a ``lane`` column renders "(no lane)", not "(no events)"."""
    # Drop the column entirely — mirrors the dns qtype-mix dual-fallback contract.
    frame = pd.DataFrame([
        {key: value for key, value in _ct_row().items() if key != "lane"}
        for _ in range(3)
    ])
    # Smoke check only: the summariser must tolerate the missing column.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "lane-split")
    assert slot.cells == ["(no lane)"]
207
+
208
+
209
+ # ─── principal-vol (cliff; INTERACTIVE-SCOPED) ──────────────────────────────
210
+
211
def test_principal_vol_speaks_with_dominant_interactive_principal() -> None:
    """principal-vol names the dominant principal when the cliff is clear."""
    admin_role = "arn:aws:iam::111111111111:role/AdminRole"
    # 5 distinct interactive principals, clear rank1/rank2 cliff (20 vs 1).
    rows = [_ct_row(principal=admin_role) for _ in range(20)]
    rows.extend(
        _ct_row(principal=f"arn:aws:iam::111111111111:{name}")
        for name in ("user/alice", "user/bob", "user/carol", "user/dave")
    )
    frame = _ct_df(rows)
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "principal-vol")
    assert slot.statistic == "cliff"
    assert slot.entity == admin_role
    assert slot.ratio is not None and slot.ratio >= 2.0
    # Cell renders the share-of-interactive percentage.
    assert slot.cells is not None
    assert slot.cells[0] == admin_role
    assert slot.cells[1].endswith("%")
227
+
228
+
229
def test_principal_vol_dashes_when_interactive_neckandneck_despite_service_dominator() -> None:
    """Proves both (a) scoping and (b) the cliff floor.

    Service lane has one dominant principal that would WIN a whole-pile cliff.
    Interactive lane has only two principals (below POPULATION_FLOOR=5), so
    even though one of them dominates within interactive, the slot must dash
    — the spec calls this out explicitly.
    """
    # 10 service rows all from the same service principal — would dominate
    # the whole-pile cliff if the interactive filter were forgotten.
    rows = [
        _ct_row(
            principal="lambda.amazonaws.com",
            lane="service",
            identity_type="AWSService",
            event_source="lambda.amazonaws.com",
            source_ip="lambda.amazonaws.com",
        )
        for _ in range(10)
    ]
    # 5 interactive rows split between two principals — below floor.
    rows.extend(
        _ct_row(principal="arn:aws:iam::111111111111:user/alice") for _ in range(3)
    )
    rows.extend(
        _ct_row(principal="arn:aws:iam::111111111111:user/bob") for _ in range(2)
    )
    frame = _ct_df(rows)
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "principal-vol")
    assert slot.cells is None  # dashed
    assert slot.entity is None and slot.ratio is None
257
+
258
+
259
+ # ─── event-source (cliff; WHOLE pile) ───────────────────────────────────────
260
+
261
def test_event_source_cliff_counts_whole_pile() -> None:
    """event-source counts interactive + service rows together (whole-pile)."""
    # 25 interactive iam events.
    rows = [_ct_row(event_source="iam.amazonaws.com") for _ in range(25)]
    # 4 service rows across 4 other services — without the service rows the
    # population would only be 1 distinct source and the slot would dash;
    # whole-pile counting brings the population to 5.
    rows.extend(
        _ct_row(
            lane="service",
            event_source=src,
            principal=src,
            identity_type="AWSService",
            source_ip=src,
        )
        for src in (
            "ec2.amazonaws.com",
            "s3.amazonaws.com",
            "sts.amazonaws.com",
            "kms.amazonaws.com",
        )
    )
    frame = _ct_df(rows)
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "event-source")
    assert slot.entity == "iam.amazonaws.com"
    assert slot.cells is not None
    assert slot.cells[0] == "iam.amazonaws.com"
    assert slot.cells[1] == "25"  # count, right-justified by handler
    assert slot.ratio is not None and slot.ratio >= 2.0
282
+
283
+
284
+ # ─── source-ip (share; INTERACTIVE-SCOPED; cell vs entity split) ────────────
285
+
286
def test_source_ip_speaks_with_one_dominant_interactive_ip() -> None:
    """20 events from one IP + 4 IPs at 1 each = 24 interactive,
    top_share = 20/24 ≈ 83% ≥ SHARE_GATE → speaks."""
    rows = [_ct_row(source_ip="203.0.113.99") for _ in range(20)]
    rows.extend(
        _ct_row(source_ip=ip)
        for ip in ("203.0.113.10", "203.0.113.11", "203.0.113.12", "203.0.113.13")
    )
    frame = _ct_df(rows)
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.statistic == "share"
    # Entity carries the actual IP for the lede…
    assert slot.entity == "203.0.113.99"
    # …but the table cell leads with "1 IP" for at-a-glance concentration.
    # Exactly TWO cells — share has no rank-2 ratio.
    assert slot.cells == ["1 IP", "83% of interactive"]
    # No rank-2 ratio on a share slot.
    assert slot.ratio is None
    assert slot.magnitude is not None and 82 <= slot.magnitude <= 84
306
+
307
+
308
def test_source_ip_speaks_on_two_distinct_ips_with_dominant_share() -> None:
    """The exact regression case the share-statistic fix unblocks.

    99 events from one IP + 1 from another → 2 distinct IPs total. The
    OLD cliff-based slot dashed here because 2 is below POPULATION_FLOOR=5,
    even though concentration is 99%. The NEW share-based slot must speak and
    name the IP — that low cardinality is the SIGNAL, not noise.
    """
    rows = [_ct_row(source_ip="203.0.113.99") for _ in range(99)]
    rows.append(_ct_row(source_ip="203.0.113.10"))
    frame = _ct_df(rows)
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.statistic == "share"
    assert slot.entity == "203.0.113.99"
    assert slot.cells == ["1 IP", "99% of interactive"]
323
+
324
+
325
def test_source_ip_speaks_on_single_distinct_ip_at_100_percent() -> None:
    """10 events, all one IP → 1 distinct IP. top_share = 1.0 → speaks."""
    frame = _ct_df([_ct_row(source_ip="203.0.113.99") for _ in range(10)])
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.statistic == "share"
    assert slot.entity == "203.0.113.99"
    assert slot.magnitude == 100.0
    assert slot.cells == ["1 IP", "100% of interactive"]
334
+
335
+
336
def test_source_ip_dashes_just_below_share_gate() -> None:
    """Locks SHARE_GATE = 0.80 as the threshold. 79 dominant + 21 spread
    → top_share = 0.79, just below gate → dashes."""
    rows = [_ct_row(source_ip="203.0.113.99") for _ in range(79)]
    # Spread 21 across many other IPs so no single IP-other clears the gate.
    rows.extend(_ct_row(source_ip=f"203.0.113.{100 + i}") for i in range(21))
    frame = _ct_df(rows)
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.cells is None
    assert slot.statistic == "share"
347
+
348
+
349
def test_source_ip_dashes_on_diverse_interactive_sources() -> None:
    """Spread distribution → top_share = 1/N, far below SHARE_GATE → dashes."""
    frame = _ct_df([_ct_row(source_ip=f"203.0.113.{i}") for i in range(10, 18)])
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.cells is None
    assert slot.statistic == "share"
358
+
359
+
360
def test_source_ip_excludes_service_lane_hostnames() -> None:
    """A service-lane source_ip hostname (e.g. s3.amazonaws.com) must NOT
    affect source-ip — proves interactive scoping is doing real work.

    Without the interactive filter, the whole-pile share would compute
    25/31 ≈ 81% on "s3.amazonaws.com" — above SHARE_GATE — and the slot
    would speak on a service hostname. The interactive filter keeps that
    out: interactive lane has 6 IPs at 1 event each (top_share = 1/6 ≈
    17%, far below the gate) → dashes.
    """
    rows = [
        _ct_row(
            lane="service",
            source_ip="s3.amazonaws.com",
            principal="s3.amazonaws.com",
            identity_type="AWSService",
            event_source="s3.amazonaws.com",
        )
        for _ in range(25)
    ]
    rows.extend(
        _ct_row(source_ip=ip)
        for ip in (
            "203.0.113.10", "203.0.113.11", "203.0.113.12",
            "203.0.113.13", "203.0.113.14", "203.0.113.15",
        )
    )
    frame = _ct_df(rows)
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.cells is None  # dashed — proves the scoping
385
+
386
+
387
def test_source_ip_lede_omits_ratio_phrase() -> None:
    """Lede formatter contract for share slots: the sentence carries no
    'Nx the next' or 'more than' clause, because a concentration figure has
    no peer to be compared against."""
    dominant_ip = "203.0.113.99"
    background_ips = (
        "203.0.113.10", "203.0.113.11",
        "203.0.113.12", "203.0.113.13", "203.0.113.14",
    )
    rows = [_ct_row(source_ip=dominant_ip) for _ in range(95)]
    rows.extend(_ct_row(source_ip=ip) for ip in background_ips)

    body = ct_digest.summarize(_ct_df(rows))
    matching = [lede for lede in body["insights"] if "203.0.113.99" in lede]
    source_ip_lede = matching[0] if matching else None

    assert source_ip_lede is not None
    assert "x the next" not in source_ip_lede
    assert "more than" not in source_ip_lede
    assert source_ip_lede.endswith("interactive events.")
403
+
404
+
405
def test_source_ip_high_share_outranks_mid_cliff_in_salience() -> None:
    """A high-share source-ip lede should rank above a mid-magnitude cliff
    lede. Builds a pile where source-ip share is 95% (salience 95) and
    event-source cliff ratio is 3 (salience 3) — source-ip lede must
    appear before event-source lede in body['insights']."""
    dominant_ip = "203.0.113.99"
    # 95 events from one IP, split 60/20/10/5 across four event sources so
    # the event-source rank1/rank2 ratio is 60/20 = 3.0 — the mid-strength
    # cliff the docstring describes. (The previous 30/25/20/20 split gave a
    # ratio of only 1.2, which left the ordering assertion below with
    # nothing to compare.)
    per_source_counts = (
        ("iam.amazonaws.com", 60),
        ("ec2.amazonaws.com", 20),
        ("s3.amazonaws.com", 10),
        ("sts.amazonaws.com", 5),
    )
    rows = [
        _ct_row(source_ip=dominant_ip, event_source=src)
        for src, count in per_source_counts
        for _ in range(count)
    ]
    # 5 background events so event-source clears POPULATION_FLOOR.
    rows.extend(
        _ct_row(source_ip=f"203.0.113.{10 + i}", event_source="kms.amazonaws.com")
        for i in range(5)
    )
    ledes = ct_digest.summarize(_ct_df(rows))["insights"]
    src_idx = next(
        (i for i, lede in enumerate(ledes) if dominant_ip in lede), None
    )
    src_evt_idx = next(
        (i for i, lede in enumerate(ledes)
         if "iam.amazonaws.com" in lede and "service" in lede), None
    )
    assert src_idx is not None
    # event-source may not even make top-3, but if it does, source-ip outranks it.
    if src_evt_idx is not None:
        assert src_idx < src_evt_idx
436
+
437
+
438
+ # ─── region (dist; WHOLE pile; never produces a lede) ───────────────────────
439
+
440
def test_region_single_region_renders_100_percent() -> None:
    """A single-region pile renders one dist cell reading "us-east-1 100%"."""
    frame = _ct_df([_ct_row(aws_region="us-east-1") for _ in range(8)])
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "region")
    assert slot.statistic == "dist"
    assert slot.cells == ["us-east-1 100%"]
446
+
447
+
448
def test_region_multi_region_renders_top_three_with_separator() -> None:
    """Three regions (40/30/10 events) render as shares joined by " · "."""
    region_counts = (("us-east-1", 40), ("eu-west-1", 30), ("us-west-2", 10))
    frame = _ct_df([
        _ct_row(aws_region=region)
        for region, count in region_counts
        for _ in range(count)
    ])
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "region")
    assert slot.cells == ["us-east-1 50% · eu-west-1 38% · us-west-2 12%"]
459
+
460
+
461
def test_region_caps_at_top_three() -> None:
    """With five regions present, only the three largest appear in the cell."""
    region_counts = (
        ("us-east-1", 30), ("eu-west-1", 20),
        ("us-west-2", 10), ("ap-south-1", 5),
        ("eu-central-1", 5),
    )
    frame = _ct_df([
        _ct_row(aws_region=region)
        for region, count in region_counts
        for _ in range(count)
    ])
    # Smoke check only: the summariser must accept this pile.
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "region")
    assert slot.cells is not None
    assert slot.cells[0].count("·") == 2  # exactly three entries → two separators
    # Lower-ranked regions must NOT appear.
    assert "ap-south-1" not in slot.cells[0]
    assert "eu-central-1" not in slot.cells[0]
475
+
476
+
477
def test_region_empty_and_missing_column_have_distinct_fallbacks() -> None:
    """Empty pile → "(no events)"; missing aws_region column → "(no region)"."""
    empty_frame = _ct_df([])
    # Smoke check only: the summariser must accept an empty pile.
    ct_digest.summarize(empty_frame)
    assert _slot_by_label(empty_frame, "region").cells == ["(no events)"]

    regionless_frame = pd.DataFrame([
        {key: value for key, value in _ct_row().items() if key != "aws_region"}
        for _ in range(3)
    ])
    # Smoke check only: the summariser must tolerate the missing column.
    ct_digest.summarize(regionless_frame)
    assert _slot_by_label(regionless_frame, "region").cells == ["(no region)"]
485
+
486
+
487
+ # ─── error-rate (rate; WHOLE pile; names error CODE not principal) ──────────
488
+
489
def test_error_rate_dashes_when_no_errors() -> None:
    """Twenty rows, every error_code None: the error-rate slot has no cells."""
    clean_rows = [_ct_row(error_code=None) for _ in range(20)]
    error_rate = _slot_by_label(_ct_df(clean_rows), "error-rate")
    assert error_rate.cells is None
493
+
494
+
495
def test_error_rate_dashes_below_rate_floor() -> None:
    """One error among 200 events is 0.5%, under RATE_FLOOR (1%), so the
    slot dashes because of the floor."""
    rows = [
        *(_ct_row(error_code=None) for _ in range(199)),
        _ct_row(error_code="AccessDenied"),
    ]
    error_rate = _slot_by_label(_ct_df(rows), "error-rate")
    assert error_rate.cells is None
501
+
502
+
503
def test_error_rate_names_top_error_code_not_principal() -> None:
    """The slot's entity is the most frequent errorCode, never a principal."""
    principals = [f"arn:aws:iam::111111111111:user/u{i}" for i in range(20)]
    pool_size = len(principals)

    # 80 clean events, then 15 AccessDenied and 5 ValidationException errors
    # spread over many principals so no single principal stands out.
    rows: list[dict] = [_ct_row(error_code=None) for _ in range(80)]
    rows += [
        _ct_row(principal=principals[i % pool_size], error_code="AccessDenied")
        for i in range(15)
    ]
    rows += [
        _ct_row(
            principal=principals[(i + 7) % pool_size],
            error_code="ValidationException",
        )
        for i in range(5)
    ]

    slot = _slot_by_label(_ct_df(rows), "error-rate")
    assert slot.entity == "AccessDenied"
    assert slot.cells is not None
    assert slot.cells[1] == "AccessDenied"
    assert slot.magnitude is not None and 19 <= slot.magnitude <= 21
522
+
523
+
524
def test_error_rate_notna_semantics_pin_none_nan_and_empty_string() -> None:
    """Literal .notna() — None and NaN read clean; "" reads as errored."""
    # 45 None + 45 NaN + 10 empty-string error codes = 100 rows; only the
    # empty strings are expected to count, giving a magnitude near 10.
    rows: list[dict] = [_ct_row(error_code=None) for _ in range(45)]
    rows.extend(_ct_row(error_code=float("nan")) for _ in range(45))
    rows.extend(_ct_row(error_code="") for _ in range(10))
    slot = _slot_by_label(_ct_df(rows), "error-rate")
    assert slot.cells is not None
    assert slot.entity == ""
    assert slot.magnitude is not None and 9 <= slot.magnitude <= 11
537
+
538
+
539
+ # ─── Ledes: dist slots never leak into prose ────────────────────────────────
540
+
541
def test_ledes_never_carry_dist_slot_prose() -> None:
    """Dist-slot fill text must never show up inside a lede.

    The pile is all-interactive and single-region, so gating slots may fire
    ledes while lane-split and region hold their most recognisable fill
    strings. The check runs against the rendered lede prose rather than slot
    labels: a label-presence check would miss a formatter that accidentally
    embedded "interactive 100% / service 0%" or "us-east-1 100%".
    """
    # All-interactive — drives lane-split to "interactive 100% / service 0%".
    dominant_rows = [
        _ct_row(
            principal="arn:aws:iam::111111111111:role/AdminRole",
            aws_region="us-east-1",
            source_ip="203.0.113.99",
            event_source="iam.amazonaws.com",
        )
        for _ in range(30)
    ]
    # 5 more principals / IPs / sources so population floors are met but
    # the cliff still fires.
    background_rows = [
        _ct_row(
            principal=f"arn:aws:iam::111111111111:user/u{i}",
            source_ip=f"203.0.113.{20+i}",
            event_source=f"svc{i}.amazonaws.com",
            aws_region="us-east-1",
        )
        for i in range(5)
    ]
    body = ct_digest.summarize(_ct_df(dominant_rows + background_rows))
    assert body["insights"]  # at least one cliff lede fired

    forbidden_fragments = (
        "interactive 100%", "service 0%", "/ service",
        "us-east-1 100%",
    )
    checks = [
        (lede, frag) for lede in body["insights"] for frag in forbidden_fragments
    ]
    for lede, frag in checks:
        assert frag not in lede, (
            f"dist slot prose leaked into lede: {lede!r} contains {frag!r}"
        )
578
+
579
+
580
+ # ─── Summariser shape ───────────────────────────────────────────────────────
581
+
582
def test_summarize_returns_six_slots_in_fixed_order() -> None:
    """The six slots come back in their declared order."""
    frame = _ct_df([_ct_row() for _ in range(3)])
    # Smoke check only: the summariser must accept this pile; the slot list
    # itself is re-derived through _compute_slots.
    ct_digest.summarize(frame)
    labels = [slot.label for slot in _compute_slots(frame)]
    assert labels == [
        "lane-split", "principal-vol", "event-source",
        "source-ip", "region", "error-rate",
    ]
589
+
590
+
591
def test_summarize_entity_label_and_zone1_extras() -> None:
    """zone1_extras reports distinct principal and event-source counts."""
    alice = "arn:aws:iam::111111111111:user/alice"
    bob = "arn:aws:iam::111111111111:user/bob"
    pairs = (
        (alice, "iam.amazonaws.com"),
        (bob, "ec2.amazonaws.com"),
        (alice, "s3.amazonaws.com"),
    )
    frame = _ct_df([
        _ct_row(principal=principal, event_source=source)
        for principal, source in pairs
    ])
    extras = ct_digest.summarize(frame)["zone1_extras"]
    # entity_label / entity_count are deleted from the body dict under the
    # flat grammar; zone1_extras carries the distinct-counts as the only
    # surface the renderer consumes.
    assert ("principals", "2") in extras
    assert ("event sources", "3") in extras
606
+
607
+
608
+ # ─── Whole-card rendering ───────────────────────────────────────────────────
609
+
610
def _build_sleepy_rows() -> list[dict]:
    """Return the rows of the canonical sleepy pile used by the renderer test.

    50 events: 45 service (25 lambda + 20 ec2) and 5 interactive, one region,
    no error codes. The pile is shaped so the gating slots are expected to
    stay quiet:
    - principal-vol: 2 distinct interactive principals → below POPULATION_FLOOR
    - source-ip: 5 distinct interactive IPs at 1 event each → top share of
      1/5, below the share gate
    - event-source: rank1/rank2 = 25/20 = 1.25, below the cliff gate
    - error-rate: 0 errors
    """
    # (principal/source name, event name, row count) for the service lane.
    # 25 lambda vs 20 ec2 keeps the whole-pile event-source cliff weak.
    service_groups = (
        ("lambda.amazonaws.com", "Invoke", 25),
        ("ec2.amazonaws.com", "StartInstances", 20),
    )
    # (principal, event source, event name, source IPs) for the interactive
    # lane: 5 events split 3/2 across 2 principals, each from its own IP.
    interactive_groups = (
        (
            "arn:aws:iam::111111111111:user/alice",
            "iam.amazonaws.com",
            "ListUsers",
            ("203.0.113.10", "203.0.113.11", "203.0.113.12"),
        ),
        (
            "arn:aws:iam::111111111111:user/bob",
            "sts.amazonaws.com",
            "GetCallerIdentity",
            ("203.0.113.20", "203.0.113.21"),
        ),
    )

    rows: list[dict] = []
    for service, event_name, count in service_groups:
        rows.extend(
            _ct_row(
                principal=service, lane="service",
                event_source=service,
                event_name=event_name, identity_type="AWSService",
                source_ip=service,
            )
            for _ in range(count)
        )
    for principal, event_source, event_name, ips in interactive_groups:
        rows.extend(
            _ct_row(
                principal=principal,
                event_source=event_source, event_name=event_name,
                source_ip=src_ip,
            )
            for src_ip in ips
        )
    return rows
653
+
654
+
655
def _build_attack_rows() -> list[dict]:
    """Return the canonical attack-shaped pile used by the card tests.

    80 events: 76 from one dominant role/IP plus 4 background events from
    distinct principals. Three regions, 18 errors (~22%). Per the sizing
    intent, principal-vol / source-ip salience (cliff ratio 76) should lead
    error-rate (salience 22) and event-source (cliff ratio 12), so the
    top-3 ledes are principal-vol, source-ip, error-rate. event-source's
    cliff still fires (cells not None), but its lede drops out of the top-3
    cutoff; the slot table row still shows it.
    """
    # Per-event attributes for the 76 dominant-role events; the three lists
    # are the same length and are consumed in lockstep.
    # 60 go through IAM and the remaining 16 spread over four other
    # services, so event-source still clears POPULATION_FLOOR with a
    # meaningful but secondary cliff.
    services = (
        ["iam.amazonaws.com"] * 60
        + ["ec2.amazonaws.com"] * 4 + ["s3.amazonaws.com"] * 4
        + ["sts.amazonaws.com"] * 4 + ["kms.amazonaws.com"] * 4
    )
    regions = ["us-east-1"] * 38 + ["eu-west-1"] * 28 + ["us-west-2"] * 10
    # 16 AccessDenied + 2 ValidationException + 58 clean → ~22.5% error
    # rate over the whole 80-event pile.
    error_codes = (
        ["AccessDenied"] * 16 + ["ValidationException"] * 2 + [None] * 58
    )

    # The per-row ts offset gives the timeline a non-zero span; without it
    # run_digest's confidence floor (zero-span guard) fires.
    pile: list[dict] = [
        _ct_row(
            principal="arn:aws:iam::111111111111:role/AdminRole",
            event_source=service,
            event_name="CreateUser" if (offset % 3) == 0 else "ListUsers",
            source_ip="203.0.113.99",
            aws_region=region,
            error_code=code,
            ts=_BASE_TS + offset,
        )
        for offset, (service, region, code) in enumerate(
            zip(services, regions, error_codes)
        )
    ]

    # 4 background events from 4 distinct (principal, service, IP) tuples —
    # just enough to clear POPULATION_FLOOR on each cliff.
    background = (
        ("arn:aws:iam::111111111111:role/BuildBot",
         "ec2.amazonaws.com", "203.0.113.10"),
        ("arn:aws:iam::111111111111:user/alice",
         "s3.amazonaws.com", "203.0.113.11"),
        ("arn:aws:iam::111111111111:user/bob",
         "sts.amazonaws.com", "203.0.113.12"),
        ("arn:aws:iam::111111111111:user/carol",
         "kms.amazonaws.com", "203.0.113.13"),
    )
    pile.extend(
        _ct_row(
            principal=who,
            event_source=service,
            event_name="DescribeFoo",
            source_ip=address,
            aws_region="us-east-1",
            ts=_BASE_TS + 76 + position,
        )
        for position, (who, service, address) in enumerate(background)
    )
    return pile
714
+
715
+
716
def test_sleepy_card_is_quiet_with_zero_ledes() -> None:
    """The sleepy pile dashes every gating slot and yields no insights."""
    body = ct_digest.summarize(_ct_df(_build_sleepy_rows()))
    # Compute the slots once, from their own frame, instead of rebuilding
    # the fixture → frame → slots pipeline for every single lookup.
    # NOTE(review): assumes _compute_slots returns a re-iterable sequence
    # (list/tuple), not a one-shot generator — confirm.
    slots = _compute_slots(_ct_df(_build_sleepy_rows()))
    # Every gating slot dashes.
    for label in ("principal-vol", "event-source", "source-ip", "error-rate"):
        assert _slot_by_label(slots, label).cells is None, (
            f"sleepy pile: {label} unexpectedly fired"
        )
    # Both dist slots speak.
    assert _slot_by_label(slots, "lane-split").cells == [
        "interactive 10% / service 90%",
    ]
    assert _slot_by_label(slots, "region").cells == [
        "us-east-1 100%",
    ]
    # No gating slot → no insight.
    assert body["insights"] == []
    # Card renders without absent-footer machinery (no slot is ABSENT
    # under the flat grammar — non-speaking just vanishes from fields).
    text = _render(_card_from_body(body))
    assert "cloudtrail ·" in text  # identity-line-3 schema label
    assert "N.B." not in text
    assert "── digest" not in text  # header rule is gone
738
+
739
+
740
def test_attack_card_fires_multiple_ledes() -> None:
    """The attack pile fires all gating slots and names the culprits in ledes."""
    body = ct_digest.summarize(_ct_df(_build_attack_rows()))
    # Compute the slots once, from their own frame, instead of rebuilding
    # the fixture → frame → slots pipeline for every single lookup.
    # NOTE(review): assumes _compute_slots returns a re-iterable sequence
    # (list/tuple), not a one-shot generator — confirm.
    slots = _compute_slots(_ct_df(_build_attack_rows()))
    # All four gating slots fire.
    for label in ("principal-vol", "event-source", "source-ip", "error-rate"):
        assert _slot_by_label(slots, label).cells is not None, (
            f"attack pile: {label} failed to fire"
        )
    # lane-split renders 100/0.
    assert _slot_by_label(slots, "lane-split").cells == [
        "interactive 100% / service 0%",
    ]
    # region renders top-3 with the dominant region first.
    region_cell = _slot_by_label(slots, "region").cells[0]
    assert region_cell.startswith("us-east-1 ")
    assert region_cell.count("·") == 2
    # AdminRole / dominant IP / top error code all named in some lede.
    assert any("AdminRole" in lede for lede in body["insights"])
    src_ip_lede = next(
        (lede for lede in body["insights"] if "203.0.113.99" in lede), None
    )
    assert src_ip_lede is not None
    # Source-ip lede has the new share contract — no ratio-against-next clause.
    assert "x the next" not in src_ip_lede
    assert "more than" not in src_ip_lede
    assert any("AccessDenied" in lede for lede in body["insights"])
    # Card renders — flat grammar, no header rule.
    text = _render(_card_from_body(body))
    assert "cloudtrail ·" in text
    assert "203.0.113.99" in text  # insight surfaces the dominant IP
769
+
770
+
771
+ # ─── CLI dispatch ───────────────────────────────────────────────────────────
772
+
773
def _spy_run_digest(monkeypatch) -> dict:
    """Swap ``runner.run_digest`` for a recorder and return its record.

    The returned dict starts empty and is updated in place with the keyword
    arguments of every call made to the patched ``run_digest``.
    """
    seen: dict[str, Any] = {}

    def _record_call(**kwargs):
        # Record only; the replacement returns None and runs no digest.
        seen.update(kwargs)

    monkeypatch.setattr(runner, "run_digest", _record_call)
    return seen
781
+
782
+
783
def _stub_config(monkeypatch, cfg_dict: dict) -> None:
    """Patch ``cli.cfg.load`` so it returns ``cfg_dict`` for any path."""

    def _load(_path):
        # The path argument is accepted and ignored.
        return cfg_dict

    monkeypatch.setattr(cli.cfg, "load", _load)
785
+
786
+
787
# One newline-terminated CloudTrail event, serialised with json.dumps'
# default ", " / ": " separators; written out by _write_ct_sniff_file.
_CT_NDJSON_LINE = json.dumps({
    "eventVersion": "1.08",
    "eventTime": "2026-06-01T12:00:00Z",
    "userIdentity": {"type": "IAMUser"},
    "eventName": "GetObject",
    "eventSource": "s3.amazonaws.com",
    "sourceIPAddress": "192.0.2.10",
}) + "\n"
792
+
793
+
794
def _write_ct_sniff_file(tmp_path: Path) -> Path:
    """Write the single sample CloudTrail line under ``tmp_path``.

    Returns the path of the created ``cloudtrail.json.log`` file.
    """
    target = tmp_path / "cloudtrail.json.log"
    with target.open("w", encoding="utf-8") as handle:
        handle.write(_CT_NDJSON_LINE)
    return target
798
+
799
+
800
def test_cli_digest_cloudtrail_file_sniffs_and_routes_to_cloudtrail_dir(
    tmp_path, monkeypatch,
) -> None:
    """A positional CloudTrail file selects the cloudtrail schema.

    The file path is forwarded as ``cloudtrail_dir`` and the other source
    directories are left unset.
    """
    seen = _spy_run_digest(monkeypatch)
    _stub_config(monkeypatch, {"loghunter": {}})
    sample = _write_ct_sniff_file(tmp_path)

    cli._main(["digest", str(sample)])

    assert seen.get("schema") == "cloudtrail"
    assert seen.get("cloudtrail_dir") == str(sample)
    for other_source in ("zeek_dir", "pihole_dir", "syslog_dir"):
        assert seen.get(other_source) is None
812
+
813
+
814
def test_cli_digest_cloudtrail_bare_falls_back_to_conn_default(tmp_path, monkeypatch) -> None:
    """A bare `digest` resolves to schema=conn even with cloudtrail_dir set.

    A configured cloudtrail_dir cannot drive a bare digest on its own; that
    is the documented consequence of removing the schema token. To get a
    cloudtrail digest, the user passes a CloudTrail file as the positional.
    """
    seen = _spy_run_digest(monkeypatch)
    configured_dir = tmp_path / "ct"
    configured_dir.mkdir()
    _stub_config(
        monkeypatch,
        {"loghunter": {"cloudtrail_dir": str(configured_dir)}},
    )

    cli._main(["digest"])

    assert seen.get("schema") == "conn"
    assert seen.get("cloudtrail_dir") is None
828
+
829
+
830
def test_cli_digest_cloudtrail_file_with_since_flag(tmp_path, monkeypatch) -> None:
    """``--since`` is forwarded alongside a sniffed CloudTrail file."""
    seen = _spy_run_digest(monkeypatch)
    _stub_config(monkeypatch, {"loghunter": {}})
    sample = _write_ct_sniff_file(tmp_path)

    cli._main(["digest", str(sample), "--since=7d"])

    assert seen.get("schema") == "cloudtrail"
    assert seen.get("cloudtrail_dir") == str(sample)
    # Only the presence of a value is checked, not its parsed form.
    assert seen.get("since") is not None
838
+
839
+
840
+ # ─── Runner-level dispatch ──────────────────────────────────────────────────
841
+
842
def test_run_digest_rejects_zeek_dir_at_programmatic_boundary(tmp_path) -> None:
    """run_digest raises ValueError when zeek_dir accompanies the cloudtrail schema."""
    expected_message = "zeek_dir is not valid for the cloudtrail schema"
    empty_config: dict[str, Any] = {"loghunter": {}}
    with pytest.raises(ValueError, match=expected_message):
        runner.run_digest(
            config=empty_config,
            schema="cloudtrail",
            cloudtrail_dir=tmp_path,
            zeek_dir=tmp_path / "zeek",
        )
851
+
852
+
853
def test_run_digest_rejects_pihole_dir_at_programmatic_boundary(tmp_path) -> None:
    """run_digest raises ValueError when pihole_dir accompanies the cloudtrail schema."""
    expected_message = "pihole_dir is not valid for the cloudtrail schema"
    empty_config: dict[str, Any] = {"loghunter": {}}
    with pytest.raises(ValueError, match=expected_message):
        runner.run_digest(
            config=empty_config,
            schema="cloudtrail",
            cloudtrail_dir=tmp_path,
            pihole_dir=tmp_path / "pihole",
        )
862
+
863
+
864
def test_run_digest_rejects_syslog_dir_at_programmatic_boundary(tmp_path) -> None:
    """run_digest raises ValueError when syslog_dir accompanies the cloudtrail schema."""
    expected_message = "syslog_dir is not valid for the cloudtrail schema"
    empty_config: dict[str, Any] = {"loghunter": {}}
    with pytest.raises(ValueError, match=expected_message):
        runner.run_digest(
            config=empty_config,
            schema="cloudtrail",
            cloudtrail_dir=tmp_path,
            syslog_dir=tmp_path / "syslog",
        )
873
+
874
+
875
def test_run_digest_rejects_missing_cloudtrail_dir(tmp_path) -> None:
    """run_digest raises ValueError when the cloudtrail schema has no directory."""
    empty_config: dict[str, Any] = {"loghunter": {}}
    # No cloudtrail_dir is passed and none is present in the config.
    with pytest.raises(ValueError, match="cloudtrail_dir not configured"):
        runner.run_digest(schema="cloudtrail", config=empty_config)
879
+
880
+
881
+ # ─── End-to-end via run_digest ──────────────────────────────────────────────
882
+
883
def _row_to_wire_event(row: dict) -> dict:
    """Convert a canonical row dict into a CloudTrail wire-format event.

    The loader/parser pipeline consumes wire JSON (eventTime, userIdentity,
    eventSource, …) and emits canonical rows. Going the other way lets the
    end-to-end tests write synthetic files without maintaining a separate
    JSON fixture.

    Args:
        row: Must contain ``identity_type``, ``principal``, ``ts`` (epoch
            seconds), ``event_source``, ``event_name``, ``source_ip``,
            ``aws_region``, ``event_id`` and ``error_code``.

    Returns:
        The wire event. ``errorCode`` is present only when
        ``row["error_code"]`` is not None.
    """
    kind = row["identity_type"]
    who: dict[str, Any] = {"type": kind}
    # Put the principal in the userIdentity field that fits the identity
    # type, so the parser's derived principal is meant to match the row's.
    if kind == "AWSService":
        who["invokedBy"] = row["principal"]
    elif kind == "AssumedRole":
        who["sessionContext"] = {
            "sessionIssuer": {"userName": row["principal"]},
        }
    elif kind == "IAMUser":
        # Carry the full arn, plus its last slash-segment as userName (the
        # whole principal when it contains no slash).
        arn = row["principal"]
        who["arn"] = arn
        who["userName"] = arn.rsplit("/", 1)[-1] if "/" in arn else arn
    # Root and any other identity type carry only the "type" key.

    moment = datetime.fromtimestamp(row["ts"], tz=timezone.utc)
    wire: dict[str, Any] = {
        "eventTime": moment.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "userIdentity": who,
        "eventSource": row["event_source"],
        "eventName": row["event_name"],
        "sourceIPAddress": row["source_ip"],
        "awsRegion": row["aws_region"],
        "eventID": row["event_id"],
    }
    code = row["error_code"]
    if code is not None:
        wire["errorCode"] = code
    return wire
922
+
923
+
924
def _write_ndjson(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as newline-delimited CloudTrail wire JSON.

    Missing parent directories are created. Each row is converted with
    ``_row_to_wire_event`` and serialised on its own line.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        # The generator keeps conversion and writing interleaved, row by row.
        sink.writelines(
            json.dumps(_row_to_wire_event(entry)) + "\n" for entry in rows
        )
930
+
931
+
932
def test_run_digest_cloudtrail_end_to_end_renders_a_card(tmp_path, capsys) -> None:
    """Full path: synthetic NDJSON file → run_digest → card on stdout.

    Under the flat grammar the identity line carries the schema label, an
    insight surfaces the dominant IP, and the dist slots (lane-split,
    region) always render as fields. Promoted-insight slots do NOT also
    render as fields.
    """
    log_dir = tmp_path / "ct"
    _write_ndjson(log_dir / "events.json.log", _build_attack_rows())

    empty_config: dict[str, Any] = {"loghunter": {}}
    runner.run_digest(
        config=empty_config,
        schema="cloudtrail",
        cloudtrail_dir=log_dir,
        load_all=True,
        skip_confirm=True,
    )
    printed = capsys.readouterr().out

    # Schema label, the two always-rendered dist slot fields, and the
    # dominant IP of the attack pile.
    for fragment in ("cloudtrail ·", "lane-split:", "region:", "203.0.113.99"):
        assert fragment in printed
    # No header rule and no footer machinery under the flat grammar.
    for fragment in ("── digest", "N.B.", "ABSENT"):
        assert fragment not in printed
959
+
960
+
961
def test_run_digest_cloudtrail_end_to_end_sleepy_pile_is_quiet(tmp_path, capsys) -> None:
    """Sleepy pile end to end: only the two dist slots reach the output.

    Every gating slot dashes (non-speaking), so those slots vanish from the
    fields block; lane-split and region are the only ones that survive.
    """
    log_dir = tmp_path / "ct"
    _write_ndjson(log_dir / "events.json.log", _build_sleepy_rows())

    empty_config: dict[str, Any] = {"loghunter": {}}
    runner.run_digest(
        config=empty_config,
        schema="cloudtrail",
        cloudtrail_dir=log_dir,
        load_all=True,
        skip_confirm=True,
    )
    printed = capsys.readouterr().out

    for fragment in (
        "cloudtrail ·",
        "interactive 10% / service 90%",
        "us-east-1 100%",
    ):
        assert fragment in printed
    # Non-speaking gating slots vanish — no label appears in the fields.
    for gating_label in ("principal-vol:", "event-source:",
                         "source-ip:", "error-rate:"):
        assert gating_label not in printed
    assert "ABSENT" not in printed