loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,980 @@
|
|
|
1
|
+
"""Tests for the cloudtrail digest card (six fixed slots, lane-scoped pair).
|
|
2
|
+
|
|
3
|
+
Covers:
|
|
4
|
+
- lane-split as a dist slot (always shows; both shares; never produces a lede)
|
|
5
|
+
- principal-vol interactive-scoping (service-lane dominant principal must
|
|
6
|
+
not bleed into the interactive cliff; floor + gate both proved to dash)
|
|
7
|
+
- event-source cliff over the WHOLE pile (interactive + service together)
|
|
8
|
+
- source-ip interactive-scoping (one IP dominating interactive speaks;
|
|
9
|
+
service-lane source_ip hostnames like "s3.amazonaws.com" must NOT count)
|
|
10
|
+
- region dist (single-region → "us-east-1 100%"; multi-region → top-3)
|
|
11
|
+
- error-rate (kind = error_code.notna(); top contributor is the error CODE,
|
|
12
|
+
not a principal; literal notna semantics; RATE_FLOOR gates real piles)
|
|
13
|
+
- ledes from gating slots only — neither lane-split nor region prose may
|
|
14
|
+
leak into a lede
|
|
15
|
+
- sleepy whole-pile card: quiet-honest; mostly dashes; zero ledes
|
|
16
|
+
- attack-shaped whole-pile card: multiple gating slots fire
|
|
17
|
+
- CLI dispatch and runner-boundary plumbing for cloudtrail_dir
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
import io
|
|
23
|
+
import json
|
|
24
|
+
from datetime import datetime, timedelta, timezone
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
import pandas as pd
|
|
29
|
+
import pytest
|
|
30
|
+
|
|
31
|
+
import loghunter.cli as cli
|
|
32
|
+
import loghunter.runner as runner
|
|
33
|
+
from loghunter.common.finding import DigestCard, RunSummary
|
|
34
|
+
from loghunter.digest import cloudtrail as ct_digest
|
|
35
|
+
from loghunter.outputs.text import TextHandler
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# ─── Fixtures ────────────────────────────────────────────────────────────────
|
|
39
|
+
|
|
40
|
+
# Fixed reference instant shared by the fixtures below so expected values
# stay deterministic across runs.
_NOW = datetime(2026, 6, 11, 12, 0, tzinfo=timezone.utc)

# The same instant as POSIX seconds; used as the default ``ts`` of _ct_row.
_BASE_TS = _NOW.timestamp()

# Column set (and order) used when building test frames in _ct_df.
_CT_COLUMNS = [
    "ts",
    "principal",
    "lane",
    "read_write",
    "event_source",
    "event_name",
    "identity_type",
    "source_ip",
    "error_code",
    "aws_region",
    "event_id",
    "raw",
]
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _ct_row(
    principal: str = "arn:aws:iam::111111111111:user/alice",
    lane: str = "interactive",
    event_source: str = "iam.amazonaws.com",
    event_name: str = "ListUsers",
    source_ip: str = "203.0.113.10",
    aws_region: str = "us-east-1",
    error_code: object = None,
    identity_type: str = "IAMUser",
    read_write: str = "read",
    event_id: str = "evt-0001",
    ts: float = _BASE_TS,
) -> dict:
    """Return one CloudTrail-shaped row dict filled with placeholder values.

    With no arguments the row is an error-free interactive IAM read
    (``ListUsers``) by a sample user. Callers override only the columns a
    test cares about; every other column keeps its sample default, so the
    dict always carries all 12 keys listed in ``_CT_COLUMNS``.
    """
    # Keyword order follows _CT_COLUMNS; ``raw`` is a fresh empty dict on
    # every call so rows never share a mutable payload.
    return dict(
        ts=ts,
        principal=principal,
        lane=lane,
        read_write=read_write,
        event_source=event_source,
        event_name=event_name,
        identity_type=identity_type,
        source_ip=source_ip,
        error_code=error_code,
        aws_region=aws_region,
        event_id=event_id,
        raw={},
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _ct_df(rows: list[dict]) -> pd.DataFrame:
    """Build a test frame with the ``_CT_COLUMNS`` column set from row dicts.

    An empty ``rows`` list yields an empty frame that still has every
    column in ``_CT_COLUMNS``.
    """
    if rows:
        return pd.DataFrame(rows, columns=_CT_COLUMNS)
    return pd.DataFrame(columns=_CT_COLUMNS)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _slot_by_label(slots_or_frame, label):
    """Return the first computed slot whose ``label`` attribute equals *label*.

    ``slots_or_frame`` may be:

    - a ``pd.DataFrame`` (preferred for new tests) — the slot list is
      re-derived from it via ``_compute_slots``; or
    - an already-built iterable of slot objects (legacy form), kept so
      older callers that passed a slot list keep working.

    Raises ``AssertionError`` when no slot carries the requested label.
    """
    candidates = (
        _compute_slots(slots_or_frame)
        if isinstance(slots_or_frame, pd.DataFrame)
        else slots_or_frame
    )
    found = next((slot for slot in candidates if slot.label == label), None)
    if found is None:
        raise AssertionError(f"no slot with label {label!r}")
    return found
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _run_summary(
    window: tuple[datetime, datetime] = (_NOW - timedelta(days=1), _NOW),
) -> RunSummary:
    """Return a RunSummary stub for a cloudtrail-only run over ``window``.

    Every field other than ``data_window`` is a fixed placeholder: one
    record-count entry of 100, zero bytes, and no detectors or notes.
    """
    summary_fields = {
        "data_window": window,
        "record_counts": {"*.json*": 100},
        "data_size_bytes": 0,
        "detectors_run": [],
        "detectors_skipped": {},
        "notes": [],
        "data_sources": ["cloudtrail"],
    }
    return RunSummary(**summary_fields)
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _card_from_body(body: dict) -> DigestCard:
    """Wrap a summariser ``body`` dict in a DigestCard with fixed header values.

    ``body`` must provide the keys ``zone1_extras``, ``insights`` and
    ``fields``; the schema, source name, window, record count and
    histogram are constant placeholders.
    """
    window_start = _NOW - timedelta(days=1)
    hourly_counts = [1, 2, 3, 5, 8, 5, 3, 2, 1]
    return DigestCard(
        schema="cloudtrail",
        source_name="cloudtrail.json.log",
        data_window=(window_start, _NOW),
        record_count=100,
        histogram_counts=hourly_counts,
        histogram_unit="hr",
        histogram_peak=8,
        zone1_extras=body["zone1_extras"],
        insights=body["insights"],
        fields=body["fields"],
    )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _render(card: DigestCard) -> str:
    """Render ``card`` via TextHandler into an in-memory stream and return the text."""
    sink = io.StringIO()
    TextHandler(stream=sink, verbose_level=0).render_digest(card)
    return sink.getvalue()
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _compute_slots(frame: pd.DataFrame) -> list:
    """Re-derive the six cloudtrail slots for ``frame``, in declared order.

    The summariser body exposes only post-selection display state
    (``fields``), so tests that need a specific slot's computed state
    rebuild the list here by calling the six ``ct_digest._slot_*``
    computers directly.

    principal-vol and source-ip receive only the rows whose ``lane`` is
    ``"interactive"``; the other four slots receive the whole frame. When
    the frame has no ``lane`` column the interactive subset is empty.
    """
    if "lane" not in frame.columns:
        interactive_rows = frame.iloc[0:0]
    else:
        interactive_rows = frame[frame["lane"] == "interactive"]

    # (computer, input frame) pairs in the order the card declares them.
    plan = (
        (ct_digest._slot_lane_split, frame),
        (ct_digest._slot_principal_vol, interactive_rows),
        (ct_digest._slot_event_source, frame),
        (ct_digest._slot_source_ip, interactive_rows),
        (ct_digest._slot_region, frame),
        (ct_digest._slot_error_rate, frame),
    )
    return [compute(scope) for compute, scope in plan]
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
# ─── lane-split (dist; whole pile; always shows) ────────────────────────────
|
|
171
|
+
|
|
172
|
+
def test_lane_split_renders_both_shares() -> None:
    """lane-split reports both lane shares in one dist cell (3 interactive, 7 service)."""
    frame = _ct_df(
        [_ct_row(lane="interactive") for _ in range(3)]
        + [_ct_row(lane="service") for _ in range(7)]
    )
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "lane-split")
    assert slot.statistic == "dist"
    assert slot.cells == ["interactive 30% / service 70%"]
    # dist never carries entity / ratio / magnitude; checked one by one so a
    # failure names the offending attribute.
    assert slot.entity is None
    assert slot.ratio is None
    assert slot.magnitude is None
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def test_lane_split_all_interactive_renders_zero_service() -> None:
    """An all-interactive pile still shows the service share, as 0%."""
    frame = _ct_df([_ct_row(lane="interactive") for _ in range(5)])
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "lane-split")
    assert slot.cells == ["interactive 100% / service 0%"]
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
def test_lane_split_empty_frame_renders_no_events_placeholder() -> None:
    """An empty frame makes lane-split show the "(no events)" placeholder."""
    frame = _ct_df([])
    # Smoke check: summarize() must not raise on an empty frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "lane-split")
    assert slot.cells == ["(no events)"]
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def test_lane_split_missing_lane_column_renders_no_lane_placeholder() -> None:
    """A frame without a ``lane`` column makes lane-split show "(no lane)"."""
    # Drop the column entirely — mirrors the dns qtype-mix dual-fallback contract.
    frame = pd.DataFrame([
        {key: value for key, value in _ct_row().items() if key != "lane"}
        for _ in range(3)
    ])
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "lane-split")
    assert slot.cells == ["(no lane)"]
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# ─── principal-vol (cliff; INTERACTIVE-SCOPED) ──────────────────────────────
|
|
210
|
+
|
|
211
|
+
def test_principal_vol_speaks_with_dominant_interactive_principal() -> None:
    """principal-vol names the dominant interactive principal on a clear cliff."""
    admin = "arn:aws:iam::111111111111:role/AdminRole"
    # 5 distinct interactive principals, clear rank1/rank2 cliff (20 vs 1).
    rows: list[dict] = [_ct_row(principal=admin) for _ in range(20)]
    rows += [
        _ct_row(principal=f"arn:aws:iam::111111111111:{name}")
        for name in ("user/alice", "user/bob", "user/carol", "user/dave")
    ]
    frame = _ct_df(rows)
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "principal-vol")
    assert slot.statistic == "cliff"
    assert slot.entity == admin
    assert slot.ratio is not None and slot.ratio >= 2.0
    # Cell renders the share-of-interactive percentage.
    assert slot.cells is not None
    assert slot.cells[0] == admin
    assert slot.cells[1].endswith("%")
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def test_principal_vol_dashes_when_interactive_neckandneck_despite_service_dominator() -> None:
    """Proves both (a) scoping and (b) the cliff floor.

    Service lane has one dominant principal that would WIN a whole-pile cliff.
    Interactive lane has only two principals (below POPULATION_FLOOR=5), so
    even though one of them dominates within interactive, the slot must dash
    — the spec calls this out explicitly.
    """
    # 10 service rows all from the same service principal — would dominate
    # the whole-pile cliff if the interactive filter were forgotten.
    rows: list[dict] = [
        _ct_row(
            principal="lambda.amazonaws.com",
            lane="service",
            identity_type="AWSService",
            event_source="lambda.amazonaws.com",
            source_ip="lambda.amazonaws.com",
        )
        for _ in range(10)
    ]
    # 5 interactive rows split 3/2 between two principals — below floor.
    rows += [
        _ct_row(principal="arn:aws:iam::111111111111:user/alice")
        for _ in range(3)
    ]
    rows += [
        _ct_row(principal="arn:aws:iam::111111111111:user/bob")
        for _ in range(2)
    ]
    frame = _ct_df(rows)
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "principal-vol")
    assert slot.cells is None  # dashed
    assert slot.entity is None
    assert slot.ratio is None
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
# ─── event-source (cliff; WHOLE pile) ───────────────────────────────────────
|
|
260
|
+
|
|
261
|
+
def test_event_source_cliff_counts_whole_pile() -> None:
    """event-source counts interactive + service rows together (whole-pile)."""
    # 25 interactive iam events.
    rows: list[dict] = [
        _ct_row(event_source="iam.amazonaws.com") for _ in range(25)
    ]
    # 4 service rows across 4 other services — without the service rows the
    # population would only be 1 distinct source and the slot would dash;
    # whole-pile counting brings the population to 5.
    rows += [
        _ct_row(
            lane="service",
            event_source=src,
            principal=src,
            identity_type="AWSService",
            source_ip=src,
        )
        for src in (
            "ec2.amazonaws.com",
            "s3.amazonaws.com",
            "sts.amazonaws.com",
            "kms.amazonaws.com",
        )
    ]
    frame = _ct_df(rows)
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "event-source")
    assert slot.entity == "iam.amazonaws.com"
    assert slot.cells is not None
    assert slot.cells[0] == "iam.amazonaws.com"
    assert slot.cells[1] == "25"  # count, right-justified by handler
    assert slot.ratio is not None and slot.ratio >= 2.0
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
# ─── source-ip (share; INTERACTIVE-SCOPED; cell vs entity split) ────────────
|
|
285
|
+
|
|
286
|
+
def test_source_ip_speaks_with_one_dominant_interactive_ip() -> None:
    """20 events from one IP + 4 IPs at 1 each = 24 interactive,
    top_share = 20/24 ≈ 83% ≥ SHARE_GATE → speaks."""
    rows: list[dict] = [_ct_row(source_ip="203.0.113.99") for _ in range(20)]
    rows += [
        _ct_row(source_ip=ip)
        for ip in ("203.0.113.10", "203.0.113.11",
                   "203.0.113.12", "203.0.113.13")
    ]
    frame = _ct_df(rows)
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.statistic == "share"
    # Entity carries the actual IP for the lede…
    assert slot.entity == "203.0.113.99"
    # …but the table cell leads with "1 IP" for at-a-glance concentration.
    # Exactly TWO cells — share has no rank-2 ratio.
    assert slot.cells == ["1 IP", "83% of interactive"]
    # No rank-2 ratio on a share slot.
    assert slot.ratio is None
    assert slot.magnitude is not None and 82 <= slot.magnitude <= 84
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
def test_source_ip_speaks_on_two_distinct_ips_with_dominant_share() -> None:
    """The exact regression case the share-statistic fix unblocks.

    99 events from one IP + 1 from another → 2 distinct IPs total. The
    OLD cliff-based slot dashed here because 2 < POPULATION_FLOOR=5, even
    though concentration is 99%. The NEW share-based slot must speak and
    name the IP — that low cardinality is the SIGNAL, not noise.
    """
    rows = [_ct_row(source_ip="203.0.113.99") for _ in range(99)]
    rows.append(_ct_row(source_ip="203.0.113.10"))
    frame = _ct_df(rows)
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.statistic == "share"
    assert slot.entity == "203.0.113.99"
    assert slot.cells == ["1 IP", "99% of interactive"]
|
|
323
|
+
|
|
324
|
+
|
|
325
|
+
def test_source_ip_speaks_on_single_distinct_ip_at_100_percent() -> None:
    """10 events, all one IP → 1 distinct IP. top_share = 1.0 → speaks."""
    frame = _ct_df([_ct_row(source_ip="203.0.113.99") for _ in range(10)])
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.statistic == "share"
    assert slot.entity == "203.0.113.99"
    assert slot.magnitude == 100.0
    assert slot.cells == ["1 IP", "100% of interactive"]
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def test_source_ip_dashes_just_below_share_gate() -> None:
    """Locks SHARE_GATE = 0.80 as the threshold. 79 dominant + 21 spread
    → top_share = 0.79, just below gate → dashes."""
    rows = [_ct_row(source_ip="203.0.113.99") for _ in range(79)]
    # Spread 21 across many other IPs so no single IP-other clears the gate.
    rows += [_ct_row(source_ip=f"203.0.113.{100+i}") for i in range(21)]
    frame = _ct_df(rows)
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.cells is None
    assert slot.statistic == "share"
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def test_source_ip_dashes_on_diverse_interactive_sources() -> None:
    """Spread distribution → top_share = 1/N << SHARE_GATE → dashes."""
    # Eight distinct IPs, one event each.
    frame = _ct_df([
        _ct_row(source_ip=f"203.0.113.{i}") for i in range(10, 18)
    ])
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.cells is None
    assert slot.statistic == "share"
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def test_source_ip_excludes_service_lane_hostnames() -> None:
    """A service-lane source_ip hostname (e.g. s3.amazonaws.com) must NOT
    affect source-ip — proves interactive scoping is doing real work.

    Without the interactive filter, the whole-pile share would compute
    25/31 ≈ 81% on "s3.amazonaws.com" — above SHARE_GATE — and the slot
    would speak on a service hostname. The interactive filter keeps that
    out: interactive lane has 6 IPs at 1 event each (top_share = 1/6 ≈
    17% << gate) → dashes.
    """
    rows: list[dict] = [
        _ct_row(
            lane="service",
            source_ip="s3.amazonaws.com",
            principal="s3.amazonaws.com",
            identity_type="AWSService",
            event_source="s3.amazonaws.com",
        )
        for _ in range(25)
    ]
    rows += [
        _ct_row(source_ip=ip)
        for ip in ("203.0.113.10", "203.0.113.11", "203.0.113.12",
                   "203.0.113.13", "203.0.113.14", "203.0.113.15")
    ]
    frame = _ct_df(rows)
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "source-ip")
    assert slot.cells is None  # dashed — proves the scoping
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def test_source_ip_lede_omits_ratio_phrase() -> None:
    """Lede-formatter contract for share slots: the source-ip lede carries
    no 'Nx the next' or 'more than' clause — a concentration figure has no
    peer to compare against — and ends with "interactive events."."""
    dominant_ip = "203.0.113.99"
    background_ips = ("203.0.113.10", "203.0.113.11",
                      "203.0.113.12", "203.0.113.13", "203.0.113.14")
    # 95 events from the dominant IP, then one event from each background IP.
    rows = [_ct_row(source_ip=dominant_ip) for _ in range(95)]
    rows.extend(_ct_row(source_ip=ip) for ip in background_ips)

    body = ct_digest.summarize(_ct_df(rows))

    # First lede that mentions the dominant IP, or None when there is none.
    matching = [lede for lede in body["insights"] if "203.0.113.99" in lede]
    source_ip_lede = matching[0] if matching else None
    assert source_ip_lede is not None
    assert "x the next" not in source_ip_lede
    assert "more than" not in source_ip_lede
    assert source_ip_lede.endswith("interactive events.")
|
|
403
|
+
|
|
404
|
+
|
|
405
|
+
def test_source_ip_high_share_outranks_mid_cliff_in_salience() -> None:
    """A high-share source-ip lede must rank above an event-source lede.

    The pile has 100 interactive events: 95 from one IP (share 95%) spread
    over four event sources (30 / 25 / 20 / 20), plus 5 background events
    on a fifth source. The top two event sources are therefore 30 vs 25
    (ratio 1.2), so the event-source cliff is weak and its lede may not
    appear at all. The source-ip lede must be present; the ordering is
    only asserted when an event-source lede is also present.
    """
    # 95 events from one IP, but spread across multiple event_sources so
    # the event-source cliff is weak.
    sources = (
        ["iam.amazonaws.com"] * 30
        + ["ec2.amazonaws.com"] * 25
        + ["s3.amazonaws.com"] * 20
        + ["sts.amazonaws.com"] * 20
    )
    rows: list[dict] = [
        _ct_row(source_ip="203.0.113.99", event_source=src) for src in sources
    ]
    # 5 background events so event-source clears POPULATION_FLOOR
    # (a fifth distinct source).
    rows += [
        _ct_row(
            source_ip=f"203.0.113.{10+i}",
            event_source="kms.amazonaws.com",
        )
        for i in range(5)
    ]
    body = ct_digest.summarize(_ct_df(rows))
    ledes = body["insights"]
    src_idx = next(
        (i for i, lede in enumerate(ledes) if "203.0.113.99" in lede), None
    )
    # NOTE(review): the event-source lede is recognised by containing both
    # "iam.amazonaws.com" and "service" — confirm against the lede wording.
    src_evt_idx = next(
        (i for i, lede in enumerate(ledes)
         if "iam.amazonaws.com" in lede and "service" in lede), None
    )
    assert src_idx is not None
    # event-source may not even make top-3, but if it does, source-ip outranks it.
    if src_evt_idx is not None:
        assert src_idx < src_evt_idx
|
|
436
|
+
|
|
437
|
+
|
|
438
|
+
# ─── region (dist; WHOLE pile; never produces a lede) ───────────────────────
|
|
439
|
+
|
|
440
|
+
def test_region_single_region_renders_100_percent() -> None:
    """A single-region pile renders the region dist cell as "us-east-1 100%"."""
    frame = _ct_df([_ct_row(aws_region="us-east-1") for _ in range(8)])
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "region")
    assert slot.statistic == "dist"
    assert slot.cells == ["us-east-1 100%"]
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def test_region_multi_region_renders_top_three_with_separator() -> None:
    """Three regions (40 / 30 / 10 events) render as one "·"-separated cell."""
    rows: list[dict] = [
        _ct_row(aws_region=region)
        for region, count in (("us-east-1", 40), ("eu-west-1", 30), ("us-west-2", 10))
        for _ in range(count)
    ]
    frame = _ct_df(rows)
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "region")
    assert slot.cells == ["us-east-1 50% · eu-west-1 38% · us-west-2 12%"]
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def test_region_caps_at_top_three() -> None:
    """With five regions present, only the three largest appear in the cell."""
    rows: list[dict] = [
        _ct_row(aws_region=region)
        for region, n in (("us-east-1", 30), ("eu-west-1", 20),
                          ("us-west-2", 10), ("ap-south-1", 5),
                          ("eu-central-1", 5))
        for _ in range(n)
    ]
    frame = _ct_df(rows)
    # Smoke check: summarize() must not raise on this frame (result not needed).
    ct_digest.summarize(frame)
    slot = _slot_by_label(frame, "region")
    assert slot.cells is not None
    assert slot.cells[0].count("·") == 2  # exactly three entries → two separators
    # Lower-ranked regions must NOT appear.
    assert "ap-south-1" not in slot.cells[0]
    assert "eu-central-1" not in slot.cells[0]
|
|
475
|
+
|
|
476
|
+
|
|
477
|
+
def test_region_empty_and_missing_column_have_distinct_fallbacks() -> None:
    """region shows "(no events)" for an empty frame and "(no region)" when
    the ``aws_region`` column is absent."""
    empty = _ct_df([])
    # Smoke check: summarize() must not raise on an empty frame (result not needed).
    ct_digest.summarize(empty)
    assert _slot_by_label(empty, "region").cells == ["(no events)"]

    no_region = pd.DataFrame([
        {key: value for key, value in _ct_row().items() if key != "aws_region"}
        for _ in range(3)
    ])
    # Smoke check: summarize() must not raise without the column either.
    ct_digest.summarize(no_region)
    assert _slot_by_label(no_region, "region").cells == ["(no region)"]
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
# ─── error-rate (rate; WHOLE pile; names error CODE not principal) ──────────
|
|
488
|
+
|
|
489
|
+
def test_error_rate_dashes_when_no_errors() -> None:
    """error-rate has no cells when every row's error_code is None."""
    clean_rows = [_ct_row(error_code=None) for _ in range(20)]
    error_slot = _slot_by_label(_ct_df(clean_rows), "error-rate")
    assert error_slot.cells is None
|
|
493
|
+
|
|
494
|
+
|
|
495
|
+
def test_error_rate_dashes_below_rate_floor() -> None:
    """200 events with 1 errored = 0.5% < RATE_FLOOR (1%) — dashes via floor."""
    # 199 clean rows followed by a single AccessDenied row.
    rows = [
        *(_ct_row(error_code=None) for _ in range(199)),
        _ct_row(error_code="AccessDenied"),
    ]
    error_slot = _slot_by_label(_ct_df(rows), "error-rate")
    assert error_slot.cells is None
|
|
501
|
+
|
|
502
|
+
|
|
503
|
+
def test_error_rate_names_top_error_code_not_principal() -> None:
    """Top contributor is the most common errorCode, NOT a principal."""
    principals = [
        f"arn:aws:iam::111111111111:user/u{i}" for i in range(20)
    ]
    pool = len(principals)

    # 80 clean rows, then 15 AccessDenied and 5 ValidationException rows
    # spread over different principals so no single principal stands out.
    clean = [_ct_row(error_code=None) for _ in range(80)]
    denied = [
        _ct_row(principal=principals[i % pool], error_code="AccessDenied")
        for i in range(15)
    ]
    invalid = [
        _ct_row(principal=principals[(i + 7) % pool],
                error_code="ValidationException")
        for i in range(5)
    ]

    error_slot = _slot_by_label(_ct_df(clean + denied + invalid), "error-rate")
    assert error_slot.entity == "AccessDenied"
    assert error_slot.cells is not None
    assert error_slot.cells[1] == "AccessDenied"
    assert error_slot.magnitude is not None and 19 <= error_slot.magnitude <= 21
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def test_error_rate_notna_semantics_pin_none_nan_and_empty_string() -> None:
    """Literal .notna() — None and NaN read clean; "" reads as errored."""
    # 45 None rows, 45 NaN rows, then 10 empty-string rows (100 in total).
    none_rows = [_ct_row(error_code=None) for _ in range(45)]
    nan_rows = [_ct_row(error_code=float("nan")) for _ in range(45)]
    blank_rows = [_ct_row(error_code="") for _ in range(10)]

    error_slot = _slot_by_label(
        _ct_df(none_rows + nan_rows + blank_rows), "error-rate"
    )
    assert error_slot.cells is not None
    assert error_slot.entity == ""
    assert error_slot.magnitude is not None and 9 <= error_slot.magnitude <= 11
|
|
537
|
+
|
|
538
|
+
|
|
539
|
+
# ─── Ledes: dist slots never leak into prose ────────────────────────────────
|
|
540
|
+
|
|
541
|
+
def test_ledes_never_carry_dist_slot_prose() -> None:
    """Lede prose must never embed the lane-split or region fill text.

    The pile is all-interactive and single-region, so gating slots may fire
    ledes. The check runs against the rendered lede strings rather than slot
    labels: a label-presence check would miss a formatter that accidentally
    embedded "interactive 100% / service 0%" or region's "us-east-1 100%".
    """
    # All-interactive — drives lane-split to "interactive 100% / service 0%".
    pile: list[dict] = [
        _ct_row(
            principal="arn:aws:iam::111111111111:role/AdminRole",
            aws_region="us-east-1",
            source_ip="203.0.113.99",
            event_source="iam.amazonaws.com",
        )
        for _ in range(30)
    ]
    # 5 more principals / IPs / sources so population floors are met but
    # the cliff still fires.
    pile.extend(
        _ct_row(
            principal=f"arn:aws:iam::111111111111:user/u{n}",
            source_ip=f"203.0.113.{20+n}",
            event_source=f"svc{n}.amazonaws.com",
            aws_region="us-east-1",
        )
        for n in range(5)
    )

    ledes = ct_digest.summarize(_ct_df(pile))["insights"]
    assert ledes  # at least one cliff lede fired

    banned = ("interactive 100%", "service 0%", "/ service", "us-east-1 100%")
    for lede in ledes:
        for frag in banned:
            assert frag not in lede, (
                f"dist slot prose leaked into lede: {lede!r} contains {frag!r}"
            )
|
|
578
|
+
|
|
579
|
+
|
|
580
|
+
# ─── Summariser shape ───────────────────────────────────────────────────────
|
|
581
|
+
|
|
582
|
+
def test_summarize_returns_six_slots_in_fixed_order() -> None:
    """Slot computation yields the six slot labels in one fixed order.

    ``summarize`` is also invoked on an identical three-row pile purely as a
    smoke check (it must not raise); its return value is deliberately not
    inspected.
    """
    def three_row_frame():
        # Each consumer gets its own freshly built frame so neither call can
        # observe side effects of the other.
        return _ct_df([_ct_row() for _ in range(3)])

    ct_digest.summarize(three_row_frame())

    labels = [slot.label for slot in _compute_slots(three_row_frame())]
    assert labels == [
        "lane-split", "principal-vol", "event-source",
        "source-ip", "region", "error-rate",
    ]
|
|
589
|
+
|
|
590
|
+
|
|
591
|
+
def test_summarize_entity_label_and_zone1_extras() -> None:
    """``zone1_extras`` carries the distinct principal / event-source counts.

    Three rows: two distinct principals (alice twice, bob once) across three
    distinct event sources.
    """
    fixture = (
        ("alice", "iam.amazonaws.com"),
        ("bob", "ec2.amazonaws.com"),
        ("alice", "s3.amazonaws.com"),
    )
    events = [
        _ct_row(principal=f"arn:aws:iam::111111111111:user/{user}",
                event_source=service)
        for user, service in fixture
    ]
    extras = ct_digest.summarize(_ct_df(events))["zone1_extras"]
    # entity_label / entity_count are deleted from the body dict under the
    # flat grammar; zone1_extras carries the distinct-counts as the only
    # surface the renderer consumes.
    assert ("principals", "2") in extras
    assert ("event sources", "3") in extras
|
|
606
|
+
|
|
607
|
+
|
|
608
|
+
# ─── Whole-card rendering ───────────────────────────────────────────────────
|
|
609
|
+
|
|
610
|
+
def _build_sleepy_rows() -> list[dict]:
    """Return the canonical "sleepy" pile used by the quiet-card tests.

    50 events: 45 service-lane (25 lambda + 20 ec2) and 5 interactive
    (alice x3, bob x2), no error codes. The pile is designed so every
    cliff/rate slot is intended to dash:
      - principal-vol: 2 distinct interactive principals → below POPULATION_FLOOR
      - source-ip: 5 distinct interactive IPs, 1 each → ratio 1.0 < gate
      - event-source: rank1/rank2 = 25/20 = 1.25 < gate
      - error-rate: 0 errors → kind_count short-circuit
    """
    def service_event(endpoint: str, action: str) -> dict:
        # Service-lane rows use the service endpoint as principal, event
        # source and source IP alike.
        return _ct_row(
            principal=endpoint, lane="service",
            event_source=endpoint,
            event_name=action, identity_type="AWSService",
            source_ip=endpoint,
        )

    # Service lane: 25 lambda + 20 ec2 — keeps the whole-pile event-source
    # cliff weak so the slot dashes.
    pile: list[dict] = [
        service_event("lambda.amazonaws.com", "Invoke") for _ in range(25)
    ]
    pile += [
        service_event("ec2.amazonaws.com", "StartInstances") for _ in range(20)
    ]

    # Interactive lane: 5 events split 3/2 across 2 principals, 5 distinct
    # IPs (one each — so the source-ip cliff is flat).
    interactive_plan = (
        ("alice", "iam.amazonaws.com", "ListUsers",
         ("203.0.113.10", "203.0.113.11", "203.0.113.12")),
        ("bob", "sts.amazonaws.com", "GetCallerIdentity",
         ("203.0.113.20", "203.0.113.21")),
    )
    for user, service, action, addresses in interactive_plan:
        pile.extend(
            _ct_row(
                principal=f"arn:aws:iam::111111111111:user/{user}",
                event_source=service, event_name=action,
                source_ip=address,
            )
            for address in addresses
        )
    return pile
|
|
653
|
+
|
|
654
|
+
|
|
655
|
+
def _build_attack_rows() -> list[dict]:
    """Return the canonical attack-shaped pile.

    80 events: 76 from one dominant principal / source IP spread over three
    regions with 18 errored rows (~22%), plus 4 single-event background
    actors. Sized so principal-vol / source-ip salience (cliff ratio 76) is
    intended to lead error-rate (salience 22) and event-source (cliff ratio
    12): the expected top-3 ledes are principal-vol, source-ip, error-rate.
    event-source's cliff still fires (cells not None) but its lede drops out
    of the top-3 cutoff; the slot table row still shows it.
    """
    dominant_principal = "arn:aws:iam::111111111111:role/AdminRole"
    dominant_ip = "203.0.113.99"

    # 60 of the 76 dominant-role events go through IAM; the remaining 16
    # spread across four other services so event-source still clears
    # POPULATION_FLOOR with a meaningful but secondary cliff.
    service_plan = ["iam.amazonaws.com"] * 60
    for minor_service in ("ec2.amazonaws.com", "s3.amazonaws.com",
                          "sts.amazonaws.com", "kms.amazonaws.com"):
        service_plan += [minor_service] * 4
    region_plan = ["us-east-1"] * 38 + ["eu-west-1"] * 28 + ["us-west-2"] * 10
    # 16 AccessDenied + 2 ValidationException + 58 clean → ~22.5% error rate.
    error_plan = (
        ["AccessDenied"] * 16 + ["ValidationException"] * 2 + [None] * 58
    )

    # The three plans are all 76 entries long, so zipping them walks exactly
    # the 76 dominant-role events.
    pile: list[dict] = [
        _ct_row(
            principal=dominant_principal,
            event_source=service,
            event_name="CreateUser" if (idx % 3) == 0 else "ListUsers",
            source_ip=dominant_ip,
            aws_region=region,
            error_code=error_code,
            # Per-row ts offsets give the timeline a non-zero span. Real
            # CloudTrail events have varying eventTime values; without the
            # offset, run_digest's confidence floor (zero-span guard) fires.
            ts=_BASE_TS + idx,
        )
        for idx, (service, region, error_code) in enumerate(
            zip(service_plan, region_plan, error_plan)
        )
    ]

    # 4 background events from 4 distinct (principal, IP, service) tuples —
    # just enough to clear POPULATION_FLOOR on each cliff.
    background = (
        ("arn:aws:iam::111111111111:role/BuildBot",
         "ec2.amazonaws.com", "203.0.113.10"),
        ("arn:aws:iam::111111111111:user/alice",
         "s3.amazonaws.com", "203.0.113.11"),
        ("arn:aws:iam::111111111111:user/bob",
         "sts.amazonaws.com", "203.0.113.12"),
        ("arn:aws:iam::111111111111:user/carol",
         "kms.amazonaws.com", "203.0.113.13"),
    )
    for position, (actor, service, address) in enumerate(background):
        pile.append(_ct_row(
            principal=actor,
            event_source=service,
            event_name="DescribeFoo",
            source_ip=address,
            aws_region="us-east-1",
            ts=_BASE_TS + 76 + position,
        ))
    return pile
|
|
714
|
+
|
|
715
|
+
|
|
716
|
+
def test_sleepy_card_is_quiet_with_zero_ledes() -> None:
    """The sleepy pile renders a quiet card: no gating slot, no lede.

    Checks, in order: every gating slot dashes (``cells is None``), both
    dist slots still speak, ``insights`` is empty, and the rendered card
    carries the schema label without header-rule or footer text.
    """
    body = ct_digest.summarize(_ct_df(_build_sleepy_rows()))
    # Build the slots once instead of re-running the whole rows → frame →
    # slots pipeline for every lookup.
    # NOTE(review): assumes _compute_slots is deterministic and returns a
    # re-iterable collection — confirm against its definition.
    slots = _compute_slots(_ct_df(_build_sleepy_rows()))

    # Every gating slot dashes.
    for label in ("principal-vol", "event-source", "source-ip", "error-rate"):
        assert _slot_by_label(slots, label).cells is None, (
            f"sleepy pile: {label} unexpectedly fired"
        )
    # Both dist slots speak.
    assert _slot_by_label(slots, "lane-split").cells == [
        "interactive 10% / service 90%",
    ]
    assert _slot_by_label(slots, "region").cells == [
        "us-east-1 100%",
    ]
    # No gating slot → no insight.
    assert body["insights"] == []
    # Card renders without absent-footer machinery (no slot is ABSENT
    # under the flat grammar — non-speaking just vanishes from fields).
    text = _render(_card_from_body(body))
    assert "cloudtrail ·" in text  # identity-line-3 schema label
    assert "N.B." not in text
    assert "── digest" not in text  # header rule is gone
|
|
738
|
+
|
|
739
|
+
|
|
740
|
+
def test_attack_card_fires_multiple_ledes() -> None:
    """The attack pile fires every gating slot and names the culprits.

    Checks, in order: all four gating slots speak, lane-split renders
    100/0, region renders three entries with us-east-1 first, the ledes name
    AdminRole, the dominant IP and AccessDenied, and the rendered card
    surfaces the dominant IP.
    """
    body = ct_digest.summarize(_ct_df(_build_attack_rows()))
    # Build the slots once instead of re-running the whole rows → frame →
    # slots pipeline for every lookup.
    # NOTE(review): assumes _compute_slots is deterministic and returns a
    # re-iterable collection — confirm against its definition.
    slots = _compute_slots(_ct_df(_build_attack_rows()))

    # All four gating slots fire.
    for label in ("principal-vol", "event-source", "source-ip", "error-rate"):
        assert _slot_by_label(slots, label).cells is not None, (
            f"attack pile: {label} failed to fire"
        )
    # lane-split renders 100/0.
    assert _slot_by_label(slots, "lane-split").cells == [
        "interactive 100% / service 0%",
    ]
    # region renders top-3 with the dominant region first.
    region_cells = _slot_by_label(slots, "region").cells
    # Fail with a readable assertion rather than a TypeError on indexing.
    assert region_cells is not None, "attack pile: region failed to fire"
    region_cell = region_cells[0]
    assert region_cell.startswith("us-east-1 ")
    assert region_cell.count("·") == 2

    # AdminRole / dominant IP / top error code all named in some lede.
    insights = body["insights"]
    assert any("AdminRole" in lede for lede in insights)
    src_ip_lede = next(
        (lede for lede in insights if "203.0.113.99" in lede), None
    )
    assert src_ip_lede is not None
    # Source-ip lede has the new share contract — no ratio-against-next clause.
    assert "x the next" not in src_ip_lede
    assert "more than" not in src_ip_lede
    assert any("AccessDenied" in lede for lede in insights)

    # Card renders — flat grammar, no header rule.
    text = _render(_card_from_body(body))
    assert "cloudtrail ·" in text
    assert "203.0.113.99" in text  # insight surfaces the dominant IP
|
|
769
|
+
|
|
770
|
+
|
|
771
|
+
# ─── CLI dispatch ───────────────────────────────────────────────────────────
|
|
772
|
+
|
|
773
|
+
def _spy_run_digest(monkeypatch) -> dict:
    """Swap ``runner.run_digest`` for a recorder and return its record.

    The replacement accepts keyword arguments only and merges them into the
    returned dict, so a later call overwrites keys set by an earlier one.
    The real ``run_digest`` is not invoked while the patch is active.
    """
    seen_kwargs: dict[str, Any] = {}

    def _record(**call_kwargs) -> None:
        seen_kwargs.update(call_kwargs)

    monkeypatch.setattr(runner, "run_digest", _record)
    return seen_kwargs
|
|
781
|
+
|
|
782
|
+
|
|
783
|
+
def _stub_config(monkeypatch, cfg_dict: dict) -> None:
    """Patch ``cli.cfg.load`` to return ``cfg_dict`` for any path argument."""
    def _load(_path):
        # The requested path is ignored; the canned config is always served.
        return cfg_dict

    monkeypatch.setattr(cli.cfg, "load", _load)
|
|
785
|
+
|
|
786
|
+
|
|
787
|
+
# One minimal CloudTrail-shaped event serialised as a single
# newline-terminated NDJSON line; written to disk by _write_ct_sniff_file.
_CT_NDJSON_LINE = (
    '{"eventVersion": "1.08",'
    ' "eventTime": "2026-06-01T12:00:00Z",'
    ' "userIdentity": {"type": "IAMUser"},'
    ' "eventName": "GetObject",'
    ' "eventSource": "s3.amazonaws.com",'
    ' "sourceIPAddress": "192.0.2.10"}'
    "\n"
)
|
|
792
|
+
|
|
793
|
+
|
|
794
|
+
def _write_ct_sniff_file(tmp_path: Path) -> Path:
    """Write the one-line CloudTrail NDJSON fixture under ``tmp_path``.

    Returns the path of the created ``cloudtrail.json.log`` file.
    """
    target = tmp_path.joinpath("cloudtrail.json.log")
    target.write_text(_CT_NDJSON_LINE, encoding="utf-8")
    return target
|
|
798
|
+
|
|
799
|
+
|
|
800
|
+
def test_cli_digest_cloudtrail_file_sniffs_and_routes_to_cloudtrail_dir(
    tmp_path, monkeypatch,
) -> None:
    """A positional CloudTrail file is routed to ``cloudtrail_dir``.

    ``run_digest`` is replaced by a recorder, so only the keyword arguments
    the CLI hands over are checked.
    """
    recorded = _spy_run_digest(monkeypatch)
    _stub_config(monkeypatch, {"loghunter": {}})
    sniff_file = _write_ct_sniff_file(tmp_path)

    cli._main(["digest", str(sniff_file)])

    assert recorded.get("schema") == "cloudtrail"
    assert recorded.get("cloudtrail_dir") == str(sniff_file)
    # No other source directory may be populated for a CloudTrail digest.
    for other_dir in ("zeek_dir", "pihole_dir", "syslog_dir"):
        assert recorded.get(other_dir) is None
|
|
812
|
+
|
|
813
|
+
|
|
814
|
+
def test_cli_digest_cloudtrail_bare_falls_back_to_conn_default(tmp_path, monkeypatch) -> None:
    """A bare ``digest`` resolves to schema=conn even with cloudtrail_dir set.

    A configured cloudtrail_dir alone cannot drive a bare digest — the
    documented consequence of removing the schema token. Users wanting a
    cloudtrail digest pass a CloudTrail file as the positional argument.
    """
    configured_dir = tmp_path.joinpath("ct")
    configured_dir.mkdir()
    recorded = _spy_run_digest(monkeypatch)
    _stub_config(
        monkeypatch,
        {"loghunter": {"cloudtrail_dir": str(configured_dir)}},
    )

    cli._main(["digest"])

    assert recorded.get("schema") == "conn"
    assert recorded.get("cloudtrail_dir") is None
|
|
828
|
+
|
|
829
|
+
|
|
830
|
+
def test_cli_digest_cloudtrail_file_with_since_flag(tmp_path, monkeypatch) -> None:
    """``--since`` is forwarded alongside the CloudTrail file routing."""
    recorded = _spy_run_digest(monkeypatch)
    _stub_config(monkeypatch, {"loghunter": {}})
    sniff_file = _write_ct_sniff_file(tmp_path)

    argv = ["digest", str(sniff_file), "--since=7d"]
    cli._main(argv)

    assert recorded.get("schema") == "cloudtrail"
    assert recorded.get("cloudtrail_dir") == str(sniff_file)
    # Only presence is pinned here, not the parsed value.
    assert recorded.get("since") is not None
|
|
838
|
+
|
|
839
|
+
|
|
840
|
+
# ─── Runner-level dispatch ──────────────────────────────────────────────────
|
|
841
|
+
|
|
842
|
+
def test_run_digest_rejects_zeek_dir_at_programmatic_boundary(tmp_path) -> None:
    """``run_digest`` raises ValueError when zeek_dir accompanies cloudtrail."""
    call_kwargs: dict[str, Any] = {
        "config": {"loghunter": {}},
        "schema": "cloudtrail",
        "cloudtrail_dir": tmp_path,
        "zeek_dir": tmp_path / "zeek",
    }
    expected_message = "zeek_dir is not valid for the cloudtrail schema"
    with pytest.raises(ValueError, match=expected_message):
        runner.run_digest(**call_kwargs)
|
|
851
|
+
|
|
852
|
+
|
|
853
|
+
def test_run_digest_rejects_pihole_dir_at_programmatic_boundary(tmp_path) -> None:
    """``run_digest`` raises ValueError when pihole_dir accompanies cloudtrail."""
    call_kwargs: dict[str, Any] = {
        "config": {"loghunter": {}},
        "schema": "cloudtrail",
        "cloudtrail_dir": tmp_path,
        "pihole_dir": tmp_path / "pihole",
    }
    expected_message = "pihole_dir is not valid for the cloudtrail schema"
    with pytest.raises(ValueError, match=expected_message):
        runner.run_digest(**call_kwargs)
|
|
862
|
+
|
|
863
|
+
|
|
864
|
+
def test_run_digest_rejects_syslog_dir_at_programmatic_boundary(tmp_path) -> None:
    """``run_digest`` raises ValueError when syslog_dir accompanies cloudtrail."""
    call_kwargs: dict[str, Any] = {
        "config": {"loghunter": {}},
        "schema": "cloudtrail",
        "cloudtrail_dir": tmp_path,
        "syslog_dir": tmp_path / "syslog",
    }
    expected_message = "syslog_dir is not valid for the cloudtrail schema"
    with pytest.raises(ValueError, match=expected_message):
        runner.run_digest(**call_kwargs)
|
|
873
|
+
|
|
874
|
+
|
|
875
|
+
def test_run_digest_rejects_missing_cloudtrail_dir(tmp_path) -> None:
    """``run_digest`` raises ValueError when no cloudtrail_dir is available.

    Neither the call nor the (empty) config supplies one.
    """
    empty_config: dict[str, Any] = {"loghunter": {}}
    expected_message = "cloudtrail_dir not configured"
    with pytest.raises(ValueError, match=expected_message):
        runner.run_digest(config=empty_config, schema="cloudtrail")
|
|
879
|
+
|
|
880
|
+
|
|
881
|
+
# ─── End-to-end via run_digest ──────────────────────────────────────────────
|
|
882
|
+
|
|
883
|
+
def _row_to_wire_event(row: dict) -> dict:
    """Render a canonical row dict back to a CloudTrail wire event.

    The loader/parser pipeline reads wire JSON (eventTime / userIdentity /
    eventSource / …) and produces canonical rows. Going back the other way
    for synthetic test files keeps the end-to-end path realistic without
    having to maintain a parallel JSON fixture file.

    Args:
        row: Canonical row. Must carry ``identity_type``, ``ts`` (POSIX
            seconds), ``event_source``, ``event_name``, ``source_ip``,
            ``aws_region``, ``event_id`` and ``error_code``. ``principal``
            is read only for the AWSService, AssumedRole and IAMUser
            identity types.

    Returns:
        A wire-format event dict. ``eventTime`` is rendered in UTC as
        ``YYYY-MM-DDTHH:MM:SSZ``; ``errorCode`` is present only when the
        row's ``error_code`` is not None.

    Raises:
        KeyError: If a key the chosen identity type needs is missing.
    """
    identity_type = row["identity_type"]
    identity: dict[str, Any] = {"type": identity_type}
    # Map the row's principal back to whichever userIdentity field the
    # parser's derivation rule uses, so the parser's principal matches.
    if identity_type == "AWSService":
        identity["invokedBy"] = row["principal"]
    elif identity_type == "AssumedRole":
        identity["sessionContext"] = {
            "sessionIssuer": {"userName": row["principal"]},
        }
    elif identity_type == "IAMUser":
        # Use the arn so the parser's IAMUser path picks up the last
        # slash-segment as principal — matches our placeholder shape.
        principal = row["principal"]
        identity["arn"] = principal
        # rsplit returns the whole string when there is no "/", so no
        # separate no-slash branch is needed.
        identity["userName"] = principal.rsplit("/", 1)[-1]
    # Root and any other identity type carry only the "type" field, which
    # is already set above.

    event_time = datetime.fromtimestamp(row["ts"], tz=timezone.utc)
    event: dict[str, Any] = {
        "eventTime": event_time.strftime("%Y-%m-%dT%H:%M:%SZ"),
        "userIdentity": identity,
        "eventSource": row["event_source"],
        "eventName": row["event_name"],
        "sourceIPAddress": row["source_ip"],
        "awsRegion": row["aws_region"],
        "eventID": row["event_id"],
    }
    error_code = row["error_code"]
    # Only None means "no error"; an empty string is still emitted.
    if error_code is not None:
        event["errorCode"] = error_code
    return event
|
|
922
|
+
|
|
923
|
+
|
|
924
|
+
def _write_ndjson(path: Path, rows: list[dict]) -> None:
    """Write ``rows`` to ``path`` as newline-delimited CloudTrail wire JSON.

    Missing parent directories are created. Each row is converted with
    ``_row_to_wire_event`` and serialised on its own line; an existing file
    at ``path`` is overwritten.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(_row_to_wire_event(record)) + "\n" for record in rows
        )
|
|
930
|
+
|
|
931
|
+
|
|
932
|
+
def test_run_digest_cloudtrail_end_to_end_renders_a_card(tmp_path, capsys) -> None:
    """Full path: synthetic NDJSON file → run_digest → rendered card.

    Asserted on captured stdout under the flat grammar: the identity-line
    schema label, both dist slots (lane-split, region) as fields, and the
    dominant IP of the attack pile; no header rule and no footer machinery.
    The flat grammar is also meant to keep promoted-insight slots out of the
    fields block — that is not asserted here.
    """
    pile_dir = tmp_path / "ct"
    _write_ndjson(pile_dir / "events.json.log", _build_attack_rows())

    runner.run_digest(
        config={"loghunter": {}}, schema="cloudtrail",
        cloudtrail_dir=pile_dir, load_all=True, skip_confirm=True,
    )
    rendered = capsys.readouterr().out

    must_appear = (
        "cloudtrail ·",
        # Dist slots always render in fields.
        "lane-split:",
        "region:",
        # Attack pile surfaces the dominant IP.
        "203.0.113.99",
    )
    for fragment in must_appear:
        assert fragment in rendered

    # No header rule, no footer machinery under the flat grammar.
    for fragment in ("── digest", "N.B.", "ABSENT"):
        assert fragment not in rendered
|
|
959
|
+
|
|
960
|
+
|
|
961
|
+
def test_run_digest_cloudtrail_end_to_end_sleepy_pile_is_quiet(tmp_path, capsys) -> None:
    """Sleepy pile end to end: only the two dist slots reach the output.

    Every gating slot is expected to dash (non-speaking), so none of their
    labels may appear in the fields; lane-split and region still render.
    """
    pile_dir = tmp_path / "ct"
    _write_ndjson(pile_dir / "events.json.log", _build_sleepy_rows())

    runner.run_digest(
        config={"loghunter": {}}, schema="cloudtrail",
        cloudtrail_dir=pile_dir, load_all=True, skip_confirm=True,
    )
    rendered = capsys.readouterr().out

    must_appear = (
        "cloudtrail ·",
        "interactive 10% / service 90%",
        "us-east-1 100%",
    )
    for fragment in must_appear:
        assert fragment in rendered

    # Non-speaking gating slots vanish — no label appears in the fields.
    for label in ("principal-vol:", "event-source:",
                  "source-ip:", "error-rate:"):
        assert label not in rendered
    assert "ABSENT" not in rendered
|