loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,504 @@
1
+ """Unit tests for the aws detector — per-principal CloudTrail behavioral surfacing.
2
+
3
+ All fixtures are synthetic per the privacy rail: RFC 5737 IPs only, AWS
4
+ documentation account 123456789012, obvious-placeholder principal / role names.
5
+
6
+ Each test states the property under test and exercises the smallest synthetic
7
+ frame that proves it.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from datetime import datetime, timezone
13
+ from types import SimpleNamespace
14
+
15
+ import pandas as pd
16
+
17
+ from loghunter.common.finding import DetectorContext, Severity
18
+ from loghunter.common.loader import _CLOUDTRAIL_COLUMNS
19
+ from loghunter.detectors.aws import (
20
+ DEFAULT_CONFIG,
21
+ _aggregate_per_principal,
22
+ _compute_bursts,
23
+ _compute_rarity,
24
+ _compute_weirdness,
25
+ below_floor_count,
26
+ run,
27
+ )
28
+
29
+
30
# AWS documentation placeholder account id (synthetic, per the privacy rail).
_DOCS_ACCT = "123456789012"

# (start, end) of the data window handed to the detector context: 1 June to
# 8 June 2026, both timezone-aware UTC.
_WINDOW = tuple(
    datetime(2026, 6, day, tzinfo=timezone.utc) for day in (1, 8)
)

# Default event timestamp (epoch seconds): noon UTC on the window's first day.
_BASE_TS = datetime(2026, 6, 1, hour=12, tzinfo=timezone.utc).timestamp()
36
+
37
+
38
+ # ── Fixture helpers ──────────────────────────────────────────────────────────
39
+
40
def _event(**overrides) -> dict:
    """Return one minimal canonical CloudTrail per-event row (12 fields).

    Every keyword argument replaces the default value of the field with the
    same name; unknown keys are appended after the defaults.
    """
    defaults: dict = dict(
        ts=_BASE_TS,
        principal="placeholder-user",
        lane="interactive",
        read_write="read",
        event_source="s3.amazonaws.com",
        event_name="GetObject",
        identity_type="IAMUser",
        source_ip="192.0.2.10",
        error_code=None,
        aws_region="us-east-1",
        event_id="11111111-1111-1111-1111-111111111111",
        raw={},
    )
    return {**defaults, **overrides}
58
+
59
+
60
def _df(events: list[dict]) -> pd.DataFrame:
    """Build a frame with the same 12 columns as parsers/cloudtrail.py's output.

    Each entry of ``events`` is a dict of field overrides passed to
    ``_event``. A falsy ``events`` yields an empty frame that still carries
    the canonical columns.
    """
    if events:
        filled = [_event(**spec) for spec in events]
        return pd.DataFrame(filled, columns=_CLOUDTRAIL_COLUMNS)
    return pd.DataFrame(columns=_CLOUDTRAIL_COLUMNS)
66
+
67
+
68
def _ctx(df: pd.DataFrame, **kwargs) -> DetectorContext:
    """Build the DetectorContext used to drive run() in tests.

    Recognised keyword arguments:

    * ``config``  -- detector config overrides (defaults to an empty dict).
    * ``verbose`` -- accepted and discarded. There is no verbose kwarg under
      W6 (the result set is verbosity-invariant); dropping it silently keeps
      legacy call sites quiet during the migration.

    Raises:
        TypeError: if any other keyword is passed, so a misspelled option
            (e.g. ``confg=``) fails loudly instead of silently running the
            detector with default settings.
    """
    cfg = kwargs.pop("config", {})
    kwargs.pop("verbose", None)
    if kwargs:
        raise TypeError(
            f"_ctx() got unexpected keyword argument(s): {sorted(kwargs)}"
        )
    return DetectorContext(
        logs={"*.json*": df},
        config=cfg,
        # Pass-through allowlist: filter_df returns the frame untouched.
        allowlist=SimpleNamespace(filter_df=lambda d, name: d),
        data_window=_WINDOW,
        data_sources=["cloudtrail_raw"],
    )
84
+
85
+
86
+ # ── Aggregation: principal collapses across sessions ─────────────────────────
87
+
88
def test_aggregate_per_principal_collapses_sessions_of_same_role() -> None:
    """Events sharing one parser-derived principal aggregate into a single row.

    The parser's principal key already collapses an AssumedRole's sessions;
    the detector groups by that key, so twenty events for the same role must
    come back as one row with event_count == 20.
    """
    role = "role:placeholder-role"
    frame = _df([_event(principal=role, event_id=f"e{n}") for n in range(20)])

    grouped = _aggregate_per_principal(frame)

    assert len(grouped) == 1
    only_row = grouped.iloc[0]
    assert only_row["principal"] == role
    assert only_row["event_count"] == 20
100
+
101
+
102
def test_aggregate_features_match_known_distribution() -> None:
    """Spot-check aggregate features against a small hand-built event mix."""
    # 5 GetObject (read, success), 5 PutObject (write, 1 errored),
    # all one IP, one region.
    reads = [_event(event_name="GetObject", read_write="read") for _ in range(5)]
    clean_writes = [
        _event(event_name="PutObject", read_write="write") for _ in range(4)
    ]
    denied_write = [
        _event(event_name="PutObject", read_write="write", error_code="AccessDenied")
    ]

    per = _aggregate_per_principal(_df(reads + clean_writes + denied_write))

    assert len(per) == 1
    row = per.iloc[0]
    assert row["event_count"] == 10
    assert abs(row["error_rate"] - 0.1) < 1e-9
    assert row["distinct_event_name"] == 2  # GetObject, PutObject
    assert row["distinct_source_ip"] == 1
    assert row["distinct_event_source"] == 1
    assert abs(row["read_ratio"] - 0.5) < 1e-9
121
+
122
+
123
+ # ── Lane split: service principals are excluded ──────────────────────────────
124
+
125
def test_lane_split_service_principals_yield_no_findings() -> None:
    """run() returns [] for a frame made only of service-lane events."""
    service_only = _df([
        _event(lane="service", principal="ec2.amazonaws.com", event_name=f"Action{n}")
        for n in range(100)
    ])
    assert run(_ctx(service_only, config={"min_events": 10})) == []
134
+
135
+
136
def test_lane_split_service_events_excluded_from_aggregation() -> None:
    """In a mixed frame only the interactive principal survives aggregation."""
    from loghunter.detectors.aws import _filter_interactive

    interactive = [_event(principal="alice") for _ in range(5)]
    service = [
        _event(principal="ec2.amazonaws.com", lane="service") for _ in range(50)
    ]
    frame = _df(interactive + service)

    per = _aggregate_per_principal(_filter_interactive(frame))

    assert list(per["principal"]) == ["alice"]
147
+
148
+
149
+ # ── Signal 1: rarity ─────────────────────────────────────────────────────────
150
+
151
def test_rarity_log10_n_over_count() -> None:
    """Rarity of each action is log10(N / count).

    Uses 100 events split 70/20/10 across three actions.
    """
    import math

    counts = {"GetObject": 70, "ListBuckets": 20, "DeleteBucket": 10}
    events = [
        _event(event_name=action)
        for action, how_many in counts.items()
        for _ in range(how_many)
    ]

    rarity = _compute_rarity(_df(events))

    for action, how_many in counts.items():
        assert abs(rarity[action] - math.log10(100 / how_many)) < 1e-9
164
+
165
+
166
def test_rarity_empty_frame_returns_empty_dict() -> None:
    """An empty frame yields an empty rarity mapping."""
    empty = _df([])
    assert _compute_rarity(empty) == {}
168
+
169
+
170
+ # ── Signal 2: weirdness composite ────────────────────────────────────────────
171
+
172
def test_weirdness_composite_ranks_outlier_first() -> None:
    """The one principal that stands out must rank first by composite_z.

    Four bland principals share an identical footprint; 'outlier' has a high
    error rate and many distinct source IPs (and action names).
    """
    bland = [
        _event(principal=who, source_ip="192.0.2.10",
               event_name="GetObject", error_code=None)
        for who in ("alice", "bob", "carol", "dave")
        for _ in range(60)
    ]
    noisy = [
        _event(
            principal="outlier",
            source_ip=f"198.51.100.{n % 30}",
            event_name=f"Action{n % 20}",
            # Every even-indexed event is denied.
            error_code=None if n % 2 else "AccessDenied",
        )
        for n in range(60)
    ]
    config = {
        "min_events": 50,
        "composite_medium_threshold": 1.5,
        "composite_low_threshold": 0.5,
    }

    findings = run(_ctx(_df(bland + noisy), config=config))

    ranked = [f for f in findings if f.evidence.get("tier") == "ranked"]
    assert ranked, "expected at least one ranked finding"
    assert ranked[0].evidence["principal"] == "outlier"
198
+
199
+
200
def test_weirdness_composite_degenerate_population_yields_zero_z() -> None:
    """A lone scorable principal yields the synthetic summary, not a ranking.

    With one principal std == 0 for every feature, so all z-scores and the
    composite collapse to 0 and the ranked_summary finding is emitted instead
    of a per-principal finding.
    """
    frame = _df([_event(principal="only-one") for _ in range(60)])

    findings = run(_ctx(frame, config={"min_events": 50}))

    by_tier = {"ranked": [], "ranked_summary": []}
    for finding in findings:
        tier = finding.evidence.get("tier")
        if tier in by_tier:
            by_tier[tier].append(finding)

    assert by_tier["ranked"] == []
    assert len(by_tier["ranked_summary"]) == 1
    only_summary = by_tier["ranked_summary"][0]
    assert only_summary.severity == Severity.INFO
    assert only_summary.evidence["scorable_count"] == 1
    assert only_summary.evidence["top_composite_z"] == 0.0
214
+
215
+
216
+ # ── Signal 3: burst aggregation ──────────────────────────────────────────────
217
+
218
def _enum_sweep(principal: str, n_firsts: int, gap: float, start_ts: float,
                error_rate: float = 0.0, n_services: int = 1) -> list[dict]:
    """Build an enumeration-sweep event sequence for ``principal``.

    The sequence is:
      1. one seed event at ``start_ts`` (so the principal isn't all-new);
      2. ``n_firsts`` events with distinct event_names, spaced ``gap`` seconds
         apart after the seed.

    Event sources cycle through ``n_services`` synthetic services, and the
    leading ``error_rate`` fraction of the sweep carries AccessDenied.
    """
    seed = _event(principal=principal, ts=start_ts, event_name="SeedAction")
    sweep = [
        _event(
            principal=principal,
            ts=start_ts + (step + 1) * gap,
            event_name=f"NewAction{step:03d}",
            event_source=f"svc{step % n_services}.amazonaws.com",
            error_code="AccessDenied" if step / n_firsts < error_rate else None,
        )
        for step in range(n_firsts)
    ]
    return [seed, *sweep]
234
+
235
+
236
def test_burst_collapses_enumeration_sweep_to_one_finding() -> None:
    """N first-seen actions within burst_gap_seconds become ONE burst Finding."""
    config = {
        "min_events": 1000,  # nobody scorable; only burst tier exposed
        "burst_gap_seconds": 300,
        "burst_min_firsts": 3,
    }
    sweep = _df(_enum_sweep("attacker", n_firsts=10, gap=30.0, start_ts=_BASE_TS))

    findings = run(_ctx(sweep, config=config))

    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
    assert len(bursts) == 1
    assert bursts[0].evidence["new_action_count"] == 10
248
+
249
+
250
def test_burst_negative_gap_too_wide_produces_no_finding() -> None:
    """First-seen actions spaced wider than burst_gap_seconds yield no burst."""
    # With a 600s gap and burst_gap_seconds=300, every first-seen event opens
    # a fresh singleton burst that never reaches burst_min_firsts.
    config = {
        "min_events": 1000,
        "burst_gap_seconds": 300,
        "burst_min_firsts": 3,
    }
    sweep = _df(_enum_sweep("explorer", n_firsts=10, gap=600.0, start_ts=_BASE_TS))

    findings = run(_ctx(sweep, config=config))

    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
    assert bursts == []
262
+
263
+
264
def test_burst_negative_too_few_firsts() -> None:
    """A sweep with fewer than burst_min_firsts first-seen actions is no burst."""
    config = {
        "min_events": 1000,
        "burst_gap_seconds": 300,
        "burst_min_firsts": 3,
    }
    sweep = _df(_enum_sweep("explorer", n_firsts=2, gap=30.0, start_ts=_BASE_TS))

    findings = run(_ctx(sweep, config=config))

    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
    assert bursts == []
274
+
275
+
276
def test_burst_skips_principal_very_first_event() -> None:
    """A principal's very first event must NOT count as first-seen.

    All-new is uninformative; it is handled by the seed step in
    _compute_bursts.
    """
    # The whole footprint is five events with distinct names and NO explicit
    # seed: the first event seeds, the remaining four are first-seen.
    footprint = _df([
        _event(principal="alpha", ts=_BASE_TS + n * 30.0, event_name=f"Action{n:03d}")
        for n in range(5)
    ])
    config = {
        "min_events": 1000,
        "burst_gap_seconds": 300,
        "burst_min_firsts": 3,
    }

    findings = run(_ctx(footprint, config=config))

    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
    # 5 events, first is seed, 4 are first-seen -> burst of 4 (>= burst_min_firsts=3)
    assert len(bursts) == 1
    assert bursts[0].evidence["new_action_count"] == 4
295
+
296
+
297
+ # ── Severity gates ────────────────────────────────────────────────────────────
298
+
299
def test_burst_default_severity_is_medium_on_clean_burst() -> None:
    """A bare large burst with no errors and one service is MEDIUM."""
    events = _enum_sweep("attacker", n_firsts=10, gap=30.0, start_ts=_BASE_TS,
                         error_rate=0.0, n_services=1)
    findings = run(_ctx(_df(events), config={"min_events": 1000}))
    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
    # Guard the index so a missing burst reads as a test failure, not an
    # IndexError.
    assert bursts, "expected at least one burst finding"
    assert bursts[0].severity == Severity.MEDIUM
306
+
307
+
308
def test_burst_escalates_to_high_on_error_rate() -> None:
    """burst error_rate >= burst_high_error_rate -> HIGH."""
    events = _enum_sweep("attacker", n_firsts=10, gap=30.0, start_ts=_BASE_TS,
                         error_rate=1.0, n_services=1)
    findings = run(_ctx(_df(events), config={
        "min_events": 1000,
        "burst_high_error_rate": 0.5,
        "burst_high_service_count": 10,  # disable the service gate
    }))
    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
    # Guard the index so a missing burst reads as a test failure, not an
    # IndexError.
    assert bursts, "expected at least one burst finding"
    assert bursts[0].severity == Severity.HIGH
319
+
320
+
321
def test_burst_escalates_to_high_on_service_spread() -> None:
    """new_service_count >= burst_high_service_count -> HIGH."""
    events = _enum_sweep("attacker", n_firsts=10, gap=30.0, start_ts=_BASE_TS,
                         error_rate=0.0, n_services=5)
    findings = run(_ctx(_df(events), config={
        "min_events": 1000,
        "burst_high_error_rate": 1.5,  # disable the error gate
        "burst_high_service_count": 3,
    }))
    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
    # Guard the index so a missing burst reads as a test failure, not an
    # IndexError.
    assert bursts, "expected at least one burst finding"
    assert bursts[0].severity == Severity.HIGH
332
+
333
+
334
def test_burst_never_auto_high_on_size_alone() -> None:
    """Even a very large clean burst stays MEDIUM — size alone never escalates."""
    events = _enum_sweep("walker", n_firsts=100, gap=10.0, start_ts=_BASE_TS,
                         error_rate=0.0, n_services=1)
    findings = run(_ctx(_df(events), config={
        "min_events": 1000,
        "burst_high_error_rate": 0.5,
        "burst_high_service_count": 3,
    }))
    bursts = [f for f in findings if f.evidence.get("tier") == "burst"]
    # Guard the index so a missing burst reads as a test failure, not an
    # IndexError.
    assert bursts, "expected at least one burst finding"
    assert bursts[0].severity == Severity.MEDIUM
345
+
346
+
347
+ # ── Two clean-corpus cases — Glenn's watch item ──────────────────────────────
348
+
349
def test_clean_corpus_below_floor_emits_no_ranked_findings() -> None:
    """No ranked tier at all when every principal is below min_events.

    The runner's RunSummary note is what discloses this case, not a detector
    Finding.
    """
    # 3 principals, each with 5 events (default min_events=50).
    events = [
        _event(principal=who, event_name="GetObject")
        for who in ("alice", "bob", "carol")
        for _ in range(5)
    ]
    df = _df(events)

    tiers_seen = [f.evidence.get("tier") for f in run(_ctx(df))]

    assert "ranked" not in tiers_seen
    assert "ranked_summary" not in tiers_seen  # no synthetic summary either
    assert below_floor_count(df, DEFAULT_CONFIG["min_events"]) == 3
364
+
365
+
366
def test_clean_corpus_scorable_but_below_low_band_emits_one_summary() -> None:
    """Scorable principals that all miss the LOW band yield ONE INFO summary.

    The synthetic ranked_summary finding is emitted once, not per principal.
    """
    # 4 principals with an identical footprint — z-scores collapse to 0 < LOW.
    events = [
        _event(principal=who)
        for who in ("alice", "bob", "carol", "dave")
        for _ in range(60)
    ]

    findings = run(_ctx(_df(events), config={"min_events": 50}))

    ranked = [f for f in findings if f.evidence.get("tier") == "ranked"]
    summaries = [f for f in findings if f.evidence.get("tier") == "ranked_summary"]
    assert ranked == []
    assert len(summaries) == 1
    assert summaries[0].severity == Severity.INFO
380
+
381
+
382
def test_clean_corpus_summary_evidence_carries_scorable_count_and_top() -> None:
    """The synthetic summary surfaces scorable_count and top_principal (the
    least-unremarkable actor) as analyst pivot — not just an empty 'quiet' line."""
    events: list[dict] = []
    for name in ["alice", "bob"]:
        events.extend(_event(principal=name) for _ in range(60))
    df = _df(events)
    findings = run(_ctx(df, config={"min_events": 50}))
    summaries = [f for f in findings if f.evidence.get("tier") == "ranked_summary"]
    # Guard the index so a missing summary reads as a test failure, not an
    # IndexError.
    assert summaries, "expected a ranked_summary finding"
    summary = summaries[0]
    assert summary.evidence["scorable_count"] == 2
    assert summary.evidence["top_principal"] in {"alice", "bob"}
    assert "top_composite_z" in summary.evidence
394
+
395
+
396
+ # ── below_floor_count helper ─────────────────────────────────────────────────
397
+
398
def test_below_floor_count_pure_helper_counts_correctly() -> None:
    """Only principals strictly under the floor are counted."""
    # 2 below-floor principals (5 events each)
    below = [
        _event(principal=who) for who in ("alice", "bob") for _ in range(5)
    ]
    # 1 at-or-above principal (50 events)
    at_floor = [_event(principal="carol") for _ in range(50)]

    assert below_floor_count(_df(below + at_floor), 50) == 2
407
+
408
+
409
def test_below_floor_count_none_returns_zero() -> None:
    """A missing frame (None) counts as zero below-floor principals."""
    count = below_floor_count(None, 50)
    assert count == 0
411
+
412
+
413
def test_below_floor_count_empty_returns_zero() -> None:
    """An empty frame counts as zero below-floor principals."""
    empty = _df([])
    assert below_floor_count(empty, 50) == 0
415
+
416
+
417
def test_below_floor_count_ignores_service_lane_principals() -> None:
    """Service-lane principals never contribute to the below-floor count.

    They aren't candidates for scoring, whatever their event count.
    """
    service_frame = _df([
        _event(principal="ec2.amazonaws.com", lane="service") for _ in range(5)
    ])
    assert below_floor_count(service_frame, 50) == 0
422
+
423
+
424
def test_below_floor_count_matches_detector_internal_count() -> None:
    """Same helper, same answer — analysis and disclosure never drift."""
    from loghunter.detectors.aws import _filter_interactive

    sparse = [
        _event(principal=who) for who in ("alice", "bob") for _ in range(5)
    ]
    busy = [_event(principal="carol") for _ in range(60)]
    df = _df(sparse + busy)

    n_via_helper = below_floor_count(df, 50)

    # The detector's own aggregation: interactive principals whose
    # event_count is < 50 in the per-principal frame.
    per = _aggregate_per_principal(_filter_interactive(df))
    under_floor = per["event_count"] < 50
    n_internal = int(under_floor.sum())

    assert n_via_helper == n_internal == 2
438
+
439
+
440
+ # ── Output ordering & defensive contracts ────────────────────────────────────
441
+
442
def test_burst_findings_precede_ranked_findings() -> None:
    """Two-tier ordering: every burst finding comes before any ranked one."""
    bland = [
        _event(principal=f"bland{who}",
               source_ip=f"192.0.2.{who}",
               event_name=f"Bland{action:02d}")
        for who in range(4)
        for action in range(60)
    ]
    events = _enum_sweep("attacker", n_firsts=5, gap=30.0, start_ts=_BASE_TS) + bland

    findings = run(_ctx(_df(events), config={"min_events": 50}))
    tiers = [f.evidence["tier"] for f in findings]

    # No "ranked" tier finding may appear before a "burst" tier finding.
    burst_positions = [pos for pos, tier in enumerate(tiers) if tier == "burst"]
    other_positions = [
        pos for pos, tier in enumerate(tiers)
        if tier in {"ranked", "ranked_summary"}
    ]
    last_burst_idx = max(burst_positions, default=-1)
    first_other_idx = min(other_positions, default=len(tiers))
    assert last_burst_idx < first_other_idx
461
+
462
+
463
def test_empty_frame_returns_empty_list() -> None:
    """run() on an empty CloudTrail frame produces no findings."""
    findings = run(_ctx(_df([])))
    assert findings == []
466
+
467
+
468
def test_absent_pattern_returns_empty_list() -> None:
    """run() returns [] without raising when context.logs lacks *.json*."""
    passthrough = SimpleNamespace(filter_df=lambda d, name: d)
    bare_context = DetectorContext(
        logs={},
        config={},
        allowlist=passthrough,
        data_window=_WINDOW,
        data_sources=[],
    )
    assert run(bare_context) == []
478
+
479
+
480
def test_low_band_findings_emitted_without_verbose() -> None:
    """LOW ranked findings are NOT gated on context.verbose.

    The analyst is asking for the detector by selecting it.
    """
    # One principal is a mild standout — composite ~ 1.0..1.5 range — so it
    # lands in LOW band with the default thresholds (1.0 -> LOW, 2.0 -> MEDIUM).
    baseline = [
        _event(principal=who, source_ip="192.0.2.10", event_name="GetObject")
        for who in ("alice", "bob", "carol", "dave")
        for _ in range(60)
    ]
    # mild outlier: 2 distinct event names instead of 1
    standout = [
        _event(
            principal="standout",
            source_ip="192.0.2.10",
            event_name="ListBuckets" if n % 2 else "GetObject",
        )
        for n in range(60)
    ]
    df = _df(baseline + standout)

    def ranked_total(findings) -> int:
        return sum(1 for f in findings if f.evidence.get("tier") == "ranked")

    findings_default = run(_ctx(df, config={"min_events": 50}))
    findings_verbose = run(_ctx(df, config={"min_events": 50}, verbose=True))

    # Whatever the severity is, the counts must match (no verbose gating).
    assert ranked_total(findings_default) == ranked_total(findings_verbose)
@@ -0,0 +1,106 @@
1
+ """Unit tests for the be_like_water target resolver.
2
+
3
+ Gated ladder, evaluated in order — a winning gate decides without falling
4
+ through:
5
+
6
+ Step 0 (gate): trailing slash -> DIRECTORY. No disk consult.
7
+ Step 1: exists and is_file() -> FILE.
8
+ Step 2: exists and is_dir() -> DIRECTORY.
9
+ Step 3: does not exist -> FILE (basename is the filename; parent
10
+ will be mkdir-p'd at write).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from pathlib import Path
16
+
17
+ import pytest
18
+
19
+ from loghunter.common.paths import ResolvedTarget, be_like_water
20
+
21
+
22
def test_trailing_slash_gate_wins_over_existing_file(tmp_path: Path) -> None:
    """Step 0: a trailing slash means DIRECTORY even if a file has that name.

    The gate runs before any disk read, so user intent (the trailing slash)
    wins over disk state.
    """
    existing = tmp_path / "X"
    existing.write_text("preexisting file content", encoding="utf-8")
    assert existing.is_file()  # confirm the file exists

    slashed = f"{existing}/"  # trailing slash forces directory verdict
    expected = ResolvedTarget(Path(slashed).expanduser(), is_file=False)

    assert be_like_water(slashed) == expected
32
+
33
+
34
def test_existing_file_resolves_to_file(tmp_path: Path) -> None:
    """Step 1: an existing file without a trailing slash -> FILE at that path."""
    log_file = tmp_path / "events.log"
    log_file.write_text("data", encoding="utf-8")

    verdict = be_like_water(str(log_file))

    assert verdict.is_file is True
    assert verdict.path == log_file
41
+
42
+
43
def test_existing_directory_resolves_to_directory(tmp_path: Path) -> None:
    """Step 2: an existing directory without a trailing slash -> DIRECTORY."""
    reports_dir = tmp_path / "reports"
    reports_dir.mkdir()

    verdict = be_like_water(str(reports_dir))

    assert verdict.is_file is False
    assert verdict.path == reports_dir
50
+
51
+
52
def test_not_exists_resolves_to_file(tmp_path: Path) -> None:
    """Step 3: a non-existent path -> FILE named by its last segment."""
    missing_leaf = tmp_path / "missing" / "leaf"
    assert not missing_leaf.exists()

    verdict = be_like_water(str(missing_leaf))

    assert verdict.is_file is True
    assert verdict.path == missing_leaf
    # Resolution must NOT create any directory — that's a write-time concern.
    assert not missing_leaf.parent.exists()
61
+
62
+
63
def test_trailing_slash_on_nonexistent_resolves_to_directory(tmp_path: Path) -> None:
    """Step 0 (gate): trailing slash on a non-existent path -> DIRECTORY."""
    nested = tmp_path / "a" / "b" / "c"
    assert not nested.exists()

    verdict = be_like_water(f"{nested}/")

    assert verdict.is_file is False
    # Path() normalizes trailing slashes, so the resolved path equals the
    # unsuffixed equivalent — the verdict is DIRECTORY all the same.
    assert verdict.path == nested
    # The resolver did not create anything.
    assert not nested.exists()
74
+
75
+
76
def test_tilde_reports_consequence(monkeypatch, tmp_path: Path) -> None:
    """Explicit consequence: only trailing slash, or an already-existing directory,
    yields directory behavior. `--out=~/reports` (no trailing slash, not exists)
    creates a FILE named "reports" (after mkdir -p of the parent at write time).
    This is the surprising-but-consistent behavior we lock down.
    """
    # Force ~ to expand to a tmp location so the test does not touch the real
    # home directory. POSIX reads HOME; on Windows expanduser reads
    # USERPROFILE, so both are patched.
    fake_home = str(tmp_path)
    monkeypatch.setenv("HOME", fake_home)
    monkeypatch.setenv("USERPROFILE", fake_home)
    result = be_like_water("~/reports")
    assert result.is_file is True  # NOT a directory verdict
    assert result.path == tmp_path / "reports"
87
+
88
+
89
def test_expanduser_applied_to_both_branches(monkeypatch, tmp_path: Path) -> None:
    """expanduser is applied for both the trailing-slash gate and the disk-conform paths."""
    # POSIX reads HOME; on Windows expanduser reads USERPROFILE, so both are
    # patched to keep the test away from the real home directory.
    fake_home = str(tmp_path)
    monkeypatch.setenv("HOME", fake_home)
    monkeypatch.setenv("USERPROFILE", fake_home)
    # Trailing slash:
    dir_result = be_like_water("~/foo/")
    assert dir_result.path == tmp_path / "foo"
    assert dir_result.is_file is False
    # No trailing slash, not exists:
    file_result = be_like_water("~/foo")
    assert file_result.path == tmp_path / "foo"
    assert file_result.is_file is True
100
+
101
+
102
def test_resolved_target_path_is_pathlib_path(tmp_path: Path) -> None:
    """ResolvedTarget.path is a Path, not a str — callers depend on it."""
    verdict = be_like_water(str(tmp_path))  # tmp_path exists, is dir

    for value, expected_type in ((verdict.path, Path), (verdict.is_file, bool)):
        assert isinstance(value, expected_type)