loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,671 @@
1
+ """AWS detector — per-principal behavioral surfacing from CloudTrail events.
2
+
3
+ Reads the canonical 12-column per-event frame produced by parsers/cloudtrail.py
4
+ (Thread A) and surfaces two tiers of Findings:
5
+
6
+ 1. **Burst sweeps** — per-principal first-seen actions clumped within a
7
+ sliding gap become one "enumeration sweep" Finding. The strongest primitive
8
+ we have, glanceable on a single line.
9
+ 2. **Ranked principals** — a model-free transparent z-score composite over
10
+ intuitive danger signals (error rate, distinct source IPs, distinct action
11
+ names, action entropy). Severity is by absolute composite bands, not rank
12
+ position; on a clean corpus the tier honestly reports nothing stood out
13
+ rather than manufacturing a verdict.
14
+
15
+ Architecture mirrors detectors/dns.py: front half does feature derivation, back
16
+ half assembles Findings at a single shared point. Service-lane events are
17
+ excluded from all three signals — AWS-run background activity is supposed to be
18
+ broad and repetitive; scoring it produces noise.
19
+
20
+ Model-free by design. pandas + numpy only — reaching for sklearn would betray
21
+ the transparency point (a humble user must be able to read why a principal was
22
+ surfaced).
23
+
24
+ Blind spot — disclosed via RunSummary, not buried behind --verbose: a
25
+ low-volume principal performing a small number of high-impact actions is not
26
+ reliably caught by any of these signals. Principals below ``min_events`` are
27
+ set aside; their count is surfaced via ``below_floor_count()``, which the
28
+ runner reads during RunSummary note assembly. The "first-seen" label is also
29
+ window-relative; the runner emits a second note disclosing that limitation.
30
+
31
+ Investigation pivot: principal → CloudTrail console / event_id drill-back →
32
+ source IPs → whois / threat-intel on non-AWS source IPs → regions touched.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ from datetime import datetime, timezone
38
+ from typing import Any
39
+
40
+ import numpy as np
41
+ import pandas as pd
42
+
43
+ from loghunter.common.finding import DetectorContext, Finding, MethodTag, Severity
44
+
45
# Module-level identity constants (presumably read by the detector registry /
# runner — confirm against runner.py).
DETECTOR_NAME = "aws"
STATUS = "available"

# Log inputs this detector needs. The "pattern" value is also the key run()
# uses to look the parsed frame up in ``context.logs``.
REQUIRED_LOGS = [
    dict(source="cloudtrail_dir", pattern="*.json*"),
]

# This detector has no optional inputs.
OPTIONAL_LOGS: list[dict] = []

# Method label for Findings from this detector: statistical, not a named method.
DETECTOR_METHOD = MethodTag("statistical", named=False)
55
+
56
# Built-in defaults; run() falls back to these for any key the user's detector
# config does not supply.
DEFAULT_CONFIG: dict[str, float] = {
    # ── Ranked tier: scoring floor ──────────────────────────────────────────
    # Interactive principals with fewer events than this are set aside and not
    # scored; how many were set aside is reported by ``below_floor_count()``.
    # Valid: int >= 1.
    "min_events": 50,

    # ── Burst tier: grouping ────────────────────────────────────────────────
    # Two consecutive first-seen actions stay in the same burst while the gap
    # between them is strictly below this many seconds.
    # Valid: int > 0 (seconds).
    "burst_gap_seconds": 300,

    # Minimum number of first-seen actions a burst needs to become a Finding.
    # Valid: int >= 2.
    "burst_min_firsts": 3,

    # ── Burst tier: severity gates ──────────────────────────────────────────
    # Bursts are MEDIUM unless they reach EITHER gate below, which promotes
    # them to HIGH. Size alone never produces HIGH — a noisy but benign sweep
    # does not deserve a manufactured verdict.
    # Valid: error rate in [0, 1]; service count int >= 1.
    "burst_high_error_rate": 0.5,
    "burst_high_service_count": 3,

    # ── Ranked tier: severity bands ─────────────────────────────────────────
    # Absolute composite-z cut-offs, deliberately NOT rank position: being top
    # of a clean corpus must not earn a severity by itself.
    # Valid: float, low <= medium.
    "composite_medium_threshold": 2.0,  # at or above → MEDIUM (~2σ-equivalent)
    "composite_low_threshold": 1.0,     # at or above → LOW; below → INFO band
}
84
+
85
+
86
+ # ── Pure helper: below-floor count ────────────────────────────────────────────
87
+ #
88
+ # Pre-detector: the runner calls this during RunSummary note assembly (before
89
+ # the detector loop starts, see runner.py memory note runsummary-built-before-
90
+ # detectors). run() applies the complementary ``event_count >= min_events``
91
+ # filter to the same interactive-lane frame, so the two counts stay in step.
92
+
93
def below_floor_count(df: pd.DataFrame | None, min_events: int) -> int:
    """Count interactive-lane principals that have fewer than ``min_events`` events.

    Pure function over the CloudTrail frame: nothing is mutated. Yields 0 when
    ``df`` is None or empty, when the ``lane`` or ``principal`` column is
    absent, or when no row is in the interactive lane.
    """
    usable = (
        df is not None
        and not df.empty
        and "lane" in df.columns
        and "principal" in df.columns
    )
    if not usable:
        return 0

    lane_mask = df["lane"] == "interactive"
    interactive_rows = df[lane_mask]
    if interactive_rows.empty:
        return 0

    events_per_principal = interactive_rows.groupby("principal").size()
    under_floor = events_per_principal < min_events
    return int(under_floor.sum())
108
+
109
+
110
def interactive_count(df: pd.DataFrame | None) -> int:
    """Count the interactive-lane EVENTS (rows) in the CloudTrail frame.

    Pure function. Yields 0 for a None or empty frame and for a frame without
    a ``lane`` column. The result is 0 exactly when ``_filter_interactive(df)``
    would be empty — the condition the runner's no-interactive disclosure note
    keys on.
    """
    if df is None or df.empty or "lane" not in df.columns:
        return 0
    is_interactive = df["lane"].eq("interactive")
    return int(is_interactive.sum())
122
+
123
+
124
+ # ── Front half: lane filter, per-principal aggregation ────────────────────────
125
+
126
def _filter_interactive(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the rows whose ``lane`` is ``"interactive"``.

    Filtering happens per event, before any per-principal aggregation, so a
    principal that appears in both lanes contributes only its interactive
    events. A frame with no ``lane`` column yields a zero-row frame with the
    same columns.
    """
    has_lane = "lane" in df.columns
    if has_lane:
        return df[df["lane"] == "interactive"]
    return df.iloc[0:0]
136
+
137
+
138
def _shannon_entropy(value_counts: pd.Series) -> float:
    """Shannon entropy (base 2) of a value-count distribution.

    Returns 0.0 when the counts sum to zero or less, or when no count is
    positive.
    """
    total = value_counts.sum()
    if total <= 0:
        return 0.0
    probs = value_counts / total
    nonzero = probs[probs > 0]
    if nonzero.empty:
        return 0.0
    return float(-(nonzero * np.log2(nonzero)).sum())


# Column order of the frame returned by _aggregate_per_principal().
_PER_PRINCIPAL_COLUMNS = [
    "principal", "event_count", "error_rate",
    "distinct_source_ip", "distinct_event_name", "distinct_event_source",
    "read_ratio", "action_entropy",
    "distinct_aws_region", "distinct_hours_active",
]


def _aggregate_per_principal(interactive_df: pd.DataFrame) -> pd.DataFrame:
    """Build one row of behavioral features per principal.

    ``interactive_df`` is expected to be already lane-filtered by the caller.
    Every feature column other than ``principal`` is optional: when its input
    column is missing the feature falls back to 0 (or 0.0) for every principal.
    Rows whose ``principal`` is missing are dropped by the groupby.

    Returns a frame with the columns of ``_PER_PRINCIPAL_COLUMNS``; it has zero
    rows when the input is empty or has no ``principal`` column.
    """
    if interactive_df.empty or "principal" not in interactive_df.columns:
        return pd.DataFrame(columns=_PER_PRINCIPAL_COLUMNS)

    columns = interactive_df.columns
    # sort=False keeps principals in order of first appearance; every grouped
    # result below shares that order, which is what makes the positional
    # ``.values`` assembly at the end safe.
    grouped = interactive_df.groupby("principal", sort=False)
    event_count = grouped.size()

    def _constant(default: float | int) -> pd.Series:
        """Fallback feature: the same value for every principal."""
        return pd.Series(default, index=event_count.index)

    def _distinct(column: str) -> pd.Series:
        """Per-principal count of distinct non-null values in ``column``."""
        if column in columns:
            return grouped[column].nunique()
        return _constant(0)

    def _grouped_helper(values: np.ndarray) -> Any:
        """Group a derived per-row array by principal, in the same order."""
        helper = interactive_df.assign(_derived=values)
        return helper.groupby("principal", sort=False)["_derived"]

    # An event counts as errored when its error_code is non-null.
    if "error_code" in columns:
        error_count = grouped["error_code"].count()
    else:
        error_count = _constant(0)
    error_rate = (error_count / event_count).astype(float)

    if "read_write" in columns:
        is_read = interactive_df["read_write"].eq("read").to_numpy()
        read_count = _grouped_helper(is_read).sum()
    else:
        read_count = _constant(0)
    read_ratio = (read_count / event_count).astype(float)

    if "event_name" in columns:
        action_entropy = grouped["event_name"].apply(
            lambda names: _shannon_entropy(names.value_counts())
        )
    else:
        action_entropy = _constant(0.0)

    if "ts" in columns:
        # ts is read as epoch seconds; unparseable values become NaT and are
        # ignored by nunique().
        hour_of_day = pd.to_datetime(
            interactive_df["ts"], unit="s", utc=True, errors="coerce"
        ).dt.hour
        distinct_hours = _grouped_helper(hour_of_day.values).nunique()
    else:
        distinct_hours = _constant(0)

    return pd.DataFrame({
        "principal": list(event_count.index),
        "event_count": event_count.values.astype(int),
        "error_rate": error_rate.values,
        "distinct_source_ip": _distinct("source_ip").values.astype(int),
        "distinct_event_name": _distinct("event_name").values.astype(int),
        "distinct_event_source": _distinct("event_source").values.astype(int),
        "read_ratio": read_ratio.values,
        "action_entropy": action_entropy.values,
        "distinct_aws_region": _distinct("aws_region").values.astype(int),
        "distinct_hours_active": distinct_hours.values.astype(int),
    })
233
+
234
+
235
+ # ── Signal 1: corpus rarity ───────────────────────────────────────────────────
236
+
237
def _compute_rarity(interactive_df: pd.DataFrame) -> dict[str, float]:
    """Map each event name to ``log10(N / count(event_name))``.

    ``N`` is the number of rows with a non-null ``event_name`` in the frame the
    caller passes in (run() passes interactive-lane events only). A larger
    value means the action is rarer in this corpus. An empty frame, a frame
    without ``event_name``, or one with no non-null names gives ``{}``.
    """
    if interactive_df.empty:
        return {}
    if "event_name" not in interactive_df.columns:
        return {}

    frequency = interactive_df["event_name"].dropna().value_counts()
    total = int(frequency.sum())
    if total == 0:
        return {}

    # log10(N / c) written as log10(N) - log10(c).
    log_total = np.log10(float(total))
    log_counts = np.log10(frequency.astype(float).values)
    return {
        str(name): float(log_total - log_count)
        for name, log_count in zip(frequency.index, log_counts)
    }
251
+
252
+
253
+ # ── Signal 2: behavioral weirdness composite ──────────────────────────────────
254
+
255
def _zscore(values: np.ndarray) -> np.ndarray:
    """Population z-score of ``values`` as a new float array.

    A degenerate population collapses to all zeros. "Degenerate" is judged
    with a relative tolerance rather than ``std == 0``: the mean of identical
    floats (e.g. three copies of 0.1) can differ from them by one rounding
    step, which gives a std around 1e-17 and would otherwise turn a constant
    feature into spurious ±1 z-scores.
    """
    data = np.array(values, dtype=float)
    if data.size == 0:
        return data
    mean = float(data.mean())
    std = float(data.std())
    # Spread at or below rounding noise relative to the data's scale.
    if std <= 1e-12 * max(1.0, abs(mean)):
        return np.zeros_like(data)
    return (data - mean) / std


def _compute_weirdness(scorable: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``scorable`` with component z-scores and their sum.

    Adds ``z_error_rate``, ``z_distinct_source_ip``, ``z_distinct_event_name``,
    ``z_action_entropy`` and ``composite_z`` (the plain sum of the four). The
    two count features are log1p-scaled before z-scoring; ``error_rate`` and
    ``action_entropy`` are z-scored as they are.

    The caller is responsible for having filtered to principals at or above
    the event floor; no filtering is done here. An empty input is returned
    unchanged (no columns are added).
    """
    if scorable.empty:
        return scorable

    out = scorable.copy()
    out["z_error_rate"] = _zscore(out["error_rate"].values)
    out["z_distinct_source_ip"] = _zscore(
        np.log1p(out["distinct_source_ip"].values.astype(float))
    )
    out["z_distinct_event_name"] = _zscore(
        np.log1p(out["distinct_event_name"].values.astype(float))
    )
    out["z_action_entropy"] = _zscore(out["action_entropy"].values)
    out["composite_z"] = (
        out["z_error_rate"]
        + out["z_distinct_source_ip"]
        + out["z_distinct_event_name"]
        + out["z_action_entropy"]
    )
    return out
295
+
296
+
297
+ # ── Signal 3: first-seen + burst aggregation ──────────────────────────────────
298
+
299
def _is_missing(value: Any) -> bool:
    """True for None and for scalar NaN / NaT / pd.NA cells."""
    if value is None:
        return True
    flag = pd.isna(value)
    # pd.isna returns an array for list-like cells; treat those as present.
    return bool(flag) if isinstance(flag, (bool, np.bool_)) else False


def _compute_bursts(
    interactive_df: pd.DataFrame,
    rarity: dict[str, float],
    burst_gap_seconds: int,
    burst_min_firsts: int,
) -> list[dict]:
    """Time-ordered pass yielding per-principal burst candidates.

    For each principal, the very first event is skipped (everything would be
    new, which is uninformative) and only seeds that principal's seen set.
    Later events whose ``event_name`` has not yet been seen for the principal
    become first-seen records. Consecutive first-seen records strictly less
    than ``burst_gap_seconds`` apart form one burst; a burst with at least
    ``burst_min_firsts`` records becomes a candidate (see ``_summarize_burst``
    for the shape of each candidate).

    Rows with a missing (None / NaN) ``principal``, ``event_name`` or ``ts``
    are ignored, and missing optional cells (``event_source``, ``source_ip``,
    ``aws_region``, ``event_id``) are recorded as ``""`` so they never show up
    as the literal string ``"nan"`` in a burst summary.

    "First seen" means first seen within the frame passed in; nothing is
    remembered across calls. Returns ``[]`` when the frame is empty or lacks
    any of ``ts``, ``principal``, ``event_name``.
    """
    needed = {"ts", "principal", "event_name"}
    if interactive_df.empty or not needed.issubset(interactive_df.columns):
        return []

    # Stable sort keeps the input order of events that share a timestamp.
    ordered = interactive_df.sort_values("ts", kind="stable").reset_index(drop=True)
    available = set(ordered.columns)

    def _text(row: Any, column: str) -> str:
        """Optional cell as text; absent column or missing value gives ""."""
        if column not in available:
            return ""
        value = getattr(row, column)
        return "" if _is_missing(value) else str(value)

    seen_actions: dict[Any, set] = {}
    first_seen_records: dict[Any, list[dict]] = {}

    for row in ordered.itertuples(index=False):
        principal = row.principal
        event_name = row.event_name
        ts = row.ts
        if _is_missing(principal) or _is_missing(event_name) or _is_missing(ts):
            continue

        known = seen_actions.get(principal)
        if known is None:
            # Very first event for this principal — seed the seen set only.
            seen_actions[principal] = {event_name}
            continue
        if event_name in known:
            continue

        known.add(event_name)
        first_seen_records.setdefault(principal, []).append({
            "ts": float(ts),
            "event_name": str(event_name),
            "rarity": rarity.get(str(event_name), 0.0),
            # Errored means the event carries a non-null error_code.
            "errored": (
                "error_code" in available
                and not _is_missing(row.error_code)
            ),
            "event_source": _text(row, "event_source"),
            "source_ip": _text(row, "source_ip"),
            "aws_region": _text(row, "aws_region"),
            "event_id": _text(row, "event_id"),
        })

    bursts: list[dict] = []
    for principal, records in first_seen_records.items():
        current = [records[0]]
        for rec in records[1:]:
            if rec["ts"] - current[-1]["ts"] < burst_gap_seconds:
                current.append(rec)
                continue
            # Gap reached the threshold: close the running burst, start anew.
            if len(current) >= burst_min_firsts:
                bursts.append(_summarize_burst(principal, current))
            current = [rec]
        if len(current) >= burst_min_firsts:
            bursts.append(_summarize_burst(principal, current))

    return bursts


def _summarize_burst(principal: str, records: list[dict]) -> dict:
    """Compute per-burst aggregates from a list of first-seen records.

    ``records`` must be non-empty and in time order: the first and last
    entries define the burst's start and span. Empty-string services, IPs,
    regions and event ids are left out of the corresponding lists. At most ten
    event ids are kept as drill-back samples.
    """
    n = len(records)
    start_ts = records[0]["ts"]
    end_ts = records[-1]["ts"]
    new_services = sorted({r["event_source"] for r in records if r["event_source"]})
    source_ips = sorted({r["source_ip"] for r in records if r["source_ip"]})
    aws_regions = sorted({r["aws_region"] for r in records if r["aws_region"]})
    new_actions = [r["event_name"] for r in records]
    event_ids = [r["event_id"] for r in records if r["event_id"]]
    error_count = sum(1 for r in records if r["errored"])
    error_rate = error_count / n if n else 0.0
    mean_rarity = sum(r["rarity"] for r in records) / n if n else 0.0
    return {
        "principal": str(principal),
        "start_time": datetime.fromtimestamp(start_ts, tz=timezone.utc).isoformat(),
        "start_ts": start_ts,
        "span_seconds": float(end_ts - start_ts),
        "new_action_count": int(n),
        "new_service_count": int(len(new_services)),
        "new_actions": new_actions,
        "new_services": new_services,
        "source_ips": source_ips,
        "aws_regions": aws_regions,
        "sample_event_ids": event_ids[:10],
        "error_rate": round(error_rate, 4),
        "mean_rarity": round(mean_rarity, 4),
    }
421
+
422
+
423
+ # ── Span / formatting helpers ─────────────────────────────────────────────────
424
+
425
def _span_str(seconds: float) -> str:
    """Render a duration compactly in its largest fitting unit: 45s / 7m / 3h / 2d.

    The value is truncated to whole seconds first, and the chosen unit is
    floor-divided, so 119 seconds renders as ``1m``.
    """
    whole = int(seconds)
    # (exclusive upper bound in seconds, seconds per unit, unit suffix)
    ladder = ((60, 1, "s"), (3600, 60, "m"), (86400, 3600, "h"))
    for upper_bound, unit_seconds, suffix in ladder:
        if whole < upper_bound:
            return f"{whole // unit_seconds}{suffix}"
    return f"{whole // 86400}d"
435
+
436
+
437
+ # ── Finding constructors ──────────────────────────────────────────────────────
438
+
439
def _make_burst_finding(
    burst: dict,
    burst_high_error_rate: float,
    burst_high_service_count: int,
    now: datetime,
    data_window: tuple[datetime, datetime],
) -> Finding:
    """Turn one burst summary dict into one Finding.

    Severity is MEDIUM unless the burst's error rate or its new-service count
    reaches the corresponding gate, in which case it is HIGH (either gate is
    enough). The evidence carries the burst summary fields under
    ``tier == "burst"``.
    """
    principal = burst["principal"]
    action_count = burst["new_action_count"]
    service_count = burst["new_service_count"]
    error_rate = burst["error_rate"]

    reaches_gate = (
        error_rate >= burst_high_error_rate
        or service_count >= burst_high_service_count
    )
    severity = Severity.HIGH if reaches_gate else Severity.MEDIUM

    description = (
        f"{action_count} first-seen action(s) across "
        f"{service_count} service(s) in "
        f"{_span_str(burst['span_seconds'])} "
        f"({error_rate:.0%} errored). Pattern resembles an "
        "enumeration / recon sweep — recon, manual exploration, or a "
        "misconfigured first-time deploy."
    )

    # Only the first five ids / IPs are listed in the steps; the full lists
    # are in the evidence.
    listed_ids = ", ".join(burst["sample_event_ids"][:5])
    listed_ips = ", ".join(burst["source_ips"][:5])
    listed_regions = ", ".join(burst["aws_regions"])
    next_steps = [
        f"Review CloudTrail events for principal {principal}",
        "Drill back via event IDs: " + listed_ids,
        "Verify source IPs are expected: " + listed_ips,
        "Regions touched: " + listed_regions,
        "Whois / threat-intel any non-AWS source IPs.",
    ]

    copied_fields = (
        "principal",
        "start_time",
        "span_seconds",
        "new_action_count",
        "new_service_count",
        "error_rate",
        "mean_rarity",
        "new_actions",
        "new_services",
        "source_ips",
        "aws_regions",
        "sample_event_ids",
    )
    evidence: dict[str, Any] = {"tier": "burst"}
    for field in copied_fields:
        evidence[field] = burst[field]

    return Finding(
        detector=DETECTOR_NAME,
        severity=severity,
        title=str(principal),
        description=description,
        evidence=evidence,
        next_steps=next_steps,
        ts_generated=now,
        data_window=data_window,
    )
492
+
493
+
494
def _make_ranked_finding(
    row: pd.Series,
    severity: Severity,
    interactive_df: pd.DataFrame,
    now: datetime,
    data_window: tuple[datetime, datetime],
) -> Finding:
    """Turn one scored principal row into one Finding.

    ``row`` supplies the composite, the four component z-scores and the raw
    feature values; ``interactive_df`` is re-queried for that principal's
    events to collect pivots (top five actions, up to ten source IPs, regions,
    up to five event ids). ``severity`` is decided by the caller.
    """
    principal = row["principal"]
    events = interactive_df[interactive_df["principal"] == principal]
    present = events.columns

    def _distinct_strings(column: str) -> list:
        """Sorted distinct string values of ``column``; [] if column absent."""
        if column not in present:
            return []
        distinct = events[column].dropna().unique()
        return sorted(value for value in distinct if isinstance(value, str))

    def _rounded(key: str) -> float:
        return round(float(row[key]), 4)

    def _whole(key: str) -> int:
        return int(row[key])

    top_actions = []
    if "event_name" in present:
        top_actions = events["event_name"].value_counts().head(5).index.tolist()

    source_ips = _distinct_strings("source_ip")[:10]
    aws_regions = _distinct_strings("aws_region")

    sample_event_ids = []
    if "event_id" in present:
        sample_event_ids = [
            value for value in events["event_id"].head(5).tolist()
            if isinstance(value, str)
        ]

    description = (
        f"composite z-score {row['composite_z']:.2f} across error rate, "
        "distinct source IPs, distinct action names, and action entropy — this "
        "principal's behavioral fingerprint is unusual for the population."
    )
    next_steps = [
        f"Review CloudTrail events for principal {principal}",
        "Pivot on top actions: " + ", ".join(top_actions),
        "Whois / threat-intel any non-AWS source IPs: " + ", ".join(source_ips[:5]),
        "Drill back via event IDs: " + ", ".join(sample_event_ids),
    ]
    evidence: dict[str, Any] = {
        "tier": "ranked",
        "principal": str(principal),
        # Composite and its four components.
        "composite_z": _rounded("composite_z"),
        "z_error_rate": _rounded("z_error_rate"),
        "z_distinct_source_ip": _rounded("z_distinct_source_ip"),
        "z_distinct_event_name": _rounded("z_distinct_event_name"),
        "z_action_entropy": _rounded("z_action_entropy"),
        # Raw feature values behind the scores.
        "event_count": _whole("event_count"),
        "error_rate": _rounded("error_rate"),
        "distinct_source_ip": _whole("distinct_source_ip"),
        "distinct_event_name": _whole("distinct_event_name"),
        "distinct_event_source": _whole("distinct_event_source"),
        "action_entropy": _rounded("action_entropy"),
        "read_ratio": _rounded("read_ratio"),
        "distinct_aws_region": _whole("distinct_aws_region"),
        "distinct_hours_active": _whole("distinct_hours_active"),
        # Investigation pivots.
        "top_actions": top_actions,
        "source_ips": source_ips,
        "aws_regions": aws_regions,
        "sample_event_ids": sample_event_ids,
    }
    return Finding(
        detector=DETECTOR_NAME,
        severity=severity,
        title=str(principal),
        description=description,
        evidence=evidence,
        next_steps=next_steps,
        ts_generated=now,
        data_window=data_window,
    )
566
+
567
+
568
def _make_ranked_summary_finding(
    scored: pd.DataFrame,
    now: datetime,
    data_window: tuple[datetime, datetime],
) -> Finding:
    """Build the single INFO "nothing stood out" Finding for the ranked tier.

    run() calls this only when at least one principal was scored and none of
    them produced a MEDIUM or LOW Finding. The Finding reports how many
    principals were scored and names the one with the highest composite (the
    least-unremarkable actor) as a pivot. ``scored`` must be non-empty.
    """
    ranked = scored.sort_values("composite_z", ascending=False)
    leader = ranked.iloc[0]
    leader_composite = float(leader["composite_z"])
    scored_total = len(scored)

    description = (
        f"{scored_total} interactive principal(s) were scored; none cleared the "
        f"LOW band. Least-unremarkable actor: {leader['principal']} "
        f"(composite z = {leader_composite:.2f})."
    )
    evidence = {
        "tier": "ranked_summary",
        "scorable_count": int(scored_total),
        "top_principal": str(leader["principal"]),
        "top_composite_z": round(leader_composite, 4),
    }
    next_steps = [
        "No recommended action — nothing stood out.",
        "Lower composite_low_threshold in [detectors.aws] to widen the surface.",
    ]
    return Finding(
        detector=DETECTOR_NAME,
        severity=Severity.INFO,
        title="ranked tier: no principals cleared the LOW band",
        description=description,
        evidence=evidence,
        next_steps=next_steps,
        ts_generated=now,
        data_window=data_window,
    )
602
+
603
+
604
+ # ── Detector entry point ──────────────────────────────────────────────────────
605
+
606
def run(context: DetectorContext) -> list[Finding]:
    """Detector entry point: burst Findings first, then ranked-principal Findings.

    Reads settings from ``context.config`` (falling back to ``DEFAULT_CONFIG``)
    and the parsed CloudTrail frame from ``context.logs["*.json*"]``. Returns
    ``[]`` when there is no frame, the frame is empty, or it holds no
    interactive-lane events.
    """
    overrides = context.config

    def _setting(key: str) -> Any:
        """User-supplied value for ``key``, else the built-in default."""
        return overrides.get(key, DEFAULT_CONFIG[key])

    min_events: int = _setting("min_events")
    burst_gap: int = _setting("burst_gap_seconds")
    burst_min_firsts: int = _setting("burst_min_firsts")
    burst_high_err: float = _setting("burst_high_error_rate")
    burst_high_svcs: int = _setting("burst_high_service_count")
    medium_threshold: float = _setting("composite_medium_threshold")
    low_threshold: float = _setting("composite_low_threshold")

    frame = context.logs.get("*.json*")
    if frame is None or frame.empty:
        return []

    interactive = _filter_interactive(frame)
    if interactive.empty:
        return []

    # Front half: derive features and signals.
    per_principal = _aggregate_per_principal(interactive)
    meets_floor = per_principal["event_count"] >= min_events
    scorable = per_principal[meets_floor].copy()

    rarity = _compute_rarity(interactive)
    scored = _compute_weirdness(scorable)
    burst_dicts = _compute_bursts(interactive, rarity, burst_gap, burst_min_firsts)

    # Back half: every Finding from this call shares one generation timestamp.
    now = datetime.now(tz=timezone.utc)

    # Burst tier, widest service spread first, then most new actions.
    burst_findings = []
    for burst in burst_dicts:
        burst_findings.append(
            _make_burst_finding(
                burst, burst_high_err, burst_high_svcs, now, context.data_window
            )
        )
    burst_findings.sort(
        key=lambda f: (f.evidence["new_service_count"], f.evidence["new_action_count"]),
        reverse=True,
    )

    # Ranked tier: one Finding per principal in the MEDIUM or LOW band,
    # highest composite first. Principals below the LOW threshold are not
    # emitted individually.
    ranked_findings: list[Finding] = []
    if not scored.empty:
        scored_sorted = scored.sort_values("composite_z", ascending=False)
        for _, row in scored_sorted.iterrows():
            composite = float(row["composite_z"])
            if composite >= medium_threshold:
                band = Severity.MEDIUM
            elif composite >= low_threshold:
                band = Severity.LOW
            else:
                continue
            ranked_findings.append(
                _make_ranked_finding(row, band, interactive, now, context.data_window)
            )

        # Principals were scored but none reached a band: emit one INFO
        # summary so the analyst can see the tier ran.
        if not ranked_findings:
            ranked_findings.append(
                _make_ranked_summary_finding(scored_sorted, now, context.data_window)
            )

    return burst_findings + ranked_findings