loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,671 @@
|
|
|
1
|
+
"""AWS detector — per-principal behavioral surfacing from CloudTrail events.
|
|
2
|
+
|
|
3
|
+
Reads the canonical 12-column per-event frame produced by parsers/cloudtrail.py
|
|
4
|
+
(Thread A) and surfaces two tiers of Findings:
|
|
5
|
+
|
|
6
|
+
1. **Burst sweeps** — per-principal first-seen actions clumped within a
|
|
7
|
+
sliding gap become one "enumeration sweep" Finding. The strongest primitive
|
|
8
|
+
we have, glanceable on a single line.
|
|
9
|
+
2. **Ranked principals** — a model-free transparent z-score composite over
|
|
10
|
+
intuitive danger signals (error rate, distinct source IPs, distinct action
|
|
11
|
+
names, action entropy). Severity is by absolute composite bands, not rank
|
|
12
|
+
position; on a clean corpus the tier honestly reports nothing stood out
|
|
13
|
+
rather than manufacturing a verdict.
|
|
14
|
+
|
|
15
|
+
Architecture mirrors detectors/dns.py: front half does feature derivation, back
|
|
16
|
+
half assembles Findings at a single shared point. Service-lane events are
|
|
17
|
+
excluded from all three signals — AWS-run background activity is supposed to be
|
|
18
|
+
broad and repetitive; scoring it produces noise.
|
|
19
|
+
|
|
20
|
+
Model-free by design. pandas + numpy only — reaching for sklearn would betray
|
|
21
|
+
the transparency point (a humble user must be able to read why a principal was
|
|
22
|
+
surfaced).
|
|
23
|
+
|
|
24
|
+
Blind spot — disclosed via RunSummary, not buried behind --verbose: a
|
|
25
|
+
low-volume principal performing a small number of high-impact actions is not
|
|
26
|
+
reliably caught by any of these signals. Principals below ``min_events`` are
|
|
27
|
+
set aside; their count is surfaced via ``below_floor_count()``, which the
|
|
28
|
+
runner reads during RunSummary note assembly. The "first-seen" label is also
|
|
29
|
+
window-relative; the runner emits a second note disclosing that limitation.
|
|
30
|
+
|
|
31
|
+
Investigation pivot: principal → CloudTrail console / event_id drill-back →
|
|
32
|
+
source IPs → whois / threat-intel on non-AWS source IPs → regions touched.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
from datetime import datetime, timezone
|
|
38
|
+
from typing import Any
|
|
39
|
+
|
|
40
|
+
import numpy as np
|
|
41
|
+
import pandas as pd
|
|
42
|
+
|
|
43
|
+
from loghunter.common.finding import DetectorContext, Finding, MethodTag, Severity
|
|
44
|
+
|
|
45
|
+
# Module-level detector metadata.
# NOTE(review): presumably consumed by the runner's detector discovery — confirm.
DETECTOR_NAME = "aws"
STATUS = "available"

# Inputs this detector needs: files matching ``*.json*`` under the
# ``cloudtrail_dir`` source.
REQUIRED_LOGS = [
    {"source": "cloudtrail_dir", "pattern": "*.json*"},
]

# No optional inputs.
OPTIONAL_LOGS: list[dict] = []

DETECTOR_METHOD = MethodTag("statistical", named=False)

# Defaults for every tunable read by this detector. ``run()`` falls back to
# these values when a key is absent from the supplied config.
DEFAULT_CONFIG = {
    # Per-principal event floor. Interactive principals with fewer events are
    # set aside (not scored). Count surfaced via the RunSummary below-floor note.
    # Valid: int >= 1.
    "min_events": 50,

    # Burst aggregation gap: consecutive first-seen actions whose inter-arrival
    # gap is strictly less than this threshold remain in the same burst.
    # Valid: seconds, int > 0.
    "burst_gap_seconds": 300,

    # A burst must contain at least this many first-seen actions to be a Finding.
    # Valid: int >= 2.
    "burst_min_firsts": 3,

    # Severity escalation gates for bursts. A burst at-or-above EITHER gate
    # promotes from MEDIUM to HIGH. Never auto-HIGH on size alone — that would
    # manufacture verdicts a noisy-but-benign sweep does not deserve.
    # Valid: error_rate in [0,1], service_count int >= 1.
    "burst_high_error_rate": 0.5,
    "burst_high_service_count": 3,

    # Absolute composite-z bands for ranked-principal severity. NOT rank
    # position — a clean corpus should not have a HIGH purely for being
    # top-of-list. Valid: float, low <= medium.
    "composite_medium_threshold": 2.0,  # ~2σ-equivalent → MEDIUM
    "composite_low_threshold": 1.0,     # mild standout → LOW; below → INFO band
}
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
# ── Pure helper: below-floor count ────────────────────────────────────────────
|
|
87
|
+
#
|
|
88
|
+
# Pre-detector: the runner calls this during RunSummary note assembly (before
|
|
89
|
+
# the detector loop starts, see runner.py memory note runsummary-built-before-
|
|
90
|
+
# detectors). The detector also calls it internally to size the scorable set,
|
|
91
|
+
# so the disclosed count cannot drift from the analysis count.
|
|
92
|
+
|
|
93
|
+
def below_floor_count(df: pd.DataFrame | None, min_events: int) -> int:
    """Count interactive-lane principals that have fewer than ``min_events`` rows.

    Args:
        df: Per-event frame; only the ``lane`` and ``principal`` columns are read.
        min_events: Event floor; principals strictly below it are counted.

    Returns:
        The number of such principals, or 0 when ``df`` is ``None``, empty,
        lacks either column, or holds no interactive-lane rows.
    """
    if df is None or df.empty:
        return 0
    if not {"lane", "principal"}.issubset(df.columns):
        return 0

    interactive_rows = df.loc[df["lane"] == "interactive"]
    if interactive_rows.empty:
        return 0

    events_per_principal = interactive_rows.groupby("principal").size()
    under_floor = events_per_principal < min_events
    return int(under_floor.sum())
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def interactive_count(df: pd.DataFrame | None) -> int:
    """Count the rows of ``df`` whose ``lane`` equals ``"interactive"``.

    Returns 0 when ``df`` is ``None``, empty, or has no ``lane`` column, so a
    zero result coincides with an empty interactive-lane filter.
    """
    if df is None or df.empty or "lane" not in df.columns:
        return 0
    is_interactive = df["lane"] == "interactive"
    return int(is_interactive.sum())
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# ── Front half: lane filter, per-principal aggregation ────────────────────────
|
|
125
|
+
|
|
126
|
+
def _filter_interactive(df: pd.DataFrame) -> pd.DataFrame:
|
|
127
|
+
"""Return only interactive-lane events.
|
|
128
|
+
|
|
129
|
+
The parser emits ``lane`` per event. We filter first, then aggregate the
|
|
130
|
+
resulting subset by principal — no assumption that a principal is purely
|
|
131
|
+
one lane. Service-lane events do not feed rarity, weirdness, or bursts.
|
|
132
|
+
"""
|
|
133
|
+
if "lane" not in df.columns:
|
|
134
|
+
return df.iloc[0:0]
|
|
135
|
+
return df[df["lane"] == "interactive"]
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _shannon_entropy(value_counts: pd.Series) -> float:
|
|
139
|
+
"""Shannon entropy (base 2) of a value-count distribution."""
|
|
140
|
+
total = value_counts.sum()
|
|
141
|
+
if total <= 0:
|
|
142
|
+
return 0.0
|
|
143
|
+
probs = value_counts / total
|
|
144
|
+
nonzero = probs[probs > 0]
|
|
145
|
+
if nonzero.empty:
|
|
146
|
+
return 0.0
|
|
147
|
+
return float(-(nonzero * np.log2(nonzero)).sum())
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
_PER_PRINCIPAL_COLUMNS = [
|
|
151
|
+
"principal", "event_count", "error_rate",
|
|
152
|
+
"distinct_source_ip", "distinct_event_name", "distinct_event_source",
|
|
153
|
+
"read_ratio", "action_entropy",
|
|
154
|
+
"distinct_aws_region", "distinct_hours_active",
|
|
155
|
+
]
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _aggregate_per_principal(interactive_df: pd.DataFrame) -> pd.DataFrame:
|
|
159
|
+
"""One row per principal in the interactive lane, with behavioral features.
|
|
160
|
+
|
|
161
|
+
All features derive from the canonical 12-column schema; we never recompute
|
|
162
|
+
principal/lane/read_write — those come from the parser.
|
|
163
|
+
"""
|
|
164
|
+
if interactive_df.empty:
|
|
165
|
+
return pd.DataFrame(columns=_PER_PRINCIPAL_COLUMNS)
|
|
166
|
+
|
|
167
|
+
g = interactive_df.groupby("principal", sort=False)
|
|
168
|
+
event_count = g.size()
|
|
169
|
+
|
|
170
|
+
def _series(col_in_df: bool, default: float | int) -> pd.Series:
|
|
171
|
+
return pd.Series(default, index=event_count.index)
|
|
172
|
+
|
|
173
|
+
if "error_code" in interactive_df.columns:
|
|
174
|
+
error_count = g["error_code"].apply(lambda s: int(s.notna().sum()))
|
|
175
|
+
else:
|
|
176
|
+
error_count = _series(False, 0)
|
|
177
|
+
error_rate = (error_count / event_count).astype(float)
|
|
178
|
+
|
|
179
|
+
distinct_source_ip = (
|
|
180
|
+
g["source_ip"].nunique() if "source_ip" in interactive_df.columns
|
|
181
|
+
else _series(False, 0)
|
|
182
|
+
)
|
|
183
|
+
distinct_event_name = (
|
|
184
|
+
g["event_name"].nunique() if "event_name" in interactive_df.columns
|
|
185
|
+
else _series(False, 0)
|
|
186
|
+
)
|
|
187
|
+
distinct_event_source = (
|
|
188
|
+
g["event_source"].nunique() if "event_source" in interactive_df.columns
|
|
189
|
+
else _series(False, 0)
|
|
190
|
+
)
|
|
191
|
+
distinct_aws_region = (
|
|
192
|
+
g["aws_region"].nunique() if "aws_region" in interactive_df.columns
|
|
193
|
+
else _series(False, 0)
|
|
194
|
+
)
|
|
195
|
+
|
|
196
|
+
if "read_write" in interactive_df.columns:
|
|
197
|
+
read_count = g["read_write"].apply(lambda s: int((s == "read").sum()))
|
|
198
|
+
else:
|
|
199
|
+
read_count = _series(False, 0)
|
|
200
|
+
read_ratio = (read_count / event_count).astype(float)
|
|
201
|
+
|
|
202
|
+
if "event_name" in interactive_df.columns:
|
|
203
|
+
action_entropy = g["event_name"].apply(
|
|
204
|
+
lambda s: _shannon_entropy(s.value_counts())
|
|
205
|
+
)
|
|
206
|
+
else:
|
|
207
|
+
action_entropy = _series(False, 0.0)
|
|
208
|
+
|
|
209
|
+
if "ts" in interactive_df.columns:
|
|
210
|
+
hours = pd.to_datetime(
|
|
211
|
+
interactive_df["ts"], unit="s", utc=True, errors="coerce"
|
|
212
|
+
).dt.hour
|
|
213
|
+
with_hour = interactive_df.assign(_hour=hours.values)
|
|
214
|
+
distinct_hours = (
|
|
215
|
+
with_hour.groupby("principal", sort=False)["_hour"].nunique()
|
|
216
|
+
)
|
|
217
|
+
else:
|
|
218
|
+
distinct_hours = _series(False, 0)
|
|
219
|
+
|
|
220
|
+
out = pd.DataFrame({
|
|
221
|
+
"principal": list(event_count.index),
|
|
222
|
+
"event_count": event_count.values.astype(int),
|
|
223
|
+
"error_rate": error_rate.values,
|
|
224
|
+
"distinct_source_ip": distinct_source_ip.values.astype(int),
|
|
225
|
+
"distinct_event_name": distinct_event_name.values.astype(int),
|
|
226
|
+
"distinct_event_source": distinct_event_source.values.astype(int),
|
|
227
|
+
"read_ratio": read_ratio.values,
|
|
228
|
+
"action_entropy": action_entropy.values,
|
|
229
|
+
"distinct_aws_region": distinct_aws_region.values.astype(int),
|
|
230
|
+
"distinct_hours_active": distinct_hours.values.astype(int),
|
|
231
|
+
})
|
|
232
|
+
return out
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
# ── Signal 1: corpus rarity ───────────────────────────────────────────────────
|
|
236
|
+
|
|
237
|
+
def _compute_rarity(interactive_df: pd.DataFrame) -> dict[str, float]:
|
|
238
|
+
"""log10(N / count(event_name)) over interactive-lane events only.
|
|
239
|
+
|
|
240
|
+
Returns ``event_name → rarity``. Pure plain-odds — no domain opinion.
|
|
241
|
+
Higher = rarer action in this corpus. Returns ``{}`` on empty input.
|
|
242
|
+
"""
|
|
243
|
+
if interactive_df.empty or "event_name" not in interactive_df.columns:
|
|
244
|
+
return {}
|
|
245
|
+
counts = interactive_df["event_name"].dropna().value_counts()
|
|
246
|
+
n = int(counts.sum())
|
|
247
|
+
if n == 0:
|
|
248
|
+
return {}
|
|
249
|
+
rarities = np.log10(float(n)) - np.log10(counts.astype(float).values)
|
|
250
|
+
return {str(k): float(v) for k, v in zip(counts.index, rarities)}
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# ── Signal 2: behavioral weirdness composite ──────────────────────────────────
|
|
254
|
+
|
|
255
|
+
def _zscore(values: np.ndarray) -> np.ndarray:
|
|
256
|
+
"""Population z-score; degenerate (std == 0) populations collapse to zeros."""
|
|
257
|
+
if values.size == 0:
|
|
258
|
+
return values.astype(float)
|
|
259
|
+
mean = float(values.mean())
|
|
260
|
+
std = float(values.std())
|
|
261
|
+
if std == 0:
|
|
262
|
+
return np.zeros_like(values, dtype=float)
|
|
263
|
+
return (values.astype(float) - mean) / std
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def _compute_weirdness(scorable: pd.DataFrame) -> pd.DataFrame:
|
|
267
|
+
"""Add component z-scores and a composite to the scorable per-principal frame.
|
|
268
|
+
|
|
269
|
+
Heavy-tailed count features (distinct_source_ip, distinct_event_name) are
|
|
270
|
+
log1p-scaled before z-scoring. Ratios (error_rate) and bounded entropy
|
|
271
|
+
(action_entropy) are not log1p'd.
|
|
272
|
+
|
|
273
|
+
Caller is responsible for filtering to ``event_count >= min_events`` before
|
|
274
|
+
calling this; we trust that contract and do not re-filter.
|
|
275
|
+
"""
|
|
276
|
+
if scorable.empty:
|
|
277
|
+
return scorable
|
|
278
|
+
|
|
279
|
+
out = scorable.copy()
|
|
280
|
+
out["z_error_rate"] = _zscore(out["error_rate"].values)
|
|
281
|
+
out["z_distinct_source_ip"] = _zscore(
|
|
282
|
+
np.log1p(out["distinct_source_ip"].values.astype(float))
|
|
283
|
+
)
|
|
284
|
+
out["z_distinct_event_name"] = _zscore(
|
|
285
|
+
np.log1p(out["distinct_event_name"].values.astype(float))
|
|
286
|
+
)
|
|
287
|
+
out["z_action_entropy"] = _zscore(out["action_entropy"].values)
|
|
288
|
+
out["composite_z"] = (
|
|
289
|
+
out["z_error_rate"]
|
|
290
|
+
+ out["z_distinct_source_ip"]
|
|
291
|
+
+ out["z_distinct_event_name"]
|
|
292
|
+
+ out["z_action_entropy"]
|
|
293
|
+
)
|
|
294
|
+
return out
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# ── Signal 3: first-seen + burst aggregation ──────────────────────────────────
|
|
298
|
+
|
|
299
|
+
def _compute_bursts(
|
|
300
|
+
interactive_df: pd.DataFrame,
|
|
301
|
+
rarity: dict[str, float],
|
|
302
|
+
burst_gap_seconds: int,
|
|
303
|
+
burst_min_firsts: int,
|
|
304
|
+
) -> list[dict]:
|
|
305
|
+
"""Time-ordered pass yielding per-principal burst candidates.
|
|
306
|
+
|
|
307
|
+
For each principal, the VERY first event is skipped (all-new is
|
|
308
|
+
uninformative). Subsequent events whose event_name has not been seen for
|
|
309
|
+
this principal are recorded as first-seen records. Consecutive first-seen
|
|
310
|
+
records less than ``burst_gap_seconds`` apart form one burst; a closed
|
|
311
|
+
burst with at least ``burst_min_firsts`` records becomes a candidate.
|
|
312
|
+
|
|
313
|
+
"First seen" is first seen within this loaded window. loghunter is batch
|
|
314
|
+
and stateless — no cross-run persistence, no rolling baseline. The
|
|
315
|
+
limitation is disclosed via a RunSummary note assembled by the runner.
|
|
316
|
+
"""
|
|
317
|
+
needed = {"ts", "principal", "event_name"}
|
|
318
|
+
if interactive_df.empty or not needed.issubset(interactive_df.columns):
|
|
319
|
+
return []
|
|
320
|
+
|
|
321
|
+
sorted_df = interactive_df.sort_values("ts", kind="stable").reset_index(drop=True)
|
|
322
|
+
columns = sorted_df.columns
|
|
323
|
+
|
|
324
|
+
seen_actions: dict[str, set[str]] = {}
|
|
325
|
+
first_seen_records: dict[str, list[dict]] = {}
|
|
326
|
+
|
|
327
|
+
for row in sorted_df.itertuples(index=False):
|
|
328
|
+
principal = getattr(row, "principal", None)
|
|
329
|
+
event_name = getattr(row, "event_name", None)
|
|
330
|
+
ts = getattr(row, "ts", None)
|
|
331
|
+
if principal is None or event_name is None or ts is None or pd.isna(ts):
|
|
332
|
+
continue
|
|
333
|
+
|
|
334
|
+
if principal not in seen_actions:
|
|
335
|
+
# Very first event for this principal — skip and seed the seen set.
|
|
336
|
+
seen_actions[principal] = {event_name}
|
|
337
|
+
continue
|
|
338
|
+
|
|
339
|
+
if event_name in seen_actions[principal]:
|
|
340
|
+
continue
|
|
341
|
+
|
|
342
|
+
seen_actions[principal].add(event_name)
|
|
343
|
+
first_seen_records.setdefault(principal, []).append({
|
|
344
|
+
"ts": float(ts),
|
|
345
|
+
"event_name": str(event_name),
|
|
346
|
+
"rarity": rarity.get(str(event_name), 0.0),
|
|
347
|
+
"errored": (
|
|
348
|
+
bool(pd.notna(getattr(row, "error_code", None)))
|
|
349
|
+
if "error_code" in columns else False
|
|
350
|
+
),
|
|
351
|
+
"event_source": (
|
|
352
|
+
str(getattr(row, "event_source"))
|
|
353
|
+
if "event_source" in columns
|
|
354
|
+
and getattr(row, "event_source") is not None else ""
|
|
355
|
+
),
|
|
356
|
+
"source_ip": (
|
|
357
|
+
str(getattr(row, "source_ip"))
|
|
358
|
+
if "source_ip" in columns
|
|
359
|
+
and getattr(row, "source_ip") is not None else ""
|
|
360
|
+
),
|
|
361
|
+
"aws_region": (
|
|
362
|
+
str(getattr(row, "aws_region"))
|
|
363
|
+
if "aws_region" in columns
|
|
364
|
+
and getattr(row, "aws_region") is not None else ""
|
|
365
|
+
),
|
|
366
|
+
"event_id": (
|
|
367
|
+
str(getattr(row, "event_id"))
|
|
368
|
+
if "event_id" in columns
|
|
369
|
+
and getattr(row, "event_id") is not None else ""
|
|
370
|
+
),
|
|
371
|
+
})
|
|
372
|
+
|
|
373
|
+
bursts: list[dict] = []
|
|
374
|
+
for principal, records in first_seen_records.items():
|
|
375
|
+
current: list[dict] = []
|
|
376
|
+
for rec in records:
|
|
377
|
+
if not current:
|
|
378
|
+
current.append(rec)
|
|
379
|
+
continue
|
|
380
|
+
gap = rec["ts"] - current[-1]["ts"]
|
|
381
|
+
if gap < burst_gap_seconds:
|
|
382
|
+
current.append(rec)
|
|
383
|
+
else:
|
|
384
|
+
if len(current) >= burst_min_firsts:
|
|
385
|
+
bursts.append(_summarize_burst(principal, current))
|
|
386
|
+
current = [rec]
|
|
387
|
+
if len(current) >= burst_min_firsts:
|
|
388
|
+
bursts.append(_summarize_burst(principal, current))
|
|
389
|
+
|
|
390
|
+
return bursts
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def _summarize_burst(principal: str, records: list[dict]) -> dict:
|
|
394
|
+
"""Compute per-burst aggregates from a list of first-seen records."""
|
|
395
|
+
n = len(records)
|
|
396
|
+
start_ts = records[0]["ts"]
|
|
397
|
+
end_ts = records[-1]["ts"]
|
|
398
|
+
new_services = sorted({r["event_source"] for r in records if r["event_source"]})
|
|
399
|
+
source_ips = sorted({r["source_ip"] for r in records if r["source_ip"]})
|
|
400
|
+
aws_regions = sorted({r["aws_region"] for r in records if r["aws_region"]})
|
|
401
|
+
new_actions = [r["event_name"] for r in records]
|
|
402
|
+
event_ids = [r["event_id"] for r in records if r["event_id"]]
|
|
403
|
+
error_count = sum(1 for r in records if r["errored"])
|
|
404
|
+
error_rate = error_count / n if n else 0.0
|
|
405
|
+
mean_rarity = sum(r["rarity"] for r in records) / n if n else 0.0
|
|
406
|
+
return {
|
|
407
|
+
"principal": str(principal),
|
|
408
|
+
"start_time": datetime.fromtimestamp(start_ts, tz=timezone.utc).isoformat(),
|
|
409
|
+
"start_ts": start_ts,
|
|
410
|
+
"span_seconds": float(end_ts - start_ts),
|
|
411
|
+
"new_action_count": int(n),
|
|
412
|
+
"new_service_count": int(len(new_services)),
|
|
413
|
+
"new_actions": new_actions,
|
|
414
|
+
"new_services": new_services,
|
|
415
|
+
"source_ips": source_ips,
|
|
416
|
+
"aws_regions": aws_regions,
|
|
417
|
+
"sample_event_ids": event_ids[:10],
|
|
418
|
+
"error_rate": round(error_rate, 4),
|
|
419
|
+
"mean_rarity": round(mean_rarity, 4),
|
|
420
|
+
}
|
|
421
|
+
|
|
422
|
+
|
|
423
|
+
# ── Span / formatting helpers ─────────────────────────────────────────────────
|
|
424
|
+
|
|
425
|
+
def _span_str(seconds: float) -> str:
|
|
426
|
+
"""Compact span: 45s / 7m / 3h / 2d. Caller can also use the raw seconds."""
|
|
427
|
+
s = int(seconds)
|
|
428
|
+
if s < 60:
|
|
429
|
+
return f"{s}s"
|
|
430
|
+
if s < 3600:
|
|
431
|
+
return f"{s // 60}m"
|
|
432
|
+
if s < 86400:
|
|
433
|
+
return f"{s // 3600}h"
|
|
434
|
+
return f"{s // 86400}d"
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
# ── Finding constructors ──────────────────────────────────────────────────────
|
|
438
|
+
|
|
439
|
+
def _make_burst_finding(
    burst: dict,
    burst_high_error_rate: float,
    burst_high_service_count: int,
    now: datetime,
    data_window: tuple[datetime, datetime],
) -> Finding:
    """Turn one burst summary dict into a Finding.

    Severity is HIGH when the burst's error rate or its new-service count
    reaches the corresponding gate, otherwise MEDIUM. The evidence carries the
    burst's fields under a ``"tier": "burst"`` marker.
    """
    gates = (
        burst["error_rate"] >= burst_high_error_rate,
        burst["new_service_count"] >= burst_high_service_count,
    )
    severity = Severity.HIGH if any(gates) else Severity.MEDIUM

    span_text = _span_str(burst["span_seconds"])
    description = (
        f"{burst['new_action_count']} first-seen action(s) across "
        f"{burst['new_service_count']} service(s) in "
        f"{span_text} "
        f"({burst['error_rate']:.0%} errored). Pattern resembles an "
        "enumeration / recon sweep — recon, manual exploration, or a "
        "misconfigured first-time deploy."
    )

    next_steps = [
        f"Review CloudTrail events for principal {burst['principal']}",
        "Drill back via event IDs: " + ", ".join(burst["sample_event_ids"][:5]),
        "Verify source IPs are expected: " + ", ".join(burst["source_ips"][:5]),
        "Regions touched: " + ", ".join(burst["aws_regions"]),
        "Whois / threat-intel any non-AWS source IPs.",
    ]

    # Burst fields copied verbatim into the evidence, in output order.
    carried = (
        "principal", "start_time", "span_seconds",
        "new_action_count", "new_service_count",
        "error_rate", "mean_rarity",
        "new_actions", "new_services",
        "source_ips", "aws_regions", "sample_event_ids",
    )
    evidence: dict[str, Any] = {"tier": "burst"}
    evidence.update((key, burst[key]) for key in carried)

    return Finding(
        detector=DETECTOR_NAME,
        severity=severity,
        title=str(burst["principal"]),
        description=description,
        evidence=evidence,
        next_steps=next_steps,
        ts_generated=now,
        data_window=data_window,
    )
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def _make_ranked_finding(
    row: pd.Series,
    severity: Severity,
    interactive_df: pd.DataFrame,
    now: datetime,
    data_window: tuple[datetime, datetime],
) -> Finding:
    """Turn one scored principal row into a Finding.

    ``row`` supplies the composite, the component z-scores and the raw feature
    values. ``interactive_df`` is consulted only to pull pivots for that
    principal: its five most frequent actions, up to ten source IPs, its
    regions, and up to five event IDs from its first five rows. The caller
    decides the severity.
    """
    principal = row["principal"]
    events = interactive_df[interactive_df["principal"] == principal]
    present = events.columns

    def _sorted_strings(column: str) -> list:
        # Sorted distinct string values of a column; [] if the column is absent.
        if column not in present:
            return []
        return sorted(v for v in events[column].dropna().unique() if isinstance(v, str))

    def _rounded(value: Any) -> float:
        return round(float(value), 4)

    top_actions = []
    if "event_name" in present:
        top_actions = events["event_name"].value_counts().head(5).index.tolist()
    source_ips = _sorted_strings("source_ip")[:10]
    aws_regions = _sorted_strings("aws_region")
    sample_event_ids = []
    if "event_id" in present:
        sample_event_ids = [
            v for v in events["event_id"].head(5).tolist() if isinstance(v, str)
        ]

    description = (
        f"composite z-score {row['composite_z']:.2f} across error rate, "
        "distinct source IPs, distinct action names, and action entropy — this "
        "principal's behavioral fingerprint is unusual for the population."
    )
    next_steps = [
        f"Review CloudTrail events for principal {principal}",
        "Pivot on top actions: " + ", ".join(top_actions),
        "Whois / threat-intel any non-AWS source IPs: " + ", ".join(source_ips[:5]),
        "Drill back via event IDs: " + ", ".join(sample_event_ids),
    ]

    # (row field, converter) in evidence output order.
    numeric_fields = (
        ("composite_z", _rounded),
        ("z_error_rate", _rounded),
        ("z_distinct_source_ip", _rounded),
        ("z_distinct_event_name", _rounded),
        ("z_action_entropy", _rounded),
        ("event_count", int),
        ("error_rate", _rounded),
        ("distinct_source_ip", int),
        ("distinct_event_name", int),
        ("distinct_event_source", int),
        ("action_entropy", _rounded),
        ("read_ratio", _rounded),
        ("distinct_aws_region", int),
        ("distinct_hours_active", int),
    )
    evidence: dict[str, Any] = {"tier": "ranked", "principal": str(principal)}
    evidence.update((key, convert(row[key])) for key, convert in numeric_fields)
    evidence["top_actions"] = top_actions
    evidence["source_ips"] = source_ips
    evidence["aws_regions"] = aws_regions
    evidence["sample_event_ids"] = sample_event_ids

    return Finding(
        detector=DETECTOR_NAME,
        severity=severity,
        title=str(principal),
        description=description,
        evidence=evidence,
        next_steps=next_steps,
        ts_generated=now,
        data_window=data_window,
    )
|
|
566
|
+
|
|
567
|
+
|
|
568
|
+
def _make_ranked_summary_finding(
    scored: pd.DataFrame,
    now: datetime,
    data_window: tuple[datetime, datetime],
) -> Finding:
    """Build the single INFO Finding that reports a quiet ranked tier.

    Intended for the case where at least one interactive principal was
    scored but no per-principal MEDIUM/LOW Finding was produced. The Finding
    records how many principals were scored and names the one with the
    highest composite z-score (the "least-unremarkable" actor) so the analyst
    still has a pivot.

    Args:
        scored: Scored principals. Must be non-empty and provide the
            ``principal`` and ``composite_z`` columns.
        now: Timestamp stored as the Finding's ``ts_generated``.
        data_window: Window passed through unchanged as ``data_window``.

    Returns:
        One ``Severity.INFO`` Finding whose evidence tier is
        ``"ranked_summary"``.
    """
    # Highest composite z first; the first row is the principal to report.
    leader = scored.sort_values("composite_z", ascending=False).iloc[0]
    leader_name = leader["principal"]
    leader_z = float(leader["composite_z"])
    scored_total = len(scored)

    summary_text = (
        f"{scored_total} interactive principal(s) were scored; none cleared the "
        f"LOW band. Least-unremarkable actor: {leader_name} "
        f"(composite z = {leader_z:.2f})."
    )
    summary_evidence = {
        "tier": "ranked_summary",
        "scorable_count": scored_total,
        "top_principal": str(leader_name),
        "top_composite_z": round(leader_z, 4),
    }
    suggested_steps = [
        "No recommended action — nothing stood out.",
        "Lower composite_low_threshold in [detectors.aws] to widen the surface.",
    ]

    return Finding(
        detector=DETECTOR_NAME,
        severity=Severity.INFO,
        title="ranked tier: no principals cleared the LOW band",
        description=summary_text,
        evidence=summary_evidence,
        next_steps=suggested_steps,
        ts_generated=now,
        data_window=data_window,
    )
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
# ── Detector entry point ──────────────────────────────────────────────────────
|
|
605
|
+
|
|
606
|
+
def run(context: DetectorContext) -> list[Finding]:
    """Surface noteworthy CloudTrail principals: bursts first, then ranked weirdness.

    Reads thresholds from ``context.config`` (falling back to
    ``DEFAULT_CONFIG``), loads the ``*.json*`` log frame, keeps only the
    interactive events, and produces two groups of Findings:

    * Burst Findings, ordered by ``new_service_count`` and then
      ``new_action_count`` (both descending).
    * Ranked Findings, one per scored principal whose ``composite_z`` reaches
      the MEDIUM or LOW threshold, in descending score order. If principals
      were scored but none reached the LOW threshold, a single INFO summary
      Finding is emitted instead.

    Args:
        context: Detector context supplying ``config``, ``logs`` and
            ``data_window``.

    Returns:
        Burst Findings followed by ranked Findings. The list is empty when
        there is no log frame or no interactive events.
    """
    cfg = context.config

    def setting(key):
        # Per-run override when configured, otherwise the module default.
        return cfg.get(key, DEFAULT_CONFIG[key])

    min_events: int = setting("min_events")
    burst_gap: int = setting("burst_gap_seconds")
    burst_min_firsts: int = setting("burst_min_firsts")
    burst_high_err: float = setting("burst_high_error_rate")
    burst_high_svcs: int = setting("burst_high_service_count")
    medium_threshold: float = setting("composite_medium_threshold")
    low_threshold: float = setting("composite_low_threshold")

    df = context.logs.get("*.json*")
    if df is None or df.empty:
        return []

    interactive = _filter_interactive(df)
    if interactive.empty:
        return []

    # Only principals with at least `min_events` events are scored.
    per_principal = _aggregate_per_principal(interactive)
    has_enough_events = per_principal["event_count"] >= min_events
    scorable = per_principal[has_enough_events].copy()

    rarity = _compute_rarity(interactive)
    scored = _compute_weirdness(scorable)
    burst_dicts = _compute_bursts(interactive, rarity, burst_gap, burst_min_firsts)

    now = datetime.now(tz=timezone.utc)

    def burst_rank(finding):
        # Widest service spread first, then the most new actions.
        details = finding.evidence
        return (details["new_service_count"], details["new_action_count"])

    burst_findings = sorted(
        (
            _make_burst_finding(
                burst, burst_high_err, burst_high_svcs, now, context.data_window
            )
            for burst in burst_dicts
        ),
        key=burst_rank,
        reverse=True,
    )

    # Ranked tier: MEDIUM and LOW principals each get their own Finding.
    # Anything below the LOW threshold is not emitted per principal.
    ranked_findings: list[Finding] = []
    if not scored.empty:
        by_score = scored.sort_values("composite_z", ascending=False)
        for _, principal_row in by_score.iterrows():
            composite = float(principal_row["composite_z"])
            if composite >= medium_threshold:
                band = Severity.MEDIUM
            elif composite >= low_threshold:
                band = Severity.LOW
            else:
                continue
            ranked_findings.append(
                _make_ranked_finding(
                    principal_row, band, interactive, now, context.data_window
                )
            )

        # Principals were scored but none stood out: emit one INFO summary
        # so the analyst can see the tier actually ran.
        if not ranked_findings:
            ranked_findings.append(
                _make_ranked_summary_finding(by_score, now, context.data_window)
            )

    return [*burst_findings, *ranked_findings]
|