loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""SSL detector — TLS anomaly detection from Zeek ssl.log. (planned)
|
|
2
|
+
|
|
3
|
+
Flags self-signed certificates, weak cipher suites, unusual SNI patterns,
|
|
4
|
+
and certificate validity anomalies that may indicate malicious infrastructure.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from loghunter.common.finding import DetectorContext, Finding
|
|
10
|
+
|
|
11
|
+
# Identity and lifecycle marker for this placeholder detector module.
DETECTOR_NAME = "ssl"
STATUS = "planned"

# One required input: files under the zeek_dir source matching ssl*.log*.
REQUIRED_LOGS = [dict(source="zeek_dir", pattern="ssl*.log*")]

# No optional inputs and no tunables are declared yet.
OPTIONAL_LOGS: list[dict] = list()
DEFAULT_CONFIG: dict = dict()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def run(context: DetectorContext) -> list[Finding]:
    """Placeholder entry point for the planned TLS anomaly detector.

    The detector is not implemented yet; ``context`` is accepted but never
    inspected.

    Raises:
        NotImplementedError: always.
    """
    message = "ssl detector is planned — not yet implemented"
    raise NotImplementedError(message)
|
|
@@ -0,0 +1,266 @@
|
|
|
1
|
+
"""Syslog anomaly detector — drain3 templating + rarity scoring.
|
|
2
|
+
|
|
3
|
+
Pipeline:
|
|
4
|
+
1. drain3 log templating: assigns each message row a template_id and template_str
|
|
5
|
+
2. Rarity scoring: flags templates whose occurrence count falls at or below
|
|
6
|
+
min(percentile_threshold, max_count) as anomalous
|
|
7
|
+
3. Reboot detection: scans all rows for known reboot/shutdown signal patterns
|
|
8
|
+
4. Reboot suppression: anomalous events within reboot_suppress_window seconds after
|
|
9
|
+
a detected reboot on the same host are suppressed; one synthetic reboot annotation
|
|
10
|
+
is emitted per detected reboot in their place
|
|
11
|
+
5. Finding production: one Finding per anomalous event plus one per synthetic reboot
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections import defaultdict
|
|
17
|
+
from datetime import datetime, timezone
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
import pandas as pd
|
|
21
|
+
from tqdm import tqdm
|
|
22
|
+
|
|
23
|
+
from loghunter.common.finding import DetectorContext, Finding, MethodTag, Severity
|
|
24
|
+
|
|
25
|
+
DETECTOR_NAME = "syslog"
STATUS = "available"

# syslog is fidelity-aware: it accepts flat rsyslog files (syslog_dir/*.log*)
# and/or Zeek's own syslog.log (zeek_dir/syslog*.log*). At least one must be
# present; when both are, they are concatenated before drain3 runs.
# The detector is SOURCE-BLIND: it references only the minimal-5 columns
# (ts, host, program, raw, message). Zeek's extended facility / severity ride
# along on the frame but are NEVER read here; the digest consumes them.
# Mirrors the dns detector's Zeek + Pi-hole shape.
REQUIRED_LOGS: list[dict] = list()

OPTIONAL_LOGS = [
    dict(source="syslog_dir", pattern="*.log*"),
    dict(source="zeek_dir", pattern="syslog*.log*"),
]

# Neither optional source is individually required, but at least one must exist.
REQUIRES_ONE_OF_OPTIONAL = True
REQUIRES_ONE_OF_OPTIONAL_REASON = (
    "syslog — no syslog source found (need syslog_dir files or zeek_dir syslog.log)"
)

# drain3 tuning defaults.
DRAIN_SIM_THRESH = 0.5
DRAIN_DEPTH = 4
DRAIN_PARAMETRIZE_NUMERIC = True

# Seconds after a detected reboot during which anomalies are suppressed.
REBOOT_SUPPRESS_WINDOW = 300

# Per-detector defaults; run() falls back to these for any key missing from
# the supplied config.
DEFAULT_CONFIG = dict(
    lookback_days=7,
    rarity_pct=10,
    max_count=1,
    sim_thresh=DRAIN_SIM_THRESH,
    depth=DRAIN_DEPTH,
    parametrize_numeric=DRAIN_PARAMETRIZE_NUMERIC,
    reboot_suppress_window=REBOOT_SUPPRESS_WINDOW,
)
|
|
61
|
+
|
|
62
|
+
DETECTOR_METHOD = MethodTag("drain3", named=True)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def run(context: DetectorContext) -> list[Finding]:
    """Run the syslog pipeline and return findings ordered by event time.

    Reads the frames registered under the ``"*.log*"`` and ``"syslog*.log*"``
    keys of ``context.logs``, concatenates whichever are present and
    non-empty, then applies drain3 templating, rarity scoring, reboot
    detection and reboot suppression. Tunables come from ``context.config``
    with ``DEFAULT_CONFIG`` as the fallback.

    Returns:
        One MEDIUM finding per kept anomalous row plus one INFO finding per
        synthetic reboot record, sorted ascending by timestamp (rows without
        a timestamp sort last). An empty list when no usable frame exists.
    """
    candidates = [context.logs.get(pattern) for pattern in ("*.log*", "syslog*.log*")]
    usable = [frame for frame in candidates if frame is not None and not frame.empty]
    if not usable:
        return []
    if len(usable) == 1:
        df = usable[0]
    else:
        df = pd.concat(usable, ignore_index=True)

    cfg = context.config

    def option(key: str) -> Any:
        # Config value for ``key``, falling back to the module default.
        return cfg.get(key, DEFAULT_CONFIG[key])

    sim_thresh = option("sim_thresh")
    depth = option("depth")
    parametrize = option("parametrize_numeric")
    rarity_pct = option("rarity_pct")
    max_count = option("max_count")
    suppress_window = option("reboot_suppress_window")

    df = _run_drain3(df, sim_thresh, depth, parametrize)
    df, threshold, freq = _score_rarity(df, rarity_pct, max_count)
    reboots = _detect_reboots(df)
    anomalies = df[df["is_anomaly"]].copy()
    kept_df, synthetic_records = _apply_suppression(anomalies, reboots, suppress_window)

    generated_at = datetime.now(timezone.utc)
    keyed: list[tuple[float, Finding]] = []

    # One finding per anomalous row that survived reboot suppression.
    for row in kept_df.itertuples():
        sort_key = float("inf") if pd.isna(row.ts) else float(row.ts)
        template_id = int(row.template_id)
        finding = Finding(
            detector=DETECTOR_NAME,
            severity=Severity.MEDIUM,
            title=str(row.raw)[:180],
            description="Rare log template observed at or below rarity threshold",
            evidence={
                "host": row.host,
                "template_id": template_id,
                "template_str": row.template_str,
                "count": int(freq[template_id]),
                "threshold": int(threshold),
            },
            next_steps=[
                "Review surrounding log context for this host",
                "Check if template appears in recent incidents",
            ],
            ts_generated=generated_at,
            data_window=context.data_window,
        )
        keyed.append((sort_key, finding))

    # One informational finding per synthetic reboot annotation.
    for record in synthetic_records:
        reboot_ts = record["ts"]
        if reboot_ts is not None:
            sort_key = reboot_ts.timestamp()
            reboot_iso = reboot_ts.isoformat()
        else:
            sort_key = float("inf")
            reboot_iso = None
        finding = Finding(
            detector=DETECTOR_NAME,
            severity=Severity.INFO,
            title=record["raw"],
            description="Reboot detected — anomalous events within suppression window are excluded",
            evidence={
                "host": record["host"],
                "reboot_ts": reboot_iso,
                "suppressed_window_seconds": suppress_window,
            },
            next_steps=["Review system logs around the reboot time for pre-reboot anomalies"],
            ts_generated=generated_at,
            data_window=context.data_window,
        )
        keyed.append((sort_key, finding))

    # Stable sort: entries with equal timestamps keep their insertion order.
    ordered = sorted(keyed, key=lambda pair: pair[0])
    return [finding for _, finding in ordered]
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def _run_drain3(
    df: pd.DataFrame,
    sim_thresh: float,
    depth: int,
    parametrize_numeric: bool,
) -> pd.DataFrame:
    """Add template_id and template_str columns via drain3 log templating.

    Args:
        df: frame with a ``message`` column; each value is stringified and fed
            to the miner in row order.
        sim_thresh: assigned to the miner config's ``drain_sim_th``.
        depth: assigned to the miner config's ``drain_depth``.
        parametrize_numeric: assigned to ``parametrize_numeric_tokens``.

    Returns:
        A copy of ``df`` with ``template_id`` (the miner's ``cluster_id``) and
        ``template_str`` (the ``template_mined`` value reported when that row
        was added) appended. The input frame is not modified.

    Raises:
        ImportError: when drain3 is not installed. The original import
            failure is chained as ``__cause__`` so the underlying reason is
            not lost.
    """
    try:
        from drain3 import TemplateMiner
        from drain3.template_miner_config import TemplateMinerConfig
    except ImportError as exc:
        raise ImportError(
            "drain3 is required for syslog detection. Run: pip install drain3"
        ) from exc

    cfg = TemplateMinerConfig()
    cfg.drain_sim_th = sim_thresh
    cfg.drain_depth = depth
    cfg.parametrize_numeric_tokens = parametrize_numeric

    miner = TemplateMiner(config=cfg)
    template_ids: list[int] = []
    template_strs: list[str] = []

    # leave=True + clean bar_format makes this the liveness narration for the
    # syslog detector phase (the runner deliberately skips its outer spinner
    # for syslog so the two writers don't fight for the same stderr line).
    for msg in tqdm(
        df["message"],
        desc="syslog: mining templates",
        unit=" lines",
        unit_scale=True,
        leave=True,
        bar_format="{desc}: {n_fmt} lines [{elapsed}]",
    ):
        result = miner.add_log_message(str(msg))
        template_ids.append(result["cluster_id"])
        template_strs.append(result["template_mined"])

    df = df.copy()
    df["template_id"] = template_ids
    df["template_str"] = template_strs
    return df
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _score_rarity(
    df: pd.DataFrame,
    rarity_pct: int,
    max_count: int,
) -> tuple[pd.DataFrame, int, dict[int, int]]:
    """Add is_anomaly column; return (df, effective_threshold, freq_dict).

    A template is rare when its occurrence count is at or below
    ``min(percentile_threshold, max_count)``. ``percentile_threshold`` is the
    count found ``rarity_pct`` percent of the way through the ascending list
    of per-template counts.

    Args:
        df: frame with a ``template_id`` column.
        rarity_pct: percentile (0-100) used to pick the count threshold.
            Values outside that range are clamped to the first / last count
            instead of raising ``IndexError``.
        max_count: hard upper bound on the threshold.

    Returns:
        ``(copy_of_df_with_is_anomaly, threshold, {template_id: count})``.
        For a frame with no templates the result is an empty boolean
        ``is_anomaly`` column, threshold ``0`` and an empty dict.
    """
    df = df.copy()
    freq: dict[int, int] = {
        int(k): int(v) for k, v in df["template_id"].value_counts().items()
    }

    if not freq:
        # Nothing to rank: flag nothing rather than indexing an empty list.
        df["is_anomaly"] = pd.Series(False, index=df.index, dtype=bool)
        return df, 0, freq

    sorted_counts = sorted(freq.values())
    # Clamp both ends so an out-of-range rarity_pct cannot index past the list.
    idx = int(len(sorted_counts) * rarity_pct / 100) - 1
    idx = min(max(idx, 0), len(sorted_counts) - 1)
    pct_threshold = sorted_counts[idx]
    threshold = min(pct_threshold, max_count)

    rare_ids = {tid for tid, count in freq.items() if count <= threshold}

    df["is_anomaly"] = df["template_id"].map(lambda tid: int(tid) in rare_ids)
    return df, threshold, freq
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def _detect_reboots(df: pd.DataFrame) -> dict[str, list[datetime]]:
    """Collect reboot times per host from every row of ``df``.

    A row counts as a reboot when it has a timestamp and
    ``is_reboot_signal`` accepts its stringified ``raw`` value. ``ts`` is
    converted with ``datetime.fromtimestamp(..., tz=timezone.utc)``.

    Returns:
        ``{host: [reboot datetimes, ascending]}``; hosts without a reboot
        are absent.
    """
    from loghunter.parsers.syslog import is_reboot_signal

    per_host: dict[str, list[datetime]] = defaultdict(list)

    for record in df.itertuples():
        # Rows without a timestamp are skipped before the signal check runs.
        if pd.isna(record.ts) or not is_reboot_signal(str(record.raw)):
            continue
        moment = datetime.fromtimestamp(float(record.ts), tz=timezone.utc)
        per_host[record.host].append(moment)

    return {host: sorted(moments) for host, moments in per_host.items()}
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def _apply_suppression(
    anomaly_df: pd.DataFrame,
    reboots: dict[str, list[datetime]],
    suppress_window: int,
) -> tuple[pd.DataFrame, list[dict[str, Any]]]:
    """Drop anomalies that closely follow a reboot on the same host.

    An anomalous row is suppressed when its timestamp falls between 0 and
    ``suppress_window`` seconds (inclusive) after one of its host's reboots.
    The first such reboot in the host's list is the one credited. Each
    credited reboot yields exactly one synthetic record, however many rows it
    suppresses. Rows with no timestamp, or whose host has no reboots, are
    always kept.

    Returns (kept_df, synthetic_reboot_records).
    """
    survivors: list[Any] = []
    synthetic_records: list[dict[str, Any]] = []
    announced: set[tuple[str, datetime]] = set()

    for row in anomaly_df.itertuples():
        if pd.isna(row.ts) or row.host not in reboots:
            survivors.append(row.Index)
            continue

        event_dt = datetime.fromtimestamp(float(row.ts), tz=timezone.utc)
        # First reboot whose suppression window covers this event, if any.
        covering = next(
            (
                reboot_dt
                for reboot_dt in reboots[row.host]
                if 0 <= (event_dt - reboot_dt).total_seconds() <= suppress_window
            ),
            None,
        )
        if covering is None:
            survivors.append(row.Index)
            continue

        marker = (row.host, covering)
        if marker in announced:
            continue
        announced.add(marker)
        stamp = covering.strftime('%a %b %d %H:%M:%S')
        synthetic_records.append({
            "ts": covering,
            "host": row.host,
            "raw": f"*** {row.host} rebooted at {stamp} ***",
            "synthetic": True,
        })

    return anomaly_df.loc[survivors], synthetic_records
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Weird detector — signals from Zeek weird.log and notice.log. (planned)
|
|
2
|
+
|
|
3
|
+
Surfaces Zeek's own anomaly signals: protocol violations, connection state issues,
|
|
4
|
+
and notice events that indicate suspicious or malformed network behavior.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from loghunter.common.finding import DetectorContext, Finding
|
|
10
|
+
|
|
11
|
+
# Identity and lifecycle marker for this placeholder detector module.
DETECTOR_NAME = "weird"
STATUS = "planned"

# weird*.log* under zeek_dir is required; notice*.log* is optional.
REQUIRED_LOGS = [dict(source="zeek_dir", pattern="weird*.log*")]
OPTIONAL_LOGS = [dict(source="zeek_dir", pattern="notice*.log*")]

# No tunables are declared yet.
DEFAULT_CONFIG: dict = dict()
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def run(context: DetectorContext) -> list[Finding]:
    """Placeholder entry point for the planned Zeek weird/notice detector.

    The detector is not implemented yet; ``context`` is accepted but never
    inspected.

    Raises:
        NotImplementedError: always.
    """
    message = "weird detector is planned — not yet implemented"
    raise NotImplementedError(message)
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""digest verb — orient-before-the-hunt.
|
|
2
|
+
|
|
3
|
+
A digest characterises the dominant shape of a log pile and states facts
|
|
4
|
+
about it. It is a peer verb to detect, not a detector. Digest never produces
|
|
5
|
+
a Finding and never reaches a verdict.
|
|
6
|
+
|
|
7
|
+
One summariser module per schema. Each summariser is a function
|
|
8
|
+
``summarize(frame) -> dict`` returning the schema-specific body of a
|
|
9
|
+
DigestCard. The dispatcher below imports the right module by schema name; to
|
|
10
|
+
add a new schema, drop a new module beside conn.py and nothing else changes.
|
|
11
|
+
|
|
12
|
+
Architectural rail: digest consumes the loaded frame BEFORE the allowlist
|
|
13
|
+
filtering seam. Allowlisted infrastructure (resolvers, pollers) is part of
|
|
14
|
+
what's in the pile and stays on the sonar. The digest call graph must not
|
|
15
|
+
touch build_matcher or AllowlistMatcher.filter_df.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from importlib import import_module
|
|
21
|
+
from typing import Callable
|
|
22
|
+
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_summarizer(schema: str) -> Callable[..., dict]:
    """Return the summarize() function for a given digest schema.

    The dispatcher returns the bare callable; per-schema signatures may
    differ. Today: ``conn`` and ``cloudtrail`` take ``(frame)``;
    ``dns`` takes ``(frame, feed)`` where feed is ``"zeek"`` or
    ``"pihole"``; ``syslog`` takes ``(frame, feed)`` where feed is
    ``"zeek"`` or ``"syslog"`` (flat rsyslog). Callers (currently only
    ``run_digest``) know how to invoke the right signature per schema.

    Args:
        schema: module name under ``loghunter.digest`` to dispatch to.

    Returns:
        The ``summarize`` callable exported by that module.

    Raises:
        ValueError: when no summariser exists for the requested schema —
            either the module cannot be found or it exposes no callable
            ``summarize``.
        ModuleNotFoundError: when the schema module exists but one of its
            own imports is missing. That is an installation problem, not an
            unknown schema, so it is re-raised unchanged.
    """
    target = f"loghunter.digest.{schema}"
    try:
        module = import_module(target)
    except ModuleNotFoundError as exc:
        missing = exc.name
        # Only a failure to find the target itself (or one of its parent
        # packages) means "unknown schema"; anything else is a broken
        # dependency inside an existing summariser.
        if (
            missing is not None
            and missing != target
            and not target.startswith(f"{missing}.")
        ):
            raise
        raise ValueError(f"digest: no summarizer for schema {schema!r}") from exc

    summarize = getattr(module, "summarize", None)
    if not callable(summarize):
        # e.g. a helper module living beside the summarisers.
        raise ValueError(f"digest: no summarizer for schema {schema!r}")
    return summarize
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
"""Shared digest stats — the home for primitives that more than one card needs.
|
|
2
|
+
|
|
3
|
+
Hosts:
|
|
4
|
+
- ``_rate`` / ``RATE_FLOOR`` — fraction-of-events-matching-a-kind statistic
|
|
5
|
+
with top-contributor attribution
|
|
6
|
+
- ``_share`` / ``SHARE_GATE`` — concentration-against-total statistic with no
|
|
7
|
+
population floor
|
|
8
|
+
- ``select_insights_and_fields`` — shared selection helper that promotes the
|
|
9
|
+
top-N speaking gated slots to insights and
|
|
10
|
+
returns the leftover slots as fields
|
|
11
|
+
|
|
12
|
+
Cliff machinery (``_cliff``, ``CLIFF_GATE``, ``CLIFF_DISPLAY_CAP``,
|
|
13
|
+
``POPULATION_FLOOR``, ``_format_ratio_cell``, ``_format_ratio_lede``) lives in
|
|
14
|
+
``loghunter.digest.conn`` and stays there — that is the established shared
|
|
15
|
+
origin and every card already imports cliff helpers from it.
|
|
16
|
+
|
|
17
|
+
The trigger for factoring into this module is "three identical real uses"
|
|
18
|
+
(``_rate``, now imported by dns + syslog + cloudtrail) or "shared by the new
|
|
19
|
+
statistic by design" (``_share`` introduced by the cloudtrail source-ip slot
|
|
20
|
+
and reusable by any future concentration-against-total slot). Tail
|
|
21
|
+
(``_tail`` / ``TAIL_GATE``) stays local to dns.py — one-use primitives do not
|
|
22
|
+
belong here yet.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
from __future__ import annotations
|
|
26
|
+
|
|
27
|
+
from typing import Any, Callable
|
|
28
|
+
|
|
29
|
+
import pandas as pd
|
|
30
|
+
|
|
31
|
+
from loghunter.common.finding import DigestSlot
|
|
32
|
+
from loghunter.digest.conn import POPULATION_FLOOR
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
# ── Calibration constants ───────────────────────────────────────────────────
|
|
36
|
+
|
|
37
|
+
# Fractions below this make rate slots dash. It is a pure presence floor,
# NOT a badness threshold — it means the same thing on every network.
# Calibratable here.
RATE_FLOOR = 0.01

# A top-share at or above this makes share slots speak. The share statistic
# exists to surface concentration, and this gate is the concentration
# threshold. There is no paired population floor — see ``_share`` below.
SHARE_GATE = 0.80
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ── Rate statistic ──────────────────────────────────────────────────────────
|
|
48
|
+
#
|
|
49
|
+
# Behavior must NOT change from the previous in-card definitions — that is the
|
|
50
|
+
# proof-of-correctness for the factoring. Identical body to the three card
|
|
51
|
+
# copies it replaces.
|
|
52
|
+
|
|
53
|
+
def _rate(kind_mask: pd.Series, contributor_series: pd.Series) -> tuple | None:
    """Rate statistic: the share of events flagged by ``kind_mask``.

    Returns ``(fraction, top_contributor)`` when the slot speaks, where
    ``top_contributor`` is the stringified most frequent non-null value of
    ``contributor_series`` among the flagged rows. Returns None (dash) when:

    - the population is smaller than ``POPULATION_FLOOR``;
    - no row is flagged, or the flagged fraction is below ``RATE_FLOOR``;
    - every flagged row has a null contributor.

    The floor is a presence bar only, never a badness judgment.
    """
    population = len(kind_mask)
    if population < POPULATION_FLOOR:
        return None

    hits = int(kind_mask.sum())
    if hits == 0:
        return None

    fraction = hits / population
    if fraction < RATE_FLOOR:
        return None

    contributors = contributor_series[kind_mask].dropna()
    if contributors.empty:
        return None

    leader = contributors.value_counts().idxmax()
    return fraction, str(leader)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# ── Share statistic ─────────────────────────────────────────────────────────
|
|
78
|
+
|
|
79
|
+
def _share(sorted_counts: pd.Series, total: int) -> tuple[Any, float] | None:
    """Share statistic: does the rank-1 entity dominate the total?

    Returns ``(rank1_entity, top_share)`` when the slot speaks, None when it
    dashes. The only gate is ``top_share < SHARE_GATE``. There is
    deliberately NO population floor, because low entity cardinality is the
    signal this statistic exists to surface: a single distinct value at 100%
    must speak, and so must one entity at 99% out of two. A population floor
    would hide exactly that shape.

    ``sorted_counts`` must already be in descending order (the shape
    ``value_counts`` produces). ``total`` is the denominator chosen by the
    caller rather than a sum derived here, so the share is measured against
    the population the caller meant.

    Defensive returns:
      - non-positive total or empty series → None
      - NaN rank-1 count → None
    """
    if total <= 0 or len(sorted_counts) == 0:
        return None

    leader_count = sorted_counts.iloc[0]
    if pd.isna(leader_count):
        return None

    top_share = float(leader_count) / float(total)
    if top_share < SHARE_GATE:
        return None

    leader = sorted_counts.index[0]
    return leader, top_share
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
# ── Insight selection ───────────────────────────────────────────────────────
|
|
112
|
+
#
|
|
113
|
+
# Shared by all four schema summarisers. Identical mechanic across cards:
|
|
114
|
+
# filter speaking gated slots, sort by per-statistic salience, take top-3,
|
|
115
|
+
# format via the per-card formatter map → those become insights. Everything
|
|
116
|
+
# else with cells goes to fields. A promoted slot is suppressed from fields.
|
|
117
|
+
|
|
118
|
+
# At most this many speaking gated slots are considered for promotion.
_INSIGHT_TOP_N = 3
# Statistics whose slots are eligible to become insights.
_GATING_STATISTICS = frozenset(("cliff", "tail", "rate", "share"))
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _salience(slot: DigestSlot) -> float:
    """Score a slot so different statistics can be ranked together.

    - cliff / tail: the slot's ``ratio`` as-is.
    - rate: ``magnitude`` holds a percentage (e.g. 1.0 for 1%); dividing by
      ``RATE_FLOOR * 100`` makes a rate sitting exactly on the floor score
      1.0, comparable to a 1x cliff.
    - share: ``magnitude`` holds a 0–100 percentage and is used raw, so a
      heavily concentrated source outranks typical cliff ratios.
    - any other statistic: 0.0.

    A missing (falsy) ratio or magnitude counts as 0.0.
    """
    kind = slot.statistic
    if kind in ("cliff", "tail"):
        return slot.ratio or 0.0
    if kind == "rate":
        percentage = slot.magnitude or 0.0
        return percentage / (RATE_FLOOR * 100.0)
    if kind == "share":
        return slot.magnitude or 0.0
    return 0.0
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def select_insights_and_fields(
    slots: list[DigestSlot],
    formatters: dict[str, Callable[[DigestSlot], str]],
) -> tuple[list[str], list[DigestSlot]]:
    """Split a card's slots into insight strings and leftover field slots.

    Speaking slots (``cells`` is not None) whose statistic is in the gating
    set are ranked by salience, highest first. The top ``_INSIGHT_TOP_N`` are
    candidates for promotion. A candidate becomes an insight only if
    ``formatters`` has an entry for its label.

    Suppression rule: a slot leaves ``fields`` ONLY when it actually produced
    an insight. A top-ranked gating slot with no formatter therefore stays in
    fields rather than vanishing, so each fact appears exactly once.

    Slots with a non-gating statistic never become insights and pass through
    to fields when they have cells. Non-speaking slots appear in neither
    output. Suppression is by label, which relies on labels being unique
    within a card.

    Returns:
        ``(insights, fields)``: insights in salience order, fields in the
        original slot order.
    """
    ranked = sorted(
        (
            slot
            for slot in slots
            if slot.cells is not None and slot.statistic in _GATING_STATISTICS
        ),
        key=_salience,
        reverse=True,
    )

    insights: list[str] = []
    promoted_labels: set[str] = set()
    for slot in ranked[:_INSIGHT_TOP_N]:
        render = formatters.get(slot.label)
        if render is not None:
            insights.append(render(slot))
            promoted_labels.add(slot.label)

    fields = [
        slot
        for slot in slots
        if not (slot.cells is None or slot.label in promoted_labels)
    ]
    return insights, fields
|