loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
loghunter/digest/dns.py
ADDED
|
@@ -0,0 +1,364 @@
|
|
|
1
|
+
"""dns summariser — orient-before-the-hunt for DNS data.
|
|
2
|
+
|
|
3
|
+
The first fidelity-aware digest card: a slot set that depends on which DNS
|
|
4
|
+
feed was loaded. Four slots are shared (cliff/cliff/tail/dist over columns
|
|
5
|
+
present on both feeds); two are feed-specific:
|
|
6
|
+
|
|
7
|
+
- nxdomain-rate (rcode-based) — Zeek only; non-speaking on Pi-hole
|
|
8
|
+
- block-rate (event_type-based) — Pi-hole only; non-speaking on Zeek
|
|
9
|
+
|
|
10
|
+
A feed-uncomputable slot returns a non-speaking ``DigestSlot`` (cells=None);
|
|
11
|
+
``select_insights_and_fields`` filters it out of ``fields`` and the slot
|
|
12
|
+
simply vanishes from the rendered card. No ABSENT marker, no footer text.
|
|
13
|
+
|
|
14
|
+
Cliff machinery imported from conn so the two cards cannot drift on gate /
|
|
15
|
+
floor / display-cap behaviour. The rate statistic — and its RATE_FLOOR
|
|
16
|
+
constant — live in ``loghunter.digest._stats`` (factored once three cards
|
|
17
|
+
needed an identical copy: this one, syslog, and cloudtrail). Two more
|
|
18
|
+
statistics computed locally:
|
|
19
|
+
|
|
20
|
+
- tail: max/median ratio over a distribution, with an owner attribution
|
|
21
|
+
- dist: top-3 share-of-mix; orientation only, never produces an insight
|
|
22
|
+
|
|
23
|
+
A row is "blocked" on Pi-hole iff event_type ∈ {gravity_blocked,
|
|
24
|
+
regex_blocked} — digest computes this locally; the detector is not imported.
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
import pandas as pd
|
|
30
|
+
|
|
31
|
+
from loghunter.common.finding import DigestSlot
|
|
32
|
+
from loghunter.digest._stats import RATE_FLOOR, _rate
|
|
33
|
+
from loghunter.digest.conn import (
|
|
34
|
+
CLIFF_DISPLAY_CAP, # noqa: F401 — re-exported for downstream symmetry
|
|
35
|
+
CLIFF_GATE, # noqa: F401 — re-exported for downstream symmetry
|
|
36
|
+
POPULATION_FLOOR,
|
|
37
|
+
_cliff,
|
|
38
|
+
_format_ratio_cell,
|
|
39
|
+
_format_ratio_lede,
|
|
40
|
+
)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ── Calibration constants — provisional, tunable in one place ───────────────
|
|
44
|
+
|
|
45
|
+
# Gate for the tail statistic: a max/median ratio below this value leaves the
# query-length slot non-speaking.
TAIL_GATE = 3.0

# Zeek reports qtype as a numeric type code. The common codes are translated
# to their mnemonics for display; anything not listed here is rendered as
# "TYPE<n>" so an analyst still has a breadcrumb to look up.
_ZEEK_QTYPE_MNEMONICS = {
    1: "A",
    2: "NS",
    5: "CNAME",
    6: "SOA",
    12: "PTR",
    15: "MX",
    16: "TXT",
    28: "AAAA",
    33: "SRV",
    65: "HTTPS",
    257: "CAA",
}

# Pi-hole event types that count as a "blocked" query.
_BLOCK_EVENT_TYPES = frozenset(("gravity_blocked", "regex_blocked"))
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ── tail statistic ──────────────────────────────────────────────────────────
|
|
59
|
+
|
|
60
|
+
def _tail(values: pd.Series, owner_series: pd.Series) -> tuple | None:
    """Measure how far the largest value sits from the median.

    Args:
        values: Numeric observations; NaN entries are ignored.
        owner_series: Series indexed like ``values``; the entry at the index
            label of the maximum is reported as its owner.

    Returns:
        ``(max_val, ratio, owner)`` — the maximum as an ``int``, the
        max/median ratio as a ``float`` and the owner as a ``str`` — or
        ``None`` when the statistic is dashed: fewer than POPULATION_FLOOR
        non-null values, a zero/NaN median or maximum, a ratio below
        TAIL_GATE, or an owner that is missing or NaN.
    """
    population = values.dropna()
    if len(population) < POPULATION_FLOOR:
        return None

    middle = population.median()
    if pd.isna(middle) or middle == 0:
        return None

    peak = population.max()
    if pd.isna(peak) or peak == 0:
        return None

    spread = float(peak) / float(middle)
    if spread < TAIL_GATE:
        return None

    # Attribute the extreme to whoever sits at the same index label.
    peak_label = population.idxmax()
    try:
        holder = owner_series.loc[peak_label]
    except (KeyError, ValueError):
        return None

    return None if pd.isna(holder) else (int(peak), spread, str(holder))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# ── dist statistic — qtype-mix, always shows ────────────────────────────────
|
|
93
|
+
|
|
94
|
+
def _qtype_label(value: object, feed: str) -> str | None:
    """Turn one raw qtype value into the string shown on the card.

    For the ``"pihole"`` feed the value is used as text. For any other feed
    the value is first read as an integer code and looked up in
    ``_ZEEK_QTYPE_MNEMONICS`` (unmapped codes become ``"TYPE<n>"``); values
    that cannot be read as an integer fall back to their text form.

    Returns:
        The label, or ``None`` for NaN input or text that is empty after
        stripping (the caller filters those out).
    """
    if pd.isna(value):
        return None

    if feed != "pihole":
        try:
            code = int(value)
        except (TypeError, ValueError):
            pass  # not a numeric code — use the textual fallback below
        else:
            return _ZEEK_QTYPE_MNEMONICS.get(code, f"TYPE{code}")

    text = str(value).strip()
    return text or None
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _qtype_dist(qtypes: pd.Series | None, feed: str) -> str:
    """Build the share-of-mix string for the qtype-mix slot.

    Args:
        qtypes: The raw qtype column, or ``None`` when the frame has no such
            column.
        feed: Feed name, forwarded to ``_qtype_label``.

    Returns:
        ``"(no qtype)"`` when the column is missing, ``"(no queries)"`` when
        no value yields a label, otherwise up to three ``"<label> <pct>%"``
        parts joined by ``" · "`` — e.g. ``"A 100%"`` or
        ``"A 82% · AAAA 11% · HTTPS 4%"``. Percentages are shares of all
        labelled rows, not only of the rows shown.
    """
    if qtypes is None:
        return "(no qtype)"

    named = qtypes.map(lambda raw: _qtype_label(raw, feed)).dropna()
    if named.empty:
        return "(no queries)"

    tally = named.value_counts()
    grand_total = int(tally.sum())
    return " · ".join(
        f"{name} {int(round(hits / grand_total * 100))}%"
        for name, hits in tally.head(3).items()
    )
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# ── Slot computers ──────────────────────────────────────────────────────────
|
|
138
|
+
|
|
139
|
+
def _slot_client_volume(frame: pd.DataFrame) -> DigestSlot:
    """Compute the client-volume slot: a cliff over per-``src`` row counts.

    The slot is non-speaking (label and statistic only) when the frame is
    empty, has no ``src`` column, or ``_cliff`` returns ``None``. Otherwise
    the cells are ``[client, share-of-rows %, ratio]`` and ``magnitude``
    carries the share as a percentage of all rows in the frame.
    """
    label = "client-volume"

    cliff = None
    if not frame.empty and "src" in frame.columns:
        per_client = frame["src"].value_counts(dropna=True).sort_values(ascending=False)
        cliff = _cliff(per_client)
    if cliff is None:
        return DigestSlot(label=label, statistic="cliff")

    top_client, top_count, ratio = cliff
    row_count = len(frame)
    share_pct = top_count / row_count * 100.0 if row_count > 0 else 0.0
    client_name = str(top_client)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[client_name, f"{share_pct:.0f}%", _format_ratio_cell(ratio)],
        entity=client_name,
        magnitude=share_pct,
        ratio=ratio,
    )
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def _slot_domain_volume(frame: pd.DataFrame) -> DigestSlot:
    """Compute the domain-volume slot: a cliff over per-``query`` row counts.

    The slot is non-speaking when the frame is empty, has no ``query``
    column, or ``_cliff`` returns ``None``. Otherwise the cells are
    ``[domain, count, ratio]`` and ``magnitude`` is the raw count reported
    by ``_cliff``.
    """
    label = "domain-volume"

    cliff = None
    if not frame.empty and "query" in frame.columns:
        per_domain = frame["query"].value_counts(dropna=True).sort_values(ascending=False)
        cliff = _cliff(per_domain)
    if cliff is None:
        return DigestSlot(label=label, statistic="cliff")

    top_domain, hits, ratio = cliff
    domain_name = str(top_domain)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[domain_name, str(int(hits)), _format_ratio_cell(ratio)],
        entity=domain_name,
        magnitude=hits,
        ratio=ratio,
    )
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _slot_query_length(frame: pd.DataFrame) -> DigestSlot:
    """Compute the query-length slot: a tail over query character lengths.

    Non-null ``query`` values are converted to text and measured; the ``src``
    value on the row of the longest query is reported as the owner. The slot
    is non-speaking when the frame is empty, lacks ``query`` or ``src``, has
    no non-null queries, or ``_tail`` returns ``None``.

    Cell order is ``[maxlen, ratio, owner]``: the table row leads with the
    magnitude, whereas the lede leads with the owner.
    """
    label = "query-length"

    outcome = None
    if not frame.empty and "query" in frame.columns and "src" in frame.columns:
        names = frame["query"].dropna().astype(str)
        if not names.empty:
            # Owners are selected by the surviving index labels so both
            # series share an index, as _tail requires.
            outcome = _tail(names.str.len(), frame.loc[names.index, "src"])
    if outcome is None:
        return DigestSlot(label=label, statistic="tail")

    longest, ratio, owner = outcome
    return DigestSlot(
        label=label,
        statistic="tail",
        cells=[f"{longest} chars", _format_ratio_cell(ratio), owner],
        entity=owner,
        magnitude=float(longest),
        ratio=ratio,
    )
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
def _slot_qtype_mix(frame: pd.DataFrame, feed: str) -> DigestSlot:
    """Compute the qtype-mix slot: a dist over query types.

    This slot always carries a cell: ``_qtype_dist`` supplies a placeholder
    string when the ``qtype`` column is missing or yields no labels.
    """
    if "qtype" in frame.columns:
        column = frame["qtype"]
    else:
        column = None
    return DigestSlot(
        label="qtype-mix",
        statistic="dist",
        cells=[_qtype_dist(column, feed)],
    )
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def _slot_nxdomain_rate(frame: pd.DataFrame, feed: str) -> DigestSlot:
    """Compute the nxdomain-rate slot: rate of rows with ``rcode == 3``.

    Only computed for the ``"zeek"`` feed. The slot is non-speaking for any
    other feed, for an empty frame, when ``rcode`` or ``src`` is missing, or
    when ``_rate`` returns ``None``. Otherwise the cells are
    ``["<pct>% failed", top]`` where ``top`` is the ``src`` value that
    ``_rate`` attributes the rate to.
    """
    label = "nxdomain-rate"

    computable = (
        feed == "zeek"
        and not frame.empty
        and "rcode" in frame.columns
        and "src" in frame.columns
    )
    outcome = None
    if computable:
        # Missing rcodes count as "not NXDOMAIN".
        is_nxdomain = (frame["rcode"] == 3).fillna(False).astype(bool)
        outcome = _rate(is_nxdomain, frame["src"])
    if outcome is None:
        return DigestSlot(label=label, statistic="rate")

    fraction, leader = outcome
    pct = fraction * 100.0
    return DigestSlot(
        label=label,
        statistic="rate",
        cells=[f"{pct:.0f}% failed", leader],
        entity=leader,
        magnitude=pct,
    )
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
def _slot_block_rate(frame: pd.DataFrame, feed: str) -> DigestSlot:
    """Compute the block-rate slot: rate of rows whose event type is blocked.

    Only computed for the ``"pihole"`` feed. A row is "blocked" when its
    ``event_type`` is in ``_BLOCK_EVENT_TYPES``; that derivation is local to
    this module. The slot is non-speaking for any other feed, for an empty
    frame, when ``event_type`` or ``query`` is missing, or when ``_rate``
    returns ``None``. Otherwise the cells are ``["<pct>% blocked", top]``
    where ``top`` is the ``query`` value ``_rate`` attributes the rate to.
    """
    label = "block-rate"

    computable = (
        feed == "pihole"
        and not frame.empty
        and "event_type" in frame.columns
        and "query" in frame.columns
    )
    outcome = None
    if computable:
        is_blocked = (
            frame["event_type"].isin(_BLOCK_EVENT_TYPES).fillna(False).astype(bool)
        )
        outcome = _rate(is_blocked, frame["query"])
    if outcome is None:
        return DigestSlot(label=label, statistic="rate")

    fraction, leader = outcome
    pct = fraction * 100.0
    return DigestSlot(
        label=label,
        statistic="rate",
        cells=[f"{pct:.0f}% blocked", leader],
        entity=leader,
        magnitude=pct,
    )
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
# ── Lede formatters ─────────────────────────────────────────────────────────
|
|
275
|
+
|
|
276
|
+
def _lede_client_volume(slot: DigestSlot) -> str:
    """Render the insight sentence for a speaking client-volume slot.

    Reads ``slot.entity``, ``slot.magnitude`` (a percentage) and
    ``slot.ratio``.
    """
    subject = f"{slot.entity} issued {slot.magnitude:.0f}% of queries"
    comparison = f"{_format_ratio_lede(slot.ratio)} its nearest peer."
    return f"{subject}, {comparison}"
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
def _lede_domain_volume(slot: DigestSlot) -> str:
    """Render the insight sentence for a speaking domain-volume slot.

    Reads ``slot.entity``, ``slot.magnitude`` (a count, shown as an integer)
    and ``slot.ratio``.
    """
    subject = f"{slot.entity} was queried {int(slot.magnitude)} times"
    comparison = f"{_format_ratio_lede(slot.ratio)} the next domain."
    return f"{subject}, {comparison}"
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def _lede_query_length(slot: DigestSlot) -> str:
    """Render the insight sentence for a speaking query-length slot.

    The sentence leads with the owner (``slot.entity``) even though the
    slot's cells lead with the maximum length.
    """
    subject = f"{slot.entity} issued a {int(slot.magnitude)}-character query"
    comparison = f"{_format_ratio_lede(slot.ratio)} the median length."
    return f"{subject}, {comparison}"
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _lede_nxdomain_rate(slot: DigestSlot) -> str:
    """Render the insight sentence for a speaking nxdomain-rate slot.

    Reads ``slot.magnitude`` (a percentage) and ``slot.entity``.
    """
    share = f"{slot.magnitude:.0f}% of queries failed with NXDOMAIN"
    return f"{share}, led by {slot.entity}."
|
|
303
|
+
|
|
304
|
+
|
|
305
|
+
def _lede_block_rate(slot: DigestSlot) -> str:
    """Render the insight sentence for a speaking block-rate slot.

    Reads ``slot.magnitude`` (a percentage) and ``slot.entity``.
    """
    share = f"{slot.magnitude:.0f}% of queries were blocked"
    return f"{share}, led by {slot.entity}."
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
# Slot label → lede formatter, handed to ``select_insights_and_fields`` by
# ``summarize``. "qtype-mix" has no entry: the dist slot is orientation only
# and never produces an insight.
_INSIGHT_FORMATTERS = {
    "client-volume": _lede_client_volume,
    "domain-volume": _lede_domain_volume,
    "query-length": _lede_query_length,
    "nxdomain-rate": _lede_nxdomain_rate,
    "block-rate": _lede_block_rate,
}
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
# ── Zone 1 extras ───────────────────────────────────────────────────────────
|
|
322
|
+
|
|
323
|
+
def _zone1_extras(frame: pd.DataFrame) -> list[tuple[str, str]]:
    """Return the two zone-1 lines: distinct clients and distinct domains.

    Each count is the number of distinct non-null values in ``src`` /
    ``query`` respectively, rendered as a string. A missing column, or an
    empty frame, yields ``"0"``.
    """

    def distinct(column: str) -> str:
        # An empty frame reports zero for every line, whatever its columns.
        if frame.empty or column not in frame.columns:
            return "0"
        return str(int(frame[column].nunique(dropna=True)))

    return [("clients", distinct("src")), ("domains", distinct("query"))]
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
# ── Public entry point ──────────────────────────────────────────────────────
|
|
340
|
+
|
|
341
|
+
def summarize(frame: pd.DataFrame, feed: str) -> dict:
    """Return the schema-specific body of a dns DigestCard.

    Args:
        frame: The loaded DNS frame.
        feed: ``"zeek"`` or ``"pihole"``. It decides which of the two
            feed-specific slots (nxdomain-rate, block-rate) is computed; the
            other comes back non-speaking. The four shared slots are computed
            the same way on both feeds.

    Returns:
        A dict with keys ``"zone1_extras"``, ``"insights"`` and ``"fields"``;
        the latter two come from ``select_insights_and_fields``.
    """
    from loghunter.digest._stats import select_insights_and_fields

    shared_slots = [
        _slot_client_volume(frame),
        _slot_domain_volume(frame),
        _slot_query_length(frame),
        _slot_qtype_mix(frame, feed),
    ]
    feed_slots = [
        _slot_nxdomain_rate(frame, feed),
        _slot_block_rate(frame, feed),
    ]
    insights, fields = select_insights_and_fields(
        shared_slots + feed_slots, _INSIGHT_FORMATTERS
    )
    return {
        "zone1_extras": _zone1_extras(frame),
        "insights": insights,
        "fields": fields,
    }
|
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""syslog summariser — orient-before-the-hunt for fidelity-aware syslog.
|
|
2
|
+
|
|
3
|
+
The thinnest digest card by design — three slots, no manufactured depth.
|
|
4
|
+
A three-row syslog card beside a six-row dns card honestly reads as "syslog
|
|
5
|
+
is simpler," which is true; flat-grammar selection keeps it scannable.
|
|
6
|
+
|
|
7
|
+
Slots (fixed order):
|
|
8
|
+
- host-volume — cliff over per-host line counts (feed-independent)
|
|
9
|
+
- program-volume — cliff over per-program line counts (feed-independent)
|
|
10
|
+
- error-rate — rate of lines that are "errors"; KIND forks on feed
|
|
11
|
+
|
|
12
|
+
Fidelity fork (DNS precedent):
|
|
13
|
+
|
|
14
|
+
- feed ``"syslog"`` (flat rsyslog): the normalized frame carries no
|
|
15
|
+
severity field (RFC 3164 PRI is stripped by the parser), so "error" is
|
|
16
|
+
a keyword-token heuristic over the message body. Kind definition like
|
|
17
|
+
dns's "rcode == 3", not a badness threshold — gated only by RATE_FLOOR.
|
|
18
|
+
- feed ``"zeek"`` (Zeek syslog.log): Zeek emits an explicit ``severity``
|
|
19
|
+
enum on every line, so "error" is the real RFC 5424 error set
|
|
20
|
+
``{EMERG, ALERT, CRIT, ERR}``. No keyword guessing.
|
|
21
|
+
|
|
22
|
+
The lede formatter for ``error-rate`` forks its wording on ``feed`` — the
|
|
23
|
+
Zeek arm speaks in severity terms, the flat arm in token terms. The card
|
|
24
|
+
itself carries no footer surface under the flat grammar; the feed-difference
|
|
25
|
+
disclosure is implicit in the insight wording.
|
|
26
|
+
|
|
27
|
+
Cliff machinery imported from conn so the cards cannot drift on gate /
|
|
28
|
+
floor / display-cap behaviour. The rate statistic and its RATE_FLOOR
|
|
29
|
+
constant live in ``loghunter.digest._stats`` — factored once three cards
|
|
30
|
+
needed an identical copy (this one, dns, and cloudtrail).
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from __future__ import annotations

import re
from collections.abc import Callable

import pandas as pd

from loghunter.common.finding import DigestSlot
from loghunter.digest._stats import RATE_FLOOR, _rate
from loghunter.digest.conn import (
    CLIFF_DISPLAY_CAP,  # noqa: F401 — re-exported for downstream symmetry
    CLIFF_GATE,  # noqa: F401 — re-exported for downstream symmetry
    POPULATION_FLOOR,
    _cliff,
    _format_ratio_cell,
    _format_ratio_lede,
)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
# ── Calibration constants ───────────────────────────────────────────────────
|
|
52
|
+
|
|
53
|
+
# Kind-definition heuristic for the flat-syslog feed. The normalized frame has
# no severity field (the parser strips and discards the RFC 3164 PRI), so an
# "error" line is identified by plain text matching against this token list.
# Keep it sorted longest-first so multi-word phrases survive alternation as
# the list grows.
_ERROR_TOKENS = (
    "out of memory",
    "unreachable",
    "segfault",
    "critical",
    "failure",
    "refused",
    "timeout",
    "denied",
    "failed",
    "error",
    "fatal",
    "panic",
    "oom",
)

# Word boundary in front of the alternation, free suffix behind it:
#   "errors"        matches "error" (start-bounded, suffix allowed)
#   "oom-killer"    matches "oom"   (the hyphen is a non-word character)
#   "out of memory" matches as a literal phrase
#   "terror"        does NOT match  (no boundary between "t" and "error")
_ERROR_RE = re.compile(
    rf"\b(?:{'|'.join(map(re.escape, _ERROR_TOKENS))})",
    re.IGNORECASE,
)

# Zeek-feed kind: the RFC 5424 error severities, stored uppercase. Callers
# upper-case the severity column before the membership test so mixed-case
# emissions are absorbed.
_SEVERITY_ERROR_SET = frozenset(("EMERG", "ALERT", "CRIT", "ERR"))
|
|
86
|
+
|
|
87
|
+
# ── Slot computers ──────────────────────────────────────────────────────────
|
|
88
|
+
|
|
89
|
+
def _slot_host_volume(frame: pd.DataFrame) -> DigestSlot:
    """Compute the host-volume slot: a cliff over per-``host`` line counts.

    The slot is non-speaking when the frame is empty, has no ``host``
    column, or ``_cliff`` returns ``None``. Otherwise the cells are
    ``[host, share-of-lines %, ratio]`` and ``magnitude`` carries the share
    as a percentage of all rows in the frame.
    """
    label = "host-volume"

    cliff = None
    if not frame.empty and "host" in frame.columns:
        per_host = frame["host"].value_counts(dropna=True).sort_values(ascending=False)
        cliff = _cliff(per_host)
    if cliff is None:
        return DigestSlot(label=label, statistic="cliff")

    top_host, line_count, ratio = cliff
    row_count = len(frame)
    share_pct = line_count / row_count * 100.0 if row_count > 0 else 0.0
    host_name = str(top_host)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[host_name, f"{share_pct:.0f}%", _format_ratio_cell(ratio)],
        entity=host_name,
        magnitude=share_pct,
        ratio=ratio,
    )
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def _slot_program_volume(frame: pd.DataFrame) -> DigestSlot:
    """Compute the program-volume slot: a cliff over per-``program`` counts.

    The slot is non-speaking when the frame is empty, has no ``program``
    column, or ``_cliff`` returns ``None``. Otherwise the cells are
    ``[program, line count, ratio]`` and ``magnitude`` is the raw count
    reported by ``_cliff``.
    """
    label = "program-volume"

    cliff = None
    if not frame.empty and "program" in frame.columns:
        per_program = (
            frame["program"].value_counts(dropna=True).sort_values(ascending=False)
        )
        cliff = _cliff(per_program)
    if cliff is None:
        return DigestSlot(label=label, statistic="cliff")

    top_program, line_count, ratio = cliff
    program_name = str(top_program)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[program_name, str(int(line_count)), _format_ratio_cell(ratio)],
        entity=program_name,
        magnitude=line_count,
        ratio=ratio,
    )
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _slot_error_rate(frame: pd.DataFrame, feed: str) -> DigestSlot:
    """Compute the error-rate slot: fraction of lines that are "errors".

    What counts as an error depends on ``feed``:

    - ``"zeek"``: the upper-cased ``severity`` value is in
      ``_SEVERITY_ERROR_SET``. A frame without a ``severity`` column dashes
      the slot. A frame with the column but no error-set values flows
      through ``_rate``, which decides whether the slot speaks.
    - anything else (flat syslog): the ``message`` text matches
      ``_ERROR_RE``. Only ``message`` is inspected, never ``raw``, so
      timestamps or hostnames in the unstripped line cannot trip a token.

    This is a kind definition, not a badness threshold: the fraction is
    reported as a plain fact. The slot is also non-speaking when the frame
    is empty, has no ``host`` column, or ``_rate`` returns ``None``.
    Otherwise the cells are ``["<pct>%", top]`` where ``top`` is the ``host``
    value ``_rate`` attributes the rate to.
    """
    label = "error-rate"

    zeek_feed = feed == "zeek"
    kind_column = "severity" if zeek_feed else "message"
    if (
        frame.empty
        or "host" not in frame.columns
        or kind_column not in frame.columns
    ):
        return DigestSlot(label=label, statistic="rate")

    as_text = frame[kind_column].astype(str)
    if zeek_feed:
        is_error = as_text.str.upper().isin(_SEVERITY_ERROR_SET)
    else:
        is_error = as_text.str.contains(_ERROR_RE, na=False)

    outcome = _rate(is_error, frame["host"])
    if outcome is None:
        return DigestSlot(label=label, statistic="rate")

    fraction, leader = outcome
    pct = fraction * 100.0
    return DigestSlot(
        label=label,
        statistic="rate",
        cells=[f"{pct:.0f}%", leader],
        entity=leader,
        magnitude=pct,
    )
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# ── Lede formatters ─────────────────────────────────────────────────────────
|
|
181
|
+
|
|
182
|
+
def _lede_host_volume(slot: DigestSlot, feed: str) -> str:
    """Render the insight sentence for a speaking host-volume slot.

    ``feed`` is accepted for signature parity with the other syslog lede
    formatters; the wording does not depend on it.
    """
    subject = f"{slot.entity} emitted {slot.magnitude:.0f}% of log lines"
    comparison = f"{_format_ratio_lede(slot.ratio)} the next host."
    return f"{subject}, {comparison}"
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def _lede_program_volume(slot: DigestSlot, feed: str) -> str:
    """Render the insight sentence for a speaking program-volume slot.

    ``feed`` is accepted for signature parity with the other syslog lede
    formatters; the wording does not depend on it.
    """
    subject = f"{slot.entity} emitted {int(slot.magnitude)} lines"
    comparison = f"{_format_ratio_lede(slot.ratio)} the next program."
    return f"{subject}, {comparison}"
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def _lede_error_rate(slot: DigestSlot, feed: str) -> str:
    """Render the insight sentence for a speaking error-rate slot.

    The wording forks on ``feed``. The ``"zeek"`` variant speaks in severity
    terms and must never mention tokens or imply keyword matching, because
    that path uses real severities. Every other feed gets the keyword-token
    wording.
    """
    share = f"{slot.magnitude:.0f}% of lines"
    if feed == "zeek":
        description = "are error-severity (ERR or higher)"
    else:
        description = "carry an error token"
    return f"{share} {description}, led by {slot.entity}."
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def _insight_formatters(feed: str) -> dict[str, Callable[[DigestSlot], str]]:
    """Bind ``feed`` into the feed-aware lede formatters.

    The shared selection helper expects formatters of the plain
    ``(slot) -> str`` shape, while the syslog formatters take ``feed``
    explicitly. Closing over ``feed`` here bridges the two without a
    sentinel value.

    Args:
        feed: Feed name forwarded to every formatter.

    Returns:
        A dict mapping each slot label (``"host-volume"``,
        ``"program-volume"``, ``"error-rate"``) to a one-argument callable
        that renders the slot's lede.
    """
    # ``Callable`` must be imported at module level (collections.abc): the
    # annotation is lazy, but an undefined name would still break
    # typing.get_type_hints() and is reported by linters.
    return {
        "host-volume": lambda slot: _lede_host_volume(slot, feed),
        "program-volume": lambda slot: _lede_program_volume(slot, feed),
        "error-rate": lambda slot: _lede_error_rate(slot, feed),
    }
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
# ── Zone 1 extras ───────────────────────────────────────────────────────────
|
|
229
|
+
|
|
230
|
+
def _zone1_extras(frame: pd.DataFrame) -> list[tuple[str, str]]:
    """Return the two zone-1 lines: distinct hosts and distinct programs.

    Each count is the number of distinct non-null values in ``host`` /
    ``program`` respectively, rendered as a string. A missing column, or an
    empty frame, yields ``"0"``.
    """

    def distinct(column: str) -> str:
        # An empty frame reports zero for every line, whatever its columns.
        if frame.empty or column not in frame.columns:
            return "0"
        return str(int(frame[column].nunique(dropna=True)))

    return [("hosts", distinct("host")), ("programs", distinct("program"))]
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# ── Public entry point ─────────────────────────────────────────────────────
|
|
247
|
+
|
|
248
|
+
def summarize(frame: pd.DataFrame, feed: str) -> dict:
    """Return the schema-specific body of a syslog DigestCard.

    Args:
        frame: The loaded syslog frame.
        feed: ``"zeek"`` or ``"syslog"``. It selects the error-rate kind and
            the wording of its lede; the host- and program-volume cliffs do
            not depend on it.

    Returns:
        A dict with keys ``"zone1_extras"``, ``"insights"`` and ``"fields"``;
        the latter two come from ``select_insights_and_fields``.
    """
    from loghunter.digest._stats import select_insights_and_fields

    host_slot = _slot_host_volume(frame)
    program_slot = _slot_program_volume(frame)
    error_slot = _slot_error_rate(frame, feed)

    insights, fields = select_insights_and_fields(
        [host_slot, program_slot, error_slot],
        _insight_formatters(feed),
    )
    return {
        "zone1_extras": _zone1_extras(frame),
        "insights": insights,
        "fields": fields,
    }
|