loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,364 @@
1
+ """dns summariser — orient-before-the-hunt for DNS data.
2
+
3
+ The first fidelity-aware digest card: a slot set that depends on which DNS
4
+ feed was loaded. Four slots are shared (cliff/cliff/tail/dist over columns
5
+ present on both feeds); two are feed-specific:
6
+
7
+ - nxdomain-rate (rcode-based) — Zeek only; non-speaking on Pi-hole
8
+ - block-rate (event_type-based) — Pi-hole only; non-speaking on Zeek
9
+
10
+ A feed-uncomputable slot returns a non-speaking ``DigestSlot`` (cells=None);
11
+ ``select_insights_and_fields`` filters it out of ``fields`` and the slot
12
+ simply vanishes from the rendered card. No ABSENT marker, no footer text.
13
+
14
+ Cliff machinery imported from conn so the two cards cannot drift on gate /
15
+ floor / display-cap behaviour. The rate statistic — and its RATE_FLOOR
16
+ constant — live in ``loghunter.digest._stats`` (factored once three cards
17
+ needed an identical copy: this one, syslog, and cloudtrail). Two more
18
+ statistics computed locally:
19
+
20
+ - tail: max/median ratio over a distribution, with an owner attribution
21
+ - dist: top-3 share-of-mix; orientation only, never produces an insight
22
+
23
+ A row is "blocked" on Pi-hole iff event_type ∈ {gravity_blocked,
24
+ regex_blocked} — digest computes this locally; the detector is not imported.
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ import pandas as pd
30
+
31
+ from loghunter.common.finding import DigestSlot
32
+ from loghunter.digest._stats import RATE_FLOOR, _rate
33
+ from loghunter.digest.conn import (
34
+ CLIFF_DISPLAY_CAP, # noqa: F401 — re-exported for downstream symmetry
35
+ CLIFF_GATE, # noqa: F401 — re-exported for downstream symmetry
36
+ POPULATION_FLOOR,
37
+ _cliff,
38
+ _format_ratio_cell,
39
+ _format_ratio_lede,
40
+ )
41
+
42
+
43
+ # ── Calibration constants — provisional, tunable in one place ───────────────
44
+
45
+ TAIL_GATE = 3.0 # max/median ratio below this → query-length is non-speaking
46
+
47
+ # Zeek emits qtype as a numeric type code; map the common ones to mnemonics
48
+ # for display. Unmapped codes render as "TYPE<n>" so an analyst still has a
49
+ # breadcrumb to look up.
50
+ _ZEEK_QTYPE_MNEMONICS = {
51
+ 1: "A", 2: "NS", 5: "CNAME", 6: "SOA", 12: "PTR",
52
+ 15: "MX", 16: "TXT", 28: "AAAA", 33: "SRV", 65: "HTTPS", 257: "CAA",
53
+ }
54
+
55
+ _BLOCK_EVENT_TYPES = frozenset({"gravity_blocked", "regex_blocked"})
56
+
57
+
58
+ # ── tail statistic ──────────────────────────────────────────────────────────
59
+
60
def _tail(values: pd.Series, owner_series: pd.Series) -> tuple | None:
    """Tail statistic: how far does the extreme sit from the typical value?

    Returns ``(max_val, ratio, owner)`` when the slot should speak, where
    ``ratio`` is max/median and ``owner`` is the ``owner_series`` entry found
    at the index label of the maximum. Returns None (slot dashes) when:

    - fewer than POPULATION_FLOOR non-null values remain,
    - the median or the maximum is 0 or NaN,
    - max/median is below TAIL_GATE,
    - the owner cannot be looked up, or is itself null.

    ``values`` and ``owner_series`` must share an index, because the owner is
    resolved by the label returned from ``idxmax``.
    """

    def _degenerate(number) -> bool:
        # A zero or missing anchor makes the ratio meaningless.
        return bool(pd.isna(number) or number == 0)

    observed = values.dropna()
    if len(observed) < POPULATION_FLOOR:
        return None

    middle = observed.median()
    if _degenerate(middle):
        return None

    peak = observed.max()
    if _degenerate(peak):
        return None

    spread = float(peak) / float(middle)
    if spread < TAIL_GATE:
        return None

    peak_label = observed.idxmax()
    try:
        who = owner_series.loc[peak_label]
    except (KeyError, ValueError):
        return None

    return None if pd.isna(who) else (int(peak), spread, str(who))
90
+
91
+
92
+ # ── dist statistic — qtype-mix, always shows ────────────────────────────────
93
+
94
+ def _qtype_label(value: object, feed: str) -> str | None:
95
+ """Map a single qtype value to a display string.
96
+
97
+ Zeek: numeric code → mnemonic from _ZEEK_QTYPE_MNEMONICS; unmapped
98
+ integers → ``"TYPE<n>"``. Pi-hole: already a string mnemonic; used
99
+ as-is. NaN / unparseable → None (caller filters).
100
+ """
101
+ if pd.isna(value):
102
+ return None
103
+ if feed == "pihole":
104
+ s = str(value).strip()
105
+ return s if s else None
106
+ try:
107
+ code = int(value)
108
+ except (TypeError, ValueError):
109
+ s = str(value).strip()
110
+ return s if s else None
111
+ return _ZEEK_QTYPE_MNEMONICS.get(code, f"TYPE{code}")
112
+
113
+
114
+ def _qtype_dist(qtypes: pd.Series | None, feed: str) -> str:
115
+ """Render top-3 qtype share string for the qtype-mix dist slot.
116
+
117
+ Two distinct fallbacks (consistency pinned by review):
118
+ - Missing column (qtypes is None) → "(no qtype)" (schema-presence fact)
119
+ - Empty / all-NaN series → "(no queries)" (data-shape fact)
120
+ Single-type pile → "A 100%". Mix → "A 82% · AAAA 11% · HTTPS 4%".
121
+ """
122
+ if qtypes is None:
123
+ return "(no qtype)"
124
+ labels = qtypes.map(lambda v: _qtype_label(v, feed)).dropna()
125
+ if labels.empty:
126
+ return "(no queries)"
127
+ counts = labels.value_counts()
128
+ total = int(counts.sum())
129
+ top_three = counts.head(3)
130
+ parts = [
131
+ f"{label} {int(round(count / total * 100))}%"
132
+ for label, count in top_three.items()
133
+ ]
134
+ return " · ".join(parts)
135
+
136
+
137
+ # ── Slot computers ──────────────────────────────────────────────────────────
138
+
139
def _slot_client_volume(frame: pd.DataFrame) -> DigestSlot:
    """client-volume — cliff over the number of queries issued per ``src``.

    Returns a non-speaking slot (no cells) when the frame is empty, has no
    ``src`` column, or ``_cliff`` declines to speak. Otherwise the cells are
    ``[client, share-of-all-rows %, ratio]`` and ``magnitude`` carries the
    share as a percentage.
    """
    label = "client-volume"

    verdict = None
    if not frame.empty and "src" in frame.columns:
        per_client = (
            frame["src"].value_counts(dropna=True).sort_values(ascending=False)
        )
        verdict = _cliff(per_client)

    if verdict is None:
        return DigestSlot(label=label, statistic="cliff")

    top_client, top_count, gap = verdict
    row_total = len(frame)
    # Share is measured against every row, including rows with a null src.
    share = (top_count / row_total * 100.0) if row_total > 0 else 0.0
    client_name = str(top_client)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[client_name, f"{share:.0f}%", _format_ratio_cell(gap)],
        entity=client_name,
        magnitude=share,
        ratio=gap,
    )
160
+
161
+
162
def _slot_domain_volume(frame: pd.DataFrame) -> DigestSlot:
    """domain-volume — cliff over how many times each ``query`` appears.

    Returns a non-speaking slot when the frame is empty, has no ``query``
    column, or ``_cliff`` declines to speak. Otherwise the cells are
    ``[domain, magnitude as an integer, ratio]`` and ``magnitude`` is passed
    through from ``_cliff`` unchanged.
    """
    label = "domain-volume"

    verdict = None
    if not frame.empty and "query" in frame.columns:
        per_domain = (
            frame["query"].value_counts(dropna=True).sort_values(ascending=False)
        )
        verdict = _cliff(per_domain)

    if verdict is None:
        return DigestSlot(label=label, statistic="cliff")

    top_domain, hits, gap = verdict
    domain_name = str(top_domain)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[domain_name, f"{int(hits)}", _format_ratio_cell(gap)],
        entity=domain_name,
        magnitude=hits,
        ratio=gap,
    )
181
+
182
+
183
def _slot_query_length(frame: pd.DataFrame) -> DigestSlot:
    """query-length — tail over query character lengths; names the owner.

    The table row leads with the magnitude: cells are
    ``[maxlen, ratio, owner]``. The lede, by contrast, leads with the owner,
    which is why ``entity`` is the owning ``src``.

    Non-speaking when the frame is empty, ``query`` or ``src`` is missing,
    every query is null, or ``_tail`` declines to speak.
    """
    label = "query-length"
    required = {"query", "src"}

    if frame.empty or not required.issubset(frame.columns):
        return DigestSlot(label=label, statistic="tail")

    text = frame["query"].dropna().astype(str)
    if text.empty:
        return DigestSlot(label=label, statistic="tail")

    # Owners are selected by the surviving query labels so both series share
    # an index, as _tail requires.
    outcome = _tail(text.str.len(), frame.loc[text.index, "src"])
    if outcome is None:
        return DigestSlot(label=label, statistic="tail")

    longest, spread, who = outcome
    return DigestSlot(
        label=label,
        statistic="tail",
        cells=[f"{longest} chars", _format_ratio_cell(spread), who],
        entity=who,
        magnitude=float(longest),
        ratio=spread,
    )
210
+
211
+
212
def _slot_qtype_mix(frame: pd.DataFrame, feed: str) -> DigestSlot:
    """qtype-mix — dist over query types.

    Always returns a slot with exactly one cell: a missing ``qtype`` column
    is passed to ``_qtype_dist`` as None, which renders its own placeholder.
    """
    qtypes = None
    if "qtype" in frame.columns:
        qtypes = frame["qtype"]
    return DigestSlot(
        label="qtype-mix",
        statistic="dist",
        cells=[_qtype_dist(qtypes, feed)],
    )
218
+
219
+
220
def _slot_nxdomain_rate(frame: pd.DataFrame, feed: str) -> DigestSlot:
    """nxdomain-rate — rate of NXDOMAIN answers (``rcode == 3``). Zeek only.

    Any feed other than ``"zeek"`` gets a non-speaking slot, which the
    summariser filters out, so the row vanishes from the card on Pi-hole.
    The slot is also non-speaking when the frame is empty, ``rcode`` or
    ``src`` is missing, or ``_rate`` declines to speak.

    When speaking, the cells are ``["<pct>% failed", top]``, where ``top`` is
    the second element returned by ``_rate`` over the ``src`` column.
    """
    label = "nxdomain-rate"

    computable = (
        feed == "zeek"
        and not frame.empty
        and "rcode" in frame.columns
        and "src" in frame.columns
    )

    outcome = None
    if computable:
        # fillna guards against a comparison that yields missing values.
        is_nxdomain = (frame["rcode"] == 3).fillna(False).astype(bool)
        outcome = _rate(is_nxdomain, frame["src"])

    if outcome is None:
        return DigestSlot(label=label, statistic="rate")

    fraction, leader = outcome
    percent = fraction * 100.0
    return DigestSlot(
        label=label,
        statistic="rate",
        cells=[f"{percent:.0f}% failed", leader],
        entity=leader,
        magnitude=percent,
    )
244
+
245
+
246
def _slot_block_rate(frame: pd.DataFrame, feed: str) -> DigestSlot:
    """block-rate — rate of blocked queries. Pi-hole only.

    A row counts as blocked when its ``event_type`` is one of
    ``_BLOCK_EVENT_TYPES`` (gravity_blocked / regex_blocked). That test is
    derived locally; the detector is not imported.

    Any feed other than ``"pihole"`` gets a non-speaking slot, which the
    summariser filters out, so the row vanishes from the card on Zeek. The
    slot is also non-speaking when the frame is empty, ``event_type`` or
    ``query`` is missing, or ``_rate`` declines to speak.

    When speaking, the cells are ``["<pct>% blocked", top]``, where ``top``
    is the second element returned by ``_rate`` over the ``query`` column.
    """
    label = "block-rate"

    computable = (
        feed == "pihole"
        and not frame.empty
        and "event_type" in frame.columns
        and "query" in frame.columns
    )

    outcome = None
    if computable:
        was_blocked = (
            frame["event_type"].isin(_BLOCK_EVENT_TYPES).fillna(False).astype(bool)
        )
        outcome = _rate(was_blocked, frame["query"])

    if outcome is None:
        return DigestSlot(label=label, statistic="rate")

    fraction, leader = outcome
    percent = fraction * 100.0
    return DigestSlot(
        label=label,
        statistic="rate",
        cells=[f"{percent:.0f}% blocked", leader],
        entity=leader,
        magnitude=percent,
    )
272
+
273
+
274
+ # ── Lede formatters ─────────────────────────────────────────────────────────
275
+
276
+ def _lede_client_volume(slot: DigestSlot) -> str:
277
+ return (
278
+ f"{slot.entity} issued {slot.magnitude:.0f}% of queries, "
279
+ f"{_format_ratio_lede(slot.ratio)} its nearest peer."
280
+ )
281
+
282
+
283
+ def _lede_domain_volume(slot: DigestSlot) -> str:
284
+ return (
285
+ f"{slot.entity} was queried {int(slot.magnitude)} times, "
286
+ f"{_format_ratio_lede(slot.ratio)} the next domain."
287
+ )
288
+
289
+
290
+ def _lede_query_length(slot: DigestSlot) -> str:
291
+ # Lede leads with owner (entity); cell order leads with maxlen.
292
+ return (
293
+ f"{slot.entity} issued a {int(slot.magnitude)}-character query, "
294
+ f"{_format_ratio_lede(slot.ratio)} the median length."
295
+ )
296
+
297
+
298
+ def _lede_nxdomain_rate(slot: DigestSlot) -> str:
299
+ return (
300
+ f"{slot.magnitude:.0f}% of queries failed with NXDOMAIN, "
301
+ f"led by {slot.entity}."
302
+ )
303
+
304
+
305
+ def _lede_block_rate(slot: DigestSlot) -> str:
306
+ return (
307
+ f"{slot.magnitude:.0f}% of queries were blocked, "
308
+ f"led by {slot.entity}."
309
+ )
310
+
311
+
312
+ _INSIGHT_FORMATTERS = {
313
+ "client-volume": _lede_client_volume,
314
+ "domain-volume": _lede_domain_volume,
315
+ "query-length": _lede_query_length,
316
+ "nxdomain-rate": _lede_nxdomain_rate,
317
+ "block-rate": _lede_block_rate,
318
+ }
319
+
320
+
321
+ # ── Zone 1 extras ───────────────────────────────────────────────────────────
322
+
323
+ def _zone1_extras(frame: pd.DataFrame) -> list[tuple[str, str]]:
324
+ """Two lines, brief-pinned: distinct clients + distinct domains."""
325
+ if frame.empty:
326
+ return [("clients", "0"), ("domains", "0")]
327
+ distinct_clients = (
328
+ int(frame["src"].nunique(dropna=True)) if "src" in frame.columns else 0
329
+ )
330
+ distinct_domains = (
331
+ int(frame["query"].nunique(dropna=True)) if "query" in frame.columns else 0
332
+ )
333
+ return [
334
+ ("clients", str(distinct_clients)),
335
+ ("domains", str(distinct_domains)),
336
+ ]
337
+
338
+
339
+ # ── Public entry point ──────────────────────────────────────────────────────
340
+
341
def summarize(frame: pd.DataFrame, feed: str) -> dict:
    """Return the schema-specific body of a dns DigestCard.

    ``feed`` is ``"zeek"`` or ``"pihole"``. It decides which of the two
    feed-specific slots (nxdomain-rate, block-rate) can populate; the other
    comes back non-speaking. The remaining four slots are computed the same
    way on both feeds.

    The result has three keys: ``zone1_extras``, ``insights`` and ``fields``.
    The last two come from ``select_insights_and_fields``.
    """
    from loghunter.digest._stats import select_insights_and_fields

    # Slot order is fixed: frame-only slots first, then the feed-aware ones.
    frame_only = (_slot_client_volume, _slot_domain_volume, _slot_query_length)
    feed_aware = (_slot_qtype_mix, _slot_nxdomain_rate, _slot_block_rate)

    slots = [compute(frame) for compute in frame_only]
    slots.extend(compute(frame, feed) for compute in feed_aware)

    insights, fields = select_insights_and_fields(slots, _INSIGHT_FORMATTERS)

    body = {"zone1_extras": _zone1_extras(frame)}
    body["insights"] = insights
    body["fields"] = fields
    return body
@@ -0,0 +1,269 @@
1
+ """syslog summariser — orient-before-the-hunt for fidelity-aware syslog.
2
+
3
+ The thinnest digest card by design — three slots, no manufactured depth.
4
+ A three-row syslog card beside a six-row dns card honestly reads as "syslog
5
+ is simpler," which is true; flat-grammar selection keeps it scannable.
6
+
7
+ Slots (fixed order):
8
+ - host-volume — cliff over per-host line counts (feed-independent)
9
+ - program-volume — cliff over per-program line counts (feed-independent)
10
+ - error-rate — rate of lines that are "errors"; KIND forks on feed
11
+
12
+ Fidelity fork (DNS precedent):
13
+
14
+ - feed ``"syslog"`` (flat rsyslog): the normalized frame carries no
15
+ severity field (RFC 3164 PRI is stripped by the parser), so "error" is
16
+ a keyword-token heuristic over the message body. Kind definition like
17
+ dns's "rcode == 3", not a badness threshold — gated only by RATE_FLOOR.
18
+ - feed ``"zeek"`` (Zeek syslog.log): Zeek emits an explicit ``severity``
19
+ enum on every line, so "error" is the real RFC 5424 error set
20
+ ``{EMERG, ALERT, CRIT, ERR}``. No keyword guessing.
21
+
22
+ The lede formatter for ``error-rate`` forks its wording on ``feed`` — the
23
+ Zeek arm speaks in severity terms, the flat arm in token terms. The card
24
+ itself carries no footer surface under the flat grammar; the feed-difference
25
+ disclosure is implicit in the insight wording.
26
+
27
+ Cliff machinery imported from conn so the cards cannot drift on gate /
28
+ floor / display-cap behaviour. The rate statistic and its RATE_FLOOR
29
+ constant live in ``loghunter.digest._stats`` — factored once three cards
30
+ needed an identical copy (this one, dns, and cloudtrail).
31
+ """
32
+
33
from __future__ import annotations

import re
from collections.abc import Callable

import pandas as pd

from loghunter.common.finding import DigestSlot
from loghunter.digest._stats import RATE_FLOOR, _rate
from loghunter.digest.conn import (
    CLIFF_DISPLAY_CAP,  # noqa: F401 — re-exported for downstream symmetry
    CLIFF_GATE,  # noqa: F401 — re-exported for downstream symmetry
    POPULATION_FLOOR,
    _cliff,
    _format_ratio_cell,
    _format_ratio_lede,
)
49
+
50
+
51
+ # ── Calibration constants ───────────────────────────────────────────────────
52
+
53
+ # Kind-definition heuristic. The normalized syslog frame carries no severity
54
+ # field — RFC 3164 PRI is stripped by the parser and discarded. This is plain
55
+ # text matching against an error-indicating token list, sorted longest-first so
56
+ # multi-word phrases survive alternation as the list grows.
57
+ _ERROR_TOKENS = (
58
+ "out of memory",
59
+ "unreachable",
60
+ "segfault",
61
+ "critical",
62
+ "failure",
63
+ "refused",
64
+ "timeout",
65
+ "denied",
66
+ "failed",
67
+ "error",
68
+ "fatal",
69
+ "panic",
70
+ "oom",
71
+ )
72
+
73
+ # Start-boundary at the alternation, free suffix at the end. So "errors" matches
74
+ # "error" (start-bounded), "oom-killer" matches "oom" (hyphen is non-word),
75
+ # "out of memory" matches as a literal phrase, but "terror" does NOT match
76
+ # "error" (no word boundary before "error" when preceded by a word char).
77
+ _ERROR_RE = re.compile(
78
+ r"\b(?:" + "|".join(re.escape(t) for t in _ERROR_TOKENS) + r")",
79
+ re.IGNORECASE,
80
+ )
81
+
82
+ # Zeek-feed kind: real RFC 5424 error severities. Uppercase enum strings on
83
+ # the wire ("EMERG", "ALERT", "CRIT", "ERR") — matched case-insensitively to
84
+ # absorb mixed-case Zeek emissions without column-sniffing the case shape.
85
+ _SEVERITY_ERROR_SET = frozenset({"EMERG", "ALERT", "CRIT", "ERR"})
86
+
87
+ # ── Slot computers ──────────────────────────────────────────────────────────
88
+
89
def _slot_host_volume(frame: pd.DataFrame) -> DigestSlot:
    """host-volume — cliff over the number of log lines per ``host``.

    Returns a non-speaking slot (no cells) when the frame is empty, has no
    ``host`` column, or ``_cliff`` declines to speak. Otherwise the cells are
    ``[host, share-of-all-rows %, ratio]`` and ``magnitude`` carries the
    share as a percentage.
    """
    label = "host-volume"

    verdict = None
    if not frame.empty and "host" in frame.columns:
        per_host = (
            frame["host"].value_counts(dropna=True).sort_values(ascending=False)
        )
        verdict = _cliff(per_host)

    if verdict is None:
        return DigestSlot(label=label, statistic="cliff")

    top_host, top_count, gap = verdict
    row_total = len(frame)
    # Share is measured against every row, including rows with a null host.
    share = (top_count / row_total * 100.0) if row_total > 0 else 0.0
    host_name = str(top_host)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[host_name, f"{share:.0f}%", _format_ratio_cell(gap)],
        entity=host_name,
        magnitude=share,
        ratio=gap,
    )
110
+
111
+
112
def _slot_program_volume(frame: pd.DataFrame) -> DigestSlot:
    """program-volume — cliff over the number of log lines per ``program``.

    Returns a non-speaking slot when the frame is empty, has no ``program``
    column, or ``_cliff`` declines to speak. Otherwise the cells are
    ``[program, magnitude as an integer, ratio]`` and ``magnitude`` is passed
    through from ``_cliff`` unchanged.
    """
    label = "program-volume"

    verdict = None
    if not frame.empty and "program" in frame.columns:
        per_program = (
            frame["program"].value_counts(dropna=True).sort_values(ascending=False)
        )
        verdict = _cliff(per_program)

    if verdict is None:
        return DigestSlot(label=label, statistic="cliff")

    top_program, lines, gap = verdict
    program_name = str(top_program)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[program_name, f"{int(lines)}", _format_ratio_cell(gap)],
        entity=program_name,
        magnitude=lines,
        ratio=gap,
    )
131
+
132
+
133
def _slot_error_rate(frame: pd.DataFrame, feed: str) -> DigestSlot:
    """error-rate — fraction of lines that are "errors"; the kind forks on feed.

    feed ``"zeek"``
        A line is an error when its ``severity``, upper-cased, is in
        ``_SEVERITY_ERROR_SET`` ({EMERG, ALERT, CRIT, ERR}). If the
        ``severity`` column is absent the slot dashes.
    any other feed (flat syslog)
        A line is an error when its ``message`` text matches ``_ERROR_RE``.
        Only the ``message`` column is consulted, never ``raw``, so that
        timestamps or hostnames in the unstripped line cannot trip a token.
        If the ``message`` column is absent the slot dashes.

    The slot also dashes when the frame is empty, has no ``host`` column, or
    ``_rate`` declines to speak. This is a kind definition, not a badness
    threshold: the fraction is reported as a plain fact, and whether it is
    shown at all is left to ``_rate``.

    When speaking, the cells are ``["<pct>%", top]``, where ``top`` is the
    second element returned by ``_rate`` over the ``host`` column.
    """
    label = "error-rate"

    if frame.empty or "host" not in frame.columns:
        return DigestSlot(label=label, statistic="rate")

    zeek_feed = feed == "zeek"
    kind_column = "severity" if zeek_feed else "message"
    if kind_column not in frame.columns:
        return DigestSlot(label=label, statistic="rate")

    # Both arms work on the stringified column.
    text = frame[kind_column].astype(str)
    if zeek_feed:
        is_error = text.str.upper().isin(_SEVERITY_ERROR_SET)
    else:
        is_error = text.str.contains(_ERROR_RE, na=False)

    outcome = _rate(is_error, frame["host"])
    if outcome is None:
        return DigestSlot(label=label, statistic="rate")

    fraction, leader = outcome
    percent = fraction * 100.0
    return DigestSlot(
        label=label,
        statistic="rate",
        cells=[f"{percent:.0f}%", leader],
        entity=leader,
        magnitude=percent,
    )
178
+
179
+
180
+ # ── Lede formatters ─────────────────────────────────────────────────────────
181
+
182
+ def _lede_host_volume(slot: DigestSlot, feed: str) -> str:
183
+ return (
184
+ f"{slot.entity} emitted {slot.magnitude:.0f}% of log lines, "
185
+ f"{_format_ratio_lede(slot.ratio)} the next host."
186
+ )
187
+
188
+
189
+ def _lede_program_volume(slot: DigestSlot, feed: str) -> str:
190
+ return (
191
+ f"{slot.entity} emitted {int(slot.magnitude)} lines, "
192
+ f"{_format_ratio_lede(slot.ratio)} the next program."
193
+ )
194
+
195
+
196
+ def _lede_error_rate(slot: DigestSlot, feed: str) -> str:
197
+ """error-rate lede — wording forks on feed.
198
+
199
+ The Zeek variant MUST NOT say "token" or imply keyword matching — that
200
+ would misdescribe the real-severity Zeek path. The flat-syslog variant
201
+ keeps the existing keyword wording.
202
+ """
203
+ if feed == "zeek":
204
+ return (
205
+ f"{slot.magnitude:.0f}% of lines are error-severity "
206
+ f"(ERR or higher), led by {slot.entity}."
207
+ )
208
+ return (
209
+ f"{slot.magnitude:.0f}% of lines carry an error token, "
210
+ f"led by {slot.entity}."
211
+ )
212
+
213
+
214
+ def _insight_formatters(feed: str) -> dict[str, "Callable[[DigestSlot], str]"]:
215
+ """Bind ``feed`` into the feed-aware formatters so the shared selection
216
+ helper sees the standard ``(slot) -> str`` shape.
217
+
218
+ A small dedicated helper rather than a sentinel — the formatters take
219
+ feed explicitly, and partial-binding here is the obvious mechanism.
220
+ """
221
+ return {
222
+ "host-volume": lambda slot: _lede_host_volume(slot, feed),
223
+ "program-volume": lambda slot: _lede_program_volume(slot, feed),
224
+ "error-rate": lambda slot: _lede_error_rate(slot, feed),
225
+ }
226
+
227
+
228
+ # ── Zone 1 extras ───────────────────────────────────────────────────────────
229
+
230
+ def _zone1_extras(frame: pd.DataFrame) -> list[tuple[str, str]]:
231
+ """Two lines, brief-pinned: distinct hosts + distinct programs."""
232
+ if frame.empty:
233
+ return [("hosts", "0"), ("programs", "0")]
234
+ distinct_hosts = (
235
+ int(frame["host"].nunique(dropna=True)) if "host" in frame.columns else 0
236
+ )
237
+ distinct_programs = (
238
+ int(frame["program"].nunique(dropna=True)) if "program" in frame.columns else 0
239
+ )
240
+ return [
241
+ ("hosts", str(distinct_hosts)),
242
+ ("programs", str(distinct_programs)),
243
+ ]
244
+
245
+
246
+ # ── Public entry point ─────────────────────────────────────────────────────
247
+
248
def summarize(frame: pd.DataFrame, feed: str) -> dict:
    """Return the schema-specific body of a syslog DigestCard.

    ``feed`` is ``"zeek"`` or ``"syslog"``. It picks which kind drives the
    error-rate slot and which wording its lede uses. The host-volume and
    program-volume cliffs do not depend on the feed.

    The result has three keys: ``zone1_extras``, ``insights`` and ``fields``.
    The last two come from ``select_insights_and_fields``.
    """
    from loghunter.digest._stats import select_insights_and_fields

    # Slot order is fixed: host, program, then the feed-aware error rate.
    slots = [compute(frame) for compute in (_slot_host_volume, _slot_program_volume)]
    slots.append(_slot_error_rate(frame, feed))

    formatters = _insight_formatters(feed)
    insights, fields = select_insights_and_fields(slots, formatters)

    body = {"zone1_extras": _zone1_extras(frame)}
    body["insights"] = insights
    body["fields"] = fields
    return body