loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,25 @@
1
+ """SSL detector — TLS anomaly detection from Zeek ssl.log. (planned)
2
+
3
+ Flags self-signed certificates, weak cipher suites, unusual SNI patterns,
4
+ and certificate validity anomalies that may indicate malicious infrastructure.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from loghunter.common.finding import DetectorContext, Finding
10
+
11
+ DETECTOR_NAME = "ssl"
12
+ STATUS = "planned"
13
+
14
+ REQUIRED_LOGS = [
15
+ {"source": "zeek_dir", "pattern": "ssl*.log*"},
16
+ ]
17
+
18
+ OPTIONAL_LOGS: list[dict] = []
19
+
20
+ DEFAULT_CONFIG: dict = {}
21
+
22
+
23
def run(context: DetectorContext) -> list[Finding]:
    """Placeholder entry point for the planned TLS anomaly detector.

    No detection logic exists yet; the module advertises itself with
    ``STATUS = "planned"`` and this function fails loudly if it is invoked.

    Args:
        context: Detector context supplied by the caller (unused).

    Raises:
        NotImplementedError: Always.
    """
    message = "ssl detector is planned — not yet implemented"
    raise NotImplementedError(message)
@@ -0,0 +1,266 @@
1
+ """Syslog anomaly detector — drain3 templating + rarity scoring.
2
+
3
+ Pipeline:
4
+ 1. drain3 log templating: assigns each message row a template_id and template_str
5
+ 2. Rarity scoring: flags templates whose occurrence count falls at or below
6
+ min(percentile_threshold, max_count) as anomalous
7
+ 3. Reboot detection: scans all rows for known reboot/shutdown signal patterns
8
+ 4. Reboot suppression: anomalous events within reboot_suppress_window seconds after
9
+ a detected reboot on the same host are suppressed; one synthetic reboot annotation
10
+ is emitted per detected reboot in their place
11
+ 5. Finding production: one Finding per anomalous event plus one per synthetic reboot
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from collections import defaultdict
17
+ from datetime import datetime, timezone
18
+ from typing import Any
19
+
20
+ import pandas as pd
21
+ from tqdm import tqdm
22
+
23
+ from loghunter.common.finding import DetectorContext, Finding, MethodTag, Severity
24
+
25
+ DETECTOR_NAME = "syslog"
26
+ STATUS = "available"
27
+
28
+ # syslog is fidelity-aware: either flat rsyslog (syslog_dir/*.log*) OR Zeek's
29
+ # own syslog.log (zeek_dir/syslog*.log*). At least one must be present; both
30
+ # concat before drain3. Detector is SOURCE-BLIND — references only the
31
+ # minimal-5 (ts, host, program, raw, message). Zeek's extended facility /
32
+ # severity ride along on the frame but are NEVER read here; the digest
33
+ # consumes them. Mirrors the dns detector's Zeek + Pi-hole shape.
34
+ REQUIRED_LOGS: list[dict] = []
35
+
36
+ OPTIONAL_LOGS = [
37
+ {"source": "syslog_dir", "pattern": "*.log*"},
38
+ {"source": "zeek_dir", "pattern": "syslog*.log*"},
39
+ ]
40
+
41
+ REQUIRES_ONE_OF_OPTIONAL = True
42
+ REQUIRES_ONE_OF_OPTIONAL_REASON = (
43
+ "syslog — no syslog source found "
44
+ "(need syslog_dir files or zeek_dir syslog.log)"
45
+ )
46
+
47
+ DRAIN_SIM_THRESH = 0.5
48
+ DRAIN_DEPTH = 4
49
+ DRAIN_PARAMETRIZE_NUMERIC = True
50
+ REBOOT_SUPPRESS_WINDOW = 300 # seconds
51
+
52
+ DEFAULT_CONFIG = {
53
+ "lookback_days": 7,
54
+ "rarity_pct": 10,
55
+ "max_count": 1,
56
+ "sim_thresh": DRAIN_SIM_THRESH,
57
+ "depth": DRAIN_DEPTH,
58
+ "parametrize_numeric": DRAIN_PARAMETRIZE_NUMERIC,
59
+ "reboot_suppress_window": REBOOT_SUPPRESS_WINDOW,
60
+ }
61
+
62
+ DETECTOR_METHOD = MethodTag("drain3", named=True)
63
+
64
+
65
def run(context: DetectorContext) -> list[Finding]:
    """Detect anomalous syslog lines using drain3 templating and rarity scoring.

    The flat-syslog frame (``"*.log*"``) and the Zeek syslog frame
    (``"syslog*.log*"``) are read from ``context.logs``; missing or empty
    frames are ignored and the remaining ones are concatenated. The combined
    frame is templated, rarity-scored, scanned for reboots, and the anomalous
    rows are passed through reboot suppression.

    Args:
        context: Detector context providing ``logs``, ``config`` and
            ``data_window``.

    Returns:
        One MEDIUM finding per surviving anomalous row plus one INFO finding
        per synthetic reboot record, ordered by ascending event time. Entries
        without a timestamp sort last. An empty list is returned when no
        usable frame is present.
    """
    candidates = (context.logs.get("*.log*"), context.logs.get("syslog*.log*"))
    frames = [frame for frame in candidates if frame is not None and not frame.empty]
    if not frames:
        return []
    if len(frames) == 1:
        df = frames[0]
    else:
        df = pd.concat(frames, ignore_index=True)

    cfg = context.config

    def setting(name: str) -> Any:
        # Per-run override when present, module default otherwise.
        return cfg.get(name, DEFAULT_CONFIG[name])

    sim_thresh = setting("sim_thresh")
    depth = setting("depth")
    parametrize = setting("parametrize_numeric")
    rarity_pct = setting("rarity_pct")
    max_count = setting("max_count")
    suppress_window = setting("reboot_suppress_window")

    df = _run_drain3(df, sim_thresh, depth, parametrize)
    df, threshold, freq = _score_rarity(df, rarity_pct, max_count)
    reboots = _detect_reboots(df)
    anomaly_df = df[df["is_anomaly"]].copy()
    kept_df, synthetic_records = _apply_suppression(anomaly_df, reboots, suppress_window)

    generated_at = datetime.now(timezone.utc)
    keyed_findings: list[tuple[float, Finding]] = []

    # Rare-template findings: one per anomalous row that survived suppression.
    for row in kept_df.itertuples():
        sort_key = float("inf") if pd.isna(row.ts) else float(row.ts)
        template_id = int(row.template_id)
        evidence = {
            "host": row.host,
            "template_id": template_id,
            "template_str": row.template_str,
            "count": int(freq[template_id]),
            "threshold": int(threshold),
        }
        finding = Finding(
            detector=DETECTOR_NAME,
            severity=Severity.MEDIUM,
            title=str(row.raw)[:180],
            description="Rare log template observed at or below rarity threshold",
            evidence=evidence,
            next_steps=[
                "Review surrounding log context for this host",
                "Check if template appears in recent incidents",
            ],
            ts_generated=generated_at,
            data_window=context.data_window,
        )
        keyed_findings.append((sort_key, finding))

    # Reboot annotations: one per synthetic record from the suppression pass.
    for record in synthetic_records:
        reboot_ts = record["ts"]
        if reboot_ts is None:
            sort_key = float("inf")
        else:
            sort_key = reboot_ts.timestamp()
        evidence = {
            "host": record["host"],
            "reboot_ts": None if reboot_ts is None else reboot_ts.isoformat(),
            "suppressed_window_seconds": suppress_window,
        }
        finding = Finding(
            detector=DETECTOR_NAME,
            severity=Severity.INFO,
            title=record["raw"],
            description="Reboot detected — anomalous events within suppression window are excluded",
            evidence=evidence,
            next_steps=["Review system logs around the reboot time for pre-reboot anomalies"],
            ts_generated=generated_at,
            data_window=context.data_window,
        )
        keyed_findings.append((sort_key, finding))

    # Sort on the time key only; Finding objects are never compared.
    keyed_findings.sort(key=lambda pair: pair[0])
    return [finding for _, finding in keyed_findings]
136
+
137
+
138
def _run_drain3(
    df: pd.DataFrame,
    sim_thresh: float,
    depth: int,
    parametrize_numeric: bool,
) -> pd.DataFrame:
    """Add template_id and template_str columns via drain3 log templating.

    Each value of ``df["message"]`` is stringified and fed, in row order, to a
    fresh drain3 ``TemplateMiner``; the ``cluster_id`` and ``template_mined``
    entries of each result become that row's ``template_id`` and
    ``template_str``.

    Args:
        df: Frame with a ``message`` column. It is not modified.
        sim_thresh: Assigned to the miner config's ``drain_sim_th``.
        depth: Assigned to the miner config's ``drain_depth``.
        parametrize_numeric: Assigned to the miner config's
            ``parametrize_numeric_tokens``.

    Returns:
        A copy of ``df`` with the two template columns added.

    Raises:
        ImportError: If drain3 is not installed. The original import failure
            is attached as ``__cause__``.
    """
    try:
        from drain3 import TemplateMiner
        from drain3.template_miner_config import TemplateMinerConfig
    except ImportError as exc:
        # Chain explicitly so the underlying import failure stays visible.
        raise ImportError(
            "drain3 is required for syslog detection. Run: pip install drain3"
        ) from exc

    cfg = TemplateMinerConfig()
    cfg.drain_sim_th = sim_thresh
    cfg.drain_depth = depth
    cfg.parametrize_numeric_tokens = parametrize_numeric

    miner = TemplateMiner(config=cfg)
    template_ids: list[int] = []
    template_strs: list[str] = []

    # leave=True + clean bar_format makes this the liveness narration for the
    # syslog detector phase (the runner deliberately skips its outer spinner
    # for syslog so the two writers don't fight for the same stderr line).
    for msg in tqdm(
        df["message"],
        desc="syslog: mining templates",
        unit=" lines",
        unit_scale=True,
        leave=True,
        bar_format="{desc}: {n_fmt} lines [{elapsed}]",
    ):
        result = miner.add_log_message(str(msg))
        template_ids.append(result["cluster_id"])
        # NOTE(review): this is the template as mined when the row was added;
        # presumably later rows can generalise the cluster further — confirm
        # whether a post-pass refresh of template_str is wanted.
        template_strs.append(result["template_mined"])

    df = df.copy()
    df["template_id"] = template_ids
    df["template_str"] = template_strs
    return df
181
+
182
+
183
def _score_rarity(
    df: pd.DataFrame,
    rarity_pct: int,
    max_count: int,
) -> tuple[pd.DataFrame, int, dict[int, int]]:
    """Add is_anomaly column; return (df, effective_threshold, freq_dict).

    A template is rare when its occurrence count is at or below
    ``min(percentile_count, max_count)``, where ``percentile_count`` is the
    count found at the ``rarity_pct`` position of the ascending list of
    per-template counts.

    Args:
        df: Frame with a ``template_id`` column. It is not modified.
        rarity_pct: Percentile (0-100) used to pick the percentile count.
            Out-of-range values are clamped to the first/last position.
        max_count: Upper bound on the effective threshold.

    Returns:
        ``(scored_df, threshold, freq)`` where ``scored_df`` is a copy of
        ``df`` with a boolean ``is_anomaly`` column, ``threshold`` is the
        effective count threshold and ``freq`` maps template id to count.
        When there are no templates to count, every row is non-anomalous and
        the threshold is 0.
    """
    freq: dict[int, int] = {
        int(tid): int(count)
        for tid, count in df["template_id"].value_counts().items()
    }

    scored = df.copy()

    if not freq:
        # Nothing to rank: avoid indexing into an empty count list.
        scored["is_anomaly"] = pd.Series(False, index=scored.index, dtype=bool)
        return scored, 0, freq

    sorted_counts = sorted(freq.values())
    idx = int(len(sorted_counts) * rarity_pct / 100) - 1
    # Clamp both ends so a tiny or >100 percentile cannot index out of range.
    idx = min(max(idx, 0), len(sorted_counts) - 1)
    threshold = min(sorted_counts[idx], max_count)

    rare_ids = {tid for tid, count in freq.items() if count <= threshold}
    scored["is_anomaly"] = scored["template_id"].isin(rare_ids)
    return scored, threshold, freq
203
+
204
+
205
def _detect_reboots(df: pd.DataFrame) -> dict[str, list[datetime]]:
    """Return {host: [reboot_datetimes]} by scanning all rows for reboot signals.

    Every row with a non-null ``ts`` whose stringified ``raw`` value satisfies
    ``is_reboot_signal`` contributes one UTC datetime (built from ``ts`` as an
    epoch value) to its host's list. Each host's list is returned in
    ascending order; hosts without a reboot row are absent.
    """
    from loghunter.parsers.syslog import is_reboot_signal

    by_host: dict[str, list[datetime]] = defaultdict(list)

    for record in df.itertuples():
        # Rows without a timestamp are skipped before the signal check.
        if not pd.isna(record.ts) and is_reboot_signal(str(record.raw)):
            when = datetime.fromtimestamp(float(record.ts), tz=timezone.utc)
            by_host[record.host].append(when)

    ordered: dict[str, list[datetime]] = {}
    for host, stamps in by_host.items():
        ordered[host] = sorted(stamps)
    return ordered
219
+
220
+
221
def _apply_suppression(
    anomaly_df: pd.DataFrame,
    reboots: dict[str, list[datetime]],
    suppress_window: int,
) -> tuple[pd.DataFrame, list[dict[str, Any]]]:
    """Suppress anomalous events near reboots; emit one synthetic record per reboot.

    A row is suppressed when its host has a reboot at most ``suppress_window``
    seconds before (or exactly at) the row's timestamp. The first matching
    reboot in the host's list is the one credited. Rows with a missing
    timestamp, or whose host has no recorded reboot, are always kept.

    Args:
        anomaly_df: Anomalous rows with ``host`` and ``ts`` columns.
        reboots: Mapping of host to that host's reboot datetimes.
        suppress_window: Suppression window length in seconds.

    Returns:
        ``(kept_df, synthetic_reboot_records)``. ``kept_df`` holds the rows
        that were not suppressed, in their original order. Each synthetic
        record is a dict with ``ts``, ``host``, ``raw`` and ``synthetic``
        keys; a reboot yields at most one record, and only if it suppressed
        at least one row.
    """
    # Track row positions, not index labels: label-based selection would
    # multiply rows whenever the frame's index contains duplicate labels.
    kept_positions: list[int] = []
    synthetic_records: list[dict[str, Any]] = []
    emitted_reboots: set[tuple[str, datetime]] = set()

    for position, row in enumerate(anomaly_df.itertuples(index=False)):
        host = row.host
        ts = row.ts

        if pd.isna(ts) or host not in reboots:
            kept_positions.append(position)
            continue

        event_dt = datetime.fromtimestamp(float(ts), tz=timezone.utc)
        matched_reboot = next(
            (
                reboot_dt
                for reboot_dt in reboots[host]
                if 0 <= (event_dt - reboot_dt).total_seconds() <= suppress_window
            ),
            None,
        )

        if matched_reboot is None:
            kept_positions.append(position)
            continue

        key = (host, matched_reboot)
        if key not in emitted_reboots:
            emitted_reboots.add(key)
            synthetic_records.append({
                "ts": matched_reboot,
                "host": host,
                "raw": (
                    f"*** {host} rebooted at "
                    f"{matched_reboot.strftime('%a %b %d %H:%M:%S')} ***"
                ),
                "synthetic": True,
            })

    return anomaly_df.iloc[kept_positions], synthetic_records
@@ -0,0 +1,27 @@
1
+ """Weird detector — signals from Zeek weird.log and notice.log. (planned)
2
+
3
+ Surfaces Zeek's own anomaly signals: protocol violations, connection state issues,
4
+ and notice events that indicate suspicious or malformed network behavior.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from loghunter.common.finding import DetectorContext, Finding
10
+
11
+ DETECTOR_NAME = "weird"
12
+ STATUS = "planned"
13
+
14
+ REQUIRED_LOGS = [
15
+ {"source": "zeek_dir", "pattern": "weird*.log*"},
16
+ ]
17
+
18
+ OPTIONAL_LOGS = [
19
+ {"source": "zeek_dir", "pattern": "notice*.log*"},
20
+ ]
21
+
22
+ DEFAULT_CONFIG: dict = {}
23
+
24
+
25
def run(context: DetectorContext) -> list[Finding]:
    """Placeholder entry point for the planned Zeek weird/notice detector.

    No aggregation or scoring logic exists yet; the module advertises itself
    with ``STATUS = "planned"`` and this function fails loudly if invoked.

    Args:
        context: Detector context supplied by the caller (unused).

    Raises:
        NotImplementedError: Always.
    """
    message = "weird detector is planned — not yet implemented"
    raise NotImplementedError(message)
@@ -0,0 +1,43 @@
1
+ """digest verb — orient-before-the-hunt.
2
+
3
+ A digest characterises the dominant shape of a log pile and states facts
4
+ about it. It is a peer verb to detect, not a detector. Digest never produces
5
+ a Finding and never reaches a verdict.
6
+
7
+ One summariser module per schema. Each summariser is a function
8
+ ``summarize(frame) -> dict`` returning the schema-specific body of a
9
+ DigestCard. The dispatcher below imports the right module by schema name; to
10
+ add a new schema, drop a new module beside conn.py and nothing else changes.
11
+
12
+ Architectural rail: digest consumes the loaded frame BEFORE the allowlist
13
+ filtering seam. Allowlisted infrastructure (resolvers, pollers) is part of
14
+ what's in the pile and stays on the sonar. The digest call graph must not
15
+ touch build_matcher or AllowlistMatcher.filter_df.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from importlib import import_module
21
+ from typing import Callable
22
+
23
+ import pandas as pd
24
+
25
+
26
def get_summarizer(schema: str) -> Callable[..., dict]:
    """Return the summarize() function for a given digest schema.

    The dispatcher returns the bare callable; per-schema signatures may
    differ. Today: ``conn`` and ``cloudtrail`` take ``(frame)``;
    ``dns`` takes ``(frame, feed)`` where feed is ``"zeek"`` or
    ``"pihole"``; ``syslog`` takes ``(frame, feed)`` where feed is
    ``"zeek"`` or ``"syslog"`` (flat rsyslog). Callers (currently only
    ``run_digest``) know how to invoke the right signature per schema.

    Args:
        schema: Name of the module under ``loghunter.digest`` to load.

    Returns:
        The module's ``summarize`` callable.

    Raises:
        ValueError: When no summariser exists for the requested schema —
            either the module cannot be found, or it imports but exposes no
            callable ``summarize`` (e.g. a shared helper module).
    """
    try:
        module = import_module(f"loghunter.digest.{schema}")
    except ModuleNotFoundError as exc:
        raise ValueError(f"digest: no summarizer for schema {schema!r}") from exc

    # A module that imports fine but is not a summariser must surface as the
    # documented ValueError, not as a bare AttributeError.
    summarize = getattr(module, "summarize", None)
    if not callable(summarize):
        raise ValueError(f"digest: no summarizer for schema {schema!r}")
    return summarize
@@ -0,0 +1,182 @@
1
+ """Shared digest stats — the home for primitives that more than one card needs.
2
+
3
+ Hosts:
4
+ - ``_rate`` / ``RATE_FLOOR`` — fraction-of-events-matching-a-kind statistic
5
+ with top-contributor attribution
6
+ - ``_share`` / ``SHARE_GATE`` — concentration-against-total statistic with no
7
+ population floor
8
+ - ``select_insights_and_fields`` — shared selection helper that promotes the
9
+ top-N speaking gated slots to insights and
10
+ returns the leftover slots as fields
11
+
12
+ Cliff machinery (``_cliff``, ``CLIFF_GATE``, ``CLIFF_DISPLAY_CAP``,
13
+ ``POPULATION_FLOOR``, ``_format_ratio_cell``, ``_format_ratio_lede``) lives in
14
+ ``loghunter.digest.conn`` and stays there — that is the established shared
15
+ origin and every card already imports cliff helpers from it.
16
+
17
+ The trigger for factoring into this module is "three identical real uses"
18
+ (``_rate``, now imported by dns + syslog + cloudtrail) or "shared by the new
19
+ statistic by design" (``_share`` introduced by the cloudtrail source-ip slot
20
+ and reusable by any future concentration-against-total slot). Tail
21
+ (``_tail`` / ``TAIL_GATE``) stays local to dns.py — one-use primitives do not
22
+ belong here yet.
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ from typing import Any, Callable
28
+
29
+ import pandas as pd
30
+
31
+ from loghunter.common.finding import DigestSlot
32
+ from loghunter.digest.conn import POPULATION_FLOOR
33
+
34
+
35
+ # ── Calibration constants ───────────────────────────────────────────────────
36
+
37
+ RATE_FLOOR = 0.01 # fraction below this → rate slots dash. Pure presence
38
+ # floor, NOT a badness threshold — meaning is the same on
39
+ # every network. Calibratable here.
40
+
41
+ SHARE_GATE = 0.80 # top-share at or above this → share slots speak. The
42
+ # share statistic exists to surface concentration; this
43
+ # gate is the concentration threshold. There is no
44
+ # paired population floor — see ``_share`` below.
45
+
46
+
47
+ # ── Rate statistic ──────────────────────────────────────────────────────────
48
+ #
49
+ # Behavior must NOT change from the previous in-card definitions — that is the
50
+ # proof-of-correctness for the factoring. Identical body to the three card
51
+ # copies it replaces.
52
+
53
def _rate(kind_mask: pd.Series, contributor_series: pd.Series) -> tuple | None:
    """Rate statistic: the fraction of events that match a notable kind.

    Args:
        kind_mask: Mask selecting the events of the notable kind; its length
            is the event total.
        contributor_series: Per-event contributor values, indexed with
            ``kind_mask`` to find who produced the matching events.

    Returns:
        ``(fraction, top_contributor)`` when the slot speaks, where
        ``top_contributor`` is the stringified most frequent non-null
        contributor among matching events. ``None`` (dashed) when the total
        is below ``POPULATION_FLOOR``, nothing matches, the fraction is below
        ``RATE_FLOOR``, or no matching event has a non-null contributor.
    """
    population = len(kind_mask)
    if population < POPULATION_FLOOR:
        return None

    hits = int(kind_mask.sum())
    if hits == 0:
        return None

    fraction = hits / population
    if fraction < RATE_FLOOR:
        return None

    contributors = contributor_series[kind_mask].dropna()
    if contributors.empty:
        return None

    leader = contributors.value_counts().idxmax()
    return fraction, str(leader)
75
+
76
+
77
+ # ── Share statistic ─────────────────────────────────────────────────────────
78
+
79
def _share(sorted_counts: pd.Series, total: int) -> tuple[Any, float] | None:
    """Share statistic: does the rank-1 entity hold a dominant slice of the total?

    There is deliberately NO population floor here: low entity cardinality is
    the signal this statistic surfaces, so a single entity at 100% must
    speak. The only gate is ``SHARE_GATE``.

    Args:
        sorted_counts: Per-entity counts in descending order (the shape a
            ``value_counts`` call produces); the index holds the entities.
        total: Caller-supplied denominator. It is used as given and is not
            recomputed from ``sorted_counts``.

    Returns:
        ``(rank1_entity, top_share)`` when ``top_share`` is not below
        ``SHARE_GATE``. ``None`` (dashed) when the series is empty, ``total``
        is non-positive, the rank-1 count is NaN, or the share is below the
        gate.
    """
    if total <= 0 or len(sorted_counts) == 0:
        return None

    leader_count = sorted_counts.iloc[0]
    if pd.isna(leader_count):
        return None

    leader_share = float(leader_count) / float(total)
    if leader_share < SHARE_GATE:
        return None

    leader = sorted_counts.index[0]
    return leader, leader_share
109
+
110
+
111
+ # ── Insight selection ───────────────────────────────────────────────────────
112
+ #
113
+ # Shared by all four schema summarisers. Identical mechanic across cards:
114
+ # filter speaking gated slots, sort by per-statistic salience, take top-3,
115
+ # format via the per-card formatter map → those become insights. Everything
116
+ # else with cells goes to fields. A promoted slot is suppressed from fields.
117
+
118
+ _INSIGHT_TOP_N = 3
119
+ _GATING_STATISTICS = frozenset({"cliff", "tail", "rate", "share"})
120
+
121
+
122
def _salience(slot: DigestSlot) -> float:
    """Score a slot's salience so different statistics can be ranked together.

    - ``cliff`` / ``tail``: the slot's ``ratio`` is used directly.
    - ``rate``: ``magnitude`` holds a percentage (e.g. 1.0 for 1%), so it is
      divided by ``RATE_FLOOR * 100`` — a rate sitting exactly on the floor
      scores 1.0, comparable to a 1x cliff.
    - ``share``: ``magnitude`` holds a 0–100 percentage and is used raw, so a
      heavily concentrated source outranks typical cliff ratios.
    - anything else: 0.0.

    A missing (falsy) ``ratio`` or ``magnitude`` counts as 0.0.
    """
    statistic = slot.statistic

    if statistic in {"cliff", "tail"}:
        return slot.ratio or 0.0

    if statistic == "rate":
        percentage = slot.magnitude or 0.0
        return percentage / (RATE_FLOOR * 100.0)

    return (slot.magnitude or 0.0) if statistic == "share" else 0.0
140
+
141
+
142
def select_insights_and_fields(
    slots: list[DigestSlot],
    formatters: dict[str, Callable[[DigestSlot], str]],
) -> tuple[list[str], list[DigestSlot]]:
    """Split a card's slots into formatted insights and leftover fields.

    Speaking slots (``cells is not None``) whose statistic is in the gating
    set are ranked by salience, highest first. The top ``_INSIGHT_TOP_N`` of
    them are each offered to the formatter registered under the slot's label;
    a slot becomes an insight only when such a formatter exists.

    Suppression rule: a slot is dropped from ``fields`` ONLY when it actually
    produced an insight. A top-ranked gating slot without a formatter falls
    through to fields, so each fact appears exactly once. Non-gating slots
    never become insights and pass straight through when they have cells.
    Non-speaking slots appear in neither output.

    Suppression is by label, which relies on labels being unique within a
    card.

    Args:
        slots: All slots of one card, in display order.
        formatters: Map of slot label to the function rendering its insight.

    Returns:
        ``(insights, fields)``: insight strings in salience order, and the
        remaining speaking slots in their original order.
    """
    speaking = [slot for slot in slots if slot.cells is not None]

    ranked = sorted(
        (slot for slot in speaking if slot.statistic in _GATING_STATISTICS),
        key=_salience,
        reverse=True,
    )

    insights: list[str] = []
    promoted_labels: set[str] = set()
    for slot in ranked[:_INSIGHT_TOP_N]:
        render = formatters.get(slot.label)
        if render is not None:
            insights.append(render(slot))
            promoted_labels.add(slot.label)

    fields = [slot for slot in speaking if slot.label not in promoted_labels]
    return insights, fields