loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,341 @@
1
+ """cloudtrail summariser — orient-before-the-hunt for CloudTrail data.
2
+
3
+ Six fixed slots, two of which are scoped to the interactive lane only:
4
+
5
+ - lane-split — dist — interactive vs service share of the WHOLE pile
6
+ (HEADLINE orient; renders first; never produces
7
+ an insight)
8
+ - principal-vol — cliff — INTERACTIVE ONLY: largest share of interactive events
9
+ - event-source — cliff — busiest AWS service across the whole pile
10
+ - source-ip — share — INTERACTIVE ONLY: concentration of one source IP
11
+ against interactive total. NO population floor —
12
+ single-IP-dominates is the SIGNAL this slot was
13
+ introduced to surface, and that case inherently
14
+ has few distinct IPs. Gated at SHARE_GATE only.
15
+ - region — dist — top-3 aws_region share across the whole pile
16
+ (always shows; never produces an insight)
17
+ - error-rate — rate — fraction of events that errored
18
+ (error_code non-null); names the top error CODE
19
+
20
+ Lane scoping is the one structural wrinkle on this card. principal-vol and
21
+ source-ip read the interactive subset only; lane-split, event-source, region,
22
+ and error-rate read the whole frame. The aws detector takes the same
23
+ interactive-first discipline; we read aws.py for understanding but do NOT
24
+ import from it — same no-cross-import rail dns/syslog follow with their
25
+ detectors.
26
+
27
+ Cliff machinery is imported from conn so the cards cannot drift on gate /
28
+ floor / display-cap behaviour. Rate (and its RATE_FLOOR) and share (and its
29
+ SHARE_GATE) live in ``loghunter.digest._stats`` — the shared stats module
30
+ factored once three cards needed an identical rate (dns + syslog +
31
+ cloudtrail) and once a second statistic without a sibling needed its
32
+ canonical home.
33
+
34
+ Dist slots (lane-split, region) never contribute an insight — ambient
35
+ orientation, not a standout, same rule as dns's qtype-mix. On a quiet
36
+ account every gating slot stays non-speaking and vanishes from ``fields``;
37
+ the card carries only the two dist slots — that IS the honest digest of a
38
+ quiet pile.
39
+ """
40
+
41
+ from __future__ import annotations
42
+
43
+ import pandas as pd
44
+
45
+ from loghunter.common.finding import DigestSlot
46
+ from loghunter.digest._stats import RATE_FLOOR, SHARE_GATE, _rate, _share
47
+ from loghunter.digest.conn import (
48
+ CLIFF_DISPLAY_CAP, # noqa: F401 — re-exported for downstream symmetry
49
+ CLIFF_GATE, # noqa: F401 — re-exported for downstream symmetry
50
+ POPULATION_FLOOR, # noqa: F401 — cliff slots in this card use it via _cliff
51
+ _cliff,
52
+ _format_ratio_cell,
53
+ _format_ratio_lede,
54
+ )
55
+
56
+
57
+ # ── dist helpers — local, no shared base ────────────────────────────────────
58
+
59
+ def _lane_split_dist(lane_series: pd.Series | None) -> str:
60
+ """Render the lane-split binary share for the lane-split dist slot.
61
+
62
+ Two distinct fallbacks (consistency with dns.qtype-mix):
63
+ - Missing column (lane_series is None) → "(no lane)" (schema-presence fact)
64
+ - Empty / all-NaN series → "(no events)" (data-shape fact)
65
+ Otherwise: ``"interactive N% / service M%"``. Any non-interactive label
66
+ counts toward the service share — the parser's derivation is "default
67
+ interactive, escalate to service when service-marked," and any unknown
68
+ label is closer to service than to interactive.
69
+ """
70
+ if lane_series is None:
71
+ return "(no lane)"
72
+ labels = lane_series.dropna()
73
+ if labels.empty:
74
+ return "(no events)"
75
+ total = int(len(labels))
76
+ interactive_count = int((labels == "interactive").sum())
77
+ service_count = total - interactive_count
78
+ interactive_pct = int(round(interactive_count / total * 100))
79
+ service_pct = int(round(service_count / total * 100))
80
+ return f"interactive {interactive_pct}% / service {service_pct}%"
81
+
82
+
83
+ def _region_dist(region_series: pd.Series | None) -> str:
84
+ """Render top-3 region share string for the region dist slot.
85
+
86
+ Two distinct fallbacks (consistency with dns.qtype-mix):
87
+ - Missing column (region_series is None) → "(no region)" (schema-presence fact)
88
+ - Empty / all-NaN series → "(no events)" (data-shape fact)
89
+ Single-region pile → "us-east-1 100%". Mix → top-3 joined by " · ".
90
+ """
91
+ if region_series is None:
92
+ return "(no region)"
93
+ labels = region_series.dropna().astype(str)
94
+ if labels.empty:
95
+ return "(no events)"
96
+ counts = labels.value_counts()
97
+ total = int(counts.sum())
98
+ top_three = counts.head(3)
99
+ parts = [
100
+ f"{label} {int(round(count / total * 100))}%"
101
+ for label, count in top_three.items()
102
+ ]
103
+ return " · ".join(parts)
104
+
105
+
106
+ # ── Slot computers ──────────────────────────────────────────────────────────
107
+
108
def _slot_lane_split(frame: pd.DataFrame) -> DigestSlot:
    """lane-split — dist slot over the ``lane`` column of the whole pile.

    Always returns a slot with exactly one cell: the rendered split, or one
    of the ``_lane_split_dist`` fallback strings when the column is missing
    or holds no labels.
    """
    if "lane" in frame.columns:
        lane_column = frame["lane"]
    else:
        lane_column = None
    return DigestSlot(
        label="lane-split",
        statistic="dist",
        cells=[_lane_split_dist(lane_column)],
    )
114
+
115
+
116
def _slot_principal_vol(frame_interactive: pd.DataFrame) -> DigestSlot:
    """principal-vol — cliff over per-principal counts, interactive lane only.

    The share denominator is the number of interactive rows, not the whole
    pile. When the frame is empty, has no ``principal`` column, or
    ``_cliff`` declines to name a standout (returns ``None``), the slot is
    returned bare (label + statistic only) — that is the spec for a pile
    with ≈balanced interactive principals.

    A speaking slot carries three cells: the principal, its share of
    interactive events as a whole percentage, and the formatted rank-1 /
    rank-2 ratio.
    """
    label = "principal-vol"
    cliff = None
    if not frame_interactive.empty and "principal" in frame_interactive.columns:
        per_principal = frame_interactive["principal"].value_counts(dropna=True)
        cliff = _cliff(per_principal.sort_values(ascending=False))
    if cliff is None:
        return DigestSlot(label=label, statistic="cliff")

    top_principal, top_count, ratio = cliff
    interactive_total = int(len(frame_interactive))
    if interactive_total > 0:
        share_pct = top_count / interactive_total * 100.0
    else:
        share_pct = 0.0
    name = str(top_principal)
    cells = [name, f"{share_pct:.0f}%", _format_ratio_cell(ratio)]
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=cells,
        entity=name,
        magnitude=share_pct,
        ratio=ratio,
    )
147
+
148
+
149
def _slot_event_source(frame: pd.DataFrame) -> DigestSlot:
    """event-source — cliff over per-service event counts, whole pile.

    Returns a bare slot (label + statistic only) when the frame is empty,
    the ``event_source`` column is absent, or ``_cliff`` returns ``None``.
    Otherwise the slot's cells are: service name, its raw event count
    (truncated to an integer for display), and the formatted ratio.
    """
    label = "event-source"
    cliff = None
    if not frame.empty and "event_source" in frame.columns:
        per_service = frame["event_source"].value_counts(dropna=True)
        cliff = _cliff(per_service.sort_values(ascending=False))
    if cliff is None:
        return DigestSlot(label=label, statistic="cliff")

    top_service, event_count, ratio = cliff
    name = str(top_service)
    cells = [name, str(int(event_count)), _format_ratio_cell(ratio)]
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=cells,
        entity=name,
        magnitude=event_count,
        ratio=ratio,
    )
168
+
169
+
170
def _slot_source_ip(frame_interactive: pd.DataFrame) -> DigestSlot:
    """source-ip — share of one source IP against the interactive total.

    Concentration-against-total, NOT rank-dominance: the question is "is
    interactive traffic concentrated in one source," which share-of-total
    answers and a rank1/rank2 ratio does not. The case this slot exists
    for (a single attacker IP) inherently yields a low-cardinality
    distribution, so cliff's POPULATION_FLOOR would suppress exactly that
    signal; the share statistic is used instead and has no population
    floor.

    Interactive-scoped because service-lane ``source_ip`` is frequently a
    service hostname (e.g. "s3.amazonaws.com") rather than an address —
    that string would dominate a whole-pile share and manufacture a
    meaningless "standout".

    Cell vs entity split: the table cell leads with the literal "1 IP" so
    the concentration is legible at a glance, while ``entity`` carries the
    actual address for the lede. Two cells, not three — a share statistic
    has no rank-2 ratio.

    Returns a bare slot when the frame is empty, the column is missing, or
    ``_share`` returns ``None``.
    """
    label = "source-ip"
    outcome = None
    if not frame_interactive.empty and "source_ip" in frame_interactive.columns:
        per_ip = frame_interactive["source_ip"].value_counts(dropna=True)
        per_ip = per_ip.sort_values(ascending=False)
        interactive_total = int(len(frame_interactive))
        outcome = _share(per_ip, interactive_total)
    if outcome is None:
        return DigestSlot(label=label, statistic="share")

    top_ip, fraction = outcome
    share_pct = fraction * 100.0
    return DigestSlot(
        label=label,
        statistic="share",
        cells=["1 IP", f"{share_pct:.0f}% of interactive"],
        entity=str(top_ip),
        magnitude=share_pct,
    )
214
+
215
+
216
def _slot_region(frame: pd.DataFrame) -> DigestSlot:
    """region — dist slot over ``aws_region`` across the whole pile.

    Always returns a slot with exactly one cell: the top-3 share string, or
    one of the ``_region_dist`` fallback strings when the column is missing
    or holds no values.
    """
    if "aws_region" in frame.columns:
        region_column = frame["aws_region"]
    else:
        region_column = None
    return DigestSlot(
        label="region",
        statistic="dist",
        cells=[_region_dist(region_column)],
    )
222
+
223
+
224
def _slot_error_rate(frame: pd.DataFrame) -> DigestSlot:
    """error-rate — rate of events whose ``error_code`` is non-null.

    Kind definition is ``error_code.notna()``: the parser emits None on
    success, so a non-null value means the call errored. The named top
    contributor is the most common error CODE among errored events — not
    a principal.

    Literal ``notna()`` semantics apply: None / NaN rows read as clean,
    while an empty string reads as errored (the parser does not emit ""
    on success, so this is a no-op in practice but pinned by tests).

    Returns a bare slot when the frame is empty, the column is missing, or
    ``_rate`` returns ``None``; otherwise two cells — the whole-number
    percentage and the top error code.
    """
    label = "error-rate"
    outcome = None
    if not frame.empty and "error_code" in frame.columns:
        codes = frame["error_code"]
        outcome = _rate(codes.notna(), codes)
    if outcome is None:
        return DigestSlot(label=label, statistic="rate")

    errored_fraction, top_code = outcome
    pct = errored_fraction * 100.0
    return DigestSlot(
        label=label,
        statistic="rate",
        cells=[f"{pct:.0f}%", top_code],
        entity=top_code,
        magnitude=pct,
    )
252
+
253
+
254
+ # ── Lede formatters ─────────────────────────────────────────────────────────
255
+
256
def _lede_principal_vol(slot: DigestSlot) -> str:
    """Lede sentence for a speaking principal-vol slot.

    Reads ``entity`` (the principal), ``magnitude`` (its percentage of
    interactive events) and ``ratio`` (rendered via ``_format_ratio_lede``).
    """
    share = format(slot.magnitude, ".0f")
    multiple = _format_ratio_lede(slot.ratio)
    return (
        f"{slot.entity} drove {share}% of interactive events, "
        f"{multiple} the next principal."
    )
261
+
262
+
263
def _lede_event_source(slot: DigestSlot) -> str:
    """Lede sentence for a speaking event-source slot.

    Reads ``entity`` (the service), ``magnitude`` (its raw event count,
    truncated to an integer) and ``ratio`` (rendered via
    ``_format_ratio_lede``).
    """
    event_count = int(slot.magnitude)
    multiple = _format_ratio_lede(slot.ratio)
    return (
        f"{slot.entity} accounted for {event_count} events, "
        f"{multiple} the next service."
    )
268
+
269
+
270
+ def _lede_source_ip(slot: DigestSlot) -> str:
271
+ # Share statistic — no rank-2 ratio, so no "Nx the next" clause.
272
+ return (
273
+ f"{slot.entity} is the source of {slot.magnitude:.0f}% of "
274
+ f"interactive events."
275
+ )
276
+
277
+
278
+ def _lede_error_rate(slot: DigestSlot) -> str:
279
+ return (
280
+ f"{slot.magnitude:.0f}% of events errored, "
281
+ f"most commonly {slot.entity}."
282
+ )
283
+
284
+
285
# Slot label → lede formatter. Only the four gating slots appear here; the
# two dist slots (lane-split, region) have no entry because they never
# contribute an insight.
_INSIGHT_FORMATTERS = {
    "principal-vol": _lede_principal_vol,
    "event-source": _lede_event_source,
    "source-ip": _lede_source_ip,
    "error-rate": _lede_error_rate,
}
291
+
292
+
293
+ # ── Zone 1 extras ───────────────────────────────────────────────────────────
294
+
295
+ def _zone1_extras(frame: pd.DataFrame) -> list[tuple[str, str]]:
296
+ """Two lines, brief-pinned: distinct principals + distinct event sources."""
297
+ if frame.empty:
298
+ return [("principals", "0"), ("event sources", "0")]
299
+ distinct_principals = (
300
+ int(frame["principal"].nunique(dropna=True))
301
+ if "principal" in frame.columns else 0
302
+ )
303
+ distinct_sources = (
304
+ int(frame["event_source"].nunique(dropna=True))
305
+ if "event_source" in frame.columns else 0
306
+ )
307
+ return [
308
+ ("principals", str(distinct_principals)),
309
+ ("event sources", str(distinct_sources)),
310
+ ]
311
+
312
+
313
+ # ── Public entry point ─────────────────────────────────────────────────────
314
+
315
def summarize(frame: pd.DataFrame) -> dict:
    """Return the schema-specific body of a cloudtrail DigestCard.

    The interactive subset is computed once, up front, so that both
    interactive-scoped slots (principal-vol, source-ip) see the same view
    of the data. Without a ``lane`` column that subset is a zero-row slice
    of ``frame``.

    Returns a dict with three keys: ``"zone1_extras"``, ``"insights"`` and
    ``"fields"`` — the latter two as produced by
    ``select_insights_and_fields`` from the six slots, in card order.
    """
    from loghunter.digest._stats import select_insights_and_fields

    interactive_rows = (
        frame[frame["lane"] == "interactive"]
        if "lane" in frame.columns
        else frame.iloc[0:0]
    )

    # Card order is fixed: lane-split leads, the remaining slots follow.
    slots = []
    slots.append(_slot_lane_split(frame))
    slots.append(_slot_principal_vol(interactive_rows))
    slots.append(_slot_event_source(frame))
    slots.append(_slot_source_ip(interactive_rows))
    slots.append(_slot_region(frame))
    slots.append(_slot_error_rate(frame))

    insights, fields = select_insights_and_fields(slots, _INSIGHT_FORMATTERS)
    body = {"zone1_extras": _zone1_extras(frame)}
    body["insights"] = insights
    body["fields"] = fields
    return body