loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,367 @@
1
+ """conn summariser — orient-before-the-hunt for Zeek conn data.
2
+
3
+ Reads a normalised conn frame (canonical columns ``src, dst, port, proto, ts,
4
+ bytes, conn_state, local_orig``) and returns the schema-specific body of a
5
+ DigestCard: ``zone1_extras`` (the ambient label/value block), ``insights``
6
+ (prose sentences mechanically derived from speaking gated slots), and
7
+ ``fields`` (the display-ready, already-filtered speaking non-insight slots).
8
+
9
+ All four conn slots use the ``cliff`` statistic: rank1 / rank2 over the sorted
10
+ entity counts. A slot is non-speaking when the population is below
11
+ ``POPULATION_FLOOR`` or when the ratio is below ``CLIFF_GATE``; non-speaking
12
+ slots are filtered out of ``fields`` by ``select_insights_and_fields`` and
13
+ never reach the renderer.
14
+
15
+ Internal/external classification is computed locally; the scan detector's
16
+ home_net is intentionally not imported.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import ipaddress
22
+ from typing import Any
23
+
24
+ import pandas as pd
25
+
26
+ from loghunter.common.finding import DigestSlot
27
+
28
+
29
# ── Calibration constants — provisional, tunable in one place ────────────────

# Minimum rank1 / rank2 ratio a slot must reach to speak; ``_cliff`` returns
# None for any ratio below this value.
CLIFF_GATE = 2.0
# Minimum number of ranked entities a slot needs; ``_cliff`` returns None for
# any series shorter than this.
POPULATION_FLOOR = 5
# Display-only ceiling for rendered cliff ratios. Above this, "625000.0x" and
# "60x" tell the reader the same thing (one entity utterly dominates), so the
# extra magnitude is noise. We cap the RENDERED string at >50x / "more than
# 50x"; slot.ratio continues to carry the true float so lede sort ordering
# still respects the real value.
CLIFF_DISPLAY_CAP = 50.0
39
+
40
+ _RFC1918_NETWORKS = (
41
+ ipaddress.ip_network("10.0.0.0/8"),
42
+ ipaddress.ip_network("172.16.0.0/12"),
43
+ ipaddress.ip_network("192.168.0.0/16"),
44
+ )
45
+
46
+
47
+ # ── Internal/external classifier ─────────────────────────────────────────────
48
+
49
+ def _is_internal(ip: object) -> bool:
50
+ """Return True iff ip is a string parsable as an RFC1918 address."""
51
+ if not isinstance(ip, str) or not ip:
52
+ return False
53
+ try:
54
+ addr = ipaddress.ip_address(ip)
55
+ except ValueError:
56
+ return False
57
+ return any(addr in net for net in _RFC1918_NETWORKS)
58
+
59
+
60
def _origin_internal_series(frame: pd.DataFrame) -> pd.Series:
    """Classify each row's originator as internal (True) or external (False).

    This is "Rule B": where the frame carries a ``local_orig`` column, its
    non-null values decide the row. Rows whose ``local_orig`` is null, and
    every row when the column is absent, fall back to "Rule A" applied to
    ``src`` — RFC1918 membership as judged by ``_is_internal``.

    The result is cast to bool and shares the frame's index.
    """
    rfc1918_src = frame["src"].map(_is_internal)
    if "local_orig" in frame.columns:
        flag = frame["local_orig"]
        # Keep the explicit flag where present; fill the gaps from Rule A.
        return flag.where(flag.notna(), rfc1918_src).astype(bool)
    return rfc1918_src.astype(bool)
73
+
74
+
75
+ # ── Cliff ratio display formatting ───────────────────────────────────────────
76
+
77
def _format_ratio_cell(ratio: float) -> str:
    """Render a cliff ratio for the compact Zone-3 table cell.

    Ratios at or above ``CLIFF_DISPLAY_CAP`` collapse to ``">50x"`` (using
    the cap's integer part); everything else is shown to one decimal place.
    """
    capped = ratio >= CLIFF_DISPLAY_CAP
    return f">{int(CLIFF_DISPLAY_CAP)}x" if capped else f"{ratio:.1f}x"
82
+
83
+
84
def _format_ratio_lede(ratio: float) -> str:
    """Render a cliff ratio as the comparator phrase of a Zone-2 lede.

    Only the comparator itself is produced (e.g. ``"3.7x"`` or
    ``"more than 50x"``); the words that follow it — "the next destination",
    "its nearest peer", and so on — are supplied by the per-slot lede
    formatter. Ratios at or above ``CLIFF_DISPLAY_CAP`` are capped.
    """
    capped = ratio >= CLIFF_DISPLAY_CAP
    return f"more than {int(CLIFF_DISPLAY_CAP)}x" if capped else f"{ratio:.1f}x"
94
+
95
+
96
+ # ── Cliff statistic ──────────────────────────────────────────────────────────
97
+
98
def _cliff(sorted_counts: pd.Series) -> tuple[Any, float, float] | None:
    """Evaluate the cliff slot over a descending series of entity magnitudes.

    The caller is responsible for sorting ``sorted_counts`` in descending
    order; this function reads rank 1 and rank 2 positionally.

    Returns ``(rank1_entity, rank1_magnitude, ratio)`` when the slot speaks,
    where ``rank1_entity`` is the index label of the first entry and
    ``ratio`` is rank1 / rank2.

    Returns None (the slot dashes) when:
      * the population is below ``POPULATION_FLOOR`` (and never fewer than
        two entries, since a ratio needs a runner-up),
      * rank2 is zero or NaN, or
      * the rank1 / rank2 ratio is below ``CLIFF_GATE``.
    """
    # rank2 is read positionally below, so two entries are the hard minimum
    # even if POPULATION_FLOOR is ever tuned below 2; without this guard a
    # one-entity series would raise IndexError instead of dashing.
    if len(sorted_counts) < max(POPULATION_FLOOR, 2):
        return None
    rank1 = sorted_counts.iloc[0]
    rank2 = sorted_counts.iloc[1]
    # A zero or missing runner-up has no meaningful ratio.
    if pd.isna(rank2) or rank2 == 0:
        return None
    ratio = float(rank1) / float(rank2)
    if ratio < CLIFF_GATE:
        return None
    return sorted_counts.index[0], float(rank1), ratio
116
+
117
+
118
+ # ── Slot computations ────────────────────────────────────────────────────────
119
+
120
def _slot_conn_share(frame: pd.DataFrame) -> DigestSlot:
    """conn-share: the host involved in the largest share of connections.

    A host is "involved" in a row when it appears as ``src`` or as ``dst``,
    so every row feeds two hosts' tallies. A row whose ``src`` equals its
    ``dst`` is counted only once for that host. The reported magnitude is
    the top host's involvement as a percentage of all rows.

    Returns a bare (non-speaking) slot when the frame is empty or the cliff
    statistic does not speak.
    """
    label = "conn-share"

    verdict = None
    if not frame.empty:
        as_src = frame["src"].value_counts(dropna=False)
        as_dst = frame["dst"].value_counts(dropna=False)
        # Rows talking to themselves appear in both tallies; subtract one
        # copy so they count once.
        self_talk = frame["src"] == frame["dst"]
        doubled = frame.loc[self_talk, "src"].value_counts(dropna=False)
        per_host = as_src.add(as_dst, fill_value=0).sub(doubled, fill_value=0)
        verdict = _cliff(per_host.sort_values(ascending=False))

    if verdict is None:
        return DigestSlot(label=label, statistic="cliff")

    top_host, top_rows, ratio = verdict
    row_total = len(frame)
    if row_total > 0:
        share_pct = top_rows / row_total * 100.0
    else:
        share_pct = 0.0
    host = str(top_host)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[host, f"{share_pct:.0f}%", _format_ratio_cell(ratio)],
        entity=host,
        magnitude=share_pct,
        ratio=ratio,
    )
153
+
154
+
155
def _slot_densest_tuple(frame: pd.DataFrame) -> DigestSlot:
    """densest-tuple: the single busiest (src, dst, port) flow.

    Rows are grouped on ``src``, ``dst`` and ``port`` only; ``proto`` is
    deliberately left out of the key. The flow is rendered as
    ``src → dst:port``, with ``?`` standing in for a missing port.

    Returns a bare (non-speaking) slot when the frame is empty or the cliff
    statistic does not speak.
    """
    label = "densest-tuple"
    if frame.empty:
        return DigestSlot(label=label, statistic="cliff")

    per_flow = frame.groupby(["src", "dst", "port"], dropna=False).size()
    verdict = _cliff(per_flow.sort_values(ascending=False))
    if verdict is None:
        return DigestSlot(label=label, statistic="cliff")

    flow_key, hits, ratio = verdict
    src, dst, port = flow_key
    port_token = "?" if pd.isna(port) else str(int(port))
    flow = f"{src} → {dst}:{port_token}"
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[flow, f"{int(hits)}", _format_ratio_cell(ratio)],
        entity=flow,
        magnitude=hits,
        ratio=ratio,
    )
184
+
185
+
186
def _slot_fan_out(frame: pd.DataFrame) -> DigestSlot:
    """fan-out: the (src, port) pair that reaches the most distinct dsts.

    Rows are grouped on ``src`` and ``port`` and the number of unique
    ``dst`` values in each group is ranked. The entity is rendered as
    ``src:port``, with ``?`` standing in for a missing port.

    Returns a bare (non-speaking) slot when the frame is empty or the cliff
    statistic does not speak.
    """
    label = "fan-out"
    if frame.empty:
        return DigestSlot(label=label, statistic="cliff")

    reach = frame.groupby(["src", "port"], dropna=False)["dst"].nunique()
    verdict = _cliff(reach.sort_values(ascending=False))
    if verdict is None:
        return DigestSlot(label=label, statistic="cliff")

    group_key, dst_count, ratio = verdict
    src, port = group_key
    port_token = "?" if pd.isna(port) else str(int(port))
    src_port = f"{src}:{port_token}"
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[src_port, f"{int(dst_count)} dsts", _format_ratio_cell(ratio)],
        entity=src_port,
        magnitude=dst_count,
        ratio=ratio,
    )
211
+
212
+
213
def _slot_byte_direction(frame: pd.DataFrame) -> DigestSlot:
    """byte-direction: the external dst taking the most outbound bytes.

    A row counts as outbound only when its originator is internal (Rule B,
    via ``_origin_internal_series``) AND its ``dst`` is not internal
    (Rule A, via ``_is_internal``); neither test alone is enough. Missing
    byte values are treated as 0. The reported magnitude is the top
    destination's share of all outbound bytes, as a percentage.

    Returns a bare (non-speaking) slot when the frame is empty, has no
    ``bytes`` column, has no outbound rows, or the cliff statistic does not
    speak.
    """
    label = "byte-direction"
    if frame.empty or "bytes" not in frame.columns:
        return DigestSlot(label=label, statistic="cliff")

    leaving = _origin_internal_series(frame) & ~frame["dst"].map(_is_internal)
    if not leaving.any():
        return DigestSlot(label=label, statistic="cliff")

    egress = frame.loc[leaving]
    egress_bytes = egress["bytes"].fillna(0)
    ranked = egress_bytes.groupby(egress["dst"]).sum().sort_values(ascending=False)
    verdict = _cliff(ranked)
    if verdict is None:
        return DigestSlot(label=label, statistic="cliff")

    top_dst, top_bytes, ratio = verdict
    egress_total = float(egress_bytes.sum())
    if egress_total > 0:
        pct = top_bytes / egress_total * 100.0
    else:
        pct = 0.0
    entity = str(top_dst)
    return DigestSlot(
        label=label,
        statistic="cliff",
        cells=[entity, f"{pct:.0f}%", _format_ratio_cell(ratio)],
        entity=entity,
        magnitude=pct,
        ratio=ratio,
    )
247
+
248
+
249
+ # ── Zone-1 extras ────────────────────────────────────────────────────────────
250
+
251
+ def _format_bytes(n: float) -> str:
252
+ """Format a byte count for the Zone-1 descriptive line."""
253
+ if n < 1024:
254
+ return f"{int(n)} B"
255
+ if n < 1024 ** 2:
256
+ return f"{n / 1024:.1f} KB"
257
+ if n < 1024 ** 3:
258
+ return f"{n / (1024 ** 2):.1f} MB"
259
+ if n < 1024 ** 4:
260
+ return f"{n / (1024 ** 3):.1f} GB"
261
+ return f"{n / (1024 ** 4):.1f} TB"
262
+
263
+
264
def _zone1_extras(frame: pd.DataFrame) -> list[tuple[str, str]]:
    """Build the ambient label/value rows shown on the conn card.

    Three rows are returned, in render order:

    * ``hosts`` — the number of distinct non-empty string values across
      ``src`` and ``dst``, followed in parentheses by the internal/external
      split (RFC1918 membership via ``_is_internal``). For an empty frame
      this is just ``"0"``.
    * ``outbound bytes`` — bytes on rows with an internal originator and a
      non-internal ``dst``.
    * ``inbound bytes`` — bytes on rows with a non-internal originator and
      an internal ``dst``.

    Missing byte values count as 0; a frame without a ``bytes`` column
    reports 0 for both directions.
    """
    if frame.empty:
        return [
            ("hosts", "0"),
            ("outbound bytes", _format_bytes(0)),
            ("inbound bytes", _format_bytes(0)),
        ]

    hosts: set[str] = {
        value
        for column in ("src", "dst")
        for value in frame[column].dropna().tolist()
        if isinstance(value, str) and value
    }
    internal_count = len([host for host in hosts if _is_internal(host)])
    external_count = len(hosts) - internal_count

    origin_inside = _origin_internal_series(frame)
    dest_inside = frame["dst"].map(_is_internal)

    if "bytes" not in frame.columns:
        volume = pd.Series(0, index=frame.index)
    else:
        volume = frame["bytes"].fillna(0)

    outbound_bytes = float(volume[origin_inside & ~dest_inside].sum())
    inbound_bytes = float(volume[~origin_inside & dest_inside].sum())

    host_line = f"{len(hosts)} ({internal_count} internal, {external_count} external)"
    return [
        ("hosts", host_line),
        ("outbound bytes", _format_bytes(outbound_bytes)),
        ("inbound bytes", _format_bytes(inbound_bytes)),
    ]
304
+
305
+
306
+ # ── Lede formatters ──────────────────────────────────────────────────────────
307
+
308
def _lede_conn_share(slot: DigestSlot) -> str:
    """Write the prose lede for a speaking conn-share slot.

    Reads ``slot.entity``, ``slot.magnitude`` (a percentage) and
    ``slot.ratio``.
    """
    share = f"{slot.entity} is in {slot.magnitude:.0f}% of connections"
    comparator = _format_ratio_lede(slot.ratio)
    return f"{share}, {comparator} its nearest peer."
313
+
314
+
315
def _lede_densest_tuple(slot: DigestSlot) -> str:
    """Write the prose lede for a speaking densest-tuple slot.

    Reads ``slot.entity`` (the rendered flow), ``slot.magnitude`` (a
    connection count, shown as an integer) and ``slot.ratio``.
    """
    density = f"{slot.entity} is the densest flow at {int(slot.magnitude)} connections"
    comparator = _format_ratio_lede(slot.ratio)
    return f"{density}, {comparator} the next."
320
+
321
+
322
def _lede_fan_out(slot: DigestSlot) -> str:
    """Write the prose lede for a speaking fan-out slot.

    Reads ``slot.entity`` (the rendered ``src:port``), ``slot.magnitude``
    (a distinct-destination count, shown as an integer) and ``slot.ratio``.
    """
    reach = f"{slot.entity} reaches {int(slot.magnitude)} distinct destinations"
    comparator = _format_ratio_lede(slot.ratio)
    return f"{reach}, {comparator} the next-broadest source."
327
+
328
+
329
def _lede_byte_direction(slot: DigestSlot) -> str:
    """Write the prose lede for a speaking byte-direction slot.

    Reads ``slot.entity`` (the destination), ``slot.magnitude`` (a
    percentage of outbound bytes) and ``slot.ratio``.
    """
    share = f"{slot.entity} receives {slot.magnitude:.0f}% of outbound bytes"
    comparator = _format_ratio_lede(slot.ratio)
    return f"{share}, {comparator} the next destination."
334
+
335
+
336
# Lede formatter for each conn slot. The keys are the same strings the
# ``_slot_*`` builders set as ``label``; ``summarize`` hands this mapping to
# ``select_insights_and_fields`` together with the built slots.
_INSIGHT_FORMATTERS = {
    "conn-share": _lede_conn_share,
    "densest-tuple": _lede_densest_tuple,
    "fan-out": _lede_fan_out,
    "byte-direction": _lede_byte_direction,
}
342
+
343
+
344
+ # ── Public entry point ──────────────────────────────────────────────────────
345
+
346
def summarize(frame: pd.DataFrame) -> dict:
    """Produce the schema-specific body of a conn DigestCard.

    The four conn slots are built in declared order (conn-share,
    densest-tuple, fan-out, byte-direction) and passed, with the lede
    formatters, to ``select_insights_and_fields``, which splits them into
    prose insights and remaining display fields.

    Returned keys:
        zone1_extras — list[(label, value)] in render order
        insights     — list[str], 0..3 prose sentences
        fields       — list[DigestSlot] speaking-and-not-promoted, in declared order
    """
    # Imported at call time, as in the original module layout.
    from loghunter.digest._stats import select_insights_and_fields

    builders = (
        _slot_conn_share,
        _slot_densest_tuple,
        _slot_fan_out,
        _slot_byte_direction,
    )
    slots = [build(frame) for build in builders]
    insights, fields = select_insights_and_fields(slots, _INSIGHT_FORMATTERS)
    extras = _zone1_extras(frame)
    return {
        "zone1_extras": extras,
        "insights": insights,
        "fields": fields,
    }