loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,258 @@
1
+ """Beacon detector — FFT-based periodic connection detection.
2
+
3
+ Algorithm:
4
+ - Bin connection timestamps into 30-second intervals (not 10s — 10s bins place a 60s
5
+ beacon at the Nyquist limit, producing harmonic artifacts)
6
+ - Compute FFT over the binned time grid (resilient to data gaps vs raw inter-arrival)
7
+ - Composite score: 40% spectral ratio + 40% peak prominence + 20% inverted jitter CV
8
+ - Peak prominence: peak power relative to local spectral noise floor, normalized at 100x
9
+ - Jitter CV computed on outlier-cleaned inter-arrival deltas
10
+ - Minimum 20 connections per candidate flow
11
+
12
+ Reference calibration: MRTG 60s SSH poll scores ~0.608, dominant period exactly 60.0s.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ from datetime import datetime, timezone
18
+ from typing import Any
19
+
20
+ import numpy as np
21
+
22
+ from loghunter.common.finding import DetectorContext, Finding, MethodTag, Severity
23
+
24
# Detector identifier; passed as ``detector=DETECTOR_NAME`` on every Finding
# built by ``_make_finding``.
DETECTOR_NAME = "beacon"
# NOTE(review): presumably an availability flag read by the detector
# registry/CLI — confirm against the loader.
STATUS = "available"

# Log inputs this detector needs. ``run`` looks the frame up in
# ``context.logs`` using the same "conn*.log*" pattern string.
REQUIRED_LOGS = [
    {"source": "zeek_dir", "pattern": "conn*.log*"},
]

# No optional inputs are declared for this detector.
OPTIONAL_LOGS: list[dict] = []

# Fallback values used by ``run`` when the corresponding key is absent from
# ``context.config``:
#   threshold        - minimum ``beacon_score`` for a flow to be reported
#   min_connections  - flows with fewer rows than this are skipped
#   bin_seconds      - bin width handed to ``_compute_beacon_score``
DEFAULT_CONFIG = {
    "threshold": 0.5,
    "min_connections": 20,
    "bin_seconds": 30,
}

# NOTE(review): looks like a label for the detection technique (FFT) used in
# reporting — confirm the meaning of ``named=True`` against MethodTag.
DETECTOR_METHOD = MethodTag("FFT", named=True)

# Period range to consider (seconds). Outside this, FFT peaks are ignored.
_MIN_PERIOD = 45
_MAX_PERIOD = 7200
44
+
45
+
46
def run(context: DetectorContext) -> list[Finding]:
    """Scan connection logs for periodic (beaconing) flows.

    Each (src, dst, port, proto) flow with enough connections is scored by
    ``_compute_beacon_score``; flows scoring at or above the configured
    threshold become findings.

    Args:
        context: Detector context supplying ``config`` (overrides for the
            keys in ``DEFAULT_CONFIG``), ``logs`` (looked up under
            ``"conn*.log*"``) and ``data_window``.

    Returns:
        Findings ordered by descending ``beacon_score``; an empty list when
        there is no usable connection data.
    """
    settings = context.config
    score_floor: float = settings.get("threshold", DEFAULT_CONFIG["threshold"])
    required_conns: int = settings.get(
        "min_connections", DEFAULT_CONFIG["min_connections"]
    )
    bin_width: int = settings.get("bin_seconds", DEFAULT_CONFIG["bin_seconds"])

    conn_log = context.logs.get("conn*.log*")
    if conn_log is None or conn_log.empty:
        return []

    conn_log = _filter_conn(conn_log)
    if conn_log.empty:
        return []

    hits: list[Finding] = []
    for flow_key, rows in conn_log.groupby(["src", "dst", "port", "proto"]):
        # Too few connections to say anything about periodicity.
        if len(rows) < required_conns:
            continue

        src, dst, port, proto = flow_key
        stamps = rows["ts"].sort_values().to_numpy(dtype=float)
        scored = _compute_beacon_score(stamps, bin_width)
        if scored is None:
            continue
        if scored["beacon_score"] < score_floor:
            continue

        hits.append(
            _make_finding(
                str(src),
                str(dst),
                int(port),
                str(proto),
                scored,
                rows,
                context.data_window,
            )
        )

    # Strongest beacons first.
    return sorted(
        hits,
        key=lambda finding: finding.evidence["beacon_score"],
        reverse=True,
    )
79
+
80
+
81
+ def _filter_conn(df: Any) -> Any:
82
+ """Apply standard beacon pre-filters: established conns, no multicast, local origin."""
83
+ import pandas as pd
84
+
85
+ df = df[df["conn_state"].isin(["SF", "S1"])].copy()
86
+ df = df[~df["dst"].map(_is_multicast_or_broadcast)]
87
+ df = df[~df["src"].str.startswith("fe80:", na=False)]
88
+ df = df[~df["dst"].str.startswith("fe80:", na=False)]
89
+ df = df[df["local_orig"] == True] # noqa: E712
90
+ df = df[df["bytes"].notna()]
91
+ return df
92
+
93
+
94
def _is_multicast_or_broadcast(ip: str) -> bool:
    """Return True when *ip* looks like a multicast or broadcast address.

    Checks, in order:
      * broadcast heuristics: text starts with ``"255."`` or ends with
        ``".255"``
      * IPv6 multicast (ff00::/8): the first colon-separated group is four
        hex digits beginning with ``ff`` (case-insensitive)
      * IPv4 multicast (224.0.0.0/4): the first dotted octet is 224-239

    Args:
        ip: Address as text. Non-string values (e.g. NaN/None from a
            DataFrame column) are never flagged.

    Returns:
        True if the address matches any rule above, else False.
    """
    if not isinstance(ip, str):
        return False

    if ip.startswith("255.") or ip.endswith(".255"):
        return True

    if ":" in ip:
        # The multicast prefix is the high byte of the first group, so the
        # group must be written with all four digits ("ff02", never "ff").
        first_group = ip.partition(":")[0].lower()
        return len(first_group) == 4 and first_group.startswith("ff")

    first_octet, dot, _rest = ip.partition(".")
    if not dot:
        return False
    # isascii() guards int() against non-ASCII "digit" characters.
    return (
        first_octet.isascii()
        and first_octet.isdigit()
        and 224 <= int(first_octet) <= 239
    )
102
+
103
+
104
def _compute_beacon_score(
    ts_array: np.ndarray,
    bin_size: int = 30,
    min_period: float | None = None,
    max_period: float | None = None,
) -> dict[str, Any] | None:
    """Score a single flow's connection timestamps for periodic beaconing via FFT.

    Returns None if the flow cannot be scored (too few points, no variance, no
    dominant period in the configured range).

    Why binning over raw inter-arrival deltas: gaps produce delta outliers that
    corrupt FFT results; binning represents gaps as zero-count bins, preserving
    the periodicity signal.

    Why prominence alongside spectral ratio: sparse binary signals spread energy
    across harmonics, keeping the absolute spectral ratio low even for perfectly
    periodic flows. Prominence measures peak power above the local noise floor,
    robust to harmonic spreading.

    Args:
        ts_array: Connection timestamps in seconds. Order does not matter;
            non-finite values are ignored.
        bin_size: Width of each time bin in seconds. Must be positive.
        min_period: Shortest period (seconds) eligible as the dominant peak.
            Defaults to the module-level ``_MIN_PERIOD``.
        max_period: Longest period (seconds) eligible as the dominant peak.
            Defaults to the module-level ``_MAX_PERIOD``.

    Returns:
        A dict with ``beacon_score``, ``dominant_period``,
        ``dominant_period_m``, ``spectral_ratio``, ``prominence``,
        ``prominence_norm``, ``jitter_cv``, ``conn_count`` and ``occupancy``,
        or None when the flow cannot be scored.

    Raises:
        ValueError: If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError(f"bin_size must be positive, got {bin_size!r}")
    if min_period is None:
        min_period = _MIN_PERIOD
    if max_period is None:
        max_period = _MAX_PERIOD

    # NaN/inf cannot be binned, and the delta statistics below assume
    # chronological order, so normalise the input up front.
    ts_array = np.asarray(ts_array, dtype=float)
    ts_array = np.sort(ts_array[np.isfinite(ts_array)])
    if len(ts_array) < 10:
        return None

    t_start = ts_array[0]
    t_end = ts_array[-1]
    n_bins = int((t_end - t_start) / bin_size) + 1

    # Per-bin connection counts over the whole observation span.
    bin_idx = ((ts_array - t_start) / bin_size).astype(int)
    counts = np.bincount(bin_idx, minlength=n_bins).astype(float)

    std = counts.std()
    if std == 0:
        return None
    counts_norm = (counts - counts.mean()) / std

    fft_mag = np.abs(np.fft.rfft(counts_norm))
    freqs = np.fft.rfftfreq(n_bins, d=bin_size)
    fft_mag[0] = 0  # zero DC component

    with np.errstate(divide="ignore"):
        periods = np.where(freqs > 0, 1.0 / freqs, np.inf)

    # Only peaks whose period lies inside the configured band may win.
    mask = (periods >= min_period) & (periods <= max_period)
    fft_masked = np.where(mask, fft_mag, 0)
    if fft_masked.max() == 0:
        return None

    peak_idx = int(fft_masked.argmax())
    peak_period = float(periods[peak_idx])
    peak_power = float(fft_mag[peak_idx])
    total_power = float(fft_mag[1:].sum())
    if total_power == 0:
        return None

    spectral_ratio = peak_power / total_power

    # Local noise floor: median magnitude in a window around the peak,
    # excluding the peak bin itself and the DC bin.
    window = max(10, int(peak_idx * 0.05))
    lo = max(1, peak_idx - window)
    hi = min(len(fft_mag) - 1, peak_idx + window)
    local = np.concatenate([fft_mag[lo:peak_idx], fft_mag[peak_idx + 1:hi + 1]])
    noise_floor = float(np.median(local)) if len(local) > 0 else 1.0
    prominence = peak_power / (noise_floor + 1e-10)
    prominence_norm = min(prominence / 100.0, 1.0)

    deltas = np.diff(ts_array)
    d_mean = deltas.mean()
    d_std = deltas.std()
    # "<=" rather than "<": when every delta is identical d_std is 0, and a
    # strict comparison would discard all of them, scoring a perfectly
    # regular beacon as maximally jittery.
    clean_deltas = deltas[np.abs(deltas - d_mean) <= 3 * d_std]
    if len(clean_deltas) > 1 and clean_deltas.mean() > 0:
        jitter_cv = float(clean_deltas.std() / clean_deltas.mean())
    else:
        jitter_cv = 1.0

    beacon_score = (
        0.4 * spectral_ratio +
        0.4 * prominence_norm +
        0.2 * (1.0 - min(jitter_cv, 1.0))
    )

    return {
        "beacon_score": round(beacon_score, 4),
        "dominant_period": round(peak_period, 1),
        "dominant_period_m": round(peak_period / 60, 2),
        "spectral_ratio": round(spectral_ratio, 4),
        "prominence": round(prominence, 2),
        "prominence_norm": round(prominence_norm, 4),
        "jitter_cv": round(jitter_cv, 4),
        "conn_count": len(ts_array),
        "occupancy": round(float((counts > 0).sum()) / n_bins, 4),
    }
193
+
194
+
195
def _make_finding(
    src: str,
    dst: str,
    port: int,
    proto: str,
    score_data: dict[str, Any],
    group: Any,
    data_window: tuple[datetime, datetime],
) -> Finding:
    """Build the Finding for one flow that scored as a beacon.

    Args:
        src: Source address of the flow.
        dst: Destination address of the flow.
        port: Destination port.
        proto: Protocol label.
        score_data: Metrics dict as returned by ``_compute_beacon_score``.
        group: The flow's rows; only the ``bytes`` column is read here.
        data_window: Passed through unchanged to the Finding.

    Returns:
        A Finding whose severity is HIGH at score >= 0.7, MEDIUM at
        score >= 0.5 and LOW otherwise, with the score metrics and flow
        identifiers attached as evidence.
    """
    score = score_data["beacon_score"]
    period_s = score_data["dominant_period"]
    period_m = score_data["dominant_period_m"]
    conn_count = score_data["conn_count"]  # not used further in this function

    # Highest matching tier wins; anything below every floor stays LOW.
    severity = Severity.LOW
    for floor, level in ((0.7, Severity.HIGH), (0.5, Severity.MEDIUM)):
        if score >= floor:
            severity = level
            break

    # Minutes for periods of two minutes or more, seconds below that.
    if period_m >= 2:
        period_str = f"{period_m:.1f}m"
    else:
        period_str = f"{period_s:.0f}s"

    title = f"{src} → {dst}:{port}/{proto}"

    payload_sizes = group["bytes"].dropna()
    if len(payload_sizes) > 0:
        bytes_mean = round(float(payload_sizes.mean()), 1)
    else:
        bytes_mean = 0.0

    spectral_ratio = score_data["spectral_ratio"]
    prominence = score_data["prominence"]
    jitter_cv = score_data["jitter_cv"]
    description = "".join([
        f"Flow {title} shows periodic beaconing with a dominant ",
        f"period of {period_str} (score={score:.4f}). ",
        f"Spectral ratio: {spectral_ratio:.4f}, ",
        f"peak prominence: {prominence:.2f}, ",
        f"jitter CV: {jitter_cv:.4f}. ",
        f"Mean payload: {bytes_mean:.0f} bytes.",
    ])

    next_steps = [
        f"Identify the process on {src} making connections every {period_str}",
        f"Pivot to dns.log — search for lookups resolving to {dst}",
        f"Check {dst} on VirusTotal, Shodan, and ASN lookup",
        f"Review full history: zeek-cut id.orig_h id.resp_h id.resp_p ts | grep '{dst}'",
        "Use --export-allowlist to stage this flow for allowlisting if known-good",
    ]

    # Score metrics first, then the flow identifiers layered on top.
    evidence = dict(score_data)
    evidence.update({
        "period_str": period_str,
        "src_ip": src,
        "dst_ip": dst,
        "dst_port": port,
        "proto": proto,
        "bytes_mean": bytes_mean,
    })

    return Finding(
        detector=DETECTOR_NAME,
        severity=severity,
        title=title,
        description=description,
        evidence=evidence,
        next_steps=next_steps,
        ts_generated=datetime.now(timezone.utc),
        data_window=data_window,
    )