loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
migrations/conn_fft.py ADDED
@@ -0,0 +1,550 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ beacon_hunt.py — weekly beaconing threat hunt
4
+ Zeek conn.log (ndjson) → scored flow report + plot
5
+
6
+ Usage:
7
+ python beacon_hunt.py data/conn/conn.log
8
+ python beacon_hunt.py data/conn/conn.log --min-conns 20 --top 30
9
+ python beacon_hunt.py data/conn/conn.log --out-dir /tmp/hunt
10
+
11
+ Outputs (written to --out-dir, default ./hunt_output/):
12
+ beacon_report_<timestamp>.txt — full text report
13
+ beacon_scores_<timestamp>.csv — all scored flows
14
+ beacon_plot_<timestamp>.png — scatter + histogram
15
+ """
16
+
17
+ import argparse
18
+ import json
19
+ import sys
20
+ from datetime import datetime, timezone
21
+ from pathlib import Path
22
+
23
+ import matplotlib
24
+ matplotlib.use("Agg") # non-interactive backend for script use
25
+ import matplotlib.pyplot as plt
26
+ import numpy as np
27
+ import pandas as pd
28
+ from tqdm import tqdm
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Configuration
33
+ # ---------------------------------------------------------------------------
34
+
35
+ # Flows to always suppress from scoring — known-good periodic infrastructure.
36
+ # Format: (dst_port, dst_ip)
37
ALLOWLIST_PORT_DST = {
    # --- name resolution / time sync ---
    (53, '192.0.2.53'),      # DNS resolver
    (123, '192.0.2.53'),     # NTP
    # --- SNMP polling ---
    (161, '192.0.2.1'),      # SNMP → router
    (161, '192.0.2.11'),     # SNMP → server
    # --- checkmk agents ---
    (6556, '192.0.2.53'),    # checkmk agent
    (6556, '192.0.2.11'),    # checkmk agent
    (6556, '192.0.2.20'),    # checkmk agent
    (6556, '198.51.100.1'),  # checkmk agent
    # --- logging / Splunk ---
    (9997, '192.0.2.20'),    # Splunk forwarder
    (514, '192.0.2.20'),     # syslog
    (8000, '192.0.2.20'),    # Splunk WebUI
    # --- management UIs and file services ---
    (8443, '192.0.2.1'),     # router WebUI
    (2049, '192.0.2.11'),    # NFS
    (111, '192.0.2.11'),     # portmapper
    (8080, '192.0.2.11'),    # Pi-hole nebula-sync
}

# Known monitoring flows — labeled separately in the plot.
# Format: (src_ip, dst_ip, dst_port)
KNOWN_MONITORING = {
    # SSH polling of the router (MRTG)
    ('192.0.2.10', '192.0.2.1', 22),   # monitor → router SSH (MRTG)
    ('192.0.2.11', '192.0.2.1', 22),   # server → router SSH (MRTG)
    # Pi-hole API polling
    ('192.0.2.10', '192.0.2.53', 80),  # monitor → Pi-hole API
    ('192.0.2.10', '192.0.2.11', 80),  # monitor → Pi-hole API
}

# Score thresholds for triage tiers (a score at or above the value lands in the tier)
THRESH_HIGH = 0.5
THRESH_MEDIUM = 0.3
66
+
67
+
68
+ # ---------------------------------------------------------------------------
69
+ # Filtering helpers
70
+ # ---------------------------------------------------------------------------
71
+
72
def is_multicast_or_broadcast(ip: str) -> bool:
    """Return True when *ip* looks like a multicast or broadcast address.

    Multicast is decided by parsing the address, so the whole IPv4
    224.0.0.0/4 block and the whole IPv6 ff00::/8 block are recognised
    (a plain prefix test on '224.' / '239.' misses 225.x–238.x).

    Broadcast is a heuristic: anything starting with '255.' or ending in
    '.255' is treated as broadcast, because the subnet mask is not known here.

    Non-string values (for example NaN from a missing field) return False.
    """
    import ipaddress  # local import keeps this helper self-contained

    if not isinstance(ip, str):
        return False

    looks_broadcast = ip.startswith('255.') or ip.endswith('.255')
    try:
        addr = ipaddress.ip_address(ip)
    except ValueError:
        # Not a parseable address: fall back to the textual prefixes only.
        return looks_broadcast or ip.startswith(('224.', '239.', 'ff0'))

    return addr.is_multicast or looks_broadcast
83
+
84
+
85
def load_and_filter(log_path: Path) -> tuple[pd.DataFrame, dict]:
    """Load conn.log, apply all filters, return clean DataFrame + stats dict.

    Args:
        log_path: Path to a newline-delimited JSON conn.log.

    Returns:
        A ``(df, stats)`` tuple. ``df`` holds the rows that survive every
        filter, with a fresh 0..n-1 index. ``stats`` has the keys
        ``raw_rows``, ``clean_rows``, ``dropped``, ``pct_drop``, ``t_start``,
        ``t_end`` and ``span_h``; the time fields describe the *unfiltered*
        log window.
    """
    print(f"[+] Loading {log_path} ...")
    df = pd.read_json(log_path, lines=True)
    # Unparseable timestamps become NaN instead of raising.
    df["ts"] = pd.to_numeric(df["ts"], errors="coerce")
    raw_rows = len(df)

    ts_min = df['ts'].min()
    ts_max = df['ts'].max()
    t_start = pd.to_datetime(ts_min, unit='s', utc=True)
    t_end = pd.to_datetime(ts_max, unit='s', utc=True)
    span_h = (ts_max - ts_min) / 3600

    print(f" {raw_rows:,} rows | {t_start.strftime('%Y-%m-%d %H:%M')} → "
          f"{t_end.strftime('%Y-%m-%d %H:%M')} ({span_h:.1f}h)")

    # 1. Established connections only
    df = df[df['conn_state'].isin(['SF', 'S1'])]

    # 2. Drop multicast/broadcast destinations
    df = df[~df['id.resp_h'].apply(is_multicast_or_broadcast)]

    # 3. Drop IPv6 link-local (NDP noise)
    df = df[~df['id.orig_h'].str.startswith('fe80:', na=False)]
    df = df[~df['id.resp_h'].str.startswith('fe80:', na=False)]

    # 4. Require originator (no mid-stream captures)
    df = df[df['local_orig'] == True]  # noqa: E712 - element-wise comparison

    # 5. Require non-null bytes
    df = df[df['orig_bytes'].notna()]

    # 6. Drop allowlisted flows.
    # Built from a zip over the two columns rather than a row-wise
    # DataFrame.apply: this is much cheaper, always yields a 1-D boolean mask
    # (even when the frame is already empty), and a missing/NaN port simply
    # fails the membership test instead of crashing int().
    ports = pd.to_numeric(df['id.resp_p'], errors='coerce')
    allowlisted = np.fromiter(
        ((port, host) in ALLOWLIST_PORT_DST
         for port, host in zip(ports, df['id.resp_h'])),
        dtype=bool,
        count=len(df),
    )
    df = df[~allowlisted]

    clean_rows = len(df)
    dropped = raw_rows - clean_rows
    stats = {
        'raw_rows': raw_rows,
        'clean_rows': clean_rows,
        'dropped': dropped,
        # Guard the percentage so an empty log cannot divide by zero.
        'pct_drop': (dropped / raw_rows * 100) if raw_rows else 0.0,
        't_start': t_start,
        't_end': t_end,
        'span_h': span_h,
    }

    print(f" After filters: {clean_rows:,} rows "
          f"({stats['pct_drop']:.1f}% dropped)")
    return df.reset_index(drop=True), stats
135
+
136
+
137
+ # ---------------------------------------------------------------------------
138
+ # Flow grouping
139
+ # ---------------------------------------------------------------------------
140
+
141
def build_candidate_flows(df: pd.DataFrame, min_conns: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Select flows with at least *min_conns* connections.

    A flow is the 4-tuple (id.orig_h, id.resp_h, id.resp_p, proto).

    Returns:
        ``(candidates, df_cands)`` where ``candidates`` has one row per
        qualifying flow plus a ``conn_count`` column (busiest first), and
        ``df_cands`` holds every connection record belonging to those flows,
        ordered by flow key and then by ``ts``.
    """
    flow_key = ['id.orig_h', 'id.resp_h', 'id.resp_p', 'proto']

    # Connections per flow, busiest flows first.
    per_flow = df.groupby(flow_key).size().reset_index(name='conn_count')
    per_flow = per_flow.sort_values('conn_count', ascending=False)

    busy_enough = per_flow['conn_count'] >= min_conns
    candidates = per_flow[busy_enough].copy()

    # The inner join keeps only the records of qualifying flows.
    records = df.merge(candidates[flow_key], on=flow_key, how='inner')
    records = records.sort_values(flow_key + ['ts'])
    return candidates, records
156
+
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # FFT beacon scorer
160
+ # ---------------------------------------------------------------------------
161
+
162
+ def compute_beacon_score(ts_array: np.ndarray,
163
+ bin_size: int = 30,
164
+ min_period: int = 45,
165
+ max_period: int = 7200) -> dict | None:
166
+ """
167
+ Compute FFT-based beacon score for a single flow's connection timestamps.
168
+
169
+ Approach:
170
+ 1. Bin timestamps into a regular time grid (count series)
171
+ 2. Apply FFT to find dominant periodic frequency
172
+ 3. Score using spectral ratio + peak prominence + jitter CV
173
+
174
+ Why binning instead of raw inter-arrival deltas:
175
+ Gaps produce massive delta outliers that corrupt FFT results.
176
+ Binning represents gaps as zero-count bins, preserving periodicity.
177
+
178
+ Why prominence in addition to spectral ratio:
179
+ Sparse binary signals spread energy across harmonics, keeping the
180
+ absolute spectral ratio low even for perfectly periodic flows.
181
+ Prominence measures how much the peak rises above the local noise
182
+ floor — robust to harmonic spreading.
183
+ """
184
+ if len(ts_array) < 10:
185
+ return None
186
+
187
+ t_start = ts_array.min()
188
+ t_end = ts_array.max()
189
+ n_bins = int((t_end - t_start) / bin_size) + 1
190
+
191
+ bin_idx = ((ts_array - t_start) / bin_size).astype(int)
192
+ counts = np.zeros(n_bins)
193
+ np.add.at(counts, bin_idx, 1)
194
+
195
+ std = counts.std()
196
+ if std == 0:
197
+ return None
198
+
199
+ counts_norm = (counts - counts.mean()) / std
200
+
201
+ fft_mag = np.abs(np.fft.rfft(counts_norm))
202
+ freqs = np.fft.rfftfreq(n_bins, d=bin_size)
203
+ fft_mag[0] = 0
204
+
205
+ with np.errstate(divide='ignore'):
206
+ periods = np.where(freqs > 0, 1.0 / freqs, np.inf)
207
+
208
+ mask_range = (periods >= min_period) & (periods <= max_period)
209
+ fft_masked = np.where(mask_range, fft_mag, 0)
210
+ if fft_masked.max() == 0:
211
+ return None
212
+
213
+ peak_idx = fft_masked.argmax()
214
+ peak_period = periods[peak_idx]
215
+ peak_power = fft_mag[peak_idx]
216
+ total_power = fft_mag[1:].sum()
217
+ if total_power == 0:
218
+ return None
219
+
220
+ spectral_ratio = peak_power / total_power
221
+
222
+ window = max(10, int(peak_idx * 0.05))
223
+ lo = max(1, peak_idx - window)
224
+ hi = min(len(fft_mag) - 1, peak_idx + window)
225
+ local = np.concatenate([fft_mag[lo:peak_idx], fft_mag[peak_idx+1:hi+1]])
226
+ noise_floor = np.median(local) if len(local) > 0 else 1.0
227
+ prominence = peak_power / (noise_floor + 1e-10)
228
+ prominence_norm = min(prominence / 100.0, 1.0)
229
+
230
+ deltas = np.diff(ts_array)
231
+ d_mean = deltas.mean()
232
+ d_std = deltas.std()
233
+ clean_deltas = deltas[np.abs(deltas - d_mean) < 3 * d_std]
234
+ jitter_cv = (clean_deltas.std() / clean_deltas.mean()
235
+ if len(clean_deltas) > 1 else 1.0)
236
+
237
+ # Composite: 40% spectral ratio + 40% prominence + 20% jitter
238
+ beacon_score = (
239
+ 0.4 * spectral_ratio +
240
+ 0.4 * prominence_norm +
241
+ 0.2 * (1.0 - min(jitter_cv, 1.0))
242
+ )
243
+
244
+ return {
245
+ 'dominant_period' : round(peak_period, 1),
246
+ 'dominant_period_m': round(peak_period / 60, 2),
247
+ 'spectral_ratio' : round(spectral_ratio, 4),
248
+ 'prominence' : round(prominence, 2),
249
+ 'prominence_norm' : round(prominence_norm, 4),
250
+ 'jitter_cv' : round(jitter_cv, 4),
251
+ 'beacon_score' : round(beacon_score, 4),
252
+ 'conn_count' : len(ts_array),
253
+ 'occupancy' : round((counts > 0).sum() / n_bins, 4),
254
+ }
255
+
256
+
257
+ # ---------------------------------------------------------------------------
258
+ # Score all candidate flows
259
+ # ---------------------------------------------------------------------------
260
+
261
def score_flows(df_cands: pd.DataFrame) -> pd.DataFrame:
    """Score every candidate flow and return one row per scored flow.

    Args:
        df_cands: Connection records of the candidate flows. Must contain the
            columns ``id.orig_h``, ``id.resp_h``, ``id.resp_p``, ``proto``,
            ``ts`` and ``orig_bytes``.

    Returns:
        A DataFrame sorted by ``beacon_score`` (highest first) with a fresh
        index. Flows for which ``compute_beacon_score`` returns None are
        omitted. When nothing can be scored the result is an empty frame that
        still carries the full column set, so callers can filter on
        ``beacon_score`` without a KeyError.
    """
    columns = [
        'src_ip', 'dst_ip', 'dst_port', 'proto',
        'dominant_period', 'dominant_period_m', 'spectral_ratio',
        'prominence', 'prominence_norm', 'jitter_cv', 'beacon_score',
        'conn_count', 'occupancy', 'bytes_cv', 'bytes_mean',
    ]
    if df_cands.empty:
        return pd.DataFrame(columns=columns)

    results = []
    grouped = df_cands.groupby(['id.orig_h', 'id.resp_h', 'id.resp_p', 'proto'])

    for (orig_h, resp_h, resp_p, proto), group in tqdm(
        grouped, desc="Scoring flows", unit="flow"
    ):
        ts_array = group['ts'].sort_values().values
        score = compute_beacon_score(ts_array)
        if score is None:
            continue

        # Payload-size regularity: coefficient of variation of orig_bytes.
        bytes_s = group['orig_bytes'].dropna()
        bytes_cv = (bytes_s.std() / bytes_s.mean()
                    if len(bytes_s) > 1 and bytes_s.mean() > 0 else 1.0)

        results.append({
            'src_ip': orig_h,
            'dst_ip': resp_h,
            'dst_port': int(resp_p),
            'proto': proto,
            **score,
            'bytes_cv': round(bytes_cv, 4),
            'bytes_mean': round(bytes_s.mean(), 1) if len(bytes_s) > 0 else 0,
        })

    if not results:
        # Sorting a column-less frame by 'beacon_score' would raise KeyError.
        return pd.DataFrame(columns=columns)

    return (pd.DataFrame(results)
            .sort_values('beacon_score', ascending=False)
            .reset_index(drop=True))
290
+
291
+
292
+ # ---------------------------------------------------------------------------
293
+ # Classification for plot coloring
294
+ # ---------------------------------------------------------------------------
295
+
296
def classify(row) -> str:
    """Return the plot category for one scored flow.

    Precedence: known monitoring flow, then well-known service port
    (123 → 'ntp', 53 → 'dns'), then the score tier ('high', 'medium',
    'normal') derived from THRESH_HIGH / THRESH_MEDIUM.
    """
    flow = (row.src_ip, row.dst_ip, row.dst_port)
    if flow in KNOWN_MONITORING:
        return 'monitoring'

    # Well-known periodic services are labelled by port, whatever their score.
    service = {123: 'ntp', 53: 'dns'}.get(row.dst_port)
    if service is not None:
        return service

    score = row.beacon_score
    if score >= THRESH_HIGH:
        return 'high'
    return 'medium' if score >= THRESH_MEDIUM else 'normal'
308
+
309
+
310
+ # ---------------------------------------------------------------------------
311
+ # Plot
312
+ # ---------------------------------------------------------------------------
313
+
314
def make_plot(df_scores: pd.DataFrame, stats: dict, out_path: Path):
    """Render the two-panel hunt figure and save it to *out_path*.

    Left panel: beacon score against dominant period, one bubble per flow,
    coloured by ``classify`` category and sized by connection count.
    Right panel: histogram of beacon scores with the tier thresholds marked.

    Args:
        df_scores: Scored flows. Needs the columns read by ``classify`` plus
            ``conn_count`` and ``dominant_period_m``.
        stats: Needs ``t_start``, ``t_end`` (objects with ``strftime``) and
            ``span_h``.
        out_path: Destination image path.

    Returns:
        None. When *df_scores* is empty nothing is drawn and no file is
        written, because there is nothing to scale the axes or bubbles against.
    """
    if df_scores.empty:
        print("[!] No scored flows — skipping plot")
        return

    plt.style.use('dark_background')
    df_scores = df_scores.copy()
    df_scores['category'] = df_scores.apply(classify, axis=1)

    colors = {
        'monitoring': '#888888',
        'ntp': '#4a9eff',
        'dns': '#4aff9e',
        'high': '#ff4a4a',
        'medium': '#ffaa4a',
        'normal': '#ffffff',
    }
    labels = {
        'monitoring': 'Known monitoring',
        'ntp': 'NTP sync',
        'dns': 'DNS patterns',
        'high': f'High score (≥{THRESH_HIGH})',
        'medium': f'Medium score (≥{THRESH_MEDIUM})',
        'normal': 'Normal',
    }

    fig, axes = plt.subplots(1, 2, figsize=(18, 7))
    date_str = stats['t_start'].strftime('%Y-%m-%d') + ' – ' + stats['t_end'].strftime('%Y-%m-%d')
    fig.suptitle(f"Beacon Hunt | {date_str} ({stats['span_h']:.1f}h)",
                 fontsize=13, y=1.01)

    # --- Left: score vs period bubble chart
    ax = axes[0]
    max_conns = df_scores['conn_count'].max()  # loop-invariant bubble scale
    # Draw order puts the interesting tiers on top of the background ones.
    for cat in ['normal', 'dns', 'ntp', 'monitoring', 'medium', 'high']:
        sub = df_scores[df_scores['category'] == cat]
        if len(sub) == 0:
            continue
        sizes = np.clip(sub['conn_count'] / max_conns * 800, 10, 800)
        ax.scatter(sub['dominant_period_m'], sub['beacon_score'],
                   s=sizes, c=colors[cat], alpha=0.6, edgecolors='none',
                   label=f"{labels[cat]} (n={len(sub)})")

    ax.axhline(THRESH_HIGH, color='#ff4a4a', linestyle='--', linewidth=0.8,
               alpha=0.5, label=f'High threshold ({THRESH_HIGH})')
    ax.axhline(THRESH_MEDIUM, color='#ffaa4a', linestyle='--', linewidth=0.8,
               alpha=0.5, label=f'Medium threshold ({THRESH_MEDIUM})')

    # Annotate high scorers that aren't known monitoring. The category column
    # computed above is reused instead of re-running classify() per row.
    to_label = df_scores[
        (df_scores['beacon_score'] >= THRESH_HIGH) &
        (df_scores['category'] != 'monitoring')
    ]
    for _, row in to_label.iterrows():
        ax.annotate(
            f"{row.src_ip}→{row.dst_ip}:{row.dst_port}",
            xy=(row.dominant_period_m, row.beacon_score),
            xytext=(8, 0), textcoords='offset points',
            fontsize=6.5, color='white', alpha=0.85,
        )

    ax.set_xlabel('Dominant Period (minutes)', fontsize=11)
    ax.set_ylabel('Beacon Score', fontsize=11)
    ax.set_title('Beacon Score vs Period\n(bubble size = connection count)', fontsize=10)
    ax.set_xlim(left=0)
    ax.set_ylim(bottom=0)
    ax.legend(fontsize=8, loc='upper right')

    # --- Right: score distribution histogram
    ax2 = axes[1]
    bins = np.linspace(0, df_scores['beacon_score'].max() + 0.01, 60)
    ax2.hist(df_scores['beacon_score'], bins=bins, color='#4a9eff',
             edgecolor='none', alpha=0.8)
    ax2.axvline(THRESH_HIGH, color='#ff4a4a', linestyle='--',
                linewidth=1.2, label=f'High ({THRESH_HIGH})')
    ax2.axvline(THRESH_MEDIUM, color='#ffaa4a', linestyle='--',
                linewidth=1.2, label=f'Medium ({THRESH_MEDIUM})')

    n_high = (df_scores['beacon_score'] >= THRESH_HIGH).sum()
    n_medium = ((df_scores['beacon_score'] >= THRESH_MEDIUM) &
                (df_scores['beacon_score'] < THRESH_HIGH)).sum()
    ax2.text(THRESH_HIGH + 0.01, ax2.get_ylim()[1] * 0.85,
             f"≥{THRESH_HIGH}: {n_high} flows\n≥{THRESH_MEDIUM}: {n_medium} flows",
             fontsize=9, color='white')

    ax2.set_xlabel('Beacon Score', fontsize=11)
    ax2.set_ylabel('Flow count', fontsize=11)
    ax2.set_title(f'Score Distribution\n({len(df_scores):,} candidate flows)', fontsize=10)
    ax2.legend(fontsize=9)

    fig.tight_layout()
    fig.savefig(out_path, dpi=150, bbox_inches='tight')
    # Close this specific figure rather than whatever happens to be current.
    plt.close(fig)
    print(f"[+] Plot saved → {out_path}")
400
+
401
+
402
+ # ---------------------------------------------------------------------------
403
+ # Text report
404
+ # ---------------------------------------------------------------------------
405
+
406
def _format_flow_details(flows: pd.DataFrame, flag_known: bool = False) -> list[str]:
    """Return the detail-table lines (rule, header, rule, one line per flow).

    When *flag_known* is true, flows listed in KNOWN_MONITORING get a
    trailing " ◄ KNOWN INFRA" marker.
    """
    rule = " " + "-" * 68
    out = [
        rule,
        (f" {'SRC':<18} {'DST':<18} {'PORT':>5} {'PROTO':<5} "
         f"{'SCORE':>6} {'PERIOD':>8} {'PROM':>7} {'JITTER':>7} {'BYTES_CV':>8} {'CONNS':>6}"),
        rule,
    ]
    for _, r in flows.iterrows():
        known = flag_known and (r.src_ip, r.dst_ip, r.dst_port) in KNOWN_MONITORING
        flag = " ◄ KNOWN INFRA" if known else ""
        out.append(
            f" {r.src_ip:<18} {r.dst_ip:<18} {int(r.dst_port):>5} {r.proto:<5} "
            f"{r.beacon_score:>6.4f} {r.dominant_period_m:>6.1f}m "
            f"{r.prominence:>7.1f} {r.jitter_cv:>7.4f} {r.bytes_cv:>8.4f} "
            f"{r.conn_count:>6}{flag}"
        )
    return out


def write_report(df_scores: pd.DataFrame, stats: dict,
                 log_path: Path, top_n: int, out_path: Path,
                 min_conns: int = 20):
    """Build the text report, write it to *out_path*, and echo it to stdout.

    Args:
        df_scores: Scored flows, expected to be sorted by ``beacon_score``
            descending (the top-N table simply takes the first *top_n* rows).
        stats: Needs ``t_start``, ``t_end``, ``span_h``, ``raw_rows``,
            ``clean_rows`` and ``pct_drop``.
        log_path: Path of the analysed log; printed in the header only.
        top_n: Number of rows in the "top flows" table.
        out_path: Destination text file, written as UTF-8.
        min_conns: Candidate threshold that was actually used, shown in the
            data summary. Defaults to 20, the CLI default.
    """
    scores = df_scores['beacon_score']
    is_high = scores >= THRESH_HIGH
    is_medium = (scores >= THRESH_MEDIUM) & (scores < THRESH_HIGH)

    n_high = is_high.sum()
    n_medium = is_medium.sum()
    n_total = len(df_scores)

    lines = []
    w = lines.append

    w("=" * 72)
    w(" BEACON THREAT HUNT REPORT")
    w("=" * 72)
    w(f" Generated : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    w(f" Log file : {log_path}")
    w(f" Window : {stats['t_start'].strftime('%Y-%m-%d %H:%M')} UTC → "
      f"{stats['t_end'].strftime('%Y-%m-%d %H:%M')} UTC ({stats['span_h']:.1f}h)")
    w("")
    w(" DATA SUMMARY")
    w(" " + "-" * 40)
    w(f" Raw conn.log rows : {stats['raw_rows']:>10,}")
    w(f" After filters : {stats['clean_rows']:>10,} ({stats['pct_drop']:.1f}% dropped)")
    # Report the threshold that was really applied, not a fixed "20".
    w(f" Candidate flows : {n_total:>10,} (≥{min_conns} connections)")
    w("")
    w(" TRIAGE SUMMARY")
    w(" " + "-" * 40)
    w(f" HIGH (score ≥ {THRESH_HIGH}) : {n_high:>5} flows ← investigate")
    w(f" MEDIUM (score ≥ {THRESH_MEDIUM}) : {n_medium:>5} flows ← review")
    w(f" NORMAL (score < {THRESH_MEDIUM}) : {n_total - n_high - n_medium:>5} flows")
    w("")

    # Score distribution
    w(" SCORE DISTRIBUTION")
    w(" " + "-" * 40)
    for lo, hi in [(0.5, 1.0), (0.3, 0.5), (0.2, 0.3), (0.1, 0.2), (0.0, 0.1)]:
        # Buckets are half-open [lo, hi), except the top one, which is closed
        # so that a score of exactly 1.0 is still counted.
        below_hi = (scores <= hi) if hi >= 1.0 else (scores < hi)
        n = ((scores >= lo) & below_hi).sum()
        bar = '█' * int(n / max(n_total, 1) * 40)
        w(f" {lo:.1f}–{hi:.1f} : {n:5,} {bar}")
    w("")

    # High priority flows
    high_flows = df_scores[is_high]
    if len(high_flows) > 0:
        w(f" HIGH PRIORITY FLOWS (score ≥ {THRESH_HIGH})")
        lines.extend(_format_flow_details(high_flows, flag_known=True))
        w("")

    # Medium priority flows
    med_flows = df_scores[is_medium]
    if len(med_flows) > 0:
        w(f" MEDIUM PRIORITY FLOWS ({THRESH_MEDIUM} ≤ score < {THRESH_HIGH})")
        lines.extend(_format_flow_details(med_flows))
        w("")

    # Top N overall
    w(f" TOP {top_n} FLOWS BY BEACON SCORE (all tiers)")
    w(" " + "-" * 68)
    w(f" {'#':<4} {'SRC':<18} {'DST':<18} {'PORT':>5} {'PROTO':<5} "
      f"{'SCORE':>6} {'PERIOD':>8} {'CONNS':>6}")
    w(" " + "-" * 68)
    for i, (_, r) in enumerate(df_scores.head(top_n).iterrows(), 1):
        w(f" {i:<4} {r.src_ip:<18} {r.dst_ip:<18} {int(r.dst_port):>5} {r.proto:<5} "
          f"{r.beacon_score:>6.4f} {r.dominant_period_m:>6.1f}m {r.conn_count:>6}")
    w("")
    w("=" * 72)
    w(" END OF REPORT")
    w("=" * 72)

    report_text = "\n".join(lines)
    # Explicit UTF-8: the report contains '→', '≥', '█' and '◄', which a
    # platform-default legacy code page cannot encode.
    out_path.write_text(report_text, encoding="utf-8")
    print(report_text)
    print(f"\n[+] Report saved → {out_path}")


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point: load, filter, score, then write CSV/plot/report.

    Exits with status 1 when the log file does not exist. When no flow reaches
    ``--min-conns``, or no candidate flow yields a score, a notice is printed
    and no output files are written.
    """
    parser = argparse.ArgumentParser(
        description="beacon threat hunt — Zeek conn.log → scored report"
    )
    parser.add_argument("log", type=Path, help="Path to Zeek conn.log (ndjson)")
    parser.add_argument("--min-conns", type=int, default=20,
                        help="Minimum connections per flow to score (default: 20)")
    parser.add_argument("--top", type=int, default=25,
                        help="Number of flows in top-N table (default: 25)")
    parser.add_argument("--out-dir", type=Path, default=Path("hunt_output"),
                        help="Output directory (default: ./hunt_output/)")
    args = parser.parse_args()

    if not args.log.exists():
        print(f"[!] Log file not found: {args.log}", file=sys.stderr)
        sys.exit(1)

    args.out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # --- Run pipeline
    df_clean, stats = load_and_filter(args.log)

    print(f"[+] Grouping flows (min_conns={args.min_conns}) ...")
    candidates, df_cands = build_candidate_flows(df_clean, args.min_conns)
    print(f" {len(candidates):,} candidate flows | "
          f"{len(df_cands):,} connection records")

    if candidates.empty:
        # Nothing to score; the scoring and plotting stages assume data.
        print(f"[!] No flows with at least {args.min_conns} connections — nothing to score")
        return

    print(f"[+] Scoring {len(candidates):,} flows ...")
    df_scores = score_flows(df_cands)
    print(f" Scored: {len(df_scores):,} flows")

    if df_scores.empty:
        print("[!] No candidate flow produced a beacon score — no outputs written")
        return

    # --- Write outputs
    csv_path = args.out_dir / f"beacon_scores_{stamp}.csv"
    report_path = args.out_dir / f"beacon_report_{stamp}.txt"
    plot_path = args.out_dir / f"beacon_plot_{stamp}.png"

    df_scores.to_csv(csv_path, index=False)
    print(f"[+] CSV saved → {csv_path}")

    make_plot(df_scores, stats, plot_path)
    write_report(df_scores, stats, args.log, args.top, report_path,
                 min_conns=args.min_conns)


if __name__ == "__main__":
    main()