loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,520 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ dns_cluster.py — weekly DNS clustering threat hunt
4
+ Zeek dns.log (ndjson) → HDBSCAN cluster analysis + entropy-ranked noise report
5
+
6
+ Usage:
7
+ python dns_cluster.py data/dns/dns.log
8
+ python dns_cluster.py data/dns/dns.log --top 100 --min-size 300
9
+ python dns_cluster.py data/dns/dns.log --out-dir /tmp/hunt
10
+
11
+ Outputs (written to --out-dir, default ./hunt_output/):
12
+ dns_report_<timestamp>.txt — full text report + top entropy domains
13
+ dns_domains_<timestamp>.csv — noise domains with entropy scores
14
+ dns_plot_<timestamp>.png — cluster size chart + entropy distribution
15
+ """
16
+
17
+ import argparse
18
+ import json
19
+ import math
20
+ import re
21
+ import sys
22
+ from datetime import datetime
23
+ from pathlib import Path
24
+
25
+ import matplotlib
26
+ matplotlib.use("Agg")
27
+ import matplotlib.pyplot as plt
28
+ import numpy as np
29
+ import pandas as pd
30
+ import hdbscan
31
+ from sklearn.preprocessing import StandardScaler
32
+
33
+
34
+ # ---------------------------------------------------------------------------
35
+ # Configuration
36
+ # ---------------------------------------------------------------------------
37
+
38
# Minimum cluster size for HDBSCAN — larger = fewer, more meaningful clusters.
# 500 produced 299 clusters with heavy fragmentation; 2000 is more appropriate
# for a week of traffic (~600K post-whitelist queries).
# Used as the default of the --min-size command-line option.
MIN_CLUSTER_SIZE = 2000
# Minimum samples — controls how conservative cluster membership is.
# Used as the default of the --min-samples command-line option.
MIN_SAMPLES = 100

# Additional infrastructure noise to suppress from the entropy report
# (patterns that survive the whitelist but aren't interesting).
# The two adjacent raw strings concatenate into a single regex alternation;
# main() applies it case-insensitively to the unclustered (noise) domains
# only, i.e. after clustering has already run.
INFRA_SUPPRESS = (
    r'\.akam\.net$|\.edgekey\.net$|\.azure-dns\.com$'
    r'|\.nsone\.net$|\.windowsupdate\.com$'
)

# Triage threshold for entropy score — above this warrants a closer look.
# Lowered from 2.5: typical noise peaks around 0.8-1.0; nothing exceeded 2.1
# in a calibration run, so 1.8 gives a practical weekly review list.
# The value is compared against the composite score returned by entropy(),
# which is a weighted heuristic, not raw Shannon bits.
THRESH_HIGH_ENTROPY = 1.8
56
+
57
+
58
+ # ---------------------------------------------------------------------------
59
+ # Known-good domain patterns (whitelist)
60
+ # Domains matching any of these are excluded before clustering.
61
+ # ---------------------------------------------------------------------------
62
+
63
# Each entry is (label, regex).  Patterns are compiled case-insensitively and
# matched with re.search, so a pattern without ^ or $ matches anywhere in the
# query name.  Order matters for categorize(): the first matching entry wins
# (e.g. 'sonos_ws' must stay ahead of the broader 'sonos').
#
# Patterns written as r'\.example\.com$' require a leading dot, so they match
# subdomains but not the bare apex "example.com"; the r'(^|\.)example\.com$'
# form matches both.
#
# NOTE(review): 'uuid', 'aws_ns' and several 'nameservers' alternatives are
# unanchored or only prefix-anchored.  For instance r'^ns\d*[-\.]' whitelists
# any name starting with "ns.", "ns1." or "ns-", whoever owns the domain.
# Confirm this breadth is intended for a threat-hunting whitelist.
PATTERNS = [
    ('reverse_dns', r'\.in-addr\.arpa$'),
    ('ipv6_arpa', r'\.ip6\.arpa$'),
    ('mdns_local', r'\.local$'),
    ('mdns_service', r'^_'),
    ('uuid', r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'),
    ('ntp', r'pool\.ntp\.org$|\.ntp\.org$'),
    ('akamai', r'\.akamai\.net$|\.akamaiedge\.net$|\.akamai\.com$'
     r'|\.akamaihd\.net$|\.akadns\.net$|\.akamaized\.net$'
     r'|\.akamaitechnologies\.com$'),
    ('apple_cdn', r'\.apple\.com$|\.icloud\.com$|\.aaplimg\.com$|\.apple-dns\.net$'),
    ('aws', r'\.amazonaws\.com$|\.awsglobalaccelerator\.com$|\.cloudfront\.net$'),
    ('google', r'\.googlevideo\.com$|\.googleapis\.com$|\.gstatic\.com$'
     r'|\.googleusercontent\.com$|\.googledomains\.com$|\.google\.com$'),
    ('azure', r'\.azurefd\.net$|\.azureedge\.net$|\.cloudapp\.azure\.com$'
     r'|\.azurewebsites\.net$|\.trafficmanager\.net$|\.windows\.net$'),
    ('sonos_ws', r'conn-i-[0-9a-f]+\..*\.sonos\.com$'),
    ('amazon_video', r'\.amazonvideo\.com$|\.amazon\.com$|\.amazonalexa\.com$|\.a2z\.com$'),
    ('oracle_idcs', r'\.oraclecloud\.com$|\.oracle\.com$'),
    ('sonos', r'\.sonos\.com$'),
    ('dropbox', r'\.dropbox\.com$|\.dropbox-dns\.com$'),
    ('zoom', r'\.zoom\.us$'),
    ('mozilla', r'\.mozilla\.net$|\.mozilla\.org$|\.mozgcp\.net$'),
    ('microsoft', r'\.microsoft\.com$|\.office\.com$|\.live\.com$'
     r'|\.skype\.com$|\.msidentity\.com$'),
    ('fastly', r'\.fastly\.net$|\.fastly-edge\.com$'),
    ('tinypass', r'\.tinypass\.com$'),
    ('atlassian', r'\.atlassian\.com$|\.atlassian-dev\.net$|\.atl-paas\.net$'),

    ('awsdns', r'(^|\.)awsdns-\d+\.\w+(\.\w+)?$'),
    ('aws_ns', r'ns-\d+\.awsdns'),
    ('awswaf', r'(^|\.)awswaf\.com$'),
    ('ovh_ns', r'ns\d+\.ovh\.net$|dns\d+\.ovh\.net$'),
    ('ultradns', r'\.ultradns\.(net|com|org|info|co\.uk)$'),
    ('azure_ns', r'ns\d+-\d+\.azure-dns\.(com|net|org|info)$'),
    ('backblaze', r'pod-\d+-\d+-\d+\.backblaze\.com$'
     r'|pod-\d{3}-\d{4}-\d{2}\.backblaze\.com$|ca\d+\.backblaze\.com$'),
    ('msedge', r'\.t-msedge\.net$|\.fb-t-msedge\.net$'),
    ('nameservers', r'^ns\d*[-\.]|\.awsdns-|\.ultradns\.|\.cloudns\.'
     r'|\.constellix\.|\.digicertdns\.|\.domaincontrol\.'),
    # Every name matched here is also matched by 'web_diag_aws' further down;
    # this entry only decides which label categorize() reports.
    ('diagnostic_dns', r'\.prod\.diagnostic\.networking\.aws\.dev$'),
    ('oracledns', r'\.dns\.oraclecloud\.net$'),
    ('sentinelone', r'\.sentinelone\.net$'),
    ('hcaptcha', r'\.hcaptcha\.com$'),
    ('sentry', r'\.sentry\.io$'),
    ('attlocal', r'\.attlocal\.net$'),
    ('msedge_cdn', r'\.(ax|bx|ln)-\d+\.(ax|bx|ln)(-dc)?-msedge\.net$'),
    ('splunk_telemetry', r'(^|\.)scs\.splunk\.com$|(^|\.)splunk\.com$'),
    ('netdata', r'(^|\.)netdata\.cloud$'),
    ('lenovo_mgmt', r'(^|\.)lenovo\.com$'),
    ('vdinfo_iot', r'(^|\.)vdinfo\.site$|(^|\.)kvaedit\.site$'),
    ('opendns_diag', r'^debug\.opendns\.com$'),
    ('rapid7', r'(^|\.)rapid7\.com$|(^|\.)r7ops\.com$|(^|\.)r7sec\.com$'),
    ('web_diag_aws', r'(^|\.)diagnostic\.networking\.aws\.dev$'),
]

# Pre-compile once at import time: is_whitelisted() and categorize() run the
# whole list against every query, so per-call compilation would be wasteful.
_COMPILED_PATTERNS = [(label, re.compile(pat, re.IGNORECASE)) for label, pat in PATTERNS]
121
+
122
+
123
def is_whitelisted(query: str) -> bool:
    """Return True when *query* matches at least one known-good pattern."""
    for _label, regex in _COMPILED_PATTERNS:
        if regex.search(query):
            return True
    return False
125
+
126
+
127
def categorize(query: str) -> str:
    """Return the label of the first whitelist pattern that matches *query*.

    Patterns are tried in list order; 'uncategorized' is returned when none
    of them matches.
    """
    matching_labels = (
        label for label, regex in _COMPILED_PATTERNS if regex.search(query)
    )
    return next(matching_labels, 'uncategorized')
132
+
133
+
134
+ # ---------------------------------------------------------------------------
135
+ # Feature engineering helpers
136
+ # ---------------------------------------------------------------------------
137
+
138
def q_len(q):
    """Return the total number of characters in the query name."""
    return len(q)
139
def q_parts(q):
    """Return the number of dot-separated labels in the query name."""
    # N separators always delimit N + 1 pieces (empty pieces included).
    return q.count('.') + 1
140
def q_suffix_len(q):
    """Return the length of the last dot-separated label (the TLD position)."""
    # rpartition yields the whole string as the tail when there is no dot.
    _head, _dot, last_label = q.rpartition('.')
    return len(last_label)
141
def q_domain_len(q):
    """Return the length of the second-to-last label of the query name.

    For "www.example.com" this is len("example") == 7.  Names with a single
    label (no dot) have no such label and yield 0, as do non-string values.
    """
    try:
        return len(q.split('.')[-2])
    except (AttributeError, IndexError):
        # IndexError: fewer than two labels.  AttributeError: not a string.
        # Deliberately narrow so KeyboardInterrupt / SystemExit are no
        # longer swallowed the way the previous bare ``except`` did.
        return 0
144
+
145
+
146
def summit(val):
    """Collapse a TTL field to a single Python float.

    Scalars (int or float) are passed through as ``float``; any other value
    is treated as a sequence of numbers and summed.  An empty sequence
    gives 0.0.

    The sum is done in float64: float32 cannot represent every integer above
    2**24, so large TTL totals were silently rounded.  The result is always a
    built-in ``float`` so both branches return the same type.
    """
    if isinstance(val, (int, float)):
        return float(val)
    return float(np.asarray(val, dtype=np.float64).sum())
151
+
152
+
153
def entropy(s: str) -> float:
    """
    Score how random-looking a domain label is.

    The score blends normalized Shannon entropy with simple character-class
    ratios (digits, vowels, distinct characters) and a penalty for long
    runs of one repeated character.  Larger values are more suspicious;
    the empty string scores 0.0.  Input is lower-cased first.
    """
    if not s:
        return 0.0

    text = s.lower()
    length = len(text)
    alphabet = set(text)

    # Shannon entropy over the label's own character frequencies
    weighted_logs = 0
    for ch in alphabet:
        p = text.count(ch) / length
        weighted_logs += p * math.log2(p)
    shannon = -weighted_logs

    # Character class ratios
    digit_ratio = sum(1 for ch in text if ch.isdigit()) / length
    vowel_ratio = sum(1 for ch in text if ch in 'aeiou') / length
    unique_ratio = len(alphabet) / length

    # Longest run of one repeated character ('aaa', '111'), relative to length
    longest = current = 1
    for prev, cur in zip(text, text[1:]):
        current = current + 1 if cur == prev else 1
        if current > longest:
            longest = current
    run_penalty = longest / length

    # Normalize against a ~36-symbol alphabet (a-z0-9)
    norm_entropy = shannon / math.log2(36)

    # Accumulate the weighted terms in the original left-to-right order.
    score = 1.5 * norm_entropy
    score = score + 0.5 * unique_ratio
    score = score + 1.0 * digit_ratio
    score = score - 0.5 * vowel_ratio
    score = score - 0.3 * run_penalty
    return score
192
+
193
+
194
+ # ---------------------------------------------------------------------------
195
+ # Load and prepare
196
+ # ---------------------------------------------------------------------------
197
+
198
def _read_ndjson(log_path: Path) -> tuple[list, int]:
    """Parse one JSON document per non-blank line; return (records, skipped)."""
    records = []
    skipped = 0
    # JSON text is UTF-8 by specification, so do not depend on the locale's
    # default encoding; undecodable bytes are replaced rather than aborting
    # the whole load.
    with open(log_path, encoding='utf-8', errors='replace') as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                skipped += 1
                # Only the first few failures are reported to keep output short.
                if skipped <= 5:
                    print(f" [!] Skipping line {i}: {e}")
    return records, skipped


def load_and_prepare(log_path: Path) -> tuple[pd.DataFrame, pd.Series, dict]:
    """
    Load dns.log, apply whitelist, engineer features.

    Returns (feature_df, query_series, stats_dict):
      * feature_df   — numeric/one-hot feature matrix, one row per kept query
      * query_series — the query names, row-aligned with feature_df
      * stats_dict   — raw_rows, after_wl, whitelisted, skipped, t_start,
                       t_end, span_h

    Raises:
        ValueError: if the file contains no parseable JSON records.
    """
    print(f"[+] Loading {log_path} ...")
    records, skipped = _read_ndjson(log_path)
    if not records:
        # Previously this surfaced later as an opaque KeyError on 'ts'.
        raise ValueError(f"no parseable JSON records in {log_path}")

    df_raw = pd.DataFrame(records)
    raw_rows = len(df_raw)

    t_start = pd.to_datetime(df_raw['ts'].min(), unit='s', utc=True)
    t_end = pd.to_datetime(df_raw['ts'].max(), unit='s', utc=True)
    span_h = (df_raw['ts'].max() - df_raw['ts'].min()) / 3600

    print(f" {raw_rows:,} rows | {t_start.strftime('%Y-%m-%d %H:%M')} → "
          f"{t_end.strftime('%Y-%m-%d %H:%M')} ({span_h:.1f}h)"
          + (f" [{skipped} lines skipped]" if skipped else ""))

    # Internet DNS only (qclass=1).  Rows without a query name are dropped as
    # well: they cannot be pattern-matched or measured, and a NaN here would
    # crash the regex whitelist below.
    df = df_raw[df_raw['qclass'] == 1].copy()
    df = df[df['query'].notna()].reset_index(drop=True)
    before_wl = len(df)
    # astype(bool) keeps the mask boolean even when the frame is empty.
    whitelisted = df['query'].apply(is_whitelisted).astype(bool)
    df = df[~whitelisted].reset_index(drop=True)
    after_wl = len(df)

    print(f" After qclass filter + whitelist: {after_wl:,} rows "
          f"({before_wl - after_wl:,} whitelisted)")

    # Save queries before feature engineering drops the column
    qs = df['query'].copy()

    # Drop metadata columns not useful for clustering
    drop_cols = [c for c in """ts uid id.orig_h id.orig_p id.resp_h id.resp_p
                 proto qclass qclass_name qtype_name rcode_name
                 AA RD RA Z trans_id rejected""".split() if c in df.columns]
    df.drop(columns=drop_cols, inplace=True)

    # A column is absent altogether when no record carried that field; create
    # it as all-missing so the fill logic below applies uniformly.
    for col in ('rtt', 'TTLs', 'rcode', 'answers', 'TC'):
        if col not in df.columns:
            df[col] = np.nan

    # --- Feature engineering
    # Median imputation; fall back to 0 when every rtt is missing, otherwise
    # the NaN median would leave NaNs in the feature matrix.
    rtt_median = df['rtt'].median()
    df['rtt'] = df['rtt'].fillna(0.0 if pd.isna(rtt_median) else rtt_median)
    df['TTLs'] = df['TTLs'].fillna(0).apply(summit)
    # log1p compresses the long right tail of both quantities.
    df['rtt'] = np.log1p(df['rtt'])
    df['TTLs'] = np.log1p(df['TTLs'])
    df['rcode'] = df['rcode'].fillna(-1)

    df['qlen'] = qs.apply(q_len)
    df['qparts'] = qs.apply(q_parts)
    df['sufflen'] = qs.apply(q_suffix_len)
    df['domlen'] = qs.apply(q_domain_len)

    # Replace the answer list by its length (0 when there was no answer).
    df['answers'] = df['answers'].apply(
        lambda x: len(x) if isinstance(x, list) else 0
    )
    df['TC'] = df['TC'].fillna(0).astype(int)

    # TLD one-hot (top 20 + 'other')
    df['TLD'] = qs.apply(lambda q: q.split('.')[-1])
    top_tlds = df['TLD'].value_counts().nlargest(20).index
    df['TLD'] = df['TLD'].where(df['TLD'].isin(top_tlds), 'other')
    df = pd.get_dummies(df, columns=['TLD'], drop_first=True)

    df.drop(columns='query', inplace=True)

    # Standardize numeric features.  A constant column has std 0 (and a
    # single-row frame has std NaN); dividing by that would put NaN/inf into
    # the matrix, so such columns are scaled by 1 and end up as all zeros.
    num_cols = ['rtt', 'TTLs', 'qlen', 'qparts', 'sufflen', 'domlen', 'answers']
    scale = df[num_cols].std().replace(0, 1).fillna(1)
    df[num_cols] = (df[num_cols] - df[num_cols].mean()) / scale

    stats = {
        'raw_rows': raw_rows,
        'after_wl': after_wl,
        'whitelisted': before_wl - after_wl,
        'skipped': skipped,
        't_start': t_start,
        't_end': t_end,
        'span_h': span_h,
    }

    return df, qs, stats
287
+
288
+
289
+ # ---------------------------------------------------------------------------
290
+ # Clustering
291
+ # ---------------------------------------------------------------------------
292
+
293
def run_clustering(df: pd.DataFrame,
                   min_cluster_size: int,
                   min_samples: int) -> np.ndarray:
    """Run HDBSCAN on the feature matrix and return one label per row.

    Args:
        df: feature matrix; every column must be convertible to float.
        min_cluster_size: passed to HDBSCAN as ``min_cluster_size``.
        min_samples: passed to HDBSCAN as ``min_samples``.

    Returns:
        The array returned by ``fit_predict``; callers treat label -1 as
        noise (unclustered).
    """
    print(f"[+] Clustering {len(df):,} records "
          f"(min_cluster_size={min_cluster_size}, min_samples={min_samples}) ...")
    # prediction_data is intentionally left at its default: the clusterer is
    # discarded when this function returns, so the cache it builds for
    # labelling new points later would never be used.
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        core_dist_n_jobs=-1,
    )
    # Request float64 explicitly: a frame mixing boolean one-hot columns with
    # float features would otherwise convert to an object-dtype array.
    labels = clusterer.fit_predict(df.to_numpy(dtype=np.float64))
    print(" Done.")
    return labels
308
+
309
+
310
+ # ---------------------------------------------------------------------------
311
+ # Plot
312
+ # ---------------------------------------------------------------------------
313
+
314
def make_plot(qs: pd.Series, labels: np.ndarray,
              noise_df: pd.DataFrame, stats: dict, out_path: Path):
    """Draw the two-panel summary figure and save it to *out_path*.

    Left panel: one bar per cluster label, with the noise label (-1) in red.
    Right panel: histogram of ``noise_df['label_entropy']`` with the
    THRESH_HIGH_ENTROPY cut-off marked.  *qs* is accepted but not used.
    """
    plt.style.use('dark_background')

    fig, (ax_sizes, ax_entropy) = plt.subplots(1, 2, figsize=(16, 6))
    first_day = stats['t_start'].strftime('%Y-%m-%d')
    last_day = stats['t_end'].strftime('%Y-%m-%d')
    date_str = first_day + ' – ' + last_day
    fig.suptitle(f"DNS Cluster Hunt | {date_str} ({stats['span_h']:.1f}h)",
                 fontsize=13, y=1.01)

    # --- Left: cluster size bar chart
    ordered_ids = sorted(set(labels))
    sizes, colors, tick_text = [], [], []
    for cid in ordered_ids:
        is_noise = cid == -1
        sizes.append(np.sum(labels == cid))
        # Noise is highlighted in red, real clusters in blue.
        colors.append('#ff4a4a' if is_noise else '#4a9eff')
        tick_text.append('noise' if is_noise else f'C{cid}')
    positions = range(len(ordered_ids))

    bars = ax_sizes.bar(positions, sizes, color=colors,
                        edgecolor='none', alpha=0.85)
    ax_sizes.set_xticks(positions)
    ax_sizes.set_xticklabels(tick_text, rotation=45, ha='right', fontsize=8)
    ax_sizes.set_xlabel('Cluster', fontsize=11)
    ax_sizes.set_ylabel('Query count', fontsize=11)
    real_clusters = sum(1 for cid in ordered_ids if cid >= 0)
    noise_points = np.sum(labels == -1)
    ax_sizes.set_title(
        f'Cluster Sizes\n({real_clusters} clusters + {noise_points:,} noise)',
        fontsize=10)

    # Write each bar's value just above its top edge
    for rect, count in zip(bars, sizes):
        x_mid = rect.get_x() + rect.get_width() / 2
        ax_sizes.text(x_mid, rect.get_height() + 50, f'{count:,}',
                      ha='center', va='bottom', fontsize=7, color='white')

    # --- Right: noise entropy distribution
    scores = noise_df['label_entropy']
    ax_entropy.hist(scores, bins=50,
                    color='#4aff9e', edgecolor='none', alpha=0.8)
    ax_entropy.axvline(THRESH_HIGH_ENTROPY, color='#ff4a4a', linestyle='--',
                       linewidth=1.2,
                       label=f'High threshold ({THRESH_HIGH_ENTROPY})')

    n_high = (scores >= THRESH_HIGH_ENTROPY).sum()
    y_top = ax_entropy.get_ylim()[1]
    ax_entropy.text(THRESH_HIGH_ENTROPY + 0.05, y_top * 0.85,
                    f"≥{THRESH_HIGH_ENTROPY}: {n_high:,} domains",
                    fontsize=9, color='white')

    ax_entropy.set_xlabel('Entropy Score', fontsize=11)
    ax_entropy.set_ylabel('Domain count', fontsize=11)
    ax_entropy.set_title(
        f'Noise Domain Entropy Distribution\n({len(noise_df):,} unclustered domains)',
        fontsize=10)
    ax_entropy.legend(fontsize=9)

    plt.tight_layout()
    plt.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.close()
    print(f"[+] Plot saved → {out_path}")
371
+
372
+
373
+ # ---------------------------------------------------------------------------
374
+ # Text report
375
+ # ---------------------------------------------------------------------------
376
+
377
def write_report(qs: pd.Series, labels: np.ndarray,
                 noise_df: pd.DataFrame, stats: dict,
                 log_path: Path, top_n: int, out_path: Path):
    """Build the text report, write it to *out_path* and echo it to stdout.

    Args:
        qs: query names, row-aligned with *labels*.
        labels: cluster label per row; -1 marks noise.
        noise_df: frame with 'query' and 'label_entropy' columns; the first
            *top_n* rows are listed as given, so the caller decides the order.
        stats: dict providing raw_rows, after_wl, whitelisted, t_start, t_end
            and span_h.
        log_path: shown in the report header only.
        top_n: number of rows of *noise_df* to list.
        out_path: destination file; written as UTF-8.
    """
    cluster_ids = sorted(c for c in set(labels) if c >= 0)
    n_clusters = len(cluster_ids)
    n_noise = int(np.sum(labels == -1))
    n_total = len(labels)
    scores = noise_df['label_entropy']
    n_high = int((scores >= THRESH_HIGH_ENTROPY).sum())
    # Guarded so an empty label array cannot raise ZeroDivisionError.
    noise_pct = n_noise / n_total * 100 if n_total else 0.0

    lines = []
    w = lines.append

    w("=" * 72)
    w(" DNS CLUSTER THREAT HUNT REPORT")
    w("=" * 72)
    w(f" Generated : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    w(f" Log file : {log_path}")
    w(f" Window : {stats['t_start'].strftime('%Y-%m-%d %H:%M')} UTC → "
      f"{stats['t_end'].strftime('%Y-%m-%d %H:%M')} UTC ({stats['span_h']:.1f}h)")
    w("")
    w(" DATA SUMMARY")
    w(" " + "-" * 40)
    w(f" Raw dns.log rows : {stats['raw_rows']:>10,}")
    w(f" After whitelist : {stats['after_wl']:>10,} ({stats['whitelisted']:,} whitelisted)")
    w(f" Clustered : {n_total:>10,}")
    w("")
    w(" CLUSTERING SUMMARY")
    w(" " + "-" * 40)
    w(f" Clusters found : {n_clusters:>10,}")
    w(f" Noise (unclustered): {n_noise:>10,} ({noise_pct:.1f}%)")
    w(f" High entropy noise : {n_high:>10,} (entropy ≥ {THRESH_HIGH_ENTROPY})")
    w("")

    # Cluster breakdown
    w(" CLUSTER BREAKDOWN")
    w(" " + "-" * 40)
    w(f" {'ID':>4} {'SIZE':>8} {'PCT':>6} SAMPLE DOMAINS")
    w(" " + "-" * 40)
    for cid in cluster_ids:
        mask = labels == cid
        size = mask.sum()
        pct = size / n_total * 100
        # Up to four distinct names, in order of first appearance.
        samples = qs[mask].unique()[:4]
        sample_str = ' '.join(samples)
        w(f" {cid:>4} {size:>8,} {pct:>5.1f}% {sample_str}")
    w("")

    # Entropy distribution of noise.  The outer bins are open-ended: the
    # composite score can be negative (vowel and run penalties), so a lowest
    # bin starting at 0.0 silently left those domains out of the histogram.
    w(" NOISE ENTROPY DISTRIBUTION")
    w(" " + "-" * 40)
    bins = [(3.0, math.inf), (2.5, 3.0), (2.0, 2.5), (1.5, 2.0), (-math.inf, 1.5)]
    for lo, hi in bins:
        n = ((scores >= lo) & (scores < hi)).sum()
        bar = '█' * int(n / max(len(noise_df), 1) * 40)
        lo_str = f"{lo:.1f}" if math.isfinite(lo) else " -∞"
        hi_str = f"{hi:.1f}" if math.isfinite(hi) else " ∞ "
        w(f" {lo_str}–{hi_str} : {n:6,} {bar}")
    w("")

    # Top N high entropy domains
    w(f" TOP {top_n} DOMAINS BY ENTROPY SCORE")
    w(f" (unclustered noise only — whitelisted domains already excluded)")
    w(" " + "-" * 50)
    w(f" {'ENTROPY':>8} DOMAIN")
    w(" " + "-" * 50)
    for _, row in noise_df.head(top_n).iterrows():
        flag = " ◄ HIGH" if row['label_entropy'] >= THRESH_HIGH_ENTROPY else ""
        w(f" {row['label_entropy']:>8.3f} {row['query']}{flag}")
    w("")
    w("=" * 72)
    w(" END OF REPORT")
    w("=" * 72)

    report_text = "\n".join(lines)
    # The report contains non-ASCII symbols (█ ≥ → ◄); writing with the
    # locale's default encoding can raise UnicodeEncodeError on some systems.
    out_path.write_text(report_text, encoding='utf-8')
    print(report_text)
    print(f"\n[+] Report saved → {out_path}")
455
+
456
+
457
+ # ---------------------------------------------------------------------------
458
+ # Main
459
+ # ---------------------------------------------------------------------------
460
+
461
def main():
    """Command-line entry point: load, cluster, score noise, write outputs.

    Writes three timestamped files into --out-dir: a CSV of unclustered
    domains with entropy scores, a PNG plot and a text report.  Exits with
    status 1 when the log path is not a regular file or when nothing is left
    to cluster after filtering.
    """
    parser = argparse.ArgumentParser(
        description="DNS cluster threat hunt — Zeek dns.log → cluster report"
    )
    parser.add_argument("log", type=Path, help="Path to Zeek dns.log (ndjson)")
    parser.add_argument("--top", type=int, default=250,
                        help="Top N entropy domains in report (default: 250)")
    parser.add_argument("--min-size", type=int, default=MIN_CLUSTER_SIZE,
                        help=f"HDBSCAN min_cluster_size (default: {MIN_CLUSTER_SIZE})")
    parser.add_argument("--min-samples", type=int, default=MIN_SAMPLES,
                        help=f"HDBSCAN min_samples (default: {MIN_SAMPLES})")
    parser.add_argument("--out-dir", type=Path, default=Path("hunt_output"),
                        help="Output directory (default: ./hunt_output/)")
    args = parser.parse_args()

    # is_file() rather than exists(): a directory path would pass exists()
    # and only fail later inside open().
    if not args.log.is_file():
        print(f"[!] Log file not found: {args.log}", file=sys.stderr)
        sys.exit(1)

    args.out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # --- Run pipeline
    df_features, qs, stats = load_and_prepare(args.log)

    # With zero rows there is nothing to cluster, and the percentage below
    # would divide by zero; stop with a clear message instead.
    if df_features.empty:
        print("[!] No DNS records left to cluster after filtering",
              file=sys.stderr)
        sys.exit(1)

    labels = run_clustering(df_features, args.min_size, args.min_samples)

    # Build noise DataFrame with entropy scores.  Only the leftmost label of
    # each unclustered name is scored.
    noise_mask = labels == -1
    noise_queries = np.unique(qs[noise_mask].values)
    noise_df = pd.DataFrame({'query': noise_queries})
    noise_df['label_entropy'] = noise_df['query'].apply(
        lambda q: entropy(q.split('.')[0])
    )
    # Suppress remaining infra noise
    noise_df = noise_df[~noise_df['query'].str.contains(
        INFRA_SUPPRESS, case=False, regex=True
    )]
    # Highest score first: the report lists the head of this frame.
    noise_df = noise_df.sort_values('label_entropy', ascending=False).reset_index(drop=True)

    n_clusters = len(set(labels) - {-1})
    n_noise = int(noise_mask.sum())
    print(f" {n_clusters} clusters | {n_noise:,} noise records "
          f"({n_noise/len(labels)*100:.1f}%) | "
          f"{len(noise_df):,} unique noise domains")

    # --- Write outputs
    csv_path = args.out_dir / f"dns_domains_{stamp}.csv"
    report_path = args.out_dir / f"dns_report_{stamp}.txt"
    plot_path = args.out_dir / f"dns_plot_{stamp}.png"

    noise_df.to_csv(csv_path, index=False)
    print(f"[+] CSV saved → {csv_path}")

    make_plot(qs, labels, noise_df, stats, plot_path)
    write_report(qs, labels, noise_df, stats, args.log, args.top, report_path)
519
+ if __name__ == "__main__":
520
+ main()