loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
migrations/dns_dbscan.py
ADDED
|
@@ -0,0 +1,520 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
dns_cluster.py — weekly DNS clustering threat hunt
|
|
4
|
+
Zeek dns.log (ndjson) → HDBSCAN cluster analysis + entropy-ranked noise report
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
python dns_cluster.py data/dns/dns.log
|
|
8
|
+
python dns_cluster.py data/dns/dns.log --top 100 --min-size 300
|
|
9
|
+
python dns_cluster.py data/dns/dns.log --out-dir /tmp/hunt
|
|
10
|
+
|
|
11
|
+
Outputs (written to --out-dir, default ./hunt_output/):
|
|
12
|
+
dns_report_<timestamp>.txt — full text report + top entropy domains
|
|
13
|
+
dns_domains_<timestamp>.csv — noise domains with entropy scores
|
|
14
|
+
dns_plot_<timestamp>.png — cluster size chart + entropy distribution
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import math
|
|
20
|
+
import re
|
|
21
|
+
import sys
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
from pathlib import Path
|
|
24
|
+
|
|
25
|
+
import matplotlib
|
|
26
|
+
matplotlib.use("Agg")
|
|
27
|
+
import matplotlib.pyplot as plt
|
|
28
|
+
import numpy as np
|
|
29
|
+
import pandas as pd
|
|
30
|
+
import hdbscan
|
|
31
|
+
from sklearn.preprocessing import StandardScaler
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ---------------------------------------------------------------------------
|
|
35
|
+
# Configuration
|
|
36
|
+
# ---------------------------------------------------------------------------
|
|
37
|
+
|
|
38
|
+
# HDBSCAN ``min_cluster_size``: larger values give fewer, more meaningful
# clusters.  500 produced 299 clusters with heavy fragmentation; 2000 is more
# appropriate for a week of traffic (~600K post-whitelist queries).
MIN_CLUSTER_SIZE = 2000

# HDBSCAN ``min_samples``: controls how conservative cluster membership is.
MIN_SAMPLES = 100

# Extra infrastructure noise to hide from the entropy report: patterns that
# survive the whitelist but are not interesting to review.
INFRA_SUPPRESS = (
    r'\.akam\.net$|\.edgekey\.net$|\.azure-dns\.com$'
    r'|\.nsone\.net$|\.windowsupdate\.com$'
)

# Triage threshold for the entropy score; anything at or above it warrants a
# closer look.  Lowered from 2.5: typical noise peaks around 0.8-1.0 and
# nothing exceeded 2.1 in a calibration run, so 1.8 gives a practical weekly
# review list.
THRESH_HIGH_ENTROPY = 1.8
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
# ---------------------------------------------------------------------------
|
|
59
|
+
# Known-good domain patterns (whitelist)
|
|
60
|
+
# Domains matching any of these are excluded before clustering.
|
|
61
|
+
# ---------------------------------------------------------------------------
|
|
62
|
+
|
|
63
|
+
# (label, regex) pairs.  Order matters: the first matching entry supplies the
# label reported for a query, so more specific patterns (e.g. 'sonos_ws') are
# listed ahead of broader ones (e.g. 'sonos').
PATTERNS = [
    # --- Local / reverse / service-discovery names
    ('reverse_dns', r'\.in-addr\.arpa$'),
    ('ipv6_arpa', r'\.ip6\.arpa$'),
    ('mdns_local', r'\.local$'),
    ('mdns_service', r'^_'),
    ('uuid', r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'),
    ('ntp', r'pool\.ntp\.org$|\.ntp\.org$'),

    # --- Large CDNs, clouds and vendors
    ('akamai',
     r'\.akamai\.net$|\.akamaiedge\.net$|\.akamai\.com$'
     r'|\.akamaihd\.net$|\.akadns\.net$|\.akamaized\.net$'
     r'|\.akamaitechnologies\.com$'),
    ('apple_cdn',
     r'\.apple\.com$|\.icloud\.com$|\.aaplimg\.com$|\.apple-dns\.net$'),
    ('aws',
     r'\.amazonaws\.com$|\.awsglobalaccelerator\.com$|\.cloudfront\.net$'),
    ('google',
     r'\.googlevideo\.com$|\.googleapis\.com$|\.gstatic\.com$'
     r'|\.googleusercontent\.com$|\.googledomains\.com$|\.google\.com$'),
    ('azure',
     r'\.azurefd\.net$|\.azureedge\.net$|\.cloudapp\.azure\.com$'
     r'|\.azurewebsites\.net$|\.trafficmanager\.net$|\.windows\.net$'),
    ('sonos_ws', r'conn-i-[0-9a-f]+\..*\.sonos\.com$'),
    ('amazon_video',
     r'\.amazonvideo\.com$|\.amazon\.com$|\.amazonalexa\.com$|\.a2z\.com$'),
    ('oracle_idcs', r'\.oraclecloud\.com$|\.oracle\.com$'),
    ('sonos', r'\.sonos\.com$'),
    ('dropbox', r'\.dropbox\.com$|\.dropbox-dns\.com$'),
    ('zoom', r'\.zoom\.us$'),
    ('mozilla', r'\.mozilla\.net$|\.mozilla\.org$|\.mozgcp\.net$'),
    ('microsoft',
     r'\.microsoft\.com$|\.office\.com$|\.live\.com$'
     r'|\.skype\.com$|\.msidentity\.com$'),
    ('fastly', r'\.fastly\.net$|\.fastly-edge\.com$'),
    ('tinypass', r'\.tinypass\.com$'),
    ('atlassian',
     r'\.atlassian\.com$|\.atlassian-dev\.net$|\.atl-paas\.net$'),

    # --- Name-server infrastructure and assorted tooling
    ('awsdns', r'(^|\.)awsdns-\d+\.\w+(\.\w+)?$'),
    ('aws_ns', r'ns-\d+\.awsdns'),
    ('awswaf', r'(^|\.)awswaf\.com$'),
    ('ovh_ns', r'ns\d+\.ovh\.net$|dns\d+\.ovh\.net$'),
    ('ultradns', r'\.ultradns\.(net|com|org|info|co\.uk)$'),
    ('azure_ns', r'ns\d+-\d+\.azure-dns\.(com|net|org|info)$'),
    ('backblaze',
     r'pod-\d+-\d+-\d+\.backblaze\.com$'
     r'|pod-\d{3}-\d{4}-\d{2}\.backblaze\.com$|ca\d+\.backblaze\.com$'),
    ('msedge', r'\.t-msedge\.net$|\.fb-t-msedge\.net$'),
    ('nameservers',
     r'^ns\d*[-\.]|\.awsdns-|\.ultradns\.|\.cloudns\.'
     r'|\.constellix\.|\.digicertdns\.|\.domaincontrol\.'),
    ('diagnostic_dns', r'\.prod\.diagnostic\.networking\.aws\.dev$'),
    ('oracledns', r'\.dns\.oraclecloud\.net$'),
    ('sentinelone', r'\.sentinelone\.net$'),
    ('hcaptcha', r'\.hcaptcha\.com$'),
    ('sentry', r'\.sentry\.io$'),
    ('attlocal', r'\.attlocal\.net$'),
    ('msedge_cdn', r'\.(ax|bx|ln)-\d+\.(ax|bx|ln)(-dc)?-msedge\.net$'),
    ('splunk_telemetry', r'(^|\.)scs\.splunk\.com$|(^|\.)splunk\.com$'),
    ('netdata', r'(^|\.)netdata\.cloud$'),
    ('lenovo_mgmt', r'(^|\.)lenovo\.com$'),
    ('vdinfo_iot', r'(^|\.)vdinfo\.site$|(^|\.)kvaedit\.site$'),
    ('opendns_diag', r'^debug\.opendns\.com$'),
    ('rapid7', r'(^|\.)rapid7\.com$|(^|\.)r7ops\.com$|(^|\.)r7sec\.com$'),
    ('web_diag_aws', r'(^|\.)diagnostic\.networking\.aws\.dev$'),
]

# Compile once at import time; every query is tested against these regexes.
_COMPILED_PATTERNS = [
    (name, re.compile(regex, re.IGNORECASE)) for name, regex in PATTERNS
]
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def is_whitelisted(query: str) -> bool:
    """Return True if *query* matches at least one known-good pattern.

    Matching uses ``search`` on the pre-compiled, case-insensitive regexes in
    ``_COMPILED_PATTERNS`` and stops at the first hit.
    """
    for _label, regex in _COMPILED_PATTERNS:
        if regex.search(query):
            return True
    return False
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def categorize(query: str) -> str:
    """Return the label of the first whitelist pattern that matches *query*.

    Patterns are tried in ``_COMPILED_PATTERNS`` order; ``'uncategorized'``
    is returned when none of them matches.
    """
    matching_labels = (
        label for label, regex in _COMPILED_PATTERNS if regex.search(query)
    )
    return next(matching_labels, 'uncategorized')
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# ---------------------------------------------------------------------------
|
|
135
|
+
# Feature engineering helpers
|
|
136
|
+
# ---------------------------------------------------------------------------
|
|
137
|
+
|
|
138
|
+
def q_len(q):
    """Return the total character length of query name *q*."""
    length = len(q)
    return length
|
|
139
|
+
def q_parts(q):
    """Return the number of dot-separated labels in *q* (empty ones count)."""
    # Splitting on '.' always yields exactly one more piece than there are dots.
    return q.count('.') + 1
|
|
140
|
+
def q_suffix_len(q):
    """Return the length of the last dot-separated label (the TLD) of *q*."""
    # rpartition leaves the whole string in the tail when there is no dot,
    # which matches what split('.')[-1] yields.
    _head, _dot, suffix = q.rpartition('.')
    return len(suffix)
|
|
141
|
+
def q_domain_len(q):
    """Return the length of the second-to-last dot-separated label of *q*.

    For ``www.example.com`` this is ``len('example')``.  Names with fewer
    than two labels have no such label and yield 0.

    The single-label case is tested explicitly instead of relying on a bare
    ``except``, which would also swallow KeyboardInterrupt and genuine bugs.
    """
    labels = q.split('.')
    if len(labels) < 2:
        return 0
    return len(labels[-2])
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def summit(val):
    """Sum a TTL list, or pass a scalar through, always as a built-in float.

    Args:
        val: an ``int``/``float`` scalar, or any sequence NumPy can convert
            to a numeric array (e.g. a list of TTL values).

    Returns:
        ``float(val)`` for scalars, otherwise the sum of the elements.  An
        empty sequence sums to 0.0.
    """
    if isinstance(val, (int, float)):
        return float(val)
    # Accumulate in float64: float32 cannot represent integers above 2**24
    # exactly, so large TTL sums would be silently rounded.  Converting the
    # result gives both branches the same return type.
    return float(np.asarray(val, dtype=np.float64).sum())
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def entropy(s: str) -> float:
    """Composite entropy score for a domain label.

    Combines normalised Shannon entropy with character-class heuristics to
    distinguish DGA/random labels from human-readable ones.  Higher score =
    more suspicious.

    The score is::

        1.5 * norm_entropy + 0.5 * unique_ratio + 1.0 * digits
            - 0.5 * vowels - 0.3 * run_penalty

    so it can be negative for short, vowel-heavy or repetitive labels.

    Args:
        s: the label to score; it is lower-cased before scoring.

    Returns:
        The score, or 0.0 for an empty string.
    """
    if not s:
        return 0.0
    s = s.lower()
    n = len(s)

    # Single-pass character tally.  A dict keeps insertion order, so the
    # Shannon sum runs in a reproducible order instead of hash-randomised
    # set order, and the count costs O(n) rather than O(n * alphabet).
    counts: dict[str, int] = {}
    for ch in s:
        counts[ch] = counts.get(ch, 0) + 1

    # Shannon entropy
    probs = [k / n for k in counts.values()]
    shannon = -sum(p * math.log2(p) for p in probs)

    # Character class ratios
    digits = sum(ch.isdigit() for ch in s) / n
    vowels = sum(ch in 'aeiou' for ch in s) / n
    unique_ratio = len(counts) / n

    # Repetition penalty: longest run of one repeated character ('aaa', '111')
    max_run = run = 1
    for prev, cur in zip(s, s[1:]):
        run = run + 1 if cur == prev else 1
        max_run = max(max_run, run)
    run_penalty = max_run / n

    # Normalise by the entropy of a uniform ~36-character alphabet (a-z0-9)
    norm_entropy = shannon / math.log2(36)

    return (
        1.5 * norm_entropy +
        0.5 * unique_ratio +
        1.0 * digits -
        0.5 * vowels -
        0.3 * run_penalty
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
# ---------------------------------------------------------------------------
|
|
195
|
+
# Load and prepare
|
|
196
|
+
# ---------------------------------------------------------------------------
|
|
197
|
+
|
|
198
|
+
def load_and_prepare(log_path: Path) -> tuple[pd.DataFrame, pd.Series, dict]:
    """Load an ndjson dns.log, apply the whitelist, and engineer features.

    Steps:
      1. Parse the file line by line as JSON.  Malformed lines are counted
         and skipped; only the first five are reported.
      2. Keep ``qclass == 1`` rows with a string ``query``, then drop rows
         whose query is whitelisted.
      3. Build the feature frame: log-scaled ``rtt`` and summed ``TTLs``,
         query-shape features, answer count, ``TC`` flag, and a one-hot of
         the 20 most common TLDs (everything else is ``'other'``).
      4. Z-score the numeric features.

    Args:
        log_path: path to the ndjson log file.

    Returns:
        ``(feature_df, query_series, stats_dict)``.  ``query_series`` is
        row-aligned with ``feature_df``.  ``stats_dict`` holds ``raw_rows``,
        ``after_wl``, ``whitelisted``, ``skipped``, ``t_start``, ``t_end``
        and ``span_h``.

    Raises:
        ValueError: if the file contains no parseable JSON records.
        KeyError: if the records lack ``ts``, ``qclass`` or ``query``.
    """
    print(f"[+] Loading {log_path} ...")
    records = []
    skipped = 0
    # Decode explicitly as UTF-8 so parsing does not depend on the platform
    # default; a stray undecodable byte is replaced instead of aborting.
    with open(log_path, encoding="utf-8", errors="replace") as f:
        for i, line in enumerate(f, 1):
            line = line.strip()
            if not line:
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as e:
                skipped += 1
                if skipped <= 5:
                    print(f" [!] Skipping line {i}: {e}")

    if not records:
        # An empty frame would otherwise fail below with KeyError: 'ts'.
        raise ValueError(f"no parseable JSON records found in {log_path}")

    df_raw = pd.DataFrame(records)
    raw_rows = len(df_raw)

    ts_min = df_raw['ts'].min()
    ts_max = df_raw['ts'].max()
    t_start = pd.to_datetime(ts_min, unit='s', utc=True)
    t_end = pd.to_datetime(ts_max, unit='s', utc=True)
    span_h = (ts_max - ts_min) / 3600

    print(f" {raw_rows:,} rows | {t_start.strftime('%Y-%m-%d %H:%M')} → "
          f"{t_end.strftime('%Y-%m-%d %H:%M')} ({span_h:.1f}h)"
          + (f" [{skipped} lines skipped]" if skipped else ""))

    # Internet DNS only (qclass=1)
    df = df_raw[df_raw['qclass'] == 1].copy().reset_index(drop=True)

    # Rows with no query name (NaN after DataFrame construction) cannot be
    # regex-matched or measured; drop them before they reach the whitelist.
    has_query = df['query'].map(lambda q: isinstance(q, str)).astype(bool)
    df = df[has_query].reset_index(drop=True)

    # Drop whitelisted domains
    before_wl = len(df)
    whitelisted = df['query'].map(is_whitelisted).astype(bool)
    df = df[~whitelisted].reset_index(drop=True)
    after_wl = len(df)

    print(f" After qclass filter + whitelist: {after_wl:,} rows "
          f"({before_wl - after_wl:,} whitelisted)")

    # Save queries before feature engineering drops the column
    qs = df['query'].copy()

    # Drop metadata columns not useful for clustering
    drop_cols = [c for c in """ts uid id.orig_h id.orig_p id.resp_h id.resp_p
                 proto qclass qclass_name qtype_name rcode_name
                 AA RD RA Z trans_id rejected""".split() if c in df.columns]
    df.drop(columns=drop_cols, inplace=True)

    # A column is absent from the frame when no record carried that key;
    # materialise it as all-missing so the steps below apply uniformly.
    for col in ('rtt', 'TTLs', 'rcode', 'answers', 'TC'):
        if col not in df.columns:
            df[col] = np.nan

    # --- Feature engineering
    # Median imputation; if every rtt is missing the median is NaN too, so
    # fall back to 0 to keep NaN out of the feature matrix.
    df['rtt'] = df['rtt'].fillna(df['rtt'].median()).fillna(0.0)
    df['TTLs'] = df['TTLs'].fillna(0).apply(summit)
    df['rtt'] = np.log1p(df['rtt'])
    df['TTLs'] = np.log1p(df['TTLs'])
    df['rcode'] = df['rcode'].fillna(-1)

    df['qlen'] = qs.apply(q_len)
    df['qparts'] = qs.apply(q_parts)
    df['sufflen'] = qs.apply(q_suffix_len)
    df['domlen'] = qs.apply(q_domain_len)

    # Replace the answer list by its length (0 when there is no list)
    df['answers'] = df['answers'].apply(
        lambda x: len(x) if isinstance(x, list) else 0
    )
    df['TC'] = df['TC'].fillna(0).astype(int)

    # TLD one-hot (top 20 + 'other').  dtype=float keeps the frame purely
    # numeric; bool dummies next to float columns can make to_numpy() return
    # an object array.
    df['TLD'] = qs.apply(lambda q: q.split('.')[-1])
    top_tlds = df['TLD'].value_counts().nlargest(20).index
    df['TLD'] = df['TLD'].where(df['TLD'].isin(top_tlds), 'other')
    df = pd.get_dummies(df, columns=['TLD'], drop_first=True, dtype=float)

    df.drop(columns='query', inplace=True)

    # Standardize numeric features.  A constant column has std 0 (and a
    # single-row frame has std NaN); dividing by either would put NaN/inf in
    # the matrix, so those columns are only centred.
    num_cols = ['rtt', 'TTLs', 'qlen', 'qparts', 'sufflen', 'domlen', 'answers']
    col_mean = df[num_cols].mean()
    col_std = df[num_cols].std().replace(0, 1.0).fillna(1.0)
    df[num_cols] = (df[num_cols] - col_mean) / col_std

    stats = {
        'raw_rows': raw_rows,
        'after_wl': after_wl,
        'whitelisted': before_wl - after_wl,
        'skipped': skipped,
        't_start': t_start,
        't_end': t_end,
        'span_h': span_h,
    }

    return df, qs, stats
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
# ---------------------------------------------------------------------------
|
|
290
|
+
# Clustering
|
|
291
|
+
# ---------------------------------------------------------------------------
|
|
292
|
+
|
|
293
|
+
def run_clustering(df: pd.DataFrame,
                   min_cluster_size: int,
                   min_samples: int) -> np.ndarray:
    """Run HDBSCAN on the feature matrix and return the label array.

    Args:
        df: all-numeric feature frame, one row per DNS record.
        min_cluster_size: forwarded to ``hdbscan.HDBSCAN``.
        min_samples: forwarded to ``hdbscan.HDBSCAN``.

    Returns:
        One integer label per row of *df*, as returned by ``fit_predict``.
        The rest of this script treats ``-1`` as noise.

    Raises:
        ValueError: if *df* holds a column that cannot be converted to float.
    """
    print(f"[+] Clustering {len(df):,} records "
          f"(min_cluster_size={min_cluster_size}, min_samples={min_samples}) ...")
    # prediction_data is left at its default: the clusterer is discarded
    # after fitting, so the cache for scoring new points would be wasted work.
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        core_dist_n_jobs=-1,
    )
    # Force a float64 matrix: mixed bool/float columns would otherwise come
    # back as an object array, and a non-numeric column now fails loudly here.
    features = df.to_numpy(dtype=np.float64)
    labels = clusterer.fit_predict(features)
    print(" Done.")
    return labels
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# ---------------------------------------------------------------------------
|
|
311
|
+
# Plot
|
|
312
|
+
# ---------------------------------------------------------------------------
|
|
313
|
+
|
|
314
|
+
def make_plot(qs: pd.Series, labels: np.ndarray,
              noise_df: pd.DataFrame, stats: dict, out_path: Path):
    """Render the two-panel summary figure and save it to *out_path*.

    Left panel: bar chart of record counts per cluster label, with the noise
    label (-1) in red.  Right panel: histogram of
    ``noise_df['label_entropy']`` with ``THRESH_HIGH_ENTROPY`` marked.

    Args:
        qs: query series; kept for signature compatibility, not used here.
        labels: cluster label per record (``-1`` = noise).
        noise_df: frame with a ``label_entropy`` column.
        stats: needs ``t_start``, ``t_end`` (objects with ``strftime``) and
            ``span_h`` (hours, numeric).
        out_path: destination image path.
    """
    labels = np.asarray(labels)
    # One pass gives the sorted label ids together with their sizes.
    cluster_ids, cluster_sizes = np.unique(labels, return_counts=True)
    n_clusters = int((cluster_ids >= 0).sum())
    n_noise = int((labels == -1).sum())

    date_str = (stats['t_start'].strftime('%Y-%m-%d') + ' – ' +
                stats['t_end'].strftime('%Y-%m-%d'))

    # Scope the dark style to this figure instead of changing global rcParams.
    with plt.style.context('dark_background'):
        fig, (ax, ax2) = plt.subplots(1, 2, figsize=(16, 6))
        try:
            fig.suptitle(
                f"DNS Cluster Hunt | {date_str} ({stats['span_h']:.1f}h)",
                fontsize=13, y=1.01)

            # --- Left: cluster size bar chart (noise red, clusters blue)
            colors = ['#ff4a4a' if c == -1 else '#4a9eff' for c in cluster_ids]
            bar_labels = ['noise' if c == -1 else f'C{c}' for c in cluster_ids]
            positions = range(len(cluster_ids))

            bars = ax.bar(positions, cluster_sizes, color=colors,
                          edgecolor='none', alpha=0.85)
            ax.set_xticks(positions)
            ax.set_xticklabels(bar_labels, rotation=45, ha='right', fontsize=8)
            ax.set_xlabel('Cluster', fontsize=11)
            ax.set_ylabel('Query count', fontsize=11)
            ax.set_title(f'Cluster Sizes\n({n_clusters} clusters '
                         f'+ {n_noise:,} noise)', fontsize=10)

            # Annotate bar values just above each bar (offset in data units)
            for bar, size in zip(bars, cluster_sizes):
                ax.text(bar.get_x() + bar.get_width() / 2,
                        bar.get_height() + 50,
                        f'{int(size):,}', ha='center', va='bottom',
                        fontsize=7, color='white')

            # --- Right: noise entropy distribution
            ax2.hist(noise_df['label_entropy'], bins=50,
                     color='#4aff9e', edgecolor='none', alpha=0.8)
            ax2.axvline(THRESH_HIGH_ENTROPY, color='#ff4a4a', linestyle='--',
                        linewidth=1.2,
                        label=f'High threshold ({THRESH_HIGH_ENTROPY})')

            n_high = (noise_df['label_entropy'] >= THRESH_HIGH_ENTROPY).sum()
            ax2.text(THRESH_HIGH_ENTROPY + 0.05,
                     ax2.get_ylim()[1] * 0.85,
                     f"≥{THRESH_HIGH_ENTROPY}: {n_high:,} domains",
                     fontsize=9, color='white')

            ax2.set_xlabel('Entropy Score', fontsize=11)
            ax2.set_ylabel('Domain count', fontsize=11)
            ax2.set_title(
                f'Noise Domain Entropy Distribution\n'
                f'({len(noise_df):,} unclustered domains)',
                fontsize=10)
            ax2.legend(fontsize=9)

            fig.tight_layout()
            fig.savefig(out_path, dpi=150, bbox_inches='tight')
        finally:
            # Close this figure specifically, even if drawing or saving failed.
            plt.close(fig)
    print(f"[+] Plot saved → {out_path}")
|
|
371
|
+
|
|
372
|
+
|
|
373
|
+
# ---------------------------------------------------------------------------
|
|
374
|
+
# Text report
|
|
375
|
+
# ---------------------------------------------------------------------------
|
|
376
|
+
|
|
377
|
+
def write_report(qs: pd.Series, labels: np.ndarray,
                 noise_df: pd.DataFrame, stats: dict,
                 log_path: Path, top_n: int, out_path: Path):
    """Build the text report, write it to *out_path*, and echo it to stdout.

    Sections: header, data summary, clustering summary, per-cluster
    breakdown (up to four sample domains each), a bucketed histogram of
    noise entropy scores, and the first *top_n* rows of *noise_df*.

    Args:
        qs: query series, row-aligned with *labels*.
        labels: cluster label per record (``-1`` = noise).
        noise_df: frame with ``query`` and ``label_entropy`` columns; rows
            are listed in the order given (the caller sorts by entropy).
        stats: needs ``t_start``, ``t_end``, ``span_h``, ``raw_rows``,
            ``after_wl`` and ``whitelisted``.
        log_path: source log path, shown in the header.
        top_n: number of noise domains to list.
        out_path: destination text file, written as UTF-8.
    """
    labels = np.asarray(labels)
    cluster_ids = sorted(c for c in set(labels) if c >= 0)
    n_clusters = len(cluster_ids)
    n_noise = int(np.sum(labels == -1))
    n_total = len(labels)
    n_high = int((noise_df['label_entropy'] >= THRESH_HIGH_ENTROPY).sum())
    # Guard the percentage against an empty label array.
    noise_pct = n_noise / n_total * 100 if n_total else 0.0

    lines = []
    w = lines.append

    w("=" * 72)
    w(" DNS CLUSTER THREAT HUNT REPORT")
    w("=" * 72)
    w(f" Generated : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    w(f" Log file : {log_path}")
    w(f" Window : {stats['t_start'].strftime('%Y-%m-%d %H:%M')} UTC → "
      f"{stats['t_end'].strftime('%Y-%m-%d %H:%M')} UTC ({stats['span_h']:.1f}h)")
    w("")
    w(" DATA SUMMARY")
    w(" " + "-" * 40)
    w(f" Raw dns.log rows : {stats['raw_rows']:>10,}")
    w(f" After whitelist : {stats['after_wl']:>10,} ({stats['whitelisted']:,} whitelisted)")
    w(f" Clustered : {n_total:>10,}")
    w("")
    w(" CLUSTERING SUMMARY")
    w(" " + "-" * 40)
    w(f" Clusters found : {n_clusters:>10,}")
    w(f" Noise (unclustered): {n_noise:>10,} ({noise_pct:.1f}%)")
    w(f" High entropy noise : {n_high:>10,} (entropy ≥ {THRESH_HIGH_ENTROPY})")
    w("")

    # Cluster breakdown
    w(" CLUSTER BREAKDOWN")
    w(" " + "-" * 40)
    w(f" {'ID':>4} {'SIZE':>8} {'PCT':>6} SAMPLE DOMAINS")
    w(" " + "-" * 40)
    for cid in cluster_ids:
        mask = labels == cid
        size = int(mask.sum())
        pct = size / n_total * 100
        # First four distinct domains in the cluster, in order of appearance
        sample_str = ' '.join(qs[mask].unique()[:4])
        w(f" {int(cid):>4} {size:>8,} {pct:>5.1f}% {sample_str}")
    w("")

    # Entropy distribution of noise.  Both outer buckets are open-ended:
    # the composite score can be negative, and a 0.0 lower bound would drop
    # those domains from the histogram.
    w(" NOISE ENTROPY DISTRIBUTION")
    w(" " + "-" * 40)
    bins = [(3.0, math.inf), (2.5, 3.0), (2.0, 2.5), (1.5, 2.0),
            (-math.inf, 1.5)]
    scores = noise_df['label_entropy']
    denom = max(len(noise_df), 1)
    for lo, hi in bins:
        n = int(((scores >= lo) & (scores < hi)).sum())
        bar = '█' * int(n / denom * 40)
        lo_str = f"{lo:.1f}" if math.isfinite(lo) else " -∞"
        hi_str = f"{hi:.1f}" if math.isfinite(hi) else " ∞ "
        w(f" {lo_str}–{hi_str} : {n:6,} {bar}")
    w("")

    # Top N high entropy domains
    w(f" TOP {top_n} DOMAINS BY ENTROPY SCORE")
    w(" (unclustered noise only — whitelisted domains already excluded)")
    w(" " + "-" * 50)
    w(f" {'ENTROPY':>8} DOMAIN")
    w(" " + "-" * 50)
    for _, row in noise_df.head(top_n).iterrows():
        flag = " ◄ HIGH" if row['label_entropy'] >= THRESH_HIGH_ENTROPY else ""
        w(f" {row['label_entropy']:>8.3f} {row['query']}{flag}")
    w("")
    w("=" * 72)
    w(" END OF REPORT")
    w("=" * 72)

    report_text = "\n".join(lines)
    # The report contains non-ASCII glyphs (→ ≥ █ ◄ ∞); write UTF-8
    # explicitly so a non-UTF-8 default encoding cannot make the write fail.
    out_path.write_text(report_text, encoding="utf-8")
    print(report_text)
    print(f"\n[+] Report saved → {out_path}")
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
# ---------------------------------------------------------------------------
|
|
458
|
+
# Main
|
|
459
|
+
# ---------------------------------------------------------------------------
|
|
460
|
+
|
|
461
|
+
def main():
    """Command-line entry point: load, cluster, score noise, write outputs.

    Writes three timestamped files into ``--out-dir``: a CSV of the noise
    domains with entropy scores, a PNG plot, and a text report.  Exits with
    status 1 if the log path is missing or a directory, or if no records
    survive filtering.
    """
    parser = argparse.ArgumentParser(
        description="DNS cluster threat hunt — Zeek dns.log → cluster report"
    )
    parser.add_argument("log", type=Path, help="Path to Zeek dns.log (ndjson)")
    parser.add_argument("--top", type=int, default=250,
                        help="Top N entropy domains in report (default: 250)")
    parser.add_argument("--min-size", type=int, default=MIN_CLUSTER_SIZE,
                        help=f"HDBSCAN min_cluster_size (default: {MIN_CLUSTER_SIZE})")
    parser.add_argument("--min-samples", type=int, default=MIN_SAMPLES,
                        help=f"HDBSCAN min_samples (default: {MIN_SAMPLES})")
    parser.add_argument("--out-dir", type=Path, default=Path("hunt_output"),
                        help="Output directory (default: ./hunt_output/)")
    args = parser.parse_args()

    # A directory passes exists() but cannot be opened as a log file.
    if not args.log.exists() or args.log.is_dir():
        print(f"[!] Log file not found: {args.log}", file=sys.stderr)
        sys.exit(1)

    args.out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # --- Run pipeline
    df_features, qs, stats = load_and_prepare(args.log)

    if df_features.empty:
        # Nothing to cluster; stop with a clear message instead of failing
        # inside the clustering library.
        print("[!] No DNS records left after qclass filter + whitelist; "
              "nothing to cluster.", file=sys.stderr)
        sys.exit(1)

    labels = run_clustering(df_features, args.min_size, args.min_samples)

    # Build noise DataFrame: the unique query names labelled -1
    noise_mask = labels == -1
    noise_queries = np.unique(qs[noise_mask].to_numpy())
    noise_df = pd.DataFrame({'query': noise_queries})

    # Suppress remaining infra noise first so entropy is only computed for
    # domains that will actually be reported.
    is_infra = noise_df['query'].str.contains(
        INFRA_SUPPRESS, case=False, regex=True
    ).astype(bool)
    noise_df = noise_df[~is_infra].copy()

    # Score the left-most label of each domain; astype keeps the column
    # numeric even when the frame is empty.
    noise_df['label_entropy'] = noise_df['query'].apply(
        lambda q: entropy(q.split('.')[0])
    ).astype(float)

    # Stable sort: domains with equal scores keep their alphabetical order
    # from np.unique, so repeated runs list them identically.
    noise_df = noise_df.sort_values(
        'label_entropy', ascending=False, kind='mergesort'
    ).reset_index(drop=True)

    n_clusters = len(set(labels) - {-1})
    n_noise = int(noise_mask.sum())
    noise_pct = n_noise / len(labels) * 100 if len(labels) else 0.0
    print(f" {n_clusters} clusters | {n_noise:,} noise records "
          f"({noise_pct:.1f}%) | "
          f"{len(noise_df):,} unique noise domains")

    # --- Write outputs
    csv_path = args.out_dir / f"dns_domains_{stamp}.csv"
    report_path = args.out_dir / f"dns_report_{stamp}.txt"
    plot_path = args.out_dir / f"dns_plot_{stamp}.png"

    noise_df.to_csv(csv_path, index=False)
    print(f"[+] CSV saved → {csv_path}")

    make_plot(qs, labels, noise_df, stats, plot_path)
    write_report(qs, labels, noise_df, stats, args.log, args.top, report_path)


if __name__ == "__main__":
    main()
|