loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
migrations/conn_scan.py
ADDED
|
@@ -0,0 +1,1097 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
lh-scan — Port Scan Detector
|
|
4
|
+
Part of the loghunter suite.
|
|
5
|
+
|
|
6
|
+
Detects port scanning activity from Zeek conn.log data.
|
|
7
|
+
|
|
8
|
+
Scan types detected:
|
|
9
|
+
vertical one source → many ports on one target host
|
|
10
|
+
horizontal one source → same port across many hosts
|
|
11
|
+
block one source → many ports AND many hosts
|
|
12
|
+
slow activity spread across time windows to evade per-window thresholds
|
|
13
|
+
|
|
14
|
+
Usage:
|
|
15
|
+
lh-scan conn.log
|
|
16
|
+
lh-scan /path/to/logs/conn.*.log.gz
|
|
17
|
+
lh-scan conn.log --output scan_results/
|
|
18
|
+
lh-scan conn.log --format json
|
|
19
|
+
lh-scan conn.log --min-severity MEDIUM
|
|
20
|
+
lh-scan conn.log --vertical-threshold 20 --horizontal-threshold 20
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import argparse
|
|
24
|
+
import gzip
|
|
25
|
+
import glob
|
|
26
|
+
import json
|
|
27
|
+
import os
|
|
28
|
+
import sys
|
|
29
|
+
import warnings
|
|
30
|
+
from datetime import datetime, timezone
|
|
31
|
+
from pathlib import Path
|
|
32
|
+
|
|
33
|
+
import numpy as np
|
|
34
|
+
import pandas as pd
|
|
35
|
+
from scipy.stats import entropy as scipy_entropy
|
|
36
|
+
from tqdm import tqdm
|
|
37
|
+
|
|
38
|
+
# Suppress every Python warning for the lifetime of the process.
warnings.filterwarnings('ignore')

# ── Version ───────────────────────────────────────────────────────────────────
VERSION = '1.0.0'

# ── Defaults ──────────────────────────────────────────────────────────────────
# Distinct-port / distinct-host counts at which each detector fires.
DEFAULT_VERTICAL_PORT_THRESHOLD = 15
DEFAULT_HORIZONTAL_HOST_THRESHOLD = 15
DEFAULT_BLOCK_PORT_THRESHOLD = 20
DEFAULT_BLOCK_HOST_THRESHOLD = 20

# Minimum share of connections that must be in a scan-indicative state.
DEFAULT_BLOCK_SCAN_STATE_MIN = 0.30
DEFAULT_SLOW_SCAN_STATE_MIN = 0.30

# Window lengths, in seconds.
DEFAULT_FAST_WINDOW_SECS = 60
DEFAULT_SLOW_WINDOW_SECS = 3600

DEFAULT_MIN_CONNECTIONS = 3
DEFAULT_SLOW_MIN_PORTS = 8
DEFAULT_SLOW_MIN_BUCKETS = 4

# Zeek conn_state values counted towards a source's scan_state_ratio.
SCAN_STATES = {
    'S0', 'REJ', 'RSTO', 'RSTR', 'SH', 'OTH',
}

BITTORRENT_PORTS_PEER = {
    6881, 6882, 6883, 6884, 6885, 6886, 6887, 6888, 6889,
    51413, 51414,
}
BITTORRENT_PORTS_TRACKER = {6969, 2710}

# IoT/smart device discovery ports — multicast/broadcast traffic on these
# structurally yields high S0/OTH rates without being a scan.
IOT_DISCOVERY_PORTS = {
    5353,  # mDNS
    1900,  # SSDP/UPnP
    5355,  # LLMNR
    137,   # NetBIOS Name Service
    138,   # NetBIOS Datagram
}

# Destination address prefixes treated as IoT multicast/broadcast; connections
# to these are never considered scans.
IOT_MULTICAST_PREFIXES = (
    '224.',
    '239.',
    '255.255.255.255',
    'ff0',
    'ff1',
    'ff2',
)

DARK_PORTS = {0, 1, 2, 3, 4, 6, 8}

# conn.log keys every record must carry, and extra keys copied when present.
REQUIRED_FIELDS = {
    'ts', 'id.orig_h', 'id.resp_h', 'id.resp_p', 'proto', 'conn_state',
}
OPTIONAL_FIELDS = {
    'orig_bytes', 'resp_bytes', 'duration', 'orig_pkts', 'resp_pkts',
}

# Human-readable label for each scan_type value.
SCAN_TYPE_DESCRIPTIONS = {
    'vertical': 'Port scan (one host, many ports)',
    'horizontal': 'Network sweep (many hosts, one port)',
    'block': 'Block scan (many hosts AND many ports)',
}

# Short explanation of each Zeek conn_state code.
STATE_EXPLANATIONS = {
    'S0': 'SYN sent, no response (filtered/firewalled)',
    'REJ': 'Port closed (RST received)',
    'RSTO': 'Connection reset by originator',
    'RSTR': 'Connection reset by responder',
    'SF': 'Normal established+closed connection',
    'SH': 'Half-open scan (SYN+FIN)',
    'OTH': 'No SYN observed',
}
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
98
|
+
# Data loading
|
|
99
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
100
|
+
|
|
101
|
+
def open_log(path: str):
    """Open a plain or gzip-compressed log file for reading as text.

    Args:
        path: Location of the log, as a ``str`` or any ``os.PathLike``
            object. A name ending in ``.gz`` (in any letter case) is read
            through ``gzip``; anything else is opened directly.

    Returns:
        A text-mode file object decoding UTF-8, with undecodable bytes
        replaced rather than raising. The caller is responsible for closing
        it (it works as a context manager).

    Raises:
        OSError: If the file cannot be opened.
    """
    # Normalise first so pathlib.Path arguments work like plain strings.
    name = os.fspath(path)
    if name.lower().endswith('.gz'):
        return gzip.open(name, 'rt', encoding='utf-8', errors='replace')
    return open(name, 'r', encoding='utf-8', errors='replace')
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def load_conn_log(pattern: str, verbose: bool = False) -> tuple[pd.DataFrame, int]:
    """
    Load one or more JSON-formatted Zeek conn.log files into a DataFrame.

    Plain and gzipped files are both handled (via open_log). Each file is
    parsed line-by-line with json.loads — avoids ujson issues with Zeek output.

    Args:
        pattern: A single path, or a glob pattern containing ``*`` or ``?``.
        verbose: When True, show a per-file tqdm progress bar.

    Returns:
        ``(df, skipped)`` where ``df`` holds the REQUIRED_FIELDS and
        OPTIONAL_FIELDS of every accepted record (with ``id.orig_h``,
        ``id.resp_h`` and ``id.resp_p`` renamed to ``src_ip``, ``dst_ip`` and
        ``dst_port``), sorted by ``ts``; ``skipped`` counts the lines that
        were rejected.

    Raises:
        FileNotFoundError: If a glob pattern matches no files.
        ValueError: If no line in any file yields a usable record.
    """
    if '*' in pattern or '?' in pattern:
        paths = sorted(glob.glob(pattern))
    else:
        paths = [pattern]
    if not paths:
        raise FileNotFoundError(f"No files matched: {pattern}")

    rows = []
    skipped = 0

    for path in paths:
        with open_log(path) as fh:
            for line in tqdm(fh, desc=f" {Path(path).name}", unit=" lines",
                             leave=False, disable=not verbose):
                line = line.strip()
                # Blank lines and '#' header/comment lines are ignored, not counted.
                if not line or line.startswith('#'):
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    skipped += 1
                    continue
                # A line can be valid JSON without being an object (a bare
                # number, string or list); such a line is not a record.
                if not isinstance(rec, dict) or not REQUIRED_FIELDS.issubset(rec):
                    skipped += 1
                    continue
                row = {f: rec[f] for f in REQUIRED_FIELDS}
                for f in OPTIONAL_FIELDS:
                    row[f] = rec.get(f)
                rows.append(row)

    if not rows:
        raise ValueError(f"No valid conn.log records found in: {pattern}")

    df = pd.DataFrame(rows)
    df.rename(columns={
        'id.orig_h': 'src_ip',
        'id.resp_h': 'dst_ip',
        'id.resp_p': 'dst_port',
    }, inplace=True)
    # ts is interpreted as epoch seconds and made timezone-aware (UTC).
    df['ts'] = pd.to_datetime(df['ts'], unit='s', utc=True)
    # Nullable Int32 so that unparseable ports become <NA> instead of floats.
    df['dst_port'] = pd.to_numeric(df['dst_port'], errors='coerce').astype('Int32')
    df.sort_values('ts', inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df, skipped
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
158
|
+
# Pre-filtering
|
|
159
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
160
|
+
|
|
161
|
+
def ip_in_nets(ip: str, nets: list) -> bool:
    """Return True if ip falls within any of the given CIDR strings.

    Args:
        ip: Textual IPv4 or IPv6 address.
        nets: CIDR strings; host bits may be set (parsed with strict=False).

    Returns:
        True when the address lies inside at least one parseable network.
        An unparseable ``ip`` yields False. An unparseable entry in ``nets``
        is skipped on its own, so it cannot hide a match in a later entry.
    """
    import ipaddress

    try:
        addr = ipaddress.ip_address(ip)
    except ValueError:
        return False

    for net in nets:
        try:
            network = ipaddress.ip_network(net, strict=False)
        except ValueError:
            # Malformed CIDR: ignore this entry, keep checking the rest.
            continue
        # Membership is simply False when the IP versions differ.
        if addr in network:
            return True
    return False
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def build_internal_mask(series: pd.Series, nets: list) -> pd.Series:
    """Classify every address in *series* as inside or outside *nets*.

    Each element is passed to ip_in_nets together with *nets*; the result is
    a Series aligned with *series* holding the per-element outcome.
    """
    def _is_internal(address):
        return ip_in_nets(address, nets)

    return series.map(_is_internal)
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def classify_direction(src_int: bool, dst_int: bool) -> str:
    """Name the traffic direction from the two "is internal" flags.

    Args:
        src_int: Truthy when the source address is internal.
        dst_int: Truthy when the destination address is internal.

    Returns:
        One of 'internal→internal', 'internal→external',
        'external→internal' or 'external→external'.
    """
    labels = {
        (True, True): 'internal→internal',
        (True, False): 'internal→external',
        (False, True): 'external→internal',
        (False, False): 'external→external',
    }
    return labels[(bool(src_int), bool(dst_int))]
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def prefilter(df_raw: pd.DataFrame, args) -> pd.DataFrame:
    """
    Apply pre-filters to remove traffic that produces structural false positives:
    - ICMP: Zeek encodes type/code in port fields — not real port numbers
    - IPv6 link-local (addresses starting with 'fe80:'): neighbor discovery, not scanning
    - IoT multicast/broadcast destinations: mDNS, SSDP, etc.
    - Allowlisted source IPs
    Then classify direction (internal/external) using home_nets.

    Args:
        df_raw: Connections with at least the columns proto, src_ip and dst_ip.
            It is not modified.
        args: Namespace providing ``allowlist_ips``, ``home_nets`` and ``verbose``.

    Returns:
        A new DataFrame containing the surviving rows plus a ``direction``
        column. It may be empty, but always keeps the input columns.
    """
    n_raw = len(df_raw)

    # ICMP — port field semantics are different, not suitable for scan detection
    icmp_mask = df_raw['proto'] == 'icmp'
    n_icmp = icmp_mask.sum()
    df = df_raw[~icmp_mask]

    # IPv6 link-local. na=False keeps the mask strictly boolean when an
    # address is missing, so it can be combined and inverted safely.
    ipv6_ll_mask = (df['src_ip'].str.startswith('fe80:', na=False) |
                    df['dst_ip'].str.startswith('fe80:', na=False))
    n_ipv6 = ipv6_ll_mask.sum()
    df = df[~ipv6_ll_mask]

    # IoT multicast/broadcast destinations. The explicit bool cast matters:
    # mapping an empty column yields an object-dtype Series, and indexing a
    # frame with that would be treated as a column selection, not a row mask.
    prefixes = tuple(IOT_MULTICAST_PREFIXES)
    multicast_mask = df['dst_ip'].map(
        lambda ip: isinstance(ip, str) and ip.startswith(prefixes)
    ).astype(bool)
    n_multicast = multicast_mask.sum()
    df = df[~multicast_mask]

    # Allowlisted source IPs
    n_allowlist = 0
    if args.allowlist_ips:
        al_mask = df['src_ip'].isin(args.allowlist_ips)
        n_allowlist = al_mask.sum()
        df = df[~al_mask]

    # One explicit copy, so adding a column never touches the caller's frame.
    df = df.copy()

    # Direction classification
    home_nets = args.home_nets or []
    src_int = build_internal_mask(df['src_ip'], home_nets)
    dst_int = build_internal_mask(df['dst_ip'], home_nets)
    df['direction'] = [classify_direction(si, di)
                       for si, di in zip(src_int, dst_int)]

    if args.verbose:
        print(" Pre-filter summary:")
        print(f" Raw rows : {n_raw:,}")
        print(f" ICMP excluded : {n_icmp:,}")
        print(f" IPv6 LL excl. : {n_ipv6:,}")
        print(f" Multicast excl: {n_multicast:,}")
        if n_allowlist:
            print(f" Allowlist excl: {n_allowlist:,}")
        print(f" Working rows : {len(df):,}")

    return df
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
244
|
+
# Detectors
|
|
245
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
246
|
+
|
|
247
|
+
def detect_vertical_scans(df: pd.DataFrame, args) -> pd.DataFrame:
    """
    Vertical scan: one src → many distinct ports on one dst.

    Two-pass: a global groupby keeps only (src_ip, dst_ip) pairs whose total
    distinct-port count reaches the threshold; a sliding time window is then
    run over those candidates only.

    Args:
        df: Pre-filtered connections with columns ts, src_ip, dst_ip,
            dst_port, conn_state and direction.
        args: Namespace providing ``vertical_threshold`` (minimum distinct
            ports inside one window), ``slow_window`` (window length in
            seconds) and ``verbose``.

    Returns:
        One row per flagged (src_ip, dst_ip) pair, or an empty DataFrame when
        nothing qualifies. ``distinct_ports`` and ``window_start`` describe
        the best window; ``total_conns``, ``scan_state_ratio``, ``top_states``
        and ``port_range_entropy`` are computed over all of the pair's rows.
    """
    threshold = args.vertical_threshold
    window_secs = args.slow_window

    # Pass 1
    global_counts = (
        df.groupby(['src_ip', 'dst_ip'])['dst_port']
        .nunique()
        .reset_index(name='global_distinct_ports')
    )
    candidates = global_counts[global_counts['global_distinct_ports'] >= threshold]

    if args.verbose:
        print(f" Vertical Pass 1: {len(candidates)} candidate pairs "
              f"(of {len(global_counts):,} total)")

    if candidates.empty:
        return pd.DataFrame()

    # Pass 2: merge instead of apply() for scalability
    cand_keys = candidates[['src_ip', 'dst_ip']]
    df_cands = df.merge(cand_keys, on=['src_ip', 'dst_ip'])

    results = []
    for (src, dst), grp in df_cands.groupby(['src_ip', 'dst_ip']):
        grp = grp.sort_values('ts')
        ts_arr = grp['ts'].values.astype('int64') / 1e9  # ns → epoch seconds
        port_arr = grp['dst_port'].values
        state_arr = grp['conn_state'].values

        # pd.isna recognises None, NaN and pd.NA alike, so a missing port is
        # never counted as a distinct port of its own.
        missing = np.asarray(pd.isna(port_arr), dtype=bool)

        port_counts = {}  # port → occurrences inside the current window
        max_ports_in_window = 0
        best_window_start = ts_arr[0]
        left = 0

        for right, ts_right in enumerate(ts_arr):
            if not missing[right]:
                p = port_arr[right]
                port_counts[p] = port_counts.get(p, 0) + 1
            # Drop rows from the left until the window spans <= window_secs.
            while ts_right - ts_arr[left] > window_secs:
                if not missing[left]:
                    lp = port_arr[left]
                    port_counts[lp] -= 1
                    if port_counts[lp] == 0:
                        del port_counts[lp]
                left += 1
            n = len(port_counts)
            if n > max_ports_in_window:
                max_ports_in_window = n
                best_window_start = ts_arr[left]

        if max_ports_in_window < threshold:
            continue

        state_counts = pd.Series(state_arr).value_counts()
        total_conns = len(state_arr)
        scan_state_count = sum(state_counts.get(s, 0) for s in SCAN_STATES)
        scan_state_ratio = scan_state_count / total_conns

        port_series = pd.Series(port_arr).dropna()
        # include_lowest puts port 0 in the first bucket, matching the
        # "port <= 1023" well-known class used by the horizontal detector.
        port_buckets = pd.cut(port_series, bins=[0, 1023, 49151, 65535],
                              labels=['well-known', 'registered', 'ephemeral'],
                              include_lowest=True)
        # +1 smoothing keeps empty buckets from zeroing out the distribution.
        port_range_entropy = scipy_entropy(port_buckets.value_counts().values + 1)

        results.append({
            'scan_type': 'vertical',
            'src_ip': src,
            'dst_ip': dst,
            'dst_port': None,
            'port_class': None,
            'distinct_ports': max_ports_in_window,
            'distinct_hosts': 1,
            'total_conns': total_conns,
            'scan_state_ratio': round(scan_state_ratio, 3),
            'top_states': ', '.join(state_counts.head(3).index.tolist()),
            'port_range_entropy': round(port_range_entropy, 3),
            'window_start': datetime.fromtimestamp(
                best_window_start, tz=timezone.utc
            ).strftime('%Y-%m-%d %H:%M:%S'),
            'window_secs': window_secs,
            'direction': grp['direction'].iloc[0],
        })

    return pd.DataFrame(results)
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def detect_horizontal_scans(df: pd.DataFrame, args) -> pd.DataFrame:
    """
    Horizontal scan: one src → same port across many distinct hosts.

    Works in two passes. First, every (src_ip, dst_port) pair is counted
    globally and only pairs reaching ``args.horizontal_threshold`` distinct
    hosts are kept. Second, a sliding window of ``args.slow_window`` seconds
    is run over each surviving pair to find the largest number of distinct
    hosts contacted inside any one window.

    Returns one row per pair whose best window reaches the threshold, or an
    empty DataFrame.
    """
    min_hosts = args.horizontal_threshold
    span = args.slow_window

    # Rows without a usable port cannot take part in a per-port sweep.
    with_port = df[df['dst_port'].notna()].copy()

    # Pass 1
    per_pair = (
        with_port.groupby(['src_ip', 'dst_port'])['dst_ip']
        .nunique()
        .reset_index(name='global_distinct_hosts')
    )
    shortlisted = per_pair[per_pair['global_distinct_hosts'] >= min_hosts]

    if args.verbose:
        print(f" Horizontal Pass 1: {len(shortlisted)} candidate pairs "
              f"(of {len(per_pair):,} total)")

    if shortlisted.empty:
        return pd.DataFrame()

    # Pass 2: restrict the data to the shortlisted pairs with a merge.
    scoped = with_port.merge(shortlisted[['src_ip', 'dst_port']],
                             on=['src_ip', 'dst_port'])

    found = []
    for (src, port), conns in scoped.groupby(['src_ip', 'dst_port']):
        conns = conns.sort_values('ts')
        epoch = conns['ts'].values.astype('int64') / 1e9
        targets = conns['dst_ip'].values
        states = conns['conn_state'].values

        in_window = {}  # host → occurrences inside the current window
        peak = 0
        peak_start = epoch[0]
        tail = 0

        for head, now in enumerate(epoch):
            target = targets[head]
            if target is not None:
                in_window[target] = in_window.get(target, 0) + 1
            # Advance the tail until the window is no longer than `span`.
            while now - epoch[tail] > span:
                leaving = targets[tail]
                if leaving is not None:
                    in_window[leaving] -= 1
                    if in_window[leaving] == 0:
                        del in_window[leaving]
                tail += 1
            width = len(in_window)
            if width > peak:
                peak = width
                peak_start = epoch[tail]

        if peak < min_hosts:
            continue

        state_counts = pd.Series(states).value_counts()
        total_conns = len(states)
        scan_hits = sum(state_counts.get(s, 0) for s in SCAN_STATES)
        scan_state_ratio = scan_hits / total_conns
        # Peak host count over the pair's whole observed span (at least 1 s).
        velocity = peak / max(epoch[-1] - epoch[0], 1)

        port = int(port)
        port_class = (
            'well-known' if port <= 1023
            else 'registered' if port <= 49151
            else 'ephemeral'
        )

        started = datetime.fromtimestamp(peak_start, tz=timezone.utc)
        found.append({
            'scan_type': 'horizontal',
            'src_ip': src,
            'dst_ip': None,
            'dst_port': port,
            'port_class': port_class,
            'distinct_ports': 1,
            'distinct_hosts': peak,
            'total_conns': total_conns,
            'scan_state_ratio': round(scan_state_ratio, 3),
            'top_states': ', '.join(state_counts.head(3).index.tolist()),
            'velocity_hosts_per_sec': round(velocity, 4),
            'window_start': started.strftime('%Y-%m-%d %H:%M:%S'),
            'window_secs': span,
            'direction': conns['direction'].iloc[0],
        })

    return pd.DataFrame(found)
|
|
430
|
+
|
|
431
|
+
|
|
432
|
+
def detect_block_scans(df: pd.DataFrame, args) -> pd.DataFrame:
    """
    Block scan: one src → many ports AND many hosts.

    Sources are first screened on their global distinct-port count, distinct-
    host count and scan-state ratio. The survivors are then re-evaluated per
    fixed time bucket of ``args.slow_window`` seconds, and for each source the
    qualifying bucket with the largest ports × hosts product is reported.

    The scan-state ratio (``args.block_state_min``) is a hard gate — without
    it any active workstation fires.

    Returns at most one row per source, or an empty DataFrame.
    """
    min_ports = args.block_port_threshold
    min_hosts = args.block_host_threshold
    min_ratio = args.block_state_min
    window_secs = args.slow_window

    df_w = df[df['dst_port'].notna()].copy()
    epoch_secs = df_w['ts'].values.astype('int64') / 1e9
    df_w['time_bucket'] = (epoch_secs // window_secs).astype(int)
    df_w['is_scan_state'] = df_w['conn_state'].isin(SCAN_STATES)

    # Pass 1: global filter
    per_src = df_w.groupby('src_ip').agg(
        global_distinct_ports=('dst_port', 'nunique'),
        global_distinct_hosts=('dst_ip', 'nunique'),
        scan_state_ratio=('is_scan_state', 'mean'),
    ).reset_index()

    enough_ports = per_src['global_distinct_ports'] >= min_ports
    enough_hosts = per_src['global_distinct_hosts'] >= min_hosts
    scan_like = per_src['scan_state_ratio'] >= min_ratio
    suspects = per_src[enough_ports & enough_hosts & scan_like]

    if args.verbose:
        print(f" Block Pass 1: {len(suspects)} candidate src IPs "
              f"(of {per_src['src_ip'].nunique():,} total)")

    if suspects.empty:
        return pd.DataFrame()

    # Pass 2: per-bucket statistics for the suspects only.
    suspect_rows = df_w[df_w['src_ip'].isin(suspects['src_ip'])]
    per_bucket = suspect_rows.groupby(['src_ip', 'time_bucket']).agg(
        distinct_ports=('dst_port', 'nunique'),
        distinct_hosts=('dst_ip', 'nunique'),
        total_conns=('dst_port', 'count'),
        scan_state_ratio=('is_scan_state', 'mean'),
        top_states=('conn_state',
                    lambda x: ', '.join(x.value_counts().head(3).index.tolist())),
        direction=('direction', 'first'),
        # The three counters below count connections per port range.
        ports_well_known=('dst_port', lambda x: (x <= 1023).sum()),
        ports_registered=('dst_port', lambda x: ((x > 1023) & (x <= 49151)).sum()),
        ports_ephemeral=('dst_port', lambda x: (x > 49151).sum()),
        window_start_ts=('ts', lambda x: x.values.astype('int64').min() / 1e9),
    ).reset_index()

    bucket_ok = (
        (per_bucket['distinct_ports'] >= min_ports)
        & (per_bucket['distinct_hosts'] >= min_hosts)
        & (per_bucket['scan_state_ratio'] >= min_ratio)
    )
    findings = per_bucket[bucket_ok].copy()

    if findings.empty:
        return pd.DataFrame()

    findings['scan_type'] = 'block'
    findings['dst_ip'] = None
    findings['dst_port'] = None
    findings['port_class'] = None
    findings['window_secs'] = window_secs
    findings['window_start'] = findings['window_start_ts'].map(
        lambda ts: datetime.fromtimestamp(ts, tz=timezone.utc)
        .strftime('%Y-%m-%d %H:%M:%S')
    )
    findings['scan_state_ratio'] = findings['scan_state_ratio'].round(3)
    findings['breadth_score'] = findings['distinct_ports'] * findings['distinct_hosts']

    # Keep only the broadest bucket for each source, then drop helper columns.
    ranked = findings.sort_values('breadth_score', ascending=False)
    best = ranked.drop_duplicates(subset=['src_ip'], keep='first')
    best = best.drop(columns=['time_bucket', 'window_start_ts', 'breadth_score'])
    return best.reset_index(drop=True)
|
|
513
|
+
|
|
514
|
+
|
|
515
|
+
def detect_slow_scans(df: pd.DataFrame, args) -> pd.DataFrame:
    """
    Slow scan / temporal spread detector.

    Finds hosts whose port diversity is spread across many time buckets,
    staying below per-window thresholds deliberately.

    temporal_spread_score = total_unique_ports / max_ports_in_any_single_bucket
    Score >> 1 = deliberately spread (slow scan pattern)
    Score ≈ 1 = clustered in time (normal behavior)

    scan_state_ratio gate filters out IoT/mobile devices whose spread comes
    from network attach/detach cycles rather than scanning activity.

    Args:
        df: Pre-filtered connections with columns ts, src_ip, dst_ip,
            dst_port, conn_state and direction.
        args: Namespace providing ``slow_min_ports``, ``slow_min_buckets``,
            ``slow_state_min``, ``slow_window`` (bucket length in seconds)
            and ``vertical_threshold``.

    Returns:
        One row per flagged source, sorted by ``temporal_spread_score``
        descending, or an empty DataFrame. Each row carries a ``pattern_tag``
        ('iot_discovery', 'slow_scan' or 'slow_scan_candidate') and matching
        ``pattern_notes``.
    """
    min_ports = args.slow_min_ports
    min_buckets = args.slow_min_buckets
    state_min = args.slow_state_min
    bucket_secs = args.slow_window
    vert_threshold = args.vertical_threshold

    df_w = df[df['dst_port'].notna()].copy()
    df_w['time_bucket'] = (
        df_w['ts'].values.astype('int64') / 1e9 // bucket_secs
    ).astype(int)

    # IoT pattern recognition helpers
    def is_iot_discovery(grp: pd.DataFrame) -> bool:
        """
        Return True if this src looks like IoT device discovery traffic rather
        than scanning. Both signals must hold:
        - The three most frequent destination ports are all discovery ports
          (IOT_DISCOVERY_PORTS) or 53/443/80
        - Fewer than 10% of the connections go to an external destination
        """
        port_counts = grp['dst_port'].value_counts()
        top_ports = set(port_counts.head(3).index.tolist())
        # If top ports are dominated by discovery ports, likely IoT
        if top_ports.issubset(IOT_DISCOVERY_PORTS | {53, 443, 80}):
            # Direction labels read '<source>→<destination>'. The source side
            # is the same for every row of a per-source group, so only the
            # destination side (the suffix) says where the traffic goes.
            ext_conns = (~grp['direction'].str.endswith('internal')).sum()
            if ext_conns / len(grp) < 0.1:
                return True
        return False

    results = []

    for src, grp in df_w.groupby('src_ip'):
        n_buckets = grp['time_bucket'].nunique()
        if n_buckets < min_buckets:
            continue

        total_unique_ports = grp['dst_port'].nunique()
        if total_unique_ports < min_ports:
            continue

        max_ports_in_bucket = grp.groupby('time_bucket')['dst_port'].nunique().max()

        # Already caught by vertical detector — skip
        if max_ports_in_bucket >= vert_threshold:
            continue

        spread_score = round(total_unique_ports / max(max_ports_in_bucket, 1), 2)
        state_counts = grp['conn_state'].value_counts()
        scan_state_ratio = sum(state_counts.get(s, 0) for s in SCAN_STATES) / len(grp)

        # State ratio gate — filters IoT/mobile network attach/detach patterns
        if scan_state_ratio < state_min:
            continue

        # Pattern tag for slow scan findings, most specific first.
        if is_iot_discovery(grp):
            pattern_tag = 'iot_discovery'
            pattern_notes = (
                "Traffic pattern consistent with IoT device discovery (mDNS/SSDP/UPnP). "
                "High temporal spread from repeated network attach/detach cycles rather "
                "than deliberate scanning. Add to iot_devices in loghunter.conf to suppress."
            )
        elif scan_state_ratio >= 0.60:
            pattern_tag = 'slow_scan'
            pattern_notes = (
                f"Temporal spread score {spread_score:.2f} with {scan_state_ratio:.1%} "
                f"scan-indicative states across {n_buckets} time windows. "
                "Activity deliberately paced below per-window detection threshold. "
                "Strong slow scan signature."
            )
        else:
            pattern_tag = 'slow_scan_candidate'
            pattern_notes = (
                f"Temporal spread score {spread_score:.2f} with {scan_state_ratio:.1%} "
                f"scan-indicative states across {n_buckets} time windows. "
                "Moderate confidence — review destination IPs and ports."
            )

        results.append({
            'scan_type': 'slow',
            'src_ip': src,
            'dst_ip': None,
            'dst_port': None,
            'port_class': None,
            'distinct_ports': total_unique_ports,
            'distinct_hosts': grp['dst_ip'].nunique(),
            'max_ports_in_bucket': int(max_ports_in_bucket),
            'active_buckets': n_buckets,
            'temporal_spread_score': spread_score,
            'total_conns': len(grp),
            'scan_state_ratio': round(scan_state_ratio, 3),
            'top_states': ', '.join(state_counts.head(3).index.tolist()),
            'window_start': grp['ts'].min().strftime('%Y-%m-%d %H:%M:%S'),
            'window_secs': bucket_secs,
            'direction': grp['direction'].iloc[0],
            'pattern_tag': pattern_tag,
            'pattern_notes': pattern_notes,
        })

    if not results:
        return pd.DataFrame()

    return (
        pd.DataFrame(results)
        .sort_values('temporal_spread_score', ascending=False)
        .reset_index(drop=True)
    )
|
|
640
|
+
|
|
641
|
+
|
|
642
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
643
|
+
# Fingerprinting
|
|
644
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
645
|
+
|
|
646
|
+
def conn_state_fingerprints(df: pd.DataFrame, scanner_ips: list) -> dict:
    """
    Profile the connection-state mix of each candidate scanner source.

    For every address in ``scanner_ips`` that has at least one row in ``df``
    (matched on the ``src_ip`` column) the result holds a dict with:

    * ``total_connections``  - number of rows for that source
    * ``state_distribution`` - ``conn_state`` -> share of rows, rounded to 3 dp
    * ``scan_state_score``   - summed share of the states listed in SCAN_STATES
    * ``dominant_state``     - most frequent ``conn_state`` (None if there is none)

    Addresses with no rows in ``df`` are left out of the result.
    """
    profiles = {}
    for candidate in scanner_ips:
        rows = df[df['src_ip'] == candidate]
        n_rows = len(rows)
        if n_rows == 0:
            continue

        # value_counts() sorts by frequency, so the first index entry is the
        # dominant state.
        shares = rows['conn_state'].value_counts(normalize=True).round(3)
        scan_share = sum(shares.get(state, 0) for state in SCAN_STATES)
        dominant = shares.index[0] if len(shares) > 0 else None

        profiles[candidate] = {
            'total_connections': n_rows,
            'state_distribution': shares.to_dict(),
            'scan_state_score': round(scan_share, 3),
            'dominant_state': dominant,
        }
    return profiles
|
|
662
|
+
|
|
663
|
+
|
|
664
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
665
|
+
# Pattern classification
|
|
666
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
667
|
+
|
|
668
|
+
def classify_finding(row) -> tuple[str, str]:
    """
    Returns (pattern_tag, explanation) for a finding.

    ``row`` is one finding (a DataFrame row or any mapping with ``.get``)
    carrying at least ``scan_type`` and ``scan_state_ratio``, and optionally
    ``dst_port``, ``distinct_hosts``, ``distinct_ports``, ``pattern_tag`` and
    ``pattern_notes``.

    Rules are evaluated from most specific to least specific and the first
    match wins, so their order matters. Findings that match no rule are
    returned as ``('unknown', '')``.
    """
    def _count(key):
        # NaN is truthy, so a bare `value or 0` lets it through. Concatenated
        # detector frames carry NaN wherever a detector did not emit the
        # column, which would otherwise surface as "hosts: nan" in the notes.
        value = row.get(key) or 0
        return 0 if pd.isna(value) else int(value)

    port = row.get('dst_port')
    ratio = row['scan_state_ratio']
    hosts = _count('distinct_hosts')
    ports = _count('distinct_ports')
    scan_type = row['scan_type']

    # Slow scan findings are pre-tagged by detect_slow_scans()
    if scan_type == 'slow':
        return (row.get('pattern_tag', 'slow_scan_candidate'),
                row.get('pattern_notes', ''))

    # ── IoT discovery ports ──
    if port in IOT_DISCOVERY_PORTS and ratio < 0.40:
        return ('iot_discovery',
                f"Port {port} is an IoT/device discovery port (mDNS/SSDP/UPnP/NetBIOS). "
                f"High host counts on this port are normal for device discovery protocols. "
                f"Not a port scan. Add source to iot_devices in loghunter.conf to suppress.")

    # ── BitTorrent peer ports ──
    if port in BITTORRENT_PORTS_PEER and ratio >= 0.50:
        return ('bittorrent',
                f"BitTorrent peer connections on port {port} — {hosts} peers contacted, "
                f"{ratio:.1%} failed connections (normal for BT peer discovery). "
                f"If this host shouldn't run BitTorrent, investigate.")

    # ── BitTorrent tracker ports ──
    if port in BITTORRENT_PORTS_TRACKER and ratio >= 0.15:
        return ('bittorrent',
                f"BitTorrent tracker traffic on port {port} — {hosts} trackers contacted, "
                f"{ratio:.1%} failed connections (normal for tracker announce/scrape). "
                f"If this host shouldn't run BitTorrent, investigate.")

    # ── DNS recursive resolution ──
    if port == 53 and ratio < 0.05 and hosts >= 15:
        return ('dns_resolver',
                f"DNS recursive resolution — {hosts} external resolvers on port 53, "
                f"{ratio:.1%} failed. This is a DNS server or resolver, not a scanner. "
                f"Add to dns_servers in loghunter.conf to suppress.")

    # ── Normal HTTPS browsing / cloud services ──
    if port == 443 and ratio < 0.10 and hosts >= 15:
        return ('https_browsing',
                f"HTTPS to {hosts} external hosts, {ratio:.1%} failed — consistent with "
                f"normal web browsing or cloud service traffic. "
                f"Add to workstations or servers in loghunter.conf to suppress.")

    # ── Normal HTTP browsing ──
    if port == 80 and ratio < 0.10 and hosts >= 15:
        return ('http_browsing',
                f"HTTP to {hosts} external hosts, {ratio:.1%} failed — consistent with "
                f"normal web traffic.")

    # ── Streaming device / DNS-blocked HTTPS ──
    if port == 443 and 0.10 <= ratio < 0.50 and hosts >= 20:
        return ('streaming_blocked',
                f"{hosts} HTTPS destinations, {ratio:.1%} failed. On a media/streaming "
                f"device this pattern is consistent with DNS-level blocking (Pi-hole, "
                f"NextDNS) causing direct connection fallback attempts. "
                f"Add to media_devices in loghunter.conf to suppress.")

    # ── Dark / unassigned ports ──
    if port in DARK_PORTS and ratio >= 0.90:
        return ('dark_traffic',
                f"Port {port} is unassigned/reserved — likely a Zeek encoding artifact "
                f"(e.g. ICMP type/code) or internet background radiation. "
                f"Check proto field in conn.log.")

    # ── Strong scanner signature ──
    if scan_type == 'vertical' and ratio >= 0.60 and ports >= 1000:
        return ('confirmed_scan',
                f"Full port range scan — {ports} distinct ports on single target "
                f"with {ratio:.1%} scan-indicative states. Strong scanner signature.")

    if ratio >= 0.60:
        return ('confirmed_scan',
                f"{ratio:.1%} scan-indicative states "
                f"({'ports' if scan_type == 'vertical' else 'hosts'}: {max(ports, hosts)}). "
                f"Strong scanner signature.")

    return ('unknown', '')
|
|
754
|
+
|
|
755
|
+
|
|
756
|
+
def severity_label(row) -> str:
    """
    Severity driven by scan_state_ratio as primary signal.

    Breadth (the larger of ``distinct_ports`` and ``distinct_hosts``) is a
    secondary escalator only — not sufficient on its own.
    The benign tags listed below are always LOW regardless of breadth.

    ``row`` is one finding (a DataFrame row or any mapping with ``.get``)
    carrying ``scan_state_ratio`` and ``pattern_tag``, and optionally
    ``scan_type``, ``distinct_ports`` and ``distinct_hosts``.

    Returns 'HIGH', 'MEDIUM' or 'LOW'.
    """
    def _count(key):
        # NaN is truthy, so a bare `value or 0` lets it through, and
        # max(nan, n) is nan — which silently disables the breadth
        # escalators below. Treat missing counts as zero.
        value = row.get(key) or 0
        return 0 if pd.isna(value) else value

    ratio = row['scan_state_ratio']
    breadth = max(_count('distinct_ports'), _count('distinct_hosts'))
    tag = row['pattern_tag']

    # Benign patterns — LOW regardless of breadth
    if tag in ('dns_resolver', 'https_browsing', 'http_browsing',
               'iot_discovery', 'dark_traffic'):
        return 'LOW'

    # Slow scan has its own severity logic: only the stronger 'slow_scan' tag
    # can rise above LOW, and breadth plays no part.
    if row.get('scan_type') == 'slow':
        if tag == 'slow_scan':
            return 'HIGH' if ratio >= 0.60 else 'MEDIUM'
        return 'LOW'

    if ratio >= 0.60:
        return 'HIGH'
    if ratio >= 0.30 and breadth >= 50:
        return 'HIGH'
    if ratio >= 0.20:
        return 'MEDIUM'
    if ratio >= 0.10 and breadth >= 25:
        return 'MEDIUM'
    return 'LOW'
|
|
786
|
+
|
|
787
|
+
|
|
788
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
789
|
+
# Synthesis
|
|
790
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
791
|
+
|
|
792
|
+
def synthesize(detector_outputs: list[pd.DataFrame],
               fingerprints: dict) -> pd.DataFrame:
    """
    Combine all detector outputs, attach fingerprints, classify, assign severity,
    deduplicate across fast/slow windows, and sort by severity.

    ``detector_outputs`` are the per-detector finding frames; empty ones are
    ignored. ``fingerprints`` maps a source IP to a dict that may hold a
    ``scan_state_score``.

    Returns one row per unique (scan_type, src_ip, dst_ip, dst_port) event with
    the added columns ``scan_state_score``, ``pattern_tag``, ``pattern_notes``,
    ``severity`` and ``breadth``, ordered HIGH -> LOW, then by descending
    ``scan_state_ratio`` and ``breadth``. Returns an empty, column-less
    DataFrame when no detector produced findings.
    """
    all_dfs = [d for d in detector_outputs if len(d) > 0]
    if not all_dfs:
        return pd.DataFrame()

    df_all = pd.concat(all_dfs, ignore_index=True)

    # Attach global scan_state_score. Sources without a fingerprint fall back
    # to the scan_state_ratio of their first finding; those fallbacks are
    # collected in a single pass so the per-row lookup stays O(1) instead of
    # re-filtering the whole frame for every row.
    first_ratio = {}
    for ip, ratio in zip(df_all['src_ip'], df_all['scan_state_ratio']):
        first_ratio.setdefault(ip, ratio)

    df_all['scan_state_score'] = df_all['src_ip'].map(
        lambda ip: fingerprints.get(ip, {}).get('scan_state_score',
                                                first_ratio.get(ip))
    )

    # Classify findings that don't already have a tag (slow scan findings are pre-tagged)
    if 'pattern_tag' not in df_all.columns:
        classified = df_all.apply(classify_finding, axis=1)
        df_all['pattern_tag'] = classified.map(lambda x: x[0])
        df_all['pattern_notes'] = classified.map(lambda x: x[1])
    else:
        # Fill in untagged rows (non-slow detectors)
        mask = df_all['pattern_tag'].isna()
        if mask.any():
            classified = df_all[mask].apply(classify_finding, axis=1)
            df_all.loc[mask, 'pattern_tag'] = classified.map(lambda x: x[0]).values
            df_all.loc[mask, 'pattern_notes'] = classified.map(lambda x: x[1]).values

    df_all['severity'] = df_all.apply(severity_label, axis=1)

    # Deduplicate across windows — keep largest breadth per unique event
    df_all['breadth'] = df_all[['distinct_ports', 'distinct_hosts']].fillna(0).max(axis=1)
    df_dedup = (
        df_all
        .sort_values('breadth', ascending=False)
        .drop_duplicates(subset=['scan_type', 'src_ip', 'dst_ip', 'dst_port'], keep='first')
        .reset_index(drop=True)
    )

    # Temporary numeric rank so severities sort HIGH -> MEDIUM -> LOW rather
    # than alphabetically.
    sev_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2}
    df_dedup['_sev_ord'] = df_dedup['severity'].map(sev_order)
    df_dedup = (
        df_dedup
        .sort_values(['_sev_ord', 'scan_state_ratio', 'breadth'],
                     ascending=[True, False, False])
        .drop(columns='_sev_ord')
        .reset_index(drop=True)
    )

    return df_dedup
|
|
848
|
+
|
|
849
|
+
|
|
850
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
851
|
+
# Reporting
|
|
852
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
853
|
+
|
|
854
|
+
def print_report(df_dedup: pd.DataFrame, ts_min, ts_max, n_raw: int,
                 min_severity: str = 'LOW', file=sys.stdout):
    """
    Compact tabular report — one row per finding.

    Writes to ``file``: a per-severity summary, a table of the findings whose
    severity is at or above ``min_severity``, and a footer with the data time
    range (``ts_min`` / ``ts_max``, formatted with ``strftime``) and ``n_raw``,
    the number of connections analysed. An unrecognised ``min_severity`` is
    treated as 'LOW'.

    ``df_dedup`` may be the empty, column-less frame produced when nothing was
    detected; that case prints the "No findings" line instead of raising.

    All pattern analysis, notes, and next-steps logic is preserved in the
    DataFrame (pattern_tag, pattern_notes, scan_state_ratio, etc.) and
    in the JSON/CSV exports. The verbose block format will be re-enabled
    once pattern recognition is fully validated across diverse network types.
    """
    def w(line):
        print(line, file=file)

    sev_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2}
    min_sev_ord = sev_order.get(min_severity, 2)

    if 'severity' in df_dedup.columns:
        report_df = df_dedup[
            df_dedup['severity'].map(sev_order) <= min_sev_ord
        ].copy()
    else:
        # No findings at all: there is no 'severity' column to filter on.
        report_df = df_dedup.iloc[0:0].copy()

    if len(report_df) == 0:
        w(f"No findings at or above {min_severity} severity.")
    else:
        # Summary counts
        counts = report_df['severity'].value_counts()
        w(f"Synthesized findings: {len(report_df)} unique scan events")
        w("severity")
        for sev in ['HIGH', 'MEDIUM', 'LOW']:
            n = counts.get(sev, 0)
            if n:
                w(f"{sev:>8s} {n}")
        w("")

        # Build display columns — keep it to what fits a terminal cleanly
        display_cols = [
            'severity', 'pattern_tag', 'scan_type', 'src_ip', 'dst_ip',
            'dst_port', 'distinct_ports', 'distinct_hosts',
            'scan_state_ratio', 'window_start', 'direction',
        ]
        # Add spread score column for slow scan findings if present
        if 'temporal_spread_score' in report_df.columns:
            has_slow = report_df['scan_type'].eq('slow').any()
            if has_slow:
                display_cols.insert(display_cols.index('scan_state_ratio'),
                                    'temporal_spread_score')

        # Right-align severity for readability
        report_df['severity'] = report_df['severity'].map(
            lambda s: f"{s:>6s}"
        )

        w(report_df[display_cols].to_string(index=False))

    w("")
    w(f"Data: {ts_min.strftime('%Y-%m-%d %H:%M')} → "
      f"{ts_max.strftime('%Y-%m-%d %H:%M')} UTC "
      f"({n_raw:,} connections)")
|
|
910
|
+
|
|
911
|
+
|
|
912
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
913
|
+
# Export
|
|
914
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
915
|
+
|
|
916
|
+
def export_results(df_dedup: pd.DataFrame, ts_min, ts_max, n_raw: int,
                   output_dir: Path, run_ts: str, args):
    """
    Write the findings to ``output_dir`` (created if missing).

    Files are named ``scan_findings_<run_ts>.<ext>``:

    * ``.txt``  - the text report, when ``args.format`` is 'text' or 'both';
                  filtered by ``args.min_severity``.
    * ``.json`` - one JSON object per line, when ``args.format`` is 'json' or
                  'both' and there are findings. Every finding is written
                  (no severity filter) and missing values become ''.
    * ``.csv``  - all findings, whenever there are any.

    The path of each file written is printed to stdout.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    has_findings = len(df_dedup) > 0

    if args.format in ('text', 'both'):
        out_txt = output_dir / f"scan_findings_{run_ts}.txt"
        # The report contains non-ASCII punctuation, so pin the encoding
        # rather than depend on the platform's locale default.
        with open(out_txt, 'w', encoding='utf-8') as f:
            print_report(df_dedup, ts_min, ts_max, n_raw,
                         min_severity=args.min_severity, file=f)
        print(f"Report : {out_txt}")

    if args.format in ('json', 'both') and has_findings:
        out_json = output_dir / f"scan_findings_{run_ts}.json"
        with open(out_json, 'w', encoding='utf-8') as jf:
            for _, row in df_dedup.iterrows():
                event = row.to_dict()
                event['_sourcetype'] = 'lh_scan_findings'
                event['detector_version'] = VERSION
                # Blank out every missing scalar, not just None: a NaN left in
                # place would be emitted as the bare token NaN, which is not
                # valid JSON.
                event = {
                    k: ('' if pd.api.types.is_scalar(v) and pd.isna(v) else v)
                    for k, v in event.items()
                }
                jf.write(json.dumps(event) + '\n')
        print(f"JSON (Splunk) : {out_json}")

    if has_findings:
        out_csv = output_dir / f"scan_findings_{run_ts}.csv"
        df_dedup.to_csv(out_csv, index=False)
        print(f"CSV : {out_csv}")
|
|
943
|
+
|
|
944
|
+
|
|
945
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
946
|
+
# CLI
|
|
947
|
+
# ══════════════════════════════════════════════════════════════════════════════
|
|
948
|
+
|
|
949
|
+
def build_parser() -> argparse.ArgumentParser:
    """
    Assemble the ``lh-scan`` command-line parser.

    Options are arranged in three groups — network context, detection
    thresholds and output — alongside the positional ``log_path`` and the
    general ``--verbose`` / ``--version`` flags. The module docstring is used
    as the help epilog.
    """
    parser = argparse.ArgumentParser(
        prog='lh-scan',
        description='Port scan detector — part of the loghunter suite.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    parser.add_argument(
        'log_path',
        help='Path to Zeek conn.log or glob pattern (e.g. logs/conn.*.log.gz)',
    )

    # Network context
    network = parser.add_argument_group('network context')
    network.add_argument(
        '--home-nets', nargs='+', metavar='CIDR',
        default=['10.0.0.0/8', '172.16.0.0/12', '192.168.0.0/16'],
        help='Internal network CIDRs (default: RFC1918)',
    )
    network.add_argument(
        '--allowlist-ips', nargs='+', metavar='IP',
        default=[],
        help='Source IPs to exclude from scan detection',
    )

    # Detection thresholds: (flag, value type, default, help summary).
    # Every help string ends with the default it was built from.
    threshold_specs = (
        ('--vertical-threshold', int, DEFAULT_VERTICAL_PORT_THRESHOLD,
         'Distinct ports to trigger vertical scan'),
        ('--horizontal-threshold', int, DEFAULT_HORIZONTAL_HOST_THRESHOLD,
         'Distinct hosts to trigger horizontal scan'),
        ('--block-port-threshold', int, DEFAULT_BLOCK_PORT_THRESHOLD,
         'Port threshold for block scan'),
        ('--block-host-threshold', int, DEFAULT_BLOCK_HOST_THRESHOLD,
         'Host threshold for block scan'),
        ('--block-state-min', float, DEFAULT_BLOCK_SCAN_STATE_MIN,
         'Min scan_state_ratio for block scan'),
        ('--slow-state-min', float, DEFAULT_SLOW_SCAN_STATE_MIN,
         'Min scan_state_ratio for slow scan'),
        ('--slow-min-ports', int, DEFAULT_SLOW_MIN_PORTS,
         'Min unique ports for slow scan'),
        ('--slow-min-buckets', int, DEFAULT_SLOW_MIN_BUCKETS,
         'Min active time buckets for slow scan'),
        ('--fast-window', int, DEFAULT_FAST_WINDOW_SECS,
         'Fast detection window in seconds'),
        ('--slow-window', int, DEFAULT_SLOW_WINDOW_SECS,
         'Slow detection window in seconds'),
    )
    thresholds = parser.add_argument_group('detection thresholds')
    for flag, value_type, default, summary in threshold_specs:
        thresholds.add_argument(
            flag, type=value_type, default=default,
            help=f'{summary} (default: {default})',
        )

    # Output
    output = parser.add_argument_group('output')
    output.add_argument(
        '--output', metavar='DIR', default=None,
        help='Write results to this directory (default: print to stdout)',
    )
    output.add_argument(
        '--format', choices=['text', 'json', 'both'], default='text',
        help='Output format (default: text)',
    )
    output.add_argument(
        '--min-severity', choices=['HIGH', 'MEDIUM', 'LOW'], default='LOW',
        help='Minimum severity to report (default: LOW)',
    )

    parser.add_argument(
        '--verbose', '-v', action='store_true',
        help='Print progress and diagnostic detail',
    )
    parser.add_argument(
        '--version', action='version', version=f'lh-scan {VERSION}',
    )

    return parser
|
|
1021
|
+
|
|
1022
|
+
|
|
1023
|
+
def main():
    """
    Command-line entry point: load, pre-filter, detect, fingerprint,
    synthesize and report.

    Each of the vertical, horizontal and block detectors runs twice — once
    with the parsed arguments and once with ``slow_window`` replaced by
    ``fast_window`` — followed by the slow-scan analysis. Exits with status 1
    if the log cannot be loaded.
    """
    args = build_parser().parse_args()
    run_ts = datetime.now().strftime('%Y%m%d_%H%M%S')

    # ── Load ──────────────────────────────────────────────────────────────────
    print(f"lh-scan {VERSION} — loading {args.log_path}")
    try:
        df_raw, n_skipped = load_conn_log(args.log_path, verbose=args.verbose)
    except (FileNotFoundError, ValueError) as exc:
        print(f"Error: {exc}", file=sys.stderr)
        sys.exit(1)

    timestamps = df_raw['ts']
    ts_min, ts_max = timestamps.min(), timestamps.max()
    span_hours = (ts_max - ts_min).total_seconds() / 3600
    n_raw = len(df_raw)
    first_seen = ts_min.strftime('%Y-%m-%d %H:%M')
    last_seen = ts_max.strftime('%Y-%m-%d %H:%M')

    print(f"Loaded {n_raw:,} connections "
          f"({first_seen} → {last_seen} UTC, "
          f"{span_hours:.1f}h)")
    if n_skipped:
        print(f" Skipped {n_skipped:,} malformed rows")

    # ── Pre-filter ────────────────────────────────────────────────────────────
    print("Pre-filtering...")
    df = prefilter(df_raw, args)

    # ── Detect ────────────────────────────────────────────────────────────────
    print("Running vertical scan detection...")
    vertical_slow = detect_vertical_scans(df, args)
    # The fast pass reuses the same detectors with the window swapped: a copy
    # of the arguments whose slow_window carries the fast_window value.
    fast_args = argparse.Namespace(**{**vars(args), 'slow_window': args.fast_window})
    vertical_fast = detect_vertical_scans(df, fast_args)

    findings = [vertical_slow, vertical_fast]
    paired_detectors = (
        ('horizontal', detect_horizontal_scans),
        ('block', detect_block_scans),
    )
    for label, detector in paired_detectors:
        print(f"Running {label} scan detection...")
        findings.append(detector(df, args))
        findings.append(detector(df, fast_args))

    print("Running slow scan / temporal spread analysis...")
    findings.append(detect_slow_scans(df, args))

    # ── Fingerprint ───────────────────────────────────────────────────────────
    scanner_ips = list({
        ip
        for frame in findings if len(frame) > 0
        for ip in frame['src_ip'].unique()
    })
    fingerprints = conn_state_fingerprints(df, scanner_ips)

    # ── Synthesize ────────────────────────────────────────────────────────────
    df_findings = synthesize(findings, fingerprints)

    # ── Report ────────────────────────────────────────────────────────────────
    # The report always goes to stdout; files are written only with --output.
    if args.output:
        export_results(df_findings, ts_min, ts_max, n_raw,
                       Path(args.output), run_ts, args)
    else:
        print()
    print_report(df_findings, ts_min, ts_max, n_raw,
                 min_severity=args.min_severity)
|
|
1094
|
+
|
|
1095
|
+
|
|
1096
|
+
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|