loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,1097 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ lh-scan — Port Scan Detector
4
+ Part of the loghunter suite.
5
+
6
+ Detects port scanning activity from Zeek conn.log data.
7
+
8
+ Scan types detected:
9
+ vertical one source → many ports on one target host
10
+ horizontal one source → same port across many hosts
11
+ block one source → many ports AND many hosts
12
+ slow activity spread across time windows to evade per-window thresholds
13
+
14
+ Usage:
15
+ lh-scan conn.log
16
+ lh-scan /path/to/logs/conn.*.log.gz
17
+ lh-scan conn.log --output scan_results/
18
+ lh-scan conn.log --format json
19
+ lh-scan conn.log --min-severity MEDIUM
20
+ lh-scan conn.log --vertical-threshold 20 --horizontal-threshold 20
21
+ """
22
+
23
+ import argparse
24
+ import gzip
25
+ import glob
26
+ import json
27
+ import os
28
+ import sys
29
+ import warnings
30
+ from datetime import datetime, timezone
31
+ from pathlib import Path
32
+
33
+ import numpy as np
34
+ import pandas as pd
35
+ from scipy.stats import entropy as scipy_entropy
36
+ from tqdm import tqdm
37
+
38
+ warnings.filterwarnings('ignore')
39
+
# ── Version ───────────────────────────────────────────────────────────────────
VERSION = '1.0.0'

# ── Defaults ──────────────────────────────────────────────────────────────────
# Distinct-port / distinct-host counts at which each detector fires.
DEFAULT_VERTICAL_PORT_THRESHOLD = 15
DEFAULT_HORIZONTAL_HOST_THRESHOLD = 15
DEFAULT_BLOCK_PORT_THRESHOLD = 20
DEFAULT_BLOCK_HOST_THRESHOLD = 20
# Minimum share of scan-indicative connection states (see SCAN_STATES).
DEFAULT_BLOCK_SCAN_STATE_MIN = 0.30
DEFAULT_SLOW_SCAN_STATE_MIN = 0.30
# Window lengths in seconds.
DEFAULT_FAST_WINDOW_SECS = 60
DEFAULT_SLOW_WINDOW_SECS = 3600
DEFAULT_MIN_CONNECTIONS = 3
DEFAULT_SLOW_MIN_PORTS = 8
DEFAULT_SLOW_MIN_BUCKETS = 4

# Zeek conn_state values treated as scan-indicative by the detectors.
SCAN_STATES = set('S0 REJ RSTO RSTR SH OTH'.split())

BITTORRENT_PORTS_PEER = set(range(6881, 6890)) | {51413, 51414}
BITTORRENT_PORTS_TRACKER = {6969, 2710}

# IoT/smart device discovery ports — multicast/broadcast, structurally produce
# high S0/OTH rates that are not scanning
IOT_DISCOVERY_PORTS = {
    5353,  # mDNS
    1900,  # SSDP/UPnP
    5355,  # LLMNR
    137,   # NetBIOS Name Service
    138,   # NetBIOS Datagram
}

# IoT multicast/broadcast destination ranges — connections to these are never scans
IOT_MULTICAST_PREFIXES = (
    '224.',
    '239.',
    '255.255.255.255',
    'ff0',
    'ff1',
    'ff2',
)

DARK_PORTS = {0, 1, 2, 3, 4, 6, 8}

# conn.log keys every record must carry, and keys copied when present.
REQUIRED_FIELDS = {
    'ts',
    'id.orig_h',
    'id.resp_h',
    'id.resp_p',
    'proto',
    'conn_state',
}
OPTIONAL_FIELDS = {
    'orig_bytes',
    'resp_bytes',
    'duration',
    'orig_pkts',
    'resp_pkts',
}

SCAN_TYPE_DESCRIPTIONS = dict(
    vertical='Port scan (one host, many ports)',
    horizontal='Network sweep (many hosts, one port)',
    block='Block scan (many hosts AND many ports)',
)

STATE_EXPLANATIONS = dict([
    ('S0', 'SYN sent, no response (filtered/firewalled)'),
    ('REJ', 'Port closed (RST received)'),
    ('RSTO', 'Connection reset by originator'),
    ('RSTR', 'Connection reset by responder'),
    ('SF', 'Normal established+closed connection'),
    ('SH', 'Half-open scan (SYN+FIN)'),
    ('OTH', 'No SYN observed'),
])
95
+
96
+
97
+ # ══════════════════════════════════════════════════════════════════════════════
98
+ # Data loading
99
+ # ══════════════════════════════════════════════════════════════════════════════
100
+
def open_log(path: str):
    """Open a plain or gzipped log file for text reading.

    ``path`` may be a string or any ``os.PathLike`` object. A name ending in
    ``.gz`` is opened through ``gzip``; anything else through the builtin
    ``open``. Both decode as UTF-8 and replace undecodable bytes instead of
    raising.

    Returns a text-mode file object that can be used as a context manager.
    """
    # Normalise first so pathlib.Path arguments work with endswith().
    path = os.fspath(path)
    opener = gzip.open if path.endswith('.gz') else open
    return opener(path, 'rt', encoding='utf-8', errors='replace')
106
+
107
+
def load_conn_log(pattern: str, verbose: bool = False) -> tuple[pd.DataFrame, int]:
    """
    Load one or more Zeek conn.log files matching a glob pattern.

    Handles plain and gzipped files transparently (via ``open_log``).
    Parses line-by-line with json.loads — avoids ujson issues with Zeek output.

    A ``pattern`` containing ``*`` or ``?`` is expanded with ``glob``; any
    other value is treated as a single literal path. Blank lines and lines
    starting with ``#`` are ignored. A line is counted as skipped when it is
    not valid JSON, is valid JSON but not an object, or lacks any of
    ``REQUIRED_FIELDS``.

    Returns ``(df, skipped)``: the records sorted by ``ts`` with the Zeek id
    columns renamed to ``src_ip`` / ``dst_ip`` / ``dst_port``, and the number
    of skipped lines.

    Raises ``FileNotFoundError`` when a wildcard pattern matches nothing and
    ``ValueError`` when no usable record was found.
    """
    has_wildcard = '*' in pattern or '?' in pattern
    paths = sorted(glob.glob(pattern)) if has_wildcard else [pattern]
    if not paths:
        raise FileNotFoundError(f"No files matched: {pattern}")

    rows = []
    skipped = 0

    for path in paths:
        with open_log(path) as fh:
            for line in tqdm(fh, desc=f" {Path(path).name}", unit=" lines",
                             leave=False, disable=not verbose):
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                try:
                    rec = json.loads(line)
                except json.JSONDecodeError:
                    skipped += 1
                    continue
                # json.loads also accepts bare lists/numbers/strings; those have
                # no keys, so treat them as unusable instead of crashing.
                if not isinstance(rec, dict) or not REQUIRED_FIELDS.issubset(rec):
                    skipped += 1
                    continue
                row = {f: rec[f] for f in REQUIRED_FIELDS}
                for f in OPTIONAL_FIELDS:
                    row[f] = rec.get(f)  # None when the optional key is absent
                rows.append(row)

    if not rows:
        raise ValueError(f"No valid conn.log records found in: {pattern}")

    df = pd.DataFrame(rows)
    df.rename(columns={
        'id.orig_h': 'src_ip',
        'id.resp_h': 'dst_ip',
        'id.resp_p': 'dst_port',
    }, inplace=True)
    # ts is interpreted as epoch seconds.
    df['ts'] = pd.to_datetime(df['ts'], unit='s', utc=True)
    # Nullable integer dtype: unparseable ports become <NA> rather than raising.
    df['dst_port'] = pd.to_numeric(df['dst_port'], errors='coerce').astype('Int32')
    df.sort_values('ts', inplace=True)
    df.reset_index(drop=True, inplace=True)
    return df, skipped
155
+
156
+
157
+ # ══════════════════════════════════════════════════════════════════════════════
158
+ # Pre-filtering
159
+ # ══════════════════════════════════════════════════════════════════════════════
160
+
def ip_in_nets(ip: str, nets: list) -> bool:
    """Return True if ip falls within any of the given CIDR strings.

    An ``ip`` that cannot be parsed yields False. Each entry of ``nets`` is
    parsed non-strictly (host bits allowed); an entry that is not a valid
    network is skipped, so one bad entry no longer masks a later match.
    """
    import ipaddress

    try:
        addr = ipaddress.ip_address(ip)
    except ValueError:
        return False

    for cidr in nets:
        try:
            network = ipaddress.ip_network(cidr, strict=False)
        except ValueError:
            continue  # malformed entry: ignore it, keep checking the rest
        if addr in network:
            return True
    return False
169
+
170
+
def build_internal_mask(series: pd.Series, nets: list) -> pd.Series:
    """Classify each IP in ``series`` as inside (True) or outside (False) ``nets``.

    The CIDR strings are parsed once up front instead of once per row.
    Entries that are not valid networks are skipped, and values of ``series``
    that are not valid IP addresses map to False. The result keeps the index
    of ``series``.
    """
    import ipaddress

    networks = []
    for cidr in nets:
        try:
            networks.append(ipaddress.ip_network(cidr, strict=False))
        except ValueError:
            continue  # malformed entry: ignore it

    def is_internal(ip) -> bool:
        try:
            addr = ipaddress.ip_address(ip)
        except ValueError:
            return False
        return any(addr in network for network in networks)

    return series.map(is_internal)
174
+
175
+
def classify_direction(src_int: bool, dst_int: bool) -> str:
    """Return the '<source>→<destination>' label for one connection.

    Each side is 'internal' when its flag is truthy and 'external' otherwise.
    """
    origin = 'internal' if src_int else 'external'
    target = 'internal' if dst_int else 'external'
    return f'{origin}→{target}'
184
+
185
+
def prefilter(df_raw: pd.DataFrame, args) -> pd.DataFrame:
    """
    Apply pre-filters to remove traffic that produces structural false positives:
    - ICMP: Zeek encodes type/code in port fields — not real port numbers
    - IPv6 link-local (addresses starting with 'fe80:'): neighbor discovery, not scanning
    - IoT multicast/broadcast destinations: mDNS, SSDP, etc.
    - Allowlisted source IPs
    Then classify direction (internal/external) using home_nets.

    Reads ``args.allowlist_ips``, ``args.home_nets`` and ``args.verbose``.
    ``df_raw`` is not modified; the returned frame is an independent copy with
    an added ``direction`` column.
    """
    n_raw = len(df_raw)

    # ICMP — port field semantics are different, not suitable for scan detection
    icmp_mask = df_raw['proto'] == 'icmp'
    df = df_raw[~icmp_mask]
    n_icmp = icmp_mask.sum()

    # IPv6 link-local (na=False: a missing address is simply "not link-local")
    ipv6_ll_mask = (df['src_ip'].str.startswith('fe80:', na=False) |
                    df['dst_ip'].str.startswith('fe80:', na=False)).astype(bool)
    df = df[~ipv6_ll_mask]
    n_ipv6 = ipv6_ll_mask.sum()

    # IoT multicast/broadcast destinations.
    # astype(bool) matters when every row has already been filtered out:
    # mapping over an empty column may come back as object dtype, which
    # pandas would not treat as a boolean row mask.
    multicast_mask = df['dst_ip'].map(
        lambda ip: isinstance(ip, str) and ip.startswith(IOT_MULTICAST_PREFIXES)
    ).astype(bool)
    df = df[~multicast_mask]
    n_multicast = multicast_mask.sum()

    # Allowlisted source IPs
    n_allowlist = 0
    if args.allowlist_ips:
        al_mask = df['src_ip'].isin(args.allowlist_ips)
        n_allowlist = al_mask.sum()
        df = df[~al_mask]

    # One copy at the end (instead of one per filter) so the caller's frame
    # is untouched when the direction column is added.
    df = df.copy()

    # Direction classification
    home_nets = args.home_nets or []
    src_int = build_internal_mask(df['src_ip'], home_nets)
    dst_int = build_internal_mask(df['dst_ip'], home_nets)
    df['direction'] = [classify_direction(si, di)
                       for si, di in zip(src_int, dst_int)]

    if args.verbose:
        print(" Pre-filter summary:")
        print(f" Raw rows : {n_raw:,}")
        print(f" ICMP excluded : {n_icmp:,}")
        print(f" IPv6 LL excl. : {n_ipv6:,}")
        print(f" Multicast excl: {n_multicast:,}")
        if n_allowlist:
            print(f" Allowlist excl: {n_allowlist:,}")
        print(f" Working rows : {len(df):,}")

    return df
241
+
242
+
243
+ # ══════════════════════════════════════════════════════════════════════════════
244
+ # Detectors
245
+ # ══════════════════════════════════════════════════════════════════════════════
246
+
def detect_vertical_scans(df: pd.DataFrame, args) -> pd.DataFrame:
    """
    Vertical scan: one src → many distinct ports on one dst.

    Two-pass: a global groupby keeps only (src, dst) pairs whose overall
    distinct-port count reaches the threshold, then a sliding window over
    each surviving pair finds the largest number of distinct ports seen
    within ``args.slow_window`` seconds.

    Reads ``args.vertical_threshold``, ``args.slow_window`` and
    ``args.verbose``. Returns one row per flagged pair, or an empty DataFrame
    when nothing qualifies. ``total_conns`` and ``scan_state_ratio`` describe
    all connections of the pair, not only those inside the best window.
    """
    threshold = args.vertical_threshold
    window_secs = args.slow_window

    # Pass 1 (nunique ignores missing ports)
    global_counts = (
        df.groupby(['src_ip', 'dst_ip'])['dst_port']
        .nunique()
        .reset_index(name='global_distinct_ports')
    )
    candidates = global_counts[global_counts['global_distinct_ports'] >= threshold]

    if args.verbose:
        print(f" Vertical Pass 1: {len(candidates)} candidate pairs "
              f"(of {len(global_counts):,} total)")

    if len(candidates) == 0:
        return pd.DataFrame()

    # Pass 2: merge instead of apply() for scalability
    cand_keys = candidates[['src_ip', 'dst_ip']]
    df_cands = df.merge(cand_keys, on=['src_ip', 'dst_ip'])

    results = []
    for (src, dst), grp in df_cands.groupby(['src_ip', 'dst_ip']):
        grp = grp.sort_values('ts')
        ts_arr = grp['ts'].values.astype('int64') / 1e9  # epoch seconds
        port_arr = grp['dst_port'].values
        state_arr = grp['conn_state'].values
        # pd.isna recognises None, NaN and pd.NA alike; a plain NaN check
        # would let pd.NA through and count it as one more "port".
        has_port = ~np.asarray(pd.isna(port_arr), dtype=bool)

        port_counts = {}  # port -> occurrences inside the current window
        max_ports_in_window = 0
        best_window_start = ts_arr[0]
        left = 0

        for right in range(len(ts_arr)):
            if has_port[right]:
                p = int(port_arr[right])
                port_counts[p] = port_counts.get(p, 0) + 1
            # Shrink from the left until the window spans <= window_secs.
            while ts_arr[right] - ts_arr[left] > window_secs:
                if has_port[left]:
                    lp = int(port_arr[left])
                    port_counts[lp] -= 1
                    if port_counts[lp] == 0:
                        del port_counts[lp]
                left += 1
            n = len(port_counts)
            if n > max_ports_in_window:
                max_ports_in_window = n
                best_window_start = ts_arr[left]

        if max_ports_in_window < threshold:
            continue

        state_counts = pd.Series(state_arr).value_counts()
        total_conns = len(state_arr)
        scan_state_count = sum(state_counts.get(s, 0) for s in SCAN_STATES)
        scan_state_ratio = scan_state_count / total_conns

        port_series = pd.Series(port_arr).dropna()
        # include_lowest so port 0 lands in 'well-known' instead of no bucket.
        port_buckets = pd.cut(port_series, bins=[0, 1023, 49151, 65535],
                              labels=['well-known', 'registered', 'ephemeral'],
                              include_lowest=True)
        # +1 smoothing keeps empty buckets from zeroing the distribution.
        port_range_entropy = scipy_entropy(port_buckets.value_counts().values + 1)

        results.append({
            'scan_type'          : 'vertical',
            'src_ip'             : src,
            'dst_ip'             : dst,
            'dst_port'           : None,
            'port_class'         : None,
            'distinct_ports'     : max_ports_in_window,
            'distinct_hosts'     : 1,
            'total_conns'        : total_conns,
            'scan_state_ratio'   : round(scan_state_ratio, 3),
            'top_states'         : ', '.join(state_counts.head(3).index.tolist()),
            'port_range_entropy' : round(port_range_entropy, 3),
            'window_start'       : datetime.fromtimestamp(
                                       best_window_start, tz=timezone.utc
                                   ).strftime('%Y-%m-%d %H:%M:%S'),
            'window_secs'        : window_secs,
            'direction'          : grp['direction'].iloc[0],
        })

    return pd.DataFrame(results)
335
+
336
+
def detect_horizontal_scans(df: pd.DataFrame, args) -> pd.DataFrame:
    """
    Horizontal scan: one src → same port across many distinct hosts.

    Pass 1 keeps the (src, port) pairs whose overall distinct-host count
    reaches ``args.horizontal_threshold``. Pass 2 slides a window of
    ``args.slow_window`` seconds over each pair and records the largest
    number of distinct hosts seen in any window. Rows without a destination
    port are ignored. Returns one row per flagged pair, or an empty DataFrame.
    """
    min_hosts = args.horizontal_threshold
    span_secs = args.slow_window

    with_port = df[df['dst_port'].notna()].copy()

    # Pass 1
    per_pair = (
        with_port.groupby(['src_ip', 'dst_port'])['dst_ip']
        .nunique()
        .reset_index(name='global_distinct_hosts')
    )
    shortlisted = per_pair[per_pair['global_distinct_hosts'] >= min_hosts]

    if args.verbose:
        print(f" Horizontal Pass 1: {len(shortlisted)} candidate pairs "
              f"(of {len(per_pair):,} total)")

    if len(shortlisted) == 0:
        return pd.DataFrame()

    # Pass 2: merge for scalability
    pair_rows = with_port.merge(
        shortlisted[['src_ip', 'dst_port']], on=['src_ip', 'dst_port']
    )

    found = []
    for (src, port), rows in pair_rows.groupby(['src_ip', 'dst_port']):
        rows = rows.sort_values('ts')
        times = rows['ts'].values.astype('int64') / 1e9  # epoch seconds
        targets = rows['dst_ip'].values
        states = rows['conn_state'].values

        in_window = {}  # host -> occurrences inside the current window
        peak = 0
        peak_start = times[0]
        tail = 0

        for head, now in enumerate(times):
            host = targets[head]
            if host is not None:
                in_window[host] = in_window.get(host, 0) + 1
            # Advance the tail until the window spans <= span_secs.
            while now - times[tail] > span_secs:
                leaving = targets[tail]
                if leaving is not None:
                    in_window[leaving] -= 1
                    if in_window[leaving] == 0:
                        del in_window[leaving]
                tail += 1
            if len(in_window) > peak:
                peak = len(in_window)
                peak_start = times[tail]

        if peak < min_hosts:
            continue

        state_counts = pd.Series(states).value_counts()
        total_conns = len(states)
        scan_hits = sum(state_counts.get(s, 0) for s in SCAN_STATES)
        scan_state_ratio = scan_hits / total_conns
        # Peak hosts divided by the pair's whole observed span (min 1 s).
        velocity = peak / max(times[-1] - times[0], 1)

        port = int(port)
        port_class = (
            'well-known' if port <= 1023
            else 'registered' if port <= 49151
            else 'ephemeral'
        )

        started = datetime.fromtimestamp(peak_start, tz=timezone.utc)
        found.append({
            'scan_type'              : 'horizontal',
            'src_ip'                 : src,
            'dst_ip'                 : None,
            'dst_port'               : port,
            'port_class'             : port_class,
            'distinct_ports'         : 1,
            'distinct_hosts'         : peak,
            'total_conns'            : total_conns,
            'scan_state_ratio'       : round(scan_state_ratio, 3),
            'top_states'             : ', '.join(state_counts.head(3).index.tolist()),
            'velocity_hosts_per_sec' : round(velocity, 4),
            'window_start'           : started.strftime('%Y-%m-%d %H:%M:%S'),
            'window_secs'            : span_secs,
            'direction'              : rows['direction'].iloc[0],
        })

    return pd.DataFrame(found)
430
+
431
+
def detect_block_scans(df: pd.DataFrame, args) -> pd.DataFrame:
    """
    Block scan: one src → many ports AND many hosts.

    scan_state_ratio_min is a hard gate — without it any active workstation fires.

    Connections are grouped into fixed buckets of ``args.slow_window``
    seconds. A source must pass the port, host and scan-state gates first
    over all of its traffic and then inside a single bucket. At most one row
    per source is returned: the bucket with the largest
    distinct_ports × distinct_hosts product.
    """
    min_ports = args.block_port_threshold
    min_hosts = args.block_host_threshold
    min_ratio = args.block_state_min
    window_secs = args.slow_window

    scoped = df[df['dst_port'].notna()].copy()
    epoch_secs = scoped['ts'].values.astype('int64') / 1e9
    scoped['time_bucket'] = (epoch_secs // window_secs).astype(int)
    scoped['is_scan_state'] = scoped['conn_state'].isin(SCAN_STATES)

    # Pass 1: global filter
    per_src = scoped.groupby('src_ip').agg(
        global_distinct_ports=('dst_port', 'nunique'),
        global_distinct_hosts=('dst_ip', 'nunique'),
        scan_state_ratio=('is_scan_state', 'mean'),
    ).reset_index()

    global_gate = (
        (per_src['global_distinct_ports'] >= min_ports)
        & (per_src['global_distinct_hosts'] >= min_hosts)
        & (per_src['scan_state_ratio'] >= min_ratio)
    )
    suspects = per_src[global_gate]

    if args.verbose:
        print(f" Block Pass 1: {len(suspects)} candidate src IPs "
              f"(of {per_src['src_ip'].nunique():,} total)")

    if len(suspects) == 0:
        return pd.DataFrame()

    def top_three_states(states):
        return ', '.join(states.value_counts().head(3).index.tolist())

    def count_well_known(ports):
        return (ports <= 1023).sum()

    def count_registered(ports):
        return ((ports > 1023) & (ports <= 49151)).sum()

    def count_ephemeral(ports):
        return (ports > 49151).sum()

    def earliest_epoch(stamps):
        return stamps.values.astype('int64').min() / 1e9

    # Pass 2: per-(source, bucket) statistics for the surviving sources only.
    suspect_rows = scoped[scoped['src_ip'].isin(suspects['src_ip'])]
    per_bucket = suspect_rows.groupby(['src_ip', 'time_bucket']).agg(
        distinct_ports=('dst_port', 'nunique'),
        distinct_hosts=('dst_ip', 'nunique'),
        total_conns=('dst_port', 'count'),
        scan_state_ratio=('is_scan_state', 'mean'),
        top_states=('conn_state', top_three_states),
        direction=('direction', 'first'),
        ports_well_known=('dst_port', count_well_known),
        ports_registered=('dst_port', count_registered),
        ports_ephemeral=('dst_port', count_ephemeral),
        window_start_ts=('ts', earliest_epoch),
    ).reset_index()

    bucket_gate = (
        (per_bucket['distinct_ports'] >= min_ports)
        & (per_bucket['distinct_hosts'] >= min_hosts)
        & (per_bucket['scan_state_ratio'] >= min_ratio)
    )
    findings = per_bucket[bucket_gate].copy()

    if len(findings) == 0:
        return pd.DataFrame()

    def fmt_epoch(ts):
        return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    findings['scan_type'] = 'block'
    findings['dst_ip'] = None
    findings['dst_port'] = None
    findings['port_class'] = None
    findings['window_secs'] = window_secs
    findings['window_start'] = findings['window_start_ts'].map(fmt_epoch)
    findings['scan_state_ratio'] = findings['scan_state_ratio'].round(3)
    findings['breadth_score'] = findings['distinct_ports'] * findings['distinct_hosts']

    # Keep only the broadest bucket per source, then drop the helper columns.
    ranked = findings.sort_values('breadth_score', ascending=False)
    best_per_src = ranked.drop_duplicates(subset=['src_ip'], keep='first')
    trimmed = best_per_src.drop(
        columns=['time_bucket', 'window_start_ts', 'breadth_score']
    )
    return trimmed.reset_index(drop=True)
513
+
514
+
def detect_slow_scans(df: pd.DataFrame, args) -> pd.DataFrame:
    """
    Slow scan / temporal spread detector.

    Finds hosts whose port diversity is spread across many time buckets,
    staying below per-window thresholds deliberately.

    temporal_spread_score = total_unique_ports / max_ports_in_any_single_bucket
        Score >> 1 = deliberately spread (slow scan pattern)
        Score ≈ 1  = clustered in time (normal behavior)

    scan_state_ratio gate filters out IoT/mobile devices whose spread comes
    from network attach/detach cycles rather than scanning activity.

    Reads ``args.slow_min_ports``, ``args.slow_min_buckets``,
    ``args.slow_state_min``, ``args.slow_window`` and
    ``args.vertical_threshold``. Returns one row per flagged source, sorted
    by temporal_spread_score descending, or an empty DataFrame.
    """
    min_ports = args.slow_min_ports
    min_buckets = args.slow_min_buckets
    state_min = args.slow_state_min
    bucket_secs = args.slow_window
    vert_threshold = args.vertical_threshold

    df_w = df[df['dst_port'].notna()].copy()
    df_w['time_bucket'] = (
        df_w['ts'].values.astype('int64') / 1e9 // bucket_secs
    ).astype(int)

    # IoT pattern recognition helper
    def is_iot_discovery(grp: pd.DataFrame) -> bool:
        """
        Return True if this src looks like IoT device discovery traffic rather
        than scanning: its three busiest ports are all discovery ports (or
        53/443/80) and fewer than 10% of its connections go to an external
        destination.
        """
        port_counts = grp['dst_port'].value_counts()
        top_ports = set(port_counts.head(3).index.tolist())
        if not top_ports.issubset(IOT_DISCOVERY_PORTS | {53, 443, 80}):
            return False
        # direction is '<src>→<dst>': the destination is the suffix. Testing
        # the prefix would look at the source, which is the same for every
        # row of a per-source group.
        dst_external = ~grp['direction'].str.endswith('internal')
        return dst_external.sum() / len(grp) < 0.1

    results = []

    for src, grp in df_w.groupby('src_ip'):
        n_buckets = grp['time_bucket'].nunique()
        if n_buckets < min_buckets:
            continue

        total_unique_ports = grp['dst_port'].nunique()
        if total_unique_ports < min_ports:
            continue

        max_ports_in_bucket = grp.groupby('time_bucket')['dst_port'].nunique().max()

        # Already caught by vertical detector — skip
        if max_ports_in_bucket >= vert_threshold:
            continue

        spread_score = round(total_unique_ports / max(max_ports_in_bucket, 1), 2)
        state_counts = grp['conn_state'].value_counts()
        scan_state_ratio = sum(state_counts.get(s, 0) for s in SCAN_STATES) / len(grp)

        # State ratio gate — filters IoT/mobile network attach/detach patterns
        if scan_state_ratio < state_min:
            continue

        # Pattern tag for slow scan findings
        if is_iot_discovery(grp):
            pattern_tag = 'iot_discovery'
            pattern_notes = (
                "Traffic pattern consistent with IoT device discovery (mDNS/SSDP/UPnP). "
                "High temporal spread from repeated network attach/detach cycles rather "
                "than deliberate scanning. Add to iot_devices in loghunter.conf to suppress."
            )
        elif scan_state_ratio >= 0.60:
            pattern_tag = 'slow_scan'
            pattern_notes = (
                f"Temporal spread score {spread_score:.2f} with {scan_state_ratio:.1%} "
                f"scan-indicative states across {n_buckets} time windows. "
                "Activity deliberately paced below per-window detection threshold. "
                "Strong slow scan signature."
            )
        else:
            pattern_tag = 'slow_scan_candidate'
            pattern_notes = (
                f"Temporal spread score {spread_score:.2f} with {scan_state_ratio:.1%} "
                f"scan-indicative states across {n_buckets} time windows. "
                "Moderate confidence — review destination IPs and ports."
            )

        results.append({
            'scan_type'             : 'slow',
            'src_ip'                : src,
            'dst_ip'                : None,
            'dst_port'              : None,
            'port_class'            : None,
            'distinct_ports'        : total_unique_ports,
            'distinct_hosts'        : grp['dst_ip'].nunique(),
            'max_ports_in_bucket'   : int(max_ports_in_bucket),
            'active_buckets'        : n_buckets,
            'temporal_spread_score' : spread_score,
            'total_conns'           : len(grp),
            'scan_state_ratio'      : round(scan_state_ratio, 3),
            'top_states'            : ', '.join(state_counts.head(3).index.tolist()),
            'window_start'          : grp['ts'].min().strftime('%Y-%m-%d %H:%M:%S'),
            'window_secs'           : bucket_secs,
            'direction'             : grp['direction'].iloc[0],
            'pattern_tag'           : pattern_tag,
            'pattern_notes'         : pattern_notes,
        })

    if not results:
        return pd.DataFrame()

    return (
        pd.DataFrame(results)
        .sort_values('temporal_spread_score', ascending=False)
        .reset_index(drop=True)
    )
640
+
641
+
642
+ # ══════════════════════════════════════════════════════════════════════════════
643
+ # Fingerprinting
644
+ # ══════════════════════════════════════════════════════════════════════════════
645
+
def conn_state_fingerprints(df: pd.DataFrame, scanner_ips: list) -> dict:
    """Compute per-src connection state fingerprints for candidate scanner IPs.

    Returns ``{ip: fingerprint}`` for every IP in ``scanner_ips`` that occurs
    as ``src_ip`` in ``df``, in the order given; IPs with no rows are
    omitted. Each fingerprint holds the connection count, the normalised
    conn_state distribution (rounded to 3 places), the summed share of
    ``SCAN_STATES`` and the most frequent state.

    The frame is filtered and grouped once, rather than scanned once per IP.
    """
    wanted = list(scanner_ips)
    relevant = df[df['src_ip'].isin(wanted)]
    # sort=False: group order is irrelevant, lookups below go by key.
    rows_by_src = {ip: rows for ip, rows in relevant.groupby('src_ip', sort=False)}

    fps = {}
    for ip in wanted:
        src_df = rows_by_src.get(ip)
        if src_df is None:
            continue
        dist = src_df['conn_state'].value_counts(normalize=True).round(3)
        scan_score = sum(dist.get(s, 0) for s in SCAN_STATES)
        fps[ip] = {
            'total_connections' : len(src_df),
            'state_distribution': dist.to_dict(),
            'scan_state_score'  : round(scan_score, 3),
            # value_counts sorts by frequency, so the first label dominates.
            'dominant_state'    : dist.index[0] if len(dist) > 0 else None,
        }
    return fps
662
+
663
+
664
+ # ══════════════════════════════════════════════════════════════════════════════
665
+ # Pattern classification
666
+ # ══════════════════════════════════════════════════════════════════════════════
667
+
def classify_finding(row) -> tuple[str, str]:
    """
    Returns (pattern_tag, explanation) for a finding.

    ``row`` is a mapping-like finding supporting ``[]`` and ``.get``. Rules
    are tried from most specific to least specific and the first match wins;
    ('unknown', '') is returned when none applies.
    """
    src = row['src_ip']  # unused below; lookup kept so a row without src_ip still raises
    dst_port = row.get('dst_port')
    fail_ratio = row['scan_state_ratio']
    n_hosts = row.get('distinct_hosts') or 0
    n_ports = row.get('distinct_ports') or 0
    kind = row['scan_type']

    # Slow scan findings are pre-tagged by detect_slow_scans()
    if kind == 'slow':
        tag = row.get('pattern_tag', 'slow_scan_candidate')
        notes = row.get('pattern_notes', '')
        return (tag, notes)

    # ── IoT discovery ports ──
    if dst_port in IOT_DISCOVERY_PORTS and fail_ratio < 0.40:
        return (
            'iot_discovery',
            f"Port {dst_port} is an IoT/device discovery port (mDNS/SSDP/UPnP/NetBIOS). "
            f"High host counts on this port are normal for device discovery protocols. "
            f"Not a port scan. Add source to iot_devices in loghunter.conf to suppress.",
        )

    # ── BitTorrent peer ports ──
    if dst_port in BITTORRENT_PORTS_PEER and fail_ratio >= 0.50:
        return (
            'bittorrent',
            f"BitTorrent peer connections on port {dst_port} — {n_hosts} peers contacted, "
            f"{fail_ratio:.1%} failed connections (normal for BT peer discovery). "
            f"If this host shouldn't run BitTorrent, investigate.",
        )

    # ── BitTorrent tracker ports ──
    if dst_port in BITTORRENT_PORTS_TRACKER and fail_ratio >= 0.15:
        return (
            'bittorrent',
            f"BitTorrent tracker traffic on port {dst_port} — {n_hosts} trackers contacted, "
            f"{fail_ratio:.1%} failed connections (normal for tracker announce/scrape). "
            f"If this host shouldn't run BitTorrent, investigate.",
        )

    many_hosts = n_hosts >= 15

    # ── DNS recursive resolution ──
    if dst_port == 53 and fail_ratio < 0.05 and many_hosts:
        return (
            'dns_resolver',
            f"DNS recursive resolution — {n_hosts} external resolvers on port 53, "
            f"{fail_ratio:.1%} failed. This is a DNS server or resolver, not a scanner. "
            f"Add to dns_servers in loghunter.conf to suppress.",
        )

    # ── Normal HTTPS browsing / cloud services ──
    if dst_port == 443 and fail_ratio < 0.10 and many_hosts:
        return (
            'https_browsing',
            f"HTTPS to {n_hosts} external hosts, {fail_ratio:.1%} failed — consistent with "
            f"normal web browsing or cloud service traffic. "
            f"Add to workstations or servers in loghunter.conf to suppress.",
        )

    # ── Normal HTTP browsing ──
    if dst_port == 80 and fail_ratio < 0.10 and many_hosts:
        return (
            'http_browsing',
            f"HTTP to {n_hosts} external hosts, {fail_ratio:.1%} failed — consistent with "
            f"normal web traffic.",
        )

    # ── Streaming device / DNS-blocked HTTPS ──
    if dst_port == 443 and 0.10 <= fail_ratio < 0.50 and n_hosts >= 20:
        return (
            'streaming_blocked',
            f"{n_hosts} HTTPS destinations, {fail_ratio:.1%} failed. On a media/streaming "
            f"device this pattern is consistent with DNS-level blocking (Pi-hole, "
            f"NextDNS) causing direct connection fallback attempts. "
            f"Add to media_devices in loghunter.conf to suppress.",
        )

    # ── Dark / unassigned ports ──
    if dst_port in DARK_PORTS and fail_ratio >= 0.90:
        return (
            'dark_traffic',
            f"Port {dst_port} is unassigned/reserved — likely a Zeek encoding artifact "
            f"(e.g. ICMP type/code) or internet background radiation. "
            f"Check proto field in conn.log.",
        )

    # ── Strong scanner signature ──
    if fail_ratio >= 0.60:
        if kind == 'vertical' and n_ports >= 1000:
            return (
                'confirmed_scan',
                f"Full port range scan — {n_ports} distinct ports on single target "
                f"with {fail_ratio:.1%} scan-indicative states. Strong scanner signature.",
            )
        breadth_label = 'ports' if kind == 'vertical' else 'hosts'
        return (
            'confirmed_scan',
            f"{fail_ratio:.1%} scan-indicative states "
            f"({breadth_label}: {max(n_ports, n_hosts)}). "
            f"Strong scanner signature.",
        )

    return ('unknown', '')
754
+
755
+
756
def severity_label(row) -> str:
    """
    Translate a classified finding into 'HIGH', 'MEDIUM' or 'LOW'.

    The scan_state_ratio is the primary signal. Breadth (the larger of
    distinct_ports / distinct_hosts) can only escalate a finding that already
    has a meaningful ratio; it never raises severity on its own.
    Findings tagged as a known benign pattern are always LOW.

    `row` must provide 'scan_state_ratio' and 'pattern_tag' via item access
    and support `.get()` for the optional fields.
    """
    benign_tags = ('dns_resolver', 'https_browsing', 'http_browsing',
                   'iot_discovery', 'dark_traffic')

    state_ratio = row['scan_state_ratio']
    n_ports = row.get('distinct_ports') or 0
    n_hosts = row.get('distinct_hosts') or 0
    widest = max(n_ports, n_hosts)
    label = row['pattern_tag']

    # Known-benign traffic never escalates, however broad it is.
    if label in benign_tags:
        return 'LOW'

    # Slow-scan findings: only a confirmed 'slow_scan' tag is escalated.
    if row.get('scan_type') == 'slow':
        if label != 'slow_scan':
            return 'LOW'
        return 'HIGH' if state_ratio >= 0.60 else 'MEDIUM'

    # Ratio alone, or a moderate ratio backed by wide breadth.
    if state_ratio >= 0.60 or (state_ratio >= 0.30 and widest >= 50):
        return 'HIGH'
    if state_ratio >= 0.20 or (state_ratio >= 0.10 and widest >= 25):
        return 'MEDIUM'
    return 'LOW'
786
+
787
+
788
+ # ══════════════════════════════════════════════════════════════════════════════
789
+ # Synthesis
790
+ # ══════════════════════════════════════════════════════════════════════════════
791
+
792
def synthesize(detector_outputs: list[pd.DataFrame],
               fingerprints: dict) -> pd.DataFrame:
    """
    Combine all detector outputs, attach fingerprints, classify, assign severity,
    deduplicate across fast/slow windows, and sort by severity.

    Parameters
    ----------
    detector_outputs : list of findings frames, one per detector run; empty
        frames are ignored.
    fingerprints : mapping of source IP -> dict; when that dict carries a
        'scan_state_score' it is attached to every finding of that source.
        Sources without one fall back to the scan_state_ratio of their first
        finding.

    Returns
    -------
    A frame with one row per unique (scan_type, src_ip, dst_ip, dst_port)
    event, ordered HIGH -> LOW, then by descending scan_state_ratio and
    breadth. An empty, column-less frame is returned when no detector
    produced any finding.
    """
    non_empty = [d for d in detector_outputs if len(d) > 0]
    if not non_empty:
        return pd.DataFrame()

    df_all = pd.concat(non_empty, ignore_index=True)

    # Attach global scan_state_score. The fallback (first scan_state_ratio seen
    # for the source) is computed once per source instead of re-filtering the
    # whole frame for every row.
    first_ratio = (
        df_all
        .drop_duplicates(subset='src_ip', keep='first')
        .set_index('src_ip')['scan_state_ratio']
    )

    def _score_for(ip):
        fingerprint = fingerprints.get(ip, {})
        if 'scan_state_score' in fingerprint:
            return fingerprint['scan_state_score']
        return first_ratio.loc[ip]

    df_all['scan_state_score'] = df_all['src_ip'].map(_score_for)

    # Classify findings that don't already have a tag (slow scan findings are pre-tagged)
    if 'pattern_tag' not in df_all.columns:
        classified = df_all.apply(classify_finding, axis=1)
        df_all['pattern_tag'] = classified.map(lambda x: x[0])
        df_all['pattern_notes'] = classified.map(lambda x: x[1])
    else:
        # Fill in untagged rows (non-slow detectors)
        mask = df_all['pattern_tag'].isna()
        if mask.any():
            classified = df_all[mask].apply(classify_finding, axis=1)
            df_all.loc[mask, 'pattern_tag'] = classified.map(lambda x: x[0]).values
            df_all.loc[mask, 'pattern_notes'] = classified.map(lambda x: x[1]).values

    df_all['severity'] = df_all.apply(severity_label, axis=1)

    # Deduplicate across windows — keep largest breadth per unique event.
    # reindex() supplies an all-NaN column for a breadth field that no detector
    # emitted, so a run with (say) only vertical findings does not KeyError.
    breadth_cols = df_all.reindex(columns=['distinct_ports', 'distinct_hosts'])
    df_all['breadth'] = breadth_cols.fillna(0).max(axis=1)

    # A key column that is entirely absent is equivalent to "all missing", so
    # dropping it from the subset yields the same grouping.
    dedup_keys = [c for c in ('scan_type', 'src_ip', 'dst_ip', 'dst_port')
                  if c in df_all.columns]
    df_dedup = (
        df_all
        .sort_values('breadth', ascending=False)
        .drop_duplicates(subset=dedup_keys, keep='first')
        .reset_index(drop=True)
    )

    sev_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2}
    df_dedup['_sev_ord'] = df_dedup['severity'].map(sev_order)
    df_dedup = (
        df_dedup
        .sort_values(['_sev_ord', 'scan_state_ratio', 'breadth'],
                     ascending=[True, False, False])
        .drop(columns='_sev_ord')
        .reset_index(drop=True)
    )

    return df_dedup
848
+
849
+
850
+ # ══════════════════════════════════════════════════════════════════════════════
851
+ # Reporting
852
+ # ══════════════════════════════════════════════════════════════════════════════
853
+
854
def print_report(df_dedup: pd.DataFrame, ts_min, ts_max, n_raw: int,
                 min_severity: str = 'LOW', file=sys.stdout):
    """
    Compact tabular report — one row per finding.

    All pattern analysis, notes, and next-steps logic is preserved in the
    DataFrame (pattern_tag, pattern_notes, scan_state_ratio, etc.) and
    in the JSON/CSV exports. The verbose block format will be re-enabled
    once pattern recognition is fully validated across diverse network types.

    Parameters
    ----------
    df_dedup : synthesized findings. May be an empty, column-less frame
        (what synthesize() returns when nothing was detected); that case is
        reported as "No findings" instead of raising KeyError.
    ts_min, ts_max : first / last timestamp of the analysed data; must
        provide ``strftime``.
    n_raw : number of raw connections analysed (printed in the footer).
    min_severity : lowest severity to include ('HIGH', 'MEDIUM' or 'LOW');
        unknown values behave like 'LOW'.
    file : text stream the report is written to.
    """
    def w(s):
        print(s, file=file)

    sev_order = {'HIGH': 0, 'MEDIUM': 1, 'LOW': 2}
    min_sev_ord = sev_order.get(min_severity, 2)

    if 'severity' in df_dedup.columns:
        report_df = df_dedup[
            df_dedup['severity'].map(sev_order) <= min_sev_ord
        ].copy()
    else:
        # No severity column means there are no synthesized findings at all.
        report_df = pd.DataFrame()

    if len(report_df) == 0:
        w(f"No findings at or above {min_severity} severity.")
    else:
        # Summary counts
        counts = report_df['severity'].value_counts()
        w(f"Synthesized findings: {len(report_df)} unique scan events")
        w("severity")
        for sev in ['HIGH', 'MEDIUM', 'LOW']:
            n = counts.get(sev, 0)
            if n:
                w(f"{sev:>8s} {n}")
        w("")

        # Build display columns — keep it to what fits a terminal cleanly
        display_cols = [
            'severity', 'pattern_tag', 'scan_type', 'src_ip', 'dst_ip',
            'dst_port', 'distinct_ports', 'distinct_hosts',
            'scan_state_ratio', 'window_start', 'direction',
        ]
        # Add spread score column for slow scan findings if present
        if ('temporal_spread_score' in report_df.columns
                and 'scan_type' in report_df.columns
                and report_df['scan_type'].eq('slow').any()):
            display_cols.insert(display_cols.index('scan_state_ratio'),
                                'temporal_spread_score')

        # Not every detector mix yields every column; show what exists
        # rather than failing on the missing ones.
        display_cols = [c for c in display_cols if c in report_df.columns]

        # Right-align severity for readability
        report_df['severity'] = report_df['severity'].map(
            lambda s: f"{s:>6s}"
        )

        w(report_df[display_cols].to_string(index=False))

    w("")
    w(f"Data: {ts_min.strftime('%Y-%m-%d %H:%M')} → "
      f"{ts_max.strftime('%Y-%m-%d %H:%M')} UTC "
      f"({n_raw:,} connections)")
910
+
911
+
912
+ # ══════════════════════════════════════════════════════════════════════════════
913
+ # Export
914
+ # ══════════════════════════════════════════════════════════════════════════════
915
+
916
def _json_safe(value):
    """Coerce one finding field into a value json.dumps can emit as strict JSON."""
    if value is None or value is pd.NaT:
        return ''
    # datetime / pandas Timestamp / Timedelta -> ISO-8601 string
    if hasattr(value, 'isoformat'):
        return value.isoformat()
    # numpy scalar (or 0-d array) -> plain Python scalar
    if hasattr(value, 'dtype') and getattr(value, 'ndim', None) == 0:
        value = value.item()
    # NaN would be written as the bare token NaN, which is not valid JSON.
    if isinstance(value, float) and value != value:
        return ''
    return value


def export_results(df_dedup: pd.DataFrame, ts_min, ts_max, n_raw: int,
                   output_dir: Path, run_ts: str, args):
    """
    Write the findings to `output_dir`, creating the directory if needed.

    Depending on `args.format`:
      * 'text' / 'both' — the print_report() output as
        scan_findings_<run_ts>.txt (filtered by args.min_severity);
      * 'json' / 'both' — one JSON object per line (tagged with
        _sourcetype and detector_version) as scan_findings_<run_ts>.json,
        only when there are findings.
    A CSV of all findings is written whenever there are findings.

    Files are opened as UTF-8 explicitly: the text report contains a
    non-ASCII arrow and would otherwise fail under non-UTF-8 locale defaults.
    Missing values (None / NaN / NaT) are exported to JSON as empty strings.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    has_findings = len(df_dedup) > 0

    if args.format in ('text', 'both'):
        out_txt = output_dir / f"scan_findings_{run_ts}.txt"
        with open(out_txt, 'w', encoding='utf-8') as f:
            print_report(df_dedup, ts_min, ts_max, n_raw,
                         min_severity=args.min_severity, file=f)
        print(f"Report : {out_txt}")

    if args.format in ('json', 'both') and has_findings:
        out_json = output_dir / f"scan_findings_{run_ts}.json"
        with open(out_json, 'w', encoding='utf-8') as jf:
            for _, row in df_dedup.iterrows():
                event = row.to_dict()
                event['_sourcetype'] = 'lh_scan_findings'
                event['detector_version'] = VERSION
                event = {k: _json_safe(v) for k, v in event.items()}
                jf.write(json.dumps(event) + '\n')
        print(f"JSON (Splunk) : {out_json}")

    if has_findings:
        out_csv = output_dir / f"scan_findings_{run_ts}.csv"
        df_dedup.to_csv(out_csv, index=False)
        print(f"CSV : {out_csv}")
943
+
944
+
945
+ # ══════════════════════════════════════════════════════════════════════════════
946
+ # CLI
947
+ # ══════════════════════════════════════════════════════════════════════════════
948
+
949
def build_parser() -> argparse.ArgumentParser:
    """
    Assemble the `lh-scan` command-line parser.

    Options are grouped into network context, detection thresholds and
    output; the module docstring is shown verbatim as the help epilog.
    """
    parser = argparse.ArgumentParser(
        prog='lh-scan',
        description='Port scan detector — part of the loghunter suite.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    parser.add_argument(
        'log_path',
        help='Path to Zeek conn.log or glob pattern (e.g. logs/conn.*.log.gz)',
    )

    # Network context
    network = parser.add_argument_group('network context')
    network.add_argument(
        '--home-nets', nargs='+', metavar='CIDR',
        default=['10.0.0.0/8', '172.16.0.0/12', '192.168.0.0/16'],
        help='Internal network CIDRs (default: RFC1918)',
    )
    network.add_argument(
        '--allowlist-ips', nargs='+', metavar='IP', default=[],
        help='Source IPs to exclude from scan detection',
    )

    # Detection thresholds: (flag, value type, default, help summary).
    # Each help text is the summary followed by the rendered default.
    threshold_specs = (
        ('--vertical-threshold', int, DEFAULT_VERTICAL_PORT_THRESHOLD,
         'Distinct ports to trigger vertical scan'),
        ('--horizontal-threshold', int, DEFAULT_HORIZONTAL_HOST_THRESHOLD,
         'Distinct hosts to trigger horizontal scan'),
        ('--block-port-threshold', int, DEFAULT_BLOCK_PORT_THRESHOLD,
         'Port threshold for block scan'),
        ('--block-host-threshold', int, DEFAULT_BLOCK_HOST_THRESHOLD,
         'Host threshold for block scan'),
        ('--block-state-min', float, DEFAULT_BLOCK_SCAN_STATE_MIN,
         'Min scan_state_ratio for block scan'),
        ('--slow-state-min', float, DEFAULT_SLOW_SCAN_STATE_MIN,
         'Min scan_state_ratio for slow scan'),
        ('--slow-min-ports', int, DEFAULT_SLOW_MIN_PORTS,
         'Min unique ports for slow scan'),
        ('--slow-min-buckets', int, DEFAULT_SLOW_MIN_BUCKETS,
         'Min active time buckets for slow scan'),
        ('--fast-window', int, DEFAULT_FAST_WINDOW_SECS,
         'Fast detection window in seconds'),
        ('--slow-window', int, DEFAULT_SLOW_WINDOW_SECS,
         'Slow detection window in seconds'),
    )
    tuning = parser.add_argument_group('detection thresholds')
    for flag, value_type, default, summary in threshold_specs:
        tuning.add_argument(flag, type=value_type, default=default,
                            help=f'{summary} (default: {default})')

    # Output
    output = parser.add_argument_group('output')
    output.add_argument(
        '--output', metavar='DIR', default=None,
        help='Write results to this directory (default: print to stdout)',
    )
    output.add_argument(
        '--format', choices=['text', 'json', 'both'], default='text',
        help='Output format (default: text)',
    )
    output.add_argument(
        '--min-severity', choices=['HIGH', 'MEDIUM', 'LOW'], default='LOW',
        help='Minimum severity to report (default: LOW)',
    )

    parser.add_argument('--verbose', '-v', action='store_true',
                        help='Print progress and diagnostic detail')
    parser.add_argument('--version', action='version',
                        version=f'lh-scan {VERSION}')

    return parser
1021
+
1022
+
1023
def main():
    """
    Command-line entry point for lh-scan.

    Parses arguments, loads the connection log, pre-filters it, runs every
    detector (the windowed ones once with the slow window and once with the
    fast window), fingerprints the flagged sources, synthesizes the findings
    and prints the report — exporting files first when --output is given.
    Exits with status 1 if the log cannot be loaded.
    """
    args = build_parser().parse_args()

    run_ts = datetime.now().strftime('%Y%m%d_%H%M%S')

    # ── Load ──────────────────────────────────────────────────────────────────
    print(f"lh-scan {VERSION} — loading {args.log_path}")
    try:
        df_raw, n_skipped = load_conn_log(args.log_path, verbose=args.verbose)
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    ts_min = df_raw['ts'].min()
    ts_max = df_raw['ts'].max()
    span_hours = (ts_max - ts_min).total_seconds() / 3600
    n_raw = len(df_raw)

    first_seen = ts_min.strftime('%Y-%m-%d %H:%M')
    last_seen = ts_max.strftime('%Y-%m-%d %H:%M')
    print(f"Loaded {n_raw:,} connections "
          f"({first_seen} → {last_seen} UTC, "
          f"{span_hours:.1f}h)")
    if n_skipped:
        print(f" Skipped {n_skipped:,} malformed rows")

    # ── Pre-filter ────────────────────────────────────────────────────────────
    print("Pre-filtering...")
    df = prefilter(df_raw, args)

    # ── Detect ────────────────────────────────────────────────────────────────
    windowed_detectors = (
        ("Running vertical scan detection...", detect_vertical_scans),
        ("Running horizontal scan detection...", detect_horizontal_scans),
        ("Running block scan detection...", detect_block_scans),
    )
    all_dfs = []
    args_fast = None
    for banner, detector in windowed_detectors:
        print(banner)
        slow_hits = detector(df, args)
        if args_fast is None:
            # Fast window: a copy of the args whose slow_window is the fast one
            args_fast = argparse.Namespace(**vars(args))
            args_fast.slow_window = args.fast_window
        fast_hits = detector(df, args_fast)
        all_dfs.extend([slow_hits, fast_hits])

    print("Running slow scan / temporal spread analysis...")
    all_dfs.append(detect_slow_scans(df, args))

    # ── Fingerprint ───────────────────────────────────────────────────────────
    scanner_ips = list({
        ip
        for d in all_dfs if len(d) > 0
        for ip in d['src_ip'].unique()
    })
    fingerprints = conn_state_fingerprints(df, scanner_ips)

    # ── Synthesize ────────────────────────────────────────────────────────────
    df_findings = synthesize(all_dfs, fingerprints)

    # ── Report ────────────────────────────────────────────────────────────────
    if args.output:
        export_results(df_findings, ts_min, ts_max, n_raw,
                       Path(args.output), run_ts, args)
    else:
        print()
    print_report(df_findings, ts_min, ts_max, n_raw,
                 min_severity=args.min_severity)
1094
+
1095
+
1096
# Run the CLI only when this module is executed as a script.
if __name__ == "__main__":
    main()