loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,735 @@
1
+ """Scan detector — port scan detection from Zeek conn.log.
2
+
3
+ Detects vertical (one→many ports), horizontal (one→many hosts), block
4
+ (many ports AND many hosts), and slow (temporally spread) port scanning.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import ipaddress
10
+ from datetime import datetime, timezone
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+ from loghunter.common.finding import DetectorContext, Finding, MethodTag, Severity
16
+
17
# Module-level detector metadata.
DETECTOR_NAME = "scan"
STATUS = "available"

# Log inputs: Zeek conn logs are required; no optional inputs are declared.
REQUIRED_LOGS = [{"source": "zeek_dir", "pattern": "conn*.log*"}]
OPTIONAL_LOGS: list[dict] = []

# Default tunables. run() overlays the caller-supplied config on top of these.
DEFAULT_CONFIG = dict(
    vertical_threshold=15,    # vertical: min distinct ports for one (src, dst)
    horizontal_threshold=15,  # horizontal: min distinct hosts for one (src, port)
    block_port_threshold=20,  # block: min distinct ports per source
    block_host_threshold=20,  # block: min distinct hosts per source
    block_state_min=0.30,     # block: min share of scan-indicative conn states
    slow_state_min=0.30,      # slow: min share of scan-indicative conn states
    window_secs=3600,         # window / time-bucket length in seconds
    slow_min_ports=8,         # slow: min distinct ports overall
    slow_min_buckets=4,       # slow: min distinct active time buckets
)
37
+
38
# NOTE(review): MethodTag is defined in loghunter.common.finding; presumably this
# labels the detector as pattern-based — confirm against that module.
DETECTOR_METHOD = MethodTag(
    "pattern",
    named=False,
)
39
+
40
+ # ── Domain-knowledge constants ────────────────────────────────────────────────
41
+
42
# conn_state values counted as scan-indicative when computing scan_state_ratio.
SCAN_STATES = {"S0", "REJ", "RSTO", "RSTR", "SH", "OTH"}

# BitTorrent peer ports: 6881-6889 plus 51413/51414; tracker ports listed apart.
BITTORRENT_PORTS_PEER = set(range(6881, 6890)) | {51413, 51414}
BITTORRENT_PORTS_TRACKER = {6969, 2710}

# IoT/smart device discovery ports — multicast/broadcast, structurally produce
# high S0/OTH rates that are not scanning
IOT_DISCOVERY_PORTS = {
    5353,  # mDNS
    1900,  # SSDP/UPnP
    5355,  # LLMNR
    137,   # NetBIOS Name Service
    138,   # NetBIOS Datagram
}

# IoT multicast/broadcast destination ranges — connections to these are never scans.
# Matched as plain string prefixes of the destination address.
IOT_MULTICAST_PREFIXES = (
    "224.",
    "239.",
    "255.255.255.255",
    "ff0",
    "ff1",
    "ff2",
)

# Low port numbers that _classify treats as unassigned/reserved "dark" traffic.
DARK_PORTS = {0, 1, 2, 3, 4, 6, 8}

# Human-readable label per scan type.
SCAN_TYPE_DESCRIPTIONS = {
    "vertical": "Port scan (one host, many ports)",
    "horizontal": "Network sweep (many hosts, one port)",
    "block": "Block scan (many hosts AND many ports)",
}

# Human-readable explanation per conn_state code.
STATE_EXPLANATIONS = {
    "S0": "SYN sent, no response (filtered/firewalled)",
    "REJ": "Port closed (RST received)",
    "RSTO": "Connection reset by originator",
    "RSTR": "Connection reset by responder",
    "SF": "Normal established+closed connection",
    "SH": "Half-open scan (SYN+FIN)",
    "OTH": "No SYN observed",
}
78
+
79
+
80
+ # ── Zone-label seam ───────────────────────────────────────────────────────────
81
+ #
82
+ # Standalone-callable fallback: when run() is invoked with a DetectorContext
83
+ # whose home_net is empty (e.g. from a notebook), this RFC1918 list is used.
84
+ # The runner is the normal supply path — it resolves [loghunter].home_net and
85
+ # passes it on every DetectorContext.
86
+ _DEFAULT_HOME_NET = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
87
+
88
+
89
+ def _zone_of(ip: str, home_net: list[str]) -> str:
90
+ """Return the zone label for an IP given the operator's home_net.
91
+
92
+ Today returns "internal" or "external". The function body is the seam:
93
+ adding a third zone (e.g. "dmz") is a single new if-check inside this
94
+ function — signature and callers do not change. Zones are descriptive
95
+ labels only; there is no trust-rank or numeric ordering at this stage.
96
+ """
97
+ try:
98
+ addr = ipaddress.ip_address(ip)
99
+ except ValueError:
100
+ return "external"
101
+ if any(addr in ipaddress.ip_network(n, strict=False) for n in home_net):
102
+ return "internal"
103
+ return "external"
104
+
105
+
106
def _classify_direction(src: str, dst: str, home_net: list[str]) -> tuple[str, str, str]:
    """Return ``(src_zone, dst_zone, rendered)`` for one flow.

    Both zone labels come from ``_zone_of``; ``rendered`` is the two labels
    joined by "→" (e.g. "internal→external"). The string is assembled
    mechanically from the labels rather than chosen from a fixed table, so any
    additional zone label produced by ``_zone_of`` yields new combinations
    without this function changing.
    """
    zones = (_zone_of(src, home_net), _zone_of(dst, home_net))
    rendered = "→".join(zones)
    return (*zones, rendered)
119
+
120
+
121
+ def _prefilter(df: pd.DataFrame, home_net: list[str]) -> pd.DataFrame:
122
+ """Drop ICMP, IPv6 link-local, and IoT multicast rows; add direction columns.
123
+
124
+ Normalizes expected columns to safe types first so downstream detection code
125
+ does not crash on malformed-but-loadable conn logs. Rows with missing or
126
+ unparseable values simply never meet scan thresholds and produce no findings.
127
+
128
+ Adds two columns: ``direction`` (rendered string for evidence display) and
129
+ ``src_zone`` (raw zone label, used by structural checks that would otherwise
130
+ have to string-parse the rendered direction).
131
+ """
132
+ df = df.copy()
133
+
134
+ # Ensure required string columns exist and contain no None/NaN values.
135
+ for col in ("src", "dst", "proto", "conn_state"):
136
+ if col not in df.columns:
137
+ df[col] = ""
138
+ else:
139
+ df[col] = df[col].fillna("").astype(str)
140
+
141
+ # Port and timestamp must be numeric for every scan mode. Malformed rows are
142
+ # dropped here instead of letting lower-level pandas operations raise KeyError.
143
+ for col in ("port", "ts"):
144
+ if col not in df.columns:
145
+ df[col] = np.nan
146
+ else:
147
+ df[col] = pd.to_numeric(df[col], errors="coerce")
148
+
149
+ df = df[df["port"].notna() & df["ts"].notna()]
150
+ if df.empty:
151
+ return df
152
+
153
+ df = df[df['proto'] != 'icmp']
154
+ if df.empty:
155
+ return df
156
+ df = df[~(df['src'].str.startswith('fe80:') | df['dst'].str.startswith('fe80:'))]
157
+ if df.empty:
158
+ return df
159
+ df = df[~df['dst'].map(lambda ip: any(ip.startswith(p) for p in IOT_MULTICAST_PREFIXES))]
160
+ if df.empty:
161
+ return df
162
+ df = df.copy() # break view chain before column assignment
163
+ triples = [_classify_direction(s, d, home_net) for s, d in zip(df['src'], df['dst'])]
164
+ df['src_zone'] = [t[0] for t in triples]
165
+ df['direction'] = [t[2] for t in triples]
166
+ return df
167
+
168
+
169
+ def _detect_vertical(df: pd.DataFrame, cfg: dict) -> list[dict]:
170
+ """Vertical scan: one src → many distinct ports on one dst."""
171
+ threshold = cfg['vertical_threshold']
172
+ window_secs = cfg['window_secs']
173
+
174
+ global_counts = (
175
+ df.groupby(['src', 'dst'])['port']
176
+ .nunique()
177
+ .reset_index(name='global_distinct_ports')
178
+ )
179
+ candidates = global_counts[global_counts['global_distinct_ports'] >= threshold]
180
+
181
+ if len(candidates) == 0:
182
+ return []
183
+
184
+ cand_keys = candidates[['src', 'dst']]
185
+ df_cands = df.merge(cand_keys, on=['src', 'dst'])
186
+
187
+ results = []
188
+ for (src, dst), grp in df_cands.groupby(['src', 'dst']):
189
+ grp = grp.sort_values('ts')
190
+ ts_arr = grp['ts'].values # already float epoch seconds
191
+ port_arr = grp['port'].values
192
+ state_arr = grp['conn_state'].values
193
+
194
+ port_counts = {}
195
+ max_ports_in_window = 0
196
+ best_window_start = ts_arr[0]
197
+ left = 0
198
+
199
+ for right in range(len(ts_arr)):
200
+ p = port_arr[right]
201
+ if p is not None and not (isinstance(p, float) and np.isnan(p)):
202
+ port_counts[p] = port_counts.get(p, 0) + 1
203
+ while ts_arr[right] - ts_arr[left] > window_secs:
204
+ lp = port_arr[left]
205
+ if lp is not None and not (isinstance(lp, float) and np.isnan(lp)):
206
+ port_counts[lp] -= 1
207
+ if port_counts[lp] == 0:
208
+ del port_counts[lp]
209
+ left += 1
210
+ n = len(port_counts)
211
+ if n > max_ports_in_window:
212
+ max_ports_in_window = n
213
+ best_window_start = ts_arr[left]
214
+
215
+ if max_ports_in_window < threshold:
216
+ continue
217
+
218
+ state_counts = pd.Series(state_arr).value_counts()
219
+ total_conns = len(state_arr)
220
+ scan_state_count = sum(state_counts.get(s, 0) for s in SCAN_STATES)
221
+ scan_state_ratio = scan_state_count / total_conns
222
+
223
+ port_series = pd.Series(port_arr).dropna()
224
+ port_buckets = pd.cut(port_series, bins=[0, 1023, 49151, 65535],
225
+ labels=['well-known', 'registered', 'ephemeral'])
226
+ counts_arr = (port_buckets.value_counts().values + 1).astype(float)
227
+ probs = counts_arr / counts_arr.sum()
228
+ port_range_entropy = round(float(-np.sum(probs * np.log(probs))), 3)
229
+
230
+ results.append({
231
+ 'scan_type' : 'vertical',
232
+ 'src' : src,
233
+ 'dst' : dst,
234
+ 'port' : None,
235
+ 'port_class' : None,
236
+ 'distinct_ports' : max_ports_in_window,
237
+ 'distinct_hosts' : 1,
238
+ 'total_conns' : total_conns,
239
+ 'scan_state_ratio' : round(scan_state_ratio, 3),
240
+ 'top_states' : ', '.join(state_counts.head(3).index.tolist()),
241
+ 'port_range_entropy' : port_range_entropy,
242
+ 'window_start' : datetime.fromtimestamp(
243
+ best_window_start, tz=timezone.utc
244
+ ).strftime('%Y-%m-%d %H:%M:%S'),
245
+ 'window_secs' : window_secs,
246
+ 'direction' : grp['direction'].iloc[0],
247
+ })
248
+
249
+ return results
250
+
251
+
252
+ def _detect_horizontal(df: pd.DataFrame, cfg: dict) -> list[dict]:
253
+ """Horizontal scan: one src → same port across many distinct hosts."""
254
+ threshold = cfg['horizontal_threshold']
255
+ window_secs = cfg['window_secs']
256
+
257
+ df_tcp_udp = df[df['port'].notna()].copy()
258
+
259
+ global_counts = (
260
+ df_tcp_udp.groupby(['src', 'port'])['dst']
261
+ .nunique()
262
+ .reset_index(name='global_distinct_hosts')
263
+ )
264
+ candidates = global_counts[global_counts['global_distinct_hosts'] >= threshold]
265
+
266
+ if len(candidates) == 0:
267
+ return []
268
+
269
+ cand_keys = candidates[['src', 'port']]
270
+ df_cands = df_tcp_udp.merge(cand_keys, on=['src', 'port'])
271
+
272
+ results = []
273
+ for (src, port), grp in df_cands.groupby(['src', 'port']):
274
+ grp = grp.sort_values('ts')
275
+ ts_arr = grp['ts'].values # already float epoch seconds
276
+ host_arr = grp['dst'].values
277
+ state_arr = grp['conn_state'].values
278
+
279
+ host_counts = {}
280
+ max_hosts_in_window = 0
281
+ best_window_start = ts_arr[0]
282
+ left = 0
283
+
284
+ for right in range(len(ts_arr)):
285
+ h = host_arr[right]
286
+ if h is not None:
287
+ host_counts[h] = host_counts.get(h, 0) + 1
288
+ while ts_arr[right] - ts_arr[left] > window_secs:
289
+ lh = host_arr[left]
290
+ if lh is not None:
291
+ host_counts[lh] -= 1
292
+ if host_counts[lh] == 0:
293
+ del host_counts[lh]
294
+ left += 1
295
+ n = len(host_counts)
296
+ if n > max_hosts_in_window:
297
+ max_hosts_in_window = n
298
+ best_window_start = ts_arr[left]
299
+
300
+ if max_hosts_in_window < threshold:
301
+ continue
302
+
303
+ state_counts = pd.Series(state_arr).value_counts()
304
+ total_conns = len(state_arr)
305
+ scan_state_ratio = sum(state_counts.get(s, 0) for s in SCAN_STATES) / total_conns
306
+ velocity = max_hosts_in_window / max(ts_arr[-1] - ts_arr[0], 1)
307
+
308
+ port_int = int(port)
309
+ if port_int <= 1023:
310
+ port_class = 'well-known'
311
+ elif port_int <= 49151:
312
+ port_class = 'registered'
313
+ else:
314
+ port_class = 'ephemeral'
315
+
316
+ results.append({
317
+ 'scan_type' : 'horizontal',
318
+ 'src' : src,
319
+ 'dst' : None,
320
+ 'port' : port_int,
321
+ 'port_class' : port_class,
322
+ 'distinct_ports' : 1,
323
+ 'distinct_hosts' : max_hosts_in_window,
324
+ 'total_conns' : total_conns,
325
+ 'scan_state_ratio' : round(scan_state_ratio, 3),
326
+ 'top_states' : ', '.join(state_counts.head(3).index.tolist()),
327
+ 'velocity_hosts_per_sec' : round(velocity, 4),
328
+ 'window_start' : datetime.fromtimestamp(
329
+ best_window_start, tz=timezone.utc
330
+ ).strftime('%Y-%m-%d %H:%M:%S'),
331
+ 'window_secs' : window_secs,
332
+ 'direction' : grp['direction'].iloc[0],
333
+ })
334
+
335
+ return results
336
+
337
+
338
def _detect_block(df: pd.DataFrame, cfg: dict) -> list[dict]:
    """Detect block scans: one src reaching many ports AND many hosts.

    Rows are grouped into fixed time buckets of ``cfg['window_secs']`` seconds.
    A source is reported when a single bucket meets all three limits: distinct
    ports >= ``block_port_threshold``, distinct hosts >=
    ``block_host_threshold``, and share of scan-indicative states >=
    ``block_state_min``. Only the bucket with the largest ports × hosts
    product is kept for each source.

    Args:
        df: Prefiltered conn rows.
        cfg: Detector config; reads the four keys named above.

    Returns:
        At most one result dict per source, produced from the aggregated frame.
    """
    port_threshold = cfg['block_port_threshold']
    host_threshold = cfg['block_host_threshold']
    ratio_floor = cfg['block_state_min']
    window_secs = cfg['window_secs']

    def _over_thresholds(frame: pd.DataFrame, ports_col: str, hosts_col: str) -> pd.DataFrame:
        # Rows meeting the port, host and scan-state-ratio limits together.
        keep = (
            (frame[ports_col] >= port_threshold)
            & (frame[hosts_col] >= host_threshold)
            & (frame['scan_state_ratio'] >= ratio_floor)
        )
        return frame[keep]

    def _top_states(states: pd.Series) -> str:
        # Three most frequent conn_state values, comma separated.
        return ', '.join(states.value_counts().head(3).index.tolist())

    def _fmt_utc(ts) -> str:
        return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    conns = df[df['port'].notna()].copy()
    conns['time_bucket'] = (conns['ts'] // window_secs).astype(int)
    conns['is_scan_state'] = conns['conn_state'].isin(SCAN_STATES)

    # Pre-screen on whole-capture aggregates per source.
    per_src = conns.groupby('src').agg(
        global_distinct_ports=('port', 'nunique'),
        global_distinct_hosts=('dst', 'nunique'),
        scan_state_ratio=('is_scan_state', 'mean'),
    ).reset_index()
    suspects = _over_thresholds(per_src, 'global_distinct_ports', 'global_distinct_hosts')
    if len(suspects) == 0:
        return []

    # Re-aggregate the surviving sources per (src, time bucket).
    suspect_rows = conns[conns['src'].isin(suspects['src'])]
    per_bucket = suspect_rows.groupby(['src', 'time_bucket']).agg(
        distinct_ports=('port', 'nunique'),
        distinct_hosts=('dst', 'nunique'),
        total_conns=('port', 'count'),
        scan_state_ratio=('is_scan_state', 'mean'),
        top_states=('conn_state', _top_states),
        direction=('direction', 'first'),
        ports_well_known=('port', lambda x: (x <= 1023).sum()),
        ports_registered=('port', lambda x: ((x > 1023) & (x <= 49151)).sum()),
        ports_ephemeral=('port', lambda x: (x > 49151).sum()),
        window_start_ts=('ts', 'min'),
    ).reset_index()

    findings = _over_thresholds(per_bucket, 'distinct_ports', 'distinct_hosts').copy()
    if len(findings) == 0:
        return []

    findings['scan_type'] = 'block'
    findings['dst'] = None
    findings['port'] = None
    findings['port_class'] = None
    findings['window_secs'] = window_secs
    findings['window_start'] = findings['window_start_ts'].map(_fmt_utc)
    findings['scan_state_ratio'] = findings['scan_state_ratio'].round(3)
    findings['breadth_score'] = findings['distinct_ports'] * findings['distinct_hosts']

    # Keep each source's broadest bucket, then drop the working columns.
    ranked = findings.sort_values('breadth_score', ascending=False)
    strongest = ranked.drop_duplicates(subset=['src'], keep='first')
    trimmed = strongest.drop(columns=['time_bucket', 'window_start_ts', 'breadth_score'])
    return trimmed.reset_index(drop=True).to_dict('records')
408
+
409
+
410
+ def _detect_slow(df: pd.DataFrame, cfg: dict) -> list[dict]:
411
+ """Slow scan: port diversity spread across many time buckets to evade per-window thresholds."""
412
+ min_ports = cfg['slow_min_ports']
413
+ min_buckets = cfg['slow_min_buckets']
414
+ state_min = cfg['slow_state_min']
415
+ bucket_secs = cfg['window_secs']
416
+ vert_threshold = cfg['vertical_threshold']
417
+
418
+ df_w = df[df['port'].notna()].copy()
419
+ df_w['time_bucket'] = (df_w['ts'] // bucket_secs).astype(int)
420
+
421
+ def is_iot_discovery(grp: pd.DataFrame) -> bool:
422
+ port_counts = grp['port'].value_counts()
423
+ top_ports = set(port_counts.head(3).index.tolist())
424
+ if top_ports.issubset(IOT_DISCOVERY_PORTS | {53, 443, 80}):
425
+ ext_conns = grp[grp['src_zone'] != 'internal'].shape[0]
426
+ if ext_conns / len(grp) < 0.1:
427
+ return True
428
+ return False
429
+
430
+ results = []
431
+
432
+ for src, grp in df_w.groupby('src'):
433
+ n_buckets = grp['time_bucket'].nunique()
434
+ if n_buckets < min_buckets:
435
+ continue
436
+
437
+ total_unique_ports = grp['port'].nunique()
438
+ if total_unique_ports < min_ports:
439
+ continue
440
+
441
+ max_ports_in_bucket = grp.groupby('time_bucket')['port'].nunique().max()
442
+
443
+ if max_ports_in_bucket >= vert_threshold:
444
+ continue
445
+
446
+ spread_score = round(total_unique_ports / max(max_ports_in_bucket, 1), 2)
447
+ state_counts = grp['conn_state'].value_counts()
448
+ scan_state_ratio = sum(state_counts.get(s, 0) for s in SCAN_STATES) / len(grp)
449
+
450
+ if scan_state_ratio < state_min:
451
+ continue
452
+
453
+ iot_flag = is_iot_discovery(grp)
454
+
455
+ if iot_flag:
456
+ pattern_tag = 'iot_discovery'
457
+ pattern_notes = (
458
+ f"Traffic pattern consistent with IoT device discovery (mDNS/SSDP/UPnP). "
459
+ f"High temporal spread from repeated network attach/detach cycles rather "
460
+ f"than deliberate scanning. Add source to allowlist to suppress if known."
461
+ )
462
+ elif scan_state_ratio >= 0.60:
463
+ pattern_tag = 'slow_scan'
464
+ pattern_notes = (
465
+ f"Temporal spread score {spread_score:.2f} with {scan_state_ratio:.1%} "
466
+ f"scan-indicative states across {n_buckets} time windows. "
467
+ f"Activity deliberately paced below per-window detection threshold. "
468
+ f"Strong slow scan signature."
469
+ )
470
+ else:
471
+ pattern_tag = 'slow_scan_candidate'
472
+ pattern_notes = (
473
+ f"Temporal spread score {spread_score:.2f} with {scan_state_ratio:.1%} "
474
+ f"scan-indicative states across {n_buckets} time windows. "
475
+ f"Moderate confidence — review destination IPs and ports."
476
+ )
477
+
478
+ results.append({
479
+ 'scan_type' : 'slow',
480
+ 'src' : src,
481
+ 'dst' : None,
482
+ 'port' : None,
483
+ 'port_class' : None,
484
+ 'distinct_ports' : total_unique_ports,
485
+ 'distinct_hosts' : grp['dst'].nunique(),
486
+ 'max_ports_in_bucket' : int(max_ports_in_bucket),
487
+ 'active_buckets' : n_buckets,
488
+ 'temporal_spread_score' : spread_score,
489
+ 'total_conns' : len(grp),
490
+ 'scan_state_ratio' : round(scan_state_ratio, 3),
491
+ 'top_states' : ', '.join(state_counts.head(3).index.tolist()),
492
+ 'window_start' : datetime.fromtimestamp(
493
+ float(grp['ts'].min()), tz=timezone.utc
494
+ ).strftime('%Y-%m-%d %H:%M:%S'),
495
+ 'window_secs' : bucket_secs,
496
+ 'direction' : grp['direction'].iloc[0],
497
+ 'pattern_tag' : pattern_tag,
498
+ 'pattern_notes' : pattern_notes,
499
+ })
500
+
501
+ return results
502
+
503
+
504
+ def _classify(row: dict) -> tuple[str, str]:
505
+ """Return (pattern_tag, explanation) for a finding dict."""
506
+ port = row.get('port')
507
+ ratio = row['scan_state_ratio']
508
+ hosts = row.get('distinct_hosts') or 0
509
+ ports = row.get('distinct_ports') or 0
510
+ scan_type = row['scan_type']
511
+
512
+ if scan_type == 'slow':
513
+ return (row.get('pattern_tag', 'slow_scan_candidate'),
514
+ row.get('pattern_notes', ''))
515
+
516
+ if port in IOT_DISCOVERY_PORTS and ratio < 0.40:
517
+ return ('iot_discovery',
518
+ f"Port {port} is an IoT/device discovery port (mDNS/SSDP/UPnP/NetBIOS). "
519
+ f"High host counts on this port are normal for device discovery protocols. "
520
+ f"Not a port scan. Add source to allowlist to suppress.")
521
+
522
+ if port in BITTORRENT_PORTS_PEER and ratio >= 0.50:
523
+ return ('bittorrent',
524
+ f"BitTorrent peer connections on port {port} — {hosts} peers contacted, "
525
+ f"{ratio:.1%} failed connections (normal for BT peer discovery). "
526
+ f"If this host shouldn't run BitTorrent, investigate.")
527
+
528
+ if port in BITTORRENT_PORTS_TRACKER and ratio >= 0.15:
529
+ return ('bittorrent',
530
+ f"BitTorrent tracker traffic on port {port} — {hosts} trackers contacted, "
531
+ f"{ratio:.1%} failed connections (normal for tracker announce/scrape). "
532
+ f"If this host shouldn't run BitTorrent, investigate.")
533
+
534
+ if port == 53 and ratio < 0.05 and hosts >= 15:
535
+ return ('dns_resolver',
536
+ f"DNS recursive resolution — {hosts} external resolvers on port 53, "
537
+ f"{ratio:.1%} failed. This is a DNS server or resolver, not a scanner. "
538
+ f"Add source to allowlist to suppress.")
539
+
540
+ if port == 443 and ratio < 0.10 and hosts >= 15:
541
+ return ('https_browsing',
542
+ f"HTTPS to {hosts} external hosts, {ratio:.1%} failed — consistent with "
543
+ f"normal web browsing or cloud service traffic. "
544
+ f"Add source to allowlist to suppress.")
545
+
546
+ if port == 80 and ratio < 0.10 and hosts >= 15:
547
+ return ('http_browsing',
548
+ f"HTTP to {hosts} external hosts, {ratio:.1%} failed — consistent with "
549
+ f"normal web traffic.")
550
+
551
+ if port == 443 and 0.10 <= ratio < 0.50 and hosts >= 20:
552
+ return ('streaming_blocked',
553
+ f"{hosts} HTTPS destinations, {ratio:.1%} failed. On a media/streaming "
554
+ f"device this pattern is consistent with DNS-level blocking (Pi-hole, "
555
+ f"NextDNS) causing direct connection fallback attempts. "
556
+ f"Add source to allowlist to suppress.")
557
+
558
+ if port in DARK_PORTS and ratio >= 0.90:
559
+ return ('dark_traffic',
560
+ f"Port {port} is unassigned/reserved — likely a Zeek encoding artifact "
561
+ f"(e.g. ICMP type/code) or internet background radiation. "
562
+ f"Check proto field in conn.log.")
563
+
564
+ if scan_type == 'vertical' and ratio >= 0.60 and ports >= 1000:
565
+ return ('confirmed_scan',
566
+ f"Full port range scan — {ports} distinct ports on single target "
567
+ f"with {ratio:.1%} scan-indicative states. Strong scanner signature.")
568
+
569
+ if ratio >= 0.60:
570
+ return ('confirmed_scan',
571
+ f"{ratio:.1%} scan-indicative states "
572
+ f"({'ports' if scan_type == 'vertical' else 'hosts'}: {max(ports, hosts)}). "
573
+ f"Strong scanner signature.")
574
+
575
+ return ('unknown', '')
576
+
577
+
578
def _to_severity(row: dict) -> Severity:
    """Derive a Severity from a classified result dict.

    Order of precedence:
      1. Benign pattern tags are always LOW.
      2. Slow scans: "slow_scan" is HIGH at ratio >= 0.60, otherwise MEDIUM;
         any other slow tag is LOW.
      3. Everything else is graded on scan_state_ratio, with breadth (the
         larger of distinct ports and distinct hosts) lowering the ratio
         needed for HIGH and MEDIUM.

    Args:
        row: Result dict; must contain ``scan_state_ratio`` and ``pattern_tag``.
    """
    ratio = row['scan_state_ratio']
    breadth = max(row.get('distinct_ports') or 0, row.get('distinct_hosts') or 0)
    tag = row['pattern_tag']

    benign_tags = (
        'dns_resolver', 'https_browsing', 'http_browsing',
        'iot_discovery', 'dark_traffic',
    )
    if tag in benign_tags:
        return Severity.LOW

    if row.get('scan_type') == 'slow':
        if tag != 'slow_scan':
            return Severity.LOW
        # NOTE(review): _detect_slow only assigns 'slow_scan' when the ratio is
        # already >= 0.60, so MEDIUM looks unreachable for rows it produces.
        return Severity.HIGH if ratio >= 0.60 else Severity.MEDIUM

    if ratio >= 0.60 or (ratio >= 0.30 and breadth >= 50):
        return Severity.HIGH
    if ratio >= 0.20 or (ratio >= 0.10 and breadth >= 25):
        return Severity.MEDIUM
    return Severity.LOW
602
+
603
+
604
def _make_finding(row: dict, data_window: tuple) -> Finding:
    """Build a Finding from a classified, severity-tagged result dict.

    Args:
        row: Result dict. Must contain ``scan_type``, ``src`` and
            ``_severity``; other keys are read with ``.get``.
        data_window: Passed through unchanged to the Finding.

    Returns:
        A Finding whose title depends on the scan type, whose description is
        the row's ``pattern_notes`` (or a generated summary when those are
        empty), and whose next steps depend on the pattern tag and severity.
    """
    scan_type = row['scan_type']
    src = row['src']
    dst = row.get('dst')
    port = row.get('port')
    active_buckets = row.get('active_buckets')

    # Any scan type other than these three gets the slow-scan title.
    titles = {
        'vertical': f"{src} → {dst}",
        'horizontal': f"{src} → *:{port}",
        'block': f"{src} → *",
    }
    title = titles.get(scan_type, f"{src} slow scan")

    description = row.get('pattern_notes') or (
        f"{scan_type} scan — {row.get('total_conns', 0)} connections, "
        f"scan_state_ratio={row.get('scan_state_ratio', 0):.2f}"
    )

    evidence: dict = {
        'scan_type': scan_type,
        'src': src,
        'dst': dst,
        'port': port,
        'distinct_ports': row.get('distinct_ports', 0),
        'distinct_hosts': row.get('distinct_hosts', 0),
    }
    for key in ('total_conns', 'scan_state_ratio', 'top_states', 'direction',
                'pattern_tag', 'window_start', 'window_secs'):
        evidence[key] = row.get(key)
    if scan_type == 'slow':
        # Slow-scan findings carry their temporal-spread metrics as well.
        evidence['temporal_spread_score'] = row.get('temporal_spread_score')
        evidence['active_buckets'] = active_buckets
        evidence['max_ports_in_bucket'] = row.get('max_ports_in_bucket')

    pattern_tag = row.get('pattern_tag', 'unknown')
    severity = row['_severity']

    # HIGH severity takes precedence over tag-specific advice.
    if pattern_tag == 'confirmed_scan' or severity == Severity.HIGH:
        next_steps = [
            "Pivot to conn.log to review full connection history for this source.",
            "Check reverse DNS for the source host.",
            "Look up source IP on Shodan for open services and prior reports.",
        ]
    elif pattern_tag == 'bittorrent':
        next_steps = [
            "Expected behavior if this host runs BitTorrent.",
            "Add source to allowlist to suppress if BitTorrent is authorized.",
        ]
    elif pattern_tag in ('iot_discovery', 'dns_resolver', 'https_browsing'):
        next_steps = [
            "Known benign traffic pattern — add source to allowlist to suppress.",
        ]
    elif scan_type == 'slow' and pattern_tag == 'slow_scan':
        next_steps = [
            "Pivot to conn.log to review full connection history for this source.",
            "Check reverse DNS for the source host.",
            "Look up source IP on Shodan.",
            f"Note temporal spread: activity paced across {active_buckets} time windows.",
        ]
    else:
        next_steps = [f"Review conn.log for {src} to assess scan intent."]

    return Finding(
        detector='scan',
        severity=severity,
        title=title,
        description=description,
        evidence=evidence,
        next_steps=next_steps,
        ts_generated=datetime.now(tz=timezone.utc),
        data_window=data_window,
    )
686
+
687
+
688
+ # ── Detector entry point ──────────────────────────────────────────────────────
689
+
690
def run(context: DetectorContext) -> list[Finding]:
    """Detect port scan activity: vertical, horizontal, block, and slow scans.

    Pipeline: overlay ``context.config`` on DEFAULT_CONFIG, prefilter the conn
    log, run the four detectors, keep the highest-breadth row per
    (scan_type, src, dst, port), classify and grade each row, then sort by
    severity and, within a severity, by descending scan_state_ratio.

    Args:
        context: Supplies ``config``, ``home_net``, ``logs`` and
            ``data_window``. An empty ``home_net`` falls back to
            ``_DEFAULT_HOME_NET``.

    Returns:
        Findings in priority order; an empty list when there is no conn data
        or nothing was detected.
    """
    cfg: dict = {**DEFAULT_CONFIG, **context.config}
    home_net = list(context.home_net if context.home_net else _DEFAULT_HOME_NET)

    conn = context.logs.get('conn*.log*')
    if conn is None or conn.empty:
        return []

    conn = _prefilter(conn, home_net)
    if conn.empty:
        return []

    detectors = (_detect_vertical, _detect_horizontal, _detect_block, _detect_slow)
    all_rows: list[dict] = []
    for detect in detectors:
        all_rows.extend(detect(conn, cfg))
    if not all_rows:
        return []

    def _breadth(candidate: dict) -> int:
        # Larger of the distinct-port and distinct-host counts (missing -> 0).
        return max(candidate.get('distinct_ports') or 0,
                   candidate.get('distinct_hosts') or 0)

    # Deduplicate: keep highest-breadth result per unique (scan_type, src, dst, port)
    best: dict[tuple, dict] = {}
    for row in all_rows:
        key = (row['scan_type'], row.get('src'), row.get('dst'), row.get('port'))
        incumbent = best.get(key)
        if incumbent is None or _breadth(row) > _breadth(incumbent):
            best[key] = row

    # Slow-scan rows arrive pre-tagged; everything else is classified here.
    for row in best.values():
        if 'pattern_tag' not in row:
            row['pattern_tag'], row['pattern_notes'] = _classify(row)
        row['_severity'] = _to_severity(row)

    sev_order = {Severity.HIGH: 0, Severity.MEDIUM: 1, Severity.LOW: 2, Severity.INFO: 3}
    ordered = sorted(
        best.values(),
        key=lambda r: (sev_order[r['_severity']], -r.get('scan_state_ratio', 0)),
    )

    return [_make_finding(row, context.data_window) for row in ordered]