loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,479 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ syslog_hunt.py — Syslog structural anomaly detection.
4
+
5
+ Reads a flat syslog file (one RFC 3164 line per line, as produced by
6
+ get_syslog.py), runs drain3 log templating followed by rarity-based
7
+ anomaly scoring, and writes a plain-text report to ./hunt_output/.
8
+
9
+ Pipeline:
10
+ 1. Load & parse — strip RFC 3164 PRI prefix and syslog header
11
+ 2. Normalize — collapse PID variants (sshd[1234] → sshd[*])
12
+ 3. Template — drain3 structural clustering
13
+ 4. Score — rarity ranking (bottom N percentile = anomalous)
14
+ 5. Reboot detect — suppress per-host kernel boot bursts, emit single line
15
+ 6. Report — flat list of anomalous raw syslog lines
16
+
17
+ Usage:
18
+ python syslog_hunt.py syslog_20260515_1d.log
19
+ python syslog_hunt.py --rarity 5 --max-count 2 syslog.log
20
+ python syslog_hunt.py --exclude host1.example.com host2.example.com syslog.log
21
+
22
+ Cron example (daily, 06:00):
23
+ 0 6 * * * cd /opt/hunt && python syslog_hunt.py syslog_$(date +%%Y%%m%%d)_1d.log
24
+
25
+ Dependencies:
26
+ pip install drain3
27
+ """
28
+
29
+ import argparse
30
+ import re
31
+ import sys
32
+ from collections import defaultdict
33
+ from datetime import datetime, timezone, timedelta
34
+ from pathlib import Path
35
+
36
+ # ── Dependency check ──────────────────────────────────────────────────────────
37
+ try:
38
+ from drain3 import TemplateMiner
39
+ from drain3.template_miner_config import TemplateMinerConfig
40
+ except ImportError:
41
+ print("ERROR: drain3 not installed. Run: pip install drain3")
42
+ sys.exit(1)
43
+
44
+ # ── Compiled patterns ─────────────────────────────────────────────────────────
45
+ PRI_RE = re.compile(r'^<\d+>')
46
+ SYSLOG_HDR_RE = re.compile(r'^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\S+\s+')
47
+ PROC_PID_RE = re.compile(r'\[\d+\]')
48
+
49
+ # Syslog timestamp for approximate event time parsing (no year — use current year)
50
+ SYSLOG_TS_RE = re.compile(r'^(\w{3})\s+(\d{1,2})\s+(\d{2}:\d{2}:\d{2})')
51
+
52
+ # Reboot signal patterns — any of these in a message body triggers reboot detection
53
+ REBOOT_SIGNALS_RE = re.compile(
54
+ r'(systemd-logind.*[Ss]ystem is rebooting|'
55
+ r'rsyslogd.*exiting on signal 15|'
56
+ r'systemd-shutdown.*Sending SIGTERM to remaining|'
57
+ r'kernel: Linux version\s)',
58
+ re.IGNORECASE
59
+ )
60
+
61
+ # ── Pipeline defaults ─────────────────────────────────────────────────────────
62
+ DRAIN_SIM_THRESH = 0.5
63
+ DRAIN_DEPTH = 4
64
+ DRAIN_PARAMETRIZE_NUMERIC = True
65
+ DEFAULT_RARITY_PCT = 10
66
+ DEFAULT_MAX_COUNT = 1 # hard ceiling on template count regardless of percentile
67
+ REBOOT_SUPPRESS_WINDOW = 300 # seconds: suppress anomalies within this window of reboot
68
+
69
+ # ── Text formatting ───────────────────────────────────────────────────────────
70
+ WIDTH = 72
71
+
72
def banner(text):
    """Return *text* framed above and below by a WIDTH-wide rule of '═'.

    The result begins with a newline so it can be printed straight after
    other output.
    """
    rule = "═" * WIDTH
    return f"\n{rule}\n {text}\n{rule}"
74
+
75
def section(text):
    """Return a section heading: '── text ' padded with '─' towards WIDTH.

    The padding never goes negative, so headings longer than WIDTH are
    returned without any trailing rule.
    """
    fill = max(0, WIDTH - len(text) - 4)
    heading = f"\n── {text} "
    return heading + "─" * fill
77
+
78
+ # ── Parsing ───────────────────────────────────────────────────────────────────
79
+
80
def parse_host(raw):
    """Return the hostname field of an RFC 3164 syslog line.

    After the PRI prefix is removed, the line is split on whitespace and the
    fourth token (following month, day and time) is taken as the host.
    Lines with fewer than four tokens yield "unknown".
    """
    tokens = PRI_RE.sub("", raw).strip().split()
    if len(tokens) < 4:
        return "unknown"
    return tokens[3]
85
+
86
+
87
def strip_header(raw):
    """Return the message body of a syslog line.

    Drops the PRI prefix first, then the timestamp + hostname header, and
    finally trims surrounding whitespace.
    """
    without_pri = PRI_RE.sub("", raw)
    body = SYSLOG_HDR_RE.sub("", without_pri)
    return body.strip()
91
+
92
+
93
def normalize(msg):
    """Replace every bracketed PID with '[*]'.

    This lets sshd[1234] and sshd[5678] collapse onto the same template.
    """
    collapsed = PROC_PID_RE.sub("[*]", msg)
    return collapsed
96
+
97
+
98
def parse_syslog_ts(raw):
    """
    Parse the RFC 3164 timestamp at the start of a raw syslog line.

    RFC 3164 timestamps carry no year, so one has to be assumed. The current
    year is tried first. If that produces a time more than one day in the
    future (e.g. a December line read in January) or an invalid date
    (Feb 29 outside a leap year), the previous year is tried instead.

    Returns a naive datetime (local time assumed), or None when the line has
    no parseable timestamp.
    """
    stripped = PRI_RE.sub("", raw).strip()
    m = SYSLOG_TS_RE.match(stripped)
    if not m:
        return None
    month_str, day_str, time_str = m.groups()

    now = datetime.now()
    # One day of slack tolerates clock skew between the log source and us.
    latest_plausible = now + timedelta(days=1)

    for year in (now.year, now.year - 1):
        try:
            ts = datetime.strptime(
                f"{year} {month_str} {day_str.zfill(2)} {time_str}",
                "%Y %b %d %H:%M:%S",
            )
        except ValueError:
            # Unknown month name, impossible day, or Feb 29 in this year.
            continue
        if ts <= latest_plausible:
            return ts
    return None
116
+
117
+ # ── Load ──────────────────────────────────────────────────────────────────────
118
+
119
def load_syslog(path, exclude_hosts):
    """
    Read a flat syslog file and return one dict per usable line.

    Each dict has the keys:
    raw — the line as read, without its trailing newline
    host — hostname from parse_host()
    message — header-stripped, PID-normalized message body
    ts — value returned by parse_syslog_ts() (datetime or None)

    Blank lines and lines starting with '#' are dropped silently. Lines from
    a host in exclude_hosts, and lines whose body is empty after stripping,
    are dropped and counted; the counts are printed along with the total.
    """
    events = []
    n_empty = 0
    n_excluded = 0

    with open(path, encoding="utf-8", errors="replace") as fh:
        for raw in (line.rstrip("\n") for line in fh):
            if raw == "" or raw.startswith("#"):
                continue

            host = parse_host(raw)
            if exclude_hosts and host in exclude_hosts:
                n_excluded += 1
                continue

            body = normalize(strip_header(raw))
            if not body:
                n_empty += 1
                continue

            events.append(
                dict(raw=raw, host=host, message=body, ts=parse_syslog_ts(raw))
            )

    print(f" Loaded : {len(events):,} events")
    if n_excluded:
        print(f" Excluded hosts: {n_excluded:,} events")
    if n_empty:
        print(f" Skipped empty : {n_empty:,} events")
    return events
161
+
162
+ # ── Templating ────────────────────────────────────────────────────────────────
163
+
164
def run_drain3(events):
    """Template every event with drain3 and return the same list.

    Each event dict gains two keys taken from the miner's result:
    template_id (its "cluster_id") and template_str (its "template_mined").
    Progress is printed on a single line, refreshed roughly every 5% of the
    input.
    """
    config = TemplateMinerConfig()
    config.drain_sim_th = DRAIN_SIM_THRESH
    config.drain_depth = DRAIN_DEPTH
    config.parametrize_numeric_tokens = DRAIN_PARAMETRIZE_NUMERIC
    miner = TemplateMiner(config=config)

    total = len(events)
    step = max(1, total // 20)  # never zero, so the modulo below is safe

    print(f" Templating {total:,} events...", end="", flush=True)
    for done, ev in enumerate(events, start=1):
        mined = miner.add_log_message(ev["message"])
        ev.update(
            template_id=mined["cluster_id"],
            template_str=mined["template_mined"],
        )
        if done % step == 0:
            print(f"\r Templating {total:,} events... {done/total*100:.0f}%",
                  end="", flush=True)

    distinct = {ev["template_id"] for ev in events}
    print(f"\r Templating complete: {len(distinct):,} unique templates "
          f"from {total:,} events")
    return events
188
+
189
+ # ── Rarity scoring ────────────────────────────────────────────────────────────
190
+
191
def score_rarity(events, rarity_pct, max_count):
    """
    Flag events whose template count falls at or below the effective threshold.

    The percentile threshold is the template count found rarity_pct percent
    of the way through the ascending list of per-template counts. The
    effective threshold is min(percentile threshold, max_count).

    Adds an is_anomaly bool to every event in-place and prints a two-line
    summary. Returns (threshold, freq_dict), where freq_dict maps
    template_id to its event count.

    An empty event list returns (0, {}) without printing. A rarity_pct above
    100 is treated as 100 instead of indexing past the end of the counts.
    """
    freq = defaultdict(int)
    for ev in events:
        freq[ev["template_id"]] += 1

    if not freq:
        # Nothing to rank; avoids the index and division errors below.
        return 0, {}

    sorted_counts = sorted(freq.values())
    idx = int(len(sorted_counts) * rarity_pct / 100) - 1
    # Clamp to a valid position: low/negative percentiles pick the rarest
    # template, percentiles above 100 pick the most common one.
    idx = min(len(sorted_counts) - 1, max(0, idx))
    pct_threshold = sorted_counts[idx]

    threshold = min(pct_threshold, max_count)

    rare_ids = {tid for tid, count in freq.items() if count <= threshold}
    for ev in events:
        ev["is_anomaly"] = ev["template_id"] in rare_ids

    n_anom = sum(ev["is_anomaly"] for ev in events)
    print(f" Rarity threshold : <= {threshold} events "
          f"(pct={pct_threshold}, max_count cap={max_count})")
    print(f" Anomalous : {len(rare_ids):,} templates | "
          f"{n_anom:,} events ({n_anom/len(events)*100:.2f}%)")

    return threshold, dict(freq)
218
+
219
+ # ── Reboot detection ──────────────────────────────────────────────────────────
220
+
221
def detect_reboots(events):
    """
    Collect reboot timestamps per host.

    An event counts as a reboot signal when it has a timestamp and its raw
    line matches REBOOT_SIGNALS_RE. Returns a plain dict mapping host to an
    ascending list of the matching timestamps; hosts without a signal are
    absent.
    """
    by_host = defaultdict(list)
    for ev in events:
        if not ev["ts"]:
            continue
        if REBOOT_SIGNALS_RE.search(ev["raw"]):
            by_host[ev["host"]].append(ev["ts"])
    return {host: sorted(times) for host, times in by_host.items()}
233
+
234
+
235
def apply_reboot_suppression(noise_events, reboots):
    """
    Drop anomalous events that occur shortly after a reboot of their host.

    An event is suppressed when its timestamp lies between 0 and
    REBOOT_SUPPRESS_WINDOW seconds (inclusive) after one of the reboot times
    recorded for its host. Events with no timestamp, or from a host with no
    recorded reboot, are always kept.

    Returns a 3-tuple:
    kept — the events that were not suppressed, in input order
    reboot_lines — one synthetic annotation dict per reboot that
                   suppressed at least one event
    suppressed_n — number of suppressed events
    """
    kept = []
    reboot_lines = []
    announced = set()  # (host, reboot_ts) pairs that already have a line
    suppressed_n = 0

    for ev in noise_events:
        host, ts = ev["host"], ev["ts"]

        # First reboot of this host whose window contains the event, if any.
        hit = None
        if ts is not None and host in reboots:
            hit = next(
                (
                    rts
                    for rts in reboots[host]
                    if 0 <= (ts - rts).total_seconds() <= REBOOT_SUPPRESS_WINDOW
                ),
                None,
            )

        if hit is None:
            kept.append(ev)
            continue

        suppressed_n += 1
        marker = (host, hit)
        if marker in announced:
            continue
        announced.add(marker)
        reboot_lines.append({
            "ts": hit,
            "host": host,
            "raw": f"*** {host} rebooted at "
                   f"{hit.strftime('%a %b %d %H:%M:%S')} ***",
            "synthetic": True,
        })

    return kept, reboot_lines, suppressed_n
281
+
282
+ # ── Report building ───────────────────────────────────────────────────────────
283
+
284
def time_range_str(events):
    """Describe the span between the earliest and latest event timestamps.

    Events whose "ts" is None are ignored. Returns "unknown" when no event
    has a timestamp. When both ends fall on the same date, the end is shown
    as a time only.
    """
    stamps = [ts for ts in (ev["ts"] for ev in events) if ts is not None]
    if not stamps:
        return "unknown"

    first, last = min(stamps), max(stamps)
    full = "%a %b %d %H:%M:%S"
    end_fmt = "%H:%M:%S" if first.date() == last.date() else full
    return f"{first.strftime(full)} – {last.strftime(end_fmt)}"
296
+
297
+
298
def build_report(events, freq, threshold, rarity_pct, max_count,
                 input_path, reboots):
    """Assemble the plain-text anomaly report and return it as one string.

    The report has a banner, a summary, a per-host anomaly-rate table sorted
    by descending rate, and the findings: anomalous events that survive
    reboot suppression, merged with the synthetic reboot lines and ordered by
    timestamp (entries without a timestamp sort first).

    freq, rarity_pct and max_count are accepted but not used in the body.
    """
    run_ts = datetime.now().strftime("%a %b %d %H:%M:%S %Y")
    total = len(events)

    anomalies = [ev for ev in events if ev["is_anomaly"]]
    kept, reboot_lines, suppressed_n = apply_reboot_suppression(
        anomalies, reboots
    )

    def by_time(ev):
        # Entries lacking a timestamp are placed at the very beginning.
        return ev["ts"] if ev.get("ts") else datetime.min

    all_findings = sorted(kept + reboot_lines, key=by_time)

    n_noise = len(kept)
    n_synthetic = len(reboot_lines)
    pct_noise = n_noise / total * 100 if total else 0

    # Per-host denominators come from all events, numerators from kept ones.
    host_total = defaultdict(int)
    for ev in events:
        host_total[ev["host"]] += 1
    host_noise = defaultdict(int)
    for ev in kept:
        host_noise[ev["host"]] += 1

    n_templates = len({ev["template_id"] for ev in kept})

    out = [
        banner(f"syslog_hunt.py | Anomaly Report | {run_ts}"),
        section("Summary"),
        f" Input : {input_path.name}",
        f" Scan range : {time_range_str(events)}",
        f" Total events : {total:,}",
        f" Rarity threshold : <= {threshold} events",
        f" Anomalous templates: {n_templates:,}",
        f" Anomalous events : {n_noise:,} ({pct_noise:.2f}%)",
    ]
    if suppressed_n:
        out.append(f" Reboot-suppressed : {suppressed_n:,} events "
                   f"({n_synthetic} reboot(s) detected)")

    out.append(section("Anomaly rate by host"))
    ranked_hosts = sorted(
        host_total,
        key=lambda h: host_noise.get(h, 0) / host_total[h],
        reverse=True,
    )
    for host in ranked_hosts:
        tot = host_total[host]
        anom = host_noise.get(host, 0)
        rate = anom / tot * 100 if tot else 0
        bar = "█" * min(40, int(rate * 4))  # 4 cells per percent, capped at 40
        out.append(
            f" {host:<35} {anom:>5,} / {tot:>8,} ({rate:>5.2f}%) {bar}"
        )

    heading = (f"Findings — {n_noise} anomalous events "
               f"({n_templates} templates)")
    if n_synthetic:
        heading += f" + {n_synthetic} reboot(s)"
    out.append(section(heading))

    # Each finding is truncated to its first 200 characters.
    out.extend(f" {ev['raw'][:200]}" for ev in all_findings)

    out.append(banner("End of report"))
    return "\n".join(out) + "\n"
371
+
372
+ # ── Main ──────────────────────────────────────────────────────────────────────
373
+
374
def main():
    """Command-line entry point: parse arguments, run the pipeline, write the report.

    Stages: load the syslog file, template with drain3, score rarity, detect
    reboots, build the report. The report is written to --out when given,
    otherwise to ./hunt_output/<input stem>_anomalies_<timestamp>.txt, and is
    also printed to stdout. Exits with status 1 when the input file is missing
    or yields no events.
    """
    parser = argparse.ArgumentParser(
        description="Syslog structural anomaly detection.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "input",
        type=Path,
        help="Flat syslog file (one raw line per line)",
    )
    parser.add_argument(
        "--rarity", "-r",
        type=int,
        default=DEFAULT_RARITY_PCT,
        metavar="PCT",
        help=f"Bottom N percentile flagged as anomalous (default: {DEFAULT_RARITY_PCT})",
    )
    parser.add_argument(
        "--max-count", "-m",
        type=int,
        default=DEFAULT_MAX_COUNT,
        dest="max_count",
        help=f"Hard cap on template count (default: {DEFAULT_MAX_COUNT})",
    )
    parser.add_argument(
        "--exclude", "-x",
        nargs="+",
        default=[],
        metavar="HOST",
        help="Hosts to exclude (e.g. --exclude host1.example.com host2.example.com)",
    )
    parser.add_argument(
        "--out", "-o",
        type=Path,
        default=None,
        help="Override output file path",
    )
    args = parser.parse_args()

    if not args.input.exists():
        print(f"ERROR: file not found: {args.input}")
        sys.exit(1)

    exclude_hosts = set(args.exclude)

    # Output path. The default directory is only touched when it is actually
    # used, and the parent of whichever path was chosen is created up front so
    # the final write cannot fail on a missing directory after the whole
    # pipeline has already run.
    if args.out:
        outpath = args.out
    else:
        ts = datetime.now().strftime("%Y%m%d_%H%M")
        stem = args.input.stem
        outpath = Path("./hunt_output") / f"{stem}_anomalies_{ts}.txt"
    outpath.parent.mkdir(parents=True, exist_ok=True)

    # ── Run ──
    print(banner(f"syslog_hunt.py | {args.input.name}"))
    print(f" File : {args.input} "
          f"({args.input.stat().st_size / 1e6:.1f} MB)")
    print(f" Rarity : bottom {args.rarity}th percentile "
          f"| max_count cap={args.max_count}")
    if exclude_hosts:
        print(f" Excluded : {', '.join(sorted(exclude_hosts))}")
    print(f" Output : {outpath}")

    print(section("Stage 1 — Load"))
    events = load_syslog(args.input, exclude_hosts)
    if not events:
        print("No events loaded. Check file and host exclusions.")
        sys.exit(1)
    hosts = sorted({ev["host"] for ev in events})
    print(f" Hosts : {', '.join(hosts)}")
    print(f" Range : {time_range_str(events)}")

    print(section("Stage 2 — drain3 Templating"))
    events = run_drain3(events)

    print(section("Stage 3 — Rarity Scoring"))
    threshold, freq = score_rarity(events, args.rarity, args.max_count)

    print(section("Stage 4 — Reboot Detection"))
    reboots = detect_reboots(events)
    if reboots:
        for host, times in sorted(reboots.items()):
            for t in times:
                print(f" {host}: reboot at {t.strftime('%a %b %d %H:%M:%S')}")
    else:
        print(" No reboots detected.")

    print(section("Stage 5 — Building Report"))
    report = build_report(
        events, freq, threshold,
        args.rarity, args.max_count,
        args.input, reboots,
    )

    outpath.write_text(report, encoding="utf-8")
    print(f" Written : {outpath} "
          f"({outpath.stat().st_size / 1024:.1f} KB)")

    print(report)
477
+
478
+ if __name__ == "__main__":
479
+ main()
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python3
2
+ # flatten_own.py — one CloudTrail file → parquet (same projection as flaws)
3
+ import json, sys, os
4
+ import pandas as pd
5
+
6
+ SRC = sys.argv[1] if len(sys.argv) > 1 else "cloudtrail_20260520_to_20260603_00h.json.log"
7
+ OUT = os.path.splitext(SRC)[0] + ".parquet"
8
+
9
+ READ_PREFIXES = ("Get","List","Describe","Head","Lookup","Search","BatchGet","Select","Query","Scan")
10
+
11
def principal(ui):
    """Return a short principal label for a CloudTrail userIdentity dict.

    Uses the part of principalId after its last ':' when there is one, the
    whole principalId when it is non-empty, and otherwise the identity's
    "type" (or "?" when that key is absent).
    """
    pid = ui.get("principalId", "") or ""
    if ":" in pid:
        return pid.rsplit(":", 1)[1]
    if pid:
        return pid
    return ui.get("type", "?")
14
+
15
def flatten(e):
    """Project one CloudTrail event dict onto a flat row dict.

    Missing or null nested objects (userIdentity, sessionContext, attributes)
    are treated as empty. Derived columns: "mfa" is True only when the
    session attribute mfaAuthenticated equals the string "true"; "is_read" is
    True when eventName starts with one of READ_PREFIXES; the has_* columns
    are the truthiness of the corresponding event field.
    """
    ui = e.get("userIdentity", {}) or {}
    session = ui.get("sessionContext", {}) or {}
    attrs = session.get("attributes", {}) or {}
    name = e.get("eventName") or ""
    source = e.get("eventSource") or ""

    row = {}
    row["eventTime"] = e.get("eventTime")
    row["eventSource"] = source.replace(".amazonaws.com", "")
    row["eventName"] = name
    row["eventType"] = e.get("eventType")
    row["awsRegion"] = e.get("awsRegion")
    row["sourceIP"] = e.get("sourceIPAddress")
    row["userAgent"] = e.get("userAgent")
    row["id_type"] = ui.get("type")
    row["principal"] = principal(ui)
    row["arn"] = ui.get("arn")
    # Fall back to the recipient account when the identity carries none.
    row["accountId"] = ui.get("accountId") or e.get("recipientAccountId")
    row["invokedBy"] = ui.get("invokedBy")
    row["mfa"] = attrs.get("mfaAuthenticated") == "true"
    row["accessKeyId"] = ui.get("accessKeyId")
    row["readOnly_raw"] = e.get("readOnly")
    row["is_read"] = name.startswith(READ_PREFIXES)
    row["errorCode"] = e.get("errorCode")
    row["errorMessage"] = e.get("errorMessage")
    row["has_request"] = bool(e.get("requestParameters"))
    row["has_response"] = bool(e.get("responseElements"))
    row["has_resources"] = bool(e.get("resources"))
    row["eventVersion"] = e.get("eventVersion")
    row["eventID"] = e.get("eventID")
    return row
44
+
45
+ # tolerate either {"Records":[...]} OR one-JSON-per-line (your sample was JSONL)
46
# Read explicitly as UTF-8 so decoding does not depend on the platform's
# locale encoding.
with open(SRC, encoding="utf-8") as f:
    text = f.read().strip()

# Accept a {"Records": [...]} envelope, a bare JSON list, a single JSON
# object, or one JSON object per line (JSONL). A multi-line JSONL file is not
# one valid JSON document, so it reaches the fallback via JSONDecodeError.
try:
    obj = json.loads(text)
except json.JSONDecodeError:
    recs = [json.loads(ln) for ln in text.splitlines() if ln.strip()]
else:
    if isinstance(obj, dict) and "Records" in obj:
        recs = obj["Records"]
    elif isinstance(obj, list):
        recs = obj
    else:
        recs = [obj]

# With no records the DataFrame would have no columns and the eventTime
# lookup below would raise KeyError; stop with a clear message instead.
if not recs:
    sys.exit(f"no CloudTrail records found in {SRC}")

df = pd.DataFrame(flatten(e) for e in recs)
# Unparseable timestamps become NaT rather than aborting the conversion.
df["eventTime"] = pd.to_datetime(df["eventTime"], errors="coerce", utc=True)
df = df.sort_values("eventTime").reset_index(drop=True)
df.to_parquet(OUT, engine="pyarrow", compression="zstd", index=False)
print(f"{len(df):,} events → {OUT} ({os.path.getsize(OUT)/1e6:.1f} MB)")
print(f"span: {df['eventTime'].min()} → {df['eventTime'].max()}")
tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """LogHunter test suite."""
@@ -0,0 +1,116 @@
1
+ """Shared FakeS3Client + envelope helpers for CloudTrail exporter tests.
2
+
3
+ Lives here (not in either test file) so that the always-run mock test set in
4
+ tests/test_cloudtrail_exporter.py does not transitively import botocore, while
5
+ the botocore-gated set in tests/test_cloudtrail_exporter_botocore.py can reuse
6
+ the same fakes. No botocore reference in this module.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import gzip
12
+ import json
13
+ from typing import Any
14
+
15
+
16
+ def _gz_envelope(records: list[dict]) -> bytes:
17
+ """Encode a {"Records": [...]} envelope as gzipped JSON."""
18
+ return gzip.compress(json.dumps({"Records": records}).encode("utf-8"))
19
+
20
+
21
+ class _Body:
22
+ def __init__(self, content: bytes):
23
+ self._content = content
24
+
25
+ def read(self) -> bytes:
26
+ return self._content
27
+
28
+
29
+ class _FakePaginator:
30
+ def __init__(
31
+ self,
32
+ data: dict[str, dict[str, Any]],
33
+ log: list[str] | None = None,
34
+ prefix_errors: dict[str, Exception] | None = None,
35
+ ):
36
+ self.data = data
37
+ self.log = log if log is not None else []
38
+ self.prefix_errors = prefix_errors or {}
39
+
40
+ def paginate(self, Bucket: str, Prefix: str = "", Delimiter: str | None = None):
41
+ self.log.append(Prefix)
42
+ if Prefix in self.prefix_errors:
43
+ raise self.prefix_errors[Prefix]
44
+ keys = [k for k in self.data if k.startswith(Prefix)]
45
+ if Delimiter == "/":
46
+ common = set()
47
+ contents = []
48
+ for key in keys:
49
+ rest = key[len(Prefix):]
50
+ if "/" in rest:
51
+ common.add(Prefix + rest.split("/", 1)[0] + "/")
52
+ else:
53
+ contents.append({"Key": key, "Size": self.data[key]["size"]})
54
+ yield {
55
+ "CommonPrefixes": [{"Prefix": p} for p in sorted(common)],
56
+ "Contents": contents,
57
+ }
58
+ else:
59
+ yield {
60
+ "Contents": [
61
+ {"Key": k, "Size": self.data[k]["size"]} for k in sorted(keys)
62
+ ],
63
+ }
64
+
65
+
66
+ class FakeS3Client:
67
+ """Minimal in-memory S3 stub: list_objects_v2 (via paginator) + get_object."""
68
+
69
+ def __init__(self, data: dict[str, dict[str, Any]] | None = None):
70
+ self.data: dict[str, dict[str, Any]] = data or {}
71
+ self.get_object_keys: list[str] = []
72
+ self._get_object_errors: dict[str, Exception] = {}
73
+ self._list_error: Exception | None = None
74
+ self.list_prefix_log: list[str] = []
75
+ self._list_error_for_prefix: dict[str, Exception] = {}
76
+
77
+ def add_object(self, key: str, body: bytes, size: int | None = None) -> None:
78
+ self.data[key] = {"body": body, "size": size if size is not None else len(body)}
79
+
80
+ def add_year_root_marker(self, prefix: str) -> None:
81
+ """Force a 'CommonPrefix' under ``prefix`` for a YYYY/ directory.
82
+
83
+ Adds a synthetic '__keep__' key so listing finds the directory.
84
+ """
85
+ self.data[f"{prefix}__keep__"] = {"body": b"", "size": 0}
86
+
87
+ def set_get_object_error(self, key: str, exc: Exception) -> None:
88
+ self._get_object_errors[key] = exc
89
+
90
+ def set_list_error(self, exc: Exception) -> None:
91
+ self._list_error = exc
92
+
93
+ def set_list_error_for_prefix(self, prefix: str, exc: Exception) -> None:
94
+ """Raise ``exc`` when list_objects_v2 is called with exactly ``prefix``."""
95
+ self._list_error_for_prefix[prefix] = exc
96
+
97
+ def get_paginator(self, op: str):
98
+ if op != "list_objects_v2":
99
+ raise NotImplementedError(op)
100
+ if self._list_error is not None:
101
+ err = self._list_error
102
+
103
+ class _ErrorPaginator:
104
+ def paginate(self, **_):
105
+ raise err
106
+
107
+ return _ErrorPaginator()
108
+ return _FakePaginator(
109
+ self.data, self.list_prefix_log, self._list_error_for_prefix
110
+ )
111
+
112
+ def get_object(self, Bucket: str, Key: str):
113
+ self.get_object_keys.append(Key)
114
+ if Key in self._get_object_errors:
115
+ raise self._get_object_errors[Key]
116
+ return {"Body": _Body(self.data[Key]["body"])}
tests/conftest.py ADDED
@@ -0,0 +1,17 @@
1
+ """Test fixtures shared across the suite."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+
7
+
8
def pytest_configure(config: pytest.Config) -> None:
    """Register the suite's custom ``real_defaults`` marker with pytest."""
    # Reserved for future opt-in/opt-out behaviour; the drift tripwire still
    # uses it as a self-documenting hint that the test depends on real shipped
    # _DEFAULTS, even though there is no longer an autouse fixture to opt out of.
    marker_spec = (
        "real_defaults: documents that the test depends on the actual shipped "
        "_DEFAULTS (no per-test mutation of config defaults applied)"
    )
    config.addinivalue_line("markers", marker_spec)