loghunter-cli 0.1.0.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. loghunter_cli-0.1.0.dev0/LICENSE +21 -0
  2. loghunter_cli-0.1.0.dev0/PKG-INFO +336 -0
  3. loghunter_cli-0.1.0.dev0/README.md +307 -0
  4. loghunter_cli-0.1.0.dev0/loghunter/__init__.py +3 -0
  5. loghunter_cli-0.1.0.dev0/loghunter/cli.py +1108 -0
  6. loghunter_cli-0.1.0.dev0/loghunter/cli_init.py +567 -0
  7. loghunter_cli-0.1.0.dev0/loghunter/common/__init__.py +1 -0
  8. loghunter_cli-0.1.0.dev0/loghunter/common/allowlist.py +436 -0
  9. loghunter_cli-0.1.0.dev0/loghunter/common/clustering.py +326 -0
  10. loghunter_cli-0.1.0.dev0/loghunter/common/config.py +221 -0
  11. loghunter_cli-0.1.0.dev0/loghunter/common/display.py +323 -0
  12. loghunter_cli-0.1.0.dev0/loghunter/common/errors.py +45 -0
  13. loghunter_cli-0.1.0.dev0/loghunter/common/finding.py +239 -0
  14. loghunter_cli-0.1.0.dev0/loghunter/common/loader/__init__.py +136 -0
  15. loghunter_cli-0.1.0.dev0/loghunter/common/loader/diagnostics.py +94 -0
  16. loghunter_cli-0.1.0.dev0/loghunter/common/loader/discovery.py +335 -0
  17. loghunter_cli-0.1.0.dev0/loghunter/common/loader/io.py +76 -0
  18. loghunter_cli-0.1.0.dev0/loghunter/common/loader/pipeline.py +1010 -0
  19. loghunter_cli-0.1.0.dev0/loghunter/common/loader/sniff.py +184 -0
  20. loghunter_cli-0.1.0.dev0/loghunter/common/loader/types.py +207 -0
  21. loghunter_cli-0.1.0.dev0/loghunter/common/loader/windowing.py +523 -0
  22. loghunter_cli-0.1.0.dev0/loghunter/common/output.py +93 -0
  23. loghunter_cli-0.1.0.dev0/loghunter/common/paths.py +105 -0
  24. loghunter_cli-0.1.0.dev0/loghunter/common/sources.py +392 -0
  25. loghunter_cli-0.1.0.dev0/loghunter/data/allowlist/connections.txt +50 -0
  26. loghunter_cli-0.1.0.dev0/loghunter/data/allowlist/domains_devices.txt +5 -0
  27. loghunter_cli-0.1.0.dev0/loghunter/data/allowlist/domains_homelab.txt +5 -0
  28. loghunter_cli-0.1.0.dev0/loghunter/data/allowlist/domains_universal.txt +125 -0
  29. loghunter_cli-0.1.0.dev0/loghunter/data/config_example.toml +144 -0
  30. loghunter_cli-0.1.0.dev0/loghunter/detectors/__init__.py +5 -0
  31. loghunter_cli-0.1.0.dev0/loghunter/detectors/auth.py +27 -0
  32. loghunter_cli-0.1.0.dev0/loghunter/detectors/aws.py +671 -0
  33. loghunter_cli-0.1.0.dev0/loghunter/detectors/beacon.py +258 -0
  34. loghunter_cli-0.1.0.dev0/loghunter/detectors/dns.py +778 -0
  35. loghunter_cli-0.1.0.dev0/loghunter/detectors/dnsblock.py +29 -0
  36. loghunter_cli-0.1.0.dev0/loghunter/detectors/duration.py +178 -0
  37. loghunter_cli-0.1.0.dev0/loghunter/detectors/protocol.py +26 -0
  38. loghunter_cli-0.1.0.dev0/loghunter/detectors/scan.py +735 -0
  39. loghunter_cli-0.1.0.dev0/loghunter/detectors/ssl.py +25 -0
  40. loghunter_cli-0.1.0.dev0/loghunter/detectors/syslog.py +266 -0
  41. loghunter_cli-0.1.0.dev0/loghunter/detectors/weird.py +27 -0
  42. loghunter_cli-0.1.0.dev0/loghunter/digest/__init__.py +43 -0
  43. loghunter_cli-0.1.0.dev0/loghunter/digest/_stats.py +182 -0
  44. loghunter_cli-0.1.0.dev0/loghunter/digest/blob.py +698 -0
  45. loghunter_cli-0.1.0.dev0/loghunter/digest/cloudtrail.py +341 -0
  46. loghunter_cli-0.1.0.dev0/loghunter/digest/conn.py +367 -0
  47. loghunter_cli-0.1.0.dev0/loghunter/digest/dns.py +364 -0
  48. loghunter_cli-0.1.0.dev0/loghunter/digest/syslog.py +269 -0
  49. loghunter_cli-0.1.0.dev0/loghunter/exporters/__init__.py +534 -0
  50. loghunter_cli-0.1.0.dev0/loghunter/exporters/cloudtrail.py +499 -0
  51. loghunter_cli-0.1.0.dev0/loghunter/exporters/splunk.py +222 -0
  52. loghunter_cli-0.1.0.dev0/loghunter/outputs/__init__.py +1 -0
  53. loghunter_cli-0.1.0.dev0/loghunter/outputs/allowlist.py +75 -0
  54. loghunter_cli-0.1.0.dev0/loghunter/outputs/csv.py +70 -0
  55. loghunter_cli-0.1.0.dev0/loghunter/outputs/email.py +44 -0
  56. loghunter_cli-0.1.0.dev0/loghunter/outputs/html.py +99 -0
  57. loghunter_cli-0.1.0.dev0/loghunter/outputs/json.py +77 -0
  58. loghunter_cli-0.1.0.dev0/loghunter/outputs/text.py +1422 -0
  59. loghunter_cli-0.1.0.dev0/loghunter/parsers/__init__.py +1 -0
  60. loghunter_cli-0.1.0.dev0/loghunter/parsers/cloudtrail.py +287 -0
  61. loghunter_cli-0.1.0.dev0/loghunter/parsers/dnsmasq.py +331 -0
  62. loghunter_cli-0.1.0.dev0/loghunter/parsers/syslog.py +150 -0
  63. loghunter_cli-0.1.0.dev0/loghunter/parsers/zeek.py +294 -0
  64. loghunter_cli-0.1.0.dev0/loghunter/parsers/zeek_tsv.py +310 -0
  65. loghunter_cli-0.1.0.dev0/loghunter/runner.py +1895 -0
  66. loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/PKG-INFO +336 -0
  67. loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/SOURCES.txt +125 -0
  68. loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/dependency_links.txt +1 -0
  69. loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/entry_points.txt +2 -0
  70. loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/requires.txt +22 -0
  71. loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/top_level.txt +7 -0
  72. loghunter_cli-0.1.0.dev0/migrations/cloudtrail_parquet.py +59 -0
  73. loghunter_cli-0.1.0.dev0/migrations/conn_fft.py +550 -0
  74. loghunter_cli-0.1.0.dev0/migrations/conn_scan.py +1097 -0
  75. loghunter_cli-0.1.0.dev0/migrations/dns_dbscan.py +520 -0
  76. loghunter_cli-0.1.0.dev0/migrations/get_syslog.py +402 -0
  77. loghunter_cli-0.1.0.dev0/migrations/syslog_drain3.py +479 -0
  78. loghunter_cli-0.1.0.dev0/pyproject.toml +38 -0
  79. loghunter_cli-0.1.0.dev0/scratch/junk/parquet.py +59 -0
  80. loghunter_cli-0.1.0.dev0/setup.cfg +4 -0
  81. loghunter_cli-0.1.0.dev0/tests/__init__.py +1 -0
  82. loghunter_cli-0.1.0.dev0/tests/_cloudtrail_fakes.py +116 -0
  83. loghunter_cli-0.1.0.dev0/tests/conftest.py +17 -0
  84. loghunter_cli-0.1.0.dev0/tests/test_allowlist_defaults_accessor.py +90 -0
  85. loghunter_cli-0.1.0.dev0/tests/test_architecture_spine.py +302 -0
  86. loghunter_cli-0.1.0.dev0/tests/test_aws_detector.py +504 -0
  87. loghunter_cli-0.1.0.dev0/tests/test_be_like_water.py +106 -0
  88. loghunter_cli-0.1.0.dev0/tests/test_cli_help.py +342 -0
  89. loghunter_cli-0.1.0.dev0/tests/test_cli_multi_positional.py +458 -0
  90. loghunter_cli-0.1.0.dev0/tests/test_cloudtrail_exporter.py +631 -0
  91. loghunter_cli-0.1.0.dev0/tests/test_cloudtrail_exporter_botocore.py +207 -0
  92. loghunter_cli-0.1.0.dev0/tests/test_cloudtrail_parser.py +393 -0
  93. loghunter_cli-0.1.0.dev0/tests/test_clustering.py +85 -0
  94. loghunter_cli-0.1.0.dev0/tests/test_clustering_interruptible.py +404 -0
  95. loghunter_cli-0.1.0.dev0/tests/test_config_cli.py +1006 -0
  96. loghunter_cli-0.1.0.dev0/tests/test_config_example_drift.py +164 -0
  97. loghunter_cli-0.1.0.dev0/tests/test_digest_blob.py +1237 -0
  98. loghunter_cli-0.1.0.dev0/tests/test_digest_cli.py +1040 -0
  99. loghunter_cli-0.1.0.dev0/tests/test_digest_cloudtrail.py +980 -0
  100. loghunter_cli-0.1.0.dev0/tests/test_digest_conn.py +1189 -0
  101. loghunter_cli-0.1.0.dev0/tests/test_digest_dns.py +770 -0
  102. loghunter_cli-0.1.0.dev0/tests/test_digest_stats.py +282 -0
  103. loghunter_cli-0.1.0.dev0/tests/test_digest_syslog.py +724 -0
  104. loghunter_cli-0.1.0.dev0/tests/test_display.py +370 -0
  105. loghunter_cli-0.1.0.dev0/tests/test_dns_detector.py +1010 -0
  106. loghunter_cli-0.1.0.dev0/tests/test_dnsmasq_parser.py +467 -0
  107. loghunter_cli-0.1.0.dev0/tests/test_duration_detector.py +491 -0
  108. loghunter_cli-0.1.0.dev0/tests/test_export_orchestrator_shape.py +153 -0
  109. loghunter_cli-0.1.0.dev0/tests/test_init_wizard.py +707 -0
  110. loghunter_cli-0.1.0.dev0/tests/test_loader.py +3639 -0
  111. loghunter_cli-0.1.0.dev0/tests/test_loader_package_surface.py +115 -0
  112. loghunter_cli-0.1.0.dev0/tests/test_loader_window_model.py +215 -0
  113. loghunter_cli-0.1.0.dev0/tests/test_output_path_cascade.py +575 -0
  114. loghunter_cli-0.1.0.dev0/tests/test_resolve_path.py +111 -0
  115. loghunter_cli-0.1.0.dev0/tests/test_root_provenance.py +212 -0
  116. loghunter_cli-0.1.0.dev0/tests/test_runner.py +2599 -0
  117. loghunter_cli-0.1.0.dev0/tests/test_scan_detector.py +455 -0
  118. loghunter_cli-0.1.0.dev0/tests/test_search_paths.py +50 -0
  119. loghunter_cli-0.1.0.dev0/tests/test_sniff_orchestrator.py +373 -0
  120. loghunter_cli-0.1.0.dev0/tests/test_sniff_recognizers.py +573 -0
  121. loghunter_cli-0.1.0.dev0/tests/test_source_resolution_seam.py +471 -0
  122. loghunter_cli-0.1.0.dev0/tests/test_sources.py +648 -0
  123. loghunter_cli-0.1.0.dev0/tests/test_splunk_exporter.py +351 -0
  124. loghunter_cli-0.1.0.dev0/tests/test_syslog_detector.py +458 -0
  125. loghunter_cli-0.1.0.dev0/tests/test_syslog_parser.py +582 -0
  126. loghunter_cli-0.1.0.dev0/tests/test_text_output.py +1225 -0
  127. loghunter_cli-0.1.0.dev0/tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 David Augros
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,336 @@
1
+ Metadata-Version: 2.4
2
+ Name: loghunter-cli
3
+ Version: 0.1.0.dev0
4
+ Summary: ML-assisted network and log analysis toolkit for security practitioners and threat hunters.
5
+ Author-email: David Augros <code@augros.org>
6
+ License: MIT
7
+ Requires-Python: >=3.11
8
+ Description-Content-Type: text/markdown
9
+ License-File: LICENSE
10
+ Requires-Dist: pandas>=2.0
11
+ Requires-Dist: numpy>=1.26
12
+ Requires-Dist: scikit-learn>=1.3
13
+ Requires-Dist: hdbscan>=0.8
14
+ Requires-Dist: drain3>=0.9
15
+ Requires-Dist: tqdm>=4.0
16
+ Requires-Dist: tldextract>=3.0
17
+ Provides-Extra: fast
18
+ Requires-Dist: fast-hdbscan>=0.2; extra == "fast"
19
+ Provides-Extra: splunk
20
+ Requires-Dist: splunk-sdk; extra == "splunk"
21
+ Provides-Extra: cloudtrail
22
+ Requires-Dist: boto3; extra == "cloudtrail"
23
+ Requires-Dist: botocore[crt]; extra == "cloudtrail"
24
+ Provides-Extra: all
25
+ Requires-Dist: loghunter-cli[fast]; extra == "all"
26
+ Requires-Dist: loghunter-cli[splunk]; extra == "all"
27
+ Requires-Dist: loghunter-cli[cloudtrail]; extra == "all"
28
+ Dynamic: license-file
29
+
30
+ # LogHunter
31
+
32
+ LogHunter is a local-first command-line threat-hunting workbench for self-hosters. You
33
+ point it at the logs you already have — Zeek, Pi-hole/dnsmasq, syslog, CloudTrail — and it
34
+ tells you what's in them and runs transparent detectors over them: beaconing, suspicious
35
+ DNS, port scans, rare syslog events, abnormally long connections, and unusual CloudTrail
36
+ activity. Every run names the technique behind each detector, so you always know whether a
37
+ finding came from a published algorithm or an honest heuristic.
38
+
39
+ **Not a SIEM. Not an agent. Not magic.** Nothing to deploy, no database, no daemon, no
40
+ account. Install it, point it at a directory of logs, read the output. It runs on the
41
+ admin's own box, over logs at rest.
42
+
43
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](#license)
44
+ ![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue.svg)
45
+
46
+ > **Status: early / pre-1.0 (`0.1.0.dev0`).** The six detectors below work and are
47
+ > covered by tests, but interfaces may still move before 1.0. Feedback welcome.
48
+
49
+ <!-- TODO(screenshots): a real terminal capture of `loghunter ~/zeek` and a `digest` card go here. -->
50
+
51
+ A run opens with a summary banner — what was loaded, and which technique each detector
52
+ used — then groups findings by detector (illustrative output; addresses are
53
+ [RFC 5737](https://datatracker.ietf.org/doc/html/rfc5737) documentation space):
54
+
55
+ ```
56
+ LogHunter · Threat Hunt
57
+ ══════════════════════════════════════════════════════════════════════════════
58
+ Data found: 2026-05-31 00:00 → 2026-06-01 00:00 (24h)
59
+ Records: 1,284,402 conn.log · 318,221 dns.log · 44,019 *.log
60
+ Detectors: beacon (FFT) · dns (fast-HDBSCAN) · syslog (drain3) · scan [pattern] · duration [heuristics]
61
+ ══════════════════════════════════════════════════════════════════════════════
62
+
63
+ beacon — 2 findings · 1 H 1 M
64
+ ────────────────────────────────────────────────────────────────────────────────
65
+ [H] 192.0.2.37 → 198.51.100.20:443/tcp score 0.91 period 60.0s 1,440 conns
66
+ [M] 192.0.2.37 → 198.51.100.61:8443/tcp score 0.74 period 300.0s 288 conns
67
+
68
+ dns — 1 finding · 1 M
69
+ ────────────────────────────────────────────────────────────────────────────────
70
+ [M] dga-lookups.example entropy 3.91 14 subdomains cluster -1 (noise)
71
+ ```
72
+
73
+ The two-tier styling of the `Detectors:` line is deliberate: published techniques glow in
74
+ parentheses — `(FFT)`, `(HDBSCAN)`, `(drain3)` — while honest house methods are plain in
75
+ brackets — `[pattern]`, `[heuristics]`, `[statistical]`. The restraint is the point. A
76
+ heuristic is never dressed up as an algorithm, which is what makes the glow trustworthy.
77
+
78
+ ## Quick start
79
+
80
+ ```bash
81
+ pip install loghunter-cli
82
+
83
+ # one-time, detection-driven setup — finds your logs and writes a config
84
+ loghunter init
85
+
86
+ # hunt across everything enabled in your config
87
+ loghunter
88
+
89
+ # or point at a directory / file directly
90
+ loghunter ~/zeek-logs
91
+ loghunter syslog /var/log
92
+
93
+ # orient before you hunt — a fast, factual profile of a single file
94
+ loghunter digest /var/log/messages
95
+ ```
96
+
97
+ No config file is required to get started — `loghunter <path>` works against a directory or
98
+ a single file. `loghunter init` just makes it repeatable.
99
+
100
+ ## Why use LogHunter?
101
+
102
+ - **It runs where your logs are.** No services, no database, no daemon, no agent to push.
103
+ `pip install`, point it at a directory, get output. The only setup step that exists at all
104
+ is `loghunter init`, and that only writes a config file.
105
+ - **Real methods, made visible.** Beaconing is found with an FFT over connection timing;
106
+ DNS with HDBSCAN clustering over per-query behavior; rare syslog events with drain3
107
+ log-templating plus rarity scoring; CloudTrail with a transparent per-principal z-score
108
+ composite. Every run tells you which technique ran. You can read *why* something was
109
+ surfaced — no black box.
110
+ - **Big-tent ingestion.** One tool reads Zeek (NDJSON *and* TSV, flat *or* date-partitioned
111
+ directories), Pi-hole/dnsmasq, flat RFC 3164 syslog (Debian *and* RHEL/Fedora layouts),
112
+ and CloudTrail. Rotation and `.gz`/`.bz2`/`.xz` compression are handled transparently.
113
+ - **Orient before you hunt.** `loghunter digest FILE` reads a log and reports facts about
114
+ it — time span, top talkers, the shape of the mix — with zero verdicts. It's sonar, not a
115
+ baggage scanner: it tells you what's there so you know where to point the detectors.
116
+ - **Filter before analyze.** A flat-file allowlist suppresses known-good infrastructure
117
+ *before* any detector sees the data, so your noise floor is yours to set and detectors
118
+ never have to know the allowlist exists.
119
+ - **Honest output.** Findings carry a severity, the evidence behind the score, and (with
120
+ `-v`/`-vv`) the pivots an analyst should chase next. Machine formats (`json`, `csv`, `html`) are
121
+ lossless; the terminal view is the one that summarizes.
122
+
123
+ ## Why *not* use LogHunter?
124
+
125
+ - **It is not real-time and not a SIEM.** It runs over logs at rest, in batches. There's no
126
+ streaming, no alerting pipeline, no live correlation across sources at scale. If you need an
127
+ always-on detection platform, you need a SIEM; LogHunter is the workbench you reach for to
128
+ *hunt*.
129
+ - **It is stateless between runs.** There's no persisted baseline and no rolling history.
130
+ CloudTrail "first-seen" novelty, for example, is relative to the window you loaded — not to
131
+ all of recorded time.
132
+ - **Detector coverage is v1.** Six detectors ship today (below). `auth`, `ssl`, `protocol`,
133
+ and `weird` are planned but not built.
134
+ - **The richest network signal wants Zeek.** Pi-hole/dnsmasq gives you DNS only — no RTT,
135
+ TTL, or connection correlation. LogHunter will tell you so and keep working, but Zeek is
136
+ where it shines.
137
+ - **It surfaces, it doesn't block.** This is a tool for a human triaging behavior, not a
138
+ signature IDS or an enforcement point.
139
+
140
+ ## What it hunts
141
+
142
+ | Detector | Surfaces | Method | Source |
143
+ |-----------|-----------------------------------------------------|------------------------------|--------------------------------|
144
+ | `beacon` | periodic C2-style callbacks | FFT over connection timing | Zeek `conn.log` |
145
+ | `dns` | DGA / tunneling / anomalous lookups | HDBSCAN clustering | Zeek `dns.log` **or** Pi-hole |
146
+ | `syslog` | rare events & reboots | drain3 templating + rarity | syslog (flat) **or** Zeek `syslog.log` |
147
+ | `scan` | vertical / horizontal / block / slow port scans | pattern (heuristic) | Zeek `conn.log` |
148
+ | `duration`| abnormally long-lived connections | heuristics | Zeek `conn.log` |
149
+ | `aws` | per-principal anomalous CloudTrail behavior | statistical (z-score composite) | CloudTrail `*.json` |
150
+
151
+ `dns` and `syslog` each answer **one** question across **two** source families — Zeek and
152
+ Pi-hole for DNS, flat rsyslog and Zeek's own `syslog.log` for syslog — and adapt to whichever
153
+ fidelity they're handed.
154
+
155
+ Run them all (`loghunter`), select some (`loghunter --detect=beacon,dns`), or exclude
156
+ (`loghunter --detect='all,!syslog'`). Each detector is also its own subcommand:
157
+ `loghunter beacon ~/zeek`.
158
+
159
+ ## How a run works
160
+
161
+ ```
162
+ discover & parse → allowlist (suppress) → detect → render
163
+ ```
164
+
165
+ Responsibilities don't bleed across that line. The **loader** finds files, decompresses,
166
+ normalizes every connection source to one canonical schema, and absorbs storage variation
167
+ (TSV vs. NDJSON, flat vs. dated directories, rotation). The **allowlist** suppresses
168
+ known-good traffic *before* analysis. **Detectors** only analyze — they never open files,
169
+ read config, or suppress. **Output handlers** only render. The CLI is the one place that
170
+ turns an error into an actionable message and owns the exit code.
171
+
172
+ Because detectors are pure analysis, every one is importable and callable as an ordinary
173
+ Python function — useful in a notebook when you want to experiment.
174
+
175
+ ### Analysis window
176
+
177
+ Pointed at a **directory**, an unqualified run looks back over the last `default_window`
178
+ (`1d` out of the box) of *that source's own* data — the right default for a live log dir
179
+ you don't want to read in full every time. Pointed at a **single file**, it reads the whole
180
+ file. Override either way:
181
+
182
+ ```bash
183
+ loghunter --since=7d ~/zeek # last 7 days
184
+ loghunter --since=2026-05-01 --until=2026-05-08 ~/zeek
185
+ loghunter --days=2-4 ~/zeek # 2 to 4 days ago
186
+ loghunter --all ~/zeek # the entire archive
187
+ ```
188
+
189
+ CloudTrail is the one source that opts out of the default window — novelty detection needs
190
+ full history, so it always loads in full unless you narrow it explicitly.
191
+
192
+ ## Orient before the hunt: `digest`
193
+
194
+ ```bash
195
+ loghunter digest /var/log/messages
196
+ loghunter digest conn.log dns.log # several files → several cards
197
+ ```
198
+
199
+ `digest` content-sniffs each file, routes it to the right summarizer (conn, dns, syslog,
200
+ cloudtrail), and falls back to a fast byte-profiler — **blob** — for anything it doesn't
201
+ recognize. A card is flush-left and factual: the file's time window, line count and size, a
202
+ scale-anchored histogram, and a handful of plain-language insights ("one client accounts for
203
+ 71% of queries"). It states facts and superlatives, never verdicts — no "suspicious," no
204
+ "anomalous." It reads your data *before* the allowlist, because everything in the file,
205
+ allowlisted or not, is part of "what's in here." The blob profiler is bounded: it samples a
206
+ big file rather than reading it in full, so a one-gigabyte mystery file costs the same as a
207
+ one-kilobyte one.
208
+
209
+ ## Installation
210
+
211
+ LogHunter is published on PyPI as **`loghunter-cli`** (the command, import package, and config
212
+ section are all `loghunter`).
213
+
214
+ ```bash
215
+ pip install loghunter-cli # core
216
+ pip install 'loghunter-cli[fast]' # fast-hdbscan accelerator for DNS clustering
217
+ pip install 'loghunter-cli[splunk]' # Splunk exporter
218
+ pip install 'loghunter-cli[cloudtrail]' # CloudTrail (S3) exporter
219
+ pip install 'loghunter-cli[all]' # everything above
220
+ ```
221
+
222
+ Requires **Python 3.11+**. A bare `pip install loghunter-cli` always works — the DNS clustering
223
+ runs on stock `hdbscan` (a base dependency); `[fast]` swaps in a numba-accelerated backend
224
+ when you want it, and the tool tells you which one is active on every run.
225
+
226
+ From source:
227
+
228
+ ```bash
229
+ git clone https://github.com/spiralbend/loghunter
230
+ cd loghunter
231
+ pip install -e '.[all]'
232
+ ```
233
+
234
+ ## Configuration
235
+
236
+ Configuration is optional — LogHunter runs against a path with none. When you want it
237
+ repeatable, `loghunter init` looks at the conventional locations on your box, profiles what
238
+ it finds (which log families, rough size, freshness — without reading a single log line),
239
+ and writes a fully-annotated `~/.loghunter/config.toml`. It never clobbers settings you
240
+ already have.
241
+
242
+ Config is loaded from the first of:
243
+
244
+ 1. `--config=FILE`
245
+ 2. `~/.loghunter/config.toml`
246
+ 3. `/etc/loghunter/config.toml`
247
+
248
+ Everything LogHunter owns lives under the hidden `~/.loghunter/` — config, allowlists,
249
+ exports, reports — so it can't collide with a project directory. A trimmed example:
250
+
251
+ ```toml
252
+ [loghunter]
253
+ detect = "all" # "all" | "dns,beacon" | "all,!syslog"
254
+ zeek_dir = "/var/log/zeek"
255
+ syslog_dir = "/var/log"
256
+ # pihole_dir = "/var/log/pihole"
257
+ # cloudtrail_dir = "/var/log/cloudtrail"
258
+
259
+ home_net = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
260
+ default_window = "1d" # lookback for a directory; "" or "all" = full
261
+ output_format = "text" # text | json | csv | html
262
+ ```
263
+
264
+ Findings print to your terminal by default — keep it pipeable. Set `report_dir` (or pass
265
+ `--out=PATH`) to write report files instead. Every tunable a detector exposes is documented
266
+ as a commented "engine room" at the bottom of the generated config; you rarely need it, and
267
+ `loghunter <detector> --help` lists the full surface.
268
+
269
+ ## Log sources it speaks
270
+
271
+ - **Zeek** — `conn.log`, `dns.log`, `syslog.log`, in NDJSON or TSV, from a flat directory or
272
+ date-partitioned subdirectories. Rotation and gzip/bzip2/xz compression are transparent.
273
+ - **Pi-hole / dnsmasq** — DNS event logs, aggregated per domain for clustering.
274
+ - **syslog** — flat RFC 3164. Discovery is content-sniffed, not filename-matched, so it
275
+ handles both the Debian convention (`syslog`, `auth.log`, `kern.log`) and the RHEL/Fedora
276
+ one (extensionless `messages`, `secure`, `maillog`) — and won't mistake `dnf.log` or a
277
+ binary like `wtmp` for a log stream.
278
+ - **CloudTrail** — gzipped JSON event records, read locally or pulled from S3 (below).
279
+
280
+ ## The allowlist
281
+
282
+ Two kinds of allowlist file, never conflated:
283
+
284
+ - **Flat files = suppression.** One rule per line — an IP, a CIDR, a `:port/proto`, or a
285
+ domain glob/regex. Matching traffic is dropped before any detector runs. LogHunter ships a
286
+ curated domain list and never ships numeric connection suppressions (those depend on your
287
+ hosts, and shipping them could hide real findings).
288
+ - **TOML stanzas = classification.** When a detector needs to know *what* something is
289
+ (a nameserver, a backup client) rather than whether to drop it.
290
+
291
+ A bare host IP with no port suppresses *all* traffic involving that host — powerful, and
292
+ called out as such wherever it appears.
293
+
294
+ ## Pulling logs in: exporters
295
+
296
+ LogHunter can fetch logs from external systems to local files, which it then analyzes like
297
+ any other source — the syslog detector can't tell whether the data came from rsyslog or a
298
+ Splunk export.
299
+
300
+ ```bash
301
+ loghunter export # run the configured "default" query
302
+ loghunter export auth # run a named query
303
+ ```
304
+
305
+ - **Splunk** — named SPL queries under `[export.splunk.query.<name>]`. Prefer the
306
+ `LOGHUNTER_SPLUNK_USER` / `LOGHUNTER_SPLUNK_PASS` environment variables over plaintext
307
+ credentials in config.
308
+ - **CloudTrail** — pulls gzipped JSON from an S3 prefix. AWS authentication is *not* handled
309
+ here: you authenticate your shell, and boto3 resolves the ambient credential chain.
310
+ LogHunter never reads, stores, or prompts for AWS credentials, and warns before a large
311
+ egress.
312
+
313
+ ## Output formats
314
+
315
+ `text` (default, grouped and summarized), `json` (one finding per line, pipeable), `csv`
316
+ (flattened), and `html` (a self-contained file). Pass `--output=json` or set `output_format`
317
+ in config. `-v` adds the curated "why it scored" detail; `-vv` adds raw debug — template
318
+ strings, cluster membership, full evidence. Color is enhancement-only and TTY-gated: piped
319
+ or redirected output is always plain, and the machine formats never emit an escape code.
320
+
321
+ ## Building from source & running tests
322
+
323
+ ```bash
324
+ git clone https://github.com/spiralbend/loghunter
325
+ cd loghunter
326
+ pip install -e '.[all]'
327
+ python -m pytest
328
+ ```
329
+
330
+ `main` is kept runnable. Architecture tests cover the boundaries that matter — detector
331
+ discovery, run planning, loader metadata, allowlist suppression, output registration, and
332
+ CLI error formatting.
333
+
334
+ ## License
335
+
336
+ LogHunter is licensed under the [MIT License](LICENSE).
@@ -0,0 +1,307 @@
1
+ # LogHunter
2
+
3
+ LogHunter is a local-first command-line threat-hunting workbench for self-hosters. You
4
+ point it at the logs you already have — Zeek, Pi-hole/dnsmasq, syslog, CloudTrail — and it
5
+ tells you what's in them and runs transparent detectors over them: beaconing, suspicious
6
+ DNS, port scans, rare syslog events, abnormally long connections, and unusual CloudTrail
7
+ activity. Every run names the technique behind each detector, so you always know whether a
8
+ finding came from a published algorithm or an honest heuristic.
9
+
10
+ **Not a SIEM. Not an agent. Not magic.** Nothing to deploy, no database, no daemon, no
11
+ account. Install it, point it at a directory of logs, read the output. It runs on the
12
+ admin's own box, over logs at rest.
13
+
14
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](#license)
15
+ ![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue.svg)
16
+
17
+ > **Status: early / pre-1.0 (`0.1.0.dev0`).** The six detectors below work and are
18
+ > covered by tests, but interfaces may still move before 1.0. Feedback welcome.
19
+
20
+ <!-- TODO(screenshots): a real terminal capture of `loghunter ~/zeek` and a `digest` card go here. -->
21
+
22
+ A run opens with a summary banner — what was loaded, and which technique each detector
23
+ used — then groups findings by detector (illustrative output; addresses are
24
+ [RFC 5737](https://datatracker.ietf.org/doc/html/rfc5737) documentation space):
25
+
26
+ ```
27
+ LogHunter · Threat Hunt
28
+ ══════════════════════════════════════════════════════════════════════════════
29
+ Data found: 2026-05-31 00:00 → 2026-06-01 00:00 (24h)
30
+ Records: 1,284,402 conn.log · 318,221 dns.log · 44,019 *.log
31
+ Detectors: beacon (FFT) · dns (fast-HDBSCAN) · syslog (drain3) · scan [pattern] · duration [heuristics]
32
+ ══════════════════════════════════════════════════════════════════════════════
33
+
34
+ beacon — 2 findings · 1 H 1 M
35
+ ────────────────────────────────────────────────────────────────────────────────
36
+ [H] 192.0.2.37 → 198.51.100.20:443/tcp score 0.91 period 60.0s 1,440 conns
37
+ [M] 192.0.2.37 → 198.51.100.61:8443/tcp score 0.74 period 300.0s 288 conns
38
+
39
+ dns — 1 finding · 1 M
40
+ ────────────────────────────────────────────────────────────────────────────────
41
+ [M] dga-lookups.example entropy 3.91 14 subdomains cluster -1 (noise)
42
+ ```
43
+
44
+ The two-tier styling of the `Detectors:` line is deliberate: published techniques glow in
45
+ parentheses — `(FFT)`, `(HDBSCAN)`, `(drain3)` — while honest house methods are plain in
46
+ brackets — `[pattern]`, `[heuristics]`, `[statistical]`. The restraint is the point. A
47
+ heuristic is never dressed up as an algorithm, which is what makes the glow trustworthy.
48
+
49
+ ## Quick start
50
+
51
+ ```bash
52
+ pip install loghunt
53
+
54
+ # one-time, detection-driven setup — finds your logs and writes a config
55
+ loghunter init
56
+
57
+ # hunt across everything enabled in your config
58
+ loghunter
59
+
60
+ # or point at a directory / file directly
61
+ loghunter ~/zeek-logs
62
+ loghunter syslog /var/log
63
+
64
+ # orient before you hunt — a fast, factual profile of a single file
65
+ loghunter digest /var/log/messages
66
+ ```
67
+
68
+ No config file is required to get started — `loghunter <path>` works against a directory or
69
+ a single file. `loghunter init` just makes it repeatable.
70
+
71
+ ## Why use LogHunter?
72
+
73
+ - **It runs where your logs are.** No services, no database, no daemon, no agent to push.
74
+ `pip install`, point it at a directory, get output. The only setup step that exists at all
75
+ is `loghunter init`, and that only writes a config file.
76
+ - **Real methods, made visible.** Beaconing is found with an FFT over connection timing;
77
+ DNS with HDBSCAN clustering over per-query behavior; rare syslog events with drain3
78
+ log-templating plus rarity scoring; CloudTrail with a transparent per-principal z-score
79
+ composite. Every run tells you which technique ran. You can read *why* something was
80
+ surfaced — no black box.
81
+ - **Big-tent ingestion.** One tool reads Zeek (NDJSON *and* TSV, flat *or* date-partitioned
82
+ directories), Pi-hole/dnsmasq, flat RFC 3164 syslog (Debian *and* RHEL/Fedora layouts),
83
+ and CloudTrail. Rotation and `.gz`/`.bz2`/`.xz` compression are handled transparently.
84
+ - **Orient before you hunt.** `loghunter digest FILE` reads a log and reports facts about
85
+ it — time span, top talkers, the shape of the mix — with zero verdicts. It's sonar, not a
86
+ baggage scanner: it tells you what's there so you know where to point the detectors.
87
+ - **Filter before you analyze.** A flat-file allowlist suppresses known-good infrastructure
88
+ *before* any detector sees the data, so your noise floor is yours to set and detectors
89
+ never have to know the allowlist exists.
90
+ - **Honest output.** Findings carry a severity, the evidence behind the score, and (with
91
+ `-v`/`-vv`) the analyst pivots to chase next. Machine formats (`json`, `csv`, `html`) are
92
+ lossless; the terminal view is the one that summarizes.
93
+
94
+ ## Why *not* use LogHunter?
95
+
96
+ - **It is not real-time and not a SIEM.** It runs over logs at rest, in batches. There's no
97
+ streaming, no alerting pipeline, no live correlation across sources at scale. If you need an
98
+ always-on detection platform, you need a SIEM; LogHunter is the workbench you reach for to
99
+ *hunt*.
100
+ - **It is stateless between runs.** There's no persisted baseline and no rolling history.
101
+ CloudTrail "first-seen" novelty, for example, is relative to the window you loaded — not to
102
+ all of recorded time.
103
+ - **Detector coverage is v1.** Six detectors ship today (below). `auth`, `ssl`, `protocol`,
104
+ and `weird` are planned but not built.
105
+ - **The richest network signal wants Zeek.** Pi-hole/dnsmasq gives you DNS only — no RTT,
106
+ TTL, or connection correlation. LogHunter will tell you so and keep working, but Zeek is
107
+ where it shines.
108
+ - **It surfaces, it doesn't block.** This is a tool for a human triaging behavior, not a
109
+ signature IDS or an enforcement point.
110
+
111
+ ## What it hunts
112
+
113
+ | Detector | Surfaces | Method | Source |
114
+ |-----------|-----------------------------------------------------|------------------------------|--------------------------------|
115
+ | `beacon` | periodic C2-style callbacks | FFT over connection timing | Zeek `conn.log` |
116
+ | `dns` | DGA / tunneling / anomalous lookups | HDBSCAN clustering | Zeek `dns.log` **or** Pi-hole |
117
+ | `syslog` | rare events & reboots | drain3 templating + rarity | syslog (flat) **or** Zeek `syslog.log` |
118
+ | `scan` | vertical / horizontal / block / slow port scans | pattern (heuristic) | Zeek `conn.log` |
119
+ | `duration`| abnormally long-lived connections | heuristics | Zeek `conn.log` |
120
+ | `aws` | per-principal anomalous CloudTrail behavior | statistical (z-score composite) | CloudTrail `*.json` |
121
+
122
+ `dns` and `syslog` each answer **one** question across **two** source families — Zeek and
123
+ Pi-hole for DNS, flat rsyslog and Zeek's own `syslog.log` for syslog — and adapt to whichever
124
+ fidelity they're handed.
125
+
126
+ Run them all (`loghunter`), select some (`loghunter --detect=beacon,dns`), or exclude
127
+ (`loghunter --detect='all,!syslog'`). Each detector is also its own subcommand:
128
+ `loghunter beacon ~/zeek`.
129
+
130
+ ## How a run works
131
+
132
+ ```
133
+ discover & parse → allowlist (suppress) → detect → render
134
+ ```
135
+
136
+ Responsibilities don't bleed across those stage boundaries. The **loader** finds files, decompresses,
137
+ normalizes every connection source to one canonical schema, and absorbs storage variation
138
+ (TSV vs. NDJSON, flat vs. dated directories, rotation). The **allowlist** suppresses
139
+ known-good traffic *before* analysis. **Detectors** only analyze — they never open files,
140
+ read config, or suppress. **Output handlers** only render. The CLI is the one place that
141
+ turns an error into an actionable message and owns the exit code.
142
+
143
+ Because detectors are pure analysis, every one is importable and callable as an ordinary
144
+ Python function — useful in a notebook when you want to experiment.
145
+
146
+ ### Analysis window
147
+
148
+ Pointed at a **directory**, an unqualified run looks back over the last `default_window`
149
+ (`1d` out of the box) of *that source's own* data — the right default for a live log dir
150
+ you don't want to read in full every time. Pointed at a **single file**, it reads the whole
151
+ file. Override either way:
152
+
153
+ ```bash
154
+ loghunter --since=7d ~/zeek # last 7 days
155
+ loghunter --since=2026-05-01 --until=2026-05-08 ~/zeek
156
+ loghunter --days=2-4 ~/zeek # 2 to 4 days ago
157
+ loghunter --all ~/zeek # the entire archive
158
+ ```
159
+
160
+ CloudTrail is the one source that opts out of the default window — novelty detection needs
161
+ full history, so it always loads in full unless you narrow it explicitly.
162
+
163
+ ## Orient before the hunt: `digest`
164
+
165
+ ```bash
166
+ loghunter digest /var/log/messages
167
+ loghunter digest conn.log dns.log # several files → several cards
168
+ ```
169
+
170
+ `digest` content-sniffs each file, routes it to the right summarizer (conn, dns, syslog,
171
+ cloudtrail), and falls back to a fast byte-profiler — **blob** — for anything it doesn't
172
+ recognize. A card is flush-left and factual: the file's time window, line count and size, a
173
+ scale-anchored histogram, and a handful of plain-language insights ("one client accounts for
174
+ 71% of queries"). It states facts and superlatives, never verdicts — no "suspicious," no
175
+ "anomalous." It reads your data *before* the allowlist, because everything in the file,
176
+ allowlisted or not, is part of "what's in here." The blob profiler is bounded: it samples a
177
+ big file rather than reading it, so a one-gigabyte mystery file costs the same as a
178
+ one-kilobyte one.
179
+
180
+ ## Installation
181
+
182
+ LogHunter is published on PyPI as **`loghunt`** (the command, import package, and config
183
+ section are all `loghunter`).
184
+
185
+ ```bash
186
+ pip install loghunt # core
187
+ pip install 'loghunt[fast]' # fast-hdbscan accelerator for DNS clustering
188
+ pip install 'loghunt[splunk]' # Splunk exporter
189
+ pip install 'loghunt[cloudtrail]' # CloudTrail (S3) exporter
190
+ pip install 'loghunt[all]' # everything above
191
+ ```
192
+
193
+ Requires **Python 3.11+**. A bare `pip install loghunt` always works — the DNS clustering
194
+ runs on stock `hdbscan` (a base dependency); `[fast]` swaps in a numba-accelerated backend
195
+ when you want it, and the tool tells you which one is active on every run.
196
+
197
+ From source:
198
+
199
+ ```bash
200
+ git clone https://github.com/spiralbend/loghunter
201
+ cd loghunter
202
+ pip install -e '.[all]'
203
+ ```
204
+
205
+ ## Configuration
206
+
207
+ Configuration is optional — LogHunter runs against a path with none. When you want it
208
+ repeatable, `loghunter init` looks at the conventional locations on your box, profiles what
209
+ it finds (which log families, rough size, freshness — without reading a single log line),
210
+ and writes a fully-annotated `~/.loghunter/config.toml`. It never clobbers settings you
211
+ already have.
212
+
213
+ Config is loaded from the first of:
214
+
215
+ 1. `--config=FILE`
216
+ 2. `~/.loghunter/config.toml`
217
+ 3. `/etc/loghunter/config.toml`
218
+
219
+ Everything LogHunter owns lives under the hidden `~/.loghunter/` — config, allowlists,
220
+ exports, reports — so it can't collide with a project directory. A trimmed example:
221
+
222
+ ```toml
223
+ [loghunter]
224
+ detect = "all" # "all" | "dns,beacon" | "all,!syslog"
225
+ zeek_dir = "/var/log/zeek"
226
+ syslog_dir = "/var/log"
227
+ # pihole_dir = "/var/log/pihole"
228
+ # cloudtrail_dir = "/var/log/cloudtrail"
229
+
230
+ home_net = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
231
+ default_window = "1d" # lookback for a directory; "" or "all" = full
232
+ output_format = "text" # text | json | csv | html
233
+ ```
234
+
235
+ Findings print to your terminal by default, which keeps the output pipeable. Set `report_dir` (or pass
236
+ `--out=PATH`) to write report files instead. Every tunable a detector exposes is documented
237
+ as a commented "engine room" at the bottom of the generated config; you rarely need it, and
238
+ `loghunter <detector> --help` lists the full surface.
239
+
240
+ ## Log sources it speaks
241
+
242
+ - **Zeek** — `conn.log`, `dns.log`, `syslog.log`, in NDJSON or TSV, from a flat directory or
243
+ date-partitioned subdirectories. Rotation and gzip/bzip2/xz compression are transparent.
244
+ - **Pi-hole / dnsmasq** — DNS event logs, aggregated per domain for clustering.
245
+ - **syslog** — flat RFC 3164. Discovery is content-sniffed, not filename-matched, so it
246
+ handles both the Debian convention (`syslog`, `auth.log`, `kern.log`) and the RHEL/Fedora
247
+ one (extensionless `messages`, `secure`, `maillog`) — and won't mistake `dnf.log` or a
248
+ binary like `wtmp` for a log stream.
249
+ - **CloudTrail** — gzipped JSON event records, read locally or pulled from S3 (below).
250
+
251
+ ## The allowlist
252
+
253
+ Two kinds of allowlist file, never conflated:
254
+
255
+ - **Flat files = suppression.** One rule per line — an IP, a CIDR, a `:port/proto`, or a
256
+ domain glob/regex. Matching traffic is dropped before any detector runs. LogHunter ships a
257
+ curated domain list and never ships numeric connection suppressions (those depend on your
258
+ hosts, and shipping them could hide real findings).
259
+ - **TOML stanzas = classification.** When a detector needs to know *what* something is
260
+ (a nameserver, a backup client) rather than whether to drop it.
261
+
262
+ A bare host IP with no port suppresses *all* traffic involving that host — powerful, and
263
+ called out as such wherever it appears.
264
+
265
+ ## Pulling logs in: exporters
266
+
267
+ LogHunter can fetch logs from external systems to local files, which it then analyzes like
268
+ any other source — the syslog detector can't tell whether the data came from rsyslog or a
269
+ Splunk export.
270
+
271
+ ```bash
272
+ loghunter export # run the configured "default" query
273
+ loghunter export auth # run a named query
274
+ ```
275
+
276
+ - **Splunk** — named SPL queries under `[export.splunk.query.<name>]`. Prefer the
277
+ `LOGHUNTER_SPLUNK_USER` / `LOGHUNTER_SPLUNK_PASS` environment variables over plaintext
278
+ credentials in config.
279
+ - **CloudTrail** — pulls gzipped JSON from an S3 prefix. AWS authentication is *not* handled
280
+ here: you authenticate your shell, and boto3 resolves the ambient credential chain.
281
+ LogHunter never reads, stores, or prompts for AWS credentials, and warns before a large
282
+ egress.
283
+
284
+ ## Output formats
285
+
286
+ `text` (default, grouped and summarized), `json` (one finding per line, pipeable), `csv`
287
+ (flattened), and `html` (a self-contained file). Pass `--output=json` or set `output_format`
288
+ in config. `-v` adds the curated "why it scored" detail; `-vv` adds raw debug output — template
289
+ strings, cluster membership, full evidence. Color is enhancement-only and TTY-gated: piped
290
+ or redirected output is always plain, and the machine formats never emit an escape code.
291
+
292
+ ## Building from source & running tests
293
+
294
+ ```bash
295
+ git clone https://github.com/spiralbend/loghunter
296
+ cd loghunter
297
+ pip install -e '.[all]'
298
+ python -m pytest
299
+ ```
300
+
301
+ `main` is kept runnable. Architecture tests cover the boundaries that matter — detector
302
+ discovery, run planning, loader metadata, allowlist suppression, output registration, and
303
+ CLI error formatting.
304
+
305
+ ## License
306
+
307
+ LogHunter is licensed under the [MIT License](LICENSE).
@@ -0,0 +1,3 @@
1
+ """LogHunter — network and log analysis tools for self-hosted security practitioners."""
2
+
3
+ __version__ = "0.1.0"