loghunter-cli 0.1.0.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter_cli-0.1.0.dev0/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0/PKG-INFO +336 -0
- loghunter_cli-0.1.0.dev0/README.md +307 -0
- loghunter_cli-0.1.0.dev0/loghunter/__init__.py +3 -0
- loghunter_cli-0.1.0.dev0/loghunter/cli.py +1108 -0
- loghunter_cli-0.1.0.dev0/loghunter/cli_init.py +567 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/__init__.py +1 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/allowlist.py +436 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/clustering.py +326 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/config.py +221 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/display.py +323 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/errors.py +45 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/finding.py +239 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/loader/__init__.py +136 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/loader/diagnostics.py +94 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/loader/discovery.py +335 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/loader/io.py +76 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/loader/pipeline.py +1010 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/loader/sniff.py +184 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/loader/types.py +207 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/loader/windowing.py +523 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/output.py +93 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/paths.py +105 -0
- loghunter_cli-0.1.0.dev0/loghunter/common/sources.py +392 -0
- loghunter_cli-0.1.0.dev0/loghunter/data/allowlist/connections.txt +50 -0
- loghunter_cli-0.1.0.dev0/loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter_cli-0.1.0.dev0/loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter_cli-0.1.0.dev0/loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter_cli-0.1.0.dev0/loghunter/data/config_example.toml +144 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/__init__.py +5 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/auth.py +27 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/aws.py +671 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/beacon.py +258 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/dns.py +778 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/dnsblock.py +29 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/duration.py +178 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/protocol.py +26 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/scan.py +735 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/ssl.py +25 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/syslog.py +266 -0
- loghunter_cli-0.1.0.dev0/loghunter/detectors/weird.py +27 -0
- loghunter_cli-0.1.0.dev0/loghunter/digest/__init__.py +43 -0
- loghunter_cli-0.1.0.dev0/loghunter/digest/_stats.py +182 -0
- loghunter_cli-0.1.0.dev0/loghunter/digest/blob.py +698 -0
- loghunter_cli-0.1.0.dev0/loghunter/digest/cloudtrail.py +341 -0
- loghunter_cli-0.1.0.dev0/loghunter/digest/conn.py +367 -0
- loghunter_cli-0.1.0.dev0/loghunter/digest/dns.py +364 -0
- loghunter_cli-0.1.0.dev0/loghunter/digest/syslog.py +269 -0
- loghunter_cli-0.1.0.dev0/loghunter/exporters/__init__.py +534 -0
- loghunter_cli-0.1.0.dev0/loghunter/exporters/cloudtrail.py +499 -0
- loghunter_cli-0.1.0.dev0/loghunter/exporters/splunk.py +222 -0
- loghunter_cli-0.1.0.dev0/loghunter/outputs/__init__.py +1 -0
- loghunter_cli-0.1.0.dev0/loghunter/outputs/allowlist.py +75 -0
- loghunter_cli-0.1.0.dev0/loghunter/outputs/csv.py +70 -0
- loghunter_cli-0.1.0.dev0/loghunter/outputs/email.py +44 -0
- loghunter_cli-0.1.0.dev0/loghunter/outputs/html.py +99 -0
- loghunter_cli-0.1.0.dev0/loghunter/outputs/json.py +77 -0
- loghunter_cli-0.1.0.dev0/loghunter/outputs/text.py +1422 -0
- loghunter_cli-0.1.0.dev0/loghunter/parsers/__init__.py +1 -0
- loghunter_cli-0.1.0.dev0/loghunter/parsers/cloudtrail.py +287 -0
- loghunter_cli-0.1.0.dev0/loghunter/parsers/dnsmasq.py +331 -0
- loghunter_cli-0.1.0.dev0/loghunter/parsers/syslog.py +150 -0
- loghunter_cli-0.1.0.dev0/loghunter/parsers/zeek.py +294 -0
- loghunter_cli-0.1.0.dev0/loghunter/parsers/zeek_tsv.py +310 -0
- loghunter_cli-0.1.0.dev0/loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/PKG-INFO +336 -0
- loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/SOURCES.txt +125 -0
- loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/dependency_links.txt +1 -0
- loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/requires.txt +22 -0
- loghunter_cli-0.1.0.dev0/loghunter_cli.egg-info/top_level.txt +7 -0
- loghunter_cli-0.1.0.dev0/migrations/cloudtrail_parquet.py +59 -0
- loghunter_cli-0.1.0.dev0/migrations/conn_fft.py +550 -0
- loghunter_cli-0.1.0.dev0/migrations/conn_scan.py +1097 -0
- loghunter_cli-0.1.0.dev0/migrations/dns_dbscan.py +520 -0
- loghunter_cli-0.1.0.dev0/migrations/get_syslog.py +402 -0
- loghunter_cli-0.1.0.dev0/migrations/syslog_drain3.py +479 -0
- loghunter_cli-0.1.0.dev0/pyproject.toml +38 -0
- loghunter_cli-0.1.0.dev0/scratch/junk/parquet.py +59 -0
- loghunter_cli-0.1.0.dev0/setup.cfg +4 -0
- loghunter_cli-0.1.0.dev0/tests/__init__.py +1 -0
- loghunter_cli-0.1.0.dev0/tests/_cloudtrail_fakes.py +116 -0
- loghunter_cli-0.1.0.dev0/tests/conftest.py +17 -0
- loghunter_cli-0.1.0.dev0/tests/test_allowlist_defaults_accessor.py +90 -0
- loghunter_cli-0.1.0.dev0/tests/test_architecture_spine.py +302 -0
- loghunter_cli-0.1.0.dev0/tests/test_aws_detector.py +504 -0
- loghunter_cli-0.1.0.dev0/tests/test_be_like_water.py +106 -0
- loghunter_cli-0.1.0.dev0/tests/test_cli_help.py +342 -0
- loghunter_cli-0.1.0.dev0/tests/test_cli_multi_positional.py +458 -0
- loghunter_cli-0.1.0.dev0/tests/test_cloudtrail_exporter.py +631 -0
- loghunter_cli-0.1.0.dev0/tests/test_cloudtrail_exporter_botocore.py +207 -0
- loghunter_cli-0.1.0.dev0/tests/test_cloudtrail_parser.py +393 -0
- loghunter_cli-0.1.0.dev0/tests/test_clustering.py +85 -0
- loghunter_cli-0.1.0.dev0/tests/test_clustering_interruptible.py +404 -0
- loghunter_cli-0.1.0.dev0/tests/test_config_cli.py +1006 -0
- loghunter_cli-0.1.0.dev0/tests/test_config_example_drift.py +164 -0
- loghunter_cli-0.1.0.dev0/tests/test_digest_blob.py +1237 -0
- loghunter_cli-0.1.0.dev0/tests/test_digest_cli.py +1040 -0
- loghunter_cli-0.1.0.dev0/tests/test_digest_cloudtrail.py +980 -0
- loghunter_cli-0.1.0.dev0/tests/test_digest_conn.py +1189 -0
- loghunter_cli-0.1.0.dev0/tests/test_digest_dns.py +770 -0
- loghunter_cli-0.1.0.dev0/tests/test_digest_stats.py +282 -0
- loghunter_cli-0.1.0.dev0/tests/test_digest_syslog.py +724 -0
- loghunter_cli-0.1.0.dev0/tests/test_display.py +370 -0
- loghunter_cli-0.1.0.dev0/tests/test_dns_detector.py +1010 -0
- loghunter_cli-0.1.0.dev0/tests/test_dnsmasq_parser.py +467 -0
- loghunter_cli-0.1.0.dev0/tests/test_duration_detector.py +491 -0
- loghunter_cli-0.1.0.dev0/tests/test_export_orchestrator_shape.py +153 -0
- loghunter_cli-0.1.0.dev0/tests/test_init_wizard.py +707 -0
- loghunter_cli-0.1.0.dev0/tests/test_loader.py +3639 -0
- loghunter_cli-0.1.0.dev0/tests/test_loader_package_surface.py +115 -0
- loghunter_cli-0.1.0.dev0/tests/test_loader_window_model.py +215 -0
- loghunter_cli-0.1.0.dev0/tests/test_output_path_cascade.py +575 -0
- loghunter_cli-0.1.0.dev0/tests/test_resolve_path.py +111 -0
- loghunter_cli-0.1.0.dev0/tests/test_root_provenance.py +212 -0
- loghunter_cli-0.1.0.dev0/tests/test_runner.py +2599 -0
- loghunter_cli-0.1.0.dev0/tests/test_scan_detector.py +455 -0
- loghunter_cli-0.1.0.dev0/tests/test_search_paths.py +50 -0
- loghunter_cli-0.1.0.dev0/tests/test_sniff_orchestrator.py +373 -0
- loghunter_cli-0.1.0.dev0/tests/test_sniff_recognizers.py +573 -0
- loghunter_cli-0.1.0.dev0/tests/test_source_resolution_seam.py +471 -0
- loghunter_cli-0.1.0.dev0/tests/test_sources.py +648 -0
- loghunter_cli-0.1.0.dev0/tests/test_splunk_exporter.py +351 -0
- loghunter_cli-0.1.0.dev0/tests/test_syslog_detector.py +458 -0
- loghunter_cli-0.1.0.dev0/tests/test_syslog_parser.py +582 -0
- loghunter_cli-0.1.0.dev0/tests/test_text_output.py +1225 -0
- loghunter_cli-0.1.0.dev0/tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 David Augros
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,336 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: loghunter-cli
|
|
3
|
+
Version: 0.1.0.dev0
|
|
4
|
+
Summary: ML-assisted network and log analysis toolkit for security practitioners and threat hunters.
|
|
5
|
+
Author-email: David Augros <code@augros.org>
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.11
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
License-File: LICENSE
|
|
10
|
+
Requires-Dist: pandas>=2.0
|
|
11
|
+
Requires-Dist: numpy>=1.26
|
|
12
|
+
Requires-Dist: scikit-learn>=1.3
|
|
13
|
+
Requires-Dist: hdbscan>=0.8
|
|
14
|
+
Requires-Dist: drain3>=0.9
|
|
15
|
+
Requires-Dist: tqdm>=4.0
|
|
16
|
+
Requires-Dist: tldextract>=3.0
|
|
17
|
+
Provides-Extra: fast
|
|
18
|
+
Requires-Dist: fast-hdbscan>=0.2; extra == "fast"
|
|
19
|
+
Provides-Extra: splunk
|
|
20
|
+
Requires-Dist: splunk-sdk; extra == "splunk"
|
|
21
|
+
Provides-Extra: cloudtrail
|
|
22
|
+
Requires-Dist: boto3; extra == "cloudtrail"
|
|
23
|
+
Requires-Dist: botocore[crt]; extra == "cloudtrail"
|
|
24
|
+
Provides-Extra: all
|
|
25
|
+
Requires-Dist: loghunter-cli[fast]; extra == "all"
|
|
26
|
+
Requires-Dist: loghunter-cli[splunk]; extra == "all"
|
|
27
|
+
Requires-Dist: loghunter-cli[cloudtrail]; extra == "all"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# LogHunter
|
|
31
|
+
|
|
32
|
+
LogHunter is a local-first command-line threat-hunting workbench for self-hosters. You
|
|
33
|
+
point it at the logs you already have — Zeek, Pi-hole/dnsmasq, syslog, CloudTrail — and it
|
|
34
|
+
tells you what's in them and runs transparent detectors over them: beaconing, suspicious
|
|
35
|
+
DNS, port scans, rare syslog events, abnormally long connections, and unusual CloudTrail
|
|
36
|
+
activity. Every run names the technique behind each detector, so you always know whether a
|
|
37
|
+
finding came from a published algorithm or an honest heuristic.
|
|
38
|
+
|
|
39
|
+
**Not a SIEM. Not an agent. Not magic.** Nothing to deploy, no database, no daemon, no
|
|
40
|
+
account. Install it, point it at a directory of logs, read the output. It runs on the
|
|
41
|
+
admin's own box, over logs at rest.
|
|
42
|
+
|
|
43
|
+
[License: MIT](#license)
|
|
44
|
+

|
|
45
|
+
|
|
46
|
+
> **Status: early / pre-1.0 (`0.1.0.dev0`).** The six detectors below work and are
|
|
47
|
+
> covered by tests, but interfaces may still move before 1.0. Feedback welcome.
|
|
48
|
+
|
|
49
|
+
<!-- TODO(screenshots): a real terminal capture of `loghunter ~/zeek` and a `digest` card go here. -->
|
|
50
|
+
|
|
51
|
+
A run opens with a summary banner — what was loaded, and which technique each detector
|
|
52
|
+
used — then groups findings by detector (illustrative output; addresses are
|
|
53
|
+
[RFC 5737](https://datatracker.ietf.org/doc/html/rfc5737) documentation space):
|
|
54
|
+
|
|
55
|
+
```
|
|
56
|
+
LogHunter · Threat Hunt
|
|
57
|
+
══════════════════════════════════════════════════════════════════════════════
|
|
58
|
+
Data found: 2026-05-31 00:00 → 2026-06-01 00:00 (24h)
|
|
59
|
+
Records: 1,284,402 conn.log · 318,221 dns.log · 44,019 *.log
|
|
60
|
+
Detectors: beacon (FFT) · dns (fast-HDBSCAN) · syslog (drain3) · scan [pattern] · duration [heuristics]
|
|
61
|
+
══════════════════════════════════════════════════════════════════════════════
|
|
62
|
+
|
|
63
|
+
beacon — 2 findings · 1 H 1 M
|
|
64
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
65
|
+
[H] 192.0.2.37 → 198.51.100.20:443/tcp score 0.91 period 60.0s 1,440 conns
|
|
66
|
+
[M] 192.0.2.37 → 198.51.100.61:8443/tcp score 0.74 period 300.0s 288 conns
|
|
67
|
+
|
|
68
|
+
dns — 1 finding · 1 M
|
|
69
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
70
|
+
[M] dga-lookups.example entropy 3.91 14 subdomains cluster -1 (noise)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
The two-tier styling of the `Detectors:` line is deliberate: published techniques glow in
|
|
74
|
+
parentheses — `(FFT)`, `(HDBSCAN)`, `(drain3)` — while honest house methods are plain in
|
|
75
|
+
brackets — `[pattern]`, `[heuristics]`, `[statistical]`. The restraint is the point. A
|
|
76
|
+
heuristic is never dressed up as an algorithm, which is what makes the glow trustworthy.
|
|
77
|
+
|
|
78
|
+
## Quick start
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install loghunter-cli
|
|
82
|
+
|
|
83
|
+
# one-time, detection-driven setup — finds your logs and writes a config
|
|
84
|
+
loghunter init
|
|
85
|
+
|
|
86
|
+
# hunt across everything enabled in your config
|
|
87
|
+
loghunter
|
|
88
|
+
|
|
89
|
+
# or point at a directory / file directly
|
|
90
|
+
loghunter ~/zeek-logs
|
|
91
|
+
loghunter syslog /var/log
|
|
92
|
+
|
|
93
|
+
# orient before you hunt — a fast, factual profile of a single file
|
|
94
|
+
loghunter digest /var/log/messages
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
No config file is required to get started — `loghunter <path>` works against a directory or
|
|
98
|
+
a single file. `loghunter init` just makes it repeatable.
|
|
99
|
+
|
|
100
|
+
## Why use LogHunter?
|
|
101
|
+
|
|
102
|
+
- **It runs where your logs are.** No services, no database, no daemon, no agent to push.
|
|
103
|
+
`pip install`, point it at a directory, get output. The only setup step that exists at all
|
|
104
|
+
is `loghunter init`, and that only writes a config file.
|
|
105
|
+
- **Real methods, made visible.** Beaconing is found with an FFT over connection timing;
|
|
106
|
+
DNS with HDBSCAN clustering over per-query behavior; rare syslog events with drain3
|
|
107
|
+
log-templating plus rarity scoring; CloudTrail with a transparent per-principal z-score
|
|
108
|
+
composite. Every run tells you which technique ran. You can read *why* something was
|
|
109
|
+
surfaced — no black box.
|
|
110
|
+
- **Big-tent ingestion.** One tool reads Zeek (NDJSON *and* TSV, flat *or* date-partitioned
|
|
111
|
+
directories), Pi-hole/dnsmasq, flat RFC 3164 syslog (Debian *and* RHEL/Fedora layouts),
|
|
112
|
+
and CloudTrail. Rotation and `.gz`/`.bz2`/`.xz` compression are handled transparently.
|
|
113
|
+
- **Orient before you hunt.** `loghunter digest FILE` reads a log and reports facts about
|
|
114
|
+
it — time span, top talkers, the shape of the mix — with zero verdicts. It's sonar, not a
|
|
115
|
+
baggage scanner: it tells you what's there so you know where to point the detectors.
|
|
116
|
+
- **Filter before analyze.** A flat-file allowlist suppresses known-good infrastructure
|
|
117
|
+
*before* any detector sees the data, so your noise floor is yours to set and detectors
|
|
118
|
+
never have to know the allowlist exists.
|
|
119
|
+
- **Honest output.** Findings carry a severity, the evidence behind the score, and (with
|
|
120
|
+
`-v`/`-vv`) the analyst pivots to chase next. Machine formats (`json`, `csv`, `html`) are
|
|
121
|
+
lossless; the terminal view is the one that summarizes.
|
|
122
|
+
|
|
123
|
+
## Why *not* use LogHunter?
|
|
124
|
+
|
|
125
|
+
- **It is not real-time and not a SIEM.** It runs over logs at rest, in batches. There's no
|
|
126
|
+
streaming, no alerting pipeline, no live correlation across sources at scale. If you need an
|
|
127
|
+
always-on detection platform, you need a SIEM; LogHunter is the workbench you reach for to
|
|
128
|
+
*hunt*.
|
|
129
|
+
- **It is stateless between runs.** There's no persisted baseline and no rolling history.
|
|
130
|
+
CloudTrail "first-seen" novelty, for example, is relative to the window you loaded — not to
|
|
131
|
+
all of recorded time.
|
|
132
|
+
- **Detector coverage is v1.** Six detectors ship today (below). `auth`, `ssl`, `protocol`,
|
|
133
|
+
and `weird` are planned but not built.
|
|
134
|
+
- **The richest network signal wants Zeek.** Pi-hole/dnsmasq gives you DNS only — no RTT,
|
|
135
|
+
TTL, or connection correlation. LogHunter will tell you so and keep working, but Zeek is
|
|
136
|
+
where it shines.
|
|
137
|
+
- **It surfaces, it doesn't block.** This is a tool for a human triaging behavior, not a
|
|
138
|
+
signature IDS or an enforcement point.
|
|
139
|
+
|
|
140
|
+
## What it hunts
|
|
141
|
+
|
|
142
|
+
| Detector | Surfaces | Method | Source |
|
|
143
|
+
|-----------|-----------------------------------------------------|------------------------------|--------------------------------|
|
|
144
|
+
| `beacon` | periodic C2-style callbacks | FFT over connection timing | Zeek `conn.log` |
|
|
145
|
+
| `dns` | DGA / tunneling / anomalous lookups | HDBSCAN clustering | Zeek `dns.log` **or** Pi-hole |
|
|
146
|
+
| `syslog` | rare events & reboots | drain3 templating + rarity | syslog (flat) **or** Zeek `syslog.log` |
|
|
147
|
+
| `scan` | vertical / horizontal / block / slow port scans | pattern (heuristic) | Zeek `conn.log` |
|
|
148
|
+
| `duration`| abnormally long-lived connections | heuristics | Zeek `conn.log` |
|
|
149
|
+
| `aws` | per-principal anomalous CloudTrail behavior | statistical (z-score composite) | CloudTrail `*.json` |
|
|
150
|
+
|
|
151
|
+
`dns` and `syslog` each answer **one** question across **two** source families — Zeek and
|
|
152
|
+
Pi-hole for DNS, flat rsyslog and Zeek's own `syslog.log` for syslog — and adapt to whichever
|
|
153
|
+
fidelity they're handed.
|
|
154
|
+
|
|
155
|
+
Run them all (`loghunter`), select some (`loghunter --detect=beacon,dns`), or exclude
|
|
156
|
+
(`loghunter --detect='all,!syslog'`). Each detector is also its own subcommand:
|
|
157
|
+
`loghunter beacon ~/zeek`.
|
|
158
|
+
|
|
159
|
+
## How a run works
|
|
160
|
+
|
|
161
|
+
```
|
|
162
|
+
discover & parse → allowlist (suppress) → detect → render
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
Responsibilities don't bleed across that line. The **loader** finds files, decompresses,
|
|
166
|
+
normalizes every connection source to one canonical schema, and absorbs storage variation
|
|
167
|
+
(TSV vs. NDJSON, flat vs. dated directories, rotation). The **allowlist** suppresses
|
|
168
|
+
known-good traffic *before* analysis. **Detectors** only analyze — they never open files,
|
|
169
|
+
read config, or suppress. **Output handlers** only render. The CLI is the one place that
|
|
170
|
+
turns an error into an actionable message and owns the exit code.
|
|
171
|
+
|
|
172
|
+
Because detectors are pure analysis, every one is importable and callable as an ordinary
|
|
173
|
+
Python function — useful in a notebook when you want to experiment.
|
|
174
|
+
|
|
175
|
+
### Analysis window
|
|
176
|
+
|
|
177
|
+
Pointed at a **directory**, an unqualified run looks back over the last `default_window`
|
|
178
|
+
(`1d` out of the box) of *that source's own* data — the right default for a live log dir
|
|
179
|
+
you don't want to read in full every time. Pointed at a **single file**, it reads the whole
|
|
180
|
+
file. Override either way:
|
|
181
|
+
|
|
182
|
+
```bash
|
|
183
|
+
loghunter --since=7d ~/zeek # last 7 days
|
|
184
|
+
loghunter --since=2026-05-01 --until=2026-05-08 ~/zeek
|
|
185
|
+
loghunter --days=2-4 ~/zeek # 2 to 4 days ago
|
|
186
|
+
loghunter --all ~/zeek # the entire archive
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
CloudTrail is the one source that opts out of the default window — novelty detection needs
|
|
190
|
+
full history, so it always loads in full unless you narrow it explicitly.
|
|
191
|
+
|
|
192
|
+
## Orient before the hunt: `digest`
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
loghunter digest /var/log/messages
|
|
196
|
+
loghunter digest conn.log dns.log # several files → several cards
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
`digest` content-sniffs each file, routes it to the right summarizer (conn, dns, syslog,
|
|
200
|
+
cloudtrail), and falls back to a fast byte-profiler — **blob** — for anything it doesn't
|
|
201
|
+
recognize. A card is flush-left and factual: the file's time window, line count and size, a
|
|
202
|
+
scale-anchored histogram, and a handful of plain-language insights ("one client accounts for
|
|
203
|
+
71% of queries"). It states facts and superlatives, never verdicts — no "suspicious," no
|
|
204
|
+
"anomalous." It reads your data *before* the allowlist, because everything in the file,
|
|
205
|
+
allowlisted or not, is part of "what's in here." The blob profiler is bounded: it samples a
|
|
206
|
+
big file rather than reading it, so a one-gigabyte mystery file costs the same as a
|
|
207
|
+
one-kilobyte one.
|
|
208
|
+
|
|
209
|
+
## Installation
|
|
210
|
+
|
|
211
|
+
LogHunter is published on PyPI as **`loghunter-cli`** (the command, import package, and config
|
|
212
|
+
section are all `loghunter`).
|
|
213
|
+
|
|
214
|
+
```bash
|
|
215
|
+
pip install loghunter-cli # core
|
|
216
|
+
pip install 'loghunter-cli[fast]' # fast-hdbscan accelerator for DNS clustering
|
|
217
|
+
pip install 'loghunter-cli[splunk]' # Splunk exporter
|
|
218
|
+
pip install 'loghunter-cli[cloudtrail]' # CloudTrail (S3) exporter
|
|
219
|
+
pip install 'loghunter-cli[all]' # everything above
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Requires **Python 3.11+**. A bare `pip install loghunter-cli` always works — the DNS clustering
|
|
223
|
+
runs on stock `hdbscan` (a base dependency); `[fast]` swaps in a numba-accelerated backend
|
|
224
|
+
when you want it, and the tool tells you which one is active on every run.
|
|
225
|
+
|
|
226
|
+
From source:
|
|
227
|
+
|
|
228
|
+
```bash
|
|
229
|
+
git clone https://github.com/spiralbend/loghunter
|
|
230
|
+
cd loghunter
|
|
231
|
+
pip install -e '.[all]'
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## Configuration
|
|
235
|
+
|
|
236
|
+
Configuration is optional — LogHunter runs against a path with none. When you want it
|
|
237
|
+
repeatable, `loghunter init` looks at the conventional locations on your box, profiles what
|
|
238
|
+
it finds (which log families, rough size, freshness — without reading a single log line),
|
|
239
|
+
and writes a fully-annotated `~/.loghunter/config.toml`. It never clobbers settings you
|
|
240
|
+
already have.
|
|
241
|
+
|
|
242
|
+
Config is loaded from the first of:
|
|
243
|
+
|
|
244
|
+
1. `--config=FILE`
|
|
245
|
+
2. `~/.loghunter/config.toml`
|
|
246
|
+
3. `/etc/loghunter/config.toml`
|
|
247
|
+
|
|
248
|
+
Everything LogHunter owns lives under the hidden `~/.loghunter/` — config, allowlists,
|
|
249
|
+
exports, reports — so it can't collide with a project directory. A trimmed example:
|
|
250
|
+
|
|
251
|
+
```toml
|
|
252
|
+
[loghunter]
|
|
253
|
+
detect = "all" # "all" | "dns,beacon" | "all,!syslog"
|
|
254
|
+
zeek_dir = "/var/log/zeek"
|
|
255
|
+
syslog_dir = "/var/log"
|
|
256
|
+
# pihole_dir = "/var/log/pihole"
|
|
257
|
+
# cloudtrail_dir = "/var/log/cloudtrail"
|
|
258
|
+
|
|
259
|
+
home_net = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
|
|
260
|
+
default_window = "1d" # lookback for a directory; "" or "all" = full
|
|
261
|
+
output_format = "text" # text | json | csv | html
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
Findings print to your terminal by default — keep it pipeable. Set `report_dir` (or pass
|
|
265
|
+
`--out=PATH`) to write report files instead. Every tunable a detector exposes is documented
|
|
266
|
+
as a commented "engine room" at the bottom of the generated config; you rarely need it, and
|
|
267
|
+
`loghunter <detector> --help` lists the full surface.
|
|
268
|
+
|
|
269
|
+
## Log sources it speaks
|
|
270
|
+
|
|
271
|
+
- **Zeek** — `conn.log`, `dns.log`, `syslog.log`, in NDJSON or TSV, from a flat directory or
|
|
272
|
+
date-partitioned subdirectories. Rotation and gzip/bzip2/xz compression are transparent.
|
|
273
|
+
- **Pi-hole / dnsmasq** — DNS event logs, aggregated per domain for clustering.
|
|
274
|
+
- **syslog** — flat RFC 3164. Discovery is content-sniffed, not filename-matched, so it
|
|
275
|
+
handles both the Debian convention (`syslog`, `auth.log`, `kern.log`) and the RHEL/Fedora
|
|
276
|
+
one (extensionless `messages`, `secure`, `maillog`) — and won't mistake `dnf.log` or a
|
|
277
|
+
binary like `wtmp` for a log stream.
|
|
278
|
+
- **CloudTrail** — gzipped JSON event records, read locally or pulled from S3 (below).
|
|
279
|
+
|
|
280
|
+
## The allowlist
|
|
281
|
+
|
|
282
|
+
Two kinds of allowlist file, never conflated:
|
|
283
|
+
|
|
284
|
+
- **Flat files = suppression.** One rule per line — an IP, a CIDR, a `:port/proto`, or a
|
|
285
|
+
domain glob/regex. Matching traffic is dropped before any detector runs. LogHunter ships a
|
|
286
|
+
curated domain list and never ships numeric connection suppressions (those depend on your
|
|
287
|
+
hosts, and shipping them could hide real findings).
|
|
288
|
+
- **TOML stanzas = classification.** When a detector needs to know *what* something is
|
|
289
|
+
(a nameserver, a backup client) rather than whether to drop it.
|
|
290
|
+
|
|
291
|
+
A bare host IP with no port suppresses *all* traffic involving that host — powerful, and
|
|
292
|
+
called out as such wherever it appears.
|
|
293
|
+
|
|
294
|
+
## Pulling logs in: exporters
|
|
295
|
+
|
|
296
|
+
LogHunter can fetch logs from external systems to local files, which it then analyzes like
|
|
297
|
+
any other source — the syslog detector can't tell whether the data came from rsyslog or a
|
|
298
|
+
Splunk export.
|
|
299
|
+
|
|
300
|
+
```bash
|
|
301
|
+
loghunter export # run the configured "default" query
|
|
302
|
+
loghunter export auth # run a named query
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
- **Splunk** — named SPL queries under `[export.splunk.query.<name>]`. Prefer the
|
|
306
|
+
`LOGHUNTER_SPLUNK_USER` / `LOGHUNTER_SPLUNK_PASS` environment variables over plaintext
|
|
307
|
+
credentials in config.
|
|
308
|
+
- **CloudTrail** — pulls gzipped JSON from an S3 prefix. AWS authentication is *not* handled
|
|
309
|
+
here: you authenticate your shell, and boto3 resolves the ambient credential chain.
|
|
310
|
+
LogHunter never reads, stores, or prompts for AWS credentials, and warns before a large
|
|
311
|
+
egress.
|
|
312
|
+
|
|
313
|
+
## Output formats
|
|
314
|
+
|
|
315
|
+
`text` (default, grouped and summarized), `json` (one finding per line, pipeable), `csv`
|
|
316
|
+
(flattened), and `html` (a self-contained file). Pass `--output=json` or set `output_format`
|
|
317
|
+
in config. `-v` adds the curated "why it scored" detail; `-vv` adds raw debug — template
|
|
318
|
+
strings, cluster membership, full evidence. Color is enhancement-only and TTY-gated: piped
|
|
319
|
+
or redirected output is always plain, and the machine formats never emit an escape code.
|
|
320
|
+
|
|
321
|
+
## Building from source & running tests
|
|
322
|
+
|
|
323
|
+
```bash
|
|
324
|
+
git clone https://github.com/spiralbend/loghunter
|
|
325
|
+
cd loghunter
|
|
326
|
+
pip install -e '.[all]'
|
|
327
|
+
python -m pytest
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
`main` is kept runnable. Architecture tests cover the boundaries that matter — detector
|
|
331
|
+
discovery, run planning, loader metadata, allowlist suppression, output registration, and
|
|
332
|
+
CLI error formatting.
|
|
333
|
+
|
|
334
|
+
## License
|
|
335
|
+
|
|
336
|
+
LogHunter is licensed under the [MIT License](LICENSE).
|
|
@@ -0,0 +1,307 @@
|
|
|
1
|
+
# LogHunter
|
|
2
|
+
|
|
3
|
+
LogHunter is a local-first command-line threat-hunting workbench for self-hosters. You
|
|
4
|
+
point it at the logs you already have — Zeek, Pi-hole/dnsmasq, syslog, CloudTrail — and it
|
|
5
|
+
tells you what's in them and runs transparent detectors over them: beaconing, suspicious
|
|
6
|
+
DNS, port scans, rare syslog events, abnormally long connections, and unusual CloudTrail
|
|
7
|
+
activity. Every run names the technique behind each detector, so you always know whether a
|
|
8
|
+
finding came from a published algorithm or an honest heuristic.
|
|
9
|
+
|
|
10
|
+
**Not a SIEM. Not an agent. Not magic.** Nothing to deploy, no database, no daemon, no
|
|
11
|
+
account. Install it, point it at a directory of logs, read the output. It runs on the
|
|
12
|
+
admin's own box, over logs at rest.
|
|
13
|
+
|
|
14
|
+
[License: MIT](#license)
|
|
15
|
+

|
|
16
|
+
|
|
17
|
+
> **Status: early / pre-1.0 (`0.1.0.dev0`).** The six detectors below work and are
|
|
18
|
+
> covered by tests, but interfaces may still move before 1.0. Feedback welcome.
|
|
19
|
+
|
|
20
|
+
<!-- TODO(screenshots): a real terminal capture of `loghunter ~/zeek` and a `digest` card go here. -->
|
|
21
|
+
|
|
22
|
+
A run opens with a summary banner — what was loaded, and which technique each detector
|
|
23
|
+
used — then groups findings by detector (illustrative output; addresses are
|
|
24
|
+
[RFC 5737](https://datatracker.ietf.org/doc/html/rfc5737) documentation space):
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
LogHunter · Threat Hunt
|
|
28
|
+
══════════════════════════════════════════════════════════════════════════════
|
|
29
|
+
Data found: 2026-05-31 00:00 → 2026-06-01 00:00 (24h)
|
|
30
|
+
Records: 1,284,402 conn.log · 318,221 dns.log · 44,019 *.log
|
|
31
|
+
Detectors: beacon (FFT) · dns (fast-HDBSCAN) · syslog (drain3) · scan [pattern] · duration [heuristics]
|
|
32
|
+
══════════════════════════════════════════════════════════════════════════════
|
|
33
|
+
|
|
34
|
+
beacon — 2 findings · 1 H 1 M
|
|
35
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
36
|
+
[H] 192.0.2.37 → 198.51.100.20:443/tcp score 0.91 period 60.0s 1,440 conns
|
|
37
|
+
[M] 192.0.2.37 → 198.51.100.61:8443/tcp score 0.74 period 300.0s 288 conns
|
|
38
|
+
|
|
39
|
+
dns — 1 finding · 1 M
|
|
40
|
+
────────────────────────────────────────────────────────────────────────────────
|
|
41
|
+
[M] dga-lookups.example entropy 3.91 14 subdomains cluster -1 (noise)
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
The two-tier styling of the `Detectors:` line is deliberate: published techniques glow in
|
|
45
|
+
parentheses — `(FFT)`, `(HDBSCAN)`, `(drain3)` — while honest house methods are plain in
|
|
46
|
+
brackets — `[pattern]`, `[heuristics]`, `[statistical]`. The restraint is the point. A
|
|
47
|
+
heuristic is never dressed up as an algorithm, which is what makes the glow trustworthy.
|
|
48
|
+
|
|
49
|
+
## Quick start
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install loghunter-cli
|
|
53
|
+
|
|
54
|
+
# one-time, detection-driven setup — finds your logs and writes a config
|
|
55
|
+
loghunter init
|
|
56
|
+
|
|
57
|
+
# hunt across everything enabled in your config
|
|
58
|
+
loghunter
|
|
59
|
+
|
|
60
|
+
# or point at a directory / file directly
|
|
61
|
+
loghunter ~/zeek-logs
|
|
62
|
+
loghunter syslog /var/log
|
|
63
|
+
|
|
64
|
+
# orient before you hunt — a fast, factual profile of a single file
|
|
65
|
+
loghunter digest /var/log/messages
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
No config file is required to get started — `loghunter <path>` works against a directory or
|
|
69
|
+
a single file. `loghunter init` just makes it repeatable.
|
|
70
|
+
|
|
71
|
+
## Why use LogHunter?
|
|
72
|
+
|
|
73
|
+
- **It runs where your logs are.** No services, no database, no daemon, no agent to push.
|
|
74
|
+
`pip install`, point it at a directory, get output. The only setup step that exists at all
|
|
75
|
+
is `loghunter init`, and that only writes a config file.
|
|
76
|
+
- **Real methods, made visible.** Beaconing is found with an FFT over connection timing;
|
|
77
|
+
DNS with HDBSCAN clustering over per-query behavior; rare syslog events with drain3
|
|
78
|
+
log-templating plus rarity scoring; CloudTrail with a transparent per-principal z-score
|
|
79
|
+
composite. Every run tells you which technique ran. You can read *why* something was
|
|
80
|
+
surfaced — no black box.
|
|
81
|
+
- **Big-tent ingestion.** One tool reads Zeek (NDJSON *and* TSV, flat *or* date-partitioned
|
|
82
|
+
directories), Pi-hole/dnsmasq, flat RFC 3164 syslog (Debian *and* RHEL/Fedora layouts),
|
|
83
|
+
and CloudTrail. Rotation and `.gz`/`.bz2`/`.xz` compression are handled transparently.
|
|
84
|
+
- **Orient before you hunt.** `loghunter digest FILE` reads a log and reports facts about
|
|
85
|
+
it — time span, top talkers, the shape of the mix — with zero verdicts. It's sonar, not a
|
|
86
|
+
baggage scanner: it tells you what's there so you know where to point the detectors.
|
|
87
|
+
- **Filter before you analyze.** A flat-file allowlist suppresses known-good infrastructure
|
|
88
|
+
*before* any detector sees the data, so your noise floor is yours to set and detectors
|
|
89
|
+
never have to know the allowlist exists.
|
|
90
|
+
- **Honest output.** Findings carry a severity, the evidence behind the score, and (with
|
|
91
|
+
`-v`/`-vv`) the analyst pivots to chase next. Machine formats (`json`, `csv`, `html`) are
|
|
92
|
+
lossless; the terminal view is the one that summarizes.
|
|
93
|
+
|
|
94
|
+
## Why *not* use LogHunter?
|
|
95
|
+
|
|
96
|
+
- **It is not real-time and not a SIEM.** It runs over logs at rest, in batches. There's no
|
|
97
|
+
streaming, no alerting pipeline, no live correlation across sources at scale. If you need an
|
|
98
|
+
always-on detection platform, you need a SIEM; LogHunter is the workbench you reach for to
|
|
99
|
+
*hunt*.
|
|
100
|
+
- **It is stateless between runs.** There's no persisted baseline and no rolling history.
|
|
101
|
+
CloudTrail "first-seen" novelty, for example, is relative to the window you loaded — not to
|
|
102
|
+
all of recorded time.
|
|
103
|
+
- **Detector coverage is v1.** Six detectors ship today (below). `auth`, `ssl`, `protocol`,
|
|
104
|
+
and `weird` are planned but not built.
|
|
105
|
+
- **The richest network signal wants Zeek.** Pi-hole/dnsmasq gives you DNS only — no RTT,
|
|
106
|
+
TTL, or connection correlation. LogHunter will tell you so and keep working, but Zeek is
|
|
107
|
+
where it shines.
|
|
108
|
+
- **It surfaces, it doesn't block.** This is a tool for a human triaging behavior, not a
|
|
109
|
+
signature IDS or an enforcement point.
|
|
110
|
+
|
|
111
|
+
## What it hunts
|
|
112
|
+
|
|
113
|
+
| Detector | Surfaces | Method | Source |
|
|
114
|
+
|-----------|-----------------------------------------------------|------------------------------|--------------------------------|
|
|
115
|
+
| `beacon` | periodic C2-style callbacks | FFT over connection timing | Zeek `conn.log` |
|
|
116
|
+
| `dns` | DGA / tunneling / anomalous lookups | HDBSCAN clustering | Zeek `dns.log` **or** Pi-hole |
|
|
117
|
+
| `syslog` | rare events & reboots | drain3 templating + rarity | syslog (flat) **or** Zeek `syslog.log` |
|
|
118
|
+
| `scan` | vertical / horizontal / block / slow port scans | pattern (heuristic) | Zeek `conn.log` |
|
|
119
|
+
| `duration`| abnormally long-lived connections | heuristics | Zeek `conn.log` |
|
|
120
|
+
| `aws` | per-principal anomalous CloudTrail behavior | statistical (z-score composite) | CloudTrail `*.json` |
|
|
121
|
+
|
|
122
|
+
`dns` and `syslog` each answer **one** question across **two** source families — Zeek and
|
|
123
|
+
Pi-hole for DNS, flat rsyslog and Zeek's own `syslog.log` for syslog — and adapt to whichever
|
|
124
|
+
fidelity they're handed.
|
|
125
|
+
|
|
126
|
+
Run them all (`loghunter`), select some (`loghunter --detect=beacon,dns`), or exclude
|
|
127
|
+
(`loghunter --detect='all,!syslog'`). Each detector is also its own subcommand:
|
|
128
|
+
`loghunter beacon ~/zeek`.
|
|
129
|
+
|
|
130
|
+
## How a run works
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
discover & parse → allowlist (suppress) → detect → render
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Responsibilities don't bleed across those stage boundaries. The **loader** finds files, decompresses,
|
|
137
|
+
normalizes every connection source to one canonical schema, and absorbs storage variation
|
|
138
|
+
(TSV vs. NDJSON, flat vs. dated directories, rotation). The **allowlist** suppresses
|
|
139
|
+
known-good traffic *before* analysis. **Detectors** only analyze — they never open files,
|
|
140
|
+
read config, or suppress. **Output handlers** only render. The CLI is the one place that
|
|
141
|
+
turns an error into an actionable message and owns the exit code.
|
|
142
|
+
|
|
143
|
+
Because detectors are pure analysis, every one is importable and callable as an ordinary
|
|
144
|
+
Python function — useful in a notebook when you want to experiment.
|
|
145
|
+
|
|
146
|
+
### Analysis window
|
|
147
|
+
|
|
148
|
+
Pointed at a **directory**, an unqualified run looks back over the last `default_window`
|
|
149
|
+
(`1d` out of the box) of *that source's own* data — the right default for a live log dir
|
|
150
|
+
you don't want to read in full every time. Pointed at a **single file**, it reads the whole
|
|
151
|
+
file. Override either way:
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
loghunter --since=7d ~/zeek # last 7 days
|
|
155
|
+
loghunter --since=2026-05-01 --until=2026-05-08 ~/zeek
|
|
156
|
+
loghunter --days=2-4 ~/zeek # 2 to 4 days ago
|
|
157
|
+
loghunter --all ~/zeek # the entire archive
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
CloudTrail is the one source that opts out of the default window — novelty detection needs
|
|
161
|
+
full history, so it always loads in full unless you narrow it explicitly.
|
|
162
|
+
|
|
163
|
+
## Orient before the hunt: `digest`
|
|
164
|
+
|
|
165
|
+
```bash
|
|
166
|
+
loghunter digest /var/log/messages
|
|
167
|
+
loghunter digest conn.log dns.log # several files → several cards
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
`digest` content-sniffs each file, routes it to the right summarizer (conn, dns, syslog,
|
|
171
|
+
cloudtrail), and falls back to a fast byte-profiler — **blob** — for anything it doesn't
|
|
172
|
+
recognize. A card is flush-left and factual: the file's time window, line count and size, a
|
|
173
|
+
scale-anchored histogram, and a handful of plain-language insights ("one client accounts for
|
|
174
|
+
71% of queries"). It states facts and superlatives, never verdicts — no "suspicious," no
|
|
175
|
+
"anomalous." It reads your data *before* the allowlist, because everything in the file,
|
|
176
|
+
allowlisted or not, is part of "what's in here." The blob profiler is bounded: it samples a
|
|
177
|
+
big file rather than reading it, so a one-gigabyte mystery file costs the same as a
|
|
178
|
+
one-kilobyte one.
|
|
179
|
+
|
|
180
|
+
## Installation
|
|
181
|
+
|
|
182
|
+
LogHunter is published on PyPI as **`loghunter-cli`** (the command, import package, and config
|
|
183
|
+
section are all `loghunter`).
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
pip install loghunter-cli # core
|
|
187
|
+
pip install 'loghunter-cli[fast]' # fast-hdbscan accelerator for DNS clustering
|
|
188
|
+
pip install 'loghunter-cli[splunk]' # Splunk exporter
|
|
189
|
+
pip install 'loghunter-cli[cloudtrail]' # CloudTrail (S3) exporter
|
|
190
|
+
pip install 'loghunter-cli[all]' # everything above
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
Requires **Python 3.11+**. A bare `pip install loghunter-cli` always works — the DNS clustering
|
|
194
|
+
runs on stock `hdbscan` (a base dependency); `[fast]` swaps in a numba-accelerated backend
|
|
195
|
+
when you want it, and the tool tells you which one is active on every run.
|
|
196
|
+
|
|
197
|
+
From source:
|
|
198
|
+
|
|
199
|
+
```bash
|
|
200
|
+
git clone https://github.com/spiralbend/loghunter
|
|
201
|
+
cd loghunter
|
|
202
|
+
pip install -e '.[all]'
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
## Configuration
|
|
206
|
+
|
|
207
|
+
Configuration is optional — LogHunter runs against a path with none. When you want it
|
|
208
|
+
repeatable, `loghunter init` looks at the conventional locations on your box, profiles what
|
|
209
|
+
it finds (which log families, rough size, freshness — without reading a single log line),
|
|
210
|
+
and writes a fully-annotated `~/.loghunter/config.toml`. It never clobbers settings you
|
|
211
|
+
already have.
|
|
212
|
+
|
|
213
|
+
Config is loaded from the first of these that applies:
|
|
214
|
+
|
|
215
|
+
1. `--config=FILE`
|
|
216
|
+
2. `~/.loghunter/config.toml`
|
|
217
|
+
3. `/etc/loghunter/config.toml`
|
|
218
|
+
|
|
219
|
+
Everything LogHunter owns lives under the hidden `~/.loghunter/` — config, allowlists,
|
|
220
|
+
exports, reports — so it can't collide with a project directory. A trimmed example:
|
|
221
|
+
|
|
222
|
+
```toml
|
|
223
|
+
[loghunter]
|
|
224
|
+
detect = "all" # "all" | "dns,beacon" | "all,!syslog"
|
|
225
|
+
zeek_dir = "/var/log/zeek"
|
|
226
|
+
syslog_dir = "/var/log"
|
|
227
|
+
# pihole_dir = "/var/log/pihole"
|
|
228
|
+
# cloudtrail_dir = "/var/log/cloudtrail"
|
|
229
|
+
|
|
230
|
+
home_net = ["10.0.0.0/8", "172.16.0.0/12", "192.168.0.0/16"]
|
|
231
|
+
default_window = "1d" # lookback for a directory; "" or "all" = full
|
|
232
|
+
output_format = "text" # text | json | csv | html
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
Findings print to your terminal by default, which keeps the output pipeable. Set `report_dir` (or pass
|
|
236
|
+
`--out=PATH`) to write report files instead. Every tunable a detector exposes is documented
|
|
237
|
+
as a commented "engine room" at the bottom of the generated config; you rarely need it, and
|
|
238
|
+
`loghunter <detector> --help` lists the full surface.
|
|
239
|
+
|
|
240
|
+
## Log sources it speaks
|
|
241
|
+
|
|
242
|
+
- **Zeek** — `conn.log`, `dns.log`, `syslog.log`, in NDJSON or TSV, from a flat directory or
|
|
243
|
+
date-partitioned subdirectories. Rotation and gzip/bzip2/xz compression are transparent.
|
|
244
|
+
- **Pi-hole / dnsmasq** — DNS event logs, aggregated per domain for clustering.
|
|
245
|
+
- **syslog** — flat RFC 3164. Discovery is content-sniffed, not filename-matched, so it
|
|
246
|
+
handles both the Debian convention (`syslog`, `auth.log`, `kern.log`) and the RHEL/Fedora
|
|
247
|
+
one (extensionless `messages`, `secure`, `maillog`) — and won't mistake `dnf.log` or a
|
|
248
|
+
binary like `wtmp` for a log stream.
|
|
249
|
+
- **CloudTrail** — gzipped JSON event records, read locally or pulled from S3 (below).
|
|
250
|
+
|
|
251
|
+
## The allowlist
|
|
252
|
+
|
|
253
|
+
Two kinds of allowlist file, never conflated:
|
|
254
|
+
|
|
255
|
+
- **Flat files = suppression.** One rule per line — an IP, a CIDR, a `:port/proto`, or a
|
|
256
|
+
domain glob/regex. Matching traffic is dropped before any detector runs. LogHunter ships a
|
|
257
|
+
curated domain list and never ships numeric connection suppressions (those depend on your
|
|
258
|
+
hosts, and shipping them could hide real findings).
|
|
259
|
+
- **TOML stanzas = classification.** When a detector needs to know *what* something is
|
|
260
|
+
(a nameserver, a backup client) rather than whether to drop it.
|
|
261
|
+
|
|
262
|
+
A bare host IP with no port suppresses *all* traffic involving that host — powerful, and
|
|
263
|
+
called out as such wherever it appears.
|
|
264
|
+
|
|
265
|
+
## Pulling logs in: exporters
|
|
266
|
+
|
|
267
|
+
LogHunter can fetch logs from external systems to local files, which it then analyzes like
|
|
268
|
+
any other source — the syslog detector can't tell whether the data came from rsyslog or a
|
|
269
|
+
Splunk export.
|
|
270
|
+
|
|
271
|
+
```bash
|
|
272
|
+
loghunter export # run the configured "default" query
|
|
273
|
+
loghunter export auth # run a named query
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
- **Splunk** — named SPL queries under `[export.splunk.query.<name>]`. Prefer the
|
|
277
|
+
`LOGHUNTER_SPLUNK_USER` / `LOGHUNTER_SPLUNK_PASS` environment variables over plaintext
|
|
278
|
+
credentials in config.
|
|
279
|
+
- **CloudTrail** — pulls gzipped JSON from an S3 prefix. AWS authentication is *not* handled
|
|
280
|
+
here: you authenticate your shell, and boto3 resolves the ambient credential chain.
|
|
281
|
+
LogHunter never reads, stores, or prompts for AWS credentials, and warns before a large
|
|
282
|
+
egress.
|
|
283
|
+
|
|
284
|
+
## Output formats
|
|
285
|
+
|
|
286
|
+
`text` (default, grouped and summarized), `json` (one finding per line, pipeable), `csv`
|
|
287
|
+
(flattened), and `html` (a self-contained file). Pass `--output=json` or set `output_format`
|
|
288
|
+
in config. `-v` adds the curated "why it scored" detail; `-vv` adds raw debug — template
|
|
289
|
+
strings, cluster membership, full evidence. Color is enhancement-only and TTY-gated: piped
|
|
290
|
+
or redirected output is always plain, and the machine formats never emit an escape code.
|
|
291
|
+
|
|
292
|
+
## Building from source & running tests
|
|
293
|
+
|
|
294
|
+
```bash
|
|
295
|
+
git clone https://github.com/spiralbend/loghunter
|
|
296
|
+
cd loghunter
|
|
297
|
+
pip install -e '.[all]'
|
|
298
|
+
python -m pytest
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
`main` is kept runnable. Architecture tests cover the boundaries that matter — detector
|
|
302
|
+
discovery, run planning, loader metadata, allowlist suppression, output registration, and
|
|
303
|
+
CLI error formatting.
|
|
304
|
+
|
|
305
|
+
## License
|
|
306
|
+
|
|
307
|
+
LogHunter is licensed under the [MIT License](LICENSE).
|