loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,1010 @@
|
|
|
1
|
+
"""Direct tests for the DNS detector — minimal-schema readiness and feature matrix.
|
|
2
|
+
|
|
3
|
+
All IP addresses use RFC 5737 documentation space: 192.0.2.x, 198.51.100.x.
|
|
4
|
+
All domains are placeholders — no real hostnames or infrastructure.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from datetime import datetime, timezone
|
|
10
|
+
from types import SimpleNamespace
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
import pytest
|
|
15
|
+
|
|
16
|
+
from loghunter.common import clustering
|
|
17
|
+
from loghunter.common.finding import DetectorContext, Finding, Severity
|
|
18
|
+
from loghunter.outputs.text import TextHandler, _partition_dns as _dns_sections
|
|
19
|
+
from loghunter.detectors.dns import (
|
|
20
|
+
DEFAULT_CONFIG,
|
|
21
|
+
_build_features,
|
|
22
|
+
_build_pihole_aggregate,
|
|
23
|
+
_build_pihole_features,
|
|
24
|
+
_enrich_zeek_with_pihole,
|
|
25
|
+
_shared_back_half,
|
|
26
|
+
entropy as dns_entropy,
|
|
27
|
+
run,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@pytest.fixture(autouse=True)
def _in_process_clustering(monkeypatch: pytest.MonkeyPatch) -> None:
    """Switch clustering to its in-process path for every test in this module.

    The process-isolation machinery is covered by its own suite
    (tests/test_clustering_interruptible.py); the detector-logic tests here
    stay in-process so no per-test spawn cost is paid and so mocks installed
    by a test remain visible to the code under test.

    Tests that fake the clusterer must patch
    ``loghunter.common.clustering.HDBSCAN``. Patching a symbol on
    ``loghunter.detectors.dns`` would intercept nothing: the dns detector no
    longer imports HDBSCAN, and the in-process path of
    ``fit_predict_interruptible`` builds ``clustering.HDBSCAN`` directly.
    """
    # monkeypatch restores the original flag automatically at test teardown.
    monkeypatch.setattr(clustering, "_CLUSTERING_ISOLATE_ENABLED", False)
|
|
50
|
+
|
|
51
|
+
_NOW = datetime(2026, 5, 30, tzinfo=timezone.utc)
|
|
52
|
+
_WINDOW = (_NOW, _NOW)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _ctx(df: pd.DataFrame, cfg: dict | None = None) -> DetectorContext:
    """Wrap *df* in a DetectorContext keyed under the ``dns*.log*`` pattern.

    Args:
        df: Frame handed to the detector as its only log source.
        cfg: Detector configuration; a missing or empty value becomes ``{}``.

    Returns:
        A context with no allowlist and the module-level ``_WINDOW``.
    """
    detector_config = cfg if cfg else {}
    dns_logs = {"dns*.log*": df}
    return DetectorContext(
        logs=dns_logs,
        config=detector_config,
        allowlist=None,
        data_window=_WINDOW,
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _fake_extract(query: str) -> SimpleNamespace:
|
|
65
|
+
"""Stable tldextract stub — avoids cache writes; returns plausible attributes."""
|
|
66
|
+
return SimpleNamespace(
|
|
67
|
+
domain="example",
|
|
68
|
+
suffix="com",
|
|
69
|
+
subdomain="",
|
|
70
|
+
top_domain_under_public_suffix="example.com",
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
# ── Test 1 — minimal-schema run() doesn't raise ───────────────────────────────
|
|
75
|
+
|
|
76
|
+
def test_run_minimal_schema_does_not_raise(monkeypatch) -> None:
    """run() accepts a frame holding only ts/src/query and returns a list.

    tldextract is stubbed so no cache file is written (the original notes a
    PermissionError in sandboxes). The clustering config is kept tiny so the
    test exercises the schema path rather than calibration.
    """
    import loghunter.detectors.dns as dns_mod

    monkeypatch.setattr(dns_mod.tldextract, "extract", _fake_extract)

    minimal_events = [
        (1.0, "192.0.2.1", "api.test.example.com"),
        (2.0, "192.0.2.2", "beacon.test.example.net"),
        (3.0, "192.0.2.1", "cdn.example.com"),
        (4.0, "192.0.2.3", "dns.test.example.net"),
    ]
    frame = pd.DataFrame(
        [{"ts": ts, "src": src, "query": query} for ts, src, query in minimal_events]
    )
    small_cluster_cfg = {"min_cluster_size": 2, "min_samples": 1}

    outcome = run(_ctx(frame, cfg=small_cluster_cfg))

    assert isinstance(outcome, list)
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# ── Test 2 — _build_features minimal schema → query-derived only ─────────────
|
|
97
|
+
|
|
98
|
+
def test_build_features_minimal_schema_omits_extended_columns() -> None:
    """A ts/src/query-only frame yields query-derived features and nothing more.

    Extended columns must be dropped from the matrix rather than zero-filled.
    The queries mix .com and .net so that, per the original note,
    pd.get_dummies(drop_first=True) still leaves at least one TLD_ column.
    """
    minimal_events = [
        (1.0, "192.0.2.1", "api.test.example.com"),
        (2.0, "192.0.2.2", "beacon.test.example.net"),
        (3.0, "192.0.2.1", "cdn.example.com"),
        (4.0, "192.0.2.3", "resolve.test.example.net"),
    ]
    frame = pd.DataFrame(
        [{"ts": ts, "src": src, "query": query} for ts, src, query in minimal_events]
    )
    feat = _build_features(frame)

    query_derived = ("qlen", "qparts", "sufflen", "domlen")
    for col in query_derived:
        assert col in feat.columns, f"expected query-derived column {col!r} in feature matrix"

    tld_cols = [name for name in feat.columns if name.startswith("TLD_")]
    assert tld_cols, "expected at least one TLD_ one-hot column"

    extended = ("rtt", "ttl", "rcode", "answer", "tc")
    for col in extended:
        assert col not in feat.columns, (
            f"{col!r} must be absent from feature matrix when not present in input "
            "(drop-from-matrix, not zero-fill)"
        )
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
# ── Test 3 — _build_features extended schema → extended features included ─────
|
|
126
|
+
|
|
127
|
+
def test_build_features_extended_schema_includes_extended_columns() -> None:
    """Extended canonical columns present in the input reach the feature matrix."""
    first_event = dict(
        ts=1.0, src="192.0.2.1", query="api.example.com",
        rtt=0.05, ttl=300.0, rcode=0,
        answer=["198.51.100.1"], tc=0,
    )
    second_event = dict(
        ts=2.0, src="192.0.2.2", query="cdn.example.net",
        rtt=0.03, ttl=60.0, rcode=0,
        answer=["198.51.100.5", "198.51.100.6"], tc=0,
    )
    feat = _build_features(pd.DataFrame([first_event, second_event]))

    extended = ("rtt", "ttl", "rcode", "answer", "tc")
    for col in extended:
        assert col in feat.columns, f"expected extended column {col!r} in feature matrix"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ── Test 4 — Zeek path golden regression ─────────────────────────────────────
|
|
148
|
+
|
|
149
|
+
# Fixture: 6 rows — 1 filtered (single-label), 2 cluster-0, 3 noise.
|
|
150
|
+
# Noise group: a3f7bc19.malware.example + m8x2q9n.malware.example → malware.example group finding.
|
|
151
|
+
# Noise singleton: k8x2m5q7n1p.suspect.example → singleton finding.
|
|
152
|
+
#
|
|
153
|
+
# After the Commit-2 refactor, the ONLY permitted change is the additive
|
|
154
|
+
# evidence key source="zeek". All other keys and values must match exactly.
|
|
155
|
+
# Assert (a) exact final order, (b) every pre-refactor evidence key unchanged,
|
|
156
|
+
# (c) the only new key in post-refactor output is source=="zeek".
|
|
157
|
+
|
|
158
|
+
_REGRESSION_DF = pd.DataFrame([
|
|
159
|
+
{"ts": 1.0, "src": "192.0.2.1", "query": "localhost"}, # single-label → filtered by has_dot
|
|
160
|
+
{"ts": 2.0, "src": "192.0.2.2", "query": "safe.example.com"}, # cluster 0
|
|
161
|
+
{"ts": 3.0, "src": "192.0.2.3", "query": "normal.example.org"}, # cluster 0
|
|
162
|
+
{"ts": 4.0, "src": "192.0.2.1", "query": "a3f7bc19.malware.example"}, # noise → group
|
|
163
|
+
{"ts": 5.0, "src": "192.0.2.2", "query": "m8x2q9n.malware.example"}, # noise → group
|
|
164
|
+
{"ts": 6.0, "src": "192.0.2.1", "query": "k8x2m5q7n1p.suspect.example"}, # noise → singleton
|
|
165
|
+
])
|
|
166
|
+
|
|
167
|
+
# Per-query tldextract results — deterministic, avoids cache writes.
|
|
168
|
+
# localhost included so the degenerate filter path is exercised via has_dot.
|
|
169
|
+
_REGRESSION_EXT = {
|
|
170
|
+
"localhost": SimpleNamespace(domain="localhost", suffix="", subdomain="", top_domain_under_public_suffix=""),
|
|
171
|
+
"safe.example.com": SimpleNamespace(domain="example", suffix="com", subdomain="safe", top_domain_under_public_suffix="example.com"),
|
|
172
|
+
"normal.example.org": SimpleNamespace(domain="example", suffix="org", subdomain="normal", top_domain_under_public_suffix="example.org"),
|
|
173
|
+
"a3f7bc19.malware.example": SimpleNamespace(domain="malware", suffix="example", subdomain="a3f7bc19", top_domain_under_public_suffix="malware.example"),
|
|
174
|
+
"m8x2q9n.malware.example": SimpleNamespace(domain="malware", suffix="example", subdomain="m8x2q9n", top_domain_under_public_suffix="malware.example"),
|
|
175
|
+
"k8x2m5q7n1p.suspect.example": SimpleNamespace(domain="suspect", suffix="example", subdomain="k8x2m5q7n1p", top_domain_under_public_suffix="suspect.example"),
|
|
176
|
+
}
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _regression_fake_extract(q: str) -> SimpleNamespace:
    """Look up the canned extraction for *q* in ``_REGRESSION_EXT``.

    Queries without an entry get a namespace whose four fields are all empty
    strings.
    """
    try:
        return _REGRESSION_EXT[q]
    except KeyError:
        return SimpleNamespace(
            domain="",
            suffix="",
            subdomain="",
            top_domain_under_public_suffix="",
        )
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
# After degenerate filter, dns_df has 5 rows in this order (localhost dropped):
|
|
187
|
+
# 0: safe.example.com, 1: normal.example.org,
|
|
188
|
+
# 2: a3f7bc19.malware.example, 3: m8x2q9n.malware.example, 4: k8x2m5q7n1p.suspect.example
|
|
189
|
+
# FakeHDBSCAN puts rows 0–1 in cluster 0, rows 2–4 as noise.
|
|
190
|
+
class _FakeHDBSCAN:
|
|
191
|
+
def __init__(self, **kwargs): pass
|
|
192
|
+
def fit_predict(self, X: np.ndarray) -> np.ndarray:
|
|
193
|
+
return np.array([0, 0, -1, -1, -1])
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def test_zeek_path_regression(monkeypatch) -> None:
    """Golden check: the Zeek path emits one group finding, then one singleton.

    The only key the Commit-2 refactor may add to a finding's evidence is
    source='zeek'; every key that existed before it must keep its exact value.
    """
    import loghunter.detectors.dns as dns_mod

    monkeypatch.setattr(dns_mod.tldextract, "extract", _regression_fake_extract)
    monkeypatch.setattr(clustering, "HDBSCAN", _FakeHDBSCAN)

    # Expected entropies come from the detector's own entropy() so the test
    # follows that function instead of pinning hardcoded floats.
    group_entropies = (
        dns_entropy("a3f7bc19"),  # subdomain of a3f7bc19.malware.example
        dns_entropy("m8x2q9n"),   # subdomain of m8x2q9n.malware.example
    )
    ent_suspect = dns_entropy("k8x2m5q7n1p")  # subdomain of k8x2m5q7n1p.suspect.example

    detector_config = {
        "min_cluster_size": 5,
        "min_samples": 1,
        "threshold": 1.5,
        "thresh_high_entropy": 1.8,
    }
    ctx = DetectorContext(
        logs={"dns*.log*": _REGRESSION_DF.copy()},
        config=detector_config,
        allowlist=None,
        data_window=(_NOW, _NOW),
    )
    findings = run(ctx)

    def check_evidence(finding, expected, kind):
        """Assert pre-refactor evidence values, the source tag, and no extra keys."""
        for key, wanted in expected.items():
            assert finding.evidence[key] == wanted, (
                f"{kind} evidence[{key!r}]: got {finding.evidence[key]!r}, expected {wanted!r}"
            )
        # source is the ONE permitted additive key in Commit 2
        assert finding.evidence.get("source") == "zeek", (
            f"expected source='zeek', got {finding.evidence.get('source')!r}"
        )
        # The pre-refactor key set is exactly the set of keys in `expected`.
        extra_keys = set(finding.evidence.keys()) - set(expected)
        assert extra_keys == {"source"}, (
            f"only 'source' may be added to {kind} evidence; unexpected new keys: {extra_keys}"
        )

    # ── Exact count ───────────────────────────────────────────────────────────
    assert len(findings) == 2, f"expected 2 findings, got {len(findings)}: {[f.title for f in findings]}"

    # ── Exact order: group first, then the singleton ──────────────────────────
    golden_titles = [
        "malware.example (2 subdomains, entropy ≥ 1.8)",
        "k8x2m5q7n1p.suspect.example",
    ]
    assert [f.title for f in findings] == golden_titles, (
        f"finding order mismatch: {[f.title for f in findings]}"
    )

    grp_f, sng_f = findings

    # ── Group finding ─────────────────────────────────────────────────────────
    # Original fixture note: max_ent < 1.8 so MEDIUM (subdomains score 1.77 and 1.70).
    assert grp_f.severity == Severity.MEDIUM
    check_evidence(
        grp_f,
        {
            "registrable_domain": "malware.example",
            "subdomain_count": 2,
            "max_entropy": round(max(group_entropies), 4),
            "min_entropy": round(min(group_entropies), 4),
            "total_queries": 2,
            "unique_sources": 2,
            "sample_domains": ["a3f7bc19.malware.example", "m8x2q9n.malware.example"],
            "querier_ips": ["192.0.2.1", "192.0.2.2"],
        },
        "group",
    )

    # ── Singleton finding ─────────────────────────────────────────────────────
    # Original fixture note: 1.93 >= 1.8, hence HIGH.
    assert sng_f.severity == Severity.HIGH
    assert sng_f.title == "k8x2m5q7n1p.suspect.example"
    check_evidence(
        sng_f,
        {
            "entropy": round(ent_suspect, 4),
            "query_count": 1,
            "unique_sources": 1,
            "querier_ips": ["192.0.2.1"],
            "rcode_distribution": {},  # no rcode column in fixture
        },
        "singleton",
    )
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# ── Pihole aggregate tests ────────────────────────────────────────────────────
|
|
295
|
+
|
|
296
|
+
def test_pihole_aggregate_produces_per_domain_rows(monkeypatch) -> None:
    """_build_pihole_aggregate yields exactly one row for each distinct query domain."""
    import loghunter.detectors.dns as dns_mod

    def first_label_extract(q):
        # Everything resolves under example.com; the subdomain is the first label.
        return SimpleNamespace(
            domain="example",
            suffix="com",
            subdomain=q.split(".")[0],
            top_domain_under_public_suffix="example.com",
        )

    monkeypatch.setattr(dns_mod.tldextract, "extract", first_label_extract)

    def client_query(domain, client, qtype):
        return {"query": domain, "event_type": "query", "src": client, "qtype": qtype}

    events = [
        # alpha — 3 query events from 2 clients + 1 forwarded
        client_query("alpha.example.com", "192.0.2.1", "A"),
        client_query("alpha.example.com", "192.0.2.1", "A"),
        client_query("alpha.example.com", "192.0.2.2", "A"),
        {"query": "alpha.example.com", "event_type": "forwarded", "src": None, "qtype": None},
        # beta — 2 query events from 1 client
        client_query("beta.example.com", "192.0.2.3", "AAAA"),
        client_query("beta.example.com", "192.0.2.3", "AAAA"),
        # gamma — 4 query events from 3 clients
        client_query("gamma.example.com", "192.0.2.1", "A"),
        client_query("gamma.example.com", "192.0.2.4", "A"),
        client_query("gamma.example.com", "192.0.2.5", "A"),
        client_query("gamma.example.com", "192.0.2.5", "A"),
    ]
    agg = _build_pihole_aggregate(pd.DataFrame(events))

    assert len(agg) == 3, f"expected 3 rows, got {len(agg)}"

    def aggregate_row(domain):
        return agg[agg["query"] == domain].iloc[0]

    alpha = aggregate_row("alpha.example.com")
    assert alpha["query_count"] == 3
    assert alpha["unique_clients"] == 2
    # 1 forwarded / 3 query events
    assert round(float(alpha["forward_ratio"]), 4) == round(1 / 3, 4)

    beta = aggregate_row("beta.example.com")
    assert beta["query_count"] == 2
    assert beta["unique_clients"] == 1

    gamma = aggregate_row("gamma.example.com")
    assert gamma["query_count"] == 4
    assert gamma["unique_clients"] == 3
|
|
337
|
+
|
|
338
|
+
|
|
339
|
+
def test_pihole_blocked_domain_evidence(monkeypatch) -> None:
    """A gravity_blocked event marks the domain was_blocked with a positive block_ratio."""
    import loghunter.detectors.dns as dns_mod

    def blocked_extract(q):
        return SimpleNamespace(
            domain="example",
            suffix="com",
            subdomain="blocked",
            top_domain_under_public_suffix="example.com",
        )

    monkeypatch.setattr(dns_mod.tldextract, "extract", blocked_extract)

    client_query = {"query": "blocked.example.com", "event_type": "query", "src": "192.0.2.1", "qtype": "A"}
    gravity_block = {"query": "blocked.example.com", "event_type": "gravity_blocked", "src": None, "qtype": None}
    events = [dict(client_query), dict(client_query), gravity_block]

    agg = _build_pihole_aggregate(pd.DataFrame(events))

    assert len(agg) == 1
    row = agg.iloc[0]
    assert bool(row["was_blocked"]) is True
    assert row["block_ratio"] > 0
    # block_count=1, total_count=3, block_ratio=1/3
    assert round(float(row["block_ratio"]), 4) == round(1 / 3, 4)
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
def test_block_ratio_union_gravity_and_regex(monkeypatch) -> None:
    """gravity_blocked and regex_blocked events are pooled into one block_count."""
    import loghunter.detectors.dns as dns_mod

    def evil_extract(q):
        return SimpleNamespace(
            domain="example",
            suffix="com",
            subdomain="evil",
            top_domain_under_public_suffix="example.com",
        )

    monkeypatch.setattr(dns_mod.tldextract, "extract", evil_extract)

    # Only client "query" events carry a source address and a query type.
    event_types = ["query", "query", "gravity_blocked", "gravity_blocked", "regex_blocked"]
    events = [
        {
            "query": "evil.example.com",
            "event_type": event_type,
            "src": "192.0.2.1" if event_type == "query" else None,
            "qtype": "A" if event_type == "query" else None,
        }
        for event_type in event_types
    ]
    agg = _build_pihole_aggregate(pd.DataFrame(events))

    assert len(agg) == 1
    row = agg.iloc[0]
    assert int(row["block_count"]) == 3, "gravity_blocked (2) + regex_blocked (1) must sum to 3"
    # total_count=5, block_ratio=3/5=0.6
    assert round(float(row["block_ratio"]), 4) == round(3 / 5, 4)
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def test_block_ratio_not_in_pihole_feature_matrix() -> None:
    """Columns kept only as evidence never show up in the pihole feature matrix."""
    aggregate_columns = (
        "query",
        "query_count", "forward_count", "cache_count",
        "block_count", "special_count", "total_count",
        "unique_clients", "unique_qtypes",
        "querier_ips",
        "qtype_counts",
        "forward_ratio", "cache_ratio",
        "block_ratio", "was_blocked",
        "unique_sources",
    )
    aggregate_values = (
        (
            "sub1.example.com",
            5, 2, 1,
            0, 0, 8,
            2, 1,
            ["192.0.2.1", "192.0.2.2"],
            {"A": 5},
            0.4, 0.2,
            0.0, False,
            2,
        ),
        (
            "sub2.example.net",
            3, 1, 0,
            1, 0, 4,
            1, 2,
            ["192.0.2.3"],
            {"A": 2, "AAAA": 1},
            0.333, 0.0,
            0.25, True,
            1,
        ),
    )
    agg_df = pd.DataFrame(
        [dict(zip(aggregate_columns, values)) for values in aggregate_values]
    )

    feat = _build_pihole_features(agg_df)

    evidence_only = (
        "block_ratio", "was_blocked", "block_count", "forward_count",
        "cache_count", "total_count", "special_count",
    )
    for col in evidence_only:
        assert col not in feat.columns, f"{col!r} must not appear in pihole feature matrix"
|
|
420
|
+
|
|
421
|
+
|
|
422
|
+
# ── Both-mode (Zeek + pihole) tests ──────────────────────────────────────────
|
|
423
|
+
|
|
424
|
+
# Shared fixture for both-mode tests.
|
|
425
|
+
# After degenerate filter, dns_df has 3 rows: safe→cluster 0, normal→cluster 0, noise.
|
|
426
|
+
# FakeHDBSCAN3 assigns [0, 0, -1] so a3f7bc19.example.com is the noise domain.
|
|
427
|
+
_BOTH_MODE_ZEEK_EXT = {
|
|
428
|
+
"safe.example.com": SimpleNamespace(domain="example", suffix="com", subdomain="safe", top_domain_under_public_suffix="example.com"),
|
|
429
|
+
"normal.example.net": SimpleNamespace(domain="example", suffix="net", subdomain="normal", top_domain_under_public_suffix="example.net"),
|
|
430
|
+
"a3f7bc19.example.com": SimpleNamespace(domain="example", suffix="com", subdomain="a3f7bc19", top_domain_under_public_suffix="example.com"),
|
|
431
|
+
}
|
|
432
|
+
|
|
433
|
+
_BOTH_MODE_ZEEK_DF = pd.DataFrame([
|
|
434
|
+
{"ts": 1.0, "src": "192.0.2.1", "query": "safe.example.com"},
|
|
435
|
+
{"ts": 2.0, "src": "192.0.2.2", "query": "normal.example.net"},
|
|
436
|
+
{"ts": 3.0, "src": "192.0.2.1", "query": "a3f7bc19.example.com"},
|
|
437
|
+
])
|
|
438
|
+
|
|
439
|
+
_BOTH_MODE_PIHOLE_DF = pd.DataFrame([
|
|
440
|
+
{"ts": 4.0, "src": None, "query": "a3f7bc19.example.com", "event_type": "gravity_blocked", "qtype": None},
|
|
441
|
+
{"ts": 5.0, "src": "192.0.2.1", "query": "a3f7bc19.example.com", "event_type": "query", "qtype": "A"},
|
|
442
|
+
])
|
|
443
|
+
|
|
444
|
+
|
|
445
|
+
class _FakeHDBSCAN3:
|
|
446
|
+
def __init__(self, **kwargs): pass
|
|
447
|
+
def fit_predict(self, X: np.ndarray) -> np.ndarray:
|
|
448
|
+
return np.array([0, 0, -1])
|
|
449
|
+
|
|
450
|
+
|
|
451
|
+
def test_both_mode_zeek_enriched_with_pihole_block(monkeypatch) -> None:
    """Both-mode: the Zeek noise finding carries pihole block data in its evidence."""
    import loghunter.detectors.dns as dns_mod

    def both_mode_extract(q):
        # Queries outside the fixture get an all-empty extraction.
        unknown = SimpleNamespace(domain="", suffix="", subdomain="", top_domain_under_public_suffix="")
        return _BOTH_MODE_ZEEK_EXT.get(q, unknown)

    monkeypatch.setattr(dns_mod.tldextract, "extract", both_mode_extract)
    monkeypatch.setattr(clustering, "HDBSCAN", _FakeHDBSCAN3)

    log_frames = {
        "dns*.log*": _BOTH_MODE_ZEEK_DF.copy(),
        "pihole*.log*": _BOTH_MODE_PIHOLE_DF.copy(),
    }
    detector_config = {
        "min_cluster_size": 3,
        "min_samples": 1,
        "threshold": 1.5,
        "thresh_high_entropy": 1.8,
    }
    ctx = DetectorContext(
        logs=log_frames,
        config=detector_config,
        allowlist=None,
        data_window=_WINDOW,
    )
    findings = run(ctx)

    assert len(findings) >= 1, "expected at least one finding from both-mode run"
    evidence = findings[0].evidence
    assert evidence["source"] == "zeek"
    assert evidence.get("was_blocked") is True, "noise domain blocked by pihole must carry was_blocked=True"
    assert evidence.get("block_ratio", 0.0) > 0.0, "block_ratio must be > 0 for a blocked domain"
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def test_both_mode_pihole_not_independently_clustered(monkeypatch) -> None:
    """Both-mode: pihole rows only enrich Zeek findings; none is tagged source='pihole'."""
    import loghunter.detectors.dns as dns_mod

    def both_mode_extract(q):
        # Queries outside the fixture get an all-empty extraction.
        unknown = SimpleNamespace(domain="", suffix="", subdomain="", top_domain_under_public_suffix="")
        return _BOTH_MODE_ZEEK_EXT.get(q, unknown)

    monkeypatch.setattr(dns_mod.tldextract, "extract", both_mode_extract)
    monkeypatch.setattr(clustering, "HDBSCAN", _FakeHDBSCAN3)

    log_frames = {
        "dns*.log*": _BOTH_MODE_ZEEK_DF.copy(),
        "pihole*.log*": _BOTH_MODE_PIHOLE_DF.copy(),
    }
    detector_config = {
        "min_cluster_size": 3,
        "min_samples": 1,
        "threshold": 1.5,
        "thresh_high_entropy": 1.8,
    }
    ctx = DetectorContext(
        logs=log_frames,
        config=detector_config,
        allowlist=None,
        data_window=_WINDOW,
    )
    findings = run(ctx)

    pihole_sourced = [f for f in findings if f.evidence.get("source") == "pihole"]
    assert not pihole_sourced, (
        "both-mode must only produce zeek findings — no independent pihole clustering"
    )
|
|
495
|
+
|
|
496
|
+
|
|
497
|
+
# ── Pihole event-type exclusion tests ────────────────────────────────────────
|
|
498
|
+
|
|
499
|
+
def test_excluded_event_types_not_in_aggregation(monkeypatch) -> None:
    """dnssec_query, dhcp and pihole_hostname events add nothing to query_count."""
    import loghunter.detectors.dns as dns_mod

    def target_extract(q):
        return SimpleNamespace(
            domain="example",
            suffix="com",
            subdomain="target",
            top_domain_under_public_suffix="example.com",
        )

    monkeypatch.setattr(dns_mod.tldextract, "extract", target_extract)

    # (event_type, src, qtype) for each log row of target.example.com
    event_specs = [
        # 3 query events that SHOULD count
        ("query", "192.0.2.1", "A"),
        ("query", "192.0.2.1", "A"),
        ("query", "192.0.2.2", "A"),
        # excluded event types
        ("dnssec_query", None, "DS"),
        ("dnssec_query", None, "DNSKEY"),
        ("dhcp", None, None),
        ("pihole_hostname", None, None),
    ]
    events = [
        {"query": "target.example.com", "event_type": event_type, "src": src, "qtype": qtype}
        for event_type, src, qtype in event_specs
    ]
    agg = _build_pihole_aggregate(pd.DataFrame(events))

    assert len(agg) == 1
    only_row = agg.iloc[0]
    assert only_row["query_count"] == 3, "only query events should count in query_count"
|
|
523
|
+
|
|
524
|
+
|
|
525
|
+
def test_special_not_in_cluster_features_but_in_evidence(monkeypatch) -> None:
    """``special`` events are tallied in the aggregate but kept out of the features.

    The aggregate must report ``special_count`` for the name, while the
    feature frame derived from it must not expose a ``special_count`` column.
    """
    import loghunter.detectors.dns as dns_mod

    def _fixed_extract(q):
        # Same parts for every lookup; the test only uses one name.
        return SimpleNamespace(
            domain="example",
            suffix="com",
            subdomain="relay",
            top_domain_under_public_suffix="example.com",
        )

    monkeypatch.setattr(dns_mod.tldextract, "extract", _fixed_extract)

    name = "relay.example.com"
    query_rows = [
        {"query": name, "event_type": "query", "src": client, "qtype": "A"}
        for client in ("192.0.2.1", "192.0.2.1", "192.0.2.2")
    ]
    special_rows = [
        {"query": name, "event_type": "special", "src": None, "qtype": None}
        for _ in range(2)
    ]
    agg = _build_pihole_aggregate(pd.DataFrame(query_rows + special_rows))

    assert len(agg) == 1
    special_total = int(agg.iloc[0]["special_count"])
    assert special_total == 2, "special events must be counted in aggregate"

    feat = _build_pihole_features(agg)
    assert "special_count" not in feat.columns, "special_count must not enter the feature matrix"
|
|
548
|
+
|
|
549
|
+
|
|
550
|
+
# ── Pihole-only end-to-end test ───────────────────────────────────────────────
|
|
551
|
+
|
|
552
|
+
# Canned extraction results for the pihole-only end-to-end test, keyed by the
# full query name that the patched ``tldextract.extract`` receives.
_PIHOLE_ONLY_EXT = {
    "a3f7bc19.sus1.example": SimpleNamespace(
        domain="sus1",
        suffix="example",
        subdomain="a3f7bc19",
        top_domain_under_public_suffix="sus1.example",
    ),
    "m8x2q9n.sus2.example": SimpleNamespace(
        domain="sus2",
        suffix="example",
        subdomain="m8x2q9n",
        top_domain_under_public_suffix="sus2.example",
    ),
    "k8x2m5q7n1p.sus3.example": SimpleNamespace(
        domain="sus3",
        suffix="example",
        subdomain="k8x2m5q7n1p",
        top_domain_under_public_suffix="sus3.example",
    ),
}
|
|
557
|
+
|
|
558
|
+
|
|
559
|
+
def test_pihole_only_run_produces_findings(monkeypatch) -> None:
    """A run fed only pihole logs yields findings, all tagged ``source='pihole'``."""
    import loghunter.detectors.dns as dns_mod

    def _table_extract(q):
        # Known names come from the canned table; anything else gets empty parts.
        blank = SimpleNamespace(domain="", suffix="", subdomain="", top_domain_under_public_suffix="")
        return _PIHOLE_ONLY_EXT.get(q, blank)

    monkeypatch.setattr(dns_mod.tldextract, "extract", _table_extract)

    class _FakeAllNoise:
        """Stand-in clusterer that labels every sample ``-1``."""

        def __init__(self, **kwargs):
            pass

        def fit_predict(self, X: np.ndarray) -> np.ndarray:
            return np.full(len(X), -1, dtype=int)

    monkeypatch.setattr(clustering, "HDBSCAN", _FakeAllNoise)

    # Three queries per canned domain, each from a different client address.
    records = [
        {"query": domain, "event_type": "query", "src": f"192.0.2.{host}", "qtype": "A"}
        for domain in _PIHOLE_ONLY_EXT
        for host in range(1, 4)
    ]
    detector_config = {
        "threshold": 1.5,
        "thresh_high_entropy": 1.8,
        "pihole": {"min_cluster_size": 2, "min_samples": 1},
    }
    ctx = DetectorContext(
        logs={"pihole*.log*": pd.DataFrame(records)},
        config=detector_config,
        allowlist=None,
        data_window=_WINDOW,
    )
    findings = run(ctx)

    assert isinstance(findings, list)
    assert len(findings) >= 1, "pihole-only run should produce at least one finding"
    sources_ok = all(f.evidence.get("source") == "pihole" for f in findings)
    assert sources_ok, (
        "all findings from a pihole-only run must carry source='pihole'"
    )
|
|
597
|
+
|
|
598
|
+
|
|
599
|
+
# ── _shared_back_half grouping ────────────────────────────────────────────────
|
|
600
|
+
|
|
601
|
+
def test_shared_back_half_grouping_consistency() -> None:
    """Two candidates under one registrable domain collapse into a single group finding."""

    def _candidate(query, entropy, querier, count, cache, forward):
        # Key order is kept fixed so the resulting frame's columns stay stable.
        return {
            "query": query,
            "label_entropy": entropy,
            "registrable_domain": "example.com",
            "unique_sources": 1,
            "querier_ips": [querier],
            "source": "pihole",
            "query_count": count,
            "was_blocked": False,
            "block_ratio": 0.0,
            "cache_ratio": cache,
            "forward_ratio": forward,
            "qtype_counts": {"A": count},
            "special_count": 0,
        }

    candidate_df = pd.DataFrame(
        [
            _candidate("a3f7bc19.example.com", 1.77, "192.0.2.1", 5, 0.2, 0.8),
            _candidate("m8x2q9n.example.com", 1.70, "192.0.2.2", 3, 0.3, 0.7),
        ]
    )

    findings = _shared_back_half(
        candidate_df,
        threshold=1.5,
        thresh_high=1.8,
        now=_NOW,
        data_window=_WINDOW,
    )

    assert len(findings) == 1, "two rows with same registrable domain must produce one group finding"
    group_evidence = findings[0].evidence
    assert group_evidence["subdomain_count"] == 2
    assert group_evidence["source"] == "pihole"
|
|
641
|
+
|
|
642
|
+
|
|
643
|
+
def test_pihole_group_qtype_counts_aggregated() -> None:
    """A pihole group finding sums ``qtype_counts`` over every member row."""

    def _candidate(query, entropy, querier, count, cache, forward, qtypes):
        # Key order is kept fixed so the resulting frame's columns stay stable.
        return {
            "query": query,
            "label_entropy": entropy,
            "registrable_domain": "example.com",
            "unique_sources": 1,
            "querier_ips": [querier],
            "source": "pihole",
            "query_count": count,
            "was_blocked": False,
            "block_ratio": 0.0,
            "cache_ratio": cache,
            "forward_ratio": forward,
            "qtype_counts": qtypes,
            "special_count": 0,
        }

    members = [
        _candidate("a3f7bc19.example.com", 1.77, "192.0.2.1", 5, 0.2, 0.8, {"A": 4, "AAAA": 1}),
        _candidate("m8x2q9n.example.com", 1.70, "192.0.2.2", 3, 0.3, 0.7, {"A": 2, "HTTPS": 1}),
    ]
    candidate_df = pd.DataFrame(members)

    findings = _shared_back_half(
        candidate_df,
        threshold=1.5,
        thresh_high=1.8,
        now=_NOW,
        data_window=_WINDOW,
    )

    assert len(findings) == 1
    qtypes = findings[0].evidence.get("qtype_counts")
    assert isinstance(qtypes, dict), "group finding must have qtype_counts dict"
    # "A" appears in both members; the other two types come from one member each.
    assert qtypes.get("A") == 6, f"A count should be 4+2=6, got {qtypes.get('A')}"
    assert qtypes.get("AAAA") == 1, f"AAAA count should be 1, got {qtypes.get('AAAA')}"
    assert qtypes.get("HTTPS") == 1, f"HTTPS count should be 1, got {qtypes.get('HTTPS')}"
|
|
686
|
+
|
|
687
|
+
|
|
688
|
+
# ── Null/invalid query guard ──────────────────────────────────────────────────
|
|
689
|
+
|
|
690
|
+
def test_pihole_aggregate_null_query_rows_ignored(monkeypatch) -> None:
    """Rows whose ``query`` is ``None``, empty, or not a string are left out of the aggregate."""
    import loghunter.detectors.dns as dns_mod

    def _fixed_extract(q):
        # Same parts for every lookup; only one valid name is present.
        return SimpleNamespace(
            domain="example",
            suffix="com",
            subdomain="api",
            top_domain_under_public_suffix="example.com",
        )

    monkeypatch.setattr(dns_mod.tldextract, "extract", _fixed_extract)

    # One valid name followed by the three invalid shapes under test.
    query_values = ("api.example.com", None, "", 123)
    frame = pd.DataFrame(
        [
            {"query": value, "event_type": "query", "src": "192.0.2.1", "qtype": "A"}
            for value in query_values
        ]
    )
    agg = _build_pihole_aggregate(frame)

    assert len(agg) == 1, f"expected 1 valid row, got {len(agg)}"
    survivor = agg.iloc[0]
    assert survivor["query"] == "api.example.com"
|
|
709
|
+
|
|
710
|
+
|
|
711
|
+
# ── Partial pihole config override ───────────────────────────────────────────
|
|
712
|
+
|
|
713
|
+
def test_pihole_cfg_partial_override_keeps_defaults(monkeypatch) -> None:
    """Overriding one ``pihole`` setting leaves the remaining ones at their defaults."""
    import loghunter.detectors.dns as dns_mod

    captured: dict = {}

    def _spy_run_pihole_path(pihole_df: pd.DataFrame, pihole_cfg: dict) -> None:
        # Snapshot the config handed to the pihole path; produce no result.
        captured.update(pihole_cfg=dict(pihole_cfg))

    monkeypatch.setattr(dns_mod, "_run_pihole_path", _spy_run_pihole_path)

    single_query = {
        "ts": 1.0,
        "src": "192.0.2.1",
        "query": "api.example.com",
        "event_type": "query",
        "qtype": "A",
    }
    ctx = DetectorContext(
        logs={"pihole*.log*": pd.DataFrame([single_query])},
        config={"pihole": {"min_samples": 3}},
        allowlist=None,
        data_window=_WINDOW,
    )
    run(ctx)

    assert "pihole_cfg" in captured, "_run_pihole_path was not called"
    seen_cfg = captured["pihole_cfg"]
    default_cluster_size = DEFAULT_CONFIG["pihole"]["min_cluster_size"]
    assert seen_cfg["min_cluster_size"] == default_cluster_size, (
        "min_cluster_size must fall back to DEFAULT_CONFIG when not overridden"
    )
    assert seen_cfg["min_samples"] == 3, "min_samples must be taken from user config"
|
|
741
|
+
|
|
742
|
+
|
|
743
|
+
# ── _enrich_zeek_with_pihole: no-match defaults ───────────────────────────────
|
|
744
|
+
|
|
745
|
+
def test_both_mode_no_pihole_match_block_ratio_is_zero() -> None:
    """Zeek rows with no pihole counterpart get ``block_ratio=0.0`` and ``was_blocked=False``.

    The pihole frame only mentions an unrelated name, so neither Zeek query
    can match; the enriched columns must still exist and hold no NaN.
    """
    zeek_rows = [
        {"ts": 1.0, "src": "192.0.2.1", "query": "sub.example.com"},
        {"ts": 2.0, "src": "192.0.2.2", "query": "api.example.com"},
    ]
    unrelated_pihole_rows = [
        {
            "ts": 3.0,
            "src": None,
            "query": "other.unrelated.example",
            "event_type": "gravity_blocked",
            "qtype": None,
        },
    ]
    dns_df = pd.DataFrame(zeek_rows)
    pihole_df = pd.DataFrame(unrelated_pihole_rows)

    # A copy is handed over, as in the call this test has always made.
    result = _enrich_zeek_with_pihole(dns_df.copy(), pihole_df)

    for column in ("block_ratio", "was_blocked"):
        assert column in result.columns
    ratios = result["block_ratio"]
    assert ratios.eq(0.0).all(), "unmatched domains must have block_ratio=0.0"
    assert result["was_blocked"].eq(False).all(), "unmatched domains must have was_blocked=False"
    assert not ratios.isna().any(), "block_ratio must never be NaN"
|
|
762
|
+
|
|
763
|
+
|
|
764
|
+
# ── Text renderer tests ───────────────────────────────────────────────────────
|
|
765
|
+
|
|
766
|
+
def _make_dns_finding(severity: Severity, title: str, evidence: dict) -> Finding:
    """Build a ``dns`` detector ``Finding`` for the renderer tests.

    Only severity, title and evidence vary between callers; description,
    next steps, generation timestamp and data window are fixed test values.
    """
    fixed_fields = {
        "detector": "dns",
        "description": "test description",
        "next_steps": [],
        "ts_generated": _NOW,
        "data_window": _WINDOW,
    }
    return Finding(severity=severity, title=title, evidence=evidence, **fixed_fields)
|
|
777
|
+
|
|
778
|
+
|
|
779
|
+
def test_text_renderer_pihole_default_shows_blocked() -> None:
    """Default output for a pihole singleton with ``was_blocked=True`` contains ``BLOCKED``."""
    evidence = {
        "source": "pihole",
        "entropy": 1.77,
        "query_count": 5,
        "unique_sources": 2,
        "querier_ips": ["192.0.2.1"],
        "was_blocked": True,
        "block_ratio": 0.5,
        "cache_ratio": 0.2,
        "forward_ratio": 0.3,
        "qtype_counts": {"A": 5},
        "special_count": 0,
    }
    finding = _make_dns_finding(Severity.HIGH, "a3f7bc19.example.com", evidence)

    rendered = TextHandler(verbose_level=0)._render_dns_group(_dns_sections([finding]))
    output = "\n".join(rendered)

    assert "BLOCKED" in output, "was_blocked=True must show BLOCKED token in default output"
|
|
802
|
+
|
|
803
|
+
|
|
804
|
+
def test_text_renderer_pihole_default_no_blocked_marker() -> None:
    """Default output for a pihole singleton with ``was_blocked=False`` has no ``BLOCKED`` token."""
    evidence = {
        "source": "pihole",
        "entropy": 1.55,
        "query_count": 3,
        "unique_sources": 1,
        "querier_ips": ["192.0.2.1"],
        "was_blocked": False,
        "block_ratio": 0.0,
        "cache_ratio": 0.4,
        "forward_ratio": 0.6,
        "qtype_counts": {"A": 3},
        "special_count": 0,
    }
    finding = _make_dns_finding(Severity.MEDIUM, "sub.example.com", evidence)

    rendered = TextHandler(verbose_level=0)._render_dns_group(_dns_sections([finding]))
    output = "\n".join(rendered)

    assert "BLOCKED" not in output, "was_blocked=False must not show BLOCKED token"
|
|
827
|
+
|
|
828
|
+
|
|
829
|
+
def test_text_renderer_pihole_verbose_shows_ratios() -> None:
    """Verbose pihole singleton output lists the block fields and ``qtype_counts``, never rcodes."""
    evidence = {
        "source": "pihole",
        "entropy": 1.77,
        "query_count": 5,
        "unique_sources": 2,
        "querier_ips": ["192.0.2.1"],
        "was_blocked": True,
        "block_ratio": 0.5,
        "cache_ratio": 0.2,
        "forward_ratio": 0.3,
        "qtype_counts": {"A": 5},
        "special_count": 0,
    }
    finding = _make_dns_finding(Severity.HIGH, "a3f7bc19.example.com", evidence)

    rendered = TextHandler(verbose_level=1)._render_dns_group(_dns_sections([finding]))
    output = "\n".join(rendered)

    # The W3 curated tail prints ratios as raw key:value pairs; the uniform
    # evidence block applies no percent formatting (see GLENN notes).
    assert "block_ratio: 0.5" in output, "block_ratio must appear in verbose output"
    assert "was_blocked: True" in output, "was_blocked must appear in verbose output"
    # qtype_counts belongs to the curated subset shown for pihole findings.
    assert "qtype_counts" in output, "qtype_counts must appear in pihole verbose"
    assert "rcode_distribution" not in output, "pihole verbose must not show rcode_distribution"
|
|
859
|
+
|
|
860
|
+
|
|
861
|
+
def test_text_renderer_zeek_default_line_unchanged() -> None:
    """Zeek-only singletons keep the pre-pihole line format, with no BLOCKED column."""

    def _zeek_evidence(entropy, queries, sources, querier):
        # Zeek findings carry an rcode distribution and no pihole block fields.
        return {
            "source": "zeek",
            "entropy": entropy,
            "query_count": queries,
            "unique_sources": sources,
            "querier_ips": [querier],
            "rcode_distribution": {},
        }

    first = _make_dns_finding(
        Severity.HIGH, "sub.example.com", _zeek_evidence(2.10, 5, 2, "192.0.2.1")
    )
    second = _make_dns_finding(
        Severity.HIGH, "api.example.net", _zeek_evidence(1.92, 3, 1, "192.0.2.2")
    )
    lines = TextHandler(verbose_level=0)._render_dns_group(_dns_sections([first, second]))

    # Expected rows are derived by hand; each column is as wide as its widest
    # cell across both rows: ent_w=8 ("ent=2.10"), qry_w=5 ("5 qry"),
    # src_w=5 ("2 src"), blocked_w=0 (nothing is blocked).
    tag = f"{'[H]':<4}"  # "[H] "
    expected_1 = f" {tag} {'ent=2.10':<8} {'5 qry':>5} {'2 src':>5} sub.example.com"
    expected_2 = f" {tag} {'ent=1.92':<8} {'3 qry':>5} {'1 src':>5} api.example.net"

    # Finding rows begin with the indent followed by the severity tag; this
    # filter drops the subsection label (e.g. "singletons (2)") and blank lines.
    singleton_lines = [line for line in lines if line.startswith(" [")]
    assert singleton_lines[0] == expected_1, (
        f"line 1 mismatch:\n got: {singleton_lines[0]!r}\n expected: {expected_1!r}"
    )
    assert singleton_lines[1] == expected_2, (
        f"line 2 mismatch:\n got: {singleton_lines[1]!r}\n expected: {expected_2!r}"
    )
    full_output = "\n".join(lines)
    assert "BLOCKED" not in full_output, "Zeek-only output must not contain BLOCKED token"
|
|
906
|
+
|
|
907
|
+
|
|
908
|
+
def test_text_renderer_dns_sections_have_breathing_room() -> None:
    """A blank line precedes the groups subsection in rendered DNS output."""
    singleton_evidence = {
        "source": "pihole",
        "entropy": 1.77,
        "query_count": 5,
        "unique_sources": 1,
        "querier_ips": ["192.0.2.1"],
        "was_blocked": False,
        "block_ratio": 0.0,
        "cache_ratio": 0.2,
        "forward_ratio": 0.8,
        "qtype_counts": {"A": 5},
        "special_count": 0,
    }
    group_evidence = {
        "source": "pihole",
        "registrable_domain": "example.com",
        "subdomain_count": 2,
        "max_entropy": 1.77,
        "min_entropy": 1.7,
        "total_queries": 8,
        "unique_sources": 2,
        "sample_domains": ["a3f7bc19.example.com", "m8x2q9n.example.com"],
        "querier_ips": ["192.0.2.1", "192.0.2.2"],
        "was_blocked": False,
        "block_ratio": 0.0,
        "cache_ratio": 0.2,
        "forward_ratio": 0.8,
        "qtype_counts": {"A": 8},
        "special_count": 0,
    }
    singleton = _make_dns_finding(Severity.HIGH, "a3f7bc19.example.com", singleton_evidence)
    group = _make_dns_finding(
        Severity.MEDIUM, "example.com (2 subdomains, entropy >= 1.8)", group_evidence
    )

    handler = TextHandler(verbose_level=0)
    lines = handler._render_dns_group(_dns_sections([singleton, group]))

    # The blank line above the first (singletons) subsection comes from
    # TextHandler.write (`print(file=stream)` ahead of the header rule), not
    # from the renderer, so only the gap BETWEEN sections is visible here.
    group_header = next(idx for idx, text in enumerate(lines) if "groups" in text)
    line_before_groups = lines[group_header - 1]
    assert line_before_groups == ""
|
|
957
|
+
|
|
958
|
+
|
|
959
|
+
def test_text_renderer_zeek_verbose_rcode_unchanged() -> None:
    """Verbose Zeek singleton output shows the rcode distribution and no block ratio."""
    evidence = {
        "source": "zeek",
        "entropy": 1.92,
        "query_count": 6,
        "unique_sources": 2,
        "querier_ips": ["192.0.2.1"],
        "rcode_distribution": {"NOERROR": 5, "NXDOMAIN": 1},
    }
    finding = _make_dns_finding(Severity.HIGH, "sub.example.com", evidence)

    rendered = TextHandler(verbose_level=1)._render_dns_group(_dns_sections([finding]))
    output = "\n".join(rendered)

    # W3 curated tail: a Zeek singleton exposes rcode_distribution as a raw
    # key:value pair, with no per-detector ratio formatting.
    assert "rcode_distribution" in output, "Zeek verbose must show rcode_distribution"
    assert "NOERROR" in output, "rcode keys must appear"
    assert "block_ratio" not in output, "Zeek-only verbose must not show block_ratio"
|
|
982
|
+
|
|
983
|
+
|
|
984
|
+
def test_text_renderer_both_mode_verbose_shows_was_blocked() -> None:
|
|
985
|
+
"""Both-mode Zeek singleton with was_blocked=True shows was_blocked annotation in verbose."""
|
|
986
|
+
f = _make_dns_finding(
|
|
987
|
+
Severity.HIGH,
|
|
988
|
+
"a3f7bc19.example.com",
|
|
989
|
+
{
|
|
990
|
+
"source": "zeek",
|
|
991
|
+
"entropy": 1.77,
|
|
992
|
+
"query_count": 3,
|
|
993
|
+
"unique_sources": 1,
|
|
994
|
+
"querier_ips": ["192.0.2.1"],
|
|
995
|
+
"rcode_distribution": {},
|
|
996
|
+
"was_blocked": True,
|
|
997
|
+
"block_ratio": 0.5,
|
|
998
|
+
},
|
|
999
|
+
)
|
|
1000
|
+
handler = TextHandler(verbose_level=1)
|
|
1001
|
+
lines = handler._render_dns_group(_dns_sections([f]))
|
|
1002
|
+
output = "\n".join(lines)
|
|
1003
|
+
|
|
1004
|
+
# W3 curated tail: both-mode (Zeek + Pi-hole enrichment) surfaces
|
|
1005
|
+
# was_blocked + block_ratio as raw key:value pairs. The "(Pi-hole
|
|
1006
|
+
# enrichment)" prose annotation from the old per-detector block is gone
|
|
1007
|
+
# — the keys themselves carry the provenance (a Zeek finding doesn't
|
|
1008
|
+
# have was_blocked otherwise).
|
|
1009
|
+
assert "was_blocked: True" in output, "both-mode verbose must show was_blocked"
|
|
1010
|
+
assert "block_ratio: 0.5" in output, "block_ratio must appear in both-mode verbose"
|