loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Parser modules for normalizing local log formats before detector analysis."""
|
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
"""CloudTrail event parsing — normalize raw AWS events into canonical row dicts.
|
|
2
|
+
|
|
3
|
+
Provides pure parsing functions with no file I/O. File discovery, decompression,
|
|
4
|
+
NDJSON / envelope sniffing, DataFrame construction, and timeframe filtering are
|
|
5
|
+
handled by loader.py via load_cloudtrail().
|
|
6
|
+
|
|
7
|
+
Per-event normalization, not aggregation. The aws detector aggregates per-principal
|
|
8
|
+
on the back end — the same parser-emits-fragments / detector-aggregates split that
|
|
9
|
+
dnsmasq uses (`_build_pihole_aggregate` lives in the detector, not the parser).
|
|
10
|
+
|
|
11
|
+
Canonical CloudTrail event schema (v1)
|
|
12
|
+
──────────────────────────────────────
|
|
13
|
+
parse_event() emits one dict per event with these twelve keys, all always present.
|
|
14
|
+
|
|
15
|
+
Carried fields (verbatim from the wire event, or None when absent):
|
|
16
|
+
ts — eventTime parsed to unix epoch float; None when missing/garbage
|
|
17
|
+
event_source — eventSource (full string, e.g. "s3.amazonaws.com")
|
|
18
|
+
event_name — eventName (e.g. "ListBuckets")
|
|
19
|
+
identity_type — userIdentity.type (IAMUser, AssumedRole, AWSService, Root, …)
|
|
20
|
+
source_ip — sourceIPAddress
|
|
21
|
+
error_code — errorCode; None means the call succeeded
|
|
22
|
+
aws_region — awsRegion — human-triage pivot
|
|
23
|
+
event_id — eventID — drill-back anchor; the analyst's key to the full event
|
|
24
|
+
|
|
25
|
+
Derived fields (computed from one or more wire fields by the rules below):
|
|
26
|
+
principal — stable per-actor key; collapses userIdentity variants so a role
|
|
27
|
+
assumed many times is one actor, not many. See _derive_principal.
|
|
28
|
+
lane — "interactive" | "service", mechanically derived from
|
|
29
|
+
userIdentity.type / invokedBy / service-linked-role naming
|
|
30
|
+
read_write — "read" | "write"; top-level readOnly when present, else inferred
|
|
31
|
+
from the action verb (handles thinner old event schemas)
|
|
32
|
+
|
|
33
|
+
Escape hatch (SCHEMA.md → promote-don't-grep):
|
|
34
|
+
raw — original event dict, unmodified. No v1 detector reads this at
|
|
35
|
+
runtime. Future detectors that need fields living in `raw` —
|
|
36
|
+
recipient_account_id, user_agent, requestParameters,
|
|
37
|
+
responseElements, resources — promote them to real, typed,
|
|
38
|
+
documented canonical columns at that time, with real knowledge
|
|
39
|
+
of what the signal needs. Detectors never reach into `raw`
|
|
40
|
+
mid-analysis.
|
|
41
|
+
|
|
42
|
+
Adding a carried column later is a one-line, obvious change: add the key to the
|
|
43
|
+
single result-dict literal in parse_event(), pulled with .get(...).
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
from __future__ import annotations
|
|
47
|
+
|
|
48
|
+
import json
|
|
49
|
+
import re
|
|
50
|
+
from datetime import datetime
|
|
51
|
+
from typing import Any
|
|
52
|
+
|
|
53
|
+
# Fallback table for read_write inference. It is consulted only when an event
# carries no usable top-level readOnly field (older, thinner CloudTrail event
# schemas). An eventName that begins with one of these verbs is classified
# "read"; any other eventName is classified "write".
_READ_VERB_PREFIXES: tuple[str, ...] = (
    "Get",
    "List",
    "Describe",
    "Head",
    "Lookup",
    "Search",
    "BatchGet",
    "Select",
    "Query",
    "Scan",
)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def parse_event(event: Any) -> dict | None:
|
|
63
|
+
"""Parse a raw CloudTrail event dict into the canonical row dict.
|
|
64
|
+
|
|
65
|
+
Returns None only when ``event`` is not a dict. For any dict input, returns
|
|
66
|
+
a row with all twelve canonical keys present — never raises on missing,
|
|
67
|
+
malformed, or unexpected fields anywhere in the event. All sub-lookups are
|
|
68
|
+
defensive: missing nested objects degrade to the appropriate field-level
|
|
69
|
+
fallback rather than aborting the parse.
|
|
70
|
+
"""
|
|
71
|
+
if not isinstance(event, dict):
|
|
72
|
+
return None
|
|
73
|
+
|
|
74
|
+
user_identity = event.get("userIdentity")
|
|
75
|
+
identity_type = user_identity.get("type") if isinstance(user_identity, dict) else None
|
|
76
|
+
|
|
77
|
+
return {
|
|
78
|
+
"ts": _parse_event_time(event.get("eventTime")),
|
|
79
|
+
"principal": _derive_principal(user_identity),
|
|
80
|
+
"lane": _derive_lane(user_identity),
|
|
81
|
+
"read_write": _derive_read_write(event),
|
|
82
|
+
"event_source": event.get("eventSource"),
|
|
83
|
+
"event_name": event.get("eventName"),
|
|
84
|
+
"identity_type": identity_type,
|
|
85
|
+
"source_ip": event.get("sourceIPAddress"),
|
|
86
|
+
"error_code": event.get("errorCode"),
|
|
87
|
+
"aws_region": event.get("awsRegion"),
|
|
88
|
+
"event_id": event.get("eventID"),
|
|
89
|
+
"raw": event,
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ── Derivation helpers ────────────────────────────────────────────────────────
|
|
94
|
+
|
|
95
|
+
def _parse_event_time(s: Any) -> float | None:
|
|
96
|
+
"""Parse an ISO 8601 eventTime string to unix epoch float; None on failure.
|
|
97
|
+
|
|
98
|
+
Mirrors exporters/cloudtrail.py:_parse_event_time. CloudTrail emits "Z" suffix
|
|
99
|
+
UTC; fromisoformat handles a "+00:00" offset, hence the substitution.
|
|
100
|
+
"""
|
|
101
|
+
if not isinstance(s, str) or not s:
|
|
102
|
+
return None
|
|
103
|
+
try:
|
|
104
|
+
return datetime.fromisoformat(s.replace("Z", "+00:00")).timestamp()
|
|
105
|
+
except (ValueError, TypeError):
|
|
106
|
+
return None
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _derive_principal(user_identity: Any) -> str:
    """Collapse a userIdentity object into one stable per-actor key.

    The point of the rule is that a role assumed many times counts as one
    actor: AssumedRole identities key on the session issuer, not on the
    per-assumption session.

    Preference order, first non-empty string wins:
      * AssumedRole — sessionIssuer.userName, last "/" segment of
        sessionIssuer.arn, sessionIssuer.principalId
      * IAMUser     — userName, last "/" segment of arn
      * AWSService  — invokedBy
      * Root        — always "root"
      * every type (including the ones above when their preferred fields
        are missing) — principalId, then the type string itself

    Returns "unknown" when ``user_identity`` is not a dict or nothing above
    yields a non-empty string.
    """
    if not isinstance(user_identity, dict):
        return "unknown"

    def text(value: Any) -> str:
        # Anything that is not a string counts as "no value".
        return value if isinstance(value, str) else ""

    def arn_tail(value: Any) -> str:
        # Final "/"-separated segment of an ARN; "" when absent or empty.
        return text(value).rsplit("/", 1)[-1]

    kind = user_identity.get("type")
    candidates: list[str] = []

    if kind == "AssumedRole":
        context = user_identity.get("sessionContext")
        issuer = context.get("sessionIssuer") if isinstance(context, dict) else None
        if isinstance(issuer, dict):
            candidates.append(text(issuer.get("userName")))
            candidates.append(arn_tail(issuer.get("arn")))
            candidates.append(text(issuer.get("principalId")))
    elif kind == "IAMUser":
        candidates.append(text(user_identity.get("userName")))
        candidates.append(arn_tail(user_identity.get("arn")))
    elif kind == "AWSService":
        candidates.append(text(user_identity.get("invokedBy")))
    elif kind == "Root":
        return "root"

    # Generic tail shared by all types: principalId keeps two distinct actors
    # of an unrecognised type from collapsing into a single bucket.
    candidates.append(text(user_identity.get("principalId")))
    candidates.append(text(kind))

    return next((value for value in candidates if value), "unknown")
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
def _derive_lane(user_identity: Any) -> str:
    """Classify a userIdentity as "service" or "interactive".

    Purely mechanical — no security judgment and no hardcoded principal-name
    list. The result is "service" when at least one of these holds:

      1. userIdentity.type is "AWSService" or "AWSAccount"
      2. userIdentity.invokedBy is a string ending in "amazonaws.com"
      3. "AWSServiceRoleFor" occurs in userIdentity.arn or in
         sessionContext.sessionIssuer.arn

    Everything else, including a ``user_identity`` that is not a dict, is
    "interactive".
    """
    if not isinstance(user_identity, dict):
        return "interactive"

    # Collect every ARN that may reveal a service-linked role.
    arns = [user_identity.get("arn")]
    context = user_identity.get("sessionContext")
    if isinstance(context, dict):
        issuer = context.get("sessionIssuer")
        if isinstance(issuer, dict):
            arns.append(issuer.get("arn"))

    invoker = user_identity.get("invokedBy")
    is_service = (
        user_identity.get("type") in ("AWSService", "AWSAccount")
        or (isinstance(invoker, str) and invoker.endswith("amazonaws.com"))
        or any(isinstance(arn, str) and "AWSServiceRoleFor" in arn for arn in arns)
    )
    return "service" if is_service else "interactive"
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def _derive_read_write(event: dict) -> str:
    """Classify an event as "read" or "write".

    The top-level ``readOnly`` field decides when it is usable: boolean True
    or the string "true" (any letter case) means "read"; boolean False or the
    string "false" means "write".

    When ``readOnly`` is absent or has any other shape, the action verb
    decides instead: an ``eventName`` string starting with one of
    ``_READ_VERB_PREFIXES`` is "read", everything else is "write".
    """
    flag = event.get("readOnly")
    if isinstance(flag, str):
        # Map the textual forms onto real booleans; other text becomes None.
        flag = {"true": True, "false": False}.get(flag.lower())
    if flag is True:
        return "read"
    if flag is False:
        return "write"

    # readOnly was missing or unrecognised — infer from the verb.
    verb = event.get("eventName")
    if isinstance(verb, str) and verb.startswith(_READ_VERB_PREFIXES):
        return "read"
    return "write"
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
SNIFF_PEEK_LINES: int = 200
|
|
238
|
+
|
|
239
|
+
# Quoted-key + colon — matches a JSON key declaration, not a value substring.
|
|
240
|
+
_CT_KEY_RE: dict[str, re.Pattern[str]] = {
|
|
241
|
+
"Records": re.compile(r'"Records"\s*:'),
|
|
242
|
+
"eventVersion": re.compile(r'"eventVersion"\s*:'),
|
|
243
|
+
"eventTime": re.compile(r'"eventTime"\s*:'),
|
|
244
|
+
"userIdentity": re.compile(r'"userIdentity"\s*:'),
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
_CT_EVENT_KEYS: tuple[str, ...] = ("eventVersion", "eventTime", "userIdentity")
|
|
248
|
+
|
|
249
|
+
|
|
250
|
+
def sniff(sample: list[str]) -> str | None:
|
|
251
|
+
"""Recognize CloudTrail JSON (NDJSON event or envelope) and return "cloudtrail".
|
|
252
|
+
|
|
253
|
+
Two paths, either one wins:
|
|
254
|
+
|
|
255
|
+
1. NDJSON: the first non-empty line parses as a dict containing at least
|
|
256
|
+
two of ``eventVersion``, ``eventTime``, ``userIdentity``.
|
|
257
|
+
2. Envelope (structural — does not require the sample to contain a
|
|
258
|
+
parseable JSON document): the joined sample contains the quoted key
|
|
259
|
+
``"Records":`` AND at least two of the three per-event keys
|
|
260
|
+
(``"eventVersion":``, ``"eventTime":``, ``"userIdentity":``) as
|
|
261
|
+
quoted-key tokens. Survives pretty-printed envelopes whose first
|
|
262
|
+
record exceeds the sample budget.
|
|
263
|
+
|
|
264
|
+
Returns None when neither path matches.
|
|
265
|
+
|
|
266
|
+
Pure: takes already-decoded lines, performs no I/O.
|
|
267
|
+
"""
|
|
268
|
+
for raw_line in sample:
|
|
269
|
+
line = raw_line.strip()
|
|
270
|
+
if not line:
|
|
271
|
+
continue
|
|
272
|
+
try:
|
|
273
|
+
obj = json.loads(line)
|
|
274
|
+
except (json.JSONDecodeError, ValueError):
|
|
275
|
+
break
|
|
276
|
+
if isinstance(obj, dict):
|
|
277
|
+
hit = sum(1 for k in _CT_EVENT_KEYS if k in obj)
|
|
278
|
+
if hit >= 2:
|
|
279
|
+
return "cloudtrail"
|
|
280
|
+
break
|
|
281
|
+
|
|
282
|
+
joined = "\n".join(sample)
|
|
283
|
+
if _CT_KEY_RE["Records"].search(joined):
|
|
284
|
+
hit = sum(1 for k in _CT_EVENT_KEYS if _CT_KEY_RE[k].search(joined))
|
|
285
|
+
if hit >= 2:
|
|
286
|
+
return "cloudtrail"
|
|
287
|
+
return None
|
|
@@ -0,0 +1,331 @@
|
|
|
1
|
+
"""dnsmasq/Pi-hole log parsing — extract structured event dicts for loader assembly.
|
|
2
|
+
|
|
3
|
+
Provides pure parsing functions with no file I/O. File discovery, DataFrame
|
|
4
|
+
construction, and hostname assignment are handled by loader.py via load_pihole().
|
|
5
|
+
|
|
6
|
+
Known limitation: dnsmasq logs carry no timezone information. Timestamps are
|
|
7
|
+
parsed as if UTC (tzinfo=timezone.utc applied directly to the wall-clock value),
|
|
8
|
+
matching the behaviour of parsers/syslog.py. When a Pi-hole host runs in a
|
|
9
|
+
non-UTC timezone the ts values will be offset from true UTC by the host's local
|
|
10
|
+
offset; LogHunter's internal timeframe arithmetic is consistent because the same
|
|
11
|
+
convention is applied throughout.
|
|
12
|
+
|
|
13
|
+
Canonical-plus-event schema
|
|
14
|
+
────────────────────────────
|
|
15
|
+
The parser emits one dict per parsed event line. Fields divide into two groups:
|
|
16
|
+
|
|
17
|
+
Canonical DNS fields (shared vocabulary with the Zeek path):
|
|
18
|
+
ts — UTC-aware datetime or None
|
|
19
|
+
src — querying client IP (str | None; populated ONLY on query events)
|
|
20
|
+
query — queried domain (str | None; the "domain" of the event where applicable)
|
|
21
|
+
|
|
22
|
+
dnsmasq event fields (parser-specific):
|
|
23
|
+
event_type — query | forwarded | reply | cached | gravity_blocked |
|
|
24
|
+
config | validation | dnssec_query | special | dhcp |
|
|
25
|
+
pihole_hostname | regex_blocked | unknown
|
|
26
|
+
qtype — query type (A, AAAA, HTTPS, …) (str | None; query and dnssec_query events)
|
|
27
|
+
dst — upstream resolver or validation target (str | None; forwarded and
|
|
28
|
+
dnssec_query events; also holds an opaque token on dhcp rows —
|
|
29
|
+
not canonical DNS on dhcp; guard with event_type check before use)
|
|
30
|
+
answer — answer payload (str | None; reply/cached/gravity_blocked/config/special/
|
|
31
|
+
pihole_hostname events; also holds an opaque token on dhcp rows —
|
|
32
|
+
same caveat as dst)
|
|
33
|
+
validation — DNSSEC status or block disposition phrase (str | None; validation events
|
|
34
|
+
carry the DNSSEC verdict; regex_blocked events carry the matched
|
|
35
|
+
disposition phrase e.g. "regex denied", "exactly blacklisted")
|
|
36
|
+
host — source host, set by the loader from filename (parser leaves "")
|
|
37
|
+
raw — original line
|
|
38
|
+
message — message portion after the dnsmasq[PID]: prefix
|
|
39
|
+
|
|
40
|
+
Rule for detector authors: features may depend on the canonical fields (ts, src,
|
|
41
|
+
query) freely. Any feature derived from the dnsmasq event fields must be guarded
|
|
42
|
+
(presence-checked) so the same detector code can run against a Zeek frame that
|
|
43
|
+
lacks them.
|
|
44
|
+
"""
|
|
45
|
+
|
|
46
|
+
from __future__ import annotations
|
|
47
|
+
|
|
48
|
+
import re
|
|
49
|
+
from datetime import datetime
|
|
50
|
+
|
|
51
|
+
from loghunter.parsers.syslog import parse_timestamp
|
|
52
|
+
|
|
53
|
+
# ── Compiled patterns ─────────────────────────────────────────────────────────
|
|
54
|
+
|
|
55
|
+
# Outer grammar:  Mon DD HH:MM:SS [hostname] dnsmasq[PID]: <message>
# Single-digit days are logged with a leading space; \s+ accepts both forms.
_OUTER_RE = re.compile(
    r'^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}'
    r'\s+(?:\S+\s+)?dnsmasq\[\d+\]:\s+'
    r'(?P<message>.+)$'
)

# Inner message grammars. Each one is anchored on a distinct literal prefix.
# The order in which they are tried is decided by parse_line(), not by the
# order of definition below.

# query[<qtype>] <domain> from <client>
_QUERY_RE = re.compile(r'^query\[(?P<qtype>[^\]]+)\]\s+(?P<domain>\S+)\s+from\s+(?P<src>\S+)')

# forwarded <domain> to <upstream>
_FWD_RE = re.compile(r'^forwarded\s+(?P<domain>\S+)\s+to\s+(?P<dst>\S+)')

# reply <domain> is <answer>
_REPLY_RE = re.compile(r'^reply\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$')

# cached <domain> is <answer>   (also the "cached-stale" spelling)
_CACHED_RE = re.compile(r'^(?:cached(?:-stale)?)\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$')

# gravity blocked [(<note>)] <domain> is <answer>
_GRAVITY_RE = re.compile(r'^gravity blocked (?:\([^\)]+\)\s+)?(?P<domain>\S+) is (?P<answer>.+)$')

# <absolute path>|config <domain> is <answer>
_CONFIG_RE = re.compile(r'^(?P<source>/\S+|config)\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$')

# validation result is <status>
_VALID_RE = re.compile(r'^validation result is (?P<status>\S+)')

# dnssec-query[<qtype>] <domain> to <upstream> — resolver-internal DNSSEC
# validation traffic. It shares the " to " shape with _FWD_RE; the literal
# prefixes differ today, so there is no live conflict, but it is meant to be
# tried before _FWD_RE to keep the intent explicit.
_DNSSEC_QUERY_RE = re.compile(
    r'^dnssec-query\[(?P<qtype>[^\]]+)\]\s+(?P<domain>\S+)\s+to\s+(?P<dst>\S+)'
)

# special domain <domain> is <answer> — Apple Private Relay / resolver
# override disposition. Same "<token> <domain> is <answer>" shape as
# _CONFIG_RE (which needs a "/" or "config" prefix), so it is meant to be
# tried before _CONFIG_RE.
_SPECIAL_RE = re.compile(r'^special domain\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$')

# Pi-hole hostname <domain> is <answer> — Pi-hole resolving its own hostname.
# Meant to be tried before _CONFIG_RE for the same reason as _SPECIAL_RE.
_PIHOLE_HOSTNAME_RE = re.compile(r'^Pi-hole hostname\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$')

# Regex/blacklist block disposition. Spelling variants covered: "regex denied",
# "regex blacklisted", "exactly denied", "exactly blacklisted" and bare
# "blacklisted". Overlaps the "<token(s)> <domain> is <answer>" shape, so it is
# meant to be tried before _CONFIG_RE.
_REGEX_BLOCKED_RE = re.compile(
    r'^(?P<disposition>regex (?:denied|blacklisted)|exactly (?:denied|blacklisted)|blacklisted)'
    r'\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$'
)

# DHCP lease lines share the log file. Both field orders occur in the wild —
# "DHCP <ip> is <hostname>" and "DHCP <hostname> is <ip>" — so the two tokens
# are captured under the neutral names a and b.
_DHCP_RE = re.compile(r'^DHCP\s+(?P<a>\S+)\s+is\s+(?P<b>\S+)')
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ── Parsing functions ─────────────────────────────────────────────────────────
|
|
119
|
+
|
|
120
|
+
def _domain_answer(m: re.Match) -> dict:
    """Fields for any inner grammar shaped "<prefix> <domain> is <answer>"."""
    return {"query": m.group("domain"), "answer": m.group("answer").strip()}


# Inner grammars as (event_type, pattern, field extractor), in evaluation
# order — the first pattern that matches the message wins.
_INNER_RULES: tuple = (
    ("query", _QUERY_RE, lambda m: {
        "qtype": m.group("qtype"),
        "query": m.group("domain"),
        "src": m.group("src"),
    }),
    # dhcp rows are non-DNS lease events that merely share the log file. They
    # are parsed only to keep the "unknown" bucket clean and to let a detector
    # filter on event_type == "dhcp". dst and answer hold the two raw
    # "DHCP <a> is <b>" tokens as opaque strings — the "is" does NOT mean
    # domain/answer here, so never feed them into DNS aggregation.
    ("dhcp", _DHCP_RE, lambda m: {
        "dst": m.group("a"),      # opaque — not a DNS resolver address
        "answer": m.group("b"),   # opaque — not a DNS answer
    }),
    # dnssec-query is resolver-internal DNSSEC validation traffic whose
    # targets frequently never appear as a client query. It is kept apart
    # from "forwarded" so it cannot inflate per-domain forward ratios for
    # domains with zero client queries; feature use is deferred.
    ("dnssec_query", _DNSSEC_QUERY_RE, lambda m: {
        "qtype": m.group("qtype"),
        "query": m.group("domain"),
        "dst": m.group("dst"),
    }),
    ("forwarded", _FWD_RE, lambda m: {
        "query": m.group("domain"),
        "dst": m.group("dst"),
    }),
    ("reply", _REPLY_RE, _domain_answer),
    ("cached", _CACHED_RE, _domain_answer),
    ("gravity_blocked", _GRAVITY_RE, _domain_answer),
    ("special", _SPECIAL_RE, _domain_answer),
    # pihole_hostname is Pi-hole answering for its own hostname; it is parsed
    # only so it does not land in "unknown" and can be filtered out later.
    ("pihole_hostname", _PIHOLE_HOSTNAME_RE, _domain_answer),
    # regex_blocked and gravity_blocked are two mechanisms with the same
    # outcome (Pi-hole refused to resolve). They stay distinct here so the
    # source detail is preserved; merging them into one "blocked" notion is
    # the detector's job. The matched disposition phrase is stored in the
    # validation field, so check event_type before interpreting that field.
    ("regex_blocked", _REGEX_BLOCKED_RE, lambda m: {
        **_domain_answer(m),
        "validation": m.group("disposition"),
    }),
    ("config", _CONFIG_RE, _domain_answer),
    ("validation", _VALID_RE, lambda m: {"validation": m.group("status")}),
)


def parse_line(raw: str) -> dict | None:
    """Turn one raw dnsmasq log line into a normalized event dict.

    Returns None for an empty line, a comment line (first non-blank character
    is "#"), and any line that does not fit the dnsmasq outer grammar.

    Otherwise returns a dict that always holds the keys ts, src, query,
    event_type, qtype, dst, answer, validation, host, raw and message.
    ``host`` is left as "" for the loader to fill in. When the outer grammar
    matches but no inner grammar does, the row is kept with event_type
    "unknown" and query None rather than being dropped, so new message
    shapes remain discoverable downstream.
    """
    if not raw or raw.lstrip().startswith("#"):
        return None

    outer = _OUTER_RE.match(raw.strip())
    if outer is None:
        return None

    message = outer.group("message")
    event: dict = {
        "ts": parse_timestamp(raw),
        "src": None,
        "query": None,
        "event_type": "unknown",
        "qtype": None,
        "dst": None,
        "answer": None,
        "validation": None,
        "host": "",
        "raw": raw,
        "message": message,
    }

    for event_type, pattern, extract in _INNER_RULES:
        hit = pattern.match(message)
        if hit is not None:
            event["event_type"] = event_type
            event.update(extract(hit))
            break

    # No rule matched: the defaults above already describe an "unknown" row.
    return event
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
SNIFF_PEEK_LINES: int = 32
|
|
300
|
+
|
|
301
|
+
# Event types that prove the file is a dnsmasq/Pi-hole DNS log. dhcp and
|
|
302
|
+
# unknown are intentionally absent — they may precede the first DNS event
|
|
303
|
+
# but never claim "dns" on their own.
|
|
304
|
+
_DNS_BEARING_EVENT_TYPES: frozenset[str] = frozenset({
|
|
305
|
+
"query", "forwarded", "reply", "cached",
|
|
306
|
+
"gravity_blocked", "regex_blocked",
|
|
307
|
+
"config", "validation",
|
|
308
|
+
"dnssec_query", "special", "pihole_hostname",
|
|
309
|
+
})
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def sniff(sample: list[str]) -> str | None:
|
|
313
|
+
"""Recognize a dnsmasq/Pi-hole DNS log and return "dns".
|
|
314
|
+
|
|
315
|
+
Calls ``parse_line`` on each sample line and inspects ``event_type``.
|
|
316
|
+
Returns "dns" on the first line whose event_type is a DNS-bearing kind
|
|
317
|
+
(query/forwarded/reply/cached/gravity_blocked/regex_blocked/config/
|
|
318
|
+
validation/dnssec_query/special/pihole_hostname). Tolerates leading
|
|
319
|
+
runs of DHCP-lease or unknown dnsmasq chatter — they do not short-
|
|
320
|
+
circuit. Returns None when the budget is exhausted without a
|
|
321
|
+
DNS-bearing event, or when no line matches the dnsmasq outer grammar.
|
|
322
|
+
|
|
323
|
+
Pure: takes already-decoded lines, performs no I/O.
|
|
324
|
+
"""
|
|
325
|
+
for raw_line in sample:
|
|
326
|
+
record = parse_line(raw_line)
|
|
327
|
+
if record is None:
|
|
328
|
+
continue
|
|
329
|
+
if record["event_type"] in _DNS_BEARING_EVENT_TYPES:
|
|
330
|
+
return "dns"
|
|
331
|
+
return None
|