loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1 @@
1
+ """Parser modules for normalizing local log formats before detector analysis."""
@@ -0,0 +1,287 @@
1
+ """CloudTrail event parsing — normalize raw AWS events into canonical row dicts.
2
+
3
+ Provides pure parsing functions with no file I/O. File discovery, decompression,
4
+ NDJSON / envelope sniffing, DataFrame construction, and timeframe filtering are
5
+ handled by the loader package (loghunter/common/loader/) via load_cloudtrail().
6
+
7
+ Per-event normalization, not aggregation. The aws detector aggregates per-principal
8
+ on the back end — the same parser-emits-fragments / detector-aggregates split that
9
+ dnsmasq uses (`_build_pihole_aggregate` lives in the detector, not the parser).
10
+
11
+ Canonical CloudTrail event schema (v1)
12
+ ──────────────────────────────────────
13
+ parse_event() emits one dict per event with these twelve keys, all always present.
14
+
15
+ Carried fields (verbatim from the wire event, or None when absent):
16
+ ts — eventTime parsed to unix epoch float; None when missing/garbage
17
+ event_source — eventSource (full string, e.g. "s3.amazonaws.com")
18
+ event_name — eventName (e.g. "ListBuckets")
19
+ identity_type — userIdentity.type (IAMUser, AssumedRole, AWSService, Root, …)
20
+ source_ip — sourceIPAddress
21
+ error_code — errorCode; None means the call succeeded
22
+ aws_region — awsRegion — human-triage pivot
23
+ event_id — eventID — drill-back anchor; the analyst's key to the full event
24
+
25
+ Derived fields (computed from one or more wire fields by the rules below):
26
+ principal — stable per-actor key; collapses userIdentity variants so a role
27
+ assumed many times is one actor, not many. See _derive_principal.
28
+ lane — "interactive" | "service", mechanically derived from
29
+ userIdentity.type / invokedBy / service-linked-role naming
30
+ read_write — "read" | "write"; top-level readOnly when present, else inferred
31
+ from the action verb (handles thinner old event schemas)
32
+
33
+ Escape hatch (SCHEMA.md → promote-don't-grep):
34
+ raw — original event dict, unmodified. No v1 detector reads this at
35
+ runtime. Future detectors that need fields living in `raw` —
36
+ recipient_account_id, user_agent, requestParameters,
37
+ responseElements, resources — promote them to real, typed,
38
+ documented canonical columns at that time, with real knowledge
39
+ of what the signal needs. Detectors never reach into `raw`
40
+ mid-analysis.
41
+
42
+ Adding a carried column later is a one-line, obvious change: add the key to the
43
+ single result-dict literal in parse_event(), pulled with .get(...).
44
+ """
45
+
46
+ from __future__ import annotations
47
+
48
+ import json
49
+ import re
50
+ from datetime import datetime
51
+ from typing import Any
52
+
53
# Verb-inference fallback for read_write — consulted only when an event has no
# usable top-level readOnly field (older CloudTrail event schemas). An eventName
# that starts with any of these prefixes is classified "read"; every other name
# is classified "write". Kept as a tuple of plain string prefixes.
_READ_VERB_PREFIXES: tuple[str, ...] = (
    "Get", "List", "Describe", "Head", "Lookup",
    "Search", "BatchGet", "Select", "Query", "Scan",
)
60
+
61
+
62
def parse_event(event: Any) -> dict | None:
    """Normalize one raw CloudTrail event into the canonical twelve-key row.

    Anything that is not a dict yields None. Every dict — however sparse or
    oddly shaped — yields a row carrying all twelve canonical keys: carried
    fields come straight from ``event.get(...)`` (so an absent wire field is
    None), and derived fields come from the helper functions, each of which
    falls back to a field-level default when its inputs are missing or are
    not the expected type.

    The original event object is attached under ``"raw"`` as-is (same object,
    not a copy).
    """
    if not isinstance(event, dict):
        return None

    identity = event.get("userIdentity")
    if isinstance(identity, dict):
        identity_type = identity.get("type")
    else:
        identity_type = None

    # Derived fields first, in the same order the row lists them.
    timestamp = _parse_event_time(event.get("eventTime"))
    principal = _derive_principal(identity)
    lane = _derive_lane(identity)
    read_write = _derive_read_write(event)

    row: dict = {
        "ts": timestamp,
        "principal": principal,
        "lane": lane,
        "read_write": read_write,
        "event_source": event.get("eventSource"),
        "event_name": event.get("eventName"),
        "identity_type": identity_type,
        "source_ip": event.get("sourceIPAddress"),
        "error_code": event.get("errorCode"),
        "aws_region": event.get("awsRegion"),
        "event_id": event.get("eventID"),
        "raw": event,
    }
    return row
91
+
92
+
93
+ # ── Derivation helpers ────────────────────────────────────────────────────────
94
+
95
+ def _parse_event_time(s: Any) -> float | None:
96
+ """Parse an ISO 8601 eventTime string to unix epoch float; None on failure.
97
+
98
+ Mirrors exporters/cloudtrail.py:_parse_event_time. CloudTrail emits "Z" suffix
99
+ UTC; fromisoformat handles a "+00:00" offset, hence the substitution.
100
+ """
101
+ if not isinstance(s, str) or not s:
102
+ return None
103
+ try:
104
+ return datetime.fromisoformat(s.replace("Z", "+00:00")).timestamp()
105
+ except (ValueError, TypeError):
106
+ return None
107
+
108
+
109
+ def _derive_principal(user_identity: Any) -> str:
110
+ """Derive a stable per-actor key from userIdentity.
111
+
112
+ Load-bearing intent: a role assumed many times is one actor. AssumedRole events
113
+ key on the session *issuer*, not the per-assumption session name, so every
114
+ session of one role aggregates together. See SCHEMA.md for the full rule.
115
+ """
116
+ if not isinstance(user_identity, dict):
117
+ return "unknown"
118
+
119
+ itype = user_identity.get("type")
120
+
121
+ if itype == "AssumedRole":
122
+ session_context = user_identity.get("sessionContext")
123
+ if isinstance(session_context, dict):
124
+ issuer = session_context.get("sessionIssuer")
125
+ if isinstance(issuer, dict):
126
+ user_name = issuer.get("userName")
127
+ if isinstance(user_name, str) and user_name:
128
+ return user_name
129
+ arn = issuer.get("arn")
130
+ if isinstance(arn, str) and arn:
131
+ last = arn.rsplit("/", 1)[-1]
132
+ if last:
133
+ return last
134
+ issuer_pid = issuer.get("principalId")
135
+ if isinstance(issuer_pid, str) and issuer_pid:
136
+ return issuer_pid
137
+ # fall through to generic fallback when sessionIssuer is absent/empty
138
+
139
+ elif itype == "IAMUser":
140
+ user_name = user_identity.get("userName")
141
+ if isinstance(user_name, str) and user_name:
142
+ return user_name
143
+ arn = user_identity.get("arn")
144
+ if isinstance(arn, str) and arn:
145
+ last = arn.rsplit("/", 1)[-1]
146
+ if last:
147
+ return last
148
+ # principalId is the next step, handled by the generic fallback below
149
+
150
+ elif itype == "AWSService":
151
+ invoked_by = user_identity.get("invokedBy")
152
+ if isinstance(invoked_by, str) and invoked_by:
153
+ return invoked_by
154
+ # fall through to generic fallback
155
+
156
+ elif itype == "Root":
157
+ return "root"
158
+
159
+ # Generic fallback: federated/SAML/WebIdentity/IdentityCenter/AWSAccount/unknown
160
+ # types, plus fall-through from the per-type branches above when their preferred
161
+ # fields are missing. principalId stability is what keeps two distinct actors
162
+ # under an unknown type from collapsing into one bucket.
163
+ pid = user_identity.get("principalId")
164
+ if isinstance(pid, str) and pid:
165
+ return pid
166
+ if isinstance(itype, str) and itype:
167
+ return itype
168
+ return "unknown"
169
+
170
+
171
+ def _derive_lane(user_identity: Any) -> str:
172
+ """Return "service" or "interactive" for an event's userIdentity.
173
+
174
+ Mechanical, no security judgment. "service" if any of:
175
+ 1. userIdentity.type is AWSService or AWSAccount
176
+ 2. userIdentity.invokedBy ends with "amazonaws.com"
177
+ 3. "AWSServiceRoleFor" appears in userIdentity.arn or
178
+ sessionContext.sessionIssuer.arn
179
+
180
+ Otherwise "interactive". No hardcoded principal-name list — corpus-specific
181
+ role names are not a parser concern.
182
+ """
183
+ if not isinstance(user_identity, dict):
184
+ return "interactive"
185
+
186
+ itype = user_identity.get("type")
187
+ if itype in ("AWSService", "AWSAccount"):
188
+ return "service"
189
+
190
+ invoked_by = user_identity.get("invokedBy")
191
+ if isinstance(invoked_by, str) and invoked_by.endswith("amazonaws.com"):
192
+ return "service"
193
+
194
+ arn = user_identity.get("arn")
195
+ if isinstance(arn, str) and "AWSServiceRoleFor" in arn:
196
+ return "service"
197
+
198
+ session_context = user_identity.get("sessionContext")
199
+ if isinstance(session_context, dict):
200
+ issuer = session_context.get("sessionIssuer")
201
+ if isinstance(issuer, dict):
202
+ issuer_arn = issuer.get("arn")
203
+ if isinstance(issuer_arn, str) and "AWSServiceRoleFor" in issuer_arn:
204
+ return "service"
205
+
206
+ return "interactive"
207
+
208
+
209
def _derive_read_write(event: dict) -> str:
    """Classify an event as "read" or "write".

    The top-level ``readOnly`` field wins when it is recognisable: boolean
    True or the string "true" (any letter case) means "read"; boolean False
    or the string "false" means "write". When ``readOnly`` is absent or has
    any other shape, the verdict is inferred from ``eventName``: a name
    beginning with one of ``_READ_VERB_PREFIXES`` is "read", anything else
    (including a missing or non-string name) is "write".
    """
    flag = event.get("readOnly")
    if isinstance(flag, str):
        # Fold the string spellings onto the booleans; unknown text -> None.
        flag = {"true": True, "false": False}.get(flag.lower())

    if flag is True:
        return "read"
    if flag is False:
        return "write"

    # readOnly absent or unrecognised — fall back to the action verb.
    verb = event.get("eventName")
    if isinstance(verb, str) and verb.startswith(_READ_VERB_PREFIXES):
        return "read"
    return "write"
235
+
236
+
237
# NOTE(review): presumably the number of leading lines a caller should hand to
# sniff() as its sample — nothing in this module reads it; confirm at the caller.
SNIFF_PEEK_LINES: int = 200

# Quoted-key + colon — matches a JSON key declaration, not a value substring.
# Keyed by the bare key name so sniff() can look a pattern up by that name.
_CT_KEY_RE: dict[str, re.Pattern[str]] = {
    "Records": re.compile(r'"Records"\s*:'),
    "eventVersion": re.compile(r'"eventVersion"\s*:'),
    "eventTime": re.compile(r'"eventTime"\s*:'),
    "userIdentity": re.compile(r'"userIdentity"\s*:'),
}

# Per-event keys that sniff() counts; it requires at least two of the three.
# "Records" is deliberately not listed — it is the envelope marker, checked
# separately.
_CT_EVENT_KEYS: tuple[str, ...] = ("eventVersion", "eventTime", "userIdentity")
248
+
249
+
250
def sniff(sample: list[str]) -> str | None:
    """Return "cloudtrail" when the sample looks like CloudTrail JSON, else None.

    Two independent recognition paths; either one is sufficient:

    1. NDJSON — the first non-blank line (and only that line) parses as a
       JSON object holding at least two of ``eventVersion``, ``eventTime``,
       ``userIdentity``.
    2. Envelope — purely textual, so the sample need not contain a complete
       JSON document: the newline-joined sample contains the quoted key
       ``"Records":`` and at least two of the three per-event keys as
       quoted-key tokens. This is what lets a pretty-printed envelope match
       even when its first record runs past the end of the sample.

    Pure: operates on already-decoded lines and performs no I/O.
    """
    # Path 1 — NDJSON. Only the first non-blank line is ever examined.
    first_line = next((ln.strip() for ln in sample if ln.strip()), None)
    if first_line is not None:
        try:
            document = json.loads(first_line)
        except (json.JSONDecodeError, ValueError):
            document = None
        if isinstance(document, dict):
            present = sum(key in document for key in _CT_EVENT_KEYS)
            if present >= 2:
                return "cloudtrail"

    # Path 2 — envelope, matched on key tokens in the raw text.
    blob = "\n".join(sample)
    if not _CT_KEY_RE["Records"].search(blob):
        return None
    matched = [key for key in _CT_EVENT_KEYS if _CT_KEY_RE[key].search(blob)]
    return "cloudtrail" if len(matched) >= 2 else None
@@ -0,0 +1,331 @@
1
+ """dnsmasq/Pi-hole log parsing — extract structured event dicts for loader assembly.
2
+
3
+ Provides pure parsing functions with no file I/O. File discovery, DataFrame
4
+ construction, and hostname assignment are handled by the loader package (loghunter/common/loader/) via load_pihole().
5
+
6
+ Known limitation: dnsmasq logs carry no timezone information. Timestamps are
7
+ parsed as if UTC (tzinfo=timezone.utc applied directly to the wall-clock value),
8
+ matching the behaviour of parsers/syslog.py. When a Pi-hole host runs in a
9
+ non-UTC timezone the ts values will be offset from true UTC by the host's local
10
+ offset; LogHunter's internal timeframe arithmetic is consistent because the same
11
+ convention is applied throughout.
12
+
13
+ Canonical-plus-event schema
14
+ ────────────────────────────
15
+ The parser emits one dict per parsed event line. Fields divide into two groups:
16
+
17
+ Canonical DNS fields (shared vocabulary with the Zeek path):
18
+ ts — UTC-aware datetime or None
19
+ src — querying client IP (str | None; populated ONLY on query events)
20
+ query — queried domain (str | None; the "domain" of the event where applicable)
21
+
22
+ dnsmasq event fields (parser-specific):
23
+ event_type — query | forwarded | reply | cached | gravity_blocked |
24
+ config | validation | dnssec_query | special | dhcp |
25
+ pihole_hostname | regex_blocked | unknown
26
+ qtype — query type (A, AAAA, HTTPS, …) (str | None; query and dnssec_query events)
27
+ dst — upstream resolver or validation target (str | None; forwarded and
28
+ dnssec_query events; also holds an opaque token on dhcp rows —
29
+ not canonical DNS on dhcp; guard with event_type check before use)
30
+ answer — answer payload (str | None; reply/cached/gravity_blocked/config/special/
31
+ pihole_hostname events; also holds an opaque token on dhcp rows —
32
+ same caveat as dst)
33
+ validation — DNSSEC status or block disposition phrase (str | None; validation events
34
+ carry the DNSSEC verdict; regex_blocked events carry the matched
35
+ disposition phrase e.g. "regex denied", "exactly blacklisted")
36
+ host — source host, set by the loader from filename (parser leaves "")
37
+ raw — original line
38
+ message — message portion after the dnsmasq[PID]: prefix
39
+
40
+ Rule for detector authors: features may depend on the canonical fields (ts, src,
41
+ query) freely. Any feature derived from the dnsmasq event fields must be guarded
42
+ (presence-checked) so the same detector code can run against a Zeek frame that
43
+ lacks them.
44
+ """
45
+
46
+ from __future__ import annotations
47
+
48
+ import re
49
+ from datetime import datetime
50
+
51
+ from loghunter.parsers.syslog import parse_timestamp
52
+
53
+ # ── Compiled patterns ─────────────────────────────────────────────────────────
54
+
55
# Outer grammar: Mon DD HH:MM:SS [hostname] dnsmasq[PID]: <message>
# Single-digit days appear with a leading space (dnsmasq format); \s+ handles both.
# The hostname token is optional; everything after the "dnsmasq[PID]:" prefix is
# captured as the named group "message".
_OUTER_RE = re.compile(
    r'^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}'
    r'\s+(?:\S+\s+)?dnsmasq\[\d+\]:\s+'
    r'(?P<message>.+)$'
)

# Inner message grammars, applied to the "message" group. parse_line() decides
# the order in which they are tried (first match wins); that order is NOT the
# definition order below, so the "must precede" notes refer to parse_line().

# query[<qtype>] <domain> from <client>
_QUERY_RE = re.compile(
    r'^query\[(?P<qtype>[^\]]+)\]\s+(?P<domain>\S+)\s+from\s+(?P<src>\S+)'
)
# forwarded <domain> to <upstream>
_FWD_RE = re.compile(
    r'^forwarded\s+(?P<domain>\S+)\s+to\s+(?P<dst>\S+)'
)
# reply <domain> is <answer>
_REPLY_RE = re.compile(
    r'^reply\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$'
)
# cached <domain> is <answer> — also accepts the "cached-stale" spelling.
_CACHED_RE = re.compile(
    r'^(?:cached(?:-stale)?)\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$'
)
# gravity blocked [(<parenthesised note>)] <domain> is <answer>
_GRAVITY_RE = re.compile(
    r'^gravity blocked (?:\([^\)]+\)\s+)?(?P<domain>\S+) is (?P<answer>.+)$'
)
# <source> <domain> is <answer>, where <source> is an absolute path or "config".
_CONFIG_RE = re.compile(
    r'^(?P<source>/\S+|config)\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$'
)
# validation result is <status>
_VALID_RE = re.compile(
    r'^validation result is (?P<status>\S+)'
)
# dnssec-query: resolver-internal DNSSEC validation traffic. Must be tried before
# _FWD_RE as future-proofing — today _FWD_RE starts with "forwarded" so there is
# no live conflict, but both patterns contain " to " and this ordering makes the
# intent explicit.
_DNSSEC_QUERY_RE = re.compile(
    r'^dnssec-query\[(?P<qtype>[^\]]+)\]\s+(?P<domain>\S+)\s+to\s+(?P<dst>\S+)'
)
# special domain: Apple Private Relay / resolver override disposition. Must be
# tried before _CONFIG_RE as future-proofing — today _CONFIG_RE starts with "/"
# or "config" so there is no live conflict, but both match
# "<token> <domain> is <answer>".
_SPECIAL_RE = re.compile(
    r'^special domain\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$'
)
# Pi-hole hostname self-resolution. Must be tried before _CONFIG_RE for the same
# reason as _SPECIAL_RE — the "Pi-hole hostname" literal prefix is unambiguous
# today but the ordering makes the intent explicit.
_PIHOLE_HOSTNAME_RE = re.compile(
    r'^Pi-hole hostname\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$'
)
# Regex/blacklist block disposition. Covers the spelling variants FTL emits across
# versions: "regex denied", "regex blacklisted", "exactly denied",
# "exactly blacklisted", and bare "blacklisted". Must be tried before _CONFIG_RE —
# the "<token(s)> <domain> is <answer>" shape overlaps, and the literal disposition
# prefix is unambiguous. The matched phrase is captured as "disposition".
_REGEX_BLOCKED_RE = re.compile(
    r'^(?P<disposition>regex (?:denied|blacklisted)|exactly (?:denied|blacklisted)|blacklisted)'
    r'\s+(?P<domain>\S+)\s+is\s+(?P<answer>.+)$'
)
# DHCP lease lines ride the same log file. Both field orders occur in the wild:
# "DHCP <ip> is <hostname>" AND "DHCP <hostname> is <ip>" — hence the neutral
# group names "a" and "b" rather than ip/hostname.
_DHCP_RE = re.compile(
    r'^DHCP\s+(?P<a>\S+)\s+is\s+(?P<b>\S+)'
)
116
+
117
+
118
+ # ── Parsing functions ─────────────────────────────────────────────────────────
119
+
120
# Inner grammars in the order parse_line() tries them — first match wins.
# Each entry: (compiled pattern, event_type, ((result field, regex group), ...)).
_INNER_GRAMMARS: tuple = (
    (_QUERY_RE, "query",
     (("qtype", "qtype"), ("query", "domain"), ("src", "src"))),
    # dhcp rows are non-DNS lease events that merely share the log file. They
    # are parsed only to keep the "unknown" bucket clean and to let a detector
    # filter on event_type == "dhcp". dst/answer hold the two raw tokens of
    # "DHCP <a> is <b>" as opaque strings — not a resolver address and not a
    # DNS answer; never feed them into DNS aggregation.
    (_DHCP_RE, "dhcp",
     (("dst", "a"), ("answer", "b"))),
    # dnssec-query rows are resolver-internal validation traffic whose targets
    # often never appear as a client query. They are kept distinct from
    # "forwarded" so they cannot inflate per-domain forward ratios.
    (_DNSSEC_QUERY_RE, "dnssec_query",
     (("qtype", "qtype"), ("query", "domain"), ("dst", "dst"))),
    (_FWD_RE, "forwarded",
     (("query", "domain"), ("dst", "dst"))),
    (_REPLY_RE, "reply",
     (("query", "domain"), ("answer", "answer"))),
    (_CACHED_RE, "cached",
     (("query", "domain"), ("answer", "answer"))),
    (_GRAVITY_RE, "gravity_blocked",
     (("query", "domain"), ("answer", "answer"))),
    (_SPECIAL_RE, "special",
     (("query", "domain"), ("answer", "answer"))),
    # pihole_hostname rows are Pi-hole answering for its own hostname; parsed
    # only so they do not land in "unknown".
    (_PIHOLE_HOSTNAME_RE, "pihole_hostname",
     (("query", "domain"), ("answer", "answer"))),
    # regex_blocked stays distinct from gravity_blocked here (the parser stays
    # faithful to the source; collapsing the two is the detector's job). The
    # disposition phrase is stored in "validation" — guard with
    # event_type == "regex_blocked" before reading it.
    (_REGEX_BLOCKED_RE, "regex_blocked",
     (("query", "domain"), ("answer", "answer"), ("validation", "disposition"))),
    (_CONFIG_RE, "config",
     (("query", "domain"), ("answer", "answer"))),
    (_VALID_RE, "validation",
     (("validation", "status"),)),
)


def parse_line(raw: str) -> dict | None:
    """Turn one raw dnsmasq log line into a normalized event dict.

    None is returned for empty input, for comment lines (first non-blank
    character is "#"), and for lines that do not fit the dnsmasq outer grammar.

    Otherwise the result always carries every key of the canonical-plus-event
    schema described in the module docstring. ``host`` is left as "" here.
    ``raw`` is the line exactly as passed in, and ``message`` is the text after
    the ``dnsmasq[PID]:`` prefix. When the outer grammar matches but no inner
    grammar does, the row is still returned with ``event_type`` "unknown" and
    ``query`` None, so unrecognised message shapes stay visible downstream
    rather than being dropped.
    """
    if not raw or raw.lstrip().startswith("#"):
        return None

    outer = _OUTER_RE.match(raw.strip())
    if outer is None:
        return None

    message = outer.group("message")
    ts: datetime | None = parse_timestamp(raw)

    record: dict = {
        "ts": ts,
        "src": None,
        "query": None,
        "event_type": "unknown",
        "qtype": None,
        "dst": None,
        "answer": None,
        "validation": None,
        "host": "",
        "raw": raw,
        "message": message,
    }

    for pattern, event_type, captures in _INNER_GRAMMARS:
        hit = pattern.match(message)
        if hit is None:
            continue
        record["event_type"] = event_type
        for field, group in captures:
            value = hit.group(group)
            # Free-text "answer" captures (.+) may carry trailing whitespace.
            record[field] = value.strip() if group == "answer" else value
        break

    # If nothing matched, the record is returned untouched as "unknown".
    return record
297
+
298
+
299
# NOTE(review): presumably the number of leading lines a caller should hand to
# sniff() as its sample — nothing in this module reads it; confirm at the caller.
SNIFF_PEEK_LINES: int = 32

# Event types that prove the file is a dnsmasq/Pi-hole DNS log. dhcp and
# unknown are intentionally absent — they may precede the first DNS event
# but never claim "dns" on their own. Every other event_type that parse_line()
# can emit is listed.
_DNS_BEARING_EVENT_TYPES: frozenset[str] = frozenset({
    "query", "forwarded", "reply", "cached",
    "gravity_blocked", "regex_blocked",
    "config", "validation",
    "dnssec_query", "special", "pihole_hostname",
})
310
+
311
+
312
def sniff(sample: list[str]) -> str | None:
    """Return "dns" when the sample contains a dnsmasq/Pi-hole DNS event, else None.

    Each sample line is run through ``parse_line``; the first line whose
    ``event_type`` belongs to ``_DNS_BEARING_EVENT_TYPES`` settles the answer
    and no further lines are parsed. Lines that do not parse at all, and
    dnsmasq lines of a non-DNS kind (DHCP leases, unknown chatter), are simply
    skipped — they neither confirm nor rule out the format. None is returned
    when the whole sample passes without a DNS-bearing event.

    Pure: operates on already-decoded lines and performs no I/O.
    """
    parsed_lines = (parse_line(line) for line in sample)
    found_dns_event = any(
        entry is not None and entry["event_type"] in _DNS_BEARING_EVENT_TYPES
        for entry in parsed_lines
    )
    return "dns" if found_dns_event else None