loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
"""RFC 3164 syslog line parsing — extract (ts, host, message) for detector analysis.
|
|
2
|
+
|
|
3
|
+
Provides pure parsing functions with no file I/O. File discovery and DataFrame
|
|
4
|
+
construction are handled by loader.py. The syslog detector operates on the
|
|
5
|
+
normalized output produced here via load_syslog().
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import re
|
|
9
|
+
from datetime import datetime, timedelta, timezone
|
|
10
|
+
|
|
11
|
+
# ── Compiled patterns ─────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
# Leading RFC 3164 priority tag, e.g. "<34>", anchored at the start of the line.
PRI_RE = re.compile(r"^<\d+>")

# Transport header once the PRI tag is gone: three word characters (month),
# a 1-2 digit day, HH:MM:SS, one whitespace-free token (hostname), and the
# whitespace that follows it.
SYSLOG_HDR_RE = re.compile(r"^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\S+\s+")

# Bracketed run of digits anywhere in a message, e.g. the "[1234]" in "sshd[1234]".
PROC_PID_RE = re.compile(r"\[\d+\]")

# Program/process token at the head of a header-stripped syslog body:
# the leading characters up to (not including) the first '[', ':' or whitespace.
PROGRAM_RE = re.compile(r"^[^\[:\s]+")

# Timestamp at the start of a PRI-stripped line, captured as three groups:
# (month abbreviation, day of month, HH:MM:SS).
SYSLOG_TS_RE = re.compile(r"^(\w{3})\s+(\d{1,2})\s+(\d{2}:\d{2}:\d{2})")

# Reboot/shutdown signatures. A match on any one alternative anywhere in the
# line counts as a reboot signal. The whole pattern is case-insensitive.
REBOOT_SIGNALS_RE = re.compile(
    r"(systemd-logind.*[Ss]ystem is rebooting|"
    r"rsyslogd.*exiting on signal 15|"
    r"systemd-shutdown.*Sending SIGTERM to remaining|"
    r"kernel: Linux version\s)",
    flags=re.IGNORECASE,
)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# ── Parsing functions ─────────────────────────────────────────────────────────
|
|
35
|
+
|
|
36
|
+
def parse_host(raw: str) -> str:
    """Return the hostname token of an RFC 3164 syslog line.

    Any leading PRI tag is removed, the remainder is split on whitespace,
    and the fourth token (the one following month, day and time) is
    returned. The first three tokens are not checked for timestamp shape,
    so any line with four or more tokens yields its fourth token.

    Returns "unknown" when the line has fewer than four tokens.
    """
    tokens = PRI_RE.sub("", raw).split()
    if len(tokens) < 4:
        return "unknown"
    return tokens[3]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def strip_header(raw: str) -> str:
    """Return the message body of an RFC 3164 line.

    Removes the leading PRI tag, then the "Mon DD HH:MM:SS hostname " header
    if one is present at the start, and finally trims surrounding whitespace.
    Both patterns are start-anchored, so text further into the line is
    never touched.
    """
    without_pri = PRI_RE.sub("", raw)
    body = SYSLOG_HDR_RE.sub("", without_pri)
    return body.strip()
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def normalize_pids(msg: str) -> str:
    """Replace every bracketed numeric PID with "[*]".

    This lets messages that differ only by PID (sshd[1234] vs sshd[5678])
    collapse to the same text.
    """
    masked = re.sub(PROC_PID_RE, "[*]", msg)
    return masked
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def parse_program(body: str) -> str:
    """Return the program/process token that opens a header-stripped body.

    The body is trimmed, then the leading characters up to the first '[',
    ':' or whitespace are taken (e.g. 'sshd', 'postfix/smtpd', 'kernel').

    Returns 'unknown' when there is no such token: the trimmed body is
    empty, or it begins with '[' or ':'.
    """
    found = PROGRAM_RE.match(body.strip())
    if found is None:
        return "unknown"
    return found.group(0)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def parse_timestamp(raw: str) -> datetime | None:
    """Parse an RFC 3164 timestamp to a UTC-aware datetime.

    RFC 3164 carries no year, so the year is inferred. The current UTC year
    is tried first; if that date does not exist (e.g. "Feb 29" in a non-leap
    year) or lies more than 7 days in the future, the previous year is
    tried instead. The first candidate that is a real calendar date and is
    not beyond the 7-day horizon is returned.

    Returns None when the line has no leading timestamp, when the fields do
    not form a valid date/time, or when neither candidate year can hold the
    date. This function never raises for malformed input.

    NOTE(review): the month is parsed with strptime's %b, which follows the
    process locale — confirm the tool always runs under a C/English locale.
    """
    stripped = PRI_RE.sub("", raw).strip()
    m = SYSLOG_TS_RE.match(stripped)
    if not m:
        return None
    month_str, day_str, time_str = m.groups()

    # Sample the clock once so the year guess and the future-cutoff agree
    # even when the call straddles a year boundary.
    now = datetime.now(timezone.utc)
    horizon = now + timedelta(days=7)

    for year in (now.year, now.year - 1):
        try:
            dt = datetime.strptime(
                f"{year} {month_str} {day_str.zfill(2)} {time_str}",
                "%Y %b %d %H:%M:%S",
            ).replace(tzinfo=timezone.utc)
        except ValueError:
            # Not a real date in this year (bad field, or Feb 29 in a
            # non-leap year) — fall through to the next candidate.
            continue
        if dt <= horizon:
            return dt
    return None
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def is_reboot_signal(raw: str) -> bool:
    """Report whether any known reboot/shutdown signature occurs in the raw line."""
    return REBOOT_SIGNALS_RE.search(raw) is not None
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def parse_line(raw: str) -> dict | None:
|
|
102
|
+
"""Parse a raw syslog line into a normalized record dict.
|
|
103
|
+
|
|
104
|
+
Returns None for blank lines and comment lines (starting with #).
|
|
105
|
+
Returns a dict with keys: ts (datetime | None), host (str), program (str),
|
|
106
|
+
raw (str), message (str). Empty message strings are preserved — the caller
|
|
107
|
+
decides whether to filter them.
|
|
108
|
+
"""
|
|
109
|
+
if not raw or raw.lstrip().startswith("#"):
|
|
110
|
+
return None
|
|
111
|
+
body = strip_header(raw)
|
|
112
|
+
return {
|
|
113
|
+
"ts": parse_timestamp(raw),
|
|
114
|
+
"host": parse_host(raw),
|
|
115
|
+
"program": parse_program(body),
|
|
116
|
+
"raw": raw,
|
|
117
|
+
"message": normalize_pids(body),
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
# Line budget associated with this module's sniff(). It is not referenced
# anywhere in this file — presumably the caller that builds the sample reads
# it to decide how many leading lines to pass in (TODO confirm).
SNIFF_PEEK_LINES: int = 32
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def sniff(sample: list[str]) -> str | None:
|
|
125
|
+
"""Recognize an RFC 3164 syslog line and return "syslog".
|
|
126
|
+
|
|
127
|
+
Real-header signal — not "parse_line non-None" (which is true for any
|
|
128
|
+
nonblank line). Requires BOTH:
|
|
129
|
+
|
|
130
|
+
1. After optional PRI stripping and lstrip, SYSLOG_HDR_RE matches
|
|
131
|
+
(Mon DD HH:MM:SS HOSTNAME header shape).
|
|
132
|
+
2. parse_timestamp returns a non-None datetime (proves the leading
|
|
133
|
+
timestamp portion is real, not a regex coincidence).
|
|
134
|
+
|
|
135
|
+
Returns "syslog" on the first line that passes both checks. Returns
|
|
136
|
+
None when the budget is exhausted with no real-header line — garbage
|
|
137
|
+
text, prose, and blank-only samples fall through correctly.
|
|
138
|
+
|
|
139
|
+
Pure: takes already-decoded lines, performs no I/O.
|
|
140
|
+
"""
|
|
141
|
+
for raw_line in sample:
|
|
142
|
+
if not raw_line or raw_line.lstrip().startswith("#"):
|
|
143
|
+
continue
|
|
144
|
+
stripped = PRI_RE.sub("", raw_line).lstrip()
|
|
145
|
+
if not SYSLOG_HDR_RE.match(stripped):
|
|
146
|
+
continue
|
|
147
|
+
if parse_timestamp(raw_line) is None:
|
|
148
|
+
continue
|
|
149
|
+
return "syslog"
|
|
150
|
+
return None
|
|
@@ -0,0 +1,294 @@
|
|
|
1
|
+
"""Zeek NDJSON log normalization — column maps and normalize functions for conn and dns logs."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
# Zeek conn log column -> canonical name. Only columns that need renaming
# appear here; proto, ts, conn_state and local_orig already carry their
# canonical names and are therefore absent.
_CONN_COLUMN_MAP: dict[str, str] = {
    "id.orig_h": "src",
    "id.resp_h": "dst",
    "id.resp_p": "port",
    "orig_bytes": "bytes",
}

# Zeek dns log column -> canonical DNS schema name.
#   Renamed:   id.orig_h -> src, TTLs -> ttl, answers -> answer, TC -> tc.
#   Unchanged: rtt, rcode, qtype (qtype stays Zeek's raw numeric type code,
#              e.g. 1 = A, 28 = AAAA).
#   qclass is used as a filter and then dropped — see _normalize_dns_df.
# Canonical minimal schema:             ts, src, query.
# Canonical extended schema (nullable): qtype, rtt, ttl, rcode, answer, tc.
_DNS_COLUMN_MAP: dict[str, str] = {
    "id.orig_h": "src",
    "TTLs": "ttl",
    "answers": "answer",
    "TC": "tc",
}

# Canonical column names per log kind.
_REQUIRED_COLUMNS: dict[str, set[str]] = {
    "conn": {"src", "dst", "port", "proto", "ts", "duration"},
    "dns": {"src", "query", "ts"},
    "syslog": {"ts", "host", "program", "raw", "message"},
}

# Canonical but nullable columns. A name may also appear in
# _REQUIRED_COLUMNS for documentation purposes, yet real logs may lack it
# without that being an error (e.g. Zeek omits duration for open
# connections). New nullable canonical fields must be added HERE, not only
# to _REQUIRED_COLUMNS, so that _schema_warning does not fire for columns
# that are expected to be absent.
_OPTIONAL_COLUMNS: dict[str, set[str]] = {
    "conn": {"duration", "bytes", "conn_state", "local_orig"},
    "dns": {"qtype", "rtt", "ttl", "rcode", "answer", "tc"},
    # Zeek-only syslog extension: facility/severity are carried through
    # exactly as Zeek emits them (uppercase enum strings such as "DAEMON" /
    # "INFO"). The digest consumes severity; the detector is source-blind.
    "syslog": {"facility", "severity"},
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _normalize_conn_df(df: pd.DataFrame) -> pd.DataFrame:
    """Rename Zeek conn columns to their canonical names.

    Only mapping entries whose Zeek column is actually present are applied.
    When none are present the input frame itself is returned unchanged.
    """
    present = {
        zeek_name: canonical
        for zeek_name, canonical in _CONN_COLUMN_MAP.items()
        if zeek_name in df.columns
    }
    if not present:
        return df
    return df.rename(columns=present)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _normalize_zeek_syslog_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize a Zeek syslog.log frame to the canonical syslog schema.

    Output columns on the happy path (input has a `message` column):
        ts        Zeek `ts` as-is (an empty float64 series if `ts` is absent)
        host      parse_host(raw); where that yields "unknown" and the input
                  has `id.orig_h`, the `id.orig_h` value is used instead
        program   parse_program(strip_header(raw))
        raw       Zeek `message` cast to str, trailing CR/LF removed
        message   normalize_pids(strip_header(raw))
        facility, severity
                  copied through unchanged, each only if present in the input
                  (Zeek-only, nullable; uppercase enum strings such as
                  "DAEMON" / "INFO")

    No other input column is carried over, so uid, id.orig_p, id.resp_h,
    id.resp_p, proto and id.orig_h (after serving as the host fallback) are
    dropped.

    Malformed-frame path (no `message` column): nothing is synthesized.
    A copy of the input minus the Zeek connection-context columns is
    returned, so the canonical columns stay absent and the loader's schema
    warning can report them. If removing those columns would leave nothing,
    a plain copy of the input is returned.

    The RFC 3164 helpers from parsers/syslog.py are reused so that header
    stripping (start-anchored, leading header only) behaves the same for
    this feed as for flat syslog files.
    """
    from loghunter.parsers.syslog import (
        normalize_pids,
        parse_host,
        parse_program,
        strip_header,
    )

    discard = {"uid", "id.orig_h", "id.orig_p",
               "id.resp_h", "id.resp_p", "proto"}
    columns = df.columns

    if "message" not in columns:
        # Keep the absence visible rather than fabricating empty columns.
        survivors = [name for name in columns if name not in discard]
        if not survivors:
            return df.copy()
        return df[survivors].copy()

    # Zeek's NDJSON `message` can carry the upstream record's trailing
    # "\r"/"\n". `raw` is used downstream as a single-line title, where a
    # trailing newline would render as a blank row, so trailing line
    # terminators are removed here once. rstrip treats "\r\n" as a character
    # set: any trailing mix of CR/LF goes, embedded newlines are untouched.
    raw = df["message"].astype(str).str.rstrip("\r\n")
    body = raw.map(strip_header)

    host = raw.map(parse_host)
    if "id.orig_h" in columns:
        # Fall back to the sender address only where no hostname was parsed.
        host = host.where(host != "unknown", df["id.orig_h"])

    if "ts" in columns:
        ts = df["ts"]
    else:
        ts = pd.Series(dtype="float64")

    out = pd.DataFrame({
        "ts": ts,
        "host": host,
        "program": body.map(parse_program),
        "raw": raw,
        "message": body.map(normalize_pids),
    })

    # Positional copy (.values) of the optional Zeek-only columns.
    for extra in ("facility", "severity"):
        if extra in columns:
            out[extra] = df[extra].values

    return out
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _normalize_dns_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize a Zeek dns.log frame to the canonical DNS schema.

    Steps:
      1. Rename whichever of id.orig_h/TTLs/answers/TC are present to
         src/ttl/answer/tc.
      2. If a `qclass` column exists, keep only rows where qclass == 1
         (internet class) and then drop the column. Rows with a null qclass
         fail the equality test and are dropped too.

    qtype is passed through untouched as Zeek's raw numeric type code
    (e.g. 1 = A, 28 = AAAA); mapping to mnemonics is left to consumers.
    """
    present = {
        zeek_name: canonical
        for zeek_name, canonical in _DNS_COLUMN_MAP.items()
        if zeek_name in df.columns
    }
    if present:
        df = df.rename(columns=present)

    if "qclass" not in df.columns:
        return df

    internet_only = df[df["qclass"] == 1]
    return internet_only.drop(columns=["qclass"])
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# Line budget associated with this module's sniff(). It is not referenced
# anywhere in this file — presumably the caller that builds the sample reads
# it to decide how many leading lines to pass in (TODO confirm).
SNIFF_PEEK_LINES: int = 4
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _has_rename_collision(keys, column_map: dict[str, str]) -> bool:
    """Report whether `keys` holds both halves of any rename pair.

    Returns True iff, for at least one (zeek_key, canonical) entry of
    `column_map`, the Zeek key AND its canonical rename target are both
    members of `keys`.

    Why it matters: a clean Zeek conn/dns record without `_path` carries
    the `id.*` keys and never a native `src`/`dst`/`port`. A record with
    both halves of a pair (e.g. `id.orig_h` AND `src`) would end up with a
    duplicated canonical column once the rename is applied, which breaks
    the downstream summariser. Such a record is not a clean conn/dns frame
    and the field-set fallback must not claim it.
    """
    for zeek_key, canonical in column_map.items():
        if zeek_key in keys and canonical in keys:
            return True
    return False
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def sniff(sample: list[str]) -> str | None:
|
|
173
|
+
"""Recognize a Zeek NDJSON conn or dns line and return its digester target.
|
|
174
|
+
|
|
175
|
+
Parses the first non-empty line of ``sample`` as JSON and inspects its
|
|
176
|
+
keys. Recognition proceeds in two layers:
|
|
177
|
+
|
|
178
|
+
1. **``_path`` gate (Zeek-native).** When the parsed dict carries the
|
|
179
|
+
Zeek ``_path`` directive (Zeek's own per-log-type tag, e.g. ``conn``,
|
|
180
|
+
``dns``, ``syslog``, ``notice``, ``analyzer``, …), trust it directly:
|
|
181
|
+
``_path == "conn"`` → ``"conn"``; ``_path == "dns"`` → ``"dns"``;
|
|
182
|
+
``_path == "syslog"`` → ``"syslog"``; any other value → ``None`` (we
|
|
183
|
+
do not have a digester for that log type — fall to the blob floor).
|
|
184
|
+
This is the NDJSON twin of the TSV ``#path`` gate in
|
|
185
|
+
``zeek_tsv.sniff``; non-claimable Zeek logs (notice.log,
|
|
186
|
+
analyzer.log) carry the 5-tuple as connection context but are NOT
|
|
187
|
+
conn frames and must not be claimed as such.
|
|
188
|
+
|
|
189
|
+
2. **Field-set fallback (Zeek NDJSON without ``_path``, hand-rolled
|
|
190
|
+
NDJSON).** When ``_path`` is absent, fall through to field-set tests
|
|
191
|
+
in this fixed order:
|
|
192
|
+
|
|
193
|
+
a. **dns** when the line carries the DNS key set (``query`` +
|
|
194
|
+
``src``/``id.orig_h`` + ``ts``).
|
|
195
|
+
b. **syslog** when the line carries facility + severity + message
|
|
196
|
+
+ ts + ``src``/``id.orig_h``. The three syslog-specific keys
|
|
197
|
+
(facility/severity/message) together are a tight signature —
|
|
198
|
+
neither ``notice.log`` nor ``analyzer.log`` carries that
|
|
199
|
+
triple, so the false-claim risk from sharing the 5-tuple does
|
|
200
|
+
NOT recur. Required to sit BEFORE the conn fallback: a Zeek
|
|
201
|
+
syslog.log emitted without ``_path`` carries the 5-tuple in
|
|
202
|
+
addition to the syslog fields, and the conn fallback would
|
|
203
|
+
otherwise claim it.
|
|
204
|
+
c. **conn** when it carries the conn key set (src/dst/port/proto/
|
|
205
|
+
ts via either native or canonical names) AND ``query`` is
|
|
206
|
+
absent — "no query" is the explicit disambiguator from DNS.
|
|
207
|
+
|
|
208
|
+
Returns None when none of the key sets matches.
|
|
209
|
+
|
|
210
|
+
Returns None for non-JSON, JSON that is not a dict, and dicts lacking
|
|
211
|
+
either signal.
|
|
212
|
+
|
|
213
|
+
``duration`` is NOT required for conn — it is optional (Zeek omits it
|
|
214
|
+
for open connections); see _OPTIONAL_COLUMNS.
|
|
215
|
+
|
|
216
|
+
Pure: takes already-decoded lines, performs no I/O.
|
|
217
|
+
"""
|
|
218
|
+
for raw_line in sample:
|
|
219
|
+
line = raw_line.strip()
|
|
220
|
+
if not line:
|
|
221
|
+
continue
|
|
222
|
+
try:
|
|
223
|
+
obj = json.loads(line)
|
|
224
|
+
except (json.JSONDecodeError, ValueError):
|
|
225
|
+
return None
|
|
226
|
+
if not isinstance(obj, dict):
|
|
227
|
+
return None
|
|
228
|
+
keys = obj.keys()
|
|
229
|
+
|
|
230
|
+
# Layer 1: _path gate — Zeek emits this on every native log line.
|
|
231
|
+
# Trust it directly and reject anything that isn't conn or dns.
|
|
232
|
+
if "_path" in keys:
|
|
233
|
+
path = obj.get("_path")
|
|
234
|
+
if path == "conn":
|
|
235
|
+
return "conn"
|
|
236
|
+
if path == "dns":
|
|
237
|
+
return "dns"
|
|
238
|
+
if path == "syslog":
|
|
239
|
+
return "syslog"
|
|
240
|
+
return None
|
|
241
|
+
|
|
242
|
+
# Layer 2: field-set fallback for Zeek NDJSON emitted without _path
|
|
243
|
+
# and for hand-rolled non-Zeek NDJSON.
|
|
244
|
+
has_src = "src" in keys or "id.orig_h" in keys
|
|
245
|
+
has_ts = "ts" in keys
|
|
246
|
+
|
|
247
|
+
# 2a. dns: query is the disambiguator. Rejected when the record
|
|
248
|
+
# also carries a Zeek-native key whose canonical rename target is
|
|
249
|
+
# already present (e.g. id.orig_h + native src) — that collision
|
|
250
|
+
# would crash the dns summariser at rename time.
|
|
251
|
+
if (
|
|
252
|
+
has_src
|
|
253
|
+
and has_ts
|
|
254
|
+
and "query" in keys
|
|
255
|
+
and not _has_rename_collision(keys, _DNS_COLUMN_MAP)
|
|
256
|
+
):
|
|
257
|
+
return "dns"
|
|
258
|
+
|
|
259
|
+
# 2b. syslog: facility + severity + message form a tight Zeek-syslog
|
|
260
|
+
# signature. MUST sit before the conn fallback — Zeek syslog.log
|
|
261
|
+
# without `_path` carries the 5-tuple alongside the syslog fields,
|
|
262
|
+
# so the conn fallback would otherwise claim it as conn. Notice and
|
|
263
|
+
# analyzer logs DO NOT carry the (facility, severity, message)
|
|
264
|
+
# triple, so this does not reopen the notice/analyzer false-claim.
|
|
265
|
+
if (
|
|
266
|
+
has_src
|
|
267
|
+
and has_ts
|
|
268
|
+
and "facility" in keys
|
|
269
|
+
and "severity" in keys
|
|
270
|
+
and "message" in keys
|
|
271
|
+
):
|
|
272
|
+
return "syslog"
|
|
273
|
+
|
|
274
|
+
# 2c. conn: full 5-tuple, no query. Rejected when the record
|
|
275
|
+
# also carries a Zeek-native key whose canonical rename target
|
|
276
|
+
# is already present (e.g. id.orig_h + native src, the
|
|
277
|
+
# notice.log shape) — that collision would crash the conn
|
|
278
|
+
# summariser with the "Grouper for 'src' not 1-dimensional"
|
|
279
|
+
# pandas error.
|
|
280
|
+
has_dst = "dst" in keys or "id.resp_h" in keys
|
|
281
|
+
has_port = "port" in keys or "id.resp_p" in keys
|
|
282
|
+
has_proto = "proto" in keys
|
|
283
|
+
if (
|
|
284
|
+
has_src
|
|
285
|
+
and has_dst
|
|
286
|
+
and has_port
|
|
287
|
+
and has_proto
|
|
288
|
+
and has_ts
|
|
289
|
+
and "query" not in keys
|
|
290
|
+
and not _has_rename_collision(keys, _CONN_COLUMN_MAP)
|
|
291
|
+
):
|
|
292
|
+
return "conn"
|
|
293
|
+
return None
|
|
294
|
+
return None
|