loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,150 @@
1
+ """RFC 3164 syslog line parsing — extract (ts, host, message) for detector analysis.
2
+
3
+ Provides pure parsing functions with no file I/O. File discovery and DataFrame
4
+ construction are handled by loader.py. The syslog detector operates on the
5
+ normalized output produced here via load_syslog().
6
+ """
7
+
8
+ import re
9
+ from datetime import datetime, timedelta, timezone
10
+
11
+ # ── Compiled patterns ─────────────────────────────────────────────────────────
12
+
13
# Leading RFC 3164 priority tag: "<" + decimal digits + ">" at the very start
# of the line (anchored, so only the first tag is ever removed).
PRI_RE = re.compile(r'^<\d+>')
# "Mon DD HH:MM:SS HOSTNAME " header shape. The trailing \s+ means a line that
# ends immediately after the hostname (no body, no trailing space) does not match.
SYSLOG_HDR_RE = re.compile(r'^\w{3}\s+\d{1,2}\s+\d{2}:\d{2}:\d{2}\s+\S+\s+')
# Any bracketed run of digits, anywhere in the text (not anchored to a
# program name).
PROC_PID_RE = re.compile(r'\[\d+\]')

# Program/process token at the head of a header-stripped syslog body.
# Matches the leading run of non-whitespace characters up to the first '[' or ':'.
PROGRAM_RE = re.compile(r'^[^\[:\s]+')

# Timestamp in position 0–2 after stripping PRI (month day HH:MM:SS).
# Groups: 1 = three-character month token, 2 = day (1–2 digits), 3 = HH:MM:SS.
# Shape-only: value ranges are not validated by the pattern itself.
SYSLOG_TS_RE = re.compile(r'^(\w{3})\s+(\d{1,2})\s+(\d{2}:\d{2}:\d{2})')

# Reboot signal patterns — any match triggers reboot detection in the detector.
# Compiled with re.IGNORECASE, so every alternative is case-insensitive (the
# explicit [Ss] class is therefore redundant but harmless).
REBOOT_SIGNALS_RE = re.compile(
    r'(systemd-logind.*[Ss]ystem is rebooting|'
    r'rsyslogd.*exiting on signal 15|'
    r'systemd-shutdown.*Sending SIGTERM to remaining|'
    r'kernel: Linux version\s)',
    re.IGNORECASE,
)
32
+
33
+
34
+ # ── Parsing functions ─────────────────────────────────────────────────────────
35
+
36
def parse_host(raw: str) -> str:
    """Extract the hostname from an RFC 3164 syslog line.

    The hostname is the fourth whitespace-separated field once the optional
    PRI prefix has been removed (Mon DD HH:MM:SS HOSTNAME ...).

    Returns "unknown" when the line does not start with the RFC 3164
    timestamp shape, or is too short to contain a hostname field. Requiring
    the timestamp shape keeps header-less text from having an arbitrary
    fourth word reported as its host, so callers that fall back on
    "unknown" actually get the chance to do so.
    """
    # Strip surrounding whitespace first so an indented "<PRI>" is still
    # recognised by the ^-anchored PRI pattern.
    stripped = PRI_RE.sub("", raw.strip()).strip()
    if not SYSLOG_TS_RE.match(stripped):
        # No "Mon DD HH:MM:SS" prefix: field 4 would not be a hostname.
        return "unknown"
    parts = stripped.split()
    return parts[3] if len(parts) >= 4 else "unknown"
44
+
45
+
46
def strip_header(raw: str) -> str:
    """Return the message body: PRI tag and "Mon DD HH:MM:SS host" header removed.

    Both removals are anchored at the start of the text, so only the leading
    header is dropped. The result has surrounding whitespace trimmed.
    """
    without_pri = PRI_RE.sub("", raw)
    body = SYSLOG_HDR_RE.sub("", without_pri)
    return body.strip()
50
+
51
+
52
def normalize_pids(msg: str) -> str:
    """Replace every bracketed digit run with "[*]".

    This makes e.g. sshd[1234] and sshd[5678] collapse to the same template.
    """
    collapsed = PROC_PID_RE.sub("[*]", msg)
    return collapsed
55
+
56
+
57
def parse_program(body: str) -> str:
    """Return the program/process token that opens a header-stripped body.

    The body is trimmed, then the leading run of characters that are neither
    whitespace, '[' nor ':' is taken (e.g. 'sshd', 'postfix/smtpd',
    'kernel'). When that run is empty — blank body, or a body that begins
    with '[' or ':' — the result is 'unknown'.
    """
    token = PROGRAM_RE.match(body.strip())
    if token is None:
        return "unknown"
    return token.group(0)
67
+
68
+
69
def parse_timestamp(raw: str, *, now: datetime | None = None) -> datetime | None:
    """Parse an RFC 3164 timestamp to a UTC-aware datetime.

    RFC 3164 carries no year, so the year is inferred relative to ``now``:
    the current year is tried first, and the previous year is used when the
    current-year reading is more than 7 days in the future (the log is from
    last year) or does not exist in the current year (Feb 29 read in a
    non-leap year).

    Args:
        raw: Raw syslog line, with or without a leading ``<PRI>`` tag.
        now: Timezone-aware reference instant for the year inference.
            Defaults to the current UTC time; pass a fixed value for
            reproducible results.

    Returns:
        The parsed UTC-aware datetime, or None when the line has no
        parseable timestamp or neither candidate year yields a valid,
        non-future date.
    """
    stripped = PRI_RE.sub("", raw).strip()
    m = SYSLOG_TS_RE.match(stripped)
    if not m:
        return None
    month_str, day_str, time_str = m.groups()

    # Sample the clock once so the year and the horizon cannot disagree
    # across a New Year boundary.
    if now is None:
        now = datetime.now(timezone.utc)
    horizon = now + timedelta(days=7)

    for year in (now.year, now.year - 1):
        try:
            dt = datetime.strptime(
                f"{year} {month_str} {day_str.zfill(2)} {time_str}",
                "%Y %b %d %H:%M:%S",
            ).replace(tzinfo=timezone.utc)
        except ValueError:
            # Not a real date in this candidate year (bad month/day/time, or
            # Feb 29 outside a leap year). Try the next candidate instead of
            # letting the error escape to the caller.
            continue
        if dt <= horizon:
            return dt
    return None
94
+
95
+
96
def is_reboot_signal(raw: str) -> bool:
    """Report whether any known reboot/shutdown pattern occurs in the raw line."""
    return REBOOT_SIGNALS_RE.search(raw) is not None
99
+
100
+
101
+ def parse_line(raw: str) -> dict | None:
102
+ """Parse a raw syslog line into a normalized record dict.
103
+
104
+ Returns None for blank lines and comment lines (starting with #).
105
+ Returns a dict with keys: ts (datetime | None), host (str), program (str),
106
+ raw (str), message (str). Empty message strings are preserved — the caller
107
+ decides whether to filter them.
108
+ """
109
+ if not raw or raw.lstrip().startswith("#"):
110
+ return None
111
+ body = strip_header(raw)
112
+ return {
113
+ "ts": parse_timestamp(raw),
114
+ "host": parse_host(raw),
115
+ "program": parse_program(body),
116
+ "raw": raw,
117
+ "message": normalize_pids(body),
118
+ }
119
+
120
+
121
# Line budget associated with sniff(). Not referenced inside this module —
# presumably the number of leading lines a caller passes in as `sample`
# (TODO confirm against the sniff orchestrator).
SNIFF_PEEK_LINES: int = 32
122
+
123
+
124
+ def sniff(sample: list[str]) -> str | None:
125
+ """Recognize an RFC 3164 syslog line and return "syslog".
126
+
127
+ Real-header signal — not "parse_line non-None" (which is true for any
128
+ nonblank line). Requires BOTH:
129
+
130
+ 1. After optional PRI stripping and lstrip, SYSLOG_HDR_RE matches
131
+ (Mon DD HH:MM:SS HOSTNAME header shape).
132
+ 2. parse_timestamp returns a non-None datetime (proves the leading
133
+ timestamp portion is real, not a regex coincidence).
134
+
135
+ Returns "syslog" on the first line that passes both checks. Returns
136
+ None when the budget is exhausted with no real-header line — garbage
137
+ text, prose, and blank-only samples fall through correctly.
138
+
139
+ Pure: takes already-decoded lines, performs no I/O.
140
+ """
141
+ for raw_line in sample:
142
+ if not raw_line or raw_line.lstrip().startswith("#"):
143
+ continue
144
+ stripped = PRI_RE.sub("", raw_line).lstrip()
145
+ if not SYSLOG_HDR_RE.match(stripped):
146
+ continue
147
+ if parse_timestamp(raw_line) is None:
148
+ continue
149
+ return "syslog"
150
+ return None
@@ -0,0 +1,294 @@
1
+ """Zeek NDJSON log normalization — column maps and normalize functions for conn and dns logs."""
2
+
3
+ import json
4
+
5
+ import pandas as pd
6
+
7
+ # Zeek conn log column → canonical name. Only columns that need renaming are listed.
8
+ # Columns that already have canonical names (proto, ts, conn_state, local_orig) are absent.
9
# Keys are Zeek-native conn field names, values are the canonical names they
# are renamed to.
_CONN_COLUMN_MAP: dict[str, str] = {
    "id.orig_h": "src",
    "id.resp_h": "dst",
    "id.resp_p": "port",
    "orig_bytes": "bytes",
}

# Zeek dns log → canonical DNS schema.
# Renames: TTLs→ttl, answers→answer, TC→tc, id.orig_h→src.
# rtt, rcode, and qtype are already canonical (qtype as Zeek's raw numeric
# type code, e.g. 1 = A, 28 = AAAA); qclass is filtered (aperture) and
# dropped — see _normalize_dns_df.
# Canonical minimal schema: ts, src, query.
# Canonical extended schema (nullable): qtype, rtt, ttl, rcode, answer, tc.
_DNS_COLUMN_MAP: dict[str, str] = {
    "id.orig_h": "src",
    "TTLs": "ttl",
    "answers": "answer",
    "TC": "tc",
}

# Canonical column names per log type ("conn", "dns", "syslog"). Note that
# conn's "duration" also appears in _OPTIONAL_COLUMNS below.
_REQUIRED_COLUMNS: dict[str, set[str]] = {
    "conn": {"src", "dst", "port", "proto", "ts", "duration"},
    "dns": {"src", "query", "ts"},
    "syslog": {"ts", "host", "program", "raw", "message"},
}

# Canonical but nullable fields: present in _REQUIRED_COLUMNS for documentation,
# but absent from real logs without error (e.g. Zeek omits duration for open connections).
# Add new nullable canonical fields here, not to _REQUIRED_COLUMNS alone, so
# _schema_warning never fires for expected-absent columns.
_OPTIONAL_COLUMNS: dict[str, set[str]] = {
    "conn": {"duration", "bytes", "conn_state", "local_orig"},
    "dns": {"qtype", "rtt", "ttl", "rcode", "answer", "tc"},
    # syslog extended (Zeek-only): facility/severity carried as-is from Zeek
    # (uppercase enum strings, e.g. "DAEMON" / "INFO"). The digest consumes
    # severity; the detector is source-blind.
    "syslog": {"facility", "severity"},
}
48
+
49
+
50
def _normalize_conn_df(df: pd.DataFrame) -> pd.DataFrame:
    """Map Zeek conn column names onto the canonical schema.

    Only columns actually present in ``df`` are renamed. When none of the
    mapped Zeek names is present, the input frame itself is returned.
    """
    present = {
        zeek_name: canonical
        for zeek_name, canonical in _CONN_COLUMN_MAP.items()
        if zeek_name in df.columns
    }
    if not present:
        return df
    return df.rename(columns=present)
54
+
55
+
56
def _normalize_zeek_syslog_df(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize Zeek syslog.log to the canonical fidelity-aware syslog schema.

    Minimal (always present on the happy path; v1-required):
        ts, host, program, raw, message
    Extended (Zeek-only, nullable):
        facility, severity — uppercase enum strings (e.g. "DAEMON", "INFO"),
        carried as-is for consumer interpretation. The digest reads severity
        (error-set {EMERG, ALERT, CRIT, ERR}); the detector is source-blind
        and never references either column.

    Per-row derivation (happy path):
        raw     = Zeek `message` with trailing CR/LF removed (drives finding
                  title); a null `message` becomes the empty string
        host    = embedded RFC 3164 hostname via parse_host(raw); falls back
                  to Zeek `id.orig_h` when parse_host returns "unknown"
        program = parse_program(strip_header(raw))
        message = normalize_pids(strip_header(raw))   # canonical, drain3-aligned
        ts      = Zeek ts, carried through unchanged

    Malformed-frame path: when input lacks `message`, the normalizer does
    NOT synthesize message/raw/program just to satisfy shape — that would
    paint a confident-but-empty card. The output frame omits the columns
    that cannot be derived; loader._schema_warning then fires the
    actionable "syslog.log fields not found" warning.

    Drops uid/id.orig_p/id.resp_h/id.resp_p/proto and id.orig_h (the latter
    after being consumed as the host fallback). Reuses the RFC 3164 helpers
    in parsers/syslog.py so the doubled-timestamp invariant (^-anchored
    strip_header strips only the leading transport header) holds for both
    feeds.
    """
    from loghunter.parsers.syslog import (
        normalize_pids,
        parse_host,
        parse_program,
        strip_header,
    )

    drop_cols = {"uid", "id.orig_h", "id.orig_p",
                 "id.resp_h", "id.resp_p", "proto"}

    if "message" not in df.columns:
        # Honesty rail: preserve absence so _schema_warning fires.
        keep = [c for c in df.columns if c not in drop_cols]
        return df[keep].copy() if keep else df.copy()

    # Null messages (a record lacking `message` inside a frame where other
    # records have it) must not be stringified: astype(str) would turn them
    # into the literal text "nan"/"None" — or keep them null on pandas
    # versions whose string dtype preserves missing values, which then
    # breaks the regex helpers below. Map them to "" instead; empty messages
    # are preserved and left for the caller to filter.
    messages = df["message"]
    # Narrow trailing-line-terminator strip: Zeek's NDJSON `message` field can
    # carry the upstream record's trailing "\r"/"\n". The detector uses raw as
    # a single-line finding title; an embedded trailing "\n" then renders as a
    # blank spacer row beneath the finding. Mirrors flat load_syslog's
    # `line.rstrip("\n")` at the file-line boundary.
    # str.rstrip("\r\n") treats the arg as a CHARSET, so any mix of trailing
    # CR/LF is removed; embedded mid-line newlines survive verbatim.
    raw = (
        messages.astype(str)
        .where(messages.notna(), "")
        .str.rstrip("\r\n")
    )
    stripped = raw.map(strip_header)

    embedded_host = raw.map(parse_host)
    if "id.orig_h" in df.columns:
        # Keep the embedded hostname where one was found; otherwise use the
        # sender address Zeek recorded for the connection.
        host = embedded_host.where(embedded_host != "unknown", df["id.orig_h"])
    else:
        host = embedded_host

    out = pd.DataFrame({
        "ts": df["ts"] if "ts" in df.columns else pd.Series(dtype="float64"),
        "host": host,
        "program": stripped.map(parse_program),
        "raw": raw,
        "message": stripped.map(normalize_pids),
    })
    # Extended columns are copied positionally (.values) and only when present.
    if "facility" in df.columns:
        out["facility"] = df["facility"].values
    if "severity" in df.columns:
        out["severity"] = df["severity"].values

    return out
134
+
135
+
136
def _normalize_dns_df(df: pd.DataFrame) -> pd.DataFrame:
    """Bring a Zeek dns.log frame onto the canonical DNS schema.

    Steps:
      1. Rename whichever of TTLs→ttl, answers→answer, TC→tc, id.orig_h→src
         are present.
      2. If a qclass column exists, keep only internet-class rows
         (qclass == 1) and then drop the qclass column.

    qtype is passed through untouched as Zeek's numeric type code (e.g.
    1 = A, 28 = AAAA); mapping to mnemonics is left to consumers.
    """
    present = {
        zeek_name: canonical
        for zeek_name, canonical in _DNS_COLUMN_MAP.items()
        if zeek_name in df.columns
    }
    frame = df.rename(columns=present) if present else df

    if "qclass" not in frame.columns:
        return frame

    # The equality test is False for null qclass, so those rows go too.
    internet_class = frame["qclass"] == 1
    frame = frame[internet_class]
    return frame.drop(columns=["qclass"])
153
+
154
+
155
# Line budget associated with sniff(). Not referenced inside this module —
# presumably the number of leading lines a caller passes in as `sample`
# (TODO confirm against the sniff orchestrator).
SNIFF_PEEK_LINES: int = 4
156
+
157
+
158
+ def _has_rename_collision(keys, column_map: dict[str, str]) -> bool:
159
+ """True iff any (zeek_key, canonical) pair in column_map has BOTH the
160
+ zeek key AND its canonical rename target present in `keys`.
161
+
162
+ A clean Zeek conn/dns NDJSON without `_path` carries the `id.*` keys
163
+ and never a native `src`/`dst`/`port`. A record carrying both halves
164
+ of any rename pair (e.g. `id.orig_h` AND `src`) would produce a
165
+ duplicate canonical column when the loader's rename runs, which then
166
+ crashes the downstream summariser — so the record is not a clean
167
+ conn/dns and the field-set fallback must not claim it.
168
+ """
169
+ return any(z in keys and c in keys for z, c in column_map.items())
170
+
171
+
172
+ def sniff(sample: list[str]) -> str | None:
173
+ """Recognize a Zeek NDJSON conn or dns line and return its digester target.
174
+
175
+ Parses the first non-empty line of ``sample`` as JSON and inspects its
176
+ keys. Recognition proceeds in two layers:
177
+
178
+ 1. **``_path`` gate (Zeek-native).** When the parsed dict carries the
179
+ Zeek ``_path`` directive (Zeek's own per-log-type tag, e.g. ``conn``,
180
+ ``dns``, ``syslog``, ``notice``, ``analyzer``, …), trust it directly:
181
+ ``_path == "conn"`` → ``"conn"``; ``_path == "dns"`` → ``"dns"``;
182
+ ``_path == "syslog"`` → ``"syslog"``; any other value → ``None`` (we
183
+ do not have a digester for that log type — fall to the blob floor).
184
+ This is the NDJSON twin of the TSV ``#path`` gate in
185
+ ``zeek_tsv.sniff``; non-claimable Zeek logs (notice.log,
186
+ analyzer.log) carry the 5-tuple as connection context but are NOT
187
+ conn frames and must not be claimed as such.
188
+
189
+ 2. **Field-set fallback (Zeek NDJSON without ``_path``, hand-rolled
190
+ NDJSON).** When ``_path`` is absent, fall through to field-set tests
191
+ in this fixed order:
192
+
193
+ a. **dns** when the line carries the DNS key set (``query`` +
194
+ ``src``/``id.orig_h`` + ``ts``).
195
+ b. **syslog** when the line carries facility + severity + message
196
+ + ts + ``src``/``id.orig_h``. The three syslog-specific keys
197
+ (facility/severity/message) together are a tight signature —
198
+ neither ``notice.log`` nor ``analyzer.log`` carries that
199
+ triple, so the false-claim risk from sharing the 5-tuple does
200
+ NOT recur. Required to sit BEFORE the conn fallback: a Zeek
201
+ syslog.log emitted without ``_path`` carries the 5-tuple in
202
+ addition to the syslog fields, and the conn fallback would
203
+ otherwise claim it.
204
+ c. **conn** when it carries the conn key set (src/dst/port/proto/
205
+ ts via either native or canonical names) AND ``query`` is
206
+ absent — "no query" is the explicit disambiguator from DNS.
207
+
208
+ Returns None when none of the key sets matches.
209
+
210
+ Returns None for non-JSON, JSON that is not a dict, and dicts lacking
211
+ either signal.
212
+
213
+ ``duration`` is NOT required for conn — it is optional (Zeek omits it
214
+ for open connections); see _OPTIONAL_COLUMNS.
215
+
216
+ Pure: takes already-decoded lines, performs no I/O.
217
+ """
218
+ for raw_line in sample:
219
+ line = raw_line.strip()
220
+ if not line:
221
+ continue
222
+ try:
223
+ obj = json.loads(line)
224
+ except (json.JSONDecodeError, ValueError):
225
+ return None
226
+ if not isinstance(obj, dict):
227
+ return None
228
+ keys = obj.keys()
229
+
230
+ # Layer 1: _path gate — Zeek emits this on every native log line.
231
+ # Trust it directly and reject anything that isn't conn or dns.
232
+ if "_path" in keys:
233
+ path = obj.get("_path")
234
+ if path == "conn":
235
+ return "conn"
236
+ if path == "dns":
237
+ return "dns"
238
+ if path == "syslog":
239
+ return "syslog"
240
+ return None
241
+
242
+ # Layer 2: field-set fallback for Zeek NDJSON emitted without _path
243
+ # and for hand-rolled non-Zeek NDJSON.
244
+ has_src = "src" in keys or "id.orig_h" in keys
245
+ has_ts = "ts" in keys
246
+
247
+ # 2a. dns: query is the disambiguator. Rejected when the record
248
+ # also carries a Zeek-native key whose canonical rename target is
249
+ # already present (e.g. id.orig_h + native src) — that collision
250
+ # would crash the dns summariser at rename time.
251
+ if (
252
+ has_src
253
+ and has_ts
254
+ and "query" in keys
255
+ and not _has_rename_collision(keys, _DNS_COLUMN_MAP)
256
+ ):
257
+ return "dns"
258
+
259
+ # 2b. syslog: facility + severity + message form a tight Zeek-syslog
260
+ # signature. MUST sit before the conn fallback — Zeek syslog.log
261
+ # without `_path` carries the 5-tuple alongside the syslog fields,
262
+ # so the conn fallback would otherwise claim it as conn. Notice and
263
+ # analyzer logs DO NOT carry the (facility, severity, message)
264
+ # triple, so this does not reopen the notice/analyzer false-claim.
265
+ if (
266
+ has_src
267
+ and has_ts
268
+ and "facility" in keys
269
+ and "severity" in keys
270
+ and "message" in keys
271
+ ):
272
+ return "syslog"
273
+
274
+ # 2c. conn: full 5-tuple, no query. Rejected when the record
275
+ # also carries a Zeek-native key whose canonical rename target
276
+ # is already present (e.g. id.orig_h + native src, the
277
+ # notice.log shape) — that collision would crash the conn
278
+ # summariser with the "Grouper for 'src' not 1-dimensional"
279
+ # pandas error.
280
+ has_dst = "dst" in keys or "id.resp_h" in keys
281
+ has_port = "port" in keys or "id.resp_p" in keys
282
+ has_proto = "proto" in keys
283
+ if (
284
+ has_src
285
+ and has_dst
286
+ and has_port
287
+ and has_proto
288
+ and has_ts
289
+ and "query" not in keys
290
+ and not _has_rename_collision(keys, _CONN_COLUMN_MAP)
291
+ ):
292
+ return "conn"
293
+ return None
294
+ return None