loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
|
@@ -0,0 +1,580 @@
|
|
|
1
|
+
"""Parity, directive-behavior, and smoke tests for loghunter.parsers.zeek_tsv.
|
|
2
|
+
|
|
3
|
+
All fixture data is hand-authored using RFC 5737 documentation IP space
|
|
4
|
+
(192.0.2.x, 198.51.100.x, 203.0.113.x). No real network data anywhere.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import math
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
from loghunter.parsers.zeek import _normalize_conn_df, _normalize_dns_df
|
|
16
|
+
from loghunter.parsers.zeek_tsv import parse_tsv_log
|
|
17
|
+
|
|
18
|
+
# ── Fixture constants ─────────────────────────────────────────────────────────
|
|
19
|
+
#
|
|
20
|
+
# Every fixture is a raw string to be passed as splitlines(keepends=True).
|
|
21
|
+
# Tab characters are written as literal tabs. RFC 5737 IPs throughout.
|
|
22
|
+
|
|
23
|
+
# Zeek conn log in TSV form: full directive header, two data rows (uids
# CTest01 and CTest02), and a trailing #close line.
_CONN_TSV = (
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tconn\n"
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
    "\tproto\tservice\tduration\torig_bytes\tresp_bytes"
    "\tconn_state\tlocal_orig\tlocal_resp\ttunnel_parents\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport"
    "\tenum\tstring\tinterval\tcount\tcount"
    "\tstring\tbool\tbool\tset[string]\n"
    # Row A: duration present, service present, local_orig=T, tunnel_parents=(empty)
    "1748649600.000000\tCTest01\t192.0.2.10\t51514\t203.0.113.20\t443"
    "\ttcp\tssl\t3.5\t1500\t8200\tSF\tT\tF\t(empty)\n"
    # Row B: duration unset, service unset, local_orig=F, tunnel_parents unset
    "1748649660.000000\tCTest02\t198.51.100.1\t54321\t192.0.2.20\t22"
    "\ttcp\t-\t-\t0\t0\tS0\tF\tF\t-\n"
    "#close\t2026-01-01-00:00:00\n"
)
|
|
43
|
+
|
|
44
|
+
# Equivalent events in NDJSON. Absent keys mirror TSV unset tokens.
|
|
45
|
+
# One JSON object per line, in the same order as the _CONN_TSV rows
# (CTest01, then CTest02). CTest02 omits service, duration, and
# tunnel_parents entirely.
_CONN_NDJSON = (
    '{"ts":1748649600.0,"uid":"CTest01","id.orig_h":"192.0.2.10","id.orig_p":51514,'
    '"id.resp_h":"203.0.113.20","id.resp_p":443,"proto":"tcp","service":"ssl",'
    '"duration":3.5,"orig_bytes":1500,"resp_bytes":8200,"conn_state":"SF",'
    '"local_orig":true,"local_resp":false,"tunnel_parents":[]}\n'
    '{"ts":1748649660.0,"uid":"CTest02","id.orig_h":"198.51.100.1","id.orig_p":54321,'
    '"id.resp_h":"192.0.2.20","id.resp_p":22,"proto":"tcp",'
    '"orig_bytes":0,"resp_bytes":0,"conn_state":"S0",'
    '"local_orig":false,"local_resp":false}\n'
)
|
|
55
|
+
|
|
56
|
+
# Zeek dns log in TSV form: directive header, three data rows (uids CDns01,
# CDns02, CDns03), and a trailing #close line. The inline row comments state
# what each row exercises.
_DNS_TSV = (
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tdns\n"
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
    "\tproto\ttrans_id\trtt\tquery\tqclass\tqclass_name\tqtype\tqtype_name"
    "\trcode\tAA\tTC\tRD\tRA\tZ\tanswers\tTTLs\trejected\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport"
    "\tenum\tcount\tinterval\tstring\tcount\tstring\tcount\tstring"
    "\tcount\tbool\tbool\tbool\tbool\tcount\tvector[string]\tvector[interval]\tbool\n"
    # Row 1: qclass=1, multi-value answers/TTLs, rtt present, AA=T, TC=F
    "1748649700.000000\tCDns01\t192.0.2.1\t12345\t192.0.2.53\t53"
    "\tudp\t1001\t0.050000\talpha.invalid\t1\tC_INTERNET\t1\tA"
    "\t0\tT\tF\tT\tT\t0\t198.51.100.1,203.0.113.1\t300.000000,300.000000\tF\n"
    # Row 2: qclass=2 — dropped by normalizer aperture
    "1748649701.000000\tCDns02\t192.0.2.2\t12346\t192.0.2.53\t53"
    "\tudp\t1002\t0.030000\tbeta.invalid\t2\tC_CSNET\t1\tA"
    "\t0\tF\tF\tT\tT\t0\t198.51.100.2\t60.000000\tF\n"
    # Row 3: qclass=1, empty query, rtt unset, answers unset, TTLs unset
    "1748649702.000000\tCDns03\t192.0.2.3\t12347\t192.0.2.53\t53"
    "\tudp\t1003\t-\t(empty)\t1\tC_INTERNET\t48\tDNSKEY"
    "\t0\tF\tF\tT\tF\t0\t-\t-\tF\n"
    "#close\t2026-01-01-00:00:00\n"
)
|
|
82
|
+
|
|
83
|
+
# NDJSON equivalent — qclass=2 row included so the aperture can drop it identically.
|
|
84
|
+
# Three JSON records with the same uids as _DNS_TSV (CDns01, CDns02, CDns03).
# CDns03 carries an empty-string query and omits rtt, answers, and TTLs.
_DNS_NDJSON = (
    '{"ts":1748649700.0,"uid":"CDns01","id.orig_h":"192.0.2.1","id.orig_p":12345,'
    '"id.resp_h":"192.0.2.53","id.resp_p":53,"proto":"udp","trans_id":1001,'
    '"rtt":0.05,"query":"alpha.invalid","qclass":1,"qclass_name":"C_INTERNET",'
    '"qtype":1,"qtype_name":"A","rcode":0,"AA":true,"TC":false,"RD":true,"RA":true,'
    '"Z":0,"answers":["198.51.100.1","203.0.113.1"],"TTLs":[300.0,300.0],'
    '"rejected":false}\n'
    '{"ts":1748649701.0,"uid":"CDns02","id.orig_h":"192.0.2.2","id.orig_p":12346,'
    '"id.resp_h":"192.0.2.53","id.resp_p":53,"proto":"udp","trans_id":1002,'
    '"rtt":0.03,"query":"beta.invalid","qclass":2,"qclass_name":"C_CSNET",'
    '"qtype":1,"qtype_name":"A","rcode":0,"AA":false,"TC":false,"RD":true,"RA":true,'
    '"Z":0,"answers":["198.51.100.2"],"TTLs":[60.0],"rejected":false}\n'
    '{"ts":1748649702.0,"uid":"CDns03","id.orig_h":"192.0.2.3","id.orig_p":12347,'
    '"id.resp_h":"192.0.2.53","id.resp_p":53,"proto":"udp","trans_id":1003,'
    '"query":"","qclass":1,"qclass_name":"C_INTERNET",'
    '"qtype":48,"qtype_name":"DNSKEY","rcode":0,"AA":false,"TC":false,'
    '"RD":true,"RA":false,"Z":0,"rejected":false}\n'
)
|
|
102
|
+
|
|
103
|
+
# Reduced conn log used by the smoke test: ten fields and no #close line.
# The second data row leaves duration and tunnel_parents unset ("-").
_SMOKE_TSV = (
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tconn\n"
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
    "\tproto\tduration\tlocal_orig\ttunnel_parents\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport"
    "\tenum\tinterval\tbool\tset[string]\n"
    "1748649800.000000\tCSmk01\t192.0.2.10\t11111\t203.0.113.1\t80\ttcp\t1.5\tT\tfoo,bar\n"
    "1748649801.000000\tCSmk02\t192.0.2.11\t22222\t203.0.113.2\t443\ttcp\t-\tF\t-\n"
)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ── Helper ────────────────────────────────────────────────────────────────────
|
|
119
|
+
|
|
120
|
+
def _ndjson_df(ndjson: str) -> pd.DataFrame:
|
|
121
|
+
records = [json.loads(ln) for ln in ndjson.strip().splitlines()]
|
|
122
|
+
return pd.DataFrame(records)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _compare(tsv_df: pd.DataFrame, ndjson_df: pd.DataFrame) -> None:
|
|
126
|
+
"""Sort both frames by ts, reset index, compare ignoring column order."""
|
|
127
|
+
left = tsv_df.sort_values("ts").reset_index(drop=True)
|
|
128
|
+
right = ndjson_df.sort_values("ts").reset_index(drop=True)
|
|
129
|
+
pd.testing.assert_frame_equal(left, right, check_like=True)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# ── Parity tests ──────────────────────────────────────────────────────────────
|
|
133
|
+
|
|
134
|
+
def test_conn_tsv_ndjson_parity() -> None:
    """The conn fixtures normalize to the same frame via TSV and via NDJSON."""
    parsed_tsv = parse_tsv_log(_CONN_TSV.splitlines(keepends=True))
    from_tsv = _normalize_conn_df(parsed_tsv)
    from_ndjson = _normalize_conn_df(_ndjson_df(_CONN_NDJSON))
    _compare(from_tsv, from_ndjson)
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def test_dns_tsv_ndjson_parity() -> None:
    """The dns fixtures normalize to the same frame via TSV and via NDJSON.

    Both inputs contain one qclass=2 record, so exactly two rows are expected
    to remain after _normalize_dns_df. The remaining qtype values must come
    through as the raw numeric codes 1 and 48 on both paths, which pins the
    qtype pass-through behaviour for the TSV front-end as well.
    """
    from_tsv = _normalize_dns_df(parse_tsv_log(_DNS_TSV.splitlines(keepends=True)))
    from_ndjson = _normalize_dns_df(_ndjson_df(_DNS_NDJSON))
    frames = {"TSV": from_tsv, "NDJSON": from_ndjson}

    assert len(from_tsv) == 2, "qclass=2 row must be dropped"
    # qtype has to be present on both paths before its values are compared.
    for label, frame in frames.items():
        assert "qtype" in frame.columns, f"qtype must survive {label} normalization"
    for frame in frames.values():
        assert sorted(frame["qtype"].tolist()) == [1, 48]
    _compare(from_tsv, from_ndjson)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ── Directive-behavior tests ──────────────────────────────────────────────────
|
|
162
|
+
|
|
163
|
+
def test_non_tab_separator() -> None:
    """A #separator directive naming a pipe is used to split header and data lines."""
    log_text = (
        "#separator \\x7c\n"  # pipe |
        "#set_separator|,\n"
        "#empty_field|(empty)\n"
        "#unset_field|-\n"
        "#path|conn\n"
        "#fields|ts|src\n"
        "#types|time|string\n"
        "1748649600.000000|192.0.2.1\n"
    )
    parsed = parse_tsv_log(log_text.splitlines(keepends=True))
    assert list(parsed.columns) == ["ts", "src"]
    first_row = parsed.iloc[0]
    assert first_row["ts"] == pytest.approx(1748649600.0)
    assert first_row["src"] == "192.0.2.1"
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def test_custom_empty_and_unset_tokens() -> None:
    """Custom #empty_field and #unset_field tokens are honored.

    The header declares ``EMPTY`` and ``NONE`` in place of the usual
    ``(empty)`` and ``-`` tokens. A string cell holding the empty token must
    parse to ``""``; a cell holding the unset token must be absent or null.
    """
    tsv = (
        "#separator \\x09\n"
        "#set_separator\t,\n"
        "#empty_field\tEMPTY\n"
        "#unset_field\tNONE\n"
        "#path\ttest\n"
        "#fields\tts\ta\tb\n"
        "#types\ttime\tstring\tstring\n"
        "1748649600.0\tEMPTY\tNONE\n"
    )
    df = parse_tsv_log(tsv.splitlines(keepends=True))
    row = df.iloc[0]
    assert row["a"] == ""  # empty token → empty string
    # pd.isna accepts None and pd.NA as well as float NaN; math.isnan would
    # raise TypeError on a non-float null, turning a pass into an error.
    assert "b" not in row or pd.isna(row["b"])  # unset → absent/NaN
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def test_custom_set_separator() -> None:
    """A vector field is split on the character given by #set_separator."""
    log_text = (
        "#separator \\x09\n"
        "#set_separator\t|\n"
        "#empty_field\t(empty)\n"
        "#unset_field\t-\n"
        "#path\ttest\n"
        "#fields\tts\tanswers\n"
        "#types\ttime\tvector[string]\n"
        "1748649600.0\talpha|beta|gamma\n"
    )
    parsed = parse_tsv_log(log_text.splitlines(keepends=True))
    expected = ["alpha", "beta", "gamma"]
    assert parsed.iloc[0]["answers"] == expected
|
|
212
|
+
|
|
213
|
+
|
|
214
|
+
def test_missing_separator_raises() -> None:
    """Parsing a data row under a header that lacks #separator raises ValueError."""
    log_text = (
        "#set_separator\t,\n"
        "#fields\tts\tsrc\n"
        "#types\ttime\tstring\n"
        "1748649600.0\t192.0.2.1\n"
    )
    lines = log_text.splitlines(keepends=True)
    with pytest.raises(ValueError, match="missing #separator"):
        parse_tsv_log(lines)
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def test_fields_types_length_mismatch_raises() -> None:
    """A #types line shorter than the #fields line raises ValueError."""
    log_text = (
        "#separator \\x09\n"
        "#fields\tts\tsrc\tdst\n"
        "#types\ttime\tstring\n"  # only 2 types for 3 fields
    )
    lines = log_text.splitlines(keepends=True)
    with pytest.raises(ValueError, match="#fields"):
        parse_tsv_log(lines)
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def test_missing_fields_raises() -> None:
    """Parsing a log whose header has no #fields line raises ValueError."""
    log_text = (
        "#separator \\x09\n"
        "#types\ttime\tstring\n"
        "1748649600.0\t192.0.2.1\n"
    )
    lines = log_text.splitlines(keepends=True)
    with pytest.raises(ValueError, match="missing #fields"):
        parse_tsv_log(lines)
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def test_ragged_row_raises() -> None:
    """A data row with too few tokens raises ValueError mentioning both counts.

    Only the substrings "3" and "2" are checked in the error text.
    NOTE(review): the row itself contains those digits, so this is a loose
    check if the message echoes the row; confirm against the parser's wording.
    """
    log_text = (
        "#separator \\x09\n"
        "#fields\tts\tsrc\tdst\n"
        "#types\ttime\tstring\tstring\n"
        "1748649600.0\t192.0.2.1\n"  # only 2 tokens, 3 expected
    )
    lines = log_text.splitlines(keepends=True)
    with pytest.raises(ValueError) as caught:
        parse_tsv_log(lines)
    message = str(caught.value)
    # "3" is the expected token count, "2" the actual one.
    for count in ("3", "2"):
        assert count in message
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def test_bool_unknown_value_raises() -> None:
    """A bool-typed cell holding something other than T or F raises ValueError."""
    log_text = (
        "#separator \\x09\n"
        "#fields\tts\tflag\n"
        "#types\ttime\tbool\n"
        "1748649600.0\tyes\n"
    )
    lines = log_text.splitlines(keepends=True)
    with pytest.raises(ValueError, match="bool"):
        parse_tsv_log(lines)
|
|
273
|
+
|
|
274
|
+
|
|
275
|
+
def test_numeric_empty_field_raises() -> None:
    """The empty-field token in a count-typed column raises ValueError."""
    log_text = (
        "#separator \\x09\n"
        "#fields\tts\tport\n"
        "#types\ttime\tcount\n"
        "1748649600.0\t(empty)\n"
    )
    lines = log_text.splitlines(keepends=True)
    with pytest.raises(ValueError, match="empty"):
        parse_tsv_log(lines)
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def test_unknown_zeek_type_raises() -> None:
    """A column declared with the type ``record`` raises ValueError when parsed."""
    log_text = (
        "#separator \\x09\n"
        "#fields\tts\tinfo\n"
        "#types\ttime\trecord\n"
        "1748649600.0\tsomevalue\n"
    )
    lines = log_text.splitlines(keepends=True)
    with pytest.raises(ValueError, match="unsupported Zeek type"):
        parse_tsv_log(lines)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def test_missing_optional_directives_use_zeek_defaults() -> None:
    """With only #separator declared, "(empty)", "-", and "," keep their usual roles."""
    log_text = (
        "#separator \\x09\n"
        # No #set_separator, #empty_field, or #unset_field
        "#fields\tts\tname\ttags\n"
        "#types\ttime\tstring\tset[string]\n"
        "1748649600.0\t(empty)\talpha,beta\n"  # (empty) → "" for string default
        "1748649601.0\t-\t-\n"  # - → unset/NaN for string default
    )
    parsed = parse_tsv_log(log_text.splitlines(keepends=True))
    first, second = parsed.iloc[0], parsed.iloc[1]
    assert first["name"] == ""  # default empty_field applied
    assert first["tags"] == ["alpha", "beta"]  # default set_separator applied
    assert "name" not in second or pd.isna(second["name"])  # default unset
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
# ── Smoke test ────────────────────────────────────────────────────────────────
|
|
316
|
+
|
|
317
|
+
def test_conn_tsv_smoke() -> None:
    """Smoke-check the normalized _SMOKE_TSV frame: columns, dtypes, and value types."""
    parsed = parse_tsv_log(_SMOKE_TSV.splitlines(keepends=True))
    df = _normalize_conn_df(parsed)

    # Normalization must expose the canonical column names.
    for col in ("src", "dst", "port", "proto", "ts"):
        assert col in df.columns, f"canonical column {col!r} missing"

    assert len(df) == 2

    durations = df["duration"]
    # A float dtype shows the "-" token was not kept as a string.
    assert durations.dtype == float or str(durations.dtype).startswith("float")
    assert not (durations == "-").any(), "unset token must not survive as a string"
    # The row whose duration is "-" has to show up as NaN.
    assert durations.isna().any()

    # Non-null local_orig values must be Python bools rather than "T"/"F".
    for v in df["local_orig"].dropna():
        assert isinstance(v, bool), f"local_orig should be bool, got {type(v)}"

    tunnel_values = df["tunnel_parents"].dropna()
    # Non-null tunnel_parents values must be lists rather than joined strings.
    for v in tunnel_values:
        assert isinstance(v, list), f"tunnel_parents should be list, got {type(v)}"

    # The unset token itself must never appear as a value.
    assert not (tunnel_values == "-").any()
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
# ── Zeek syslog.log normalizer + TSV+NDJSON parity ────────────────────────────
|
|
347
|
+
#
|
|
348
|
+
# v1 promotion of Zeek syslog.log. Normalizer lives in parsers/zeek.py beside
|
|
349
|
+
# the conn / dns normalizers; both front-ends produce the Zeek-native
|
|
350
|
+
# intermediate frame and the single normalizer maps both to the canonical
|
|
351
|
+
# fidelity-aware syslog schema:
|
|
352
|
+
#
|
|
353
|
+
# Minimal (both feeds): ts, host, program, raw, message
|
|
354
|
+
# Extended (Zeek only): facility, severity (uppercase enum strings)
|
|
355
|
+
#
|
|
356
|
+
# Per-row derivation reuses parsers/syslog.py helpers (strip_header,
|
|
357
|
+
# parse_program, normalize_pids, parse_host), so the doubled-timestamp
|
|
358
|
+
# invariant — strip_header is ^-anchored — holds on this path.
|
|
359
|
+
|
|
360
|
+
# Zeek syslog log in TSV form: directive header and two data rows (uids CSL01
# and CSL02), with no #close line. The inline row comments give the host value
# each row is expected to produce.
_SYSLOG_TSV = (
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tsyslog\n"
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
    "\tproto\tfacility\tseverity\tmessage\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport"
    "\tenum\tstring\tstring\tstring\n"
    # Row A: embedded RFC 3164 hostname present → host = "host1"
    "1779750000.000000\tCSL01\t192.0.2.10\t41514\t198.51.100.20\t514"
    "\tudp\tDAEMON\tINFO"
    "\tJun 11 12:00:00 host1 sshd[1234]: Accepted publickey for user from 192.0.2.10\n"
    # Row B: short body — under 4 whitespace-separated tokens, so parse_host
    # returns "unknown" → fallback to id.orig_h = "192.0.2.10". parse_host is
    # dumb-positional: field 4 verbatim, with NO hostname validation; the
    # fallback is gated on the literal "unknown" sentinel.
    "1779750060.000000\tCSL02\t192.0.2.10\t41515\t198.51.100.20\t514"
    "\tudp\tKERN\tERR"
    "\tkernel: oops\n"
)
|
|
382
|
+
|
|
383
|
+
# NDJSON equivalent — _path on every line; Zeek native field names.
|
|
384
|
+
# Two JSON records carrying the same field values as the _SYSLOG_TSV rows
# (CSL01, then CSL02), plus a "_path" key on each line.
_SYSLOG_NDJSON = (
    '{"_path":"syslog","ts":1779750000.0,"uid":"CSL01",'
    '"id.orig_h":"192.0.2.10","id.orig_p":41514,'
    '"id.resp_h":"198.51.100.20","id.resp_p":514,"proto":"udp",'
    '"facility":"DAEMON","severity":"INFO",'
    '"message":"Jun 11 12:00:00 host1 sshd[1234]: Accepted publickey for user from 192.0.2.10"}\n'
    '{"_path":"syslog","ts":1779750060.0,"uid":"CSL02",'
    '"id.orig_h":"192.0.2.10","id.orig_p":41515,'
    '"id.resp_h":"198.51.100.20","id.resp_p":514,"proto":"udp",'
    '"facility":"KERN","severity":"ERR",'
    '"message":"kernel: oops"}\n'
)
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def test_zeek_syslog_normalizer_tsv_happy_path() -> None:
    """_SYSLOG_TSV normalizes to the seven canonical columns with correct values."""
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    parsed = parse_tsv_log(_SYSLOG_TSV.splitlines(keepends=True))
    df = _normalize_zeek_syslog_df(parsed)

    # The five minimal columns lead; facility and severity follow.
    expected_columns = [
        "ts", "host", "program", "raw", "message", "facility", "severity",
    ]
    assert list(df.columns) == expected_columns, (
        "minimal-5 must come first; extended last (concat-friendly)"
    )

    # Zeek connection-level columns must not be carried into the output.
    zeek_only = ("uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto")
    assert set(df.columns).isdisjoint(zeek_only)

    row_a, row_b = df.iloc[0], df.iloc[1]

    # Row A: the hostname embedded in the message is used.
    assert row_a["host"] == "host1"
    assert row_a["program"] == "sshd"
    # `raw` keeps the Zeek message text as-is.
    assert row_a["raw"].startswith("Jun 11 12:00:00 host1 sshd[1234]:")
    # `message` has the header removed and the PID replaced by "*".
    assert row_a["message"] == "sshd[*]: Accepted publickey for user from 192.0.2.10"
    assert row_a["facility"] == "DAEMON"
    assert row_a["severity"] == "INFO"
    assert row_a["ts"] == 1779750000.0

    # Row B: the short body yields no embedded host, so id.orig_h is used.
    assert row_b["host"] == "192.0.2.10"
    assert row_b["program"] == "kernel"
    assert row_b["message"] == "kernel: oops"
    assert row_b["severity"] == "ERR"
|
|
431
|
+
|
|
432
|
+
|
|
433
|
+
def test_zeek_syslog_tsv_ndjson_parity() -> None:
    """The syslog fixtures normalize to the same frame via TSV and via NDJSON."""
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    parsed_tsv = parse_tsv_log(_SYSLOG_TSV.splitlines(keepends=True))
    from_tsv = _normalize_zeek_syslog_df(parsed_tsv)
    from_ndjson = _normalize_zeek_syslog_df(_ndjson_df(_SYSLOG_NDJSON))
    _compare(from_tsv, from_ndjson)
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def test_zeek_syslog_normalizer_malformed_missing_message_preserves_absence() -> None:
    """A source frame without `message` must not gain fabricated columns.

    The normalizer is expected to leave message / raw / program out of its
    output instead of filling them with empty placeholders, so that
    _schema_warning reports the "syslog.log fields not found" message naming
    the missing columns. Fabricated empty values would otherwise reach the
    detector/digest.
    """
    from loghunter.common.loader import _schema_warning
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    # A single record with every Zeek syslog field except `message`.
    record = {
        "ts": 1779750000.0,
        "uid": "CSL01",
        "id.orig_h": "192.0.2.10",
        "id.orig_p": 41514,
        "id.resp_h": "198.51.100.20",
        "id.resp_p": 514,
        "proto": "udp",
        "facility": "DAEMON",
        "severity": "INFO",
    }
    df = _normalize_zeek_syslog_df(pd.DataFrame([record]))

    # The message-derived columns must be absent so the warning can fire.
    for col in ("message", "raw", "program"):
        assert col not in df.columns, (
            f"{col} must not be synthesized when source `message` is missing"
        )
    # Columns that are carried through, not derived, must still be present.
    for carried in ("ts", "facility", "severity"):
        assert carried in df.columns

    warning = _schema_warning("syslog*.log*", df)
    assert warning is not None
    for fragment in ("syslog.log fields not found", "message", "program", "raw"):
        assert fragment in warning
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def test_zeek_tsv_sniff_syslog_path_claims_syslog() -> None:
    """sniff() returns "syslog" for a TSV header whose #path is syslog.

    This is the TSV counterpart of the NDJSON `_path == "syslog"` claim,
    which is covered in test_sniff_recognizers.
    """
    from loghunter.parsers.zeek_tsv import sniff

    header_lines = [
        "#separator \\x09\n",
        "#set_separator\t,\n",
        "#empty_field\t(empty)\n",
        "#unset_field\t-\n",
        "#path\tsyslog\n",
        "#fields\tts\tuid\tid.orig_h\tfacility\tseverity\tmessage\n",
        "#types\ttime\tstring\taddr\tstring\tstring\tstring\n",
    ]
    claimed = sniff(header_lines)
    assert claimed == "syslog"
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def test_zeek_syslog_normalizer_strips_trailing_crlf_from_raw() -> None:
    """Regression: trailing CR/LF on the source `message` must not reach `raw`.

    Zeek's NDJSON `message` can carry the upstream record's line terminator.
    Before the fix it leaked into canonical `raw`, and the syslog detector's
    `title=str(row.raw)[:180]` then rendered a blank spacer row under each
    affected finding. The fix strips only the trailing line terminator at the
    parser seam. Canonical `message` was already clean because `strip_header`
    calls `.strip()`; this test holds both columns to the same contract.
    All addresses are RFC 5737 placeholders.
    """
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    # (ts, uid, originating port, message) — one record per terminator form:
    # LF, CRLF, and bare CR.
    cases = [
        (1779750000.0, "CSL01", 41514,
         "Jun 11 12:00:00 host1 sshd[1234]: line ending in LF\n"),
        (1779750060.0, "CSL02", 41515,
         "Jun 11 12:01:00 host1 sshd[1235]: line ending in CRLF\r\n"),
        (1779750120.0, "CSL03", 41516,
         "Jun 11 12:02:00 host1 sshd[1236]: line ending in bare CR\r"),
    ]
    raw_df = pd.DataFrame([
        {
            "ts": ts,
            "uid": uid,
            "id.orig_h": "192.0.2.10",
            "id.orig_p": orig_port,
            "id.resp_h": "198.51.100.20",
            "id.resp_p": 514,
            "proto": "udp",
            "facility": "DAEMON",
            "severity": "INFO",
            "message": message,
        }
        for ts, uid, orig_port, message in cases
    ])
    df = _normalize_zeek_syslog_df(raw_df)

    # Neither canonical column may end in a line terminator on any row.
    for column in ("raw", "message"):
        for value in df[column].tolist():
            assert not value.endswith("\n"), f"{column} must not end in LF: {value!r}"
            assert not value.endswith("\r"), f"{column} must not end in CR: {value!r}"

    # What the detector uses as a title must fit on one physical line.
    for value in df["raw"].tolist():
        title = str(value)[:180]
        assert "\n" not in title, (
            f"detector title (str(raw)[:180]) must not contain a newline; "
            f"got: {title!r}"
        )

    # Only the terminator is removed; the text before it is untouched.
    expected_endings = (
        "line ending in LF",
        "line ending in CRLF",
        "line ending in bare CR",
    )
    for position, ending in enumerate(expected_endings):
        assert df.iloc[position]["raw"].endswith(ending)
|