loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,580 @@
1
+ """Parity, directive-behavior, and smoke tests for loghunter.parsers.zeek_tsv.
2
+
3
+ All fixture data is hand-authored using RFC 5737 documentation IP space
4
+ (192.0.2.x, 198.51.100.x, 203.0.113.x). No real network data anywhere.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ import math
11
+
12
+ import pandas as pd
13
+ import pytest
14
+
15
+ from loghunter.parsers.zeek import _normalize_conn_df, _normalize_dns_df
16
+ from loghunter.parsers.zeek_tsv import parse_tsv_log
17
+
18
+ # ── Fixture constants ─────────────────────────────────────────────────────────
19
+ #
20
+ # Every fixture is a plain string literal that tests split with splitlines(keepends=True).
21
+ # Tab characters are written as \t escapes. RFC 5737 IPs throughout.
22
+
23
# Zeek conn.log fixture in TSV form: header directives, two data rows, and a
# #close footer. Adjacent string literals are concatenated, so a logical line
# may span several literals and ends only at its "\n".
_CONN_TSV = (
    # "\\x09" puts the four characters \x09 in the file; the directive's own
    # delimiter is a space, not a tab.
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tconn\n"
    # 15 field names, followed by 15 matching type names.
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
    "\tproto\tservice\tduration\torig_bytes\tresp_bytes"
    "\tconn_state\tlocal_orig\tlocal_resp\ttunnel_parents\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport"
    "\tenum\tstring\tinterval\tcount\tcount"
    "\tstring\tbool\tbool\tset[string]\n"
    # Row A: duration present, service present, local_orig=T, tunnel_parents=(empty)
    "1748649600.000000\tCTest01\t192.0.2.10\t51514\t203.0.113.20\t443"
    "\ttcp\tssl\t3.5\t1500\t8200\tSF\tT\tF\t(empty)\n"
    # Row B: duration unset, service unset, local_orig=F, tunnel_parents unset
    "1748649660.000000\tCTest02\t198.51.100.1\t54321\t192.0.2.20\t22"
    "\ttcp\t-\t-\t0\t0\tS0\tF\tF\t-\n"
    "#close\t2026-01-01-00:00:00\n"
)
43
+
44
+ # Equivalent events in NDJSON. Absent keys mirror TSV unset tokens.
45
_CONN_NDJSON = (
    # Record A (uid CTest01): every key present; tunnel_parents is an empty list.
    '{"ts":1748649600.0,"uid":"CTest01","id.orig_h":"192.0.2.10","id.orig_p":51514,'
    '"id.resp_h":"203.0.113.20","id.resp_p":443,"proto":"tcp","service":"ssl",'
    '"duration":3.5,"orig_bytes":1500,"resp_bytes":8200,"conn_state":"SF",'
    '"local_orig":true,"local_resp":false,"tunnel_parents":[]}\n'
    # Record B (uid CTest02): service, duration and tunnel_parents keys are omitted.
    '{"ts":1748649660.0,"uid":"CTest02","id.orig_h":"198.51.100.1","id.orig_p":54321,'
    '"id.resp_h":"192.0.2.20","id.resp_p":22,"proto":"tcp",'
    '"orig_bytes":0,"resp_bytes":0,"conn_state":"S0",'
    '"local_orig":false,"local_resp":false}\n'
)
55
+
56
# Zeek dns.log fixture in TSV form: header directives, three data rows, and a
# #close footer. Each logical line is split across adjacent string literals.
_DNS_TSV = (
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tdns\n"
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
    "\tproto\ttrans_id\trtt\tquery\tqclass\tqclass_name\tqtype\tqtype_name"
    "\trcode\tAA\tTC\tRD\tRA\tZ\tanswers\tTTLs\trejected\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport"
    "\tenum\tcount\tinterval\tstring\tcount\tstring\tcount\tstring"
    "\tcount\tbool\tbool\tbool\tbool\tcount\tvector[string]\tvector[interval]\tbool\n"
    # Row 1: qclass=1, multi-value answers/TTLs, rtt present, AA=T, TC=F
    "1748649700.000000\tCDns01\t192.0.2.1\t12345\t192.0.2.53\t53"
    "\tudp\t1001\t0.050000\talpha.invalid\t1\tC_INTERNET\t1\tA"
    "\t0\tT\tF\tT\tT\t0\t198.51.100.1,203.0.113.1\t300.000000,300.000000\tF\n"
    # Row 2: qclass=2 — dropped by normalizer aperture
    "1748649701.000000\tCDns02\t192.0.2.2\t12346\t192.0.2.53\t53"
    "\tudp\t1002\t0.030000\tbeta.invalid\t2\tC_CSNET\t1\tA"
    "\t0\tF\tF\tT\tT\t0\t198.51.100.2\t60.000000\tF\n"
    # Row 3: qclass=1, empty query, rtt unset, answers unset, TTLs unset
    "1748649702.000000\tCDns03\t192.0.2.3\t12347\t192.0.2.53\t53"
    "\tudp\t1003\t-\t(empty)\t1\tC_INTERNET\t48\tDNSKEY"
    "\t0\tF\tF\tT\tF\t0\t-\t-\tF\n"
    "#close\t2026-01-01-00:00:00\n"
)
82
+
83
+ # NDJSON equivalent — qclass=2 row included so the aperture can drop it identically.
84
_DNS_NDJSON = (
    # Record 1 (uid CDns01): qclass=1, two answers with two TTLs, rtt present.
    '{"ts":1748649700.0,"uid":"CDns01","id.orig_h":"192.0.2.1","id.orig_p":12345,'
    '"id.resp_h":"192.0.2.53","id.resp_p":53,"proto":"udp","trans_id":1001,'
    '"rtt":0.05,"query":"alpha.invalid","qclass":1,"qclass_name":"C_INTERNET",'
    '"qtype":1,"qtype_name":"A","rcode":0,"AA":true,"TC":false,"RD":true,"RA":true,'
    '"Z":0,"answers":["198.51.100.1","203.0.113.1"],"TTLs":[300.0,300.0],'
    '"rejected":false}\n'
    # Record 2 (uid CDns02): qclass=2, the row the parity test expects to be dropped.
    '{"ts":1748649701.0,"uid":"CDns02","id.orig_h":"192.0.2.2","id.orig_p":12346,'
    '"id.resp_h":"192.0.2.53","id.resp_p":53,"proto":"udp","trans_id":1002,'
    '"rtt":0.03,"query":"beta.invalid","qclass":2,"qclass_name":"C_CSNET",'
    '"qtype":1,"qtype_name":"A","rcode":0,"AA":false,"TC":false,"RD":true,"RA":true,'
    '"Z":0,"answers":["198.51.100.2"],"TTLs":[60.0],"rejected":false}\n'
    # Record 3 (uid CDns03): qclass=1, query is "", and the rtt / answers / TTLs
    # keys are omitted.
    '{"ts":1748649702.0,"uid":"CDns03","id.orig_h":"192.0.2.3","id.orig_p":12347,'
    '"id.resp_h":"192.0.2.53","id.resp_p":53,"proto":"udp","trans_id":1003,'
    '"query":"","qclass":1,"qclass_name":"C_INTERNET",'
    '"qtype":48,"qtype_name":"DNSKEY","rcode":0,"AA":false,"TC":false,'
    '"RD":true,"RA":false,"Z":0,"rejected":false}\n'
)
102
+
103
# Reduced conn.log fixture used by the smoke test: ten columns, two data rows,
# and no #close footer.
_SMOKE_TSV = (
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tconn\n"
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
    "\tproto\tduration\tlocal_orig\ttunnel_parents\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport"
    "\tenum\tinterval\tbool\tset[string]\n"
    # Row 1: duration=1.5, local_orig=T, tunnel_parents holds two members.
    "1748649800.000000\tCSmk01\t192.0.2.10\t11111\t203.0.113.1\t80\ttcp\t1.5\tT\tfoo,bar\n"
    # Row 2: duration and tunnel_parents carry the unset token, local_orig=F.
    "1748649801.000000\tCSmk02\t192.0.2.11\t22222\t203.0.113.2\t443\ttcp\t-\tF\t-\n"
)
116
+
117
+
118
+ # ── Helper ────────────────────────────────────────────────────────────────────
119
+
120
+ def _ndjson_df(ndjson: str) -> pd.DataFrame:
121
+ records = [json.loads(ln) for ln in ndjson.strip().splitlines()]
122
+ return pd.DataFrame(records)
123
+
124
+
125
+ def _compare(tsv_df: pd.DataFrame, ndjson_df: pd.DataFrame) -> None:
126
+ """Sort both frames by ts, reset index, compare ignoring column order."""
127
+ left = tsv_df.sort_values("ts").reset_index(drop=True)
128
+ right = ndjson_df.sort_values("ts").reset_index(drop=True)
129
+ pd.testing.assert_frame_equal(left, right, check_like=True)
130
+
131
+
132
+ # ── Parity tests ──────────────────────────────────────────────────────────────
133
+
134
def test_conn_tsv_ndjson_parity() -> None:
    """The conn fixture normalizes to the same frame from TSV and from NDJSON."""
    tsv_lines = _CONN_TSV.splitlines(keepends=True)
    normalized_tsv = _normalize_conn_df(parse_tsv_log(tsv_lines))
    normalized_ndjson = _normalize_conn_df(_ndjson_df(_CONN_NDJSON))
    _compare(normalized_tsv, normalized_ndjson)
139
+
140
+
141
def test_dns_tsv_ndjson_parity() -> None:
    """The dns fixture normalizes to the same frame from TSV and from NDJSON.

    Each front-end feeds _normalize_dns_df. The test expects the qclass=2
    row to be gone from the TSV result, expects ``qtype`` to survive on both
    sides as the raw numeric codes 1 (A) and 48 (DNSKEY), and finally
    requires the two frames to be equal.
    """
    raw_tsv = parse_tsv_log(_DNS_TSV.splitlines(keepends=True))
    tsv_df = _normalize_dns_df(raw_tsv)
    ndjson_df = _normalize_dns_df(_ndjson_df(_DNS_NDJSON))

    assert len(tsv_df) == 2, "qclass=2 row must be dropped"

    # qtype column must exist on both sides ...
    for label, frame in (("TSV", tsv_df), ("NDJSON", ndjson_df)):
        assert "qtype" in frame.columns, f"qtype must survive {label} normalization"
    # ... and carry the untouched numeric type codes.
    for frame in (tsv_df, ndjson_df):
        assert sorted(frame["qtype"].tolist()) == [1, 48]

    _compare(tsv_df, ndjson_df)
159
+
160
+
161
+ # ── Directive-behavior tests ──────────────────────────────────────────────────
162
+
163
def test_non_tab_separator() -> None:
    """A `#separator` directive naming a non-tab character is honored."""
    lines = [
        "#separator \\x7c\n",  # \x7c is the pipe character
        "#set_separator|,\n",
        "#empty_field|(empty)\n",
        "#unset_field|-\n",
        "#path|conn\n",
        "#fields|ts|src\n",
        "#types|time|string\n",
        "1748649600.000000|192.0.2.1\n",
    ]
    frame = parse_tsv_log(lines)
    assert list(frame.columns) == ["ts", "src"]
    first = frame.iloc[0]
    assert first["ts"] == pytest.approx(1748649600.0)
    assert first["src"] == "192.0.2.1"
179
+
180
+
181
def test_custom_empty_and_unset_tokens() -> None:
    """Custom #empty_field and #unset_field tokens are honored.

    The empty token must come back as ``""``; the unset token must come back
    as a missing value, or the key may be absent altogether.
    """
    tsv = (
        "#separator \\x09\n"
        "#set_separator\t,\n"
        "#empty_field\tEMPTY\n"
        "#unset_field\tNONE\n"
        "#path\ttest\n"
        "#fields\tts\ta\tb\n"
        "#types\ttime\tstring\tstring\n"
        "1748649600.0\tEMPTY\tNONE\n"
    )
    df = parse_tsv_log(tsv.splitlines(keepends=True))
    row = df.iloc[0]
    assert row["a"] == ""  # empty token → empty string
    # pd.isna accepts None, NaN and pd.NA alike; math.isnan raises TypeError
    # on anything that is not a real number (e.g. None in an object column).
    assert "b" not in row or pd.isna(row["b"])  # unset → absent/NaN
196
+
197
+
198
def test_custom_set_separator() -> None:
    """Set/vector fields are split on the token given by `#set_separator`."""
    lines = [
        "#separator \\x09\n",
        "#set_separator\t|\n",
        "#empty_field\t(empty)\n",
        "#unset_field\t-\n",
        "#path\ttest\n",
        "#fields\tts\tanswers\n",
        "#types\ttime\tvector[string]\n",
        "1748649600.0\talpha|beta|gamma\n",
    ]
    parsed = parse_tsv_log(lines)
    assert parsed.iloc[0]["answers"] == ["alpha", "beta", "gamma"]
212
+
213
+
214
def test_missing_separator_raises() -> None:
    """ValueError is raised when a data row follows a header lacking #separator."""
    lines = [
        "#set_separator\t,\n",
        "#fields\tts\tsrc\n",
        "#types\ttime\tstring\n",
        "1748649600.0\t192.0.2.1\n",
    ]
    with pytest.raises(ValueError, match="missing #separator"):
        parse_tsv_log(lines)
224
+
225
+
226
def test_fields_types_length_mismatch_raises() -> None:
    """ValueError is raised when #fields and #types differ in length."""
    lines = [
        "#separator \\x09\n",
        "#fields\tts\tsrc\tdst\n",   # three fields ...
        "#types\ttime\tstring\n",    # ... but only two types
    ]
    with pytest.raises(ValueError, match="#fields"):
        parse_tsv_log(lines)
235
+
236
+
237
def test_missing_fields_raises() -> None:
    """ValueError is raised when the header never supplies a #fields line."""
    lines = [
        "#separator \\x09\n",
        "#types\ttime\tstring\n",
        "1748649600.0\t192.0.2.1\n",
    ]
    with pytest.raises(ValueError, match="missing #fields"):
        parse_tsv_log(lines)
246
+
247
+
248
def test_ragged_row_raises() -> None:
    """A row with the wrong token count raises ValueError naming both counts."""
    lines = [
        "#separator \\x09\n",
        "#fields\tts\tsrc\tdst\n",
        "#types\ttime\tstring\tstring\n",
        "1748649600.0\t192.0.2.1\n",  # two tokens where three are declared
    ]
    with pytest.raises(ValueError) as caught:
        parse_tsv_log(lines)
    text = str(caught.value)
    assert "3" in text  # declared field count
    assert "2" in text  # tokens actually present
261
+
262
+
263
def test_bool_unknown_value_raises() -> None:
    """ValueError is raised for a bool-typed value that is neither T nor F."""
    lines = [
        "#separator \\x09\n",
        "#fields\tts\tflag\n",
        "#types\ttime\tbool\n",
        "1748649600.0\tyes\n",
    ]
    with pytest.raises(ValueError, match="bool"):
        parse_tsv_log(lines)
273
+
274
+
275
def test_numeric_empty_field_raises() -> None:
    """ValueError is raised when a count-typed field holds the empty token."""
    lines = [
        "#separator \\x09\n",
        "#fields\tts\tport\n",
        "#types\ttime\tcount\n",
        "1748649600.0\t(empty)\n",
    ]
    with pytest.raises(ValueError, match="empty"):
        parse_tsv_log(lines)
285
+
286
+
287
def test_unknown_zeek_type_raises() -> None:
    """ValueError is raised when #types names a Zeek type the parser rejects."""
    lines = [
        "#separator \\x09\n",
        "#fields\tts\tinfo\n",
        "#types\ttime\trecord\n",
        "1748649600.0\tsomevalue\n",
    ]
    with pytest.raises(ValueError, match="unsupported Zeek type"):
        parse_tsv_log(lines)
297
+
298
+
299
def test_missing_optional_directives_use_zeek_defaults() -> None:
    """Defaults apply when #set_separator/#empty_field/#unset_field are omitted."""
    lines = [
        "#separator \\x09\n",
        # The three optional directives are deliberately left out.
        "#fields\tts\tname\ttags\n",
        "#types\ttime\tstring\tset[string]\n",
        "1748649600.0\t(empty)\talpha,beta\n",  # "(empty)" should read back as ""
        "1748649601.0\t-\t-\n",                 # "-" should read back as unset
    ]
    frame = parse_tsv_log(lines)

    first = frame.iloc[0]
    assert first["name"] == ""                  # default empty_field
    assert first["tags"] == ["alpha", "beta"]   # default set_separator
    # default unset_field: key absent or value missing
    assert "name" not in frame.iloc[1] or pd.isna(frame.iloc[1]["name"])
313
+
314
+
315
+ # ── Smoke test ────────────────────────────────────────────────────────────────
316
+
317
def test_conn_tsv_smoke() -> None:
    """Post-normalization smoke check on the conn TSV path.

    Verifies the canonical column names, a float ``duration`` column with a
    missing value for the unset row, Python ``bool`` values in ``local_orig``
    and ``list`` values in ``tunnel_parents``.
    """
    df = _normalize_conn_df(parse_tsv_log(_SMOKE_TSV.splitlines(keepends=True)))

    # Canonical column names present after normalization.
    for col in ("src", "dst", "port", "proto", "ts"):
        assert col in df.columns, f"canonical column {col!r} missing"

    assert len(df) == 2

    # duration dtype is numeric float — not the raw string "-".
    assert pd.api.types.is_float_dtype(df["duration"]), (
        f"duration should be a float column, got {df['duration'].dtype}"
    )
    assert not (df["duration"] == "-").any(), "unset token must not survive as a string"

    # The row with duration=- should be NaN.
    assert df["duration"].isna().any()

    # local_orig values are Python bools, not strings. The fixture sets the
    # field on both rows, so an empty selection would make the loop vacuous.
    local_orig = df["local_orig"].dropna()
    assert len(local_orig) > 0, "local_orig has no non-null values to check"
    for v in local_orig:
        assert isinstance(v, bool), f"local_orig should be bool, got {type(v)}"

    # tunnel_parents values are lists, not strings. The first fixture row
    # carries "foo,bar", so at least one non-null value is expected.
    tunnel_parents = df["tunnel_parents"].dropna()
    assert len(tunnel_parents) > 0, "tunnel_parents has no non-null values to check"
    for v in tunnel_parents:
        assert isinstance(v, list), f"tunnel_parents should be list, got {type(v)}"

    # Unset tunnel_parents row has NaN, not "-".
    assert not (tunnel_parents == "-").any()
344
+
345
+
346
+ # ── Zeek syslog.log normalizer + TSV+NDJSON parity ────────────────────────────
347
+ #
348
+ # v1 promotion of Zeek syslog.log. Normalizer lives in parsers/zeek.py beside
349
+ # the conn / dns normalizers; both front-ends produce the Zeek-native
350
+ # intermediate frame and the single normalizer maps both to the canonical
351
+ # fidelity-aware syslog schema:
352
+ #
353
+ # Minimal (both feeds): ts, host, program, raw, message
354
+ # Extended (Zeek only): facility, severity (uppercase enum strings)
355
+ #
356
+ # Per-row derivation reuses parsers/syslog.py helpers (strip_header,
357
+ # parse_program, normalize_pids, parse_host), so the doubled-timestamp
358
+ # invariant — strip_header is ^-anchored — holds on this path.
359
+
360
_SYSLOG_TSV = (
    # Header directives; this fixture has no #close footer.
    "#separator \\x09\n"
    "#set_separator\t,\n"
    "#empty_field\t(empty)\n"
    "#unset_field\t-\n"
    "#path\tsyslog\n"
    # Ten columns; the last one, `message`, carries the syslog line itself.
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p"
    "\tproto\tfacility\tseverity\tmessage\n"
    "#types\ttime\tstring\taddr\tport\taddr\tport"
    "\tenum\tstring\tstring\tstring\n"
    # Row A: embedded RFC 3164 hostname present → host = "host1"
    "1779750000.000000\tCSL01\t192.0.2.10\t41514\t198.51.100.20\t514"
    "\tudp\tDAEMON\tINFO"
    "\tJun 11 12:00:00 host1 sshd[1234]: Accepted publickey for user from 192.0.2.10\n"
    # Row B: short body — under 4 whitespace-separated tokens, so parse_host
    # returns "unknown" → fallback to id.orig_h = "192.0.2.10". parse_host is
    # dumb-positional: field 4 verbatim, with NO hostname validation; the
    # fallback is gated on the literal "unknown" sentinel.
    "1779750060.000000\tCSL02\t192.0.2.10\t41515\t198.51.100.20\t514"
    "\tudp\tKERN\tERR"
    "\tkernel: oops\n"
)
382
+
383
+ # NDJSON equivalent — _path on every line; Zeek native field names.
384
_SYSLOG_NDJSON = (
    # Record A (uid CSL01): message starts with a timestamp and the host "host1".
    '{"_path":"syslog","ts":1779750000.0,"uid":"CSL01",'
    '"id.orig_h":"192.0.2.10","id.orig_p":41514,'
    '"id.resp_h":"198.51.100.20","id.resp_p":514,"proto":"udp",'
    '"facility":"DAEMON","severity":"INFO",'
    '"message":"Jun 11 12:00:00 host1 sshd[1234]: Accepted publickey for user from 192.0.2.10"}\n'
    # Record B (uid CSL02): short message with no timestamp or host prefix.
    '{"_path":"syslog","ts":1779750060.0,"uid":"CSL02",'
    '"id.orig_h":"192.0.2.10","id.orig_p":41515,'
    '"id.resp_h":"198.51.100.20","id.resp_p":514,"proto":"udp",'
    '"facility":"KERN","severity":"ERR",'
    '"message":"kernel: oops"}\n'
)
396
+
397
+
398
def test_zeek_syslog_normalizer_tsv_happy_path() -> None:
    """Zeek syslog TSV normalizes to the canonical seven-column frame.

    Checks the column order, that the Zeek-only columns are gone, and the
    derived values for both fixture rows.
    """
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    parsed = parse_tsv_log(_SYSLOG_TSV.splitlines(keepends=True))
    frame = _normalize_zeek_syslog_df(parsed)

    # The five minimal columns lead; the two extended ones trail.
    canonical = ["ts", "host", "program", "raw", "message", "facility", "severity"]
    assert list(frame.columns) == canonical, (
        "minimal-5 must come first; extended last (concat-friendly)"
    )

    # Columns that exist only in the Zeek-native frame must not survive.
    zeek_only = ("uid", "id.orig_h", "id.orig_p", "id.resp_h", "id.resp_p", "proto")
    for name in zeek_only:
        assert name not in frame.columns

    # Row A: the hostname embedded in the message is used.
    row_a = frame.iloc[0]
    assert row_a["host"] == "host1"
    assert row_a["program"] == "sshd"
    # `raw` keeps Zeek's message text as-is.
    assert row_a["raw"].startswith("Jun 11 12:00:00 host1 sshd[1234]:")
    # `message` has the header removed and the PID replaced by "*".
    assert row_a["message"] == "sshd[*]: Accepted publickey for user from 192.0.2.10"
    assert row_a["facility"] == "DAEMON"
    assert row_a["severity"] == "INFO"
    assert row_a["ts"] == 1779750000.0

    # Row B: the short body yields no host, so id.orig_h is used instead.
    row_b = frame.iloc[1]
    assert row_b["host"] == "192.0.2.10"
    assert row_b["program"] == "kernel"
    assert row_b["message"] == "kernel: oops"
    assert row_b["severity"] == "ERR"
431
+
432
+
433
def test_zeek_syslog_tsv_ndjson_parity() -> None:
    """The syslog fixture normalizes to the same frame from TSV and from NDJSON."""
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    tsv_lines = _SYSLOG_TSV.splitlines(keepends=True)
    normalized_tsv = _normalize_zeek_syslog_df(parse_tsv_log(tsv_lines))
    normalized_ndjson = _normalize_zeek_syslog_df(_ndjson_df(_SYSLOG_NDJSON))
    _compare(normalized_tsv, normalized_ndjson)
442
+
443
+
444
def test_zeek_syslog_normalizer_malformed_missing_message_preserves_absence() -> None:
    """Honesty rail: a missing source `message` stays missing after normalization.

    The normalizer must not invent `message`, `raw` or `program` to fill out
    the frame. Leaving them out lets _schema_warning report the missing
    fields instead of fabricated empty content flowing downstream.
    """
    from loghunter.common.loader import _schema_warning
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    # A single Zeek-native record with no "message" key, on purpose.
    record = {
        "ts": 1779750000.0,
        "uid": "CSL01",
        "id.orig_h": "192.0.2.10",
        "id.orig_p": 41514,
        "id.resp_h": "198.51.100.20",
        "id.resp_p": 514,
        "proto": "udp",
        "facility": "DAEMON",
        "severity": "INFO",
    }
    normalized = _normalize_zeek_syslog_df(pd.DataFrame([record]))

    # Columns derived from `message` must be absent ...
    for derived in ("message", "raw", "program"):
        assert derived not in normalized.columns, (
            f"{derived} must not be synthesized when source `message` is missing"
        )
    # ... while the carried-through columns remain.
    for carried in ("ts", "facility", "severity"):
        assert carried in normalized.columns

    warning = _schema_warning("syslog*.log*", normalized)
    assert warning is not None
    assert "syslog.log fields not found" in warning
    for missing in ("message", "program", "raw"):
        assert missing in warning
485
+
486
+
487
def test_zeek_tsv_sniff_syslog_path_claims_syslog() -> None:
    """`sniff` returns "syslog" for a TSV header whose `#path` is syslog.

    This is the TSV counterpart of the NDJSON `_path == "syslog"` claim,
    which is exercised in test_sniff_recognizers.
    """
    from loghunter.parsers.zeek_tsv import sniff

    header_lines = (
        "#separator \\x09\n"
        "#set_separator\t,\n"
        "#empty_field\t(empty)\n"
        "#unset_field\t-\n"
        "#path\tsyslog\n"
        "#fields\tts\tuid\tid.orig_h\tfacility\tseverity\tmessage\n"
        "#types\ttime\tstring\taddr\tstring\tstring\tstring\n"
    ).splitlines(keepends=True)
    assert sniff(header_lines) == "syslog"
503
+
504
+
505
def test_zeek_syslog_normalizer_strips_trailing_crlf_from_raw() -> None:
    """Trailing CR/LF on Zeek's `message` must not reach canonical `raw`.

    P2 regression (Glenn bug handoff): Zeek's NDJSON `message` field can
    carry the upstream record's trailing CR/LF. Pre-fix this leaked into
    canonical `raw`, and the syslog detector's `title=str(row.raw)[:180]`
    rendered a blank spacer row beneath every affected finding. Fix is a
    narrow trailing-line-terminator strip at the parser seam (mirrors flat
    `load_syslog`'s `line.rstrip("\\n")` discipline). RFC 5737 placeholders.

    Canonical `message` was already clean because `strip_header` calls
    `.strip()`; this test pins both columns to the same contract.
    """
    from loghunter.parsers.zeek import _normalize_zeek_syslog_df

    # (ts, uid, id.orig_p, message): one row per terminator an upstream agent
    # might leave on the wire — LF, CRLF, bare CR. Every other field is shared.
    cases = [
        (1779750000.0, "CSL01", 41514,
         "Jun 11 12:00:00 host1 sshd[1234]: line ending in LF\n"),
        (1779750060.0, "CSL02", 41515,
         "Jun 11 12:01:00 host1 sshd[1235]: line ending in CRLF\r\n"),
        (1779750120.0, "CSL03", 41516,
         "Jun 11 12:02:00 host1 sshd[1236]: line ending in bare CR\r"),
    ]
    raw_df = pd.DataFrame([
        {
            "ts": ts,
            "uid": uid,
            "id.orig_h": "192.0.2.10",
            "id.orig_p": orig_port,
            "id.resp_h": "198.51.100.20",
            "id.resp_p": 514,
            "proto": "udp",
            "facility": "DAEMON",
            "severity": "INFO",
            "message": message,
        }
        for ts, uid, orig_port, message in cases
    ])
    df = _normalize_zeek_syslog_df(raw_df)

    # Canonical raw must not carry trailing CR or LF on any row.
    for value in df["raw"].tolist():
        assert not value.endswith("\n"), f"raw must not end in LF: {value!r}"
        assert not value.endswith("\r"), f"raw must not end in CR: {value!r}"
    # Canonical message remains clean too (already guaranteed by strip_header).
    for value in df["message"].tolist():
        assert not value.endswith("\n"), f"message must not end in LF: {value!r}"
        assert not value.endswith("\r"), f"message must not end in CR: {value!r}"

    # Detector title contract: str(raw)[:180] must be a single physical line,
    # so neither LF nor CR may appear anywhere in it.
    for value in df["raw"].tolist():
        title = str(value)[:180]
        assert "\n" not in title, (
            f"detector title (str(raw)[:180]) must not contain a newline; "
            f"got: {title!r}"
        )
        assert "\r" not in title, (
            f"detector title (str(raw)[:180]) must not contain a carriage return; "
            f"got: {title!r}"
        )

    # The raw payload up to the terminator stays intact (no broader trim).
    assert df.iloc[0]["raw"].endswith("line ending in LF")
    assert df.iloc[1]["raw"].endswith("line ending in CRLF")
    assert df.iloc[2]["raw"].endswith("line ending in bare CR")