loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,573 @@
1
+ """Unit tests for per-parser sniff recognizers (pure, no I/O).
2
+
3
+ All sample data is synthetic per the privacy rail — RFC 5737 documentation
4
+ IPs (192.0.2.x, 198.51.100.x, 203.0.113.x) and placeholder hostnames only.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import pytest
10
+
11
+ from loghunter.parsers import cloudtrail, dnsmasq, syslog, zeek, zeek_tsv
12
+
13
+
14
+ # ── Sample fixtures ───────────────────────────────────────────────────────────
15
+
16
+ ZEEK_TSV_CONN_SAMPLE: list[str] = [
17
+ "#separator \\x09\n",
18
+ "#set_separator\t,\n",
19
+ "#empty_field\t(empty)\n",
20
+ "#unset_field\t-\n",
21
+ "#path\tconn\n",
22
+ "#open\t2026-06-01-12-00-00\n",
23
+ "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\tduration\n",
24
+ "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\tinterval\n",
25
+ ]
26
+
27
+ ZEEK_TSV_DNS_SAMPLE: list[str] = [
28
+ "#separator \\x09\n",
29
+ "#set_separator\t,\n",
30
+ "#empty_field\t(empty)\n",
31
+ "#unset_field\t-\n",
32
+ "#path\tdns\n",
33
+ "#open\t2026-06-01-12-00-00\n",
34
+ "#fields\tts\tuid\tid.orig_h\tquery\tqtype\n",
35
+ "#types\ttime\tstring\taddr\tstring\tcount\n",
36
+ ]
37
+
38
+ ZEEK_TSV_UNSUPPORTED_PATH_SAMPLE: list[str] = [
39
+ "#separator \\x09\n",
40
+ "#set_separator\t,\n",
41
+ "#empty_field\t(empty)\n",
42
+ "#unset_field\t-\n",
43
+ "#path\thttp\n",
44
+ "#fields\tts\tuid\thost\n",
45
+ "#types\ttime\tstring\tstring\n",
46
+ ]
47
+
48
+ ZEEK_NDJSON_CONN_SAMPLE: list[str] = [
49
+ '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10", "id.resp_h": "198.51.100.20",'
50
+ ' "id.resp_p": 443, "proto": "tcp", "duration": 1.23}\n',
51
+ ]
52
+
53
+ ZEEK_NDJSON_CONN_NO_DURATION_SAMPLE: list[str] = [
54
+ '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10", "id.resp_h": "198.51.100.20",'
55
+ ' "id.resp_p": 443, "proto": "tcp"}\n',
56
+ ]
57
+
58
+ ZEEK_NDJSON_DNS_SAMPLE: list[str] = [
59
+ '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10", "query": "example.test", "qtype": 1}\n',
60
+ ]
61
+
62
+ # Zeek-native conn line with the _path directive — must claim "conn" via the
63
+ # _path gate (layer 1), even before the field-set fallback would fire.
64
+ ZEEK_NDJSON_CONN_WITH_PATH_SAMPLE: list[str] = [
65
+ '{"_path": "conn", "ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
66
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp",'
67
+ ' "duration": 1.23}\n',
68
+ ]
69
+
70
+ # Zeek-native dns line with the _path directive — must claim "dns" via the
71
+ # _path gate.
72
+ ZEEK_NDJSON_DNS_WITH_PATH_SAMPLE: list[str] = [
73
+ '{"_path": "dns", "ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
74
+ ' "query": "example.test", "qtype": 1}\n',
75
+ ]
76
+
77
+ # Zeek's own non-conn / non-dns NDJSON logs — they carry the 5-tuple as
78
+ # connection context but are NOT conn frames. The _path gate rejects them
79
+ # so the sniff cascade falls through to the blob floor.
80
+ ZEEK_NDJSON_NOTICE_SAMPLE: list[str] = [
81
+ '{"_path": "notice", "ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
82
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp",'
83
+ ' "note": "Placeholder::Note", "msg": "placeholder message"}\n',
84
+ ]
85
+
86
+ ZEEK_NDJSON_SYSLOG_SAMPLE: list[str] = [
87
+ '{"_path": "syslog", "ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
88
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 514, "proto": "udp",'
89
+ ' "facility": "LOCAL0", "severity": "INFO",'
90
+ ' "message": "placeholder message"}\n',
91
+ ]
92
+
93
+ # Zeek NDJSON syslog WITHOUT the _path directive. Some upstream agents emit
94
+ # Zeek logs minus _path. Pre-fix, this line fell through the _path gate, the
95
+ # conn field-set fallback claimed it as conn (5-tuple present, no `query`),
96
+ # and operators got an empty frame from `loghunter syslog`. Post-fix, the
97
+ # syslog field-set fallback (facility + severity + message + ts + src) catches
98
+ # it BEFORE the conn fallback. RFC 5737 placeholders only.
99
+ ZEEK_NDJSON_SYSLOG_NO_PATH_SAMPLE: list[str] = [
100
+ '{"ts": 1779750000.0, "uid": "CSL01",'
101
+ ' "id.orig_h": "192.0.2.10", "id.orig_p": 41514,'
102
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 514,'
103
+ ' "proto": "udp", "facility": "DAEMON", "severity": "INFO",'
104
+ ' "message": "Jun 11 12:00:00 host1 sshd[1234]: placeholder"}\n',
105
+ ]
106
+
107
+ # Defensive negatives — a line missing ONE of (facility, severity, message)
108
+ # must NOT be claimed as syslog. With the full 5-tuple still present, the
109
+ # conn fallback claims them as conn (the documented field-set behaviour for
110
+ # hand-rolled NDJSON). These prove the syslog fallback is tight on the
111
+ # three-key signature, not a "has facility OR severity" loosening.
112
+ ZEEK_NDJSON_NO_PATH_CONN_NO_FACILITY_SAMPLE: list[str] = [
113
+ '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
114
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp",'
115
+ ' "severity": "INFO", "message": "placeholder"}\n',
116
+ ]
117
+ ZEEK_NDJSON_NO_PATH_CONN_NO_SEVERITY_SAMPLE: list[str] = [
118
+ '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
119
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp",'
120
+ ' "facility": "DAEMON", "message": "placeholder"}\n',
121
+ ]
122
+ ZEEK_NDJSON_NO_PATH_CONN_NO_MESSAGE_SAMPLE: list[str] = [
123
+ '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
124
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp",'
125
+ ' "facility": "DAEMON", "severity": "INFO"}\n',
126
+ ]
127
+
128
+ ZEEK_NDJSON_ANALYZER_SAMPLE: list[str] = [
129
+ '{"_path": "analyzer", "ts": 1779750000.0, "cause": "violation",'
130
+ ' "analyzer_kind": "protocol", "analyzer_name": "Placeholder",'
131
+ ' "uid": "Cxxxxxx", "id.orig_h": "192.0.2.10",'
132
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp",'
133
+ ' "failure_reason": "placeholder"}\n',
134
+ ]
135
+
136
+ # notice.log-shaped NDJSON WITHOUT _path. Carries the conn 5-tuple via
137
+ # id.* keys (so the 2c conn fallback would historically claim it) AND its
138
+ # OWN native `src`/`dst` columns. That double-presence is the
139
+ # rename-collision signal: the loader's id.orig_h→src rename would
140
+ # duplicate the canonical `src` column and crash the conn summariser
141
+ # with pandas' "Grouper for 'src' not 1-dimensional". The field-set
142
+ # fallback must reject this and fall through to None so the sniff
143
+ # cascade lands at the blob floor.
144
+ ZEEK_NDJSON_NOTICE_NO_PATH_SAMPLE: list[str] = [
145
+ '{"ts": 1779750000.0, "uid": "Cxxxxxx",'
146
+ ' "id.orig_h": "192.0.2.10", "id.orig_p": 41514,'
147
+ ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp",'
148
+ ' "src": "192.0.2.10", "dst": "198.51.100.20",'
149
+ ' "note": "Placeholder::Note", "msg": "placeholder message"}\n',
150
+ ]
151
+
152
+ # Hand-rolled dns-shaped NDJSON carrying BOTH `id.orig_h` and a native
153
+ # `src`. The rename pair (id.orig_h → src) collides, so the 2a dns
154
+ # fallback must reject the claim and fall through to None.
155
+ ZEEK_NDJSON_DNS_NO_PATH_NATIVE_SRC_COLLISION_SAMPLE: list[str] = [
156
+ '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
157
+ ' "src": "192.0.2.10", "query": "example.test", "qtype": 1}\n',
158
+ ]
159
+
160
+ CLOUDTRAIL_NDJSON_SAMPLE: list[str] = [
161
+ '{"eventVersion": "1.08", "eventTime": "2026-06-01T12:00:00Z",'
162
+ ' "userIdentity": {"type": "IAMUser"}, "eventName": "GetObject",'
163
+ ' "eventSource": "s3.amazonaws.com"}\n',
164
+ ]
165
+
166
+ CLOUDTRAIL_ENVELOPE_SAMPLE: list[str] = [
167
+ "{\n",
168
+ ' "Records": [\n',
169
+ " {\n",
170
+ ' "eventVersion": "1.08",\n',
171
+ ' "eventTime": "2026-06-01T12:00:00Z",\n',
172
+ ' "userIdentity": {\n',
173
+ ' "type": "IAMUser"\n',
174
+ " },\n",
175
+ ' "eventName": "GetObject"\n',
176
+ " }\n",
177
+ " ]\n",
178
+ "}\n",
179
+ ]
180
+
181
+ # Envelope whose first record is enormous — used to verify the structural
182
+ # scan does not depend on the sample being fully parseable as JSON. Mimics a
183
+ # pretty-printed event with many extra keys before the recognizer's tokens.
184
+ CLOUDTRAIL_ENVELOPE_BIG_RECORD_SAMPLE: list[str] = (
185
+ ["{\n", ' "Records": [\n', " {\n"]
186
+ + [f' "extraKey{i}": "value{i}",\n' for i in range(120)]
187
+ + [
188
+ ' "eventVersion": "1.08",\n',
189
+ ' "eventTime": "2026-06-01T12:00:00Z",\n',
190
+ ' "userIdentity": {"type": "IAMUser"}\n',
191
+ ]
192
+ )
193
+
194
+ DNSMASQ_DNS_SAMPLE: list[str] = [
195
+ "Jun 1 12:00:00 piholehost dnsmasq[123]: query[A] example.test from 192.0.2.10\n",
196
+ "Jun 1 12:00:01 piholehost dnsmasq[123]: forwarded example.test to 198.51.100.53\n",
197
+ ]
198
+
199
+ # All-DHCP sample: outer grammar matches but inner is dhcp only — must NOT
200
+ # claim "dns". Guards the distinction James called out.
201
+ DNSMASQ_DHCP_ONLY_SAMPLE: list[str] = [
202
+ "Jun 1 12:00:00 piholehost dnsmasq[123]: DHCP 192.0.2.10 is placeholder-host\n",
203
+ "Jun 1 12:00:01 piholehost dnsmasq[123]: DHCP placeholder-host is 192.0.2.11\n",
204
+ ]
205
+
206
+ # All-unknown sample: outer grammar matches but message is unrecognized
207
+ # chatter — must NOT claim "dns".
208
+ DNSMASQ_UNKNOWN_ONLY_SAMPLE: list[str] = [
209
+ "Jun 1 12:00:00 piholehost dnsmasq[123]: started, version 2.86 cachesize 10000\n",
210
+ "Jun 1 12:00:01 piholehost dnsmasq[123]: compile time options: IPv6 GNU-getopt\n",
211
+ ]
212
+
213
+ # DHCP prefix followed by a real DNS event — must still claim "dns" (the
214
+ # budget tolerates leading DHCP/unknown lines).
215
+ DNSMASQ_DHCP_PREFIX_THEN_DNS_SAMPLE: list[str] = [
216
+ "Jun 1 12:00:00 piholehost dnsmasq[123]: DHCP 192.0.2.10 is placeholder-host\n",
217
+ "Jun 1 12:00:01 piholehost dnsmasq[123]: query[A] example.test from 192.0.2.11\n",
218
+ ]
219
+
220
+ SYSLOG_SAMPLE: list[str] = [
221
+ "<13>Jun 1 12:00:00 examplehost sshd[1234]: Accepted publickey for placeholder\n",
222
+ "Jun 1 12:00:01 examplehost cron[5678]: (root) CMD (placeholder)\n",
223
+ ]
224
+
225
+ SYSLOG_NO_PRI_SAMPLE: list[str] = [
226
+ "Jun 1 12:00:00 examplehost sshd[1234]: Accepted publickey for placeholder\n",
227
+ ]
228
+
229
+ GARBAGE_TEXT_SAMPLE: list[str] = [
230
+ "hello world\n",
231
+ "this is not a log\n",
232
+ "lorem ipsum dolor sit amet\n",
233
+ ]
234
+
235
+ # Generic JSON envelope with a "Records" key but no CloudTrail event keys
236
+ # — must NOT be claimed as cloudtrail. Guards the structural scan's
237
+ # precision (James's CT negative test).
238
+ GENERIC_RECORDS_JSON_SAMPLE: list[str] = [
239
+ "{\n",
240
+ ' "Records": [\n',
241
+ " {\n",
242
+ ' "foo": "bar",\n',
243
+ ' "baz": 42\n',
244
+ " }\n",
245
+ " ]\n",
246
+ "}\n",
247
+ ]
248
+
249
+ EMPTY_SAMPLE: list[str] = []
250
+
251
+ BLANK_SAMPLE: list[str] = ["\n", "\n", " \n"]
252
+
253
+
254
+ # ── Per-recognizer tests (positive claims and targeted negatives) ─────────────
255
+
256
def test_zeek_tsv_sniff_conn() -> None:
    """The Zeek TSV recognizer labels the conn header sample as "conn"."""
    claimed = zeek_tsv.sniff(ZEEK_TSV_CONN_SAMPLE)
    assert claimed == "conn"
258
+
259
+
260
def test_zeek_tsv_sniff_dns() -> None:
    """The Zeek TSV recognizer labels the dns header sample as "dns"."""
    claimed = zeek_tsv.sniff(ZEEK_TSV_DNS_SAMPLE)
    assert claimed == "dns"
262
+
263
+
264
def test_zeek_tsv_sniff_unsupported_path_returns_none() -> None:
    """A well-formed header whose #path is ``http`` must not be claimed.

    Per the original note, #path http has no digester in v1, so the
    recognizer has to fall through instead of claiming a slot.
    """
    claimed = zeek_tsv.sniff(ZEEK_TSV_UNSUPPORTED_PATH_SAMPLE)
    assert claimed is None
267
+
268
+
269
def test_zeek_tsv_sniff_missing_fields_directive_returns_none() -> None:
    """A header with #separator and #path but no #fields row is not claimed."""
    header_without_fields = ["#separator \\x09\n", "#path\tconn\n"]
    assert zeek_tsv.sniff(header_without_fields) is None
275
+
276
+
277
def test_zeek_tsv_sniff_missing_separator_returns_none() -> None:
    """A header lacking the #separator directive is not claimed.

    Original rationale: without #separator the other directives cannot be
    split reliably.
    """
    header_without_separator = ["#path\tconn\n", "#fields\tts\tuid\n"]
    assert zeek_tsv.sniff(header_without_separator) is None
284
+
285
+
286
def test_zeek_tsv_sniff_path_substring_in_payload_returns_none() -> None:
    """Prose that merely mentions "#path" mid-line must not look like Zeek TSV.

    The original note attributes this to a guard that requires the directive
    to start with a leading ``#``.
    """
    prose_lines = [
        "this line mentions #path conn but is not a header\n",
        "second line of prose\n",
    ]
    claimed = zeek_tsv.sniff(prose_lines)
    assert claimed is None
294
+
295
+
296
def test_zeek_sniff_conn_ndjson() -> None:
    """The Zeek NDJSON recognizer labels the conn sample as "conn"."""
    claimed = zeek.sniff(ZEEK_NDJSON_CONN_SAMPLE)
    assert claimed == "conn"
298
+
299
+
300
def test_zeek_sniff_conn_no_duration() -> None:
    """A conn record without ``duration`` is still claimed as "conn".

    Original note: duration is optional because Zeek omits it on open
    connections.
    """
    claimed = zeek.sniff(ZEEK_NDJSON_CONN_NO_DURATION_SAMPLE)
    assert claimed == "conn"
303
+
304
+
305
def test_zeek_sniff_dns_ndjson() -> None:
    """The Zeek NDJSON recognizer labels the dns sample as "dns"."""
    claimed = zeek.sniff(ZEEK_NDJSON_DNS_SAMPLE)
    assert claimed == "dns"
307
+
308
+
309
def test_zeek_sniff_dns_wins_over_conn_when_query_present() -> None:
    """A record carrying conn fields AND ``query`` is classified as "dns".

    This is the pathological mix: ``query`` is the documented disambiguator,
    so its presence decides the claim.
    """
    mixed_record = (
        '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10", "id.resp_h": "198.51.100.20",'
        ' "id.resp_p": 443, "proto": "tcp", "duration": 0.1, "query": "example.test"}\n'
    )
    claimed = zeek.sniff([mixed_record])
    assert claimed == "dns"
317
+
318
+
319
+ # ── _path gate — Zeek-native conn/dns NDJSON ──────────────────────────────────
320
+
321
+
322
def test_zeek_sniff_conn_with_path_directive() -> None:
    """A record with ``_path`` set to "conn" is claimed as conn (layer 1)."""
    claimed = zeek.sniff(ZEEK_NDJSON_CONN_WITH_PATH_SAMPLE)
    assert claimed == "conn"
325
+
326
+
327
def test_zeek_sniff_dns_with_path_directive() -> None:
    """A record with ``_path`` set to "dns" is claimed as dns (layer 1)."""
    claimed = zeek.sniff(ZEEK_NDJSON_DNS_WITH_PATH_SAMPLE)
    assert claimed == "dns"
330
+
331
+
332
+ # ── _path gate — Zeek non-conn/non-dns logs are handled correctly ────────────
333
+ #
334
+ # Zeek's own syslog.log, notice.log, analyzer.log lines carry the 5-tuple as
335
+ # connection context. Pre-fix, the field-set fallback would claim them as
336
+ # conn, the loader would normalise them, and the digest summariser would
337
+ # crash on the resulting frame (e.g. duplicate `src` column → "Grouper for
338
+ # 'src' not 1-dimensional"). Post-fix, the _path gate is authoritative:
339
+ # _path == "syslog" is now a real claim (v1 promotion); _path in {notice,
340
+ # analyzer, …} still falls to the blob floor.
341
+
342
+
343
def test_zeek_sniff_notice_path_not_claimed_as_conn() -> None:
    """A ``_path == "notice"`` record is left unclaimed despite its 5-tuple."""
    claimed = zeek.sniff(ZEEK_NDJSON_NOTICE_SAMPLE)
    assert claimed is None
345
+
346
+
347
def test_zeek_sniff_syslog_path_claims_syslog() -> None:
    """A ``_path == "syslog"`` record is claimed as "syslog".

    Per the original note this is the v1 syslog claim: before the promotion
    the recognizer returned None here, and afterwards this is the entry point
    that routes Zeek syslog.log through _normalize_zeek_syslog_df.
    """
    claimed = zeek.sniff(ZEEK_NDJSON_SYSLOG_SAMPLE)
    assert claimed == "syslog"
352
+
353
+
354
def test_zeek_sniff_analyzer_path_not_claimed_as_conn() -> None:
    """A ``_path == "analyzer"`` record is left unclaimed despite its 5-tuple."""
    claimed = zeek.sniff(ZEEK_NDJSON_ANALYZER_SAMPLE)
    assert claimed is None
356
+
357
+
358
+ # ── Field-set fallback — Zeek NDJSON without _path ───────────────────────────
359
+ #
360
+ # Some upstream agents emit Zeek logs minus the _path directive. The original
361
+ # v1 promotion only claimed syslog via the _path gate, so a no-_path Zeek
362
+ # syslog.log fell through to the conn fallback (full 5-tuple present, no
363
+ # `query`) and was misrouted as conn. The syslog field-set fallback now
364
+ # catches these BEFORE the conn fallback, gated on the tight (facility,
365
+ # severity, message) triple that no other Zeek log type carries.
366
+
367
+
368
def test_zeek_sniff_syslog_without_path_claims_syslog() -> None:
    """Pathless Zeek syslog NDJSON is claimed as "syslog".

    The claim comes from the field-set fallback keyed on (facility, severity,
    message) plus src/ts; the original note records that this shape used to
    fall through to conn.
    """
    claimed = zeek.sniff(ZEEK_NDJSON_SYSLOG_NO_PATH_SAMPLE)
    assert claimed == "syslog"
372
+
373
+
374
def test_zeek_sniff_field_set_syslog_requires_facility() -> None:
    """Negative: without ``facility`` the syslog fallback does not fire.

    The full 5-tuple is still present, so the conn fallback claims the record.
    """
    claimed = zeek.sniff(ZEEK_NDJSON_NO_PATH_CONN_NO_FACILITY_SAMPLE)
    assert claimed == "conn"
378
+
379
+
380
def test_zeek_sniff_field_set_syslog_requires_severity() -> None:
    """Negative: without ``severity`` the syslog fallback does not fire."""
    claimed = zeek.sniff(ZEEK_NDJSON_NO_PATH_CONN_NO_SEVERITY_SAMPLE)
    assert claimed == "conn"
383
+
384
+
385
def test_zeek_sniff_field_set_syslog_requires_message() -> None:
    """Negative: without ``message`` the syslog fallback does not fire.

    The facility/severity/message triple is load-bearing; relaxing any one of
    the three would reopen the notice/analyzer false-claim risk.
    """
    claimed = zeek.sniff(ZEEK_NDJSON_NO_PATH_CONN_NO_MESSAGE_SAMPLE)
    assert claimed == "conn"
390
+
391
+
392
+ # ── Field-set fallback — rename-collision guard ──────────────────────────────
393
+ #
394
+ # Records carrying BOTH a Zeek-native key (id.orig_h/id.resp_h/id.resp_p/
395
+ # orig_bytes/TTLs/answers/TC) AND its canonical rename target are NOT
396
+ # clean conn/dns frames — the loader's rename would crash the summariser.
397
+ # The field-set fallback must reject them; sniff returns None and the
398
+ # orchestrator drops to the blob floor.
399
+
400
+
401
def test_zeek_sniff_notice_no_path_native_src_collision_returns_none() -> None:
    """Pathless notice-shaped NDJSON with id.orig_h plus native src/dst is unclaimed.

    This is the original incident shape: it must NOT be claimed as conn, so
    sniff returns None and the orchestrator drops to blob.
    """
    claimed = zeek.sniff(ZEEK_NDJSON_NOTICE_NO_PATH_SAMPLE)
    assert claimed is None
406
+
407
+
408
def test_zeek_sniff_dns_no_path_native_src_collision_returns_none() -> None:
    """Pathless dns-shaped NDJSON with both id.orig_h and native src is unclaimed.

    It is a rename-collision shape too, so the 2a dns fallback has to reject
    it.
    """
    claimed = zeek.sniff(ZEEK_NDJSON_DNS_NO_PATH_NATIVE_SRC_COLLISION_SAMPLE)
    assert claimed is None
413
+
414
+
415
def test_zeek_sniff_clean_no_path_conn_still_claims_conn() -> None:
    """Regression: clean pathless conn NDJSON keeps claiming "conn".

    The sample has id.* keys only and no native src/dst; rejecting it would
    break legitimate exported Zeek conn NDJSON.
    """
    claimed = zeek.sniff(ZEEK_NDJSON_CONN_SAMPLE)
    assert claimed == "conn"
420
+
421
+
422
def test_zeek_sniff_clean_no_path_dns_still_claims_dns() -> None:
    """Regression: clean pathless dns NDJSON keeps claiming "dns".

    The sample has id.orig_h + query and no native src.
    """
    claimed = zeek.sniff(ZEEK_NDJSON_DNS_SAMPLE)
    assert claimed == "dns"
426
+
427
+
428
def test_zeek_sniff_path_gate_trusted_over_field_set() -> None:
    """A present ``_path`` is the only signal consulted.

    Even a field set that would be a valid conn or dns set gets no second
    say; that contract is what prevents notice/syslog/analyzer false claims.
    """
    # _path says "weird" while the remaining keys look like conn, so the
    # expected outcome is None.
    weird_record = (
        '{"_path": "weird", "ts": 1779750000.0, "id.orig_h": "192.0.2.10",'
        ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp"}\n'
    )
    claimed = zeek.sniff([weird_record])
    assert claimed is None
438
+
439
+
440
def test_cloudtrail_sniff_ndjson() -> None:
    """The CloudTrail recognizer claims the NDJSON event sample."""
    claimed = cloudtrail.sniff(CLOUDTRAIL_NDJSON_SAMPLE)
    assert claimed == "cloudtrail"
442
+
443
+
444
def test_cloudtrail_sniff_envelope() -> None:
    """The CloudTrail recognizer claims the pretty-printed Records envelope."""
    claimed = cloudtrail.sniff(CLOUDTRAIL_ENVELOPE_SAMPLE)
    assert claimed == "cloudtrail"
446
+
447
+
448
def test_cloudtrail_sniff_envelope_with_huge_first_record() -> None:
    """An envelope whose first record sprawls is still claimed.

    Original note: the structural scan finds quoted keys without needing a
    parseable bounded sample.
    """
    claimed = cloudtrail.sniff(CLOUDTRAIL_ENVELOPE_BIG_RECORD_SAMPLE)
    assert claimed == "cloudtrail"
452
+
453
+
454
def test_cloudtrail_sniff_generic_records_returns_none() -> None:
    """A "Records" envelope without CloudTrail event keys is not claimed."""
    claimed = cloudtrail.sniff(GENERIC_RECORDS_JSON_SAMPLE)
    assert claimed is None
458
+
459
+
460
def test_cloudtrail_sniff_event_keys_as_string_values_does_not_false_positive() -> None:
    """CloudTrail key names appearing only inside string values are ignored.

    Per the original note, the recognizer's quoted-key + colon pattern
    requires the token to appear as a JSON key, not as a value.
    """
    records_mentioning_keys = [
        '{"message": "the eventTime field was updated"}\n',
        '{"note": "userIdentity values include IAMUser"}\n',
    ]
    claimed = cloudtrail.sniff(records_mentioning_keys)
    assert claimed is None
468
+
469
+
470
def test_dnsmasq_sniff_dns() -> None:
    """The dnsmasq recognizer labels the query/forwarded sample as "dns"."""
    claimed = dnsmasq.sniff(DNSMASQ_DNS_SAMPLE)
    assert claimed == "dns"
472
+
473
+
474
def test_dnsmasq_sniff_dhcp_only_returns_none() -> None:
    """A sample made only of dnsmasq DHCP lines is not claimed."""
    claimed = dnsmasq.sniff(DNSMASQ_DHCP_ONLY_SAMPLE)
    assert claimed is None
476
+
477
+
478
def test_dnsmasq_sniff_unknown_only_returns_none() -> None:
    """A sample made only of unrecognized dnsmasq chatter is not claimed."""
    claimed = dnsmasq.sniff(DNSMASQ_UNKNOWN_ONLY_SAMPLE)
    assert claimed is None
480
+
481
+
482
def test_dnsmasq_sniff_tolerates_dhcp_prefix_before_dns_event() -> None:
    """A leading DHCP line followed by a DNS query line still yields "dns"."""
    claimed = dnsmasq.sniff(DNSMASQ_DHCP_PREFIX_THEN_DNS_SAMPLE)
    assert claimed == "dns"
484
+
485
+
486
def test_syslog_sniff_with_pri() -> None:
    """The syslog recognizer claims the sample whose first line has a <PRI> prefix."""
    claimed = syslog.sniff(SYSLOG_SAMPLE)
    assert claimed == "syslog"
488
+
489
+
490
def test_syslog_sniff_without_pri() -> None:
    """The syslog recognizer claims a sample that has no <PRI> prefix."""
    claimed = syslog.sniff(SYSLOG_NO_PRI_SAMPLE)
    assert claimed == "syslog"
492
+
493
+
494
def test_syslog_sniff_garbage_text_returns_none() -> None:
    """Free-form prose is not claimed as syslog.

    Original note: the earlier "parse_line returns non-None" contract would
    have classified this text as syslog; the tightened recognizer returns
    None.
    """
    claimed = syslog.sniff(GARBAGE_TEXT_SAMPLE)
    assert claimed is None
498
+
499
+
500
def test_syslog_sniff_missing_timestamp_returns_none() -> None:
    """A header-shaped line whose timestamp does not parse is not claimed."""
    header_like_lines = ["Foo 1 12:00:00 examplehost prog: text\n"]
    claimed = syslog.sniff(header_like_lines)
    assert claimed is None
504
+
505
+
506
+ # ── Edge cases shared by all recognizers ──────────────────────────────────────
507
+
508
@pytest.mark.parametrize(
    "mod",
    [zeek_tsv, zeek, cloudtrail, dnsmasq, syslog],
    ids=["zeek_tsv", "zeek", "cloudtrail", "dnsmasq", "syslog"],
)
def test_empty_sample_returns_none(mod) -> None:
    """Every recognizer returns None when given an empty sample list."""
    outcome = mod.sniff(EMPTY_SAMPLE)
    assert outcome is None
515
+
516
+
517
@pytest.mark.parametrize(
    "mod",
    [zeek_tsv, zeek, cloudtrail, dnsmasq, syslog],
    ids=["zeek_tsv", "zeek", "cloudtrail", "dnsmasq", "syslog"],
)
def test_blank_sample_returns_none(mod) -> None:
    """Every recognizer returns None when the sample holds only blank lines."""
    outcome = mod.sniff(BLANK_SAMPLE)
    assert outcome is None
524
+
525
+
526
@pytest.mark.parametrize(
    "mod",
    [zeek_tsv, zeek, cloudtrail, dnsmasq, syslog],
    ids=["zeek_tsv", "zeek", "cloudtrail", "dnsmasq", "syslog"],
)
def test_garbage_text_returns_none(mod) -> None:
    """Every recognizer returns None for the free-form prose sample."""
    outcome = mod.sniff(GARBAGE_TEXT_SAMPLE)
    assert outcome is None
533
+
534
+
535
+ # ── Cross-format negative matrix ──────────────────────────────────────────────
536
+ #
537
+ # Each recognizer is fed every OTHER format's sample. Most pairs return
538
+ # None. The one documented overlap is syslog.sniff(dnsmasq sample) — dnsmasq
539
+ # IS RFC 3164, so the recognizer-level signal is genuinely "syslog"; the
540
+ # orchestrator resolves the ambiguity by running dnsmasq first.
541
+
542
# Maps a sample label to (module that owns the format, sample lines). The
# insertion order is the order in which the matrix test is parametrized.
_FOREIGN_SAMPLES = dict(
    zeek_tsv_conn=(zeek_tsv, ZEEK_TSV_CONN_SAMPLE),
    zeek_tsv_dns=(zeek_tsv, ZEEK_TSV_DNS_SAMPLE),
    zeek_ndjson_conn=(zeek, ZEEK_NDJSON_CONN_SAMPLE),
    zeek_ndjson_dns=(zeek, ZEEK_NDJSON_DNS_SAMPLE),
    ct_ndjson=(cloudtrail, CLOUDTRAIL_NDJSON_SAMPLE),
    ct_envelope=(cloudtrail, CLOUDTRAIL_ENVELOPE_SAMPLE),
    dnsmasq=(dnsmasq, DNSMASQ_DNS_SAMPLE),
    syslog=(syslog, SYSLOG_SAMPLE),
)
552
+
553
+
554
@pytest.mark.parametrize("origin_name", list(_FOREIGN_SAMPLES))
@pytest.mark.parametrize(
    "target_mod",
    [zeek_tsv, zeek, cloudtrail, dnsmasq, syslog],
    ids=["zeek_tsv", "zeek", "cloudtrail", "dnsmasq", "syslog"],
)
def test_cross_format_negative_matrix(origin_name, target_mod) -> None:
    """Feed each recognizer another format's sample and expect no claim.

    Pairs where the recognizer owns the sample are skipped, since the positive
    tests cover them. The single documented overlap is the syslog recognizer
    on the dnsmasq sample, which is expected to answer "syslog".
    """
    source_mod, foreign_sample = _FOREIGN_SAMPLES[origin_name]
    if target_mod is source_mod:
        pytest.skip("self-match handled by positive tests")

    result = target_mod.sniff(foreign_sample)

    # Documented overlap: dnsmasq logs ARE syslog. The orchestrator's
    # precedence (dnsmasq before syslog) is what resolves this.
    is_documented_overlap = target_mod is syslog and origin_name == "dnsmasq"
    if is_documented_overlap:
        assert result == "syslog"
        return

    assert result is None, (
        f"{target_mod.__name__}.sniff falsely claimed {result!r} for "
        f"a {origin_name} sample"
    )