loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,373 @@
1
+ """Integration tests for sniff_format — the loader-layer orchestrator.
2
+
3
+ Verifies file I/O integration (`_open_log`, gzip transparency), precedence,
4
+ the blob floor, and the bounded-read perf guarantee. All sample data is
5
+ synthetic per the privacy rail — RFC 5737 IPs and placeholder hostnames.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import gzip
11
+ import json
12
+ from pathlib import Path
13
+
14
+ import pytest
15
+
16
+ from loghunter.common import loader
17
+ from loghunter.common.loader import _SNIFF_MAX_PEEK, sniff_format, sniff_format_detailed
18
+
19
+
20
+ # ── File fixture helpers ──────────────────────────────────────────────────────
21
+
22
def _write(path: Path, lines: list[str]) -> None:
    """Write *lines* to *path* as UTF-8 text.

    The strings are concatenated as-is (no separator is inserted), so each
    element is expected to carry its own trailing newline.
    """
    path.write_text("".join(lines), encoding="utf-8")
24
+
25
+
26
def _write_gz(path: Path, lines: list[str]) -> None:
    """Write *lines* to *path* as a gzip-compressed UTF-8 payload.

    The strings are concatenated as-is (no separator is inserted), encoded
    to UTF-8, and compressed in one shot with ``gzip.compress``.
    """
    path.write_bytes(gzip.compress("".join(lines).encode("utf-8")))
28
+
29
+
30
# Zeek TSV header block declaring ``#path conn`` (metadata lines only).
ZEEK_TSV_CONN_LINES = [
    "#separator \\x09\n",
    "#set_separator\t,\n",
    "#empty_field\t(empty)\n",
    "#unset_field\t-\n",
    "#path\tconn\n",
    "#open\t2026-06-01-12-00-00\n",
    "#fields\tts\tuid\tid.orig_h\tid.orig_p\tid.resp_h\tid.resp_p\tproto\tduration\n",
    "#types\ttime\tstring\taddr\tport\taddr\tport\tenum\tinterval\n",
]

# Minimal Zeek TSV header block declaring ``#path dns``.
ZEEK_TSV_DNS_LINES = [
    "#separator \\x09\n",
    "#path\tdns\n",
    "#fields\tts\tuid\tid.orig_h\tquery\n",
    "#types\ttime\tstring\taddr\tstring\n",
]

# One Zeek-style NDJSON record carrying conn fields (id.* endpoints, proto, duration).
ZEEK_NDJSON_CONN_LINE = (
    '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10", "id.resp_h": "198.51.100.20",'
    ' "id.resp_p": 443, "proto": "tcp", "duration": 1.23}\n'
)

# One Zeek-style NDJSON record carrying a ``query`` field.
ZEEK_NDJSON_DNS_LINE = (
    '{"ts": 1779750000.0, "id.orig_h": "192.0.2.10", "query": "example.test"}\n'
)

# Single synthetic CloudTrail event shared by the NDJSON and envelope fixtures,
# so the two payloads cannot drift apart.
_CLOUDTRAIL_EVENT = {
    "eventVersion": "1.08",
    "eventTime": "2026-06-01T12:00:00Z",
    "userIdentity": {"type": "IAMUser"},
    "eventName": "GetObject",
    "eventSource": "s3.amazonaws.com",
    "sourceIPAddress": "192.0.2.10",
}

# The event serialized as one compact JSON line.
CLOUDTRAIL_NDJSON_LINE = json.dumps(_CLOUDTRAIL_EVENT) + "\n"

# The event wrapped in a ``{"Records": [...]}`` envelope, pretty-printed
# across multiple lines.
CLOUDTRAIL_ENVELOPE_PAYLOAD = json.dumps(
    {"Records": [_CLOUDTRAIL_EVENT]},
    indent=2,
) + "\n"

# Two dnsmasq-tagged syslog lines (a query and a forward).
# NOTE(review): the single space between month and day may be whitespace
# collapsed by the diff rendering — confirm against the packaged file.
DNSMASQ_LINES = [
    "Jun 1 12:00:00 piholehost dnsmasq[123]: query[A] example.test from 192.0.2.10\n",
    "Jun 1 12:00:01 piholehost dnsmasq[123]: forwarded example.test to 198.51.100.53\n",
]

# Two generic syslog lines; the first carries a ``<13>`` priority prefix.
# NOTE(review): same month/day spacing caveat as DNSMASQ_LINES — confirm.
SYSLOG_LINES = [
    "<13>Jun 1 12:00:00 examplehost sshd[1234]: Accepted publickey for placeholder\n",
    "Jun 1 12:00:01 examplehost cron[5678]: (root) CMD (placeholder)\n",
]
88
+
89
+
90
+ # ── Per-format classification ─────────────────────────────────────────────────
91
+
92
def test_sniff_format_zeek_tsv_conn(tmp_path: Path) -> None:
    """A Zeek TSV header declaring ``#path conn`` sniffs as ``"conn"``."""
    path = tmp_path / "conn.log"
    _write(path, ZEEK_TSV_CONN_LINES)
    assert sniff_format(path) == "conn"
96
+
97
+
98
def test_sniff_format_zeek_tsv_dns(tmp_path: Path) -> None:
    """A Zeek TSV header declaring ``#path dns`` sniffs as ``"dns"``."""
    path = tmp_path / "dns.log"
    _write(path, ZEEK_TSV_DNS_LINES)
    assert sniff_format(path) == "dns"
102
+
103
+
104
def test_sniff_format_zeek_ndjson_conn(tmp_path: Path) -> None:
    """The Zeek NDJSON conn fixture line sniffs as ``"conn"``."""
    path = tmp_path / "conn.log"
    _write(path, [ZEEK_NDJSON_CONN_LINE])
    assert sniff_format(path) == "conn"
108
+
109
+
110
def test_sniff_format_zeek_ndjson_dns(tmp_path: Path) -> None:
    """The Zeek NDJSON dns fixture line sniffs as ``"dns"``."""
    path = tmp_path / "dns.log"
    _write(path, [ZEEK_NDJSON_DNS_LINE])
    assert sniff_format(path) == "dns"
114
+
115
+
116
def test_sniff_format_cloudtrail_ndjson(tmp_path: Path) -> None:
    """A one-event-per-line CloudTrail file sniffs as ``"cloudtrail"``."""
    path = tmp_path / "cloudtrail.json.log"
    _write(path, [CLOUDTRAIL_NDJSON_LINE])
    assert sniff_format(path) == "cloudtrail"
120
+
121
+
122
def test_sniff_format_cloudtrail_envelope(tmp_path: Path) -> None:
    """A pretty-printed ``{"Records": [...]}`` envelope sniffs as ``"cloudtrail"``."""
    path = tmp_path / "cloudtrail.json"
    path.write_text(CLOUDTRAIL_ENVELOPE_PAYLOAD, encoding="utf-8")
    assert sniff_format(path) == "cloudtrail"
126
+
127
+
128
def test_sniff_format_cloudtrail_envelope_gz(tmp_path: Path) -> None:
    """A gzip-compressed CloudTrail envelope still sniffs as ``"cloudtrail"``."""
    # Exercises the _open_log gzip path in the orchestrator end-to-end.
    path = tmp_path / "cloudtrail.json.gz"
    # Reuse the shared gzip fixture helper rather than re-spelling the
    # compress/encode steps inline.
    _write_gz(path, [CLOUDTRAIL_ENVELOPE_PAYLOAD])
    assert sniff_format(path) == "cloudtrail"
133
+
134
+
135
def test_sniff_format_dnsmasq(tmp_path: Path) -> None:
    """The dnsmasq fixture lines sniff as the ``"dns"`` schema."""
    path = tmp_path / "pihole.log"
    _write(path, DNSMASQ_LINES)
    assert sniff_format(path) == "dns"
139
+
140
+
141
def test_sniff_format_syslog(tmp_path: Path) -> None:
    """The generic syslog fixture lines sniff as ``"syslog"``."""
    path = tmp_path / "syslog"
    _write(path, SYSLOG_LINES)
    assert sniff_format(path) == "syslog"
145
+
146
+
147
def test_sniff_format_zeek_tsv_gz(tmp_path: Path) -> None:
    """A gzip-compressed Zeek TSV conn header still sniffs as ``"conn"``."""
    path = tmp_path / "conn.log.gz"
    _write_gz(path, ZEEK_TSV_CONN_LINES)
    assert sniff_format(path) == "conn"
151
+
152
+
153
+ # ── Ambiguity / precedence ────────────────────────────────────────────────────
154
+
155
def test_zeek_ndjson_not_claimed_as_cloudtrail(tmp_path: Path) -> None:
    """Zeek NDJSON conn input resolves to ``"conn"``, not ``"cloudtrail"``."""
    # A Zeek NDJSON conn line is JSON but lacks CT event keys — cloudtrail
    # must not claim it; the zeek recognizer downstream wins.
    path = tmp_path / "conn.log"
    _write(path, [ZEEK_NDJSON_CONN_LINE])
    assert sniff_format(path) == "conn"
161
+
162
+
163
def test_cloudtrail_event_not_claimed_as_zeek(tmp_path: Path) -> None:
    """CloudTrail NDJSON input resolves to ``"cloudtrail"``, not a Zeek schema."""
    # A CloudTrail per-event NDJSON line is JSON but lacks Zeek's key sets
    # — the cloudtrail recognizer wins (precedence: cloudtrail before zeek).
    path = tmp_path / "events.json.log"
    _write(path, [CLOUDTRAIL_NDJSON_LINE])
    assert sniff_format(path) == "cloudtrail"
169
+
170
+
171
def test_zeek_ndjson_notice_no_path_routes_to_blob(tmp_path: Path) -> None:
    """A notice.log-shaped NDJSON record falls through to ``"blob"``."""
    # notice.log-shaped pathless NDJSON: carries the conn 5-tuple via
    # id.* AND its own native src/dst (the original incident shape).
    # The Layer-2 conn fallback rejects the rename-collision; sniff
    # returns None and the orchestrator drops to the blob floor.
    line = (
        '{"ts": 1779750000.0, "uid": "Cxxxxxx",'
        ' "id.orig_h": "192.0.2.10", "id.orig_p": 41514,'
        ' "id.resp_h": "198.51.100.20", "id.resp_p": 443, "proto": "tcp",'
        ' "src": "192.0.2.10", "dst": "198.51.100.20",'
        ' "note": "Placeholder::Note", "msg": "placeholder message"}\n'
    )
    path = tmp_path / "notice.log"
    _write(path, [line])
    assert sniff_format(path) == "blob"
186
+
187
+
188
def test_dnsmasq_wins_over_syslog(tmp_path: Path) -> None:
    """dnsmasq lines resolve to ``"dns"`` even though they are syslog-shaped."""
    # Dnsmasq IS RFC 3164 — both recognizers would match at the
    # recognizer level. The orchestrator runs dnsmasq first.
    path = tmp_path / "pihole.log"
    _write(path, DNSMASQ_LINES)
    assert sniff_format(path) == "dns"
194
+
195
+
196
+ # ── Blob floor ────────────────────────────────────────────────────────────────
197
+
198
def test_sniff_format_unrecognized_text_returns_blob(tmp_path: Path) -> None:
    """Free-form text that matches no log format sniffs as ``"blob"``."""
    path = tmp_path / "mystery.txt"
    _write(path, ["hello world\n", "this is not a log\n", "lorem ipsum\n"])
    assert sniff_format(path) == "blob"
202
+
203
+
204
def test_sniff_format_empty_file_returns_blob(tmp_path: Path) -> None:
    """A zero-byte file sniffs as ``"blob"`` through the simple API."""
    path = tmp_path / "empty.log"
    path.write_text("", encoding="utf-8")
    assert sniff_format(path) == "blob"
208
+
209
+
210
def test_sniff_format_blanks_only_returns_blob(tmp_path: Path) -> None:
    """A file containing only blank/whitespace lines sniffs as ``"blob"``."""
    path = tmp_path / "blanks.log"
    _write(path, ["\n", "\n", " \n"])
    assert sniff_format(path) == "blob"
214
+
215
+
216
+ # ── Bounded-read perf guarantee ───────────────────────────────────────────────
217
+
218
class _CountingHandle:
    """Context-manager iterator that counts how many lines were pulled.

    Stands in for the object returned by ``_open_log`` so a test can assert
    exactly how many lines the code under test consumed. ``read_count`` is
    incremented once per line successfully yielded; hitting the end of the
    underlying iterable does not count.
    """

    def __init__(self, lines):
        # ``lines`` may be any iterable of strings, including a lazy
        # generator — it is only ever advanced through ``__next__``.
        self._iter = iter(lines)
        self.read_count = 0

    def __enter__(self) -> "_CountingHandle":
        return self

    def __exit__(self, exc_type, exc, tb) -> bool:
        # Returning False lets any exception raised inside the ``with``
        # block propagate unchanged.
        return False

    def __iter__(self) -> "_CountingHandle":
        return self

    def __next__(self) -> str:
        # ``next`` raises StopIteration before the increment, so an
        # exhausted source never inflates the count.
        line = next(self._iter)
        self.read_count += 1
        return line
238
+
239
+
240
def test_sniff_format_caps_reads_at_max_peek(monkeypatch, tmp_path: Path) -> None:
    """Orchestrator pulls at most _SNIFF_MAX_PEEK lines, even for huge inputs."""
    over_budget_count = _SNIFF_MAX_PEEK + 100_000
    # Lazy generator: lines past the cap are never materialized unless the
    # code under test actually pulls them.
    lines = (f"random text line {i}\n" for i in range(over_budget_count))
    handle = _CountingHandle(lines)

    def fake_open_log(path):
        return handle

    # Swap the loader's file opener so no real file is read.
    monkeypatch.setattr(loader, "_open_log", fake_open_log)
    result = sniff_format(tmp_path / "fake")

    assert result == "blob"
    assert handle.read_count == _SNIFF_MAX_PEEK
254
+
255
+
256
def test_sniff_format_pulls_only_as_many_lines_as_file_has(
    monkeypatch, tmp_path: Path
) -> None:
    """When the file is smaller than the budget, only the file's lines are pulled."""
    short_lines = ["hello\n", "world\n", "shorter than budget\n"]
    # Guard the premise: the fixture must really be under the peek budget.
    assert len(short_lines) < _SNIFF_MAX_PEEK
    handle = _CountingHandle(short_lines)

    def fake_open_log(path):
        return handle

    # Swap the loader's file opener so no real file is read.
    monkeypatch.setattr(loader, "_open_log", fake_open_log)
    result = sniff_format(tmp_path / "fake")

    assert result == "blob"
    assert handle.read_count == len(short_lines)
272
+
273
+
274
+ # ── sniff_format_detailed: schema + origin + empty-state ─────────────────────
275
+
276
def test_detailed_zeek_ndjson_conn_origin_zeek(tmp_path: Path) -> None:
    """Zeek NDJSON conn input is classified as schema ``conn``, origin ``zeek``."""
    path = tmp_path / "conn.log"
    _write(path, [ZEEK_NDJSON_CONN_LINE])
    result = sniff_format_detailed(path)
    assert result.state == "classified"
    assert result.schema == "conn"
    assert result.origin == "zeek"
283
+
284
+
285
def test_detailed_zeek_ndjson_dns_origin_zeek(tmp_path: Path) -> None:
    """Zeek NDJSON dns input is classified as schema ``dns``, origin ``zeek``."""
    path = tmp_path / "dns.log"
    _write(path, [ZEEK_NDJSON_DNS_LINE])
    result = sniff_format_detailed(path)
    assert result.state == "classified"
    assert result.schema == "dns"
    assert result.origin == "zeek"
292
+
293
+
294
def test_detailed_dnsmasq_origin_pihole(tmp_path: Path) -> None:
    """dnsmasq input is classified as schema ``dns``, origin ``pihole``."""
    path = tmp_path / "pihole.log"
    _write(path, DNSMASQ_LINES)
    result = sniff_format_detailed(path)
    assert result.state == "classified"
    assert result.schema == "dns"
    assert result.origin == "pihole"
301
+
302
+
303
def test_detailed_cloudtrail_origin_cloudtrail(tmp_path: Path) -> None:
    """CloudTrail NDJSON input is classified with schema and origin ``cloudtrail``."""
    path = tmp_path / "cloudtrail.json.log"
    _write(path, [CLOUDTRAIL_NDJSON_LINE])
    result = sniff_format_detailed(path)
    assert result.state == "classified"
    assert result.schema == "cloudtrail"
    assert result.origin == "cloudtrail"
310
+
311
+
312
def test_detailed_syslog_origin_syslog(tmp_path: Path) -> None:
    """Generic syslog input is classified with schema and origin ``syslog``."""
    path = tmp_path / "syslog"
    _write(path, SYSLOG_LINES)
    result = sniff_format_detailed(path)
    assert result.state == "classified"
    assert result.schema == "syslog"
    assert result.origin == "syslog"
319
+
320
+
321
def test_detailed_zero_byte_file_is_empty(tmp_path: Path) -> None:
    """A zero-byte file reports state ``empty`` with no schema and no origin."""
    path = tmp_path / "empty.log"
    path.write_text("", encoding="utf-8")
    result = sniff_format_detailed(path)
    assert result.state == "empty"
    assert result.schema is None
    assert result.origin is None
328
+
329
+
330
def test_detailed_short_whitespace_only_file_is_empty(tmp_path: Path) -> None:
    """A short file of only whitespace lines reports state ``empty``, no schema."""
    path = tmp_path / "blanks.log"
    _write(path, ["\n", " \n", "\t\n"])
    result = sniff_format_detailed(path)
    assert result.state == "empty"
    assert result.schema is None
336
+
337
+
338
def test_detailed_long_whitespace_falls_to_blob_not_empty(
    monkeypatch, tmp_path: Path
) -> None:
    """More leading-whitespace lines than the bounded peek can prove → blob, not empty.

    The EOF probe cannot confirm the file is truly empty when it has more
    content past the peek. Whitespace beyond what we read must NOT short-
    circuit to the empty path. Locks the EOF-sensitive contract.
    """
    # Yield _SNIFF_MAX_PEEK whitespace lines followed by more whitespace —
    # the EOF probe will pull one extra line, so EOF is not reached and
    # the result must NOT be "empty".
    # Plain string literal: there is nothing to interpolate, so no f-prefix.
    extra_whitespace = (" \n" for _ in range(_SNIFF_MAX_PEEK + 5))
    handle = _CountingHandle(extra_whitespace)

    def fake_open_log(path):
        return handle

    fake_path = tmp_path / "fake"
    fake_path.write_text("placeholder", encoding="utf-8")  # nonzero size to pass stat()
    monkeypatch.setattr(loader, "_open_log", fake_open_log)
    result = sniff_format_detailed(fake_path)
    assert result.state == "classified"
    assert result.schema == "blob"
    assert result.origin is None
    # Peek + 1 EOF probe; never more.
    assert handle.read_count == _SNIFF_MAX_PEEK + 1
365
+
366
+
367
def test_detailed_unrecognized_text_is_blob(tmp_path: Path) -> None:
    """Unrecognized text is classified as schema ``blob`` with no origin."""
    path = tmp_path / "mystery.txt"
    _write(path, ["hello world\n", "this is not a log\n"])
    result = sniff_format_detailed(path)
    assert result.state == "classified"
    assert result.schema == "blob"
    assert result.origin is None