loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,310 @@
1
+ """Zeek TSV log parser — header-block parsing and type coercion.
2
+
3
+ This module is the TSV front-end for Zeek log parsing. It produces a pre-normalization
4
+ DataFrame with Zeek-native column names and Python-typed values, ready for consumption
5
+ by the normalizers in parsers/zeek.py (_normalize_conn_df, _normalize_dns_df).
6
+
7
+ Architecture: one normalizer, two front-ends. The NDJSON front-end (the common/loader package)
8
+ and this TSV front-end both produce the same intermediate DataFrame shape. Normalizers
9
+ are never aware of which format was loaded.
10
+
11
+ File I/O and decompression are the caller's responsibility (the common/loader package, stage 2).
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+ from dataclasses import dataclass, field
18
+ from typing import Any, Iterable, Iterator
19
+
20
+ import pandas as pd
21
+
22
# Sentinel returned by _coerce when a field value is the unset token.
# The caller must omit the key from the record dict entirely.
# Using absent keys (rather than explicit None) mirrors the NDJSON path:
# pd.DataFrame(records) produces NaN for absent keys, matching NDJSON
# absent-field behavior.
# The sentinel is a bare object(), so it is recognised by identity: compare
# with `is` / `is not`, never with `==`.
_UNSET = object()
27
+
28
+
29
@dataclass
class _TSVHeader:
    """Directive values collected from a Zeek TSV header block.

    Every attribute starts at the default the Zeek spec prescribes (or at an
    empty value) and is overwritten as the matching ``#directive`` line is
    parsed.
    """

    # Column delimiter for data rows and for every directive except #separator.
    separator: str = "\t"
    # Delimiter between the elements of a set[...] / vector[...] value.
    set_separator: str = ","  # Zeek spec default
    # Token that stands for an empty value.
    empty_field: str = "(empty)"  # Zeek spec default
    # Token that stands for a value that was never set.
    unset_field: str = "-"  # Zeek spec default
    # Value of the #path directive; "" until one is parsed.
    path: str = ""
    # Column names from #fields and their Zeek types from #types.
    fields: list[str] = field(default_factory=list)
    types: list[str] = field(default_factory=list)

    # True once a #separator directive has actually been parsed; the directive
    # is required, so the default above does not count as a declaration.
    _separator_seen: bool = field(default=False, repr=False)
43
+
44
+
45
def _unescape_separator(raw: str) -> str:
    """Expand ``\\xHH`` escapes in a #separator value into the characters they name.

    Zeek writes the separator as a hex escape (``\\x09`` for a tab). Each
    backslash-x followed by exactly two hex digits is replaced by the character
    with that code point; all other text is returned unchanged.
    """

    def _decode(match: re.Match) -> str:
        # Group 1 holds the two hex digits following the backslash-x prefix.
        return chr(int(match.group(1), 16))

    return re.sub(r"\\x([0-9a-fA-F]{2})", _decode, raw)
48
+
49
+
50
# NOTE(review): not referenced inside this module; presumably the number of
# leading lines a caller should hand to sniff() — confirm against the caller.
SNIFF_PEEK_LINES: int = 16


def sniff(sample: list[str]) -> str | None:
    """Recognize a Zeek TSV header and return its digester target.

    Returns "conn", "dns", or "syslog" when the sample carries a well-formed
    Zeek TSV header block declaring #separator, #fields, and #path with one of
    those three values. Returns None for any other shape — including text that
    merely contains a "#path" substring without a real header block, Zeek TSV
    logs whose #path is something else, and a #separator directive that
    carries no value.

    Pure: takes already-decoded lines and performs no I/O. Mirrors the header
    parse in _parse_header without draining an iterator.
    """
    separator: str | None = None
    path: str | None = None
    fields_seen = False

    for raw_line in sample:
        line = raw_line.rstrip("\r\n")
        if not line:
            continue
        if not line.startswith("#"):
            # First data row: the header block is over.
            break
        if line.startswith(("#separator ", "#separator\t")):
            # #separator is delimited by plain whitespace, not by itself.
            declared = line.split(None, 1)
            if len(declared) < 2:
                # Directive present but valueless: not a well-formed header.
                # (Indexing [1] here used to raise IndexError.)
                return None
            separator = _unescape_separator(declared[1].strip())
            continue
        if separator is None:
            # Other directives are split on the declared separator; until
            # #separator shows up they cannot be read, so skip them.
            continue
        key, *values = line[1:].split(separator)
        if key == "path":
            path = values[0] if values else ""
        elif key == "fields":
            fields_seen = True

    # A non-None separator already implies at least one directive was seen.
    if separator is None or not fields_seen:
        return None
    return path if path in ("conn", "dns", "syslog") else None
103
+
104
+
105
def _parse_header(lines: Iterator[str]) -> tuple[_TSVHeader, list[str]]:
    """Parse the Zeek TSV header block and return (header, buffered_data_lines).

    Reads #-prefixed directive lines until the first non-# line or #close.
    The first non-# line is the first data row; it and every following
    non-blank line up to #close are returned with trailing CR/LF stripped.
    Later #-prefixed lines other than #close are kept in the returned list
    as-is. A #close met before any data row ends the log: nothing after it is
    read.

    Raises:
        ValueError: if a #separator directive carries no value, if a data row
            appears before #separator was declared, if #fields or #types is
            missing, or if their lengths differ.
    """
    hdr = _TSVHeader()
    data_lines: list[str] = []
    closed = False

    for raw_line in lines:
        line = raw_line.rstrip("\r\n")

        if not line:
            continue

        if line.startswith(("#separator ", "#separator\t")):
            # #separator uses plain whitespace as its own delimiter.
            declared = line.split(None, 1)
            if len(declared) < 2:
                # Indexing [1] here used to leak an IndexError.
                raise ValueError("Zeek TSV #separator directive has no value")
            hdr.separator = _unescape_separator(declared[1].strip())
            hdr._separator_seen = True
            continue

        if line.startswith("#close"):
            # End of log before any data row; do not read past the marker.
            closed = True
            break

        if line.startswith("#"):
            # All other directives use the declared separator.
            key, *values = line[1:].split(hdr.separator)

            if key == "set_separator":
                hdr.set_separator = values[0] if values else ","
            elif key == "empty_field":
                hdr.empty_field = values[0] if values else "(empty)"
            elif key == "unset_field":
                hdr.unset_field = values[0] if values else "-"
            elif key == "path":
                hdr.path = values[0] if values else ""
            elif key == "fields":
                hdr.fields = values
            elif key == "types":
                hdr.types = values
            # #open and other directives are silently ignored.
            continue

        # First non-# line: data row.
        if not hdr._separator_seen:
            raise ValueError("Zeek TSV header missing #separator")
        data_lines.append(line)
        break

    if not closed:
        # Drain remaining lines up to (not including) the #close marker.
        for raw_line in lines:
            line = raw_line.rstrip("\r\n")
            if line.startswith("#close"):
                break
            if line:
                data_lines.append(line)

    # Validate required directives.
    if not hdr.fields:
        raise ValueError("Zeek TSV header missing #fields")
    if not hdr.types:
        raise ValueError("Zeek TSV header missing #types")
    if len(hdr.fields) != len(hdr.types):
        raise ValueError(
            f"Zeek TSV #fields has {len(hdr.fields)} columns but "
            f"#types has {len(hdr.types)} — header is malformed"
        )

    return hdr, data_lines
180
+
181
+
182
# Matches the container types set[...] and vector[...]; group 1 is the
# element type.
_CONTAINER_RE = re.compile(r"^(?:set|vector)\[(.+)\]$")

# The scalar Zeek types _coerce understands. Any type that is neither listed
# here nor a container makes _coerce raise.
_SCALAR_TYPES = frozenset({
    "time", "interval", "double",
    "count", "int", "port",
    "bool",
    "addr", "string", "enum",
})


def _coerce(
    raw: str,
    zeek_type: str,
    set_sep: str,
    empty_field: str,
    unset_field: str,
) -> Any:
    """Turn one raw TSV token into the Python value for its Zeek type.

    Mapping: time/interval/double -> float; count/int/port -> int; bool -> bool
    (tokens "T"/"F"); addr/string/enum -> str, with the empty token giving "";
    set[...]/vector[...] -> list of coerced elements, with the empty token
    giving [].

    Returns _UNSET when ``raw`` is the unset token; the caller is expected to
    leave the key out of the record instead of storing None.

    Raises:
        ValueError: for an unsupported type, a bool token other than "T"/"F",
            the empty token in a numeric or bool field, an unparsable number,
            or the unset token as a collection element.
    """
    # The unset check comes first and applies to every type.
    if raw == unset_field:
        return _UNSET

    is_empty = raw == empty_field

    container = _CONTAINER_RE.match(zeek_type)
    if container is not None:
        if is_empty:
            return []
        element_type = container.group(1)
        items = []
        for piece in raw.split(set_sep):
            item = _coerce(piece, element_type, set_sep, empty_field, unset_field)
            if item is _UNSET:
                raise ValueError(
                    f"Zeek TSV: unset token found inside collection element "
                    f"(type {zeek_type!r}); individual elements cannot be unset"
                )
            items.append(item)
        return items

    wants_float = zeek_type in ("time", "interval", "double")
    if wants_float or zeek_type in ("count", "int", "port"):
        if is_empty:
            raise ValueError(
                f"Zeek TSV: empty token in numeric field (type {zeek_type!r})"
            )
        return float(raw) if wants_float else int(raw)

    if zeek_type == "bool":
        if is_empty:
            raise ValueError("Zeek TSV: empty token in bool field")
        if raw == "T":
            return True
        if raw == "F":
            return False
        raise ValueError(
            f"Zeek TSV: invalid bool token {raw!r} — expected 'T' or 'F'"
        )

    if zeek_type in ("addr", "string", "enum"):
        if is_empty:
            return ""
        return raw

    raise ValueError(f"Zeek TSV: unsupported Zeek type {zeek_type!r}")
259
+
260
+
261
def parse_tsv_log(source: Iterable[str]) -> pd.DataFrame:
    """Parse one Zeek TSV log stream into a pre-normalization DataFrame.

    ``source`` is any iterable of text lines: an open text stream, or e.g. the
    result of str.splitlines(keepends=True).

    Columns keep their Zeek-native names (id.orig_h, id.resp_p, TTLs, answers,
    etc.). Cell values are the Python objects json.loads would yield on the
    NDJSON path: float for time/interval/double, int for count/int/port, bool
    for bool, list for set[…]/vector[…]. An unset field is left out of its
    row's record, so it surfaces as NaN — or as no column at all when it is
    unset in every row. A log with no data rows yields an empty frame carrying
    every #fields column.

    The result is meant to be handed unchanged to _normalize_conn_df or
    _normalize_dns_df in loghunter.parsers.zeek.

    Raises:
        ValueError: for malformed headers, ragged rows, invalid coercions, or
            unknown Zeek types.
    """
    header, rows = _parse_header(iter(source))

    expected = len(header.fields)
    columns = list(zip(header.fields, header.types))
    parsed: list[dict[str, Any]] = []

    # Numbering counts buffered rows (blank/# lines included), starting at 1;
    # it is not the line number within the original file.
    for lineno, raw_row in enumerate(rows, start=1):
        # Defensive: drop any line ending the header parser may have left.
        row = raw_row.rstrip("\r\n")
        if row == "" or row.startswith("#"):
            continue

        cells = row.split(header.separator)
        if len(cells) != expected:
            raise ValueError(
                f"Zeek TSV: line {lineno} has {len(cells)} fields, "
                f"expected {expected}"
            )

        entry: dict[str, Any] = {}
        for (name, ztype), cell in zip(columns, cells):
            coerced = _coerce(
                cell,
                ztype,
                header.set_separator,
                header.empty_field,
                header.unset_field,
            )
            if coerced is _UNSET:
                # Unset: leave the key absent rather than storing None.
                continue
            entry[name] = coerced

        parsed.append(entry)

    if parsed:
        return pd.DataFrame(parsed)
    # No data rows: still expose the declared columns.
    return pd.DataFrame(columns=header.fields)