loghunter-cli 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. loghunter/__init__.py +3 -0
  2. loghunter/cli.py +1108 -0
  3. loghunter/cli_init.py +567 -0
  4. loghunter/common/__init__.py +1 -0
  5. loghunter/common/allowlist.py +436 -0
  6. loghunter/common/clustering.py +326 -0
  7. loghunter/common/config.py +221 -0
  8. loghunter/common/display.py +323 -0
  9. loghunter/common/errors.py +45 -0
  10. loghunter/common/finding.py +239 -0
  11. loghunter/common/loader/__init__.py +136 -0
  12. loghunter/common/loader/diagnostics.py +94 -0
  13. loghunter/common/loader/discovery.py +335 -0
  14. loghunter/common/loader/io.py +76 -0
  15. loghunter/common/loader/pipeline.py +1010 -0
  16. loghunter/common/loader/sniff.py +184 -0
  17. loghunter/common/loader/types.py +207 -0
  18. loghunter/common/loader/windowing.py +523 -0
  19. loghunter/common/output.py +93 -0
  20. loghunter/common/paths.py +105 -0
  21. loghunter/common/sources.py +392 -0
  22. loghunter/data/allowlist/connections.txt +50 -0
  23. loghunter/data/allowlist/domains_devices.txt +5 -0
  24. loghunter/data/allowlist/domains_homelab.txt +5 -0
  25. loghunter/data/allowlist/domains_universal.txt +125 -0
  26. loghunter/data/config_example.toml +144 -0
  27. loghunter/detectors/__init__.py +5 -0
  28. loghunter/detectors/auth.py +27 -0
  29. loghunter/detectors/aws.py +671 -0
  30. loghunter/detectors/beacon.py +258 -0
  31. loghunter/detectors/dns.py +778 -0
  32. loghunter/detectors/dnsblock.py +29 -0
  33. loghunter/detectors/duration.py +178 -0
  34. loghunter/detectors/protocol.py +26 -0
  35. loghunter/detectors/scan.py +735 -0
  36. loghunter/detectors/ssl.py +25 -0
  37. loghunter/detectors/syslog.py +266 -0
  38. loghunter/detectors/weird.py +27 -0
  39. loghunter/digest/__init__.py +43 -0
  40. loghunter/digest/_stats.py +182 -0
  41. loghunter/digest/blob.py +698 -0
  42. loghunter/digest/cloudtrail.py +341 -0
  43. loghunter/digest/conn.py +367 -0
  44. loghunter/digest/dns.py +364 -0
  45. loghunter/digest/syslog.py +269 -0
  46. loghunter/exporters/__init__.py +534 -0
  47. loghunter/exporters/cloudtrail.py +499 -0
  48. loghunter/exporters/splunk.py +222 -0
  49. loghunter/outputs/__init__.py +1 -0
  50. loghunter/outputs/allowlist.py +75 -0
  51. loghunter/outputs/csv.py +70 -0
  52. loghunter/outputs/email.py +44 -0
  53. loghunter/outputs/html.py +99 -0
  54. loghunter/outputs/json.py +77 -0
  55. loghunter/outputs/text.py +1422 -0
  56. loghunter/parsers/__init__.py +1 -0
  57. loghunter/parsers/cloudtrail.py +287 -0
  58. loghunter/parsers/dnsmasq.py +331 -0
  59. loghunter/parsers/syslog.py +150 -0
  60. loghunter/parsers/zeek.py +294 -0
  61. loghunter/parsers/zeek_tsv.py +310 -0
  62. loghunter/runner.py +1895 -0
  63. loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
  64. loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
  65. loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
  66. loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  67. loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
  68. loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
  69. migrations/cloudtrail_parquet.py +59 -0
  70. migrations/conn_fft.py +550 -0
  71. migrations/conn_scan.py +1097 -0
  72. migrations/dns_dbscan.py +520 -0
  73. migrations/get_syslog.py +402 -0
  74. migrations/syslog_drain3.py +479 -0
  75. scratch/junk/parquet.py +59 -0
  76. tests/__init__.py +1 -0
  77. tests/_cloudtrail_fakes.py +116 -0
  78. tests/conftest.py +17 -0
  79. tests/test_allowlist_defaults_accessor.py +90 -0
  80. tests/test_architecture_spine.py +302 -0
  81. tests/test_aws_detector.py +504 -0
  82. tests/test_be_like_water.py +106 -0
  83. tests/test_cli_help.py +342 -0
  84. tests/test_cli_multi_positional.py +458 -0
  85. tests/test_cloudtrail_exporter.py +631 -0
  86. tests/test_cloudtrail_exporter_botocore.py +207 -0
  87. tests/test_cloudtrail_parser.py +393 -0
  88. tests/test_clustering.py +85 -0
  89. tests/test_clustering_interruptible.py +404 -0
  90. tests/test_config_cli.py +1006 -0
  91. tests/test_config_example_drift.py +164 -0
  92. tests/test_digest_blob.py +1237 -0
  93. tests/test_digest_cli.py +1040 -0
  94. tests/test_digest_cloudtrail.py +980 -0
  95. tests/test_digest_conn.py +1189 -0
  96. tests/test_digest_dns.py +770 -0
  97. tests/test_digest_stats.py +282 -0
  98. tests/test_digest_syslog.py +724 -0
  99. tests/test_display.py +370 -0
  100. tests/test_dns_detector.py +1010 -0
  101. tests/test_dnsmasq_parser.py +467 -0
  102. tests/test_duration_detector.py +491 -0
  103. tests/test_export_orchestrator_shape.py +153 -0
  104. tests/test_init_wizard.py +707 -0
  105. tests/test_loader.py +3639 -0
  106. tests/test_loader_package_surface.py +115 -0
  107. tests/test_loader_window_model.py +215 -0
  108. tests/test_output_path_cascade.py +575 -0
  109. tests/test_resolve_path.py +111 -0
  110. tests/test_root_provenance.py +212 -0
  111. tests/test_runner.py +2599 -0
  112. tests/test_scan_detector.py +455 -0
  113. tests/test_search_paths.py +50 -0
  114. tests/test_sniff_orchestrator.py +373 -0
  115. tests/test_sniff_recognizers.py +573 -0
  116. tests/test_source_resolution_seam.py +471 -0
  117. tests/test_sources.py +648 -0
  118. tests/test_splunk_exporter.py +351 -0
  119. tests/test_syslog_detector.py +458 -0
  120. tests/test_syslog_parser.py +582 -0
  121. tests/test_text_output.py +1225 -0
  122. tests/test_zeek_tsv_parser.py +580 -0
@@ -0,0 +1,698 @@
1
+ """blob summariser — orient-before-the-hunt for unrecognized sources.
2
+
3
+ The blob path is the digest's escape hatch for inputs that have NO parser.
4
+ It describes bytes as bytes and extracts zero fields — not even a timestamp.
5
+ The moment any code reads a field it has become a parser, and parsers are a
6
+ separate component with a separate contract; the blob path exists precisely
7
+ so the operator can point digest at an unknown source and get a visibly
8
+ degraded card rather than an error and a shrug.
9
+
10
+ O(sample) rail (non-negotiable): the profiler reads ONE bounded sample and
11
+ profiles THAT. A 1 GB file and a 1 KB file cost the same. The only whole-file
12
+ fact is the on-disk size (a stat, free). Random seeks for plain files;
13
+ head-only decompressed prefix for gzip / bzip2 / xz. drain3 — the only expensive item —
14
+ runs over the sampled lines, behind a quarantine flag, and is suppressed by a
15
+ meaninglessness floor when its output would be vacuous.
16
+
17
+ Identification cascade: magic bytes first (TERMINAL — content IS the
18
+ artifact; CONTAINER — bytes are compressed transport, decompress and look
19
+ underneath). Char-class profile next (binary or text, sample-derived
20
+ fraction). Shape-guess for text: a labeled best-guess (JSON / CSV / TSV /
21
+ HTML / key-value / long-lines / freeform) that drives the headline. Every
22
+ output is a GUESS, never a parsed claim.
23
+
24
+ Determinism: seek offsets are derived from file size (evenly spaced
25
+ fractions), so the same file always yields the same sample and the same
26
+ card. No unseeded randomness.
27
+
28
+ Per-file boundary discipline: blob today is single-file under the sniff
29
+ path. The decode-per-chunk discipline is preserved at the helper boundary so
30
+ a future multi-file return cannot silently merge line N of file A with
31
+ line 1 of file B and falsify line/template/token facts.
32
+ """
33
+
34
+ from __future__ import annotations
35
+
36
+ import bz2
37
+ import gzip
38
+ import json
39
+ import lzma
40
+ import statistics
41
+ from collections import Counter
42
+ from pathlib import Path
43
+
44
+ from loghunter.common.finding import BlobCard
45
+
46
+
47
+ # ── Calibration constants ────────────────────────────────────────────────────
48
+
49
+ # Plain-file sample budget. Head + K seeks; each seek reads a bounded byte
50
+ # window. Total bytes read ≤ _HEAD_BYTES + _SEEK_COUNT * _SEEK_BYTES + slack
51
+ # for "skip to next newline" tails. Bounded regardless of file size — the rail.
52
+ _HEAD_BYTES = 64 * 1024 # 64 KB head
53
+ _SEEK_COUNT = 5 # 5 evenly-spaced seek points
54
+ _SEEK_BYTES = 32 * 1024 # 32 KB per seek
55
+ # A file must be larger than the head + seek budget by this factor to bother
56
+ # with seeks — smaller files are fully covered by the head alone.
57
+ _SEEK_MIN_SIZE = _HEAD_BYTES * 4
58
+
59
+ # Compressed (head-only) sample budget.
60
+ _DECOMPRESSED_PREFIX_BYTES = 256 * 1024 # 256 KB after decompression
61
+
62
+ # Hard cap on lines profiled. With ~80-char average lines and the head+seeks
63
+ # budget above, we expect well under this; the cap protects against
64
+ # pathological all-short-lines inputs (e.g. lots of empty lines).
65
+ _MAX_SAMPLED_LINES = 8000
66
+
67
+ # Line-length shape gate (unchanged).
68
+ _SHAPE_CV_GATE = 0.5
69
+
70
+ # Token / template caps.
71
+ _TOP_TEMPLATE_N = 6
72
+ _TOP_TOKENS_N = 10
73
+
74
+ # drain3 engine config — mirrors the syslog detector's defaults; do not import
75
+ # from detectors/syslog.py, blob is upstream of any parsed frame.
76
+ _DRAIN_SIM_TH = 0.5
77
+ _DRAIN_DEPTH = 4
78
+ _DRAIN_PARAMETRIZE_NUMERIC = True
79
+
80
+ # QUARANTINE switch. When False, drain3 does not run and Templates slot
81
+ # vanishes on every blob card — the renderer copes because every template
82
+ # field is Optional. Flip to False from the perf probe if sampled-drain3
83
+ # proves too slow.
84
+ _BLOB_DRAIN3_ENABLED = True
85
+
86
+ # Meaninglessness floor: when distinct_templates / sampled_lines exceeds this
87
+ # ratio, the output is nearly 1-template-per-line — freeform that does not
88
+ # template. Suppress the result; renderer vanishes the slot. Better silent
89
+ # than vacuous ("~480 distinct structures over 500 lines" tells nothing).
90
+ _TEMPLATE_RATIO_FLOOR = 0.5
91
+
92
+ # Printable byte set: TAB, LF, CR, plus space..tilde (0x20..0x7E).
93
+ _PRINTABLE_BYTES = frozenset(b"\t\n\r") | frozenset(range(0x20, 0x7F))
94
+
95
+ # 256-entry translation table — 0x01 for printable, 0x00 for everything else.
96
+ # Lets us count printables in C via bytes.translate + bytes.count instead of
97
+ # a Python-level loop over every byte. Operates on the sample only.
98
+ _PRINTABLE_TRANSLATE = bytes(
99
+ 1 if i in _PRINTABLE_BYTES else 0
100
+ for i in range(256)
101
+ )
102
+
103
+
104
+ # ── Magic-byte signature table ──────────────────────────────────────────────
105
+ #
106
+ # Hand-rolled, zero dependency. TERMINAL = content IS the artifact (image,
107
+ # binary, document) — no point profiling text underneath, because there is
108
+ # none. CONTAINER = bytes are compressed transport — switch to the
109
+ # decompressed-head path and profile the content shape underneath.
110
+ #
111
+ # Order matters within each list: longer/more-specific prefixes first. Each
112
+ # entry is (prefix_bytes, label).
113
+ #
114
+ # Container support: gzip, bzip2, xz — all stdlib (gzip / bz2 / lzma). The
115
+ # magic-byte ID is authoritative; suffix is a fast-path hint only (see
116
+ # _open_log_bytes). zstd is DEFERRED — no stdlib opener before Python 3.14,
117
+ # would add a dependency for a blob-nicety; revisit when the toolchain
118
+ # minimum bumps.
119
+
120
+ _TERMINAL_MAGIC: list[tuple[bytes, str]] = [
121
+ (b"\x89PNG\r\n\x1a\n", "PNG image"),
122
+ (b"\xff\xd8\xff", "JPEG image"),
123
+ (b"GIF87a", "GIF image"),
124
+ (b"GIF89a", "GIF image"),
125
+ (b"%PDF-", "PDF document"),
126
+ (b"\x7fELF", "ELF binary"),
127
+ (b"PK\x03\x04", "zip archive"),
128
+ (b"PK\x05\x06", "zip archive (empty)"),
129
+ (b"PK\x07\x08", "zip archive (spanned)"),
130
+ (b"\xca\xfe\xba\xbe", "Java class file"),
131
+ (b"\x00asm", "WebAssembly module"),
132
+ (b"SQLite format 3\x00", "SQLite database"),
133
+ ]
134
+
135
+ _CONTAINER_MAGIC: list[tuple[bytes, str]] = [
136
+ (b"\x1f\x8b", "gzip"),
137
+ (b"\xfd7zXZ\x00", "xz"),
138
+ (b"BZh", "bzip2"),
139
+ ]
140
+
141
+ # Suffix → container label fast-path. The magic table above is the canonical
142
+ # identifier; this mapping just lets us skip the magic read on the well-named
143
+ # common case. A correctly-named .gz / .bz2 / .xz file is opened directly via
144
+ # the matching stdlib opener; an UNKNOWN suffix (or a misnamed .log that
145
+ # happens to be xz-compressed) is identified by magic and routed to the same
146
+ # opener — see summarize_blob and _open_log_bytes.
147
+ _SUFFIX_TO_CONTAINER: dict[str, str] = {
148
+ ".gz": "gzip",
149
+ ".bz2": "bzip2",
150
+ ".xz": "xz",
151
+ }
152
+
153
+
154
+ # ── File openers ────────────────────────────────────────────────────────────
155
+
156
+ def _open_log_bytes(path: Path, container_label: str | None = None):
157
+ """Open a plain or container-compressed file in binary mode.
158
+
159
+ Parallel to common.loader._open_log but bytes-mode — the blob path needs
160
+ pre-decode access for the char-class profile and the strict UTF-8 probe.
161
+
162
+ ``container_label`` is the magic-derived container kind (one of
163
+ ``"gzip"``, ``"bzip2"``, ``"xz"``) when the caller already identified
164
+ the file as compressed. When ``None``, the path suffix is consulted as a
165
+ fast-path hint; an unmatched suffix opens the file plain. Magic-driven
166
+ callers should pass ``container_label`` explicitly so a misnamed
167
+ container (xz bytes in ``mystery.log``) routes to the correct opener.
168
+ """
169
+ if container_label is None:
170
+ container_label = _SUFFIX_TO_CONTAINER.get(path.suffix.lower())
171
+ if container_label == "gzip":
172
+ return gzip.open(path, "rb")
173
+ if container_label == "bzip2":
174
+ return bz2.open(path, "rb")
175
+ if container_label == "xz":
176
+ return lzma.open(path, "rb")
177
+ return path.open("rb")
178
+
179
+
180
+ # ── Sampling ────────────────────────────────────────────────────────────────
181
+
182
def _read_head(path: Path) -> bytes:
    """Return up to ``_HEAD_BYTES`` bytes from the start of ``path``.

    The file is opened in plain binary mode; no decompression is attempted.
    """
    with path.open("rb") as stream:
        prefix = stream.read(_HEAD_BYTES)
    return prefix
186
+
187
+
188
def _read_seek(path: Path, offset: int) -> bytes:
    """Read one bounded window at ``offset`` and return what follows its
    first newline.

    Exactly one ``read(_SEEK_BYTES)`` is issued after the seek, so the cost
    of a seek never depends on where the next newline happens to be. The
    bytes up to and including the first newline are dropped because the seek
    almost certainly landed mid-line; what is returned starts on a clean line
    boundary.

    If the window contains no newline at all, the seek produced nothing
    usable and ``b""`` is returned.
    """
    with path.open("rb") as stream:
        stream.seek(offset)
        window = stream.read(_SEEK_BYTES)
    # partition() yields an empty tail when the separator is absent, which is
    # exactly the "no newline in window" result we want.
    _partial_line, _newline, remainder = window.partition(b"\n")
    return remainder
208
+
209
+
210
def _sample_plain_body(
    path: Path, head: bytes, st_size: int,
) -> tuple[list[bytes], int]:
    """Collect the deterministic body-seek chunks of a plain file.

    Returns ``(body_chunks, sample_read_count)``, where the count is the
    number of seek reads performed plus one for the head read the caller has
    already done. ``head`` is accepted so callers hand over what they already
    read; this function does not inspect it.

    Files of ``_SEEK_MIN_SIZE`` bytes or fewer get no seeks: the result is
    ``([], 1)``. Otherwise the candidate offsets are ``st_size * k // (K + 1)``
    for ``k = 1..K`` with ``K = _SEEK_COUNT`` (1/6 .. 5/6 of the file when
    K is 5). Offsets that fall inside the head region are skipped. Offsets
    depend only on the file size, so the same file always gives the same
    sample.
    """
    if st_size <= _SEEK_MIN_SIZE:
        return [], 1

    slices = _SEEK_COUNT + 1
    offsets = ((st_size * step) // slices for step in range(1, slices))
    # Skip offsets that land in the region the head read already covers.
    chunks = [
        _read_seek(path, position)
        for position in offsets
        if position >= _HEAD_BYTES
    ]
    return chunks, len(chunks) + 1
235
+
236
+
237
def _sample_compressed(path: Path, container_label: str | None = None) -> bytes:
    """Return up to ``_DECOMPRESSED_PREFIX_BYTES`` of decompressed content.

    Head-only by construction: the stream is read from its start and never
    seeked. The bound is on the DECOMPRESSED size, so a container that
    expands to many GB still costs one bounded prefix.

    ``container_label`` is forwarded to ``_open_log_bytes``; pass it
    explicitly when the kind came from magic bytes so a misnamed file is
    opened with the right decompressor.

    Truncated streams: a compressed file cut off before its end-of-stream
    marker (a log still being written, an interrupted copy) makes the stdlib
    readers raise ``EOFError`` once the data runs out, which would discard
    the readable prefix. In that case the file is reopened and read
    incrementally, and whatever decompressed cleanly before the truncation
    point is returned. Other decompression errors propagate unchanged.
    """
    try:
        with _open_log_bytes(path, container_label) as fh:
            return fh.read(_DECOMPRESSED_PREFIX_BYTES)
    except EOFError:
        # Truncated stream shorter than the prefix budget — salvage below.
        pass

    salvaged: list[bytes] = []
    remaining = _DECOMPRESSED_PREFIX_BYTES
    with _open_log_bytes(path, container_label) as fh:
        try:
            while remaining > 0:
                # read1() performs at most one underlying read, so every
                # piece returned before the truncation error is kept.
                piece = fh.read1(remaining)
                if not piece:
                    break
                salvaged.append(piece)
                remaining -= len(piece)
        except EOFError:
            # Reached the truncation point; keep what was decoded so far.
            pass
    return b"".join(salvaged)
251
+
252
+
253
+ # ── Char-class profile (over RAW sampled bytes — sample fact, not whole-file) ─
254
+
255
+ def _char_class_profile(sample: bytes) -> tuple[float, float]:
256
+ """Return (printable_pct, nonprintable_pct) over the raw sample bytes.
257
+
258
+ Printable ≡ TAB, LF, CR, or any byte in [0x20, 0x7E]. Everything else —
259
+ other control characters, 0x7F, the entire 0x80-0xFF range — counts as
260
+ non-printable. This is a SAMPLE fact (computed over the bounded sample
261
+ only). Do not optimize it into a whole-file scan "for accuracy" — that
262
+ would break the O(sample) rail. The Bytes row is honest as a
263
+ sample-derived fraction; sampling bias is acceptable for orientation.
264
+ """
265
+ if not sample:
266
+ return 0.0, 0.0
267
+ translated = sample.translate(_PRINTABLE_TRANSLATE)
268
+ printable = translated.count(b"\x01")
269
+ pct = printable / len(sample) * 100.0
270
+ return pct, 100.0 - pct
271
+
272
+
273
+ # ── UTF-8 cleanness probe ───────────────────────────────────────────────────
274
+
275
+ def _utf8_probe(sample: bytes) -> bool:
276
+ """True iff the sample decodes strictly as UTF-8.
277
+
278
+ A single strict decode over the concatenated sample. The downstream
279
+ line-level decode (in _decode_lines) uses errors="replace" so the
280
+ profiler is robust to non-UTF-8 bytes — but the renderer only claims
281
+ "UTF-8 clean" when this probe succeeded.
282
+
283
+ Per-file note: if multi-file blob ever returns, this concatenation
284
+ becomes a boundary problem — probe per-file and AND the results.
285
+ Today's caller passes a single file's sample, so a flat concat is fine.
286
+ """
287
+ if not sample:
288
+ return True
289
+ try:
290
+ sample.decode("utf-8")
291
+ except UnicodeDecodeError:
292
+ return False
293
+ return True
294
+
295
+
296
+ # ── Line decoding ───────────────────────────────────────────────────────────
297
+
298
+ def _decode_lines(chunk: bytes) -> list[str]:
299
+ """Decode a sample chunk and split to lines, errors=replace.
300
+
301
+ Splitlines (no keepends) on the replacement-decoded text. One chunk at a
302
+ time so callers that pass multiple chunks (head + seek chunks for plain)
303
+ preserve per-chunk boundaries — never merge the tail of one chunk with
304
+ the head of the next.
305
+ """
306
+ if not chunk:
307
+ return []
308
+ return chunk.decode("utf-8", errors="replace").splitlines()
309
+
310
+
311
+ # ── Line-length shape ───────────────────────────────────────────────────────
312
+
313
+ def _line_length_shape(
314
+ lengths: list[int],
315
+ ) -> tuple[float, float, int, int, float, str]:
316
+ """Return (mean, median, p95, max, stdev, shape) for line lengths.
317
+
318
+ ``shape`` is exactly ``"uniform"`` or ``"varied"``. A blob with fewer
319
+ than two lines, or a mean of zero, characterises as ``"uniform"`` (no
320
+ variance to call out). p95 is the 95th percentile of the sample's line
321
+ lengths; for very small samples (< 20 lines) ``statistics.quantiles``
322
+ falls back to a single-quantile estimate via max.
323
+ """
324
+ if not lengths:
325
+ return 0.0, 0.0, 0, 0, 0.0, "uniform"
326
+ mean = statistics.fmean(lengths)
327
+ median = float(statistics.median(lengths))
328
+ max_len = max(lengths)
329
+ # 95th percentile. statistics.quantiles(n=20) interpolates between data
330
+ # points (exclusive method), which can EXTRAPOLATE past max on small
331
+ # samples — e.g. lengths=[1, 100] yields p95=184 with max=100. p95 is
332
+ # supposed to be an order statistic FROM the sample, never beyond it.
333
+ # Need at least 20 data points to land a 95th percentile inside the
334
+ # observed range without extrapolation; below that, collapse to max.
335
+ # The `min(..., max_len)` clamp is belt-and-braces against any other
336
+ # degenerate input that survives the threshold.
337
+ if len(lengths) >= 20:
338
+ p95 = min(int(statistics.quantiles(lengths, n=20)[18]), max_len)
339
+ else:
340
+ p95 = max_len
341
+ if len(lengths) < 2 or mean == 0.0:
342
+ return mean, median, p95, max_len, 0.0, "uniform"
343
+ stdev = statistics.stdev(lengths)
344
+ cv = stdev / mean
345
+ shape = "varied" if cv >= _SHAPE_CV_GATE else "uniform"
346
+ return mean, median, p95, max_len, stdev, shape
347
+
348
+
349
+ # ── Magic-byte identification ────────────────────────────────────────────────
350
+
351
def _magic_id(head: bytes) -> tuple[str | None, str | None, bytes | None]:
    """Classify ``head`` by its leading magic bytes.

    Returns ``(kind, label, prefix_bytes)``. ``kind`` is ``"terminal"`` for a
    match in ``_TERMINAL_MAGIC``, ``"container"`` for a match in
    ``_CONTAINER_MAGIC``, and all three elements are ``None`` when nothing
    matches. The terminal table is consulted first, and within a table the
    first matching prefix wins.
    """
    tables = (
        ("terminal", _TERMINAL_MAGIC),
        ("container", _CONTAINER_MAGIC),
    )
    for kind, signatures in tables:
        for signature, label in signatures:
            if head.startswith(signature):
                return kind, label, signature
    return None, None, None
365
+
366
+
367
+ # ── Shape-guess cascade ─────────────────────────────────────────────────────
368
+
369
+ def _shape_guess(body_lines: list[str], all_lines: list[str]) -> str:
370
+ """Return a labeled best-guess of the text's shape.
371
+
372
+ Input rule: prefer body (seek) lines when available — they avoid being
373
+ fooled by a preamble (Zeek #-header, CSV header row). Fall back to the
374
+ full bounded sample for compressed files (head-only) and small plain
375
+ files (no useful seek body). Never returns None; returns "freeform
376
+ text" as the floor. Output is ALWAYS a guess, never a parsed claim.
377
+ """
378
+ lines = body_lines if body_lines else all_lines
379
+ # Strip empty lines for the structural tests; do not modify the input.
380
+ non_empty = [ln for ln in lines if ln.strip()]
381
+ if not non_empty:
382
+ return "freeform text"
383
+
384
+ # JSON: first non-blank starts with { or [, AND a sampled line parses.
385
+ first = non_empty[0].lstrip()
386
+ if first and first[0] in "{[":
387
+ # Try a small handful of lines so the test is cheap.
388
+ for candidate in non_empty[:5]:
389
+ try:
390
+ json.loads(candidate)
391
+ return "JSON"
392
+ except (ValueError, TypeError):
393
+ continue
394
+
395
+ # CSV / TSV: consistent delimiter count across the body, ≥ 1 columns.
396
+ for delim, name in ((",", "CSV"), ("\t", "TSV")):
397
+ counts = [ln.count(delim) for ln in non_empty]
398
+ # Require at least one delimiter per line on most lines, and tight
399
+ # consistency: the dominant count covers ≥ 80% of lines.
400
+ nonzero = [c for c in counts if c >= 1]
401
+ if not nonzero:
402
+ continue
403
+ top_count, top_freq = Counter(nonzero).most_common(1)[0]
404
+ if top_freq / len(counts) >= 0.8:
405
+ cols = top_count + 1
406
+ return f"{name}, ~{cols} columns"
407
+
408
+ # HTML / XML: angle-bracket tag structure on a reasonable fraction.
409
+ tag_lines = sum(1 for ln in non_empty if "<" in ln and ">" in ln)
410
+ if tag_lines / len(non_empty) >= 0.5:
411
+ return "HTML/XML"
412
+
413
+ # key=value or key: value: at least N=2 such pairs on most lines.
414
+ kv_lines = 0
415
+ for ln in non_empty:
416
+ if ln.count("=") >= 2 or _count_kv_colons(ln) >= 2:
417
+ kv_lines += 1
418
+ if kv_lines / len(non_empty) >= 0.6:
419
+ return "key-value text"
420
+
421
+ # Long lines / minified.
422
+ mean_len = sum(len(ln) for ln in non_empty) / len(non_empty)
423
+ if mean_len >= 400:
424
+ return f"very long lines (mean {int(mean_len)} chars), possibly minified"
425
+
426
+ return "freeform text"
427
+
428
+
429
+ def _count_kv_colons(line: str) -> int:
430
+ """Count colon-pairs that look like key: value (colon followed by space)."""
431
+ return line.count(": ")
432
+
433
+
434
+ # ── Template structure (drain3) — QUARANTINED + FLOORED ─────────────────────
435
+
436
def _template_structure(
    lines: list[str],
) -> tuple[int, float, int, int] | None:
    """Mine templates from ``lines`` with drain3 and summarise them.

    Returns ``(distinct, top_coverage_pct, top_n, singletons)``:

    * ``distinct`` — number of distinct template clusters;
    * ``top_coverage_pct`` — share of lines (0-100) covered by the
      ``_TOP_TEMPLATE_N`` most common clusters;
    * ``top_n`` — always ``_TOP_TEMPLATE_N``;
    * ``singletons`` — number of clusters that matched exactly one line.

    Returns ``None`` when:

    * ``_BLOB_DRAIN3_ENABLED`` is False (quarantine) or ``lines`` is empty;
    * ``distinct / total`` reaches ``_TEMPLATE_RATIO_FLOOR`` — close to one
      template per line means the text does not really template, and a
      count like "~480 structures over 500 lines" would say nothing.

    Raises ``ImportError`` when drain3 is not installed. Only the lines
    passed in are mined; the caller is expected to pass sampled lines.
    """
    if not (_BLOB_DRAIN3_ENABLED and lines):
        return None

    # Imported lazily so the module loads without drain3 when the template
    # step is never reached.
    try:
        from drain3 import TemplateMiner
        from drain3.template_miner_config import TemplateMinerConfig
    except ImportError as exc:
        raise ImportError(
            "drain3 is required for the digest blob path. "
            "Run: pip install drain3"
        ) from exc

    config = TemplateMinerConfig()
    config.drain_sim_th = _DRAIN_SIM_TH
    config.drain_depth = _DRAIN_DEPTH
    config.parametrize_numeric_tokens = _DRAIN_PARAMETRIZE_NUMERIC
    miner = TemplateMiner(config=config)

    # Lines per cluster id, fed to the miner in input order.
    cluster_hits: Counter[int] = Counter(
        int(miner.add_log_message(line)["cluster_id"]) for line in lines
    )

    total = sum(cluster_hits.values())
    if total == 0:
        return None

    distinct = len(cluster_hits)
    # Meaninglessness floor: near one-template-per-line means freeform.
    if distinct / total >= _TEMPLATE_RATIO_FLOOR:
        return None

    top_sum = sum(
        hits for _cluster, hits in cluster_hits.most_common(_TOP_TEMPLATE_N)
    )
    top_coverage = top_sum / total * 100.0
    singletons = list(cluster_hits.values()).count(1)
    return distinct, top_coverage, _TOP_TEMPLATE_N, singletons
489
+
490
+
491
+ # ── Top literal tokens (over the sample) ─────────────────────────────────────
492
+
493
def _top_tokens(lines: list[str]) -> list[tuple[str, int]]:
    """Return the ``_TOP_TOKENS_N`` most frequent whitespace-split tokens.

    Each result entry is ``(token, count)``, most frequent first. Tokens are
    whatever ``str.split()`` produces for each line — pure frequency over
    the given lines, with no field semantics attached.
    """
    tally: Counter[str] = Counter(
        token for line in lines for token in line.split()
    )
    return tally.most_common(_TOP_TOKENS_N)
503
+
504
+
505
+ # ── Top-level JSON object keys (over the sample) ────────────────────────────
506
+
507
+ def _json_field_names(lines: list[str]) -> list[str] | None:
508
+ """First-seen union of top-level JSON object keys across sampled lines.
509
+
510
+ O(sample): iterates the already-held sample, no new I/O. For each
511
+ non-blank line that parses as a JSON OBJECT (dict root), walks its
512
+ top-level keys and accumulates them in first-appearance order
513
+ (dedup-preserving-first-seen). Catches optional fields that only
514
+ appear on some rows (e.g. dhcp's ``host_name``).
515
+
516
+ Returns None when NO sampled line parses to a dict — top-level JSON
517
+ arrays / scalars / malformed JSON / empty sample. The caller treats
518
+ None as "fall back to the existing tokens row."
519
+
520
+ Names only — never reads a value. This is a structural description
521
+ of the bytes' shape (one rung deeper than ``shape: JSON``), strictly
522
+ more rail-respecting than the token dump it replaces, and inherently
523
+ privacy-safe.
524
+ """
525
+ seen_set: set[str] = set()
526
+ seen_list: list[str] = []
527
+ for raw in lines:
528
+ line = raw.strip()
529
+ if not line:
530
+ continue
531
+ try:
532
+ obj = json.loads(line)
533
+ except (ValueError, TypeError):
534
+ continue
535
+ if not isinstance(obj, dict):
536
+ continue
537
+ for key in obj.keys():
538
+ if key not in seen_set:
539
+ seen_set.add(key)
540
+ seen_list.append(str(key))
541
+ return seen_list if seen_list else None
542
+
543
+
544
+ # ── Public entry point ──────────────────────────────────────────────────────
545
+
546
def summarize_blob(path: Path) -> BlobCard:
    """Describe the byte stream at ``path`` as bytes and return a BlobCard.

    Only a bounded sample is read: the head plus deterministic seek windows
    for plain files, or a decompressed prefix for gzip / bzip2 / xz. The only
    whole-file fact on the card is ``byte_size``, taken from ``stat``.

    Container routing: a ``.gz`` / ``.bz2`` / ``.xz`` suffix is tried first
    as a shortcut. If the matching decompressor rejects the file (the suffix
    does not describe the content), the file is re-examined through its
    magic bytes exactly like a file with an unknown suffix, so a misnamed
    file yields a card instead of an error.

    Terminal magic hits (PNG, PDF, ELF, ...) — checked on the sample's
    leading bytes, which for compressed input are the DECOMPRESSED bytes —
    short-circuit the text cascade: the returned card carries the file-type
    guess and leaves every text field unset. Otherwise the card carries the
    shape guess, line-length statistics, top tokens, JSON field names (JSON
    shape only) and template statistics (``None`` when suppressed).
    """
    st_size = path.stat().st_size

    # ── Suffix shortcut ─────────────────────────────────────────────────
    suffix_kind = _SUFFIX_TO_CONTAINER.get(path.suffix.lower())
    head: bytes | None = None
    if suffix_kind is not None:
        try:
            head = _sample_compressed(path, suffix_kind)
        except (OSError, lzma.LZMAError):
            # The suffix lied: gzip / bz2 report bad data as OSError
            # (gzip.BadGzipFile is a subclass), lzma as LZMAError. Fall
            # through to the magic-byte path below. A genuine I/O problem
            # is raised again by the plain head read there.
            head = None

    body_chunks: list[bytes] = []
    sample_read_count = 1
    if head is not None:
        is_compressed = True
    else:
        # Read the head ONCE; it serves both magic ID and, for plain files,
        # as the first part of the sample.
        head = _read_head(path)
        kind, label, _prefix = _magic_id(head[:16])
        if kind == "container":
            # The container label doubles as the opener key ("gzip" /
            # "bzip2" / "xz"), so forward it for correct routing.
            is_compressed = True
            head = _sample_compressed(path, label)
        else:
            is_compressed = False
            body_chunks, sample_read_count = _sample_plain_body(
                path, head, st_size,
            )

    # All sampled bytes of this one file, for the byte-level probes.
    sample_bytes = head + b"".join(body_chunks)

    # ── Terminal magic ID on the sample's leading bytes ─────────────────
    # For compressed input ``head`` is the decompressed prefix, so this
    # identifies the CONTENT under the container.
    kind, file_type_guess, file_type_magic = _magic_id(head[:16])

    # ── Char-class + UTF-8 on the sample ────────────────────────────────
    printable_pct, nonprintable_pct = _char_class_profile(sample_bytes)
    utf8_clean = _utf8_probe(sample_bytes)

    # Decode per chunk so lines never straddle a chunk boundary.
    head_lines = _decode_lines(head)
    body_lines: list[str] = []
    for chunk in body_chunks:
        body_lines.extend(_decode_lines(chunk))

    # Cap the line count (pathological short-line inputs). Head lines are
    # kept first; body lines fill whatever budget remains.
    body_budget = max(0, _MAX_SAMPLED_LINES - len(head_lines))
    body_lines = body_lines[:body_budget]
    all_sampled_lines = (head_lines + body_lines)[:_MAX_SAMPLED_LINES]
    sampled_line_count = len(all_sampled_lines)

    # ── Terminal binary path: skip text cascade, leave text slots unset ─
    if kind == "terminal":
        return BlobCard(
            source_name=path.name,
            byte_size=st_size,
            sampled_line_count=sampled_line_count,
            sample_read_count=sample_read_count,
            is_compressed=is_compressed,
            printable_pct=printable_pct,
            nonprintable_pct=nonprintable_pct,
            utf8_clean=utf8_clean,
            file_type_guess=file_type_guess,
            file_type_magic=file_type_magic,
            shape_guess=None,
        )

    # ── Text path: shape-guess, line stats, tokens, templates ───────────
    shape_guess = _shape_guess(body_lines, all_sampled_lines)

    lengths = [len(ln) for ln in all_sampled_lines]
    mean_len, median_len, p95_len, max_len, stdev_len, shape = _line_length_shape(
        lengths
    )

    tokens = _top_tokens(all_sampled_lines)

    # Field NAMES (never values) are extracted for the JSON shape only; for
    # every other shape, and for JSON with no object roots, this stays None.
    if shape_guess == "JSON":
        json_field_names = _json_field_names(all_sampled_lines)
    else:
        json_field_names = None

    template_result = _template_structure(all_sampled_lines)
    if template_result is None:
        distinct = top_cov = top_n = singletons = None
    else:
        distinct, top_cov, top_n, singletons = template_result

    return BlobCard(
        source_name=path.name,
        byte_size=st_size,
        sampled_line_count=sampled_line_count,
        sample_read_count=sample_read_count,
        is_compressed=is_compressed,
        printable_pct=printable_pct,
        nonprintable_pct=nonprintable_pct,
        utf8_clean=utf8_clean,
        file_type_guess=None,
        file_type_magic=None,
        shape_guess=shape_guess,
        mean_line_length=mean_len,
        median_line_length=median_len,
        line_length_p95=p95_len,
        max_line_length=max_len,
        line_length_stdev=stdev_len,
        line_length_shape=shape,
        top_tokens=tokens,
        json_field_names=json_field_names,
        distinct_templates=distinct,
        top_template_coverage_pct=top_cov,
        top_template_n=top_n,
        singleton_template_count=singletons,
    )