loghunter-cli 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- loghunter/__init__.py +3 -0
- loghunter/cli.py +1108 -0
- loghunter/cli_init.py +567 -0
- loghunter/common/__init__.py +1 -0
- loghunter/common/allowlist.py +436 -0
- loghunter/common/clustering.py +326 -0
- loghunter/common/config.py +221 -0
- loghunter/common/display.py +323 -0
- loghunter/common/errors.py +45 -0
- loghunter/common/finding.py +239 -0
- loghunter/common/loader/__init__.py +136 -0
- loghunter/common/loader/diagnostics.py +94 -0
- loghunter/common/loader/discovery.py +335 -0
- loghunter/common/loader/io.py +76 -0
- loghunter/common/loader/pipeline.py +1010 -0
- loghunter/common/loader/sniff.py +184 -0
- loghunter/common/loader/types.py +207 -0
- loghunter/common/loader/windowing.py +523 -0
- loghunter/common/output.py +93 -0
- loghunter/common/paths.py +105 -0
- loghunter/common/sources.py +392 -0
- loghunter/data/allowlist/connections.txt +50 -0
- loghunter/data/allowlist/domains_devices.txt +5 -0
- loghunter/data/allowlist/domains_homelab.txt +5 -0
- loghunter/data/allowlist/domains_universal.txt +125 -0
- loghunter/data/config_example.toml +144 -0
- loghunter/detectors/__init__.py +5 -0
- loghunter/detectors/auth.py +27 -0
- loghunter/detectors/aws.py +671 -0
- loghunter/detectors/beacon.py +258 -0
- loghunter/detectors/dns.py +778 -0
- loghunter/detectors/dnsblock.py +29 -0
- loghunter/detectors/duration.py +178 -0
- loghunter/detectors/protocol.py +26 -0
- loghunter/detectors/scan.py +735 -0
- loghunter/detectors/ssl.py +25 -0
- loghunter/detectors/syslog.py +266 -0
- loghunter/detectors/weird.py +27 -0
- loghunter/digest/__init__.py +43 -0
- loghunter/digest/_stats.py +182 -0
- loghunter/digest/blob.py +698 -0
- loghunter/digest/cloudtrail.py +341 -0
- loghunter/digest/conn.py +367 -0
- loghunter/digest/dns.py +364 -0
- loghunter/digest/syslog.py +269 -0
- loghunter/exporters/__init__.py +534 -0
- loghunter/exporters/cloudtrail.py +499 -0
- loghunter/exporters/splunk.py +222 -0
- loghunter/outputs/__init__.py +1 -0
- loghunter/outputs/allowlist.py +75 -0
- loghunter/outputs/csv.py +70 -0
- loghunter/outputs/email.py +44 -0
- loghunter/outputs/html.py +99 -0
- loghunter/outputs/json.py +77 -0
- loghunter/outputs/text.py +1422 -0
- loghunter/parsers/__init__.py +1 -0
- loghunter/parsers/cloudtrail.py +287 -0
- loghunter/parsers/dnsmasq.py +331 -0
- loghunter/parsers/syslog.py +150 -0
- loghunter/parsers/zeek.py +294 -0
- loghunter/parsers/zeek_tsv.py +310 -0
- loghunter/runner.py +1895 -0
- loghunter_cli-0.1.0.dev0.dist-info/METADATA +336 -0
- loghunter_cli-0.1.0.dev0.dist-info/RECORD +122 -0
- loghunter_cli-0.1.0.dev0.dist-info/WHEEL +5 -0
- loghunter_cli-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- loghunter_cli-0.1.0.dev0.dist-info/licenses/LICENSE +21 -0
- loghunter_cli-0.1.0.dev0.dist-info/top_level.txt +4 -0
- migrations/cloudtrail_parquet.py +59 -0
- migrations/conn_fft.py +550 -0
- migrations/conn_scan.py +1097 -0
- migrations/dns_dbscan.py +520 -0
- migrations/get_syslog.py +402 -0
- migrations/syslog_drain3.py +479 -0
- scratch/junk/parquet.py +59 -0
- tests/__init__.py +1 -0
- tests/_cloudtrail_fakes.py +116 -0
- tests/conftest.py +17 -0
- tests/test_allowlist_defaults_accessor.py +90 -0
- tests/test_architecture_spine.py +302 -0
- tests/test_aws_detector.py +504 -0
- tests/test_be_like_water.py +106 -0
- tests/test_cli_help.py +342 -0
- tests/test_cli_multi_positional.py +458 -0
- tests/test_cloudtrail_exporter.py +631 -0
- tests/test_cloudtrail_exporter_botocore.py +207 -0
- tests/test_cloudtrail_parser.py +393 -0
- tests/test_clustering.py +85 -0
- tests/test_clustering_interruptible.py +404 -0
- tests/test_config_cli.py +1006 -0
- tests/test_config_example_drift.py +164 -0
- tests/test_digest_blob.py +1237 -0
- tests/test_digest_cli.py +1040 -0
- tests/test_digest_cloudtrail.py +980 -0
- tests/test_digest_conn.py +1189 -0
- tests/test_digest_dns.py +770 -0
- tests/test_digest_stats.py +282 -0
- tests/test_digest_syslog.py +724 -0
- tests/test_display.py +370 -0
- tests/test_dns_detector.py +1010 -0
- tests/test_dnsmasq_parser.py +467 -0
- tests/test_duration_detector.py +491 -0
- tests/test_export_orchestrator_shape.py +153 -0
- tests/test_init_wizard.py +707 -0
- tests/test_loader.py +3639 -0
- tests/test_loader_package_surface.py +115 -0
- tests/test_loader_window_model.py +215 -0
- tests/test_output_path_cascade.py +575 -0
- tests/test_resolve_path.py +111 -0
- tests/test_root_provenance.py +212 -0
- tests/test_runner.py +2599 -0
- tests/test_scan_detector.py +455 -0
- tests/test_search_paths.py +50 -0
- tests/test_sniff_orchestrator.py +373 -0
- tests/test_sniff_recognizers.py +573 -0
- tests/test_source_resolution_seam.py +471 -0
- tests/test_sources.py +648 -0
- tests/test_splunk_exporter.py +351 -0
- tests/test_syslog_detector.py +458 -0
- tests/test_syslog_parser.py +582 -0
- tests/test_text_output.py +1225 -0
- tests/test_zeek_tsv_parser.py +580 -0
loghunter/digest/blob.py
ADDED
|
@@ -0,0 +1,698 @@
|
|
|
1
|
+
"""blob summariser — orient-before-the-hunt for unrecognized sources.
|
|
2
|
+
|
|
3
|
+
The blob path is the digest's escape hatch for inputs that have NO parser.
|
|
4
|
+
It describes bytes as bytes and extracts zero fields — not even a timestamp.
|
|
5
|
+
The moment any code reads a field it has become a parser, and parsers are a
|
|
6
|
+
separate component with a separate contract; the blob path exists precisely
|
|
7
|
+
so the operator can point digest at an unknown source and get a visibly
|
|
8
|
+
degraded card rather than an error and a shrug.
|
|
9
|
+
|
|
10
|
+
O(sample) rail (non-negotiable): the profiler reads ONE bounded sample and
|
|
11
|
+
profiles THAT. A 1 GB file and a 1 KB file cost the same. The only whole-file
|
|
12
|
+
fact is the on-disk size (a stat, free). Evenly spaced seeks for plain files;
|
|
13
|
+
head-only decompressed prefix for gzip / bzip2 / xz. drain3 — the only expensive item —
|
|
14
|
+
runs over the sampled lines, behind a quarantine flag, and is suppressed by a
|
|
15
|
+
meaninglessness floor when its output would be vacuous.
|
|
16
|
+
|
|
17
|
+
Identification cascade: magic bytes first (TERMINAL — content IS the
|
|
18
|
+
artifact; CONTAINER — bytes are compressed transport, decompress and look
|
|
19
|
+
underneath). Char-class profile next (binary or text, sample-derived
|
|
20
|
+
fraction). Shape-guess for text: a labeled best-guess (JSON / CSV / TSV /
|
|
21
|
+
HTML / key-value / long-lines / freeform) that drives the headline. Every
|
|
22
|
+
output is a GUESS, never a parsed claim.
|
|
23
|
+
|
|
24
|
+
Determinism: seek offsets are derived from file size (evenly spaced
|
|
25
|
+
fractions), so the same file always yields the same sample and the same
|
|
26
|
+
card. No unseeded randomness.
|
|
27
|
+
|
|
28
|
+
Per-file boundary discipline: blob today is single-file under the sniff
|
|
29
|
+
path. The decode-per-chunk discipline is preserved at the helper boundary so
|
|
30
|
+
a future multi-file return cannot silently merge line N of file A with
|
|
31
|
+
line 1 of file B and falsify line/template/token facts.
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import bz2
|
|
37
|
+
import gzip
|
|
38
|
+
import json
|
|
39
|
+
import lzma
|
|
40
|
+
import statistics
|
|
41
|
+
from collections import Counter
|
|
42
|
+
from pathlib import Path
|
|
43
|
+
|
|
44
|
+
from loghunter.common.finding import BlobCard
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# ── Calibration constants ────────────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
# Sample budget for plain (uncompressed) files: one head read plus a fixed
# number of seek windows, each of bounded size. Every read is capped, so the
# total bytes read cannot exceed _HEAD_BYTES + _SEEK_COUNT * _SEEK_BYTES no
# matter how large the file is — this is the O(sample) rail.
_HEAD_BYTES = 1024 * 64                 # head window: 64 KB
_SEEK_COUNT = 5                         # number of evenly spaced seek points
_SEEK_BYTES = 1024 * 32                 # window read at each seek point: 32 KB
# Seek threshold: only files strictly larger than four head windows (256 KB)
# get body seeks; at or below this size the head is the entire sample.
_SEEK_MIN_SIZE = 4 * _HEAD_BYTES

# Sample budget for compressed inputs (head-only): bytes AFTER decompression.
_DECOMPRESSED_PREFIX_BYTES = 1024 * 256  # 256 KB

# Upper bound on the number of lines profiled. Guards against inputs made of
# very many very short lines (e.g. runs of empty lines).
_MAX_SAMPLED_LINES = 8000

# Coefficient-of-variation gate for the line-length shape: at or above this
# value the shape is reported as "varied", below it as "uniform".
_SHAPE_CV_GATE = 0.5

# How many top templates / top tokens are reported.
_TOP_TEMPLATE_N = 6
_TOP_TOKENS_N = 10

# drain3 engine settings. Per the original note these mirror the syslog
# detector's defaults; they are deliberately NOT imported from
# detectors/syslog.py because blob sits upstream of any parsed frame.
_DRAIN_SIM_TH = 0.5
_DRAIN_DEPTH = 4
_DRAIN_PARAMETRIZE_NUMERIC = True

# QUARANTINE switch. When False, drain3 never runs and the Templates slot is
# absent from every blob card (every template field is Optional, so the
# renderer copes). Flip to False if sampled drain3 proves too slow.
_BLOB_DRAIN3_ENABLED = True

# Meaninglessness floor: when distinct_templates / sampled_lines reaches this
# ratio the result is close to one template per line — freeform text that does
# not template — so the result is suppressed rather than reported.
_TEMPLATE_RATIO_FLOOR = 0.5
|
|
91
|
+
|
|
92
|
+
# Byte values counted as printable: TAB (0x09), LF (0x0A), CR (0x0D) and the
# ASCII range space..tilde (0x20..0x7E inclusive).
_PRINTABLE_BYTES = frozenset({0x09, 0x0A, 0x0D, *range(0x20, 0x7F)})

# 256-entry translation table: byte value -> 0x01 when printable, 0x00
# otherwise. Used with bytes.translate + bytes.count so the printable count
# is done in C rather than in a Python-level loop over every sampled byte.
_PRINTABLE_TRANSLATE = bytes(
    int(value in _PRINTABLE_BYTES) for value in range(256)
)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
# ── Magic-byte signature table ──────────────────────────────────────────────
|
|
105
|
+
#
|
|
106
|
+
# Hand-rolled, zero dependency. TERMINAL = content IS the artifact (image,
|
|
107
|
+
# binary, document) — no point profiling text underneath, because there is
|
|
108
|
+
# none. CONTAINER = bytes are compressed transport — switch to the
|
|
109
|
+
# decompressed-head path and profile the content shape underneath.
|
|
110
|
+
#
|
|
111
|
+
# Order matters within each list: longer/more-specific prefixes first. Each
|
|
112
|
+
# entry is (prefix_bytes, label).
|
|
113
|
+
#
|
|
114
|
+
# Container support: gzip, bzip2, xz — all stdlib (gzip / bz2 / lzma). The
|
|
115
|
+
# magic-byte ID is authoritative; suffix is a fast-path hint only (see
|
|
116
|
+
# _open_log_bytes). zstd is DEFERRED — no stdlib opener before Python 3.14,
|
|
117
|
+
# would add a dependency for a blob-nicety; revisit when the toolchain
|
|
118
|
+
# minimum bumps.
|
|
119
|
+
|
|
120
|
+
# TERMINAL signatures: a match means the bytes ARE the artifact, so the text
# cascade is skipped. Each entry is (prefix_bytes, label); entries are tested
# in list order with bytes.startswith, first match wins.
# NOTE(review): 0xCAFEBABE is also the magic of Mach-O universal ("fat")
# binaries, so the "Java class file" label is a best guess — confirm whether
# that distinction matters to the renderer.
_TERMINAL_MAGIC: list[tuple[bytes, str]] = [
    (b"\x89PNG\r\n\x1a\n", "PNG image"),
    (b"\xff\xd8\xff", "JPEG image"),
    (b"GIF87a", "GIF image"),
    (b"GIF89a", "GIF image"),
    (b"%PDF-", "PDF document"),
    (b"\x7fELF", "ELF binary"),
    (b"PK\x03\x04", "zip archive"),
    (b"PK\x05\x06", "zip archive (empty)"),
    (b"PK\x07\x08", "zip archive (spanned)"),
    (b"\xca\xfe\xba\xbe", "Java class file"),
    (b"\x00asm", "WebAssembly module"),
    (b"SQLite format 3\x00", "SQLite database"),
]

# CONTAINER signatures: a match means the bytes are compressed transport.
# The labels are the same strings _open_log_bytes dispatches on.
_CONTAINER_MAGIC: list[tuple[bytes, str]] = [
    (b"\x1f\x8b", "gzip"),
    (b"\xfd7zXZ\x00", "xz"),
    (b"BZh", "bzip2"),
]

# Suffix → container label fast-path. The magic table above is the canonical
# identifier; this mapping just lets us skip the magic read on the well-named
# common case. A correctly-named .gz / .bz2 / .xz file is opened directly via
# the matching stdlib opener; an UNKNOWN suffix (or a misnamed .log that
# happens to be xz-compressed) is identified by magic and routed to the same
# opener — see summarize_blob and _open_log_bytes.
# Keys are lower-case suffixes including the leading dot; callers look them up
# with path.suffix.lower().
_SUFFIX_TO_CONTAINER: dict[str, str] = {
    ".gz": "gzip",
    ".bz2": "bzip2",
    ".xz": "xz",
}
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
# ── File openers ────────────────────────────────────────────────────────────
|
|
155
|
+
|
|
156
|
+
def _open_log_bytes(path: Path, container_label: str | None = None):
|
|
157
|
+
"""Open a plain or container-compressed file in binary mode.
|
|
158
|
+
|
|
159
|
+
Parallel to common.loader._open_log but bytes-mode — the blob path needs
|
|
160
|
+
pre-decode access for the char-class profile and the strict UTF-8 probe.
|
|
161
|
+
|
|
162
|
+
``container_label`` is the magic-derived container kind (one of
|
|
163
|
+
``"gzip"``, ``"bzip2"``, ``"xz"``) when the caller already identified
|
|
164
|
+
the file as compressed. When ``None``, the path suffix is consulted as a
|
|
165
|
+
fast-path hint; an unmatched suffix opens the file plain. Magic-driven
|
|
166
|
+
callers should pass ``container_label`` explicitly so a misnamed
|
|
167
|
+
container (xz bytes in ``mystery.log``) routes to the correct opener.
|
|
168
|
+
"""
|
|
169
|
+
if container_label is None:
|
|
170
|
+
container_label = _SUFFIX_TO_CONTAINER.get(path.suffix.lower())
|
|
171
|
+
if container_label == "gzip":
|
|
172
|
+
return gzip.open(path, "rb")
|
|
173
|
+
if container_label == "bzip2":
|
|
174
|
+
return bz2.open(path, "rb")
|
|
175
|
+
if container_label == "xz":
|
|
176
|
+
return lzma.open(path, "rb")
|
|
177
|
+
return path.open("rb")
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
# ── Sampling ────────────────────────────────────────────────────────────────
|
|
181
|
+
|
|
182
|
+
def _read_head(path: Path) -> bytes:
    """Return at most ``_HEAD_BYTES`` bytes from the start of a plain file."""
    with path.open("rb") as handle:
        head = handle.read(_HEAD_BYTES)
    return head
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _read_seek(path: Path, offset: int) -> bytes:
    """Read one bounded window at ``offset`` and return what follows its
    first newline.

    O(sample) rail: exactly one ``read(_SEEK_BYTES)`` is issued per call, so
    the bytes pulled from disk never depend on where the next newline is. An
    unbounded ``readline()`` would scan to EOF on a long-line / no-newline
    file and break the rail.

    Args:
        path: Plain file to sample.
        offset: Absolute byte offset to seek to before reading.

    Returns:
        The slice of the window after its first ``\\n`` (so the chunk starts
        on a line boundary), or ``b""`` when the window holds no newline —
        the caller then has only the head sample to work with.
    """
    with path.open("rb") as handle:
        handle.seek(offset)
        window = handle.read(_SEEK_BYTES)
    # partition() yields an empty separator when no newline is present.
    _partial, newline, remainder = window.partition(b"\n")
    return remainder if newline else b""
|
|
208
|
+
|
|
209
|
+
|
|
210
|
+
def _sample_plain_body(
    path: Path, head: bytes, st_size: int,
) -> tuple[list[bytes], int]:
    """Collect the deterministic body-seek chunks of a plain file.

    Offsets are ``st_size * k // (_SEEK_COUNT + 1)`` for
    ``k = 1 .. _SEEK_COUNT`` — i.e. 1/6 .. 5/6 of the file when
    ``_SEEK_COUNT`` is 5 — so the same file always yields the same sample;
    no RNG is involved. Offsets that fall inside the head region
    (``< _HEAD_BYTES``) are skipped because the caller already holds those
    bytes.

    Args:
        path: Plain file to sample.
        head: Head bytes already read by the caller. Accepted so the caller
            does not need a second head read; not inspected here.
        st_size: On-disk size of the file in bytes.

    Returns:
        ``(body_chunks, sample_read_count)``. The count is one (for the
        caller's head read) plus the number of seek reads performed. Files
        of at most ``_SEEK_MIN_SIZE`` bytes get ``([], 1)``.
    """
    if st_size <= _SEEK_MIN_SIZE:
        return [], 1

    slots = _SEEK_COUNT + 1
    offsets = (st_size * step // slots for step in range(1, slots))
    chunks = [
        _read_seek(path, offset)
        for offset in offsets
        if offset >= _HEAD_BYTES
    ]
    return chunks, len(chunks) + 1
|
|
235
|
+
|
|
236
|
+
|
|
237
|
+
def _sample_compressed(path: Path, container_label: str | None = None) -> bytes:
    """Return up to ``_DECOMPRESSED_PREFIX_BYTES`` decompressed bytes.

    Head-only by construction: the stream is opened through
    ``_open_log_bytes`` and read once from the start. The bound applies to
    the DECOMPRESSED size, not the on-disk size, so a container that expands
    to many GB still costs a single bounded read here.

    Args:
        path: Compressed file to sample.
        container_label: Forwarded unchanged to ``_open_log_bytes``; pass it
            explicitly when the kind was derived from magic bytes so a
            misnamed file is routed to the right opener.
    """
    stream = _open_log_bytes(path, container_label)
    with stream as handle:
        prefix = handle.read(_DECOMPRESSED_PREFIX_BYTES)
    return prefix
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
# ── Char-class profile (over RAW sampled bytes — sample fact, not whole-file) ─
|
|
254
|
+
|
|
255
|
+
def _char_class_profile(sample: bytes) -> tuple[float, float]:
    """Return ``(printable_pct, nonprintable_pct)`` for the raw sample bytes.

    A byte is printable when ``_PRINTABLE_TRANSLATE`` maps it to ``0x01``
    (TAB, LF, CR and 0x20..0x7E); every other byte value counts as
    non-printable. Both figures are percentages of ``len(sample)`` and are
    SAMPLE facts only — this must not be turned into a whole-file scan, which
    would break the O(sample) rail.

    An empty sample yields ``(0.0, 0.0)``.
    """
    total = len(sample)
    if total == 0:
        return 0.0, 0.0
    # translate() rewrites each byte to 0x01/0x00; count() then tallies in C.
    hits = sample.translate(_PRINTABLE_TRANSLATE).count(b"\x01")
    printable_pct = hits / total * 100.0
    return printable_pct, 100.0 - printable_pct
|
|
271
|
+
|
|
272
|
+
|
|
273
|
+
# ── UTF-8 cleanness probe ───────────────────────────────────────────────────
|
|
274
|
+
|
|
275
|
+
def _utf8_probe(sample: bytes) -> bool:
    """Report whether the whole sample decodes as strict UTF-8.

    One strict decode over the bytes given. Line decoding elsewhere uses
    ``errors="replace"`` and therefore never fails; this probe is what
    allows a "UTF-8 clean" claim to be made at all.

    An empty sample counts as clean. The probe assumes a single file's
    sample: if samples from several files were ever concatenated, each file
    should be probed separately and the results AND-ed.
    """
    if sample:
        try:
            sample.decode("utf-8", errors="strict")
        except UnicodeDecodeError:
            return False
    return True
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
# ── Line decoding ───────────────────────────────────────────────────────────
|
|
297
|
+
|
|
298
|
+
def _decode_lines(chunk: bytes) -> list[str]:
    """Decode one sample chunk as UTF-8 and split it into lines.

    Undecodable bytes become U+FFFD (``errors="replace"``), so this never
    raises on arbitrary input. Line terminators are dropped. The function
    takes exactly one chunk so that callers holding several chunks (head
    plus seek windows) keep chunk boundaries intact — the tail of one chunk
    is never glued to the head of the next.

    An empty chunk yields an empty list.
    """
    if chunk:
        return str(chunk, "utf-8", "replace").splitlines()
    return []
|
|
309
|
+
|
|
310
|
+
|
|
311
|
+
# ── Line-length shape ───────────────────────────────────────────────────────
|
|
312
|
+
|
|
313
|
+
def _line_length_shape(
    lengths: list[int],
) -> tuple[float, float, int, int, float, str]:
    """Summarise line lengths as ``(mean, median, p95, max, stdev, shape)``.

    ``shape`` is ``"varied"`` when the coefficient of variation
    (``stdev / mean``) is at least ``_SHAPE_CV_GATE`` and ``"uniform"``
    otherwise. Inputs with fewer than two lengths, or a mean of zero, are
    reported as ``"uniform"`` with a stdev of ``0.0``. An empty input yields
    all zeros.

    ``p95`` is taken from ``statistics.quantiles(lengths, n=20)[18]`` only
    when there are at least 20 data points; the interpolating method can
    extrapolate beyond the largest observation on small samples (e.g.
    ``[1, 100]`` gives 184), so below that threshold ``p95`` collapses to
    the maximum. The result is additionally clamped to the maximum.
    """
    if not lengths:
        return 0.0, 0.0, 0, 0, 0.0, "uniform"

    count = len(lengths)
    longest = max(lengths)
    mean = statistics.fmean(lengths)
    median = float(statistics.median(lengths))

    # Default to the max; refine only when the sample can support a p95.
    p95 = longest
    if count >= 20:
        p95 = min(int(statistics.quantiles(lengths, n=20)[18]), longest)

    if count < 2 or mean == 0.0:
        return mean, median, p95, longest, 0.0, "uniform"

    stdev = statistics.stdev(lengths)
    if stdev / mean >= _SHAPE_CV_GATE:
        shape = "varied"
    else:
        shape = "uniform"
    return mean, median, p95, longest, stdev, shape
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
# ── Magic-byte identification ────────────────────────────────────────────────
|
|
350
|
+
|
|
351
|
+
def _magic_id(head: bytes) -> tuple[str | None, str | None, bytes | None]:
    """Classify leading bytes against the magic-signature tables.

    Terminal signatures are tried first, then container signatures; within
    each table the first prefix that ``head`` starts with wins.

    Returns:
        ``(kind, label, prefix_bytes)`` where ``kind`` is ``"terminal"`` or
        ``"container"``, or ``(None, None, None)`` when nothing matches.
    """
    tables = (
        ("terminal", _TERMINAL_MAGIC),
        ("container", _CONTAINER_MAGIC),
    )
    for kind, signatures in tables:
        for prefix, label in signatures:
            if head.startswith(prefix):
                return kind, label, prefix
    return None, None, None
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
# ── Shape-guess cascade ─────────────────────────────────────────────────────
|
|
368
|
+
|
|
369
|
+
def _shape_guess(body_lines: list[str], all_lines: list[str]) -> str:
    """Return a labeled best-guess of the text's shape.

    The result is ALWAYS a guess, never a parsed claim, and is never None;
    ``"freeform text"`` is the floor.

    Args:
        body_lines: Lines from the body (seek) chunks. Preferred when
            non-empty because they are not skewed by a preamble such as a
            header row.
        all_lines: The full bounded sample; used when ``body_lines`` is
            empty (compressed inputs, small plain files).

    Returns:
        One of ``"JSON"``, ``"CSV, ~N columns"``, ``"TSV, ~N columns"``,
        ``"HTML/XML"``, ``"key-value text"``,
        ``"very long lines (mean N chars), possibly minified"`` or
        ``"freeform text"``. The tests run in that order; first hit wins.
    """
    lines = body_lines if body_lines else all_lines
    # Blank lines carry no structure; filter into a new list (input untouched).
    non_empty = [ln for ln in lines if ln.strip()]
    if not non_empty:
        return "freeform text"

    # JSON: first non-blank line starts with { or [, AND one of the first few
    # sampled lines actually parses.
    first = non_empty[0].lstrip()
    if first.startswith(("{", "[")):
        for candidate in non_empty[:5]:
            try:
                json.loads(candidate)
            except (ValueError, TypeError, RecursionError):
                # RecursionError: a deeply nested line ("[[[[...") exhausts
                # the decoder's recursion budget. On unknown input that is
                # just "did not parse", not a reason to abort the card.
                continue
            return "JSON"

    # CSV / TSV: a consistent delimiter count, i.e. at least two columns.
    for delim, name in ((",", "CSV"), ("\t", "TSV")):
        counts = [ln.count(delim) for ln in non_empty]
        nonzero = [c for c in counts if c >= 1]
        if not nonzero:
            continue
        # The most common non-zero count must cover >= 80% of ALL non-blank
        # lines (lines without the delimiter count against it).
        top_count, top_freq = Counter(nonzero).most_common(1)[0]
        if top_freq / len(counts) >= 0.8:
            cols = top_count + 1
            return f"{name}, ~{cols} columns"

    # HTML / XML: both angle brackets present on at least half the lines.
    tag_lines = sum(1 for ln in non_empty if "<" in ln and ">" in ln)
    if tag_lines / len(non_empty) >= 0.5:
        return "HTML/XML"

    # key=value or "key: value": at least two such pairs on >= 60% of lines.
    kv_lines = 0
    for ln in non_empty:
        if ln.count("=") >= 2 or _count_kv_colons(ln) >= 2:
            kv_lines += 1
    if kv_lines / len(non_empty) >= 0.6:
        return "key-value text"

    # Long lines / minified.
    mean_len = sum(len(ln) for ln in non_empty) / len(non_empty)
    if mean_len >= 400:
        return f"very long lines (mean {int(mean_len)} chars), possibly minified"

    return "freeform text"
|
|
427
|
+
|
|
428
|
+
|
|
429
|
+
def _count_kv_colons(line: str) -> int:
    """Return how many times ``": "`` (colon followed by a space) occurs.

    Used as a cheap "key: value" indicator; a bare colon without a trailing
    space is not counted.
    """
    # Splitting on the separator yields one more piece than occurrences.
    return len(line.split(": ")) - 1
|
|
432
|
+
|
|
433
|
+
|
|
434
|
+
# ── Template structure (drain3) — QUARANTINED + FLOORED ─────────────────────
|
|
435
|
+
|
|
436
|
+
def _template_structure(
    lines: list[str],
) -> tuple[int, float, int, int] | None:
    """Cluster the sampled lines with drain3 and summarise the result.

    drain3 runs over the lines passed in and nothing else; callers hand over
    the bounded sample, never the whole file.

    Returns:
        ``(distinct, top_coverage_pct, top_n, singletons)`` where
        ``distinct`` is the number of clusters, ``top_coverage_pct`` the
        percentage of lines falling in the ``_TOP_TEMPLATE_N`` largest
        clusters, ``top_n`` is ``_TOP_TEMPLATE_N`` itself, and
        ``singletons`` the number of clusters holding exactly one line.

        ``None`` in three cases: the quarantine switch
        ``_BLOB_DRAIN3_ENABLED`` is off, ``lines`` is empty, or the
        meaninglessness floor trips — ``distinct / total`` is at or above
        ``_TEMPLATE_RATIO_FLOOR``, meaning the input is close to one
        template per line and the numbers would be vacuous.

    Raises:
        ImportError: drain3 is not installed.
    """
    if not (_BLOB_DRAIN3_ENABLED and lines):
        return None

    # Imported lazily so the module loads without drain3 when quarantined.
    try:
        from drain3 import TemplateMiner
        from drain3.template_miner_config import TemplateMinerConfig
    except ImportError as exc:
        raise ImportError(
            "drain3 is required for the digest blob path. "
            "Run: pip install drain3"
        ) from exc

    config = TemplateMinerConfig()
    config.drain_sim_th = _DRAIN_SIM_TH
    config.drain_depth = _DRAIN_DEPTH
    config.parametrize_numeric_tokens = _DRAIN_PARAMETRIZE_NUMERIC
    miner = TemplateMiner(config=config)

    # One tally entry per cluster id, in first-seen order.
    cluster_hits: Counter[int] = Counter(
        int(miner.add_log_message(text)["cluster_id"]) for text in lines
    )

    total = sum(cluster_hits.values())
    if total == 0:
        return None

    distinct = len(cluster_hits)
    # Meaninglessness floor: near-1:1 templates means freeform.
    if distinct / total >= _TEMPLATE_RATIO_FLOOR:
        return None

    covered = sum(
        hits for _cluster, hits in cluster_hits.most_common(_TOP_TEMPLATE_N)
    )
    singletons = sum(1 for hits in cluster_hits.values() if hits == 1)
    return distinct, covered / total * 100.0, _TOP_TEMPLATE_N, singletons
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
# ── Top literal tokens (over the sample) ─────────────────────────────────────
|
|
492
|
+
|
|
493
|
+
def _top_tokens(lines: list[str]) -> list[tuple[str, int]]:
    """Return the ``_TOP_TOKENS_N`` most frequent whitespace-split tokens.

    Pure frequency over the sampled lines — tokens are whatever
    ``str.split()`` produces, with no field semantics attached. Each entry
    is ``(token, count)``, most frequent first.
    """
    tally: Counter[str] = Counter(
        token for text in lines for token in text.split()
    )
    return tally.most_common(_TOP_TOKENS_N)
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
# ── Top-level JSON object keys (over the sample) ────────────────────────────
|
|
506
|
+
|
|
507
|
+
def _json_field_names(lines: list[str]) -> list[str] | None:
|
|
508
|
+
"""First-seen union of top-level JSON object keys across sampled lines.
|
|
509
|
+
|
|
510
|
+
O(sample): iterates the already-held sample, no new I/O. For each
|
|
511
|
+
non-blank line that parses as a JSON OBJECT (dict root), walks its
|
|
512
|
+
top-level keys and accumulates them in first-appearance order
|
|
513
|
+
(dedup-preserving-first-seen). Catches optional fields that only
|
|
514
|
+
appear on some rows (e.g. dhcp's ``host_name``).
|
|
515
|
+
|
|
516
|
+
Returns None when NO sampled line parses to a dict — top-level JSON
|
|
517
|
+
arrays / scalars / malformed JSON / empty sample. The caller treats
|
|
518
|
+
None as "fall back to the existing tokens row."
|
|
519
|
+
|
|
520
|
+
Names only — never reads a value. This is a structural description
|
|
521
|
+
of the bytes' shape (one rung deeper than ``shape: JSON``), strictly
|
|
522
|
+
more rail-respecting than the token dump it replaces, and inherently
|
|
523
|
+
privacy-safe.
|
|
524
|
+
"""
|
|
525
|
+
seen_set: set[str] = set()
|
|
526
|
+
seen_list: list[str] = []
|
|
527
|
+
for raw in lines:
|
|
528
|
+
line = raw.strip()
|
|
529
|
+
if not line:
|
|
530
|
+
continue
|
|
531
|
+
try:
|
|
532
|
+
obj = json.loads(line)
|
|
533
|
+
except (ValueError, TypeError):
|
|
534
|
+
continue
|
|
535
|
+
if not isinstance(obj, dict):
|
|
536
|
+
continue
|
|
537
|
+
for key in obj.keys():
|
|
538
|
+
if key not in seen_set:
|
|
539
|
+
seen_set.add(key)
|
|
540
|
+
seen_list.append(str(key))
|
|
541
|
+
return seen_list if seen_list else None
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
# ── Public entry point ──────────────────────────────────────────────────────
|
|
545
|
+
|
|
546
|
+
def summarize_blob(path: Path) -> BlobCard:
    """Profile a bounded sample of the file at ``path`` and return a BlobCard.

    Only ``byte_size`` describes the whole file (it comes from a stat);
    every other field is computed from the sample. Compressed containers
    are sampled through ``_sample_compressed``; plain files contribute the
    head read plus the chunks returned by ``_sample_plain_body``. When the
    leading bytes of the sampled content identify as "terminal" magic, the
    text cascade is skipped and the card carries only the byte-level
    fields plus the file-type guess (text slots are left unset).
    """
    st_size = path.stat().st_size

    # ── Pick the container kind: suffix first, magic second ─────────────
    #
    # A recognised suffix is trusted outright, so the head read is skipped
    # entirely. Otherwise the head is read exactly once and its leading 16
    # bytes are magic-checked; a "container" hit supplies the container
    # kind through its label (this is how a misnamed archive is routed).
    container = _SUFFIX_TO_CONTAINER.get(path.suffix.lower())
    is_compressed = container is not None
    if not is_compressed:
        head = _read_head(path)
        probe_kind, probe_label, _ = _magic_id(head[:16])
        if probe_kind == "container":
            is_compressed = True
            container = probe_label

    # ── Gather the sample ───────────────────────────────────────────────
    if is_compressed:
        # From here on ``head`` is the decompressed prefix, so everything
        # below describes the content inside the container.
        head = _sample_compressed(path, container)
        body_chunks: list[bytes] = []
        sample_read_count = 1
    else:
        # Reuse the head already read for the magic check; it is not read
        # a second time.
        body_chunks, sample_read_count = _sample_plain_body(
            path, head, st_size,
        )

    # One file's chunks joined together, for the char-class and UTF-8 probes.
    sample_bytes = head + b"".join(body_chunks)

    # Magic ID of the (possibly decompressed) leading bytes. A terminal hit
    # under decompression means the compressed payload is itself binary.
    kind, file_type_guess, file_type_magic = _magic_id(head[:16])

    printable_pct, nonprintable_pct = _char_class_profile(sample_bytes)
    utf8_clean = _utf8_probe(sample_bytes)

    # Lines are decoded chunk by chunk so chunk boundaries are respected.
    head_lines = _decode_lines(head)
    body_lines = [
        text for chunk in body_chunks for text in _decode_lines(chunk)
    ]
    all_sampled_lines = head_lines + body_lines

    # Cap the line count (pathological short-line inputs). Head lines are
    # kept first; body lines get whatever budget is left, possibly none.
    if len(all_sampled_lines) > _MAX_SAMPLED_LINES:
        all_sampled_lines = all_sampled_lines[:_MAX_SAMPLED_LINES]
        body_budget = max(0, _MAX_SAMPLED_LINES - len(head_lines))
        body_lines = body_lines[:body_budget]

    # Fields shared by the terminal and the text card.
    byte_level = {
        "source_name": path.name,
        "byte_size": st_size,
        "sampled_line_count": len(all_sampled_lines),
        "sample_read_count": sample_read_count,
        "is_compressed": is_compressed,
        "printable_pct": printable_pct,
        "nonprintable_pct": nonprintable_pct,
        "utf8_clean": utf8_clean,
    }

    # ── Terminal binary: no text cascade ────────────────────────────────
    if kind == "terminal":
        return BlobCard(
            **byte_level,
            file_type_guess=file_type_guess,
            file_type_magic=file_type_magic,
            shape_guess=None,
        )

    # ── Text cascade: shape guess, line stats, tokens, templates ────────
    shape_guess = _shape_guess(body_lines, all_sampled_lines)

    line_lengths = list(map(len, all_sampled_lines))
    (
        mean_len,
        median_len,
        p95_len,
        max_len,
        stdev_len,
        length_shape,
    ) = _line_length_shape(line_lengths)

    tokens = _top_tokens(all_sampled_lines)

    # Key NAMES only (never values), and only for a JSON shape guess. A
    # None here lets the renderer fall back to the tokens row.
    json_field_names = (
        _json_field_names(all_sampled_lines) if shape_guess == "JSON" else None
    )

    template_result = _template_structure(all_sampled_lines)
    if template_result is None:
        template_result = (None, None, None, None)
    distinct, top_cov, top_n, singletons = template_result

    return BlobCard(
        **byte_level,
        file_type_guess=None,
        file_type_magic=None,
        shape_guess=shape_guess,
        mean_line_length=mean_len,
        median_line_length=median_len,
        line_length_p95=p95_len,
        max_line_length=max_len,
        line_length_stdev=stdev_len,
        line_length_shape=length_shape,
        top_tokens=tokens,
        json_field_names=json_field_names,
        distinct_templates=distinct,
        top_template_coverage_pct=top_cov,
        top_template_n=top_n,
        singleton_template_count=singletons,
    )
|