codetool-shell 0.1.1__py3-none-win_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codetool_shell/__init__.py +11 -0
- codetool_shell/api.py +59 -0
- codetool_shell/bin/windows-arm64/codetool-shell-rust.exe +0 -0
- codetool_shell/filters/__init__.py +14 -0
- codetool_shell/filters/build_compiler/__init__.py +7 -0
- codetool_shell/filters/build_compiler/detector.py +412 -0
- codetool_shell/filters/build_compiler/reducer.py +166 -0
- codetool_shell/filters/build_compiler/summary.py +617 -0
- codetool_shell/filters/ci_job_log/__init__.py +7 -0
- codetool_shell/filters/ci_job_log/detector.py +64 -0
- codetool_shell/filters/ci_job_log/reducer.py +99 -0
- codetool_shell/filters/ci_job_log/summary.py +243 -0
- codetool_shell/filters/diff/__init__.py +7 -0
- codetool_shell/filters/diff/detector.py +136 -0
- codetool_shell/filters/diff/reducer.py +308 -0
- codetool_shell/filters/generic_log/__init__.py +7 -0
- codetool_shell/filters/generic_log/detector.py +175 -0
- codetool_shell/filters/generic_log/reducer.py +99 -0
- codetool_shell/filters/generic_log/summary.py +161 -0
- codetool_shell/filters/git.py +514 -0
- codetool_shell/filters/html_cleanup/__init__.py +7 -0
- codetool_shell/filters/html_cleanup/detector.py +136 -0
- codetool_shell/filters/html_cleanup/reducer.py +27 -0
- codetool_shell/filters/html_cleanup/summary.py +422 -0
- codetool_shell/filters/json_payload/__init__.py +7 -0
- codetool_shell/filters/json_payload/detector.py +62 -0
- codetool_shell/filters/json_payload/reducer.py +81 -0
- codetool_shell/filters/json_payload/summary.py +233 -0
- codetool_shell/filters/listing/__init__.py +7 -0
- codetool_shell/filters/listing/detector.py +294 -0
- codetool_shell/filters/listing/reducer.py +30 -0
- codetool_shell/filters/log_template/__init__.py +7 -0
- codetool_shell/filters/log_template/constants.py +76 -0
- codetool_shell/filters/log_template/detector.py +331 -0
- codetool_shell/filters/log_template/reducer.py +78 -0
- codetool_shell/filters/log_template/template.py +280 -0
- codetool_shell/filters/log_template/types.py +21 -0
- codetool_shell/filters/opaque_payload/__init__.py +7 -0
- codetool_shell/filters/opaque_payload/detector.py +563 -0
- codetool_shell/filters/opaque_payload/reducer.py +142 -0
- codetool_shell/filters/opaque_payload/summary.py +61 -0
- codetool_shell/filters/package_manager/__init__.py +7 -0
- codetool_shell/filters/package_manager/detector.py +220 -0
- codetool_shell/filters/package_manager/reducer.py +110 -0
- codetool_shell/filters/package_manager/summary.py +172 -0
- codetool_shell/filters/pipeline.py +65 -0
- codetool_shell/filters/rg.py +250 -0
- codetool_shell/filters/system_output/__init__.py +7 -0
- codetool_shell/filters/system_output/detector.py +600 -0
- codetool_shell/filters/system_output/reducer.py +331 -0
- codetool_shell/filters/system_output/summary.py +164 -0
- codetool_shell/filters/table/__init__.py +7 -0
- codetool_shell/filters/table/detector.py +244 -0
- codetool_shell/filters/table/reducer.py +57 -0
- codetool_shell/filters/table/summary.py +37 -0
- codetool_shell/filters/test_runner/__init__.py +7 -0
- codetool_shell/filters/test_runner/ansi.py +80 -0
- codetool_shell/filters/test_runner/detector.py +409 -0
- codetool_shell/filters/test_runner/reducer.py +288 -0
- codetool_shell/filters/test_runner/summary.py +449 -0
- codetool_shell/filters/text.py +38 -0
- codetool_shell/filters/traceback/__init__.py +7 -0
- codetool_shell/filters/traceback/detector.py +209 -0
- codetool_shell/filters/traceback/reducer.py +141 -0
- codetool_shell/filters/traceback/summary.py +122 -0
- codetool_shell/filters/tree.py +59 -0
- codetool_shell/py.typed +0 -0
- codetool_shell/python_backend.py +38 -0
- codetool_shell/rust_backend.py +254 -0
- codetool_shell-0.1.1.dist-info/METADATA +152 -0
- codetool_shell-0.1.1.dist-info/RECORD +72 -0
- codetool_shell-0.1.1.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,563 @@
|
|
|
1
|
+
"""Conservative opaque payload detectors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import base64
|
|
6
|
+
import binascii
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from .summary import byte_len
|
|
13
|
+
|
|
14
|
+
_BASE64_ALPHABET = frozenset(
|
|
15
|
+
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=_-"
|
|
16
|
+
)
|
|
17
|
+
_BASE64_STANDARD_ALPHABET = frozenset(
|
|
18
|
+
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
|
|
19
|
+
)
|
|
20
|
+
_BASE64_URLSAFE_ALPHABET = frozenset(
|
|
21
|
+
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_="
|
|
22
|
+
)
|
|
23
|
+
_HEX_ALPHABET = frozenset("0123456789abcdefABCDEF")
|
|
24
|
+
_STRUCTURAL_PAYLOAD_CHARS = frozenset("{}[]\"':,<>;")
|
|
25
|
+
_BEGIN_RE = re.compile(r"^-----BEGIN ([A-Z0-9][A-Z0-9 ._-]*?)-----$")
|
|
26
|
+
_TITLE_RE = re.compile(r"(?is)<title[^>]*>(.*?)</title>")
|
|
27
|
+
_SIMPLE_KEY_RE = re.compile(r"^[A-Za-z0-9_.:-]{1,64}$")
|
|
28
|
+
_MIME_RE = re.compile(r"^[A-Za-z0-9.+-]+/[A-Za-z0-9.+-]+$")
|
|
29
|
+
_BASE64_SINGLE_TOKEN_MIN_LENGTH = 1024
|
|
30
|
+
_BASE64_BLOCK_MIN_LINES = 4
|
|
31
|
+
_BASE64_BLOCK_MIN_CHARS = 2048
|
|
32
|
+
_BASE64_RUN_LINE_MIN_LENGTH = 32
|
|
33
|
+
_BASE64_VALID_ALPHABET_RATIO = 0.98
|
|
34
|
+
_PEM_BODY_MIN_LINES = 4
|
|
35
|
+
_PEM_BODY_MIN_BYTES = 512
|
|
36
|
+
_PEM_BODY_LINE_MIN_LENGTH = 16
|
|
37
|
+
_DATA_URL_METADATA_MAX_LENGTH = 256
|
|
38
|
+
_DATA_URL_PAYLOAD_MIN_LENGTH = 2048
|
|
39
|
+
_DATA_URL_MIN_BYTES = 4096
|
|
40
|
+
_DATA_URL_BASE64_ALPHABET_RATIO = 0.95
|
|
41
|
+
_OPAQUE_JSON_STRING_MIN_BYTES = 8192
|
|
42
|
+
_OPAQUE_JSON_MIN_OPAQUE_RATIO_PERCENT = 70
|
|
43
|
+
_WHOLE_PAYLOAD_MIN_BYTES = 32768
|
|
44
|
+
_WHOLE_PAYLOAD_MAX_LINES = 3
|
|
45
|
+
_WHOLE_PAYLOAD_MAX_LINE_MIN_LENGTH = 8192
|
|
46
|
+
_MINIFIED_PAYLOAD_MAX_WHITESPACE_RATIO = 0.10
|
|
47
|
+
_OPAQUE_STRING_MAX_WHITESPACE_RATIO = 0.01
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(frozen=True)
class LinePayloadSpan:
    """Immutable description of a run of lines recognised as an opaque payload.

    ``start`` and ``end`` are indices into the scanned list of lines and
    form a half-open range (``end`` is one past the last payload line).
    """

    start: int  # index of the first line in the span
    end: int  # index one past the last line in the span
    kind: str  # payload category, e.g. "pem" or "base64"
    byte_count: int  # size of the covered text in bytes
    line_count: int  # number of lines covered
    label: str | None = None  # PEM label taken from the BEGIN marker, if any
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
@dataclass(frozen=True)
class DataUrlSpan:
    """Immutable character range of a large ``data:`` URL inside a text."""

    start: int  # offset of the "data:" prefix
    end: int  # offset one past the last payload character
    mime: str  # lower-cased media type from the URL metadata
    encoding: str  # "base64" or "urlencoded"
    byte_count: int  # size of the whole URL in bytes
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
@dataclass(frozen=True)
class JsonPayloadSummary:
    """Immutable metadata describing a JSON document summarised as a whole."""

    kind: str  # summary category, e.g. "json-source-map" or "json-opaque"
    keys: list[str]  # top-level keys retained for display
    byte_count: int  # size of the original payload in bytes
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
@dataclass(frozen=True)
class HtmlPayloadSummary:
    """Immutable metadata describing an HTML document summarised as a whole."""

    byte_count: int  # size of the original payload in bytes
    title: str | None  # extracted <title> text, or None when unavailable
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
@dataclass(frozen=True)
class MinifiedJsSummary:
    """Immutable metadata describing a minified JavaScript body summarised as a whole."""

    byte_count: int  # size of the original payload in bytes
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
@dataclass(frozen=True)
class HttpBody:
    """Immutable three-way split of an HTTP response text."""

    headers: str  # status line and header lines, without the blank line
    separator: str  # the blank-line sequence that ended the headers
    body: str  # everything after the separator
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def split_http_response(text: str) -> HttpBody | None:
    """Split clear HTTP response headers from a body.

    The header block ends at the first blank line, written either as
    ``\\r\\n\\r\\n`` or as ``\\n\\n``.  Whichever of the two occurs first in
    ``text`` is used, so a body that happens to contain the other sequence
    cannot move the split point into the body.

    Returns ``None`` when no blank line follows a non-empty header block,
    when the first line does not start with ``HTTP/``, or when header lines
    are present but none of them contains a colon.
    """

    crlf_end = text.find("\r\n\r\n")
    lf_end = text.find("\n\n")
    # Prefer the earliest separator.  "\r\n\r\n" never contains "\n\n", so
    # the two positions cannot coincide.
    if crlf_end >= 0 and (lf_end < 0 or crlf_end < lf_end):
        header_end, separator = crlf_end, "\r\n\r\n"
    else:
        header_end, separator = lf_end, "\n\n"
    if header_end <= 0:
        # Not found (-1), or the text starts with a blank line (0).
        return None

    headers = text[:header_end]
    normalized_lines = headers.replace("\r\n", "\n").split("\n")
    if not normalized_lines[0].startswith("HTTP/"):
        return None

    header_lines = normalized_lines[1:]
    # A bare status line is accepted; otherwise at least one "Name: value"
    # style line is required.
    if header_lines and not any(":" in line for line in header_lines):
        return None

    return HttpBody(
        headers=headers,
        separator=separator,
        body=text[header_end + len(separator) :],
    )
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def find_line_payload_spans(lines: list[str]) -> list[LinePayloadSpan]:
    """Return non-overlapping PEM/base64 line spans.

    Each position is tried, in order, as the start of a PEM block, then as
    the start of a multi-line base64 run, and finally as a single long
    base64 token.  Spans are returned in ascending order of ``start``.

    A base64 run that is rejected (too short, or hex-only) is scanned only
    once.  Every later start inside that run would produce a suffix of it,
    which is rejected for the same reason, so rescanning from each line
    would be quadratic work (e.g. on a long hex dump) with no effect.
    """

    spans: list[LinePayloadSpan] = []
    line_total = len(lines)
    index = 0
    while index < line_total:
        pem_span = _parse_pem_span(lines, index)
        if pem_span is not None:
            spans.append(pem_span)
            index = pem_span.end
            continue

        run_end, total_chars, concatenated = _base64_run(lines, index)
        run_length = run_end - index
        if (
            run_length >= _BASE64_BLOCK_MIN_LINES
            and total_chars >= _BASE64_BLOCK_MIN_CHARS
            and not _is_hex_only(concatenated)
        ):
            spans.append(
                LinePayloadSpan(
                    start=index,
                    end=run_end,
                    kind="base64",
                    byte_count=_line_span_byte_count(lines, index, run_end),
                    line_count=run_length,
                )
            )
            index = run_end
            continue

        # The run starting here was rejected (or is empty).  Only the
        # single-token check remains for its lines: run lines start with
        # neither "-" nor contain spaces, so none of them can open a PEM
        # block.  Always advance by at least one line.
        stop = max(run_end, index + 1)
        for position in range(index, stop):
            raw = lines[position]
            if raw == raw.strip() and _is_base64_token(
                raw, min_length=_BASE64_SINGLE_TOKEN_MIN_LENGTH
            ):
                spans.append(
                    LinePayloadSpan(
                        start=position,
                        end=position + 1,
                        kind="base64",
                        byte_count=byte_len(raw),
                        line_count=1,
                    )
                )
        index = stop

    return spans
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def find_data_url_spans(text: str) -> list[DataUrlSpan]:
    """Collect every large data URL found in ``text``, in order of appearance.

    Each occurrence of ``data:`` is handed to ``_parse_data_url``.  A
    rejected candidate resumes the search just after its prefix; an
    accepted one resumes after the span's end.
    """

    marker = "data:"
    found: list[DataUrlSpan] = []
    position = text.find(marker)
    while position >= 0:
        parsed = _parse_data_url(text, position)
        if parsed is None:
            resume = position + len(marker)
        else:
            found.append(parsed)
            resume = parsed.end
        position = text.find(marker, resume)
    return found
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def summarize_json_payload(text: str) -> JsonPayloadSummary | None:
    """Summarize valid JSON source maps or JSON dominated by opaque strings.

    Returns ``None`` (leave the text alone) when the text is not a JSON
    object, uses non-standard constants, has no simple top-level key, or
    its opaque strings are too small in absolute or relative terms.
    Pathologically deep nesting that exhausts the interpreter's recursion
    budget is also treated as "not summarizable" instead of propagating
    ``RecursionError`` to the caller.
    """

    body = text.strip()
    if not (body.startswith("{") and body.endswith("}")):
        return None

    try:
        value = json.loads(body, parse_constant=_reject_json_constant)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError subclass; RecursionError
        # covers extremely deep nesting in untrusted input.
        return None
    if not isinstance(value, dict):
        return None

    keys = [key for key in value if _SIMPLE_KEY_RE.fullmatch(key)]
    if not keys:
        return None

    try:
        serialized = json.dumps(value, ensure_ascii=False, separators=(",", ":"))
    except (TypeError, ValueError, RecursionError):
        return None

    serialized_bytes = byte_len(serialized)
    try:
        opaque_bytes = _opaque_string_bytes(value)
    except RecursionError:
        return None

    # Integer comparison of opaque/serialized against the percentage
    # threshold avoids floating-point division.
    if (
        opaque_bytes < _OPAQUE_JSON_STRING_MIN_BYTES
        or opaque_bytes * 100 < serialized_bytes * _OPAQUE_JSON_MIN_OPAQUE_RATIO_PERCENT
    ):
        return None

    kind = "json-source-map" if _is_source_map_signature(value) else "json-opaque"
    return JsonPayloadSummary(kind=kind, keys=keys, byte_count=byte_len(text))
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
def summarize_html_payload(text: str) -> HtmlPayloadSummary | None:
    """Describe ``text`` as one HTML document when it clearly is one.

    The text qualifies only if it is big enough (by total bytes or by its
    longest line), and, once stripped, starts with a doctype or ``<html``
    tag and ends with ``</html>`` (case-insensitively).  Otherwise ``None``
    is returned.
    """

    total_bytes = byte_len(text)
    longest_line = max(map(len, text.split("\n")), default=0)
    big_enough = (
        total_bytes >= _WHOLE_PAYLOAD_MIN_BYTES
        or longest_line >= _WHOLE_PAYLOAD_MAX_LINE_MIN_LENGTH
    )
    if not big_enough:
        return None

    document = text.strip().lower()
    mentions_opening = "<!doctype html" in document or "<html" in document
    if not mentions_opening or "</html>" not in document:
        return None
    if not document.startswith(("<!doctype html", "<html")):
        return None
    if not document.endswith("</html>"):
        return None

    return HtmlPayloadSummary(byte_count=total_bytes, title=_extract_title(text))
|
|
260
|
+
|
|
261
|
+
|
|
262
|
+
def summarize_minified_js_payload(text: str) -> MinifiedJsSummary | None:
    """Describe ``text`` as one minified JavaScript bundle when it clearly is one.

    Requirements, checked in order: a minimum byte size; either few
    non-blank lines or at least one very long line; a low whitespace
    ratio; a recognised JavaScript opening token; and at least one
    JavaScript marker somewhere in the text.  Returns ``None`` otherwise.
    """

    total_bytes = byte_len(text)
    if total_bytes < _WHOLE_PAYLOAD_MIN_BYTES:
        return None

    content_lines = [row for row in text.splitlines() if row.strip()]
    longest_line = max(map(len, content_lines), default=0)
    few_lines = len(content_lines) <= _WHOLE_PAYLOAD_MAX_LINES
    has_long_line = longest_line >= _WHOLE_PAYLOAD_MAX_LINE_MIN_LENGTH
    if not (few_lines or has_long_line):
        return None
    if _whitespace_ratio(text) > _MINIFIED_PAYLOAD_MAX_WHITESPACE_RATIO:
        return None

    trimmed = text.strip()
    openers = ("(()=>", "(function", "!function", "function", "var ", "let ", "const ")
    if not trimmed.startswith(openers):
        return None

    folded = trimmed.lower()
    # Markers matched against the lower-cased text vs. the raw text.
    folded_markers = ("function", "webpack", "//# sourcemappingurl")
    exact_markers = ("=>", "(()=>", "!function")
    anchored = any(marker in folded for marker in folded_markers) or any(
        marker in text for marker in exact_markers
    )
    if not anchored:
        return None

    return MinifiedJsSummary(byte_count=total_bytes)
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
def _parse_pem_span(lines: list[str], start: int) -> LinePayloadSpan | None:
    """Try to read a PEM block whose BEGIN marker is ``lines[start]``.

    The first line matching the corresponding END marker decides the
    outcome: a span is returned if the enclosed lines pass
    ``_is_pem_body``, otherwise ``None``.  ``None`` is also returned when
    the start line is not a BEGIN marker or no END marker follows.
    """

    header = _BEGIN_RE.fullmatch(lines[start].strip())
    if header is None:
        return None

    label = header.group(1)
    footer = f"-----END {label}-----"
    for position in range(start + 1, len(lines)):
        if lines[position].strip() != footer:
            continue
        if not _is_pem_body(lines[start + 1 : position]):
            return None
        stop = position + 1  # the span includes the END marker line
        return LinePayloadSpan(
            start=start,
            end=stop,
            kind="pem",
            label=label,
            byte_count=_line_span_byte_count(lines, start, stop),
            line_count=stop - start,
        )

    return None
|
|
324
|
+
|
|
325
|
+
|
|
326
|
+
def _is_pem_body(lines: list[str]) -> bool:
    """Tell whether ``lines`` look like the inside of a PEM block.

    The body must be large enough (by non-blank line count or by bytes).
    Short lines containing a colon are skipped as header-style lines; every
    other non-blank line must be base64-like, and at least one such line
    must exist.
    """

    entries = [entry for entry in (raw.strip() for raw in lines) if entry]
    total_bytes = byte_len("\n".join(lines))
    if len(entries) < _PEM_BODY_MIN_LINES and total_bytes < _PEM_BODY_MIN_BYTES:
        return False

    saw_payload = False
    for entry in entries:
        header_like = ":" in entry and len(entry) <= 80
        if header_like:
            continue
        if not _is_base64ish_line(entry, min_length=_PEM_BODY_LINE_MIN_LENGTH):
            return False
        saw_payload = True

    return saw_payload
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def _base64_run(lines: list[str], start: int) -> tuple[int, int, str]:
|
|
344
|
+
cursor = start
|
|
345
|
+
total_chars = 0
|
|
346
|
+
chunks: list[str] = []
|
|
347
|
+
while cursor < len(lines):
|
|
348
|
+
line = lines[cursor].strip()
|
|
349
|
+
if lines[cursor] != line or not _is_base64ish_line(
|
|
350
|
+
line, min_length=_BASE64_RUN_LINE_MIN_LENGTH, allow_diff_prefix=False
|
|
351
|
+
):
|
|
352
|
+
break
|
|
353
|
+
chunks.append(line)
|
|
354
|
+
total_chars += len(line)
|
|
355
|
+
cursor += 1
|
|
356
|
+
return cursor, total_chars, "".join(chunks)
|
|
357
|
+
|
|
358
|
+
|
|
359
|
+
def _is_base64_token(text: str, *, min_length: int) -> bool:
|
|
360
|
+
return (
|
|
361
|
+
len(text) >= min_length
|
|
362
|
+
and not any(char.isspace() for char in text)
|
|
363
|
+
and not _looks_like_path_or_diff_line(text)
|
|
364
|
+
and not _has_structural_payload_chars(text)
|
|
365
|
+
and _valid_base64_shape(text)
|
|
366
|
+
and not _is_hex_only(text)
|
|
367
|
+
)
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
def _is_base64ish_line(
|
|
371
|
+
text: str, *, min_length: int, allow_diff_prefix: bool = True
|
|
372
|
+
) -> bool:
|
|
373
|
+
return (
|
|
374
|
+
len(text) >= min_length
|
|
375
|
+
and not any(char.isspace() for char in text)
|
|
376
|
+
and (allow_diff_prefix or not _looks_like_path_or_diff_line(text))
|
|
377
|
+
and not _has_structural_payload_chars(text)
|
|
378
|
+
and _valid_base64_shape(text)
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def _base64_ratio(text: str) -> float:
|
|
383
|
+
if not text:
|
|
384
|
+
return 0.0
|
|
385
|
+
allowed = sum(1 for char in text if char in _BASE64_ALPHABET)
|
|
386
|
+
return allowed / len(text)
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _is_hex_only(text: str) -> bool:
|
|
390
|
+
stripped = text.strip()
|
|
391
|
+
return bool(stripped) and all(char in _HEX_ALPHABET for char in stripped)
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def _has_structural_payload_chars(text: str) -> bool:
|
|
395
|
+
return any(char in _STRUCTURAL_PAYLOAD_CHARS for char in text)
|
|
396
|
+
|
|
397
|
+
|
|
398
|
+
def _looks_like_path_or_diff_line(text: str) -> bool:
|
|
399
|
+
if text.startswith(("+", "-")):
|
|
400
|
+
return True
|
|
401
|
+
slash_count = text.count("/")
|
|
402
|
+
if slash_count >= 2 and slash_count * 8 >= len(text):
|
|
403
|
+
return True
|
|
404
|
+
if slash_count >= 3 and all(part for part in text.split("/")):
|
|
405
|
+
return True
|
|
406
|
+
return False
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _valid_base64_shape(text: str) -> bool:
    """Tell whether ``text`` is structurally valid base64 in a single flavour.

    The text must consist of base64 characters, must not mix standard
    (``+``/``/``) and URL-safe (``-``/``_``) symbols, may carry ``=`` only
    as trailing padding, must have a decodable length, and must decode
    with the matching standard-library decoder.
    """

    if _base64_ratio(text) < _BASE64_VALID_ALPHABET_RATIO:
        return False

    uses_standard = "+" in text or "/" in text
    uses_urlsafe = "-" in text or "_" in text
    if uses_standard and uses_urlsafe:
        return False

    permitted = _BASE64_URLSAFE_ALPHABET if uses_urlsafe else _BASE64_STANDARD_ALPHABET
    if any(char not in permitted for char in text):
        return False

    unpadded = text.rstrip("=")
    if "=" in unpadded:
        # Padding is only legal at the very end.
        return False
    if len(unpadded) % 4 == 1:
        # A single leftover character cannot encode a whole byte.
        return False

    # Re-pad to a multiple of four so the decoders accept the input.
    repadded = unpadded + "=" * (-len(unpadded) % 4)
    try:
        if uses_urlsafe:
            base64.urlsafe_b64decode(repadded.encode("ascii"))
        else:
            base64.b64decode(repadded.encode("ascii"), validate=True)
    except (binascii.Error, ValueError):
        return False
    return True
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def _line_span_byte_count(lines: list[str], start: int, end: int) -> int:
    """Return ``byte_len`` of ``lines[start:end]`` joined with newline characters."""

    covered = lines[start:end]
    return byte_len("\n".join(covered))
|
|
437
|
+
|
|
438
|
+
|
|
439
|
+
def _parse_data_url(text: str, start: int) -> DataUrlSpan | None:
|
|
440
|
+
metadata_start = start + len("data:")
|
|
441
|
+
cursor = metadata_start
|
|
442
|
+
while cursor < len(text):
|
|
443
|
+
char = text[cursor]
|
|
444
|
+
if char == ",":
|
|
445
|
+
break
|
|
446
|
+
if (
|
|
447
|
+
_is_data_url_metadata_delimiter(char)
|
|
448
|
+
or cursor - metadata_start > _DATA_URL_METADATA_MAX_LENGTH
|
|
449
|
+
):
|
|
450
|
+
return None
|
|
451
|
+
cursor += 1
|
|
452
|
+
|
|
453
|
+
if cursor >= len(text) or text[cursor] != ",":
|
|
454
|
+
return None
|
|
455
|
+
|
|
456
|
+
metadata = text[metadata_start:cursor]
|
|
457
|
+
parts = metadata.split(";") if metadata else []
|
|
458
|
+
mime = parts[0].lower() if parts else ""
|
|
459
|
+
if not _MIME_RE.fullmatch(mime):
|
|
460
|
+
return None
|
|
461
|
+
|
|
462
|
+
encoding = "base64" if any(part.lower() == "base64" for part in parts[1:]) else "urlencoded"
|
|
463
|
+
payload_start = cursor + 1
|
|
464
|
+
payload_end = payload_start
|
|
465
|
+
while payload_end < len(text) and not _is_data_url_payload_delimiter(
|
|
466
|
+
text[payload_end], encoding=encoding
|
|
467
|
+
):
|
|
468
|
+
payload_end += 1
|
|
469
|
+
|
|
470
|
+
payload = text[payload_start:payload_end]
|
|
471
|
+
if not payload:
|
|
472
|
+
return None
|
|
473
|
+
|
|
474
|
+
byte_count = byte_len(text[start:payload_end])
|
|
475
|
+
if len(payload) < _DATA_URL_PAYLOAD_MIN_LENGTH and byte_count < _DATA_URL_MIN_BYTES:
|
|
476
|
+
return None
|
|
477
|
+
if encoding == "base64" and _base64_ratio(payload) < _DATA_URL_BASE64_ALPHABET_RATIO:
|
|
478
|
+
return None
|
|
479
|
+
|
|
480
|
+
return DataUrlSpan(
|
|
481
|
+
start=start,
|
|
482
|
+
end=payload_end,
|
|
483
|
+
mime=mime,
|
|
484
|
+
encoding=encoding,
|
|
485
|
+
byte_count=byte_count,
|
|
486
|
+
)
|
|
487
|
+
|
|
488
|
+
|
|
489
|
+
def _is_data_url_metadata_delimiter(char: str) -> bool:
|
|
490
|
+
return char.isspace() or char in {'"', "'", "<", ">", ")", "]", "}", "`"}
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def _is_data_url_payload_delimiter(char: str, *, encoding: str) -> bool:
    """Tell whether ``char`` ends a data URL payload.

    Metadata delimiters always end the payload.  For ``"base64"`` payloads
    any character outside the base64 alphabets ends it as well.
    """

    if _is_data_url_metadata_delimiter(char):
        return True
    return encoding == "base64" and char not in _BASE64_ALPHABET
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def _reject_json_constant(value: str) -> None:
|
|
502
|
+
raise ValueError(f"non-standard JSON constant {value}")
|
|
503
|
+
|
|
504
|
+
|
|
505
|
+
def _is_source_map_signature(value: dict[str, Any]) -> bool:
|
|
506
|
+
keys = {key.lower() for key in value}
|
|
507
|
+
return "version" in keys and "mappings" in keys and (
|
|
508
|
+
"sources" in keys or "names" in keys or "file" in keys
|
|
509
|
+
)
|
|
510
|
+
|
|
511
|
+
|
|
512
|
+
def _opaque_string_bytes(value: Any, key_hint: str = "") -> int:
|
|
513
|
+
if isinstance(value, str):
|
|
514
|
+
if len(value) >= _OPAQUE_JSON_STRING_MIN_BYTES and _is_opaque_string(value, key_hint):
|
|
515
|
+
return byte_len(value)
|
|
516
|
+
return 0
|
|
517
|
+
if isinstance(value, dict):
|
|
518
|
+
return sum(_opaque_string_bytes(child, str(key)) for key, child in value.items())
|
|
519
|
+
if isinstance(value, list):
|
|
520
|
+
return sum(_opaque_string_bytes(child, key_hint) for child in value)
|
|
521
|
+
return 0
|
|
522
|
+
|
|
523
|
+
|
|
524
|
+
def _is_opaque_string(value: str, key_hint: str) -> bool:
|
|
525
|
+
key = key_hint.lower()
|
|
526
|
+
return (
|
|
527
|
+
key == "mappings"
|
|
528
|
+
or value.startswith("data:")
|
|
529
|
+
or _is_base64_token(value, min_length=_BASE64_SINGLE_TOKEN_MIN_LENGTH)
|
|
530
|
+
or _looks_like_minified_js_string(value)
|
|
531
|
+
or _whitespace_ratio(value) <= _OPAQUE_STRING_MAX_WHITESPACE_RATIO
|
|
532
|
+
)
|
|
533
|
+
|
|
534
|
+
|
|
535
|
+
def _looks_like_minified_js_string(value: str) -> bool:
    """Tell whether ``value`` is whitespace-poor text containing a JavaScript marker."""

    if _whitespace_ratio(value) > _MINIFIED_PAYLOAD_MAX_WHITESPACE_RATIO:
        return False

    folded = value.lower()
    # Markers matched against the lower-cased text vs. the raw text.
    folded_markers = ("function", "webpack")
    exact_markers = ("=>", "(()=>", "!function")
    return any(marker in folded for marker in folded_markers) or any(
        marker in value for marker in exact_markers
    )
|
|
546
|
+
|
|
547
|
+
|
|
548
|
+
def _whitespace_ratio(value: str) -> float:
|
|
549
|
+
if not value:
|
|
550
|
+
return 0.0
|
|
551
|
+
return sum(1 for char in value if char.isspace()) / len(value)
|
|
552
|
+
|
|
553
|
+
|
|
554
|
+
def _extract_title(text: str) -> str | None:
    """Return the first ``<title>`` text of an HTML document, normalised for display.

    Whitespace runs are collapsed to single spaces and the result is
    trimmed.  Titles longer than 80 characters are cut to 77 characters
    plus ``...``.  ``None`` is returned when no title element is found or
    the title is empty.
    """

    found = _TITLE_RE.search(text)
    if found is None:
        return None

    collapsed = re.sub(r"\s+", " ", found.group(1)).strip()
    if not collapsed:
        return None
    return collapsed if len(collapsed) <= 80 else f"{collapsed[:77]}..."
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Reduce opaque blobs and long payloads."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..text import join_preserving_final_newline, score, split_preserving_final_newline
|
|
6
|
+
from .detector import (
|
|
7
|
+
LinePayloadSpan,
|
|
8
|
+
find_data_url_spans,
|
|
9
|
+
find_line_payload_spans,
|
|
10
|
+
split_http_response,
|
|
11
|
+
summarize_html_payload,
|
|
12
|
+
summarize_json_payload,
|
|
13
|
+
summarize_minified_js_payload,
|
|
14
|
+
)
|
|
15
|
+
from .summary import (
|
|
16
|
+
format_base64_summary,
|
|
17
|
+
format_data_url_summary,
|
|
18
|
+
format_html_summary,
|
|
19
|
+
format_json_summary,
|
|
20
|
+
format_minified_js_summary,
|
|
21
|
+
format_pem_summary,
|
|
22
|
+
with_final_newline,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def compress_opaque_payload_output(text: str) -> str:
    """Return the best-scoring reduction of ``text``, or ``text`` itself.

    Two candidates compete with the original: a summary of the text as a
    whole payload (when one is recognised), and the text with line
    payloads and then data URLs replaced by summaries.  A candidate wins
    only when ``_choose_smaller`` prefers it.
    """

    whole = _summarize_whole_payload(text)
    best = text if whole is None else _choose_smaller(text, whole)

    partially_reduced = _replace_data_urls(_replace_line_payloads(text))
    return _choose_smaller(best, partially_reduced)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def _summarize_whole_payload(text: str) -> str | None:
    """Summarize ``text`` as a single payload, keeping HTTP headers verbatim.

    When ``text`` is an HTTP response only its body is summarized, and the
    original headers and separator are put back in front.  Returns
    ``None`` when the (body) text is not a recognised payload.
    """

    response = split_http_response(text)
    if response is None:
        return _summarize_body(text)

    summary = _summarize_body(response.body)
    if summary is None:
        return None
    return response.headers + response.separator + summary
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _summarize_body(body: str) -> str | None:
    """Summarize ``body`` as JSON, HTML or minified JavaScript, in that order.

    The first detector that recognises the body decides the summary.  The
    flag saying whether the body ended with a newline is passed on to
    ``with_final_newline``.  Returns ``None`` when no detector matches.
    """

    keep_newline = body.endswith("\n")

    as_json = summarize_json_payload(body)
    if as_json is not None:
        rendered = format_json_summary(
            kind=as_json.kind,
            keys=as_json.keys,
            byte_count=as_json.byte_count,
        )
        return with_final_newline(rendered, keep_newline)

    as_html = summarize_html_payload(body)
    if as_html is not None:
        rendered = format_html_summary(
            byte_count=as_html.byte_count,
            title=as_html.title,
        )
        return with_final_newline(rendered, keep_newline)

    as_js = summarize_minified_js_payload(body)
    if as_js is not None:
        rendered = format_minified_js_summary(byte_count=as_js.byte_count)
        return with_final_newline(rendered, keep_newline)

    return None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _replace_line_payloads(text: str) -> str:
    """Replace each PEM/base64 line span in ``text`` with a one-line summary.

    Lines outside the spans are kept as they are.  ``text`` is returned
    unchanged when no span is found.
    """

    lines, final_newline = split_preserving_final_newline(text)
    spans = find_line_payload_spans(lines)
    if not spans:
        return text

    rebuilt: list[str] = []
    resume = 0
    for span in spans:
        rebuilt += lines[resume : span.start]  # untouched lines before the span
        rebuilt.append(_format_line_span(span))
        resume = span.end
    rebuilt += lines[resume:]  # untouched tail

    return join_preserving_final_newline(rebuilt, final_newline)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _replace_data_urls(text: str) -> str:
    """Replace each large data URL in ``text`` with a short summary.

    Text between the URLs is kept as it is.  ``text`` is returned
    unchanged when no data URL span is found.
    """

    spans = find_data_url_spans(text)
    if not spans:
        return text

    pieces: list[str] = []
    resume = 0
    for span in spans:
        summary = format_data_url_summary(
            mime=span.mime,
            encoding=span.encoding,
            byte_count=span.byte_count,
        )
        pieces += [text[resume : span.start], summary]
        resume = span.end
    pieces.append(text[resume:])
    return "".join(pieces)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def _format_line_span(span: LinePayloadSpan) -> str:
    """Render the summary line for ``span``: PEM format for ``"pem"`` spans, base64 format otherwise."""

    if span.kind != "pem":
        return format_base64_summary(
            line_count=span.line_count,
            byte_count=span.byte_count,
        )

    assert span.label is not None
    return format_pem_summary(
        label=span.label,
        line_count=span.line_count,
        byte_count=span.byte_count,
    )
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _choose_smaller(current: str, candidate: str) -> str:
    """Return ``candidate`` when its ``score`` is strictly lower than that of ``current``; ties keep ``current``."""

    return candidate if score(candidate) < score(current) else current
|