codetool-shell 0.1.1__py3-none-win_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. codetool_shell/__init__.py +11 -0
  2. codetool_shell/api.py +59 -0
  3. codetool_shell/bin/windows-arm64/codetool-shell-rust.exe +0 -0
  4. codetool_shell/filters/__init__.py +14 -0
  5. codetool_shell/filters/build_compiler/__init__.py +7 -0
  6. codetool_shell/filters/build_compiler/detector.py +412 -0
  7. codetool_shell/filters/build_compiler/reducer.py +166 -0
  8. codetool_shell/filters/build_compiler/summary.py +617 -0
  9. codetool_shell/filters/ci_job_log/__init__.py +7 -0
  10. codetool_shell/filters/ci_job_log/detector.py +64 -0
  11. codetool_shell/filters/ci_job_log/reducer.py +99 -0
  12. codetool_shell/filters/ci_job_log/summary.py +243 -0
  13. codetool_shell/filters/diff/__init__.py +7 -0
  14. codetool_shell/filters/diff/detector.py +136 -0
  15. codetool_shell/filters/diff/reducer.py +308 -0
  16. codetool_shell/filters/generic_log/__init__.py +7 -0
  17. codetool_shell/filters/generic_log/detector.py +175 -0
  18. codetool_shell/filters/generic_log/reducer.py +99 -0
  19. codetool_shell/filters/generic_log/summary.py +161 -0
  20. codetool_shell/filters/git.py +514 -0
  21. codetool_shell/filters/html_cleanup/__init__.py +7 -0
  22. codetool_shell/filters/html_cleanup/detector.py +136 -0
  23. codetool_shell/filters/html_cleanup/reducer.py +27 -0
  24. codetool_shell/filters/html_cleanup/summary.py +422 -0
  25. codetool_shell/filters/json_payload/__init__.py +7 -0
  26. codetool_shell/filters/json_payload/detector.py +62 -0
  27. codetool_shell/filters/json_payload/reducer.py +81 -0
  28. codetool_shell/filters/json_payload/summary.py +233 -0
  29. codetool_shell/filters/listing/__init__.py +7 -0
  30. codetool_shell/filters/listing/detector.py +294 -0
  31. codetool_shell/filters/listing/reducer.py +30 -0
  32. codetool_shell/filters/log_template/__init__.py +7 -0
  33. codetool_shell/filters/log_template/constants.py +76 -0
  34. codetool_shell/filters/log_template/detector.py +331 -0
  35. codetool_shell/filters/log_template/reducer.py +78 -0
  36. codetool_shell/filters/log_template/template.py +280 -0
  37. codetool_shell/filters/log_template/types.py +21 -0
  38. codetool_shell/filters/opaque_payload/__init__.py +7 -0
  39. codetool_shell/filters/opaque_payload/detector.py +563 -0
  40. codetool_shell/filters/opaque_payload/reducer.py +142 -0
  41. codetool_shell/filters/opaque_payload/summary.py +61 -0
  42. codetool_shell/filters/package_manager/__init__.py +7 -0
  43. codetool_shell/filters/package_manager/detector.py +220 -0
  44. codetool_shell/filters/package_manager/reducer.py +110 -0
  45. codetool_shell/filters/package_manager/summary.py +172 -0
  46. codetool_shell/filters/pipeline.py +65 -0
  47. codetool_shell/filters/rg.py +250 -0
  48. codetool_shell/filters/system_output/__init__.py +7 -0
  49. codetool_shell/filters/system_output/detector.py +600 -0
  50. codetool_shell/filters/system_output/reducer.py +331 -0
  51. codetool_shell/filters/system_output/summary.py +164 -0
  52. codetool_shell/filters/table/__init__.py +7 -0
  53. codetool_shell/filters/table/detector.py +244 -0
  54. codetool_shell/filters/table/reducer.py +57 -0
  55. codetool_shell/filters/table/summary.py +37 -0
  56. codetool_shell/filters/test_runner/__init__.py +7 -0
  57. codetool_shell/filters/test_runner/ansi.py +80 -0
  58. codetool_shell/filters/test_runner/detector.py +409 -0
  59. codetool_shell/filters/test_runner/reducer.py +288 -0
  60. codetool_shell/filters/test_runner/summary.py +449 -0
  61. codetool_shell/filters/text.py +38 -0
  62. codetool_shell/filters/traceback/__init__.py +7 -0
  63. codetool_shell/filters/traceback/detector.py +209 -0
  64. codetool_shell/filters/traceback/reducer.py +141 -0
  65. codetool_shell/filters/traceback/summary.py +122 -0
  66. codetool_shell/filters/tree.py +59 -0
  67. codetool_shell/py.typed +0 -0
  68. codetool_shell/python_backend.py +38 -0
  69. codetool_shell/rust_backend.py +254 -0
  70. codetool_shell-0.1.1.dist-info/METADATA +152 -0
  71. codetool_shell-0.1.1.dist-info/RECORD +72 -0
  72. codetool_shell-0.1.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,563 @@
1
+ """Conservative opaque payload detectors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ import binascii
7
+ import json
8
+ import re
9
+ from dataclasses import dataclass
10
+ from typing import Any
11
+
12
+ from .summary import byte_len
13
+
14
# --- Character classes -------------------------------------------------------
# Standard and URL-safe base64 alphabets (both include the "=" padding char).
_BASE64_STANDARD_ALPHABET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/="
)
_BASE64_URLSAFE_ALPHABET = frozenset(
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_="
)
# Every character that is legal in either base64 flavour.
_BASE64_ALPHABET = _BASE64_STANDARD_ALPHABET | _BASE64_URLSAFE_ALPHABET
_HEX_ALPHABET = frozenset("0123456789abcdefABCDEF")
# Characters whose presence marks a line as structured text, not a raw blob.
_STRUCTURAL_PAYLOAD_CHARS = frozenset("{}[]\"':,<>;")

# --- Patterns ----------------------------------------------------------------
_BEGIN_RE = re.compile(r"^-----BEGIN ([A-Z0-9][A-Z0-9 ._-]*?)-----$")
_TITLE_RE = re.compile(r"(?is)<title[^>]*>(.*?)</title>")
_SIMPLE_KEY_RE = re.compile(r"^[A-Za-z0-9_.:-]{1,64}$")
_MIME_RE = re.compile(r"^[A-Za-z0-9.+-]+/[A-Za-z0-9.+-]+$")

# --- Base64 line/block thresholds --------------------------------------------
_BASE64_SINGLE_TOKEN_MIN_LENGTH = 1024
_BASE64_BLOCK_MIN_LINES = 4
_BASE64_BLOCK_MIN_CHARS = 2048
_BASE64_RUN_LINE_MIN_LENGTH = 32
_BASE64_VALID_ALPHABET_RATIO = 0.98

# --- PEM body thresholds -----------------------------------------------------
_PEM_BODY_MIN_LINES = 4
_PEM_BODY_MIN_BYTES = 512
_PEM_BODY_LINE_MIN_LENGTH = 16

# --- Data URL thresholds -----------------------------------------------------
_DATA_URL_METADATA_MAX_LENGTH = 256
_DATA_URL_PAYLOAD_MIN_LENGTH = 2048
_DATA_URL_MIN_BYTES = 4096
_DATA_URL_BASE64_ALPHABET_RATIO = 0.95

# --- Whole-payload (JSON / HTML / minified JS) thresholds --------------------
_OPAQUE_JSON_STRING_MIN_BYTES = 8192
_OPAQUE_JSON_MIN_OPAQUE_RATIO_PERCENT = 70
_WHOLE_PAYLOAD_MIN_BYTES = 32768
_WHOLE_PAYLOAD_MAX_LINES = 3
_WHOLE_PAYLOAD_MAX_LINE_MIN_LENGTH = 8192
_MINIFIED_PAYLOAD_MAX_WHITESPACE_RATIO = 0.10
_OPAQUE_STRING_MAX_WHITESPACE_RATIO = 0.01
48
+
49
+
50
@dataclass(frozen=True)
class LinePayloadSpan:
    """Immutable record of an opaque payload covering whole lines.

    ``start`` and ``end`` are indexes into the list of lines that was
    scanned; ``end`` is exclusive.
    """

    # Index of the first line in the span.
    start: int
    # Index one past the last line in the span.
    end: int
    # Payload flavour; the detectors in this module emit "pem" or "base64".
    kind: str
    # Size of the covered lines as measured by ``byte_len``.
    byte_count: int
    # Number of lines covered by the span.
    line_count: int
    # PEM label taken from the BEGIN marker; left as None for base64 spans.
    label: str | None = None
60
+
61
+
62
@dataclass(frozen=True)
class DataUrlSpan:
    """Immutable record of a large ``data:`` URL located inside a string.

    ``start`` and ``end`` are character offsets into the scanned text;
    ``end`` is exclusive.
    """

    # Offset of the "data:" prefix.
    start: int
    # Offset one past the last payload character.
    end: int
    # Lower-cased MIME type taken from the URL metadata.
    mime: str
    # "base64" when the metadata carries a base64 flag, otherwise "urlencoded".
    encoding: str
    # Size of the whole URL text as measured by ``byte_len``.
    byte_count: int
71
+
72
+
73
@dataclass(frozen=True)
class JsonPayloadSummary:
    """Metadata describing a JSON document that was judged opaque."""

    # Classification label; "json-source-map" or "json-opaque" in this module.
    kind: str
    # Top-level object keys that passed the simple-key pattern.
    keys: list[str]
    # Size of the original text as measured by ``byte_len``.
    byte_count: int
80
+
81
+
82
@dataclass(frozen=True)
class HtmlPayloadSummary:
    """Metadata describing an HTML document that was summarized as a whole."""

    # Size of the document text as measured by ``byte_len``.
    byte_count: int
    # Whitespace-collapsed <title> contents, or None when no title was found.
    title: str | None
88
+
89
+
90
@dataclass(frozen=True)
class MinifiedJsSummary:
    """Metadata describing a minified JavaScript body summarized as a whole."""

    # Size of the script text as measured by ``byte_len``.
    byte_count: int
95
+
96
+
97
@dataclass(frozen=True)
class HttpBody:
    """An HTTP response broken into header text, separator and body."""

    # Status line plus header lines, without the trailing blank-line separator.
    headers: str
    # The blank-line separator that was found ("\r\n\r\n" or "\n\n").
    separator: str
    # Everything after the separator.
    body: str
104
+
105
+
106
def split_http_response(text: str) -> HttpBody | None:
    """Split clear HTTP response headers from a body.

    The header block ends at the *first* blank line, whether it is written
    as ``\\r\\n\\r\\n`` or ``\\n\\n``.  Choosing the earliest separator matters
    for LF-delimited responses whose body happens to contain a CRLF blank
    line: preferring CRLF unconditionally would fold body text into the
    headers.

    Args:
        text: Raw output that may start with an HTTP status line.

    Returns:
        An ``HttpBody`` when ``text`` starts with ``HTTP/`` and, if any header
        lines follow the status line, at least one of them contains a colon;
        otherwise ``None``.
    """

    crlf_end = text.find("\r\n\r\n")
    lf_end = text.find("\n\n")
    # The two separators can never start at the same offset, so a strict
    # comparison is enough to pick the earlier one.
    if crlf_end >= 0 and (lf_end < 0 or crlf_end < lf_end):
        header_end, separator = crlf_end, "\r\n\r\n"
    else:
        header_end, separator = lf_end, "\n\n"
    # A separator at offset 0 means there is no status line at all.
    if header_end <= 0:
        return None

    headers = text[:header_end]
    normalized_lines = headers.replace("\r\n", "\n").split("\n")
    if not normalized_lines[0].startswith("HTTP/"):
        return None

    # A bare status line is accepted; once header lines exist, at least one
    # must look like "Name: value".
    header_lines = normalized_lines[1:]
    if header_lines and not any(":" in line for line in header_lines):
        return None

    return HttpBody(
        headers=headers,
        separator=separator,
        body=text[header_end + len(separator) :],
    )
132
+
133
+
134
def find_line_payload_spans(lines: list[str]) -> list[LinePayloadSpan]:
    """Return non-overlapping PEM/base64 line spans.

    Each position is tried, in order, as the start of a PEM block, the start
    of a multi-line base64 run, and finally as a single long base64 token.

    A base64-looking run that fails the block thresholds (too few lines, too
    few characters, or hex-only content) is remembered so it is not scanned
    again from every line inside it.  Every suffix of a rejected run is also
    rejected: it has no more lines or characters, and a suffix of a hex-only
    run is itself hex-only.  Skipping the re-scan therefore keeps the result
    unchanged while making long hex listings linear instead of quadratic.

    Args:
        lines: Output split into lines (without line terminators).

    Returns:
        Spans in ascending order; ``end`` indexes are exclusive.
    """

    spans: list[LinePayloadSpan] = []
    index = 0
    # Exclusive end of the most recent run that failed the block thresholds.
    rejected_run_end = 0
    while index < len(lines):
        pem_span = _parse_pem_span(lines, index)
        if pem_span is not None:
            spans.append(pem_span)
            index = pem_span.end
            continue

        if index >= rejected_run_end:
            run_end, total_chars, concatenated = _base64_run(lines, index)
            if (
                run_end - index >= _BASE64_BLOCK_MIN_LINES
                and total_chars >= _BASE64_BLOCK_MIN_CHARS
                and not _is_hex_only(concatenated)
            ):
                spans.append(
                    LinePayloadSpan(
                        start=index,
                        end=run_end,
                        kind="base64",
                        byte_count=_line_span_byte_count(lines, index, run_end),
                        line_count=run_end - index,
                    )
                )
                index = run_end
                continue
            rejected_run_end = run_end

        # A single very long token still counts, but only when the line has
        # no surrounding whitespace.
        line = lines[index].strip()
        if lines[index] == line and _is_base64_token(
            line, min_length=_BASE64_SINGLE_TOKEN_MIN_LENGTH
        ):
            spans.append(
                LinePayloadSpan(
                    start=index,
                    end=index + 1,
                    kind="base64",
                    byte_count=byte_len(line),
                    line_count=1,
                )
            )
        index += 1

    return spans
180
+
181
+
182
def find_data_url_spans(text: str) -> list[DataUrlSpan]:
    """Collect every large ``data:`` URL in ``text``, left to right.

    Candidates that do not parse as a large data URL are skipped and the
    search resumes just past their ``data:`` prefix; accepted URLs move the
    search to their end so spans never overlap.
    """

    prefix = "data:"
    found: list[DataUrlSpan] = []
    candidate = text.find(prefix)
    while candidate >= 0:
        parsed = _parse_data_url(text, candidate)
        if parsed is None:
            resume_at = candidate + len(prefix)
        else:
            found.append(parsed)
            resume_at = parsed.end
        candidate = text.find(prefix, resume_at)
    return found
201
+
202
+
203
def summarize_json_payload(text: str) -> JsonPayloadSummary | None:
    """Summarize valid JSON source maps or JSON dominated by opaque strings.

    Args:
        text: Candidate payload; surrounding whitespace is ignored for parsing.

    Returns:
        A summary when ``text`` is a JSON object with at least one simple
        top-level key whose opaque string content meets both the absolute
        byte threshold and the ratio threshold; otherwise ``None``.  Input
        that cannot be processed (invalid JSON, non-standard constants, or
        nesting too deep for the interpreter) also yields ``None`` rather
        than raising.
    """

    body = text.strip()
    if not (body.startswith("{") and body.endswith("}")):
        return None

    try:
        value = json.loads(body, parse_constant=_reject_json_constant)
    except (ValueError, RecursionError):
        # JSONDecodeError is a ValueError; RecursionError is raised for
        # pathologically nested documents and must not escape this detector.
        return None
    if not isinstance(value, dict):
        return None

    keys = [key for key in value if _SIMPLE_KEY_RE.fullmatch(key)]
    if not keys:
        return None

    try:
        serialized = json.dumps(value, ensure_ascii=False, separators=(",", ":"))
    except (TypeError, ValueError, RecursionError):
        return None

    serialized_bytes = byte_len(serialized)
    try:
        opaque_bytes = _opaque_string_bytes(value)
    except RecursionError:
        # The recursive walk can run out of stack on nesting the parser accepted.
        return None
    if (
        opaque_bytes < _OPAQUE_JSON_STRING_MIN_BYTES
        or opaque_bytes * 100 < serialized_bytes * _OPAQUE_JSON_MIN_OPAQUE_RATIO_PERCENT
    ):
        return None

    kind = "json-source-map" if _is_source_map_signature(value) else "json-opaque"
    # The reported size is that of the original text, not the compact re-serialization.
    return JsonPayloadSummary(kind=kind, keys=keys, byte_count=byte_len(text))
236
+
237
+
238
def summarize_html_payload(text: str) -> HtmlPayloadSummary | None:
    """Describe ``text`` when it is one large, complete HTML document.

    The text qualifies when it is either big overall or contains a very long
    line, and, after trimming whitespace, starts with ``<!doctype html`` or
    ``<html`` and ends with ``</html>`` (case-insensitive).  Returns ``None``
    otherwise.
    """

    body_bytes = byte_len(text)
    longest_line = max(map(len, text.split("\n")), default=0)
    is_large = (
        body_bytes >= _WHOLE_PAYLOAD_MIN_BYTES
        or longest_line >= _WHOLE_PAYLOAD_MAX_LINE_MIN_LENGTH
    )
    if not is_large:
        return None

    document = text.strip().lower()
    # Anchoring both ends also guarantees the markers occur in the text.
    if not document.startswith(("<!doctype html", "<html")):
        return None
    if not document.endswith("</html>"):
        return None

    return HtmlPayloadSummary(byte_count=body_bytes, title=_extract_title(text))
260
+
261
+
262
def summarize_minified_js_payload(text: str) -> MinifiedJsSummary | None:
    """Describe ``text`` when it is a large, clearly minified JavaScript body.

    The text must meet the whole-payload size threshold, be packed into few
    lines or contain a very long line, carry little whitespace, begin with a
    typical script opener, and contain a JavaScript-looking anchor.  Returns
    ``None`` when any of these checks fails.
    """

    body_bytes = byte_len(text)
    if body_bytes < _WHOLE_PAYLOAD_MIN_BYTES:
        return None

    populated = [line for line in text.splitlines() if line.strip()]
    longest = max(map(len, populated), default=0)
    many_lines = len(populated) > _WHOLE_PAYLOAD_MAX_LINES
    if many_lines and longest < _WHOLE_PAYLOAD_MAX_LINE_MIN_LENGTH:
        return None
    if _whitespace_ratio(text) > _MINIFIED_PAYLOAD_MAX_WHITESPACE_RATIO:
        return None

    stripped = text.strip()
    openers = ("(()=>", "(function", "!function", "function", "var ", "let ", "const ")
    if not stripped.startswith(openers):
        return None

    lowered = stripped.lower()
    # "(()=>" implies "=>" and "!function" implies "function", so these four
    # probes cover every anchor.
    anchored = (
        any(marker in lowered for marker in ("function", "webpack", "//# sourcemappingurl"))
        or "=>" in text
    )
    if not anchored:
        return None

    return MinifiedJsSummary(byte_count=body_bytes)
298
+
299
+
300
def _parse_pem_span(lines: list[str], start: int) -> LinePayloadSpan | None:
    """Return the PEM block opening at ``lines[start]``, if there is one.

    The line must be a ``-----BEGIN <label>-----`` marker.  The first
    matching ``-----END <label>-----`` line closes the block; the block is
    accepted only when the lines in between pass ``_is_pem_body``.
    """

    opening = _BEGIN_RE.fullmatch(lines[start].strip())
    if opening is None:
        return None

    label = opening.group(1)
    closing = f"-----END {label}-----"
    first_body = start + 1
    for position in range(first_body, len(lines)):
        if lines[position].strip() != closing:
            continue
        # Only the first END marker is considered.
        if not _is_pem_body(lines[first_body:position]):
            return None
        stop = position + 1
        return LinePayloadSpan(
            start=start,
            end=stop,
            kind="pem",
            label=label,
            byte_count=_line_span_byte_count(lines, start, stop),
            line_count=stop - start,
        )

    return None
324
+
325
+
326
def _is_pem_body(lines: list[str]) -> bool:
    """Tell whether ``lines`` look like the inside of a PEM block.

    The body must have enough non-blank lines or enough bytes.  Short lines
    containing a colon are skipped as headers; every other non-blank line
    must be base64-like, and at least one such line must exist.
    """

    entries = [entry for entry in (raw.strip() for raw in lines) if entry]
    body_bytes = byte_len("\n".join(lines))
    if len(entries) < _PEM_BODY_MIN_LINES and body_bytes < _PEM_BODY_MIN_BYTES:
        return False

    saw_payload = False
    for entry in entries:
        looks_like_header = ":" in entry and len(entry) <= 80
        if looks_like_header:
            continue
        if not _is_base64ish_line(entry, min_length=_PEM_BODY_LINE_MIN_LENGTH):
            return False
        saw_payload = True

    return saw_payload
341
+
342
+
343
def _base64_run(lines: list[str], start: int) -> tuple[int, int, str]:
    """Measure the run of base64-like lines beginning at ``start``.

    A line extends the run only when it has no surrounding whitespace and
    passes ``_is_base64ish_line`` with diff/path-looking lines disallowed.

    Returns:
        ``(end, total_chars, concatenated)`` where ``end`` is the exclusive
        index at which the run stops.
    """

    end = start
    pieces: list[str] = []
    for position in range(start, len(lines)):
        raw = lines[position]
        trimmed = raw.strip()
        extends_run = raw == trimmed and _is_base64ish_line(
            trimmed, min_length=_BASE64_RUN_LINE_MIN_LENGTH, allow_diff_prefix=False
        )
        if not extends_run:
            break
        pieces.append(trimmed)
        end = position + 1

    joined = "".join(pieces)
    return end, len(joined), joined
357
+
358
+
359
def _is_base64_token(text: str, *, min_length: int) -> bool:
    """Tell whether ``text`` is one standalone base64 token.

    The token must be long enough and free of whitespace and structural
    characters, must not resemble a path or diff line, must have a valid
    base64 shape, and must not consist solely of hex digits.
    """

    if len(text) < min_length:
        return False
    if any(char.isspace() for char in text):
        return False
    if _looks_like_path_or_diff_line(text):
        return False
    if _has_structural_payload_chars(text):
        return False
    return _valid_base64_shape(text) and not _is_hex_only(text)
368
+
369
+
370
def _is_base64ish_line(
    text: str, *, min_length: int, allow_diff_prefix: bool = True
) -> bool:
    """Tell whether ``text`` could be one line of a base64 block.

    Unlike ``_is_base64_token`` this accepts hex-only lines.  Lines that look
    like paths or diff lines are rejected only when ``allow_diff_prefix`` is
    false.
    """

    if len(text) < min_length:
        return False
    if any(char.isspace() for char in text):
        return False
    if not allow_diff_prefix and _looks_like_path_or_diff_line(text):
        return False
    if _has_structural_payload_chars(text):
        return False
    return _valid_base64_shape(text)
380
+
381
+
382
def _base64_ratio(text: str) -> float:
    """Return the fraction of characters in ``text`` that belong to the
    combined base64 alphabet; an empty string scores ``0.0``."""

    if not text:
        return 0.0
    matching = [char for char in text if char in _BASE64_ALPHABET]
    return len(matching) / len(text)
387
+
388
+
389
def _is_hex_only(text: str) -> bool:
    """Tell whether ``text``, ignoring surrounding whitespace, is non-empty
    and made up purely of hexadecimal digits."""

    core = text.strip()
    if not core:
        return False
    return _HEX_ALPHABET.issuperset(core)
392
+
393
+
394
def _has_structural_payload_chars(text: str) -> bool:
    """Tell whether ``text`` contains any structural punctuation character
    (braces, brackets, quotes, colon, comma, angle brackets, semicolon)."""

    return not _STRUCTURAL_PAYLOAD_CHARS.isdisjoint(text)
396
+
397
+
398
def _looks_like_path_or_diff_line(text: str) -> bool:
    """Tell whether ``text`` resembles a diff line or a filesystem path.

    True when the text starts with ``+`` or ``-``, when it has at least two
    slashes that are dense relative to its length, or when it has at least
    three slashes with no empty segment between them.
    """

    if text[:1] in ("+", "-"):
        return True

    slashes = text.count("/")
    if slashes < 2:
        return False
    # Dense slashes: on average at most eight characters per slash.
    if slashes * 8 >= len(text):
        return True
    return slashes >= 3 and "" not in text.split("/")
407
+
408
+
409
def _valid_base64_shape(text: str) -> bool:
    """Tell whether ``text`` is well-formed standard or URL-safe base64.

    The text must use exactly one of the two alphabets, carry ``=`` only as
    trailing padding, have a decodable length, and decode successfully once
    its padding is normalized.
    """

    if _base64_ratio(text) < _BASE64_VALID_ALPHABET_RATIO:
        return False

    uses_urlsafe = "-" in text or "_" in text
    uses_standard = "+" in text or "/" in text
    if uses_standard and uses_urlsafe:
        return False

    allowed = _BASE64_URLSAFE_ALPHABET if uses_urlsafe else _BASE64_STANDARD_ALPHABET
    if not allowed.issuperset(text):
        return False

    core = text.rstrip("=")
    if "=" in core:
        return False
    # A single leftover character can never encode a whole byte.
    if len(core) % 4 == 1:
        return False

    padded = core + "=" * (-len(core) % 4)
    try:
        encoded = padded.encode("ascii")
        if uses_urlsafe:
            base64.urlsafe_b64decode(encoded)
        else:
            base64.b64decode(encoded, validate=True)
    except (binascii.Error, ValueError):
        return False
    return True
433
+
434
+
435
def _line_span_byte_count(lines: list[str], start: int, end: int) -> int:
    """Return ``byte_len`` of ``lines[start:end]`` joined with newlines."""

    joined = "\n".join(lines[start:end])
    return byte_len(joined)
437
+
438
+
439
def _parse_data_url(text: str, start: int) -> DataUrlSpan | None:
    """Parse the ``data:`` URL that begins at ``text[start]``.

    ``start`` must point at the ``data:`` prefix.  Returns ``None`` unless
    the metadata is short, free of delimiters, names a MIME type and is
    followed by a comma and a payload large enough to be worth summarizing.
    """

    metadata_start = start + len("data:")

    # Find the comma that separates metadata from payload.
    comma = -1
    for position in range(metadata_start, len(text)):
        char = text[position]
        if char == ",":
            comma = position
            break
        too_long = position - metadata_start > _DATA_URL_METADATA_MAX_LENGTH
        if _is_data_url_metadata_delimiter(char) or too_long:
            return None
    if comma < 0:
        return None

    # Splitting an empty string yields [""], so the MIME check rejects it too.
    parts = text[metadata_start:comma].split(";")
    mime = parts[0].lower()
    if not _MIME_RE.fullmatch(mime):
        return None

    flags = [part.lower() for part in parts[1:]]
    encoding = "base64" if "base64" in flags else "urlencoded"

    payload_start = comma + 1
    payload_end = payload_start
    for position in range(payload_start, len(text)):
        if _is_data_url_payload_delimiter(text[position], encoding=encoding):
            break
        payload_end = position + 1

    payload = text[payload_start:payload_end]
    if not payload:
        return None

    byte_count = byte_len(text[start:payload_end])
    is_small = (
        len(payload) < _DATA_URL_PAYLOAD_MIN_LENGTH
        and byte_count < _DATA_URL_MIN_BYTES
    )
    if is_small:
        return None
    if encoding == "base64" and _base64_ratio(payload) < _DATA_URL_BASE64_ALPHABET_RATIO:
        return None

    return DataUrlSpan(
        start=start,
        end=payload_end,
        mime=mime,
        encoding=encoding,
        byte_count=byte_count,
    )
487
+
488
+
489
def _is_data_url_metadata_delimiter(char: str) -> bool:
    """Tell whether ``char`` is whitespace or a quote, angle bracket, closing
    bracket or backtick — characters that end a data URL."""

    if char.isspace():
        return True
    return char in ('"', "'", "<", ">", ")", "]", "}", "`")
491
+
492
+
493
def _is_data_url_payload_delimiter(char: str, *, encoding: str) -> bool:
    """Tell whether ``char`` ends a data URL payload.

    Metadata delimiters always end the payload.  For base64 payloads any
    character outside the combined base64 alphabet ends it as well.
    """

    if _is_data_url_metadata_delimiter(char):
        return True
    return encoding == "base64" and char not in _BASE64_ALPHABET
499
+
500
+
501
def _reject_json_constant(value: str) -> None:
    """``parse_constant`` hook for ``json.loads``: always raises ``ValueError``
    naming the constant it was given."""

    message = f"non-standard JSON constant {value}"
    raise ValueError(message)
503
+
504
+
505
def _is_source_map_signature(value: dict[str, Any]) -> bool:
    """Tell whether the keys of ``value`` match a source-map shape.

    Key names are compared case-insensitively: ``version`` and ``mappings``
    are required, plus at least one of ``sources``, ``names`` or ``file``.
    """

    names = {key.lower() for key in value}
    if not {"version", "mappings"} <= names:
        return False
    return not names.isdisjoint(("sources", "names", "file"))
510
+
511
+
512
def _opaque_string_bytes(value: Any, key_hint: str = "") -> int:
    """Add up the size of all opaque strings inside a decoded JSON value.

    Dicts and lists are walked recursively.  A string counts when it is long
    enough and ``_is_opaque_string`` accepts it; ``key_hint`` is the name of
    the nearest enclosing dict key.  Other value types contribute nothing.
    """

    if isinstance(value, str):
        counts = len(value) >= _OPAQUE_JSON_STRING_MIN_BYTES and _is_opaque_string(
            value, key_hint
        )
        return byte_len(value) if counts else 0

    if isinstance(value, dict):
        total = 0
        for key, child in value.items():
            total += _opaque_string_bytes(child, str(key))
        return total

    if isinstance(value, list):
        total = 0
        for child in value:
            # List items inherit the key under which the list was stored.
            total += _opaque_string_bytes(child, key_hint)
        return total

    return 0
522
+
523
+
524
def _is_opaque_string(value: str, key_hint: str) -> bool:
    """Tell whether a JSON string value should be treated as opaque.

    A string is opaque when it is stored under a ``mappings`` key
    (case-insensitive), starts with ``data:``, is a long base64 token, looks
    like minified JavaScript, or contains almost no whitespace.
    """

    if key_hint.lower() == "mappings":
        return True
    if value.startswith("data:"):
        return True
    if _is_base64_token(value, min_length=_BASE64_SINGLE_TOKEN_MIN_LENGTH):
        return True
    if _looks_like_minified_js_string(value):
        return True
    return _whitespace_ratio(value) <= _OPAQUE_STRING_MAX_WHITESPACE_RATIO
533
+
534
+
535
def _looks_like_minified_js_string(value: str) -> bool:
    """Tell whether ``value`` is low on whitespace and contains a
    JavaScript-looking marker (``function``, ``webpack`` or ``=>``)."""

    if _whitespace_ratio(value) > _MINIFIED_PAYLOAD_MAX_WHITESPACE_RATIO:
        return False

    lowered = value.lower()
    # "(()=>" implies "=>" and "!function" implies "function", so three
    # probes cover every marker.
    if "function" in lowered or "webpack" in lowered:
        return True
    return "=>" in value
546
+
547
+
548
def _whitespace_ratio(value: str) -> float:
    """Return the fraction of characters in ``value`` that are whitespace;
    an empty string scores ``0.0``."""

    if not value:
        return 0.0
    blanks = [char for char in value if char.isspace()]
    return len(blanks) / len(value)
552
+
553
+
554
def _extract_title(text: str) -> str | None:
    """Return the first ``<title>`` of an HTML document, tidied for display.

    Whitespace runs are collapsed to single spaces.  Titles longer than 80
    characters are cut to 77 characters plus ``...``.  Returns ``None`` when
    there is no title element or it is blank.
    """

    found = _TITLE_RE.search(text)
    if found is None:
        return None

    collapsed = re.sub(r"\s+", " ", found.group(1)).strip()
    if not collapsed:
        return None
    return collapsed if len(collapsed) <= 80 else f"{collapsed[:77]}..."
@@ -0,0 +1,142 @@
1
+ """Reduce opaque blobs and long payloads."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ..text import join_preserving_final_newline, score, split_preserving_final_newline
6
+ from .detector import (
7
+ LinePayloadSpan,
8
+ find_data_url_spans,
9
+ find_line_payload_spans,
10
+ split_http_response,
11
+ summarize_html_payload,
12
+ summarize_json_payload,
13
+ summarize_minified_js_payload,
14
+ )
15
+ from .summary import (
16
+ format_base64_summary,
17
+ format_data_url_summary,
18
+ format_html_summary,
19
+ format_json_summary,
20
+ format_minified_js_summary,
21
+ format_pem_summary,
22
+ with_final_newline,
23
+ )
24
+
25
+
26
def compress_opaque_payload_output(text: str) -> str:
    """Return the best-scoring rendition of ``text``.

    Two candidates compete with the untouched input: a whole-payload summary
    (when one applies) and the text with line payloads and data URLs replaced
    by summaries.  A candidate wins only when it scores strictly lower.
    """

    candidates: list[str] = []

    whole_summary = _summarize_whole_payload(text)
    if whole_summary is not None:
        candidates.append(whole_summary)

    # Data URLs are replaced in the output of the line-payload pass.
    candidates.append(_replace_data_urls(_replace_line_payloads(text)))

    best = text
    for candidate in candidates:
        best = _choose_smaller(best, candidate)
    return best
40
+
41
+
42
def _summarize_whole_payload(text: str) -> str | None:
    """Summarize ``text`` as a single payload, keeping HTTP headers intact.

    When ``text`` is an HTTP response only its body is summarized, and the
    original headers and separator are put back in front of the summary.
    Returns ``None`` when no summary applies.
    """

    response = split_http_response(text)
    if response is None:
        return _summarize_body(text)

    summary = _summarize_body(response.body)
    if summary is None:
        return None
    return f"{response.headers}{response.separator}{summary}"
51
+
52
+
53
def _summarize_body(body: str) -> str | None:
    """Summarize ``body`` as JSON, HTML or minified JavaScript.

    The detectors are tried in that order and the first match wins.  The
    flag passed to ``with_final_newline`` records whether ``body`` ended with
    a newline.  Returns ``None`` when no detector matches.
    """

    ends_with_newline = body.endswith("\n")

    as_json = summarize_json_payload(body)
    if as_json is not None:
        rendered = format_json_summary(
            kind=as_json.kind,
            keys=as_json.keys,
            byte_count=as_json.byte_count,
        )
        return with_final_newline(rendered, ends_with_newline)

    as_html = summarize_html_payload(body)
    if as_html is not None:
        rendered = format_html_summary(
            byte_count=as_html.byte_count,
            title=as_html.title,
        )
        return with_final_newline(rendered, ends_with_newline)

    as_js = summarize_minified_js_payload(body)
    if as_js is not None:
        rendered = format_minified_js_summary(byte_count=as_js.byte_count)
        return with_final_newline(rendered, ends_with_newline)

    return None
85
+
86
+
87
def _replace_line_payloads(text: str) -> str:
    """Swap each PEM/base64 line span in ``text`` for a one-line summary.

    Returns ``text`` itself when no span is found.
    """

    lines, final_newline = split_preserving_final_newline(text)
    spans = find_line_payload_spans(lines)
    if not spans:
        return text

    rebuilt: list[str] = []
    resume_at = 0
    for span in spans:
        # Keep the untouched lines before the span, then the summary line.
        rebuilt += lines[resume_at : span.start]
        rebuilt.append(_format_line_span(span))
        resume_at = span.end
    rebuilt += lines[resume_at:]

    return join_preserving_final_newline(rebuilt, final_newline)
102
+
103
+
104
def _replace_data_urls(text: str) -> str:
    """Swap each large data URL in ``text`` for a short summary.

    Returns ``text`` itself when no data URL span is found.
    """

    spans = find_data_url_spans(text)
    if not spans:
        return text

    pieces: list[str] = []
    resume_at = 0
    for span in spans:
        summary = format_data_url_summary(
            mime=span.mime,
            encoding=span.encoding,
            byte_count=span.byte_count,
        )
        pieces += [text[resume_at : span.start], summary]
        resume_at = span.end
    pieces.append(text[resume_at:])
    return "".join(pieces)
123
+
124
+
125
def _format_line_span(span: LinePayloadSpan) -> str:
    """Render the summary line for ``span``: the PEM format for ``pem``
    spans, the base64 format for every other kind."""

    if span.kind != "pem":
        return format_base64_summary(
            line_count=span.line_count,
            byte_count=span.byte_count,
        )

    # The PEM summary is only rendered for spans that carry a label.
    assert span.label is not None
    return format_pem_summary(
        label=span.label,
        line_count=span.line_count,
        byte_count=span.byte_count,
    )
137
+
138
+
139
def _choose_smaller(current: str, candidate: str) -> str:
    """Return ``candidate`` when it scores strictly lower than ``current``;
    ties keep ``current``."""

    return candidate if score(candidate) < score(current) else current