codetool-shell 0.1.1__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. codetool_shell/__init__.py +11 -0
  2. codetool_shell/api.py +59 -0
  3. codetool_shell/bin/windows-x86_64/codetool-shell-rust.exe +0 -0
  4. codetool_shell/filters/__init__.py +14 -0
  5. codetool_shell/filters/build_compiler/__init__.py +7 -0
  6. codetool_shell/filters/build_compiler/detector.py +412 -0
  7. codetool_shell/filters/build_compiler/reducer.py +166 -0
  8. codetool_shell/filters/build_compiler/summary.py +617 -0
  9. codetool_shell/filters/ci_job_log/__init__.py +7 -0
  10. codetool_shell/filters/ci_job_log/detector.py +64 -0
  11. codetool_shell/filters/ci_job_log/reducer.py +99 -0
  12. codetool_shell/filters/ci_job_log/summary.py +243 -0
  13. codetool_shell/filters/diff/__init__.py +7 -0
  14. codetool_shell/filters/diff/detector.py +136 -0
  15. codetool_shell/filters/diff/reducer.py +308 -0
  16. codetool_shell/filters/generic_log/__init__.py +7 -0
  17. codetool_shell/filters/generic_log/detector.py +175 -0
  18. codetool_shell/filters/generic_log/reducer.py +99 -0
  19. codetool_shell/filters/generic_log/summary.py +161 -0
  20. codetool_shell/filters/git.py +514 -0
  21. codetool_shell/filters/html_cleanup/__init__.py +7 -0
  22. codetool_shell/filters/html_cleanup/detector.py +136 -0
  23. codetool_shell/filters/html_cleanup/reducer.py +27 -0
  24. codetool_shell/filters/html_cleanup/summary.py +422 -0
  25. codetool_shell/filters/json_payload/__init__.py +7 -0
  26. codetool_shell/filters/json_payload/detector.py +62 -0
  27. codetool_shell/filters/json_payload/reducer.py +81 -0
  28. codetool_shell/filters/json_payload/summary.py +233 -0
  29. codetool_shell/filters/listing/__init__.py +7 -0
  30. codetool_shell/filters/listing/detector.py +294 -0
  31. codetool_shell/filters/listing/reducer.py +30 -0
  32. codetool_shell/filters/log_template/__init__.py +7 -0
  33. codetool_shell/filters/log_template/constants.py +76 -0
  34. codetool_shell/filters/log_template/detector.py +331 -0
  35. codetool_shell/filters/log_template/reducer.py +78 -0
  36. codetool_shell/filters/log_template/template.py +280 -0
  37. codetool_shell/filters/log_template/types.py +21 -0
  38. codetool_shell/filters/opaque_payload/__init__.py +7 -0
  39. codetool_shell/filters/opaque_payload/detector.py +563 -0
  40. codetool_shell/filters/opaque_payload/reducer.py +142 -0
  41. codetool_shell/filters/opaque_payload/summary.py +61 -0
  42. codetool_shell/filters/package_manager/__init__.py +7 -0
  43. codetool_shell/filters/package_manager/detector.py +220 -0
  44. codetool_shell/filters/package_manager/reducer.py +110 -0
  45. codetool_shell/filters/package_manager/summary.py +172 -0
  46. codetool_shell/filters/pipeline.py +65 -0
  47. codetool_shell/filters/rg.py +250 -0
  48. codetool_shell/filters/system_output/__init__.py +7 -0
  49. codetool_shell/filters/system_output/detector.py +600 -0
  50. codetool_shell/filters/system_output/reducer.py +331 -0
  51. codetool_shell/filters/system_output/summary.py +164 -0
  52. codetool_shell/filters/table/__init__.py +7 -0
  53. codetool_shell/filters/table/detector.py +244 -0
  54. codetool_shell/filters/table/reducer.py +57 -0
  55. codetool_shell/filters/table/summary.py +37 -0
  56. codetool_shell/filters/test_runner/__init__.py +7 -0
  57. codetool_shell/filters/test_runner/ansi.py +80 -0
  58. codetool_shell/filters/test_runner/detector.py +409 -0
  59. codetool_shell/filters/test_runner/reducer.py +288 -0
  60. codetool_shell/filters/test_runner/summary.py +449 -0
  61. codetool_shell/filters/text.py +38 -0
  62. codetool_shell/filters/traceback/__init__.py +7 -0
  63. codetool_shell/filters/traceback/detector.py +209 -0
  64. codetool_shell/filters/traceback/reducer.py +141 -0
  65. codetool_shell/filters/traceback/summary.py +122 -0
  66. codetool_shell/filters/tree.py +59 -0
  67. codetool_shell/py.typed +0 -0
  68. codetool_shell/python_backend.py +38 -0
  69. codetool_shell/rust_backend.py +254 -0
  70. codetool_shell-0.1.1.dist-info/METADATA +152 -0
  71. codetool_shell-0.1.1.dist-info/RECORD +72 -0
  72. codetool_shell-0.1.1.dist-info/WHEEL +4 -0
@@ -0,0 +1,136 @@
1
+ """Conservative full-document HTML detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+
8
# Bodies whose UTF-8 encoding is shorter than this many bytes are never
# reported as a full HTML document.
_MIN_HTML_BYTES = 4096
# Largest tolerated share of control characters (code point below 32, not
# counting "\n", "\r" and "\t") relative to the text length.
_MAX_CONTROL_RATIO = 0.01
10
+
11
+
12
@dataclass(frozen=True)
class HttpBody:
    """An HTTP response separated into its header block and its body.

    Attributes:
        headers: Text that precedes the blank-line separator.
        separator: The blank-line sequence that ended the header block.
        body: Text that follows the separator.
    """

    headers: str
    separator: str
    body: str
19
+
20
+
21
@dataclass(frozen=True)
class HtmlPayload:
    """A recognised HTML document plus any HTTP header block that wrapped it.

    Attributes:
        headers: HTTP header text, or an empty string when there was none.
        separator: Blank-line sequence between headers and body, or an empty
            string when there were no headers.
        body: The HTML document text.
    """

    headers: str
    separator: str
    body: str
28
+
29
+
30
def detect_html_payload(text: str) -> HtmlPayload | None:
    """Detect a full HTML document in ``text``, with or without HTTP headers.

    When ``split_http_response`` recognises an HTTP response, only the part
    after the header block is examined and the headers and separator are
    carried into the result. Otherwise the whole text is examined and both
    are empty strings.

    Returns:
        An ``HtmlPayload`` when the candidate body passes
        ``_is_clear_full_html_document``; ``None`` otherwise.
    """
    response = split_http_response(text)
    if response is None:
        headers, separator, body = "", "", text
    else:
        headers = response.headers
        separator = response.separator
        body = response.body

    if _is_clear_full_html_document(body):
        return HtmlPayload(headers=headers, separator=separator, body=body)
    return None
46
+
47
+
48
def split_http_response(text: str) -> HttpBody | None:
    """Split clear HTTP response headers from a body.

    The header block ends at the first blank line, written either as
    ``"\\r\\n\\r\\n"`` or as ``"\\n\\n"``. The text is accepted only when its
    first line starts with ``"HTTP/"`` and, if any further header lines exist,
    at least one of them contains a colon.

    Args:
        text: Raw output that may be an HTTP response.

    Returns:
        An ``HttpBody`` holding the header text, the separator that was found
        and the remaining body, or ``None`` when ``text`` does not look like
        an HTTP response.
    """
    crlf_end = text.find("\r\n\r\n")
    lf_end = text.find("\n\n")

    # Use whichever blank line occurs first. Preferring CRLF unconditionally
    # would cut LF-delimited headers in the middle of a body that merely
    # contains a CRLF blank line further down.
    if crlf_end >= 0 and (lf_end < 0 or crlf_end < lf_end):
        header_end, separator = crlf_end, "\r\n\r\n"
    else:
        header_end, separator = lf_end, "\n\n"

    # -1 means no blank line at all; 0 means there is no header text.
    if header_end <= 0:
        return None

    headers = text[:header_end]
    first_line, *header_lines = headers.replace("\r\n", "\n").split("\n")
    if not first_line.startswith("HTTP/"):
        return None

    # A bare status line is fine, but extra lines must include a "name: value".
    if header_lines and not any(":" in line for line in header_lines):
        return None

    return HttpBody(
        headers=headers,
        separator=separator,
        body=text[header_end + len(separator) :],
    )
74
+
75
+
76
def _is_clear_full_html_document(body: str) -> bool:
    """Return True only for text that is unambiguously one whole HTML page.

    The text must be at least ``_MIN_HTML_BYTES`` long in UTF-8, must not be
    control-character heavy, must (ignoring case and surrounding whitespace)
    start with a doctype or ``<html`` and end with ``</html>``, must contain
    ``<body`` before ``</body>`` before the final ``</html>``, and must not
    look like some other kind of source text.
    """
    if len(body.encode("utf-8")) < _MIN_HTML_BYTES or _is_control_heavy(body):
        return False

    document = body.strip().lower()
    if not document.startswith(("<!doctype html", "<html")):
        return False
    if not document.endswith("</html>"):
        return False
    for marker in ("<html", "</html>", "<body", "</body>"):
        if marker not in document:
            return False

    # First <body, first </body> and last </html> must appear in that order.
    body_open = document.find("<body")
    body_close = document.find("</body>")
    if body_open > body_close or body_close > document.rfind("</html>"):
        return False

    return not _looks_like_non_html_source(body)
99
+
100
+
101
def _is_control_heavy(text: str) -> bool:
    """Tell whether ``text`` carries too many control characters.

    Any NUL character is decisive. Otherwise characters with a code point
    below 32, other than newline, carriage return and tab, are counted and
    their share of the text length is compared with ``_MAX_CONTROL_RATIO``.
    """
    if "\x00" in text:
        return True

    tolerated = ("\n", "\r", "\t")
    control_count = 0
    for char in text:
        if ord(char) < 32 and char not in tolerated:
            control_count += 1

    # max(..., 1) keeps the division defined for empty input.
    return control_count / max(len(text), 1) > _MAX_CONTROL_RATIO
108
+
109
+
110
+ def _looks_like_non_html_source(text: str) -> bool:
111
+ lowered = text.lower()
112
+ if any(marker in text for marker in ("```", "{{", "{%", "<%", "<?php")):
113
+ return True
114
+ if any(
115
+ marker in text
116
+ for marker in (
117
+ "className=",
118
+ "export default",
119
+ "const App",
120
+ "function App(",
121
+ "React.",
122
+ "</>",
123
+ )
124
+ ):
125
+ return True
126
+ if "Traceback (most recent call last):" in text:
127
+ return True
128
+ if "\ndiff --git " in text or text.startswith("diff --git "):
129
+ return True
130
+ if "\n@@ " in text or "\n--- " in text or "\n+++ " in text:
131
+ return True
132
+ if "\n --> " in text and ("error:" in lowered or "warning:" in lowered):
133
+ return True
134
+ if lowered.startswith(("error[", "error:", "warning:")):
135
+ return True
136
+ return False
@@ -0,0 +1,27 @@
1
+ """Reduce full HTML documents to semantic visible content."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from ..text import score
6
+ from .detector import detect_html_payload
7
+ from .summary import format_html_summary, summarize_html_document
8
+
9
+
10
def compress_html_cleanup_output(text: str) -> str:
    """Swap a full HTML document for its summary when that scores lower.

    The input is returned untouched when no HTML payload is detected, when no
    summary can be produced, or when the summarised candidate does not score
    strictly below the original text. Detected HTTP headers and their
    separator are kept in front of the summary, and the summary ends with a
    newline exactly when the HTML body did.
    """
    payload = detect_html_payload(text)
    summary = None if payload is None else summarize_html_document(payload.body)
    if payload is None or summary is None:
        return text

    rendered = format_html_summary(
        summary,
        final_newline=payload.body.endswith("\n"),
    )
    candidate = "".join((payload.headers, payload.separator, rendered))
    return candidate if score(candidate) < score(text) else text
@@ -0,0 +1,422 @@
1
+ """Extract and format semantic HTML document summaries."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import html
6
+ import re
7
+ from dataclasses import dataclass
8
+
9
+
10
# --- Patterns for content that is stripped before extraction ---------------
# HTML comments, matched non-greedily and across lines.
_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
# A whole script/style/noscript/svg/template element, from its opening tag to
# the first closing tag of the same name.
_SKIP_TAG_RE = re.compile(
    r"<(script|style|noscript|svg|template)\b[^>]*>.*?</\1\s*>",
    re.IGNORECASE | re.DOTALL,
)

# --- Patterns for the pieces that make up the summary ----------------------
# Group 1 is the raw inner markup of <title>.
_TITLE_RE = re.compile(r"<title\b[^>]*>(.*?)</title\s*>", re.IGNORECASE | re.DOTALL)
_BODY_OPEN_RE = re.compile(r"<body\b[^>]*>", re.IGNORECASE)
_BODY_CLOSE_RE = re.compile(r"</body\s*>", re.IGNORECASE)
# Group 1 is the heading level digit, group 2 the raw inner markup.
_HEADING_RE = re.compile(
    r"<h([1-6])\b[^>]*>(.*?)</h\1\s*>", re.IGNORECASE | re.DOTALL
)
# Group 1 is the anchor's attribute text, group 2 its raw inner markup.
_ANCHOR_RE = re.compile(r"<a\b([^>]*)>(.*?)</a\s*>", re.IGNORECASE | re.DOTALL)
# href value in double quotes (group 1), single quotes (group 2) or
# unquoted (group 3).
_HREF_RE = re.compile(
    r"""href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))""",
    re.IGNORECASE,
)
# Any tag at all; used to strip remaining markup.
_TAG_RE = re.compile(r"<[^>]+>")
# Opening or closing tags that are turned into line breaks when text blocks
# are extracted.
_BLOCK_TAG_RE = re.compile(
    r"</?(?:address|article|aside|blockquote|br|dd|details|div|dl|dt|figcaption|"
    r"figure|footer|form|h[1-6]|header|hr|li|main|nav|ol|p|pre|section|table|"
    r"tbody|td|tfoot|th|thead|tr|ul)\b[^>]*>",
    re.IGNORECASE,
)
# A data: URL up to the next whitespace, quote, angle bracket, backtick or ")".
_DATA_URL_RE = re.compile(r"data:[^\s\"'<>`)]+", re.IGNORECASE)
# Only data: URLs at least this long are counted in the removal statistics.
_LONG_DATA_URL_MIN_CHARS = 128

# --- Length limits applied to individual summary items ---------------------
_MAX_TITLE_CHARS = 80
_MAX_HEADING_CHARS = 120
_MAX_LINK_LABEL_CHARS = 80
_MAX_HREF_CHARS = 160
_MAX_TEXT_BLOCK_CHARS = 240

# --- Limits on how many items are printed ----------------------------------
_MAX_HEADINGS = 8
_MAX_LINKS = 8
# Up to this many text blocks are all printed; beyond it only the first,
# last and a few salient blocks are kept.
_MAX_SMALL_TEXT_BLOCKS = 8
_FIRST_TEXT_BLOCKS = 3
_LAST_TEXT_BLOCKS = 2
_MAX_SALIENT_TEXT_BLOCKS = 3
# Lower-case substrings that mark a text block as salient.
_SALIENT_TERMS = (
    "error",
    "warning",
    "failed",
    "failure",
    "exception",
    "traceback",
    "fatal",
    "critical",
    "denied",
    "404",
    "500",
)
# Explicit URL schemes that are allowed through href normalisation.
_SAFE_URL_SCHEMES = {"http", "https", "mailto", "tel"}
# Named entities for newline, tab and colon in any letter case; they are
# replaced by hand before html.unescape runs.
_COMMON_HREF_ENTITY_RE = re.compile(
    r"&(newline|tab|colon);", re.IGNORECASE
)
63
+
64
+
65
@dataclass(frozen=True)
class RemovalCounts:
    """Tally of non-semantic HTML pieces found during cleanup.

    Every counter defaults to zero.
    """

    comments: int = 0
    scripts: int = 0
    styles: int = 0
    svg: int = 0
    data_urls: int = 0

    @property
    def any(self) -> bool:
        """True when at least one counter is non-zero."""
        return bool(
            self.comments
            or self.scripts
            or self.styles
            or self.svg
            or self.data_urls
        )
78
+
79
+
80
@dataclass(frozen=True)
class Link:
    """One anchor kept in the summary.

    Attributes:
        label: Text shown for the anchor.
        href: Link target that passed the safety filter.
    """

    label: str
    href: str
86
+
87
+
88
@dataclass(frozen=True)
class Heading:
    """One heading kept in the summary.

    Attributes:
        level: Heading level as text.
        text: Visible heading text.
    """

    level: str
    text: str
94
+
95
+
96
@dataclass(frozen=True)
class HtmlSummary:
    """Everything extracted from one HTML document.

    Attributes:
        byte_count: Size value reported for the document.
        title: Document title, or ``None`` when absent.
        headings: Headings that were found.
        links: Links that were found.
        text_blocks: Visible text blocks that were found.
        removals: Counts of stripped non-semantic content.
    """

    byte_count: int
    title: str | None
    headings: list[Heading]
    links: list[Link]
    text_blocks: list[str]
    removals: RemovalCounts

    @property
    def has_semantic_content(self) -> bool:
        """True when there is at least one heading, link or text block."""
        return any((self.headings, self.links, self.text_blocks))
110
+
111
+
112
def summarize_html_document(body: str) -> HtmlSummary | None:
    """Build the summary for one HTML document.

    Non-semantic markup is stripped first. The title is read from the cleaned
    document, while headings, links and text blocks are read from the part
    between the ``<body>`` tags.

    Returns:
        The summary, or ``None`` when no body section is found or when the
        body yields no headings, links or text blocks.
    """
    cleaned, removals = _remove_nonsemantic_html(body)
    body_html = _extract_body(cleaned)
    if body_html is None:
        return None

    summary = HtmlSummary(
        # Size of the document as received, before any cleanup.
        byte_count=len(body.encode("utf-8")),
        title=_extract_title(cleaned),
        headings=_extract_headings(body_html),
        links=_extract_links(body_html),
        text_blocks=_extract_text_blocks(body_html),
        removals=removals,
    )
    return summary if summary.has_semantic_content else None
136
+
137
+
138
def format_html_summary(summary: HtmlSummary, *, final_newline: bool) -> str:
    """Render ``summary`` as plain text lines.

    The first line gives the byte count and, when present, the quoted title.
    Optional ``headings:``, ``links:`` and ``text:`` sections follow, each
    limited in length with an "omitted" marker where items were dropped. A
    bracketed removal-statistics line closes the output when anything was
    removed.

    Args:
        summary: The extracted document summary.
        final_newline: Whether the returned text ends with a newline.
    """
    header = f"html document: bytes={summary.byte_count}"
    if summary.title:
        header += f" title={_quote(summary.title)}"
    lines = [header]

    if summary.headings:
        lines.append("headings:")
        lines.extend(
            f"- h{entry.level}: {entry.text}"
            for entry in summary.headings[:_MAX_HEADINGS]
        )
        hidden = len(summary.headings) - _MAX_HEADINGS
        if hidden > 0:
            lines.append(f"… {hidden} headings omitted")

    if summary.links:
        lines.append("links:")
        lines.extend(
            f"- {entry.label} -> {entry.href}"
            for entry in summary.links[:_MAX_LINKS]
        )
        hidden = len(summary.links) - _MAX_LINKS
        if hidden > 0:
            lines.append(f"… {hidden} links omitted")

    if summary.text_blocks:
        lines.append("text:")
        lines.extend(_format_text_blocks(summary.text_blocks))

    removed = summary.removals
    if removed.any:
        fields = ("comments", "scripts", "styles", "svg", "data_urls")
        detail = " ".join(f"{name}={getattr(removed, name)}" for name in fields)
        lines.append(f"[html cleanup removed: {detail}]")

    suffix = "\n" if final_newline else ""
    return "\n".join(lines) + suffix
178
+
179
+
180
def _remove_nonsemantic_html(text: str) -> tuple[str, RemovalCounts]:
    """Blank out comments and skipped elements, counting what was removed.

    Each comment and each element matched by ``_SKIP_TAG_RE`` is replaced by
    a single space. ``style`` and ``svg`` elements have their own counters;
    every other skipped element is counted under ``scripts``. Data URLs of at
    least ``_LONG_DATA_URL_MIN_CHARS`` characters are counted but left in the
    returned text.

    Returns:
        The cleaned text and the removal counts.
    """
    without_comments, comment_count = _COMMENT_RE.subn(" ", text)

    tally = {"script": 0, "style": 0, "svg": 0}

    def blank_out(match: re.Match[str]) -> str:
        tag_name = match.group(1).lower()
        bucket = tag_name if tag_name in ("style", "svg") else "script"
        tally[bucket] += 1
        return " "

    cleaned = _SKIP_TAG_RE.sub(blank_out, without_comments)

    long_data_urls = 0
    for found in _DATA_URL_RE.finditer(cleaned):
        if len(found.group(0)) >= _LONG_DATA_URL_MIN_CHARS:
            long_data_urls += 1

    counts = RemovalCounts(
        comments=comment_count,
        scripts=tally["script"],
        styles=tally["style"],
        svg=tally["svg"],
        data_urls=long_data_urls,
    )
    return cleaned, counts
211
+
212
+
213
def _extract_title(text: str) -> str | None:
    """Return the visible ``<title>`` text, truncated, or ``None``.

    ``None`` is returned both when there is no title element and when its
    visible text is empty.
    """
    found = _TITLE_RE.search(text)
    title = _visible_text(found.group(1)) if found is not None else ""
    return _truncate(title, _MAX_TITLE_CHARS) if title else None
221
+
222
+
223
def _extract_body(text: str) -> str | None:
    """Return the markup between the first ``<body>`` tag and the next
    ``</body>`` tag, or ``None`` when either tag is missing."""
    opening = _BODY_OPEN_RE.search(text)
    if opening is None:
        return None

    start = opening.end()
    closing = _BODY_CLOSE_RE.search(text, start)
    return None if closing is None else text[start : closing.start()]
231
+
232
+
233
def _extract_headings(body_html: str) -> list[Heading]:
    """Collect distinct, non-empty headings in document order.

    Heading text is reduced to visible text and truncated to
    ``_MAX_HEADING_CHARS``. A heading whose level and truncated text were
    already seen is skipped.
    """
    # Dict insertion order doubles as first-seen order.
    unique: dict[tuple[str, str], Heading] = {}
    for found in _HEADING_RE.finditer(body_html):
        level, inner = found.group(1), found.group(2)
        visible = _visible_text(inner)
        if visible:
            entry = Heading(level, _truncate(visible, _MAX_HEADING_CHARS))
            unique.setdefault((entry.level, entry.text), entry)
    return list(unique.values())
247
+
248
+
249
def _extract_links(body_html: str) -> list[Link]:
    """Collect distinct anchors with an acceptable href, in document order.

    Anchors without an href, or whose href is rejected by
    ``_normalize_safe_href``, are skipped. An anchor with no visible text
    uses its href as the label. Label and href are truncated, and repeated
    (label, href) pairs are dropped.
    """
    # Dict insertion order doubles as first-seen order.
    unique: dict[tuple[str, str], Link] = {}
    for anchor in _ANCHOR_RE.finditer(body_html):
        attrs, inner = anchor.group(1), anchor.group(2)
        raw = _extract_href(attrs)
        if raw is None:
            continue
        href = _normalize_safe_href(raw)
        if not href:
            continue

        label = _visible_text(inner) or href
        entry = Link(
            label=_truncate(label, _MAX_LINK_LABEL_CHARS),
            href=_truncate(href, _MAX_HREF_CHARS),
        )
        unique.setdefault((entry.label, entry.href), entry)
    return list(unique.values())
270
+
271
+
272
def _extract_href(attrs: str) -> str | None:
    """Return the stripped href value found in an attribute string.

    ``None`` is returned when the attributes contain no href.
    """
    found = _HREF_RE.search(attrs)
    if found is None:
        return None
    # Only one of the quoting alternatives can have matched.
    return next(
        (value.strip() for value in found.groups() if value is not None),
        None,
    )
280
+
281
+
282
def _normalize_safe_href(raw_href: str) -> str | None:
    """Decode an href and keep it only if its scheme is acceptable.

    An href with no explicit scheme is kept. An href with an explicit scheme
    is kept only when that scheme is in ``_SAFE_URL_SCHEMES``. Data URLs in
    the surviving value are replaced by a placeholder.

    Returns:
        The cleaned href, or ``None`` when it is empty or rejected.
    """
    candidate = _decode_href_entities(raw_href).strip()
    if not candidate:
        return None

    scheme = _normalized_explicit_scheme(candidate)
    has_explicit_scheme = scheme is not None
    if has_explicit_scheme and scheme not in _SAFE_URL_SCHEMES:
        return None

    cleaned = _sanitize_snippet(candidate).strip()
    return cleaned if cleaned else None
293
+
294
+
295
def _decode_href_entities(value: str) -> str:
    """Expand character references in an href before scheme detection.

    Newline, tab and colon entities are replaced first, in any letter case,
    and the result is then passed through ``html.unescape``.
    """
    substitutions = {"newline": "\n", "tab": "\t", "colon": ":"}

    def expand(match: re.Match[str]) -> str:
        return substitutions.get(match.group(1).lower(), ":")

    pre_decoded = _COMMON_HREF_ENTITY_RE.sub(expand, value)
    return html.unescape(pre_decoded)
307
+
308
+
309
def _normalized_explicit_scheme(href: str) -> str | None:
    """Work out which explicit URL scheme, if any, an href starts with.

    Returns:
        ``None`` when the href is empty, starts with ``#``, or reaches ``/``,
        ``?`` or ``#`` (or its end) before any colon. Otherwise the text
        before the first colon is returned, lower-cased and with
        whitespace/control characters removed, if it has the shape of a URL
        scheme. An empty string is returned when a colon is present but that
        prefix is empty or malformed.
    """
    trimmed = href.lstrip()
    if not trimmed or trimmed.startswith("#"):
        return None

    for position, char in enumerate(trimmed):
        if char in "/?#":
            return None
        if char == ":":
            break
    else:
        # No colon anywhere: there is no scheme.
        return None

    # Whitespace and control characters are removed before the scheme is
    # inspected, so a prefix broken up by them is still recognised.
    scheme = "".join(
        piece
        for piece in trimmed[:position]
        if not _is_ascii_whitespace_or_control(piece)
    ).lower()
    if scheme and re.fullmatch(r"[a-z][a-z0-9+.-]*", scheme):
        return scheme
    return ""
333
+
334
+
335
+ def _is_ascii_whitespace_or_control(char: str) -> bool:
336
+ codepoint = ord(char)
337
+ return codepoint <= 32 or codepoint == 127
338
+
339
+
340
def _extract_text_blocks(body_html: str) -> list[str]:
    """Split body markup into distinct visible text blocks.

    Block-level tags become line breaks and all other tags become spaces.
    Each resulting line is normalised, dropped when empty or not semantic,
    truncated to ``_MAX_TEXT_BLOCK_CHARS`` and de-duplicated, with first-seen
    order preserved.
    """
    broken_into_lines = _BLOCK_TAG_RE.sub("\n", body_html)
    plain = _TAG_RE.sub(" ", broken_into_lines)

    # Dict keys give ordered de-duplication of the truncated blocks.
    unique: dict[str, None] = {}
    for raw_line in plain.splitlines():
        candidate = _normalize_visible_text(raw_line)
        if candidate and _is_semantic_text_block(candidate):
            unique.setdefault(_truncate(candidate, _MAX_TEXT_BLOCK_CHARS), None)
    return list(unique)
355
+
356
+
357
def _format_text_blocks(blocks: list[str]) -> list[str]:
    """Render text blocks as bullet lines, eliding the middle of long lists.

    Lists of up to ``_MAX_SMALL_TEXT_BLOCKS`` blocks are printed in full.
    Longer lists keep the first ``_FIRST_TEXT_BLOCKS`` blocks, the last
    ``_LAST_TEXT_BLOCKS`` blocks and up to ``_MAX_SALIENT_TEXT_BLOCKS``
    salient blocks from in between. Every run of skipped blocks is replaced
    by one "omitted" marker line.
    """
    total = len(blocks)
    if total <= _MAX_SMALL_TEXT_BLOCKS:
        return [f"- {block}" for block in blocks]

    head = set(range(_FIRST_TEXT_BLOCKS))
    tail = set(range(max(_FIRST_TEXT_BLOCKS, total - _LAST_TEXT_BLOCKS), total))
    keep = head | tail

    salient_middle = [
        position
        for position, block in enumerate(blocks)
        if position not in keep and _is_salient_text(block)
    ]
    keep.update(salient_middle[:_MAX_SALIENT_TEXT_BLOCKS])

    rendered: list[str] = []
    cursor = 0  # index of the first block not yet printed or reported
    for position in sorted(keep):
        gap = position - cursor
        if gap > 0:
            rendered.append(f"… {gap} text blocks omitted")
        rendered.append(f"- {blocks[position]}")
        cursor = position + 1

    trailing = total - cursor
    if trailing > 0:
        rendered.append(f"… {trailing} text blocks omitted")
    return rendered
383
+
384
+
385
def _visible_text(fragment: str) -> str:
    """Replace every tag in ``fragment`` with a space, then normalise."""
    without_tags = _TAG_RE.sub(" ", fragment)
    return _normalize_visible_text(without_tags)
387
+
388
+
389
def _normalize_visible_text(text: str) -> str:
    """Unescape entities, collapse whitespace runs, trim, and mask data URLs."""
    decoded = html.unescape(text)
    single_spaced = re.sub(r"\s+", " ", decoded)
    return _sanitize_snippet(single_spaced.strip())
393
+
394
+
395
def _sanitize_snippet(text: str) -> str:
    """Replace every data URL in ``text`` with a fixed placeholder."""
    placeholder = "[data-url omitted]"
    return _DATA_URL_RE.sub(placeholder, text)
397
+
398
+
399
def _is_semantic_text_block(text: str) -> bool:
    """Decide whether a text block is worth keeping.

    A block needs at least three alphanumeric characters, and either two or
    more words or a salient term.
    """
    alnum_total = 0
    for char in text:
        if char.isalnum():
            alnum_total += 1
    if alnum_total < 3:
        return False

    word_total = len([piece for piece in re.split(r"\W+", text) if piece])
    return word_total >= 2 or _is_salient_text(text)
407
+
408
+
409
def _is_salient_text(text: str) -> bool:
    """True when ``text`` contains any salient term, ignoring case."""
    haystack = text.lower()
    for term in _SALIENT_TERMS:
        if term in haystack:
            return True
    return False
412
+
413
+
414
+ def _truncate(text: str, max_chars: int) -> str:
415
+ if len(text) <= max_chars:
416
+ return text
417
+ return f"{text[: max_chars - 1]}…"
418
+
419
+
420
+ def _quote(value: str) -> str:
421
+ escaped = value.replace("\\", "\\\\").replace('"', '\\"')
422
+ return f'"{escaped}"'
@@ -0,0 +1,7 @@
1
+ """JSON and JSONL payload compression."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .reducer import compress_json_payload_output
6
+
7
+ __all__ = ["compress_json_payload_output"]
@@ -0,0 +1,62 @@
1
+ """Conservative JSON/JSONL payload detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import dataclass
7
+ from typing import Any
8
+
9
+
10
@dataclass(frozen=True)
class JsonlPayload:
    """The object records parsed from a JSONL payload.

    Attributes:
        records: One parsed JSON object per record.
    """

    records: tuple[dict[str, Any], ...]
15
+
16
+
17
def parse_whole_json_payload(text: str) -> Any | None:
    """Return a parsed whole-output JSON object/array, or ``None``.

    The stripped text must start and end like a JSON object or array and
    must parse as strict JSON; non-standard constants are rejected through
    the ``parse_constant`` hook.

    Args:
        text: Raw output that may consist of a single JSON document.

    Returns:
        The parsed ``dict`` or ``list``, or ``None`` when the text is not a
        JSON container, fails to parse, or is nested too deeply to parse.
    """
    body = text.strip()
    if not _looks_like_json_container(body):
        return None
    try:
        value = json.loads(body, parse_constant=_reject_json_constant)
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError (its subclass) and the
        # rejection raised by the parse_constant hook. RecursionError comes
        # from pathologically nested input, which must not crash detection.
        return None
    return value if isinstance(value, (dict, list)) else None
30
+
31
+
32
def parse_jsonl_payload(lines: list[str]) -> JsonlPayload | None:
    """Return parsed object-record JSONL when every non-empty line is JSON.

    Blank lines are skipped. Every other line must be a strict JSON object;
    non-standard constants are rejected through the ``parse_constant`` hook.

    Args:
        lines: The candidate output split into lines.

    Returns:
        A ``JsonlPayload`` with one record per non-empty line, or ``None``
        when any line is not a JSON object, is nested too deeply to parse, or
        when there are no records at all.
    """
    records: list[dict[str, Any]] = []
    for line in lines:
        body = line.strip()
        if not body:
            continue
        # Cheap shape check before paying for a full parse.
        if not (body.startswith("{") and body.endswith("}")):
            return None
        try:
            value = json.loads(body, parse_constant=_reject_json_constant)
        except (ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError (its subclass) and the
            # rejection raised by the parse_constant hook. RecursionError
            # comes from pathologically nested input.
            return None
        if not isinstance(value, dict):
            return None
        records.append(value)

    if not records:
        return None
    return JsonlPayload(tuple(records))
53
+
54
+
55
+ def _looks_like_json_container(body: str) -> bool:
56
+ return (body.startswith("{") and body.endswith("}")) or (
57
+ body.startswith("[") and body.endswith("]")
58
+ )
59
+
60
+
61
+ def _reject_json_constant(value: str) -> None:
62
+ raise ValueError(f"non-standard JSON constant {value}")