codetool-shell 0.1.1__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codetool_shell/__init__.py +11 -0
- codetool_shell/api.py +59 -0
- codetool_shell/bin/windows-x86_64/codetool-shell-rust.exe +0 -0
- codetool_shell/filters/__init__.py +14 -0
- codetool_shell/filters/build_compiler/__init__.py +7 -0
- codetool_shell/filters/build_compiler/detector.py +412 -0
- codetool_shell/filters/build_compiler/reducer.py +166 -0
- codetool_shell/filters/build_compiler/summary.py +617 -0
- codetool_shell/filters/ci_job_log/__init__.py +7 -0
- codetool_shell/filters/ci_job_log/detector.py +64 -0
- codetool_shell/filters/ci_job_log/reducer.py +99 -0
- codetool_shell/filters/ci_job_log/summary.py +243 -0
- codetool_shell/filters/diff/__init__.py +7 -0
- codetool_shell/filters/diff/detector.py +136 -0
- codetool_shell/filters/diff/reducer.py +308 -0
- codetool_shell/filters/generic_log/__init__.py +7 -0
- codetool_shell/filters/generic_log/detector.py +175 -0
- codetool_shell/filters/generic_log/reducer.py +99 -0
- codetool_shell/filters/generic_log/summary.py +161 -0
- codetool_shell/filters/git.py +514 -0
- codetool_shell/filters/html_cleanup/__init__.py +7 -0
- codetool_shell/filters/html_cleanup/detector.py +136 -0
- codetool_shell/filters/html_cleanup/reducer.py +27 -0
- codetool_shell/filters/html_cleanup/summary.py +422 -0
- codetool_shell/filters/json_payload/__init__.py +7 -0
- codetool_shell/filters/json_payload/detector.py +62 -0
- codetool_shell/filters/json_payload/reducer.py +81 -0
- codetool_shell/filters/json_payload/summary.py +233 -0
- codetool_shell/filters/listing/__init__.py +7 -0
- codetool_shell/filters/listing/detector.py +294 -0
- codetool_shell/filters/listing/reducer.py +30 -0
- codetool_shell/filters/log_template/__init__.py +7 -0
- codetool_shell/filters/log_template/constants.py +76 -0
- codetool_shell/filters/log_template/detector.py +331 -0
- codetool_shell/filters/log_template/reducer.py +78 -0
- codetool_shell/filters/log_template/template.py +280 -0
- codetool_shell/filters/log_template/types.py +21 -0
- codetool_shell/filters/opaque_payload/__init__.py +7 -0
- codetool_shell/filters/opaque_payload/detector.py +563 -0
- codetool_shell/filters/opaque_payload/reducer.py +142 -0
- codetool_shell/filters/opaque_payload/summary.py +61 -0
- codetool_shell/filters/package_manager/__init__.py +7 -0
- codetool_shell/filters/package_manager/detector.py +220 -0
- codetool_shell/filters/package_manager/reducer.py +110 -0
- codetool_shell/filters/package_manager/summary.py +172 -0
- codetool_shell/filters/pipeline.py +65 -0
- codetool_shell/filters/rg.py +250 -0
- codetool_shell/filters/system_output/__init__.py +7 -0
- codetool_shell/filters/system_output/detector.py +600 -0
- codetool_shell/filters/system_output/reducer.py +331 -0
- codetool_shell/filters/system_output/summary.py +164 -0
- codetool_shell/filters/table/__init__.py +7 -0
- codetool_shell/filters/table/detector.py +244 -0
- codetool_shell/filters/table/reducer.py +57 -0
- codetool_shell/filters/table/summary.py +37 -0
- codetool_shell/filters/test_runner/__init__.py +7 -0
- codetool_shell/filters/test_runner/ansi.py +80 -0
- codetool_shell/filters/test_runner/detector.py +409 -0
- codetool_shell/filters/test_runner/reducer.py +288 -0
- codetool_shell/filters/test_runner/summary.py +449 -0
- codetool_shell/filters/text.py +38 -0
- codetool_shell/filters/traceback/__init__.py +7 -0
- codetool_shell/filters/traceback/detector.py +209 -0
- codetool_shell/filters/traceback/reducer.py +141 -0
- codetool_shell/filters/traceback/summary.py +122 -0
- codetool_shell/filters/tree.py +59 -0
- codetool_shell/py.typed +0 -0
- codetool_shell/python_backend.py +38 -0
- codetool_shell/rust_backend.py +254 -0
- codetool_shell-0.1.1.dist-info/METADATA +152 -0
- codetool_shell-0.1.1.dist-info/RECORD +72 -0
- codetool_shell-0.1.1.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Conservative full-document HTML detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Minimum UTF-8 encoded size, in bytes, that a body must reach before it is
# considered a full HTML document.
_MIN_HTML_BYTES = 4096
# Maximum fraction of characters that may be control characters (other than
# newline, carriage return, and tab) before text counts as control-heavy.
_MAX_CONTROL_RATIO = 0.01
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass(frozen=True)
class HttpBody:
    """HTTP response headers split from a body.

    Instances are immutable (``frozen=True``).
    """

    # Text before the blank-line separator: the status line plus header lines.
    headers: str
    # The separator that ended the header section ("\r\n\r\n" or "\n\n").
    separator: str
    # Everything after the separator.
    body: str
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class HtmlPayload:
    """A detected HTML document, optionally wrapped by HTTP headers.

    Instances are immutable (``frozen=True``).
    """

    # HTTP header text preceding the document; "" when no HTTP wrapper was found.
    headers: str
    # Separator between headers and body; "" when no HTTP wrapper was found.
    separator: str
    # The HTML document text itself.
    body: str
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def detect_html_payload(text: str) -> HtmlPayload | None:
    """Detect a clear, complete HTML document in ``text``.

    The text is first checked for an HTTP response wrapper. When one is
    found, only the part after the headers is examined; otherwise the whole
    text is treated as the candidate document.

    Returns:
        An ``HtmlPayload`` carrying the headers, separator and body (headers
        and separator are empty strings when there is no HTTP wrapper), or
        ``None`` when the body is not a clear full HTML document.
    """
    response = split_http_response(text)
    if response is None:
        headers, separator, body = "", "", text
    else:
        headers = response.headers
        separator = response.separator
        body = response.body

    if _is_clear_full_html_document(body):
        return HtmlPayload(headers=headers, separator=separator, body=body)
    return None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def split_http_response(text: str) -> HttpBody | None:
    """Split clear HTTP response headers from a body.

    The header section ends at the *earliest* blank-line separator, whether
    that is ``"\\r\\n\\r\\n"`` or ``"\\n\\n"``. Choosing the earliest one keeps
    body text out of the headers when LF-terminated headers are followed by
    a body that happens to contain a CRLF blank line.

    Args:
        text: Raw output that may start with an HTTP status line.

    Returns:
        An ``HttpBody`` with the header text, the separator that was found
        and the remaining body, or ``None`` when the text has no separator,
        does not start with ``HTTP/``, or has follow-up header lines none of
        which contains a ``":"``.
    """
    header_end = -1
    separator = ""
    for candidate in ("\r\n\r\n", "\n\n"):
        index = text.find(candidate)
        # Index 0 would mean an empty header section, which is never valid.
        if index > 0 and (header_end < 0 or index < header_end):
            header_end, separator = index, candidate
    if header_end < 0:
        return None

    headers = text[:header_end]
    lines = headers.replace("\r\n", "\n").split("\n")
    if not lines[0].startswith("HTTP/"):
        return None

    # A bare status line is accepted; once more lines follow, at least one
    # of them must look like a "Name: value" header.
    header_lines = lines[1:]
    if header_lines and not any(":" in line for line in header_lines):
        return None

    return HttpBody(
        headers=headers,
        separator=separator,
        body=text[header_end + len(separator) :],
    )
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _is_clear_full_html_document(body: str) -> bool:
    """Return True only when ``body`` is unambiguously a whole HTML document.

    The body must be at least ``_MIN_HTML_BYTES`` long when UTF-8 encoded,
    must not be control-heavy, must (after stripping, case-insensitively)
    start with a doctype or ``<html`` and end with ``</html>``, must contain
    html/body open and close markers in a sensible order, and must not look
    like some other kind of source text.
    """
    if len(body.encode("utf-8")) < _MIN_HTML_BYTES or _is_control_heavy(body):
        return False

    document = body.strip().lower()
    if not document.startswith(("<!doctype html", "<html")):
        return False
    if not document.endswith("</html>"):
        return False

    required_markers = ("<html", "</html>", "<body", "</body>")
    if any(marker not in document for marker in required_markers):
        return False

    # The first <body must not come after the first </body>, and that
    # </body> must not come after the last </html>.
    body_open = document.find("<body")
    body_close = document.find("</body>")
    if body_open > body_close:
        return False
    if body_close > document.rfind("</html>"):
        return False

    return not _looks_like_non_html_source(body)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _is_control_heavy(text: str) -> bool:
    """Return True when ``text`` contains NUL or too many control characters.

    Characters below U+0020 other than newline, carriage return and tab are
    counted; the text is control-heavy when their share of all characters
    exceeds ``_MAX_CONTROL_RATIO``.
    """
    if "\x00" in text:
        return True

    tolerated = "\n\r\t"
    control_count = 0
    for char in text:
        if char < " " and char not in tolerated:
            control_count += 1

    # max(..., 1) keeps the division defined for empty text.
    return control_count / max(len(text), 1) > _MAX_CONTROL_RATIO
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def _looks_like_non_html_source(text: str) -> bool:
|
|
111
|
+
lowered = text.lower()
|
|
112
|
+
if any(marker in text for marker in ("```", "{{", "{%", "<%", "<?php")):
|
|
113
|
+
return True
|
|
114
|
+
if any(
|
|
115
|
+
marker in text
|
|
116
|
+
for marker in (
|
|
117
|
+
"className=",
|
|
118
|
+
"export default",
|
|
119
|
+
"const App",
|
|
120
|
+
"function App(",
|
|
121
|
+
"React.",
|
|
122
|
+
"</>",
|
|
123
|
+
)
|
|
124
|
+
):
|
|
125
|
+
return True
|
|
126
|
+
if "Traceback (most recent call last):" in text:
|
|
127
|
+
return True
|
|
128
|
+
if "\ndiff --git " in text or text.startswith("diff --git "):
|
|
129
|
+
return True
|
|
130
|
+
if "\n@@ " in text or "\n--- " in text or "\n+++ " in text:
|
|
131
|
+
return True
|
|
132
|
+
if "\n --> " in text and ("error:" in lowered or "warning:" in lowered):
|
|
133
|
+
return True
|
|
134
|
+
if lowered.startswith(("error[", "error:", "warning:")):
|
|
135
|
+
return True
|
|
136
|
+
return False
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Reduce full HTML documents to semantic visible content."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from ..text import score
|
|
6
|
+
from .detector import detect_html_payload
|
|
7
|
+
from .summary import format_html_summary, summarize_html_document
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def compress_html_cleanup_output(text: str) -> str:
    """Replace a clear full HTML document with a compact summary.

    The input is returned unchanged when no HTML payload is detected, when
    no summary can be produced, or when the summary does not score strictly
    lower than the input according to ``score``. Any detected HTTP headers
    and their separator are kept in front of the summary, and the summary
    ends with a newline exactly when the original body did.
    """
    detected = detect_html_payload(text)
    if detected is None:
        return text

    document = summarize_html_document(detected.body)
    if document is None:
        return text

    rendered = format_html_summary(
        document,
        final_newline=detected.body.endswith("\n"),
    )
    compressed = f"{detected.headers}{detected.separator}" + rendered
    return compressed if score(compressed) < score(text) else text
|
|
@@ -0,0 +1,422 @@
|
|
|
1
|
+
"""Extract and format semantic HTML document summaries."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import html
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
|
|
11
|
+
_SKIP_TAG_RE = re.compile(
|
|
12
|
+
r"<(script|style|noscript|svg|template)\b[^>]*>.*?</\1\s*>",
|
|
13
|
+
re.IGNORECASE | re.DOTALL,
|
|
14
|
+
)
|
|
15
|
+
_TITLE_RE = re.compile(r"<title\b[^>]*>(.*?)</title\s*>", re.IGNORECASE | re.DOTALL)
|
|
16
|
+
_BODY_OPEN_RE = re.compile(r"<body\b[^>]*>", re.IGNORECASE)
|
|
17
|
+
_BODY_CLOSE_RE = re.compile(r"</body\s*>", re.IGNORECASE)
|
|
18
|
+
_HEADING_RE = re.compile(
|
|
19
|
+
r"<h([1-6])\b[^>]*>(.*?)</h\1\s*>", re.IGNORECASE | re.DOTALL
|
|
20
|
+
)
|
|
21
|
+
_ANCHOR_RE = re.compile(r"<a\b([^>]*)>(.*?)</a\s*>", re.IGNORECASE | re.DOTALL)
|
|
22
|
+
_HREF_RE = re.compile(
|
|
23
|
+
r"""href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))""",
|
|
24
|
+
re.IGNORECASE,
|
|
25
|
+
)
|
|
26
|
+
_TAG_RE = re.compile(r"<[^>]+>")
|
|
27
|
+
_BLOCK_TAG_RE = re.compile(
|
|
28
|
+
r"</?(?:address|article|aside|blockquote|br|dd|details|div|dl|dt|figcaption|"
|
|
29
|
+
r"figure|footer|form|h[1-6]|header|hr|li|main|nav|ol|p|pre|section|table|"
|
|
30
|
+
r"tbody|td|tfoot|th|thead|tr|ul)\b[^>]*>",
|
|
31
|
+
re.IGNORECASE,
|
|
32
|
+
)
|
|
33
|
+
_DATA_URL_RE = re.compile(r"data:[^\s\"'<>`)]+", re.IGNORECASE)
|
|
34
|
+
# A data URL is counted as "removed" only when it is at least this long.
_LONG_DATA_URL_MIN_CHARS = 128
# Character limits applied when truncating extracted snippets.
_MAX_TITLE_CHARS = 80
_MAX_HEADING_CHARS = 120
_MAX_LINK_LABEL_CHARS = 80
_MAX_HREF_CHARS = 160
_MAX_TEXT_BLOCK_CHARS = 240
# Item-count limits applied when formatting the summary.
_MAX_HEADINGS = 8
_MAX_LINKS = 8
# Up to this many text blocks are printed in full; beyond that only the
# first/last blocks plus a few salient ones are shown.
_MAX_SMALL_TEXT_BLOCKS = 8
_FIRST_TEXT_BLOCKS = 3
_LAST_TEXT_BLOCKS = 2
_MAX_SALIENT_TEXT_BLOCKS = 3
# Lowercase substrings that mark a text block as salient.
_SALIENT_TERMS = (
    "error",
    "warning",
    "failed",
    "failure",
    "exception",
    "traceback",
    "fatal",
    "critical",
    "denied",
    "404",
    "500",
)
# Explicit URL schemes that are allowed through href normalization.
_SAFE_URL_SCHEMES = {"http", "https", "mailto", "tel"}
# Named entities for newline, tab and colon in any letter case; these are
# replaced by hand before ``html.unescape`` runs on an href.
_COMMON_HREF_ENTITY_RE = re.compile(
    r"&(newline|tab|colon);", re.IGNORECASE
)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
@dataclass(frozen=True)
class RemovalCounts:
    """Immutable tally of non-semantic HTML pieces dropped during cleanup.

    Every counter defaults to zero.
    """

    comments: int = 0
    scripts: int = 0
    styles: int = 0
    svg: int = 0
    data_urls: int = 0

    @property
    def any(self) -> bool:
        """Whether at least one counter is non-zero."""
        return bool(
            self.comments
            or self.scripts
            or self.styles
            or self.svg
            or self.data_urls
        )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
@dataclass(frozen=True)
class Link:
    """A visible anchor label and safe href.

    Instances are immutable (``frozen=True``).
    """

    # Visible anchor text; link extraction falls back to the href when the
    # anchor has no visible text.
    label: str
    # Link target after safe-href normalization.
    href: str
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@dataclass(frozen=True)
class Heading:
    """A visible heading.

    Instances are immutable (``frozen=True``).
    """

    # Heading level as the digit captured from the tag name ("1" to "6").
    level: str
    # Visible heading text.
    text: str
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@dataclass(frozen=True)
class HtmlSummary:
    """Immutable bundle of the semantic content pulled out of an HTML page."""

    byte_count: int
    title: str | None
    headings: list[Heading]
    links: list[Link]
    text_blocks: list[str]
    removals: RemovalCounts

    @property
    def has_semantic_content(self) -> bool:
        """Whether any heading, link or text block was extracted."""
        return any((self.headings, self.links, self.text_blocks))
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def summarize_html_document(body: str) -> HtmlSummary | None:
    """Build a bounded semantic summary of a full HTML document.

    Comments and skipped elements are removed first; the title is read from
    the cleaned document, and headings, links and text blocks from its
    ``<body>`` section.

    Returns:
        The summary, or ``None`` when no body section can be located or when
        no heading, link or text block was extracted. ``byte_count`` is the
        UTF-8 size of the original ``body`` argument.
    """
    stripped_html, removed = _remove_nonsemantic_html(body)
    page_title = _extract_title(stripped_html)

    inner_html = _extract_body(stripped_html)
    if inner_html is None:
        return None

    result = HtmlSummary(
        byte_count=len(body.encode("utf-8")),
        title=page_title,
        headings=_extract_headings(inner_html),
        links=_extract_links(inner_html),
        text_blocks=_extract_text_blocks(inner_html),
        removals=removed,
    )
    return result if result.has_semantic_content else None
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def format_html_summary(summary: HtmlSummary, *, final_newline: bool) -> str:
    """Render ``summary`` as a stable, line-oriented text report.

    The report starts with a header line (byte count and optional quoted
    title), followed by optional ``headings:``, ``links:`` and ``text:``
    sections and, when anything was removed, a bracketed removal tally.
    Headings and links are capped at ``_MAX_HEADINGS`` and ``_MAX_LINKS``
    with an "omitted" line for the rest.

    Args:
        summary: The extracted document summary.
        final_newline: Append a trailing newline to the report when true.
    """
    header = f"html document: bytes={summary.byte_count}"
    if summary.title:
        header += f" title={_quote(summary.title)}"
    rendered = [header]

    if summary.headings:
        rendered.append("headings:")
        rendered.extend(
            f"- h{item.level}: {item.text}"
            for item in summary.headings[:_MAX_HEADINGS]
        )
        hidden = len(summary.headings) - _MAX_HEADINGS
        if hidden > 0:
            rendered.append(f"… {hidden} headings omitted")

    if summary.links:
        rendered.append("links:")
        rendered.extend(
            f"- {item.label} -> {item.href}" for item in summary.links[:_MAX_LINKS]
        )
        hidden = len(summary.links) - _MAX_LINKS
        if hidden > 0:
            rendered.append(f"… {hidden} links omitted")

    if summary.text_blocks:
        rendered.append("text:")
        rendered += _format_text_blocks(summary.text_blocks)

    removed = summary.removals
    if removed.any:
        rendered.append(
            f"[html cleanup removed: comments={removed.comments} "
            f"scripts={removed.scripts} styles={removed.styles} "
            f"svg={removed.svg} data_urls={removed.data_urls}]"
        )

    return "\n".join(rendered) + ("\n" if final_newline else "")
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _remove_nonsemantic_html(text: str) -> tuple[str, RemovalCounts]:
    """Strip comments and skipped elements, tallying what was removed.

    Each comment and each script/style/noscript/svg/template element is
    replaced by a single space. ``style`` and ``svg`` elements get their own
    counters; every other skipped element is counted under ``scripts``.
    Long data URLs left in the cleaned text are counted but not removed
    here.

    Returns:
        The cleaned text and the removal counters.
    """
    without_comments, comment_total = _COMMENT_RE.subn(" ", text)

    tally = {"scripts": 0, "styles": 0, "svg": 0}
    bucket_for_tag = {"style": "styles", "svg": "svg"}

    def drop_element(match: re.Match[str]) -> str:
        bucket = bucket_for_tag.get(match.group(1).lower(), "scripts")
        tally[bucket] += 1
        return " "

    cleaned = _SKIP_TAG_RE.sub(drop_element, without_comments)

    long_data_urls = 0
    for found in _DATA_URL_RE.finditer(cleaned):
        if len(found.group(0)) >= _LONG_DATA_URL_MIN_CHARS:
            long_data_urls += 1

    counts = RemovalCounts(
        comment_total,
        tally["scripts"],
        tally["styles"],
        tally["svg"],
        long_data_urls,
    )
    return cleaned, counts
|
|
211
|
+
|
|
212
|
+
|
|
213
|
+
def _extract_title(text: str) -> str | None:
    """Return the visible text of the first ``<title>``, truncated.

    Returns ``None`` when there is no title element or its visible text is
    empty.
    """
    found = _TITLE_RE.search(text)
    visible = "" if found is None else _visible_text(found.group(1))
    if visible:
        return _truncate(visible, _MAX_TITLE_CHARS)
    return None
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _extract_body(text: str) -> str | None:
    """Return the markup between the first ``<body ...>`` and the next ``</body>``.

    Returns ``None`` when either tag is missing.
    """
    opening = _BODY_OPEN_RE.search(text)
    if opening is None:
        return None

    start = opening.end()
    # Only a closing tag located after the opening tag is accepted.
    closing = _BODY_CLOSE_RE.search(text, start)
    if closing is None:
        return None
    return text[start : closing.start()]
|
|
231
|
+
|
|
232
|
+
|
|
233
|
+
def _extract_headings(body_html: str) -> list[Heading]:
    """Collect unique, non-empty headings in document order.

    Heading text is reduced to visible text and truncated to
    ``_MAX_HEADING_CHARS``; a heading repeating an earlier (level, text)
    pair is dropped.
    """
    # A dict keeps first-seen order while removing duplicates.
    unique: dict[tuple[str, str], Heading] = {}
    for found in _HEADING_RE.finditer(body_html):
        visible = _visible_text(found.group(2))
        if not visible:
            continue
        entry = Heading(found.group(1), _truncate(visible, _MAX_HEADING_CHARS))
        unique.setdefault((entry.level, entry.text), entry)
    return list(unique.values())
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def _extract_links(body_html: str) -> list[Link]:
    """Collect unique anchors that have a safe href, in document order.

    Anchors without an href, or whose href is rejected by safe-href
    normalization, are skipped. An anchor with no visible text uses its
    href as the label. Labels and hrefs are truncated, and a link repeating
    an earlier (label, href) pair is dropped.
    """
    # A dict keeps first-seen order while removing duplicates.
    unique: dict[tuple[str, str], Link] = {}
    for anchor in _ANCHOR_RE.finditer(body_html):
        raw = _extract_href(anchor.group(1))
        if raw is None:
            continue
        target = _normalize_safe_href(raw)
        if not target:
            continue

        caption = _visible_text(anchor.group(2)) or target
        entry = Link(
            label=_truncate(caption, _MAX_LINK_LABEL_CHARS),
            href=_truncate(target, _MAX_HREF_CHARS),
        )
        unique.setdefault((entry.label, entry.href), entry)
    return list(unique.values())
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def _extract_href(attrs: str) -> str | None:
    """Return the stripped href value found in an anchor's attribute text.

    Returns ``None`` when the attribute text has no href.
    """
    found = _HREF_RE.search(attrs)
    if found is None:
        return None
    # Exactly one of the quoted/unquoted alternatives captures the value.
    value = next((part for part in found.groups() if part is not None), None)
    return None if value is None else value.strip()
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def _normalize_safe_href(raw_href: str) -> str | None:
    """Decode an href and keep it only when its scheme is acceptable.

    Hrefs without an explicit scheme (relative paths, queries, fragments)
    are kept; hrefs with an explicit scheme are kept only when that scheme
    is in ``_SAFE_URL_SCHEMES``. Data URLs inside the surviving href are
    replaced by a placeholder.

    Returns:
        The cleaned href, or ``None`` when it is empty or uses a scheme
        that is not allowed.
    """
    candidate = _decode_href_entities(raw_href).strip()
    if not candidate:
        return None

    scheme = _normalized_explicit_scheme(candidate)
    has_disallowed_scheme = scheme is not None and scheme not in _SAFE_URL_SCHEMES
    if has_disallowed_scheme:
        return None

    cleaned = _sanitize_snippet(candidate).strip()
    return cleaned if cleaned else None
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def _decode_href_entities(value: str) -> str:
    """Decode the HTML entities in an href so its scheme can be inspected.

    The newline, tab and colon entities matched by
    ``_COMMON_HREF_ENTITY_RE`` are replaced by hand first; everything else
    is left to ``html.unescape``.
    """
    whitespace_for = {"newline": "\n", "tab": "\t"}

    def substitute(match: re.Match[str]) -> str:
        # Any matched name other than newline/tab is the colon entity.
        return whitespace_for.get(match.group(1).lower(), ":")

    pre_decoded = _COMMON_HREF_ENTITY_RE.sub(substitute, value)
    return html.unescape(pre_decoded)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def _normalized_explicit_scheme(href: str) -> str | None:
|
|
310
|
+
"""Return a normalized explicit URL scheme, or None for relative URLs."""
|
|
311
|
+
|
|
312
|
+
trimmed = href.lstrip()
|
|
313
|
+
if not trimmed or trimmed.startswith("#"):
|
|
314
|
+
return None
|
|
315
|
+
|
|
316
|
+
prefix_chars: list[str] = []
|
|
317
|
+
for char in trimmed:
|
|
318
|
+
if char in "/?#":
|
|
319
|
+
return None
|
|
320
|
+
if char == ":":
|
|
321
|
+
scheme = "".join(
|
|
322
|
+
candidate
|
|
323
|
+
for candidate in prefix_chars
|
|
324
|
+
if not _is_ascii_whitespace_or_control(candidate)
|
|
325
|
+
).lower()
|
|
326
|
+
if not scheme:
|
|
327
|
+
return ""
|
|
328
|
+
if not re.fullmatch(r"[a-z][a-z0-9+.-]*", scheme):
|
|
329
|
+
return ""
|
|
330
|
+
return scheme
|
|
331
|
+
prefix_chars.append(char)
|
|
332
|
+
return None
|
|
333
|
+
|
|
334
|
+
|
|
335
|
+
def _is_ascii_whitespace_or_control(char: str) -> bool:
|
|
336
|
+
codepoint = ord(char)
|
|
337
|
+
return codepoint <= 32 or codepoint == 127
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _extract_text_blocks(body_html: str) -> list[str]:
    """Split body markup into unique visible text blocks, in document order.

    Block-level tags become line breaks and all remaining tags become
    spaces. Each resulting line is normalized, kept only when it passes the
    semantic-text check, truncated to ``_MAX_TEXT_BLOCK_CHARS`` and
    de-duplicated on its truncated form.
    """
    line_separated = _BLOCK_TAG_RE.sub("\n", body_html)
    tag_free = _TAG_RE.sub(" ", line_separated)

    # Dict keys keep first-seen order while removing duplicates.
    ordered: dict[str, None] = {}
    for line in tag_free.splitlines():
        normalized = _normalize_visible_text(line)
        if normalized and _is_semantic_text_block(normalized):
            ordered.setdefault(_truncate(normalized, _MAX_TEXT_BLOCK_CHARS), None)
    return list(ordered)
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def _format_text_blocks(blocks: list[str]) -> list[str]:
    """Render text blocks as bullet lines, eliding the middle of long lists.

    Lists of at most ``_MAX_SMALL_TEXT_BLOCKS`` blocks are printed in full.
    Longer lists keep the first ``_FIRST_TEXT_BLOCKS`` and last
    ``_LAST_TEXT_BLOCKS`` blocks plus up to ``_MAX_SALIENT_TEXT_BLOCKS``
    salient blocks from in between; every skipped run is replaced by an
    "omitted" line giving its length.
    """
    total = len(blocks)
    if total <= _MAX_SMALL_TEXT_BLOCKS:
        return ["- " + block for block in blocks]

    tail_start = max(_FIRST_TEXT_BLOCKS, total - _LAST_TEXT_BLOCKS)
    keep = set(range(_FIRST_TEXT_BLOCKS)) | set(range(tail_start, total))

    # Pick the earliest salient blocks that are not already in head or tail.
    salient_picks: list[int] = []
    for position, block in enumerate(blocks):
        if len(salient_picks) >= _MAX_SALIENT_TEXT_BLOCKS:
            break
        if position not in keep and _is_salient_text(block):
            salient_picks.append(position)
    keep.update(salient_picks)

    rendered: list[str] = []
    expected = 0  # index of the next block if nothing were skipped
    for position in sorted(keep):
        gap = position - expected
        if gap > 0:
            rendered.append(f"… {gap} text blocks omitted")
        rendered.append(f"- {blocks[position]}")
        expected = position + 1

    trailing = total - expected
    if trailing > 0:
        rendered.append(f"… {trailing} text blocks omitted")
    return rendered
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def _visible_text(fragment: str) -> str:
    """Return the normalized visible text of an HTML fragment.

    Every tag is replaced by a space before normalization.
    """
    tag_free = _TAG_RE.sub(" ", fragment)
    return _normalize_visible_text(tag_free)
|
|
387
|
+
|
|
388
|
+
|
|
389
|
+
def _normalize_visible_text(text: str) -> str:
    """Unescape entities, collapse whitespace runs and mask data URLs."""
    decoded = html.unescape(text)
    single_spaced = re.sub(r"\s+", " ", decoded)
    return _sanitize_snippet(single_spaced.strip())
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def _sanitize_snippet(text: str) -> str:
    """Replace every data URL in ``text`` with a fixed placeholder."""
    placeholder = "[data-url omitted]"
    return _DATA_URL_RE.sub(placeholder, text)
|
|
397
|
+
|
|
398
|
+
|
|
399
|
+
def _is_semantic_text_block(text: str) -> bool:
    """Decide whether a text block is worth keeping in the summary.

    A block needs at least three alphanumeric characters, and must either
    contain two or more words or be salient.
    """
    alnum_total = 0
    for char in text:
        if char.isalnum():
            alnum_total += 1
    if alnum_total < 3:
        return False

    tokens = [token for token in re.split(r"\W+", text) if token]
    return len(tokens) >= 2 or _is_salient_text(text)
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
def _is_salient_text(text: str) -> bool:
    """Return True when ``text`` contains any salient term, ignoring case."""
    haystack = text.lower()
    for term in _SALIENT_TERMS:
        if term in haystack:
            return True
    return False
|
|
412
|
+
|
|
413
|
+
|
|
414
|
+
def _truncate(text: str, max_chars: int) -> str:
|
|
415
|
+
if len(text) <= max_chars:
|
|
416
|
+
return text
|
|
417
|
+
return f"{text[: max_chars - 1]}…"
|
|
418
|
+
|
|
419
|
+
|
|
420
|
+
def _quote(value: str) -> str:
|
|
421
|
+
escaped = value.replace("\\", "\\\\").replace('"', '\\"')
|
|
422
|
+
return f'"{escaped}"'
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Conservative JSON/JSONL payload detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass(frozen=True)
class JsonlPayload:
    """Parsed JSONL object records.

    Instances are immutable (``frozen=True``).
    """

    # One parsed JSON object per non-empty input line, in input order.
    records: tuple[dict[str, Any], ...]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def parse_whole_json_payload(text: str) -> Any | None:
    """Return a parsed whole-output JSON object/array, or ``None``.

    The stripped text must be wrapped in ``{}`` or ``[]`` and must parse as
    JSON. Non-standard constants are rejected through the
    ``parse_constant`` hook.

    Args:
        text: Complete output that may be a single JSON document.

    Returns:
        The parsed ``dict`` or ``list``, or ``None`` when the text is not a
        JSON container, fails to parse, or is nested too deeply for the
        decoder.
    """
    body = text.strip()
    if not _looks_like_json_container(body):
        return None
    try:
        value = json.loads(body, parse_constant=_reject_json_constant)
    except (ValueError, RecursionError):
        # ValueError covers json.JSONDecodeError and rejected constants.
        # RecursionError is raised for extremely deep nesting; detection
        # must decline such input rather than crash.
        return None
    if not isinstance(value, (dict, list)):
        return None
    return value
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def parse_jsonl_payload(lines: list[str]) -> JsonlPayload | None:
    """Return parsed object-record JSONL when every non-empty line is JSON.

    Blank lines are skipped. Every other line must be wrapped in ``{}`` and
    parse to a JSON object; non-standard constants are rejected through the
    ``parse_constant`` hook.

    Args:
        lines: The output split into lines.

    Returns:
        A ``JsonlPayload`` with one record per non-empty line, or ``None``
        when any line is not a JSON object, is nested too deeply for the
        decoder, or when there are no records at all.
    """
    records: list[dict[str, Any]] = []
    for line in lines:
        body = line.strip()
        if not body:
            continue
        if not (body.startswith("{") and body.endswith("}")):
            return None
        try:
            value = json.loads(body, parse_constant=_reject_json_constant)
        except (ValueError, RecursionError):
            # ValueError covers json.JSONDecodeError and rejected constants.
            # RecursionError is raised for extremely deep nesting; detection
            # must decline such input rather than crash.
            return None
        if not isinstance(value, dict):
            return None
        records.append(value)

    if not records:
        return None
    return JsonlPayload(tuple(records))
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _looks_like_json_container(body: str) -> bool:
|
|
56
|
+
return (body.startswith("{") and body.endswith("}")) or (
|
|
57
|
+
body.startswith("[") and body.endswith("]")
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _reject_json_constant(value: str) -> None:
|
|
62
|
+
raise ValueError(f"non-standard JSON constant {value}")
|