justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +48 -0
- justhtml/__main__.py +86 -17
- justhtml/constants.py +12 -0
- justhtml/entities.py +45 -7
- justhtml/errors.py +17 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +385 -97
- justhtml/parser.py +139 -16
- justhtml/sanitize.py +992 -0
- justhtml/selector.py +117 -19
- justhtml/serialize.py +671 -41
- justhtml/tokenizer.py +364 -194
- justhtml/tokens.py +28 -5
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +297 -204
- justhtml/treebuilder_modes.py +208 -138
- justhtml-0.38.0.dist-info/METADATA +213 -0
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/__init__.py
CHANGED
|
@@ -1,14 +1,62 @@
|
|
|
1
1
|
from .parser import JustHTML, StrictModeError
|
|
2
|
+
from .sanitize import (
|
|
3
|
+
CSS_PRESET_TEXT,
|
|
4
|
+
DEFAULT_DOCUMENT_POLICY,
|
|
5
|
+
DEFAULT_POLICY,
|
|
6
|
+
SanitizationPolicy,
|
|
7
|
+
UnsafeHtmlError,
|
|
8
|
+
UrlPolicy,
|
|
9
|
+
UrlProxy,
|
|
10
|
+
UrlRule,
|
|
11
|
+
)
|
|
2
12
|
from .selector import SelectorError, matches, query
|
|
3
13
|
from .serialize import to_html, to_test_format
|
|
4
14
|
from .stream import stream
|
|
5
15
|
from .tokens import ParseError
|
|
16
|
+
from .transforms import (
|
|
17
|
+
CollapseWhitespace,
|
|
18
|
+
Decide,
|
|
19
|
+
Drop,
|
|
20
|
+
Edit,
|
|
21
|
+
EditAttrs,
|
|
22
|
+
EditDocument,
|
|
23
|
+
Empty,
|
|
24
|
+
Linkify,
|
|
25
|
+
PruneEmpty,
|
|
26
|
+
RewriteAttrs,
|
|
27
|
+
Sanitize,
|
|
28
|
+
SetAttrs,
|
|
29
|
+
Stage,
|
|
30
|
+
Unwrap,
|
|
31
|
+
)
|
|
6
32
|
|
|
7
33
|
__all__ = [
|
|
34
|
+
"CSS_PRESET_TEXT",
|
|
35
|
+
"DEFAULT_DOCUMENT_POLICY",
|
|
36
|
+
"DEFAULT_POLICY",
|
|
37
|
+
"CollapseWhitespace",
|
|
38
|
+
"Decide",
|
|
39
|
+
"Drop",
|
|
40
|
+
"Edit",
|
|
41
|
+
"EditAttrs",
|
|
42
|
+
"EditDocument",
|
|
43
|
+
"Empty",
|
|
8
44
|
"JustHTML",
|
|
45
|
+
"Linkify",
|
|
9
46
|
"ParseError",
|
|
47
|
+
"PruneEmpty",
|
|
48
|
+
"RewriteAttrs",
|
|
49
|
+
"SanitizationPolicy",
|
|
50
|
+
"Sanitize",
|
|
10
51
|
"SelectorError",
|
|
52
|
+
"SetAttrs",
|
|
53
|
+
"Stage",
|
|
11
54
|
"StrictModeError",
|
|
55
|
+
"UnsafeHtmlError",
|
|
56
|
+
"Unwrap",
|
|
57
|
+
"UrlPolicy",
|
|
58
|
+
"UrlProxy",
|
|
59
|
+
"UrlRule",
|
|
12
60
|
"matches",
|
|
13
61
|
"query",
|
|
14
62
|
"stream",
|
justhtml/__main__.py
CHANGED
|
@@ -8,9 +8,10 @@ import io
|
|
|
8
8
|
import sys
|
|
9
9
|
from importlib.metadata import PackageNotFoundError, version
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import cast
|
|
11
|
+
from typing import TextIO, cast
|
|
12
12
|
|
|
13
13
|
from . import JustHTML
|
|
14
|
+
from .context import FragmentContext
|
|
14
15
|
from .selector import SelectorError
|
|
15
16
|
|
|
16
17
|
|
|
@@ -31,7 +32,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
|
|
|
31
32
|
" curl -s https://example.com | justhtml -\n"
|
|
32
33
|
" justhtml page.html --selector 'main p' --format text\n"
|
|
33
34
|
" justhtml page.html --selector 'a' --format html\n"
|
|
34
|
-
" justhtml page.html --selector 'article' --format markdown\n"
|
|
35
|
+
" justhtml page.html --selector 'article' --allow-tags article --format markdown\n"
|
|
35
36
|
"\n"
|
|
36
37
|
"If you don't have the 'justhtml' command available, use:\n"
|
|
37
38
|
" python -m justhtml ...\n"
|
|
@@ -44,6 +45,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
|
|
|
44
45
|
nargs="?",
|
|
45
46
|
help="HTML file to parse, or '-' to read from stdin",
|
|
46
47
|
)
|
|
48
|
+
parser.add_argument("--output", help="File to write output to")
|
|
47
49
|
parser.add_argument(
|
|
48
50
|
"--selector",
|
|
49
51
|
help="CSS selector for choosing nodes (defaults to the document root)",
|
|
@@ -54,12 +56,32 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
|
|
|
54
56
|
default="html",
|
|
55
57
|
help="Output format (default: html)",
|
|
56
58
|
)
|
|
59
|
+
|
|
60
|
+
parser.add_argument(
|
|
61
|
+
"--unsafe",
|
|
62
|
+
action="store_true",
|
|
63
|
+
help="Disable sanitization (trusted input only)",
|
|
64
|
+
)
|
|
65
|
+
|
|
66
|
+
parser.add_argument(
|
|
67
|
+
"--allow-tags",
|
|
68
|
+
help=(
|
|
69
|
+
"Safe mode: allow these additional tags during sanitization (comma-separated). "
|
|
70
|
+
"Example: --allow-tags article,section"
|
|
71
|
+
),
|
|
72
|
+
)
|
|
57
73
|
parser.add_argument(
|
|
58
74
|
"--first",
|
|
59
75
|
action="store_true",
|
|
60
76
|
help="Only output the first matching node",
|
|
61
77
|
)
|
|
62
78
|
|
|
79
|
+
parser.add_argument(
|
|
80
|
+
"--fragment",
|
|
81
|
+
action="store_true",
|
|
82
|
+
help="Parse input as an HTML fragment (context: <div>)",
|
|
83
|
+
)
|
|
84
|
+
|
|
63
85
|
parser.add_argument(
|
|
64
86
|
"--separator",
|
|
65
87
|
default=" ",
|
|
@@ -108,7 +130,37 @@ def _read_html(path: str) -> str | bytes:
|
|
|
108
130
|
def main() -> None:
|
|
109
131
|
args = _parse_args(sys.argv[1:])
|
|
110
132
|
html = _read_html(args.path)
|
|
111
|
-
|
|
133
|
+
fragment_context = FragmentContext("div") if args.fragment else None
|
|
134
|
+
safe = not args.unsafe
|
|
135
|
+
|
|
136
|
+
policy = None
|
|
137
|
+
if safe and args.allow_tags:
|
|
138
|
+
from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy # noqa: PLC0415
|
|
139
|
+
|
|
140
|
+
extra_tags: set[str] = set()
|
|
141
|
+
for part in str(args.allow_tags).replace(" ", ",").split(","):
|
|
142
|
+
tag = part.strip().lower()
|
|
143
|
+
if tag:
|
|
144
|
+
extra_tags.add(tag)
|
|
145
|
+
|
|
146
|
+
base = DEFAULT_POLICY if fragment_context is not None else DEFAULT_DOCUMENT_POLICY
|
|
147
|
+
allowed = set(base.allowed_tags)
|
|
148
|
+
allowed.update(extra_tags)
|
|
149
|
+
policy = SanitizationPolicy(
|
|
150
|
+
allowed_tags=allowed,
|
|
151
|
+
allowed_attributes=base.allowed_attributes,
|
|
152
|
+
url_policy=base.url_policy,
|
|
153
|
+
drop_comments=base.drop_comments,
|
|
154
|
+
drop_doctype=base.drop_doctype,
|
|
155
|
+
drop_foreign_namespaces=base.drop_foreign_namespaces,
|
|
156
|
+
drop_content_tags=base.drop_content_tags,
|
|
157
|
+
allowed_css_properties=base.allowed_css_properties,
|
|
158
|
+
force_link_rel=base.force_link_rel,
|
|
159
|
+
unsafe_handling=base.unsafe_handling,
|
|
160
|
+
disallowed_tag_handling=base.disallowed_tag_handling,
|
|
161
|
+
)
|
|
162
|
+
|
|
163
|
+
doc = JustHTML(html, fragment_context=fragment_context, safe=safe, policy=policy)
|
|
112
164
|
|
|
113
165
|
try:
|
|
114
166
|
nodes = doc.query(args.selector) if args.selector else [doc.root]
|
|
@@ -122,22 +174,39 @@ def main() -> None:
|
|
|
122
174
|
if args.first:
|
|
123
175
|
nodes = [nodes[0]]
|
|
124
176
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
177
|
+
def write_output(out: TextIO) -> None:
|
|
178
|
+
if args.format == "html":
|
|
179
|
+
outputs = [node.to_html() for node in nodes]
|
|
180
|
+
out.write("\n".join(outputs))
|
|
181
|
+
out.write("\n")
|
|
182
|
+
return
|
|
183
|
+
|
|
184
|
+
if args.format == "text":
|
|
185
|
+
# Keep these branches explicit so coverage will highlight untested CLI options.
|
|
186
|
+
if args.separator == " ":
|
|
187
|
+
if args.strip:
|
|
188
|
+
outputs = [node.to_text(strip=True) for node in nodes]
|
|
189
|
+
else:
|
|
190
|
+
outputs = [node.to_text(strip=False) for node in nodes]
|
|
191
|
+
else:
|
|
192
|
+
if args.strip:
|
|
193
|
+
outputs = [node.to_text(separator=args.separator, strip=True) for node in nodes]
|
|
194
|
+
else:
|
|
195
|
+
outputs = [node.to_text(separator=args.separator, strip=False) for node in nodes]
|
|
196
|
+
out.write("\n".join(outputs))
|
|
197
|
+
out.write("\n")
|
|
198
|
+
return
|
|
199
|
+
|
|
200
|
+
outputs = [node.to_markdown() for node in nodes]
|
|
201
|
+
out.write("\n\n".join(outputs))
|
|
202
|
+
out.write("\n")
|
|
203
|
+
|
|
204
|
+
if args.output:
|
|
205
|
+
with Path(args.output).open(mode="w", encoding="utf-8") as outfile:
|
|
206
|
+
write_output(outfile)
|
|
135
207
|
return
|
|
136
208
|
|
|
137
|
-
|
|
138
|
-
sys.stdout.write("\n\n".join(outputs))
|
|
139
|
-
sys.stdout.write("\n")
|
|
140
|
-
return
|
|
209
|
+
write_output(sys.stdout)
|
|
141
210
|
|
|
142
211
|
|
|
143
212
|
if __name__ == "__main__":
|
justhtml/constants.py
CHANGED
|
@@ -184,6 +184,18 @@ HTML4_PUBLIC_PREFIXES = (
|
|
|
184
184
|
|
|
185
185
|
HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"}
|
|
186
186
|
|
|
187
|
+
# Elements where pretty-printing and whitespace-collapsing transforms should
|
|
188
|
+
# preserve text node whitespace.
|
|
189
|
+
WHITESPACE_PRESERVING_ELEMENTS: Final[frozenset[str]] = frozenset(
|
|
190
|
+
{
|
|
191
|
+
"code",
|
|
192
|
+
"pre",
|
|
193
|
+
"script",
|
|
194
|
+
"style",
|
|
195
|
+
"textarea",
|
|
196
|
+
}
|
|
197
|
+
)
|
|
198
|
+
|
|
187
199
|
FORMATTING_ELEMENTS = {
|
|
188
200
|
"a",
|
|
189
201
|
"b",
|
justhtml/entities.py
CHANGED
|
@@ -7,6 +7,10 @@ Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3C;).
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
import html.entities
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Callable
|
|
10
14
|
|
|
11
15
|
# Use Python's complete HTML5 entity list (2231 entities)
|
|
12
16
|
# Keys include the trailing semicolon (e.g., "amp;", "lang;")
|
|
@@ -168,7 +172,23 @@ NUMERIC_REPLACEMENTS: dict[int, str] = {
|
|
|
168
172
|
}
|
|
169
173
|
|
|
170
174
|
|
|
171
|
-
def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
|
|
175
|
+
def _is_control_character(codepoint: int) -> bool:
|
|
176
|
+
# C0 controls and C1 controls
|
|
177
|
+
return (0x00 <= codepoint <= 0x1F) or (0x7F <= codepoint <= 0x9F)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _is_noncharacter(codepoint: int) -> bool:
|
|
181
|
+
if 0xFDD0 <= codepoint <= 0xFDEF:
|
|
182
|
+
return True
|
|
183
|
+
last = codepoint & 0xFFFF
|
|
184
|
+
return last == 0xFFFE or last == 0xFFFF
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def decode_numeric_entity(
|
|
188
|
+
text: str,
|
|
189
|
+
is_hex: bool = False,
|
|
190
|
+
report_error: Callable[[str], None] | None = None,
|
|
191
|
+
) -> str:
|
|
172
192
|
"""Decode a numeric character reference like &#60; or &#x3C;.
|
|
173
193
|
|
|
174
194
|
Args:
|
|
@@ -181,20 +201,30 @@ def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
|
|
|
181
201
|
base = 16 if is_hex else 10
|
|
182
202
|
codepoint = int(text, base)
|
|
183
203
|
|
|
184
|
-
# Apply HTML5 replacements for certain ranges
|
|
185
|
-
if codepoint in NUMERIC_REPLACEMENTS:
|
|
186
|
-
return NUMERIC_REPLACEMENTS[codepoint]
|
|
187
|
-
|
|
188
204
|
# Invalid ranges per HTML5 spec
|
|
189
205
|
if codepoint > 0x10FFFF:
|
|
190
206
|
return "\ufffd" # REPLACEMENT CHARACTER
|
|
191
207
|
if 0xD800 <= codepoint <= 0xDFFF: # Surrogate range
|
|
192
208
|
return "\ufffd"
|
|
193
209
|
|
|
210
|
+
if report_error is not None:
|
|
211
|
+
if _is_control_character(codepoint):
|
|
212
|
+
report_error("control-character-reference")
|
|
213
|
+
if _is_noncharacter(codepoint):
|
|
214
|
+
report_error("noncharacter-character-reference")
|
|
215
|
+
|
|
216
|
+
# Apply HTML5 replacements for certain ranges
|
|
217
|
+
if codepoint in NUMERIC_REPLACEMENTS:
|
|
218
|
+
return NUMERIC_REPLACEMENTS[codepoint]
|
|
219
|
+
|
|
194
220
|
return chr(codepoint)
|
|
195
221
|
|
|
196
222
|
|
|
197
|
-
def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
|
|
223
|
+
def decode_entities_in_text(
|
|
224
|
+
text: str,
|
|
225
|
+
in_attribute: bool = False,
|
|
226
|
+
report_error: Callable[[str], None] | None = None,
|
|
227
|
+
) -> str:
|
|
198
228
|
"""Decode all HTML entities in text.
|
|
199
229
|
|
|
200
230
|
This is a simple implementation that handles:
|
|
@@ -247,7 +277,9 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
|
|
|
247
277
|
digit_text = text[digit_start:j]
|
|
248
278
|
|
|
249
279
|
if digit_text:
|
|
250
|
-
|
|
280
|
+
if report_error is not None and not has_semicolon:
|
|
281
|
+
report_error("missing-semicolon-after-character-reference")
|
|
282
|
+
result.append(decode_numeric_entity(digit_text, is_hex=is_hex, report_error=report_error))
|
|
251
283
|
i = j + 1 if has_semicolon else j
|
|
252
284
|
continue
|
|
253
285
|
|
|
@@ -285,6 +317,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
|
|
|
285
317
|
best_match_len = k
|
|
286
318
|
break
|
|
287
319
|
if best_match:
|
|
320
|
+
if report_error is not None:
|
|
321
|
+
report_error("missing-semicolon-after-character-reference")
|
|
288
322
|
result.append(best_match)
|
|
289
323
|
i = i + 1 + best_match_len
|
|
290
324
|
continue
|
|
@@ -302,6 +336,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
|
|
|
302
336
|
continue
|
|
303
337
|
|
|
304
338
|
# Decode legacy entity
|
|
339
|
+
if report_error is not None and not has_semicolon:
|
|
340
|
+
report_error("missing-semicolon-after-character-reference")
|
|
305
341
|
result.append(NAMED_ENTITIES[entity_name])
|
|
306
342
|
i = j
|
|
307
343
|
continue
|
|
@@ -329,6 +365,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
|
|
|
329
365
|
i += 1
|
|
330
366
|
continue
|
|
331
367
|
|
|
368
|
+
if report_error is not None:
|
|
369
|
+
report_error("missing-semicolon-after-character-reference")
|
|
332
370
|
result.append(best_match)
|
|
333
371
|
i = i + 1 + best_match_len
|
|
334
372
|
continue
|
justhtml/errors.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
|
-
"""Centralized error message definitions and helpers for
|
|
1
|
+
"""Centralized error message definitions and helpers for JustHTML errors.
|
|
2
2
|
|
|
3
|
-
This module provides human-readable error messages for
|
|
4
|
-
emitted by
|
|
3
|
+
This module provides human-readable error messages for parse error codes
|
|
4
|
+
emitted by the tokenizer and tree builder during HTML parsing, plus selected
|
|
5
|
+
security findings emitted by the sanitizer.
|
|
5
6
|
"""
|
|
6
7
|
|
|
7
8
|
from __future__ import annotations
|
|
@@ -75,6 +76,8 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
|
|
|
75
76
|
"illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
|
|
76
77
|
"missing-semicolon-after-character-reference": "Missing semicolon after character reference",
|
|
77
78
|
"named-entity-without-semicolon": "Named entity used without semicolon",
|
|
79
|
+
"noncharacter-character-reference": "Noncharacter in character reference",
|
|
80
|
+
"noncharacter-in-input-stream": "Noncharacter in input stream",
|
|
78
81
|
# ================================================================
|
|
79
82
|
# TREE BUILDER ERRORS
|
|
80
83
|
# ================================================================
|
|
@@ -107,8 +110,11 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
|
|
|
107
110
|
# Foster parenting / table errors
|
|
108
111
|
"foster-parenting-character": "Text content in table requires foster parenting",
|
|
109
112
|
"foster-parenting-start-tag": "Start tag in table requires foster parenting",
|
|
113
|
+
"unexpected-character-implies-table-voodoo": "Unexpected character in table triggers foster parenting",
|
|
110
114
|
"unexpected-start-tag-implies-table-voodoo": f"<{tag_name}> start tag in table triggers foster parenting",
|
|
111
115
|
"unexpected-end-tag-implies-table-voodoo": f"</{tag_name}> end tag in table triggers foster parenting",
|
|
116
|
+
"unexpected-implied-end-tag-in-table-view": "Unexpected implied end tag while closing table",
|
|
117
|
+
"eof-in-table": "Unexpected end of file in table",
|
|
112
118
|
"unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
|
|
113
119
|
"unexpected-form-in-table": "Form element not allowed in table context",
|
|
114
120
|
"unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
|
|
@@ -134,6 +140,14 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
|
|
|
134
140
|
"adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
|
|
135
141
|
"non-void-html-element-start-tag-with-trailing-solidus": f"<{tag_name}/> self-closing syntax on non-void element",
|
|
136
142
|
"image-start-tag": f"Deprecated <{tag_name}> tag (use <img> instead)",
|
|
143
|
+
# Select insertion mode (context-specific taxonomy)
|
|
144
|
+
"unexpected-start-tag-in-select": f"Unexpected <{tag_name}> start tag in <select>",
|
|
145
|
+
"unexpected-end-tag-in-select": f"Unexpected </{tag_name}> end tag in <select>",
|
|
146
|
+
"unexpected-select-in-select": "Unexpected nested <select> in <select>",
|
|
147
|
+
# ================================================================
|
|
148
|
+
# SECURITY ERRORS
|
|
149
|
+
# ================================================================
|
|
150
|
+
"unsafe-html": "Unsafe HTML detected by sanitization policy",
|
|
137
151
|
}
|
|
138
152
|
|
|
139
153
|
# Return message or fall back to the code itself if not found
|