justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +6 -0
- justhtml/__main__.py +49 -16
- justhtml/entities.py +45 -7
- justhtml/errors.py +9 -0
- justhtml/node.py +358 -89
- justhtml/parser.py +70 -14
- justhtml/sanitize.py +763 -0
- justhtml/selector.py +114 -18
- justhtml/serialize.py +332 -28
- justhtml/tokenizer.py +249 -179
- justhtml/tokens.py +8 -3
- justhtml/treebuilder.py +50 -14
- justhtml/treebuilder_modes.py +100 -36
- justhtml-0.24.0.dist-info/METADATA +192 -0
- justhtml-0.24.0.dist-info/RECORD +24 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0
justhtml/__init__.py
CHANGED
|
@@ -1,16 +1,22 @@
|
|
|
1
1
|
from .parser import JustHTML, StrictModeError
|
|
2
|
+
from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy, UrlRule, sanitize
|
|
2
3
|
from .selector import SelectorError, matches, query
|
|
3
4
|
from .serialize import to_html, to_test_format
|
|
4
5
|
from .stream import stream
|
|
5
6
|
from .tokens import ParseError
|
|
6
7
|
|
|
7
8
|
__all__ = [
|
|
9
|
+
"DEFAULT_DOCUMENT_POLICY",
|
|
10
|
+
"DEFAULT_POLICY",
|
|
8
11
|
"JustHTML",
|
|
9
12
|
"ParseError",
|
|
13
|
+
"SanitizationPolicy",
|
|
10
14
|
"SelectorError",
|
|
11
15
|
"StrictModeError",
|
|
16
|
+
"UrlRule",
|
|
12
17
|
"matches",
|
|
13
18
|
"query",
|
|
19
|
+
"sanitize",
|
|
14
20
|
"stream",
|
|
15
21
|
"to_html",
|
|
16
22
|
"to_test_format",
|
justhtml/__main__.py
CHANGED
|
@@ -8,9 +8,10 @@ import io
|
|
|
8
8
|
import sys
|
|
9
9
|
from importlib.metadata import PackageNotFoundError, version
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import cast
|
|
11
|
+
from typing import TextIO, cast
|
|
12
12
|
|
|
13
13
|
from . import JustHTML
|
|
14
|
+
from .context import FragmentContext
|
|
14
15
|
from .selector import SelectorError
|
|
15
16
|
|
|
16
17
|
|
|
@@ -44,6 +45,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
|
|
|
44
45
|
nargs="?",
|
|
45
46
|
help="HTML file to parse, or '-' to read from stdin",
|
|
46
47
|
)
|
|
48
|
+
parser.add_argument("--output", help="File to write output to")
|
|
47
49
|
parser.add_argument(
|
|
48
50
|
"--selector",
|
|
49
51
|
help="CSS selector for choosing nodes (defaults to the document root)",
|
|
@@ -54,12 +56,24 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
|
|
|
54
56
|
default="html",
|
|
55
57
|
help="Output format (default: html)",
|
|
56
58
|
)
|
|
59
|
+
|
|
60
|
+
parser.add_argument(
|
|
61
|
+
"--unsafe",
|
|
62
|
+
action="store_true",
|
|
63
|
+
help="Disable sanitization (trusted input only)",
|
|
64
|
+
)
|
|
57
65
|
parser.add_argument(
|
|
58
66
|
"--first",
|
|
59
67
|
action="store_true",
|
|
60
68
|
help="Only output the first matching node",
|
|
61
69
|
)
|
|
62
70
|
|
|
71
|
+
parser.add_argument(
|
|
72
|
+
"--fragment",
|
|
73
|
+
action="store_true",
|
|
74
|
+
help="Parse input as an HTML fragment (context: <div>)",
|
|
75
|
+
)
|
|
76
|
+
|
|
63
77
|
parser.add_argument(
|
|
64
78
|
"--separator",
|
|
65
79
|
default=" ",
|
|
@@ -108,7 +122,8 @@ def _read_html(path: str) -> str | bytes:
|
|
|
108
122
|
def main() -> None:
|
|
109
123
|
args = _parse_args(sys.argv[1:])
|
|
110
124
|
html = _read_html(args.path)
|
|
111
|
-
|
|
125
|
+
fragment_context = FragmentContext("div") if args.fragment else None
|
|
126
|
+
doc = JustHTML(html, fragment_context=fragment_context)
|
|
112
127
|
|
|
113
128
|
try:
|
|
114
129
|
nodes = doc.query(args.selector) if args.selector else [doc.root]
|
|
@@ -122,22 +137,40 @@ def main() -> None:
|
|
|
122
137
|
if args.first:
|
|
123
138
|
nodes = [nodes[0]]
|
|
124
139
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
140
|
+
def write_output(out: TextIO) -> None:
|
|
141
|
+
safe = not args.unsafe
|
|
142
|
+
if args.format == "html":
|
|
143
|
+
outputs = [node.to_html(safe=safe) for node in nodes]
|
|
144
|
+
out.write("\n".join(outputs))
|
|
145
|
+
out.write("\n")
|
|
146
|
+
return
|
|
147
|
+
|
|
148
|
+
if args.format == "text":
|
|
149
|
+
# Keep these branches explicit so coverage will highlight untested CLI options.
|
|
150
|
+
if args.separator == " ":
|
|
151
|
+
if args.strip:
|
|
152
|
+
outputs = [node.to_text(strip=True, safe=safe) for node in nodes]
|
|
153
|
+
else:
|
|
154
|
+
outputs = [node.to_text(strip=False, safe=safe) for node in nodes]
|
|
155
|
+
else:
|
|
156
|
+
if args.strip:
|
|
157
|
+
outputs = [node.to_text(separator=args.separator, strip=True, safe=safe) for node in nodes]
|
|
158
|
+
else:
|
|
159
|
+
outputs = [node.to_text(separator=args.separator, strip=False, safe=safe) for node in nodes]
|
|
160
|
+
out.write("\n".join(outputs))
|
|
161
|
+
out.write("\n")
|
|
162
|
+
return
|
|
163
|
+
|
|
164
|
+
outputs = [node.to_markdown(safe=safe) for node in nodes]
|
|
165
|
+
out.write("\n\n".join(outputs))
|
|
166
|
+
out.write("\n")
|
|
167
|
+
|
|
168
|
+
if args.output:
|
|
169
|
+
with Path(args.output).open(mode="w", encoding="utf-8") as outfile:
|
|
170
|
+
write_output(outfile)
|
|
135
171
|
return
|
|
136
172
|
|
|
137
|
-
|
|
138
|
-
sys.stdout.write("\n\n".join(outputs))
|
|
139
|
-
sys.stdout.write("\n")
|
|
140
|
-
return
|
|
173
|
+
write_output(sys.stdout)
|
|
141
174
|
|
|
142
175
|
|
|
143
176
|
if __name__ == "__main__":
|
justhtml/entities.py
CHANGED
|
@@ -7,6 +7,10 @@ Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3C;)
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
import html.entities
|
|
10
|
+
from typing import TYPE_CHECKING
|
|
11
|
+
|
|
12
|
+
if TYPE_CHECKING:
|
|
13
|
+
from collections.abc import Callable
|
|
10
14
|
|
|
11
15
|
# Use Python's complete HTML5 entity list (2231 entities)
|
|
12
16
|
# Keys include the trailing semicolon (e.g., "amp;", "lang;")
|
|
@@ -168,7 +172,23 @@ NUMERIC_REPLACEMENTS: dict[int, str] = {
|
|
|
168
172
|
}
|
|
169
173
|
|
|
170
174
|
|
|
171
|
-
def
|
|
175
|
+
def _is_control_character(codepoint: int) -> bool:
|
|
176
|
+
# C0 controls and C1 controls
|
|
177
|
+
return (0x00 <= codepoint <= 0x1F) or (0x7F <= codepoint <= 0x9F)
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _is_noncharacter(codepoint: int) -> bool:
|
|
181
|
+
if 0xFDD0 <= codepoint <= 0xFDEF:
|
|
182
|
+
return True
|
|
183
|
+
last = codepoint & 0xFFFF
|
|
184
|
+
return last == 0xFFFE or last == 0xFFFF
|
|
185
|
+
|
|
186
|
+
|
|
187
|
+
def decode_numeric_entity(
|
|
188
|
+
text: str,
|
|
189
|
+
is_hex: bool = False,
|
|
190
|
+
report_error: Callable[[str], None] | None = None,
|
|
191
|
+
) -> str:
|
|
172
192
|
"""Decode a numeric character reference like &#60; or &#x3C;.
|
|
173
193
|
|
|
174
194
|
Args:
|
|
@@ -181,20 +201,30 @@ def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
|
|
|
181
201
|
base = 16 if is_hex else 10
|
|
182
202
|
codepoint = int(text, base)
|
|
183
203
|
|
|
184
|
-
# Apply HTML5 replacements for certain ranges
|
|
185
|
-
if codepoint in NUMERIC_REPLACEMENTS:
|
|
186
|
-
return NUMERIC_REPLACEMENTS[codepoint]
|
|
187
|
-
|
|
188
204
|
# Invalid ranges per HTML5 spec
|
|
189
205
|
if codepoint > 0x10FFFF:
|
|
190
206
|
return "\ufffd" # REPLACEMENT CHARACTER
|
|
191
207
|
if 0xD800 <= codepoint <= 0xDFFF: # Surrogate range
|
|
192
208
|
return "\ufffd"
|
|
193
209
|
|
|
210
|
+
if report_error is not None:
|
|
211
|
+
if _is_control_character(codepoint):
|
|
212
|
+
report_error("control-character-reference")
|
|
213
|
+
if _is_noncharacter(codepoint):
|
|
214
|
+
report_error("noncharacter-character-reference")
|
|
215
|
+
|
|
216
|
+
# Apply HTML5 replacements for certain ranges
|
|
217
|
+
if codepoint in NUMERIC_REPLACEMENTS:
|
|
218
|
+
return NUMERIC_REPLACEMENTS[codepoint]
|
|
219
|
+
|
|
194
220
|
return chr(codepoint)
|
|
195
221
|
|
|
196
222
|
|
|
197
|
-
def decode_entities_in_text(
|
|
223
|
+
def decode_entities_in_text(
|
|
224
|
+
text: str,
|
|
225
|
+
in_attribute: bool = False,
|
|
226
|
+
report_error: Callable[[str], None] | None = None,
|
|
227
|
+
) -> str:
|
|
198
228
|
"""Decode all HTML entities in text.
|
|
199
229
|
|
|
200
230
|
This is a simple implementation that handles:
|
|
@@ -247,7 +277,9 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
|
|
|
247
277
|
digit_text = text[digit_start:j]
|
|
248
278
|
|
|
249
279
|
if digit_text:
|
|
250
|
-
|
|
280
|
+
if report_error is not None and not has_semicolon:
|
|
281
|
+
report_error("missing-semicolon-after-character-reference")
|
|
282
|
+
result.append(decode_numeric_entity(digit_text, is_hex=is_hex, report_error=report_error))
|
|
251
283
|
i = j + 1 if has_semicolon else j
|
|
252
284
|
continue
|
|
253
285
|
|
|
@@ -285,6 +317,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
|
|
|
285
317
|
best_match_len = k
|
|
286
318
|
break
|
|
287
319
|
if best_match:
|
|
320
|
+
if report_error is not None:
|
|
321
|
+
report_error("missing-semicolon-after-character-reference")
|
|
288
322
|
result.append(best_match)
|
|
289
323
|
i = i + 1 + best_match_len
|
|
290
324
|
continue
|
|
@@ -302,6 +336,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
|
|
|
302
336
|
continue
|
|
303
337
|
|
|
304
338
|
# Decode legacy entity
|
|
339
|
+
if report_error is not None and not has_semicolon:
|
|
340
|
+
report_error("missing-semicolon-after-character-reference")
|
|
305
341
|
result.append(NAMED_ENTITIES[entity_name])
|
|
306
342
|
i = j
|
|
307
343
|
continue
|
|
@@ -329,6 +365,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
|
|
|
329
365
|
i += 1
|
|
330
366
|
continue
|
|
331
367
|
|
|
368
|
+
if report_error is not None:
|
|
369
|
+
report_error("missing-semicolon-after-character-reference")
|
|
332
370
|
result.append(best_match)
|
|
333
371
|
i = i + 1 + best_match_len
|
|
334
372
|
continue
|
justhtml/errors.py
CHANGED
|
@@ -75,6 +75,8 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
|
|
|
75
75
|
"illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
|
|
76
76
|
"missing-semicolon-after-character-reference": "Missing semicolon after character reference",
|
|
77
77
|
"named-entity-without-semicolon": "Named entity used without semicolon",
|
|
78
|
+
"noncharacter-character-reference": "Noncharacter in character reference",
|
|
79
|
+
"noncharacter-in-input-stream": "Noncharacter in input stream",
|
|
78
80
|
# ================================================================
|
|
79
81
|
# TREE BUILDER ERRORS
|
|
80
82
|
# ================================================================
|
|
@@ -107,8 +109,11 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
|
|
|
107
109
|
# Foster parenting / table errors
|
|
108
110
|
"foster-parenting-character": "Text content in table requires foster parenting",
|
|
109
111
|
"foster-parenting-start-tag": "Start tag in table requires foster parenting",
|
|
112
|
+
"unexpected-character-implies-table-voodoo": "Unexpected character in table triggers foster parenting",
|
|
110
113
|
"unexpected-start-tag-implies-table-voodoo": f"<{tag_name}> start tag in table triggers foster parenting",
|
|
111
114
|
"unexpected-end-tag-implies-table-voodoo": f"</{tag_name}> end tag in table triggers foster parenting",
|
|
115
|
+
"unexpected-implied-end-tag-in-table-view": "Unexpected implied end tag while closing table",
|
|
116
|
+
"eof-in-table": "Unexpected end of file in table",
|
|
112
117
|
"unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
|
|
113
118
|
"unexpected-form-in-table": "Form element not allowed in table context",
|
|
114
119
|
"unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
|
|
@@ -134,6 +139,10 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
|
|
|
134
139
|
"adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
|
|
135
140
|
"non-void-html-element-start-tag-with-trailing-solidus": f"<{tag_name}/> self-closing syntax on non-void element",
|
|
136
141
|
"image-start-tag": f"Deprecated <{tag_name}> tag (use <img> instead)",
|
|
142
|
+
# Select insertion mode (context-specific taxonomy)
|
|
143
|
+
"unexpected-start-tag-in-select": f"Unexpected <{tag_name}> start tag in <select>",
|
|
144
|
+
"unexpected-end-tag-in-select": f"Unexpected </{tag_name}> end tag in <select>",
|
|
145
|
+
"unexpected-select-in-select": "Unexpected nested <select> in <select>",
|
|
137
146
|
}
|
|
138
147
|
|
|
139
148
|
# Return message or fall back to the code itself if not found
|