justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +28 -0
- justhtml/__main__.py +161 -13
- justhtml/constants.py +17 -1
- justhtml/context.py +7 -1
- justhtml/encoding.py +405 -0
- justhtml/entities.py +57 -17
- justhtml/errors.py +20 -4
- justhtml/linkify.py +438 -0
- justhtml/node.py +738 -41
- justhtml/parser.py +188 -21
- justhtml/py.typed +0 -0
- justhtml/sanitize.py +1141 -0
- justhtml/selector.py +240 -104
- justhtml/serialize.py +418 -57
- justhtml/stream.py +34 -10
- justhtml/tokenizer.py +433 -289
- justhtml/tokens.py +91 -23
- justhtml/transforms.py +690 -0
- justhtml/treebuilder.py +196 -111
- justhtml/treebuilder_modes.py +191 -117
- justhtml/treebuilder_utils.py +11 -4
- justhtml-0.33.0.dist-info/METADATA +196 -0
- justhtml-0.33.0.dist-info/RECORD +26 -0
- justhtml-0.33.0.dist-info/entry_points.txt +2 -0
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.6.0.dist-info/METADATA +0 -126
- justhtml-0.6.0.dist-info/RECORD +0 -20
- {justhtml-0.6.0.dist-info → justhtml-0.33.0.dist-info}/WHEEL +0 -0
justhtml/parser.py
CHANGED
|
@@ -1,8 +1,23 @@
|
|
|
1
1
|
"""Minimal JustHTML parser entry point."""
|
|
2
2
|
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
7
|
+
from .context import FragmentContext
|
|
8
|
+
from .encoding import decode_html
|
|
3
9
|
from .tokenizer import Tokenizer, TokenizerOpts
|
|
10
|
+
from .transforms import apply_compiled_transforms, compile_transforms
|
|
4
11
|
from .treebuilder import TreeBuilder
|
|
5
12
|
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from collections.abc import Callable
|
|
15
|
+
|
|
16
|
+
from .node import SimpleDomNode
|
|
17
|
+
from .sanitize import SanitizationPolicy
|
|
18
|
+
from .tokens import ParseError
|
|
19
|
+
from .transforms import Transform
|
|
20
|
+
|
|
6
21
|
|
|
7
22
|
class StrictModeError(SyntaxError):
|
|
8
23
|
"""Raised when strict mode encounters a parse error.
|
|
@@ -11,7 +26,9 @@ class StrictModeError(SyntaxError):
|
|
|
11
26
|
with source location highlighting.
|
|
12
27
|
"""
|
|
13
28
|
|
|
14
|
-
|
|
29
|
+
error: ParseError
|
|
30
|
+
|
|
31
|
+
def __init__(self, error: ParseError) -> None:
|
|
15
32
|
self.error = error
|
|
16
33
|
# Use the ParseError's as_exception() to get enhanced display
|
|
17
34
|
exc = error.as_exception()
|
|
@@ -26,24 +43,58 @@ class StrictModeError(SyntaxError):
|
|
|
26
43
|
|
|
27
44
|
|
|
28
45
|
class JustHTML:
|
|
29
|
-
__slots__ = ("debug", "errors", "fragment_context", "root", "tokenizer", "tree_builder")
|
|
46
|
+
__slots__ = ("debug", "encoding", "errors", "fragment_context", "root", "tokenizer", "tree_builder")
|
|
47
|
+
|
|
48
|
+
debug: bool
|
|
49
|
+
encoding: str | None
|
|
50
|
+
errors: list[ParseError]
|
|
51
|
+
fragment_context: FragmentContext | None
|
|
52
|
+
root: SimpleDomNode
|
|
53
|
+
tokenizer: Tokenizer
|
|
54
|
+
tree_builder: TreeBuilder
|
|
30
55
|
|
|
31
56
|
def __init__(
|
|
32
57
|
self,
|
|
33
|
-
html,
|
|
58
|
+
html: str | bytes | bytearray | memoryview | None,
|
|
34
59
|
*,
|
|
35
|
-
collect_errors=False,
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
60
|
+
collect_errors: bool = False,
|
|
61
|
+
track_node_locations: bool = False,
|
|
62
|
+
debug: bool = False,
|
|
63
|
+
encoding: str | None = None,
|
|
64
|
+
fragment: bool = False,
|
|
65
|
+
fragment_context: FragmentContext | None = None,
|
|
66
|
+
iframe_srcdoc: bool = False,
|
|
67
|
+
strict: bool = False,
|
|
68
|
+
tokenizer_opts: TokenizerOpts | None = None,
|
|
69
|
+
tree_builder: TreeBuilder | None = None,
|
|
70
|
+
transforms: list[Transform] | None = None,
|
|
71
|
+
) -> None:
|
|
72
|
+
if fragment_context is not None:
|
|
73
|
+
fragment = True
|
|
74
|
+
|
|
75
|
+
if fragment and fragment_context is None:
|
|
76
|
+
fragment_context = FragmentContext("div")
|
|
77
|
+
|
|
78
|
+
# Compile transforms early so invalid selectors fail fast.
|
|
79
|
+
compiled_transforms = None
|
|
80
|
+
if transforms:
|
|
81
|
+
compiled_transforms = compile_transforms(tuple(transforms))
|
|
82
|
+
|
|
43
83
|
self.debug = bool(debug)
|
|
44
84
|
self.fragment_context = fragment_context
|
|
45
|
-
|
|
46
|
-
|
|
85
|
+
self.encoding = None
|
|
86
|
+
|
|
87
|
+
html_str: str
|
|
88
|
+
if isinstance(html, (bytes, bytearray, memoryview)):
|
|
89
|
+
html_str, chosen = decode_html(bytes(html), transport_encoding=encoding)
|
|
90
|
+
self.encoding = chosen
|
|
91
|
+
elif html is not None:
|
|
92
|
+
html_str = str(html)
|
|
93
|
+
else:
|
|
94
|
+
html_str = ""
|
|
95
|
+
|
|
96
|
+
# Enable error collection if strict mode is on.
|
|
97
|
+
# Node location tracking is opt-in to avoid slowing down the common case.
|
|
47
98
|
should_collect = collect_errors or strict
|
|
48
99
|
|
|
49
100
|
self.tree_builder = tree_builder or TreeBuilder(
|
|
@@ -63,24 +114,140 @@ class JustHTML:
|
|
|
63
114
|
elif tag_name in ("plaintext", "script"):
|
|
64
115
|
opts.initial_state = Tokenizer.PLAINTEXT
|
|
65
116
|
|
|
66
|
-
self.tokenizer = Tokenizer(
|
|
117
|
+
self.tokenizer = Tokenizer(
|
|
118
|
+
self.tree_builder,
|
|
119
|
+
opts,
|
|
120
|
+
collect_errors=should_collect,
|
|
121
|
+
track_node_locations=bool(track_node_locations),
|
|
122
|
+
)
|
|
67
123
|
# Link tokenizer to tree_builder for position info
|
|
68
124
|
self.tree_builder.tokenizer = self.tokenizer
|
|
69
125
|
|
|
70
|
-
self.tokenizer.run(
|
|
126
|
+
self.tokenizer.run(html_str)
|
|
71
127
|
self.root = self.tree_builder.finish()
|
|
72
128
|
|
|
73
|
-
|
|
74
|
-
|
|
129
|
+
if compiled_transforms is not None:
|
|
130
|
+
apply_compiled_transforms(self.root, compiled_transforms)
|
|
131
|
+
|
|
132
|
+
if should_collect:
|
|
133
|
+
# Merge errors from both tokenizer and tree builder.
|
|
134
|
+
# Public API: users expect errors to be ordered by input position.
|
|
135
|
+
merged_errors = self.tokenizer.errors + self.tree_builder.errors
|
|
136
|
+
self.errors = self._sorted_errors(merged_errors)
|
|
137
|
+
else:
|
|
138
|
+
self.errors = []
|
|
75
139
|
|
|
76
140
|
# In strict mode, raise on first error
|
|
77
141
|
if strict and self.errors:
|
|
78
142
|
raise StrictModeError(self.errors[0])
|
|
79
143
|
|
|
80
|
-
def query(self, selector):
|
|
144
|
+
def query(self, selector: str) -> list[Any]:
    """Run a CSS selector against the parsed tree.

    Thin convenience wrapper: ``selector`` is handed unchanged to
    ``self.root.query`` and whatever that call returns is returned as-is.
    """
    root = self.root
    return root.query(selector)
|
|
83
147
|
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
148
|
+
@staticmethod
def _sorted_errors(errors: list[ParseError]) -> list[ParseError]:
    """Return a new list holding ``errors`` ordered by source position.

    Errors are compared by ``line`` first and ``column`` second. When either
    field is ``None`` a large placeholder (1_000_000_000) is used for that
    field instead, which places such errors after any smaller real position.
    Errors whose keys compare equal keep the relative order they had in the
    input.
    """
    unknown = 1_000_000_000  # stand-in rank for a missing line/column

    def position(err: ParseError) -> tuple[int, int]:
        line = unknown if err.line is None else err.line
        column = unknown if err.column is None else err.column
        return (line, column)

    # sorted() is stable, so equal positions retain their input order without
    # needing an explicit index as a tie-breaker.
    return sorted(errors, key=position)
|
|
162
|
+
|
|
163
|
+
def _set_security_errors(self, errors: list[ParseError]) -> None:
    """Swap the security-category entries of ``self.errors`` for ``errors``.

    Every existing entry whose ``category`` is ``"security"`` is dropped, the
    given ``errors`` are appended to what remains, and the combined list is
    stored back on ``self.errors`` after passing through
    ``self._sorted_errors``. When both the current list and ``errors`` are
    empty, nothing is touched.
    """
    if not (self.errors or errors):
        return

    # Keep everything that did not come from a security check.
    retained = []
    for err in self.errors:
        if err.category != "security":
            retained.append(err)

    self.errors = self._sorted_errors(retained + errors)
|
|
169
|
+
|
|
170
|
+
def _with_security_error_collection(
    self,
    policy: SanitizationPolicy | None,
    serialize: Callable[[], str],
) -> str:
    """Run ``serialize`` and keep the security entries of ``self.errors`` current.

    If ``policy`` is given and its ``unsafe_handling`` is ``"collect"``, the
    policy's collected security errors are reset, ``serialize`` is called, and
    whatever the policy collected is then handed to
    ``self._set_security_errors``.

    In every other case ``self._set_security_errors([])`` is called first and
    ``serialize`` afterwards.

    Returns the string produced by ``serialize``.
    """
    collecting = policy is not None and policy.unsafe_handling == "collect"

    if not collecting:
        # Avoid stale security errors if a previous serialization used collect.
        self._set_security_errors([])
        return serialize()

    policy.reset_collected_security_errors()
    rendered = serialize()
    self._set_security_errors(policy.collected_security_errors())
    return rendered
|
|
184
|
+
|
|
185
|
+
def to_html(
    self,
    pretty: bool = True,
    indent_size: int = 2,
    *,
    safe: bool = True,
    policy: SanitizationPolicy | None = None,
) -> str:
    """Serialize the document to HTML via ``self.root.to_html``.

    - `pretty` and `indent_size` are forwarded to the root; `indent` is
      always passed as 0.
    - `safe=True` sanitizes untrusted content before serialization; the call
      is routed through `_with_security_error_collection`.
    - `safe=False` calls the root serializer directly.
    - `policy` overrides the default sanitization policy.
    """

    def render(sanitize: bool) -> str:
        # Single place that spells out the arguments forwarded to the root.
        return self.root.to_html(
            indent=0,
            indent_size=indent_size,
            pretty=pretty,
            safe=sanitize,
            policy=policy,
        )

    if safe:
        return self._with_security_error_collection(policy, lambda: render(True))
    return render(False)
|
|
217
|
+
|
|
218
|
+
def to_text(
    self,
    separator: str = " ",
    strip: bool = True,
    *,
    safe: bool = True,
    policy: SanitizationPolicy | None = None,
) -> str:
    """Return the document's concatenated text.

    - `separator` and `strip` are forwarded to `root.to_text(...)`.
    - `safe=True` sanitizes untrusted content before text extraction; the
      call is routed through `_with_security_error_collection`.
    - `safe=False` calls `root.to_text(...)` directly.
    - `policy` overrides the default sanitization policy.
    """

    def extract(sanitize: bool) -> str:
        # Delegates to the root node with the caller's options.
        return self.root.to_text(separator=separator, strip=strip, safe=sanitize, policy=policy)

    if safe:
        return self._with_security_error_collection(policy, lambda: extract(True))
    return extract(False)
|
|
240
|
+
|
|
241
|
+
def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
    """Return a GitHub Flavored Markdown representation.

    - `safe=True` sanitizes untrusted content before conversion; the call is
      routed through `_with_security_error_collection`.
    - `safe=False` calls `root.to_markdown(...)` directly.
    - `policy` overrides the default sanitization policy.
    """

    def convert(sanitize: bool) -> str:
        # Delegates to the root node; only the safe flag varies by path.
        return self.root.to_markdown(safe=sanitize, policy=policy)

    if safe:
        return self._with_security_error_collection(policy, lambda: convert(True))
    return convert(False)
|
justhtml/py.typed
ADDED
|
File without changes
|