justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of justhtml might be problematic.
- justhtml/__init__.py +48 -0
- justhtml/__main__.py +86 -17
- justhtml/constants.py +12 -0
- justhtml/entities.py +45 -7
- justhtml/errors.py +17 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +385 -97
- justhtml/parser.py +139 -16
- justhtml/sanitize.py +992 -0
- justhtml/selector.py +117 -19
- justhtml/serialize.py +671 -41
- justhtml/tokenizer.py +364 -194
- justhtml/tokens.py +28 -5
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +297 -204
- justhtml/treebuilder_modes.py +208 -138
- justhtml-0.38.0.dist-info/METADATA +213 -0
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
justhtml/parser.py
CHANGED
```diff
@@ -4,14 +4,17 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
+from .context import FragmentContext
 from .encoding import decode_html
 from .tokenizer import Tokenizer, TokenizerOpts
+from .transforms import apply_compiled_transforms, compile_transforms
 from .treebuilder import TreeBuilder
 
 if TYPE_CHECKING:
-    from .context import FragmentContext
     from .node import SimpleDomNode
+    from .sanitize import SanitizationPolicy
     from .tokens import ParseError
+    from .transforms import TransformSpec
 
 
 class StrictModeError(SyntaxError):
```
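Note the import shuffle: `FragmentContext` moves out of the `TYPE_CHECKING` block because the constructor now instantiates it at runtime (it calls `FragmentContext("div")` below), while `SanitizationPolicy` and `TransformSpec` remain annotation-only. A minimal sketch of why that distinction matters (import paths taken from the diff; the helper function is illustrative only):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

from justhtml.context import FragmentContext  # runtime import: we construct it below

if TYPE_CHECKING:
    # Annotation-only: erased at runtime, so it adds no import cost
    # and cannot create an import cycle.
    from justhtml.sanitize import SanitizationPolicy


def make_default(policy: SanitizationPolicy | None = None) -> FragmentContext:
    # FragmentContext is actually called here, so a TYPE_CHECKING-only
    # import would raise NameError at runtime.
    return FragmentContext("div")
```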
```diff
@@ -52,15 +55,49 @@ class JustHTML:
         self,
         html: str | bytes | bytearray | memoryview | None,
         *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
         collect_errors: bool = False,
+        track_node_locations: bool = False,
         debug: bool = False,
         encoding: str | None = None,
+        fragment: bool = False,
         fragment_context: FragmentContext | None = None,
         iframe_srcdoc: bool = False,
         strict: bool = False,
         tokenizer_opts: TokenizerOpts | None = None,
         tree_builder: TreeBuilder | None = None,
+        transforms: list[TransformSpec] | None = None,
     ) -> None:
+        if fragment_context is not None:
+            fragment = True
+
+        if fragment and fragment_context is None:
+            fragment_context = FragmentContext("div")
+
+        track_tag_spans = False
+        has_sanitize_transform = False
+        needs_escape_incomplete_tags = False
+        if transforms:
+            from .sanitize import DEFAULT_POLICY  # noqa: PLC0415
+            from .transforms import Sanitize  # noqa: PLC0415
+
+            for t in transforms:
+                if isinstance(t, Sanitize):
+                    has_sanitize_transform = True
+                    effective = t.policy or DEFAULT_POLICY
+                    if effective.disallowed_tag_handling == "escape":
+                        track_tag_spans = True
+                        needs_escape_incomplete_tags = True
+                    break
+
+        # If we will auto-sanitize (safe=True and no Sanitize in transforms),
+        # escape-mode tag reconstruction may require tracking tag spans.
+        if safe and not has_sanitize_transform and policy is not None:
+            if policy.disallowed_tag_handling == "escape":
+                track_tag_spans = True
+                needs_escape_incomplete_tags = True
+
         self.debug = bool(debug)
         self.fragment_context = fragment_context
         self.encoding = None
```
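Taken together, the new keyword arguments make the constructor the single configuration point: safety, policy, fragment parsing, and transforms are all decided up front. A hedged usage sketch (the `JustHTML` class and `FragmentContext("...")` signature come from this diff; whether the package re-exports them at top level is an assumption, so the submodule paths are used):

```python
from justhtml.context import FragmentContext
from justhtml.parser import JustHTML

# Default: safe=True, so the tree is sanitized once during construction.
doc = JustHTML("<p onclick=alert(1)>hi</p>")

# fragment=True without a context now defaults to FragmentContext("div").
frag = JustHTML("<li>item</li>", fragment=True)

# Passing fragment_context implies fragment=True, per the new normalization.
row = JustHTML("<tr><td>1</td></tr>", fragment_context=FragmentContext("tbody"))
```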
```diff
@@ -74,15 +111,19 @@ class JustHTML:
         else:
             html_str = ""
 
-        # Enable error collection if strict mode is on
+        # Enable error collection if strict mode is on.
+        # Node location tracking is opt-in to avoid slowing down the common case.
         should_collect = collect_errors or strict
 
         self.tree_builder = tree_builder or TreeBuilder(
             fragment_context=fragment_context,
             iframe_srcdoc=iframe_srcdoc,
             collect_errors=should_collect,
+            track_tag_spans=track_tag_spans,
         )
         opts = tokenizer_opts or TokenizerOpts()
+        if needs_escape_incomplete_tags:
+            opts.emit_bogus_markup_as_text = True
 
         # For RAWTEXT fragment contexts, set initial tokenizer state and rawtext tag
         if fragment_context and not fragment_context.namespace:
```
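One subtlety worth flagging: `opts = tokenizer_opts or TokenizerOpts()` aliases the caller's options object, so the `emit_bogus_markup_as_text` override mutates a user-supplied `TokenizerOpts` in place. A small sketch of the aliasing behavior (plain Python; the dataclass is a stand-in for justhtml's real `TokenizerOpts`, not its actual definition):

```python
from dataclasses import dataclass


@dataclass
class TokenizerOpts:  # hypothetical stand-in
    emit_bogus_markup_as_text: bool = False


def build(tokenizer_opts: TokenizerOpts | None, needs_escape: bool) -> TokenizerOpts:
    opts = tokenizer_opts or TokenizerOpts()  # aliases the caller's object
    if needs_escape:
        opts.emit_bogus_markup_as_text = True  # visible to the caller too
    return opts


shared = TokenizerOpts()
build(shared, needs_escape=True)
assert shared.emit_bogus_markup_as_text  # the caller's object was mutated
```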
```diff
@@ -94,15 +135,73 @@ class JustHTML:
             elif tag_name in ("plaintext", "script"):
                 opts.initial_state = Tokenizer.PLAINTEXT
 
-        self.tokenizer = Tokenizer(
+        self.tokenizer = Tokenizer(
+            self.tree_builder,
+            opts,
+            collect_errors=should_collect,
+            track_node_locations=bool(track_node_locations),
+            track_tag_positions=bool(track_node_locations) or track_tag_spans,
+        )
         # Link tokenizer to tree_builder for position info
         self.tree_builder.tokenizer = self.tokenizer
 
         self.tokenizer.run(html_str)
         self.root = self.tree_builder.finish()
 
-
-
+        transform_errors: list[ParseError] = []
+
+        # Apply transforms after parse.
+        # Safety model: when safe=True, the in-memory tree is sanitized exactly once
+        # during construction by ensuring a Sanitize transform runs.
+        if transforms or safe:
+            from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY  # noqa: PLC0415
+            from .transforms import Sanitize  # noqa: PLC0415
+
+            final_transforms: list[TransformSpec] = list(transforms or [])
+
+            # Normalize explicit Sanitize() transforms to use the same default policy
+            # choice as the old safe-output sanitizer (document vs fragment).
+            if final_transforms:
+                default_mode_policy = DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY
+                for i, t in enumerate(final_transforms):
+                    if isinstance(t, Sanitize) and t.policy is None:
+                        final_transforms[i] = Sanitize(
+                            policy=default_mode_policy, enabled=t.enabled, callback=t.callback, report=t.report
+                        )
+
+            # Auto-append a final Sanitize step only if the user didn't include
+            # Sanitize anywhere in their transform list.
+            if safe and not any(isinstance(t, Sanitize) for t in final_transforms):
+                effective_policy = (
+                    policy
+                    if policy is not None
+                    else (DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY)
+                )
+                # Avoid stale collected errors on reused policy objects.
+                if effective_policy.unsafe_handling == "collect":
+                    effective_policy.reset_collected_security_errors()
+                final_transforms.append(Sanitize(policy=effective_policy))
+
+            if final_transforms:
+                compiled_transforms = compile_transforms(tuple(final_transforms))
+                apply_compiled_transforms(self.root, compiled_transforms, errors=transform_errors)
+
+            # Merge collected security errors into the document error list.
+            # This mirrors the old behavior where safe output could feed
+            # security findings into doc.errors.
+            for t in final_transforms:
+                if isinstance(t, Sanitize):
+                    t_policy = t.policy
+                    if t_policy is not None and t_policy.unsafe_handling == "collect":
+                        transform_errors.extend(t_policy.collected_security_errors())
+
+        if should_collect:
+            # Merge errors from both tokenizer and tree builder.
+            # Public API: users expect errors to be ordered by input position.
+            merged_errors = self.tokenizer.errors + self.tree_builder.errors + transform_errors
+            self.errors = self._sorted_errors(merged_errors)
+        else:
+            self.errors = transform_errors
 
         # In strict mode, raise on first error
         if strict and self.errors:
```
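The comments spell out the new safety model: sanitization runs exactly once, at construction, either via an explicit `Sanitize` transform or an auto-appended one. A hedged sketch of the resulting call patterns (`Sanitize`, `DEFAULT_POLICY`, and the `policy=` keyword appear in the diff; anything beyond that is an assumption):

```python
from justhtml.parser import JustHTML
from justhtml.sanitize import DEFAULT_POLICY
from justhtml.transforms import Sanitize

# 1) safe=True with no transforms: a Sanitize step is auto-appended,
#    using the document or fragment default policy as appropriate.
doc = JustHTML("<script>evil()</script><p>ok</p>")

# 2) An explicit Sanitize anywhere in transforms suppresses the auto-append,
#    so sanitization still runs exactly once, where the user placed it.
doc2 = JustHTML("<p>ok</p>", transforms=[Sanitize(policy=DEFAULT_POLICY)])

# 3) Per the diff, a policy with unsafe_handling == "collect" feeds its
#    security findings into doc.errors alongside parse errors.
```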
```diff
@@ -112,20 +211,44 @@ class JustHTML:
         """Query the document using a CSS selector. Delegates to root.query()."""
         return self.root.query(selector)
 
-
-
-
-
-
-
+    @staticmethod
+    def _sorted_errors(errors: list[ParseError]) -> list[ParseError]:
+        indexed_errors = enumerate(errors)
+        return [
+            e
+            for _, e in sorted(
+                indexed_errors,
+                key=lambda t: (
+                    t[1].line if t[1].line is not None else 1_000_000_000,
+                    t[1].column if t[1].column is not None else 1_000_000_000,
+                    t[0],
+                ),
+            )
+        ]
+
+    def to_html(
+        self,
+        pretty: bool = True,
+        indent_size: int = 2,
+    ) -> str:
+        """Serialize the document to HTML.
 
-
+        Sanitization (when enabled) happens during construction.
         """
+        return self.root.to_html(
+            indent=0,
+            indent_size=indent_size,
+            pretty=pretty,
+        )
+
+    def to_text(
+        self,
+        separator: str = " ",
+        strip: bool = True,
+    ) -> str:
+        """Return the document's concatenated text."""
         return self.root.to_text(separator=separator, strip=strip)
 
     def to_markdown(self) -> str:
-        """Return a GitHub Flavored Markdown representation.
-
-        Delegates to `root.to_markdown()`.
-        """
+        """Return a GitHub Flavored Markdown representation."""
        return self.root.to_markdown()
```