justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +6 -0
- justhtml/__main__.py +49 -16
- justhtml/entities.py +45 -7
- justhtml/errors.py +9 -0
- justhtml/node.py +358 -89
- justhtml/parser.py +70 -14
- justhtml/sanitize.py +763 -0
- justhtml/selector.py +114 -18
- justhtml/serialize.py +332 -28
- justhtml/tokenizer.py +249 -179
- justhtml/tokens.py +8 -3
- justhtml/treebuilder.py +50 -14
- justhtml/treebuilder_modes.py +100 -36
- justhtml-0.24.0.dist-info/METADATA +192 -0
- justhtml-0.24.0.dist-info/RECORD +24 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.24.0.dist-info}/entry_points.txt +0 -0
justhtml/parser.py
CHANGED
|
@@ -4,13 +4,14 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import TYPE_CHECKING, Any
|
|
6
6
|
|
|
7
|
+
from .context import FragmentContext
|
|
7
8
|
from .encoding import decode_html
|
|
8
9
|
from .tokenizer import Tokenizer, TokenizerOpts
|
|
9
10
|
from .treebuilder import TreeBuilder
|
|
10
11
|
|
|
11
12
|
if TYPE_CHECKING:
|
|
12
|
-
from .context import FragmentContext
|
|
13
13
|
from .node import SimpleDomNode
|
|
14
|
+
from .sanitize import SanitizationPolicy
|
|
14
15
|
from .tokens import ParseError
|
|
15
16
|
|
|
16
17
|
|
|
@@ -53,14 +54,22 @@ class JustHTML:
|
|
|
53
54
|
html: str | bytes | bytearray | memoryview | None,
|
|
54
55
|
*,
|
|
55
56
|
collect_errors: bool = False,
|
|
57
|
+
track_node_locations: bool = False,
|
|
56
58
|
debug: bool = False,
|
|
57
59
|
encoding: str | None = None,
|
|
60
|
+
fragment: bool = False,
|
|
58
61
|
fragment_context: FragmentContext | None = None,
|
|
59
62
|
iframe_srcdoc: bool = False,
|
|
60
63
|
strict: bool = False,
|
|
61
64
|
tokenizer_opts: TokenizerOpts | None = None,
|
|
62
65
|
tree_builder: TreeBuilder | None = None,
|
|
63
66
|
) -> None:
|
|
67
|
+
if fragment_context is not None:
|
|
68
|
+
fragment = True
|
|
69
|
+
|
|
70
|
+
if fragment and fragment_context is None:
|
|
71
|
+
fragment_context = FragmentContext("div")
|
|
72
|
+
|
|
64
73
|
self.debug = bool(debug)
|
|
65
74
|
self.fragment_context = fragment_context
|
|
66
75
|
self.encoding = None
|
|
@@ -74,7 +83,8 @@ class JustHTML:
|
|
|
74
83
|
else:
|
|
75
84
|
html_str = ""
|
|
76
85
|
|
|
77
|
-
# Enable error collection if strict mode is on
|
|
86
|
+
# Enable error collection if strict mode is on.
|
|
87
|
+
# Node location tracking is opt-in to avoid slowing down the common case.
|
|
78
88
|
should_collect = collect_errors or strict
|
|
79
89
|
|
|
80
90
|
self.tree_builder = tree_builder or TreeBuilder(
|
|
@@ -94,15 +104,33 @@ class JustHTML:
|
|
|
94
104
|
elif tag_name in ("plaintext", "script"):
|
|
95
105
|
opts.initial_state = Tokenizer.PLAINTEXT
|
|
96
106
|
|
|
97
|
-
self.tokenizer = Tokenizer(
|
|
107
|
+
self.tokenizer = Tokenizer(
|
|
108
|
+
self.tree_builder,
|
|
109
|
+
opts,
|
|
110
|
+
collect_errors=should_collect,
|
|
111
|
+
track_node_locations=bool(track_node_locations),
|
|
112
|
+
)
|
|
98
113
|
# Link tokenizer to tree_builder for position info
|
|
99
114
|
self.tree_builder.tokenizer = self.tokenizer
|
|
100
115
|
|
|
101
116
|
self.tokenizer.run(html_str)
|
|
102
117
|
self.root = self.tree_builder.finish()
|
|
103
118
|
|
|
104
|
-
# Merge errors from both tokenizer and tree builder
|
|
105
|
-
|
|
119
|
+
# Merge errors from both tokenizer and tree builder.
|
|
120
|
+
# Public API: users expect errors to be ordered by input position.
|
|
121
|
+
merged_errors = self.tokenizer.errors + self.tree_builder.errors
|
|
122
|
+
indexed_errors = enumerate(merged_errors)
|
|
123
|
+
self.errors = [
|
|
124
|
+
e
|
|
125
|
+
for _, e in sorted(
|
|
126
|
+
indexed_errors,
|
|
127
|
+
key=lambda t: (
|
|
128
|
+
t[1].line if t[1].line is not None else 1_000_000_000,
|
|
129
|
+
t[1].column if t[1].column is not None else 1_000_000_000,
|
|
130
|
+
t[0],
|
|
131
|
+
),
|
|
132
|
+
)
|
|
133
|
+
]
|
|
106
134
|
|
|
107
135
|
# In strict mode, raise on first error
|
|
108
136
|
if strict and self.errors:
|
|
@@ -112,20 +140,48 @@ class JustHTML:
|
|
|
112
140
|
"""Query the document using a CSS selector. Delegates to root.query()."""
|
|
113
141
|
return self.root.query(selector)
|
|
114
142
|
|
|
115
|
-
def to_html(
|
|
116
|
-
|
|
117
|
-
|
|
143
|
+
def to_html(
|
|
144
|
+
self,
|
|
145
|
+
pretty: bool = True,
|
|
146
|
+
indent_size: int = 2,
|
|
147
|
+
*,
|
|
148
|
+
safe: bool = True,
|
|
149
|
+
policy: SanitizationPolicy | None = None,
|
|
150
|
+
) -> str:
|
|
151
|
+
"""Serialize the document to HTML.
|
|
152
|
+
|
|
153
|
+
- `safe=True` sanitizes untrusted content before serialization.
|
|
154
|
+
- `policy` overrides the default sanitization policy.
|
|
155
|
+
"""
|
|
156
|
+
return self.root.to_html(
|
|
157
|
+
indent=0,
|
|
158
|
+
indent_size=indent_size,
|
|
159
|
+
pretty=pretty,
|
|
160
|
+
safe=safe,
|
|
161
|
+
policy=policy,
|
|
162
|
+
)
|
|
118
163
|
|
|
119
|
-
def to_text(
|
|
164
|
+
def to_text(
|
|
165
|
+
self,
|
|
166
|
+
separator: str = " ",
|
|
167
|
+
strip: bool = True,
|
|
168
|
+
*,
|
|
169
|
+
safe: bool = True,
|
|
170
|
+
policy: SanitizationPolicy | None = None,
|
|
171
|
+
) -> str:
|
|
120
172
|
"""Return the document's concatenated text.
|
|
121
173
|
|
|
122
|
-
|
|
174
|
+
- `safe=True` sanitizes untrusted content before text extraction.
|
|
175
|
+
- `policy` overrides the default sanitization policy.
|
|
176
|
+
|
|
177
|
+
Delegates to `root.to_text(...)`.
|
|
123
178
|
"""
|
|
124
|
-
return self.root.to_text(separator=separator, strip=strip)
|
|
179
|
+
return self.root.to_text(separator=separator, strip=strip, safe=safe, policy=policy)
|
|
125
180
|
|
|
126
|
-
def to_markdown(self) -> str:
|
|
181
|
+
def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
|
|
127
182
|
"""Return a GitHub Flavored Markdown representation.
|
|
128
183
|
|
|
129
|
-
|
|
184
|
+
- `safe=True` sanitizes untrusted content before conversion.
|
|
185
|
+
- `policy` overrides the default sanitization policy.
|
|
130
186
|
"""
|
|
131
|
-
return self.root.to_markdown()
|
|
187
|
+
return self.root.to_markdown(safe=safe, policy=policy)
|