justhtml 0.6.0__py3-none-any.whl → 0.33.0__py3-none-any.whl

This diff shows the changes between two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
justhtml/parser.py CHANGED
@@ -1,8 +1,23 @@
1
1
  """Minimal JustHTML parser entry point."""
2
2
 
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from .context import FragmentContext
8
+ from .encoding import decode_html
3
9
  from .tokenizer import Tokenizer, TokenizerOpts
10
+ from .transforms import apply_compiled_transforms, compile_transforms
4
11
  from .treebuilder import TreeBuilder
5
12
 
13
+ if TYPE_CHECKING:
14
+ from collections.abc import Callable
15
+
16
+ from .node import SimpleDomNode
17
+ from .sanitize import SanitizationPolicy
18
+ from .tokens import ParseError
19
+ from .transforms import Transform
20
+
6
21
 
7
22
  class StrictModeError(SyntaxError):
8
23
  """Raised when strict mode encounters a parse error.
@@ -11,7 +26,9 @@ class StrictModeError(SyntaxError):
11
26
  with source location highlighting.
12
27
  """
13
28
 
14
- def __init__(self, error):
29
+ error: ParseError
30
+
31
+ def __init__(self, error: ParseError) -> None:
15
32
  self.error = error
16
33
  # Use the ParseError's as_exception() to get enhanced display
17
34
  exc = error.as_exception()
@@ -26,24 +43,58 @@ class StrictModeError(SyntaxError):
26
43
 
27
44
 
28
45
  class JustHTML:
29
- __slots__ = ("debug", "errors", "fragment_context", "root", "tokenizer", "tree_builder")
46
+ __slots__ = ("debug", "encoding", "errors", "fragment_context", "root", "tokenizer", "tree_builder")
47
+
48
+ debug: bool
49
+ encoding: str | None
50
+ errors: list[ParseError]
51
+ fragment_context: FragmentContext | None
52
+ root: SimpleDomNode
53
+ tokenizer: Tokenizer
54
+ tree_builder: TreeBuilder
30
55
 
31
56
  def __init__(
32
57
  self,
33
- html,
58
+ html: str | bytes | bytearray | memoryview | None,
34
59
  *,
35
- collect_errors=False,
36
- debug=False,
37
- fragment_context=None,
38
- iframe_srcdoc=False,
39
- strict=False,
40
- tokenizer_opts=None,
41
- tree_builder=None,
42
- ):
60
+ collect_errors: bool = False,
61
+ track_node_locations: bool = False,
62
+ debug: bool = False,
63
+ encoding: str | None = None,
64
+ fragment: bool = False,
65
+ fragment_context: FragmentContext | None = None,
66
+ iframe_srcdoc: bool = False,
67
+ strict: bool = False,
68
+ tokenizer_opts: TokenizerOpts | None = None,
69
+ tree_builder: TreeBuilder | None = None,
70
+ transforms: list[Transform] | None = None,
71
+ ) -> None:
72
+ if fragment_context is not None:
73
+ fragment = True
74
+
75
+ if fragment and fragment_context is None:
76
+ fragment_context = FragmentContext("div")
77
+
78
+ # Compile transforms early so invalid selectors fail fast.
79
+ compiled_transforms = None
80
+ if transforms:
81
+ compiled_transforms = compile_transforms(tuple(transforms))
82
+
43
83
  self.debug = bool(debug)
44
84
  self.fragment_context = fragment_context
45
-
46
- # Enable error collection if strict mode is on
85
+ self.encoding = None
86
+
87
+ html_str: str
88
+ if isinstance(html, (bytes, bytearray, memoryview)):
89
+ html_str, chosen = decode_html(bytes(html), transport_encoding=encoding)
90
+ self.encoding = chosen
91
+ elif html is not None:
92
+ html_str = str(html)
93
+ else:
94
+ html_str = ""
95
+
96
+ # Enable error collection if strict mode is on.
97
+ # Node location tracking is opt-in to avoid slowing down the common case.
47
98
  should_collect = collect_errors or strict
48
99
 
49
100
  self.tree_builder = tree_builder or TreeBuilder(
@@ -63,24 +114,140 @@ class JustHTML:
63
114
  elif tag_name in ("plaintext", "script"):
64
115
  opts.initial_state = Tokenizer.PLAINTEXT
65
116
 
66
- self.tokenizer = Tokenizer(self.tree_builder, opts, collect_errors=should_collect)
117
+ self.tokenizer = Tokenizer(
118
+ self.tree_builder,
119
+ opts,
120
+ collect_errors=should_collect,
121
+ track_node_locations=bool(track_node_locations),
122
+ )
67
123
  # Link tokenizer to tree_builder for position info
68
124
  self.tree_builder.tokenizer = self.tokenizer
69
125
 
70
- self.tokenizer.run(html or "")
126
+ self.tokenizer.run(html_str)
71
127
  self.root = self.tree_builder.finish()
72
128
 
73
- # Merge errors from both tokenizer and tree builder
74
- self.errors = self.tokenizer.errors + self.tree_builder.errors
129
+ if compiled_transforms is not None:
130
+ apply_compiled_transforms(self.root, compiled_transforms)
131
+
132
+ if should_collect:
133
+ # Merge errors from both tokenizer and tree builder.
134
+ # Public API: users expect errors to be ordered by input position.
135
+ merged_errors = self.tokenizer.errors + self.tree_builder.errors
136
+ self.errors = self._sorted_errors(merged_errors)
137
+ else:
138
+ self.errors = []
75
139
 
76
140
  # In strict mode, raise on first error
77
141
  if strict and self.errors:
78
142
  raise StrictModeError(self.errors[0])
79
143
 
80
- def query(self, selector):
144
+ def query(self, selector: str) -> list[Any]:
81
145
  """Query the document using a CSS selector. Delegates to root.query()."""
82
146
  return self.root.query(selector)
83
147
 
84
- def to_html(self, pretty=True, indent_size=2):
85
- """Serialize the document to HTML. Delegates to root.to_html()."""
86
- return self.root.to_html(indent=0, indent_size=indent_size, pretty=pretty)
148
+ @staticmethod
149
+ def _sorted_errors(errors: list[ParseError]) -> list[ParseError]:
150
+ indexed_errors = enumerate(errors)
151
+ return [
152
+ e
153
+ for _, e in sorted(
154
+ indexed_errors,
155
+ key=lambda t: (
156
+ t[1].line if t[1].line is not None else 1_000_000_000,
157
+ t[1].column if t[1].column is not None else 1_000_000_000,
158
+ t[0],
159
+ ),
160
+ )
161
+ ]
162
+
163
+ def _set_security_errors(self, errors: list[ParseError]) -> None:
164
+ if not self.errors and not errors:
165
+ return
166
+
167
+ base = [e for e in self.errors if e.category != "security"]
168
+ self.errors = self._sorted_errors(base + errors)
169
+
170
+ def _with_security_error_collection(
171
+ self,
172
+ policy: SanitizationPolicy | None,
173
+ serialize: Callable[[], str],
174
+ ) -> str:
175
+ if policy is not None and policy.unsafe_handling == "collect":
176
+ policy.reset_collected_security_errors()
177
+ out = serialize()
178
+ self._set_security_errors(policy.collected_security_errors())
179
+ return out
180
+
181
+ # Avoid stale security errors if a previous serialization used collect.
182
+ self._set_security_errors([])
183
+ return serialize()
184
+
185
+ def to_html(
186
+ self,
187
+ pretty: bool = True,
188
+ indent_size: int = 2,
189
+ *,
190
+ safe: bool = True,
191
+ policy: SanitizationPolicy | None = None,
192
+ ) -> str:
193
+ """Serialize the document to HTML.
194
+
195
+ - `safe=True` sanitizes untrusted content before serialization.
196
+ - `policy` overrides the default sanitization policy.
197
+ """
198
+ if not safe:
199
+ return self.root.to_html(
200
+ indent=0,
201
+ indent_size=indent_size,
202
+ pretty=pretty,
203
+ safe=False,
204
+ policy=policy,
205
+ )
206
+
207
+ return self._with_security_error_collection(
208
+ policy,
209
+ lambda: self.root.to_html(
210
+ indent=0,
211
+ indent_size=indent_size,
212
+ pretty=pretty,
213
+ safe=True,
214
+ policy=policy,
215
+ ),
216
+ )
217
+
218
+ def to_text(
219
+ self,
220
+ separator: str = " ",
221
+ strip: bool = True,
222
+ *,
223
+ safe: bool = True,
224
+ policy: SanitizationPolicy | None = None,
225
+ ) -> str:
226
+ """Return the document's concatenated text.
227
+
228
+ - `safe=True` sanitizes untrusted content before text extraction.
229
+ - `policy` overrides the default sanitization policy.
230
+
231
+ Delegates to `root.to_text(...)`.
232
+ """
233
+ if not safe:
234
+ return self.root.to_text(separator=separator, strip=strip, safe=False, policy=policy)
235
+
236
+ return self._with_security_error_collection(
237
+ policy,
238
+ lambda: self.root.to_text(separator=separator, strip=strip, safe=True, policy=policy),
239
+ )
240
+
241
+ def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
242
+ """Return a GitHub Flavored Markdown representation.
243
+
244
+ - `safe=True` sanitizes untrusted content before conversion.
245
+ - `policy` overrides the default sanitization policy.
246
+ """
247
+ if not safe:
248
+ return self.root.to_markdown(safe=False, policy=policy)
249
+
250
+ return self._with_security_error_collection(
251
+ policy,
252
+ lambda: self.root.to_markdown(safe=True, policy=policy),
253
+ )
justhtml/py.typed ADDED
File added with no content (presumably a PEP 561 `py.typed` marker file, which is intentionally empty — verify against the package).