justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl


justhtml/parser.py CHANGED
```diff
@@ -4,14 +4,17 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any
 
+from .context import FragmentContext
 from .encoding import decode_html
 from .tokenizer import Tokenizer, TokenizerOpts
+from .transforms import apply_compiled_transforms, compile_transforms
 from .treebuilder import TreeBuilder
 
 if TYPE_CHECKING:
-    from .context import FragmentContext
     from .node import SimpleDomNode
+    from .sanitize import SanitizationPolicy
     from .tokens import ParseError
+    from .transforms import TransformSpec
 
 
 class StrictModeError(SyntaxError):
```
```diff
@@ -52,15 +55,49 @@ class JustHTML:
         self,
         html: str | bytes | bytearray | memoryview | None,
         *,
+        safe: bool = True,
+        policy: SanitizationPolicy | None = None,
         collect_errors: bool = False,
+        track_node_locations: bool = False,
         debug: bool = False,
         encoding: str | None = None,
+        fragment: bool = False,
         fragment_context: FragmentContext | None = None,
         iframe_srcdoc: bool = False,
         strict: bool = False,
         tokenizer_opts: TokenizerOpts | None = None,
         tree_builder: TreeBuilder | None = None,
+        transforms: list[TransformSpec] | None = None,
     ) -> None:
+        if fragment_context is not None:
+            fragment = True
+
+        if fragment and fragment_context is None:
+            fragment_context = FragmentContext("div")
+
+        track_tag_spans = False
+        has_sanitize_transform = False
+        needs_escape_incomplete_tags = False
+        if transforms:
+            from .sanitize import DEFAULT_POLICY  # noqa: PLC0415
+            from .transforms import Sanitize  # noqa: PLC0415
+
+            for t in transforms:
+                if isinstance(t, Sanitize):
+                    has_sanitize_transform = True
+                    effective = t.policy or DEFAULT_POLICY
+                    if effective.disallowed_tag_handling == "escape":
+                        track_tag_spans = True
+                        needs_escape_incomplete_tags = True
+                    break
+
+        # If we will auto-sanitize (safe=True and no Sanitize in transforms),
+        # escape-mode tag reconstruction may require tracking tag spans.
+        if safe and not has_sanitize_transform and policy is not None:
+            if policy.disallowed_tag_handling == "escape":
+                track_tag_spans = True
+                needs_escape_incomplete_tags = True
+
         self.debug = bool(debug)
         self.fragment_context = fragment_context
         self.encoding = None
```
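
The constructor now takes sanitization and fragment options directly. A minimal usage sketch based on the signature above — the package-root import path and the exact sanitizer behavior are assumptions inferred from this diff, not documented API:

```python
from justhtml import JustHTML  # assumed export path
from justhtml.context import FragmentContext

# safe=True is the new default: the tree is sanitized during construction.
doc = JustHTML('<p onclick="steal()">hi</p>')

# fragment=True parses a snippet; per the code above, it defaults to a
# FragmentContext("div") when no explicit context is given.
frag = JustHTML("<li>one</li><li>two</li>", fragment=True)

# Passing fragment_context implies fragment parsing (fragment is forced on).
rows = JustHTML("<tr><td>1</td></tr>", fragment_context=FragmentContext("tbody"))
```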
```diff
@@ -74,15 +111,19 @@ class JustHTML:
         else:
             html_str = ""
 
-        # Enable error collection if strict mode is on
+        # Enable error collection if strict mode is on.
+        # Node location tracking is opt-in to avoid slowing down the common case.
         should_collect = collect_errors or strict
 
         self.tree_builder = tree_builder or TreeBuilder(
             fragment_context=fragment_context,
             iframe_srcdoc=iframe_srcdoc,
             collect_errors=should_collect,
+            track_tag_spans=track_tag_spans,
         )
         opts = tokenizer_opts or TokenizerOpts()
+        if needs_escape_incomplete_tags:
+            opts.emit_bogus_markup_as_text = True
 
         # For RAWTEXT fragment contexts, set initial tokenizer state and rawtext tag
         if fragment_context and not fragment_context.namespace:
```
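
Per the new comments, error collection and node-location tracking stay opt-in so the common parse path pays nothing for them. A caller-side sketch (same assumed import; the `line`/`column` attributes on `ParseError` are visible in `_sorted_errors` below):

```python
doc = JustHTML("<p><b>unclosed", collect_errors=True, track_node_locations=True)
for err in doc.errors:
    # Positions are optional; errors with unknown locations sort last.
    print(err.line, err.column, err)
```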
```diff
@@ -94,15 +135,73 @@ class JustHTML:
             elif tag_name in ("plaintext", "script"):
                 opts.initial_state = Tokenizer.PLAINTEXT
 
-        self.tokenizer = Tokenizer(self.tree_builder, opts, collect_errors=should_collect)
+        self.tokenizer = Tokenizer(
+            self.tree_builder,
+            opts,
+            collect_errors=should_collect,
+            track_node_locations=bool(track_node_locations),
+            track_tag_positions=bool(track_node_locations) or track_tag_spans,
+        )
         # Link tokenizer to tree_builder for position info
         self.tree_builder.tokenizer = self.tokenizer
 
         self.tokenizer.run(html_str)
         self.root = self.tree_builder.finish()
 
-        # Merge errors from both tokenizer and tree builder
-        self.errors = self.tokenizer.errors + self.tree_builder.errors
+        transform_errors: list[ParseError] = []
+
+        # Apply transforms after parse.
+        # Safety model: when safe=True, the in-memory tree is sanitized exactly once
+        # during construction by ensuring a Sanitize transform runs.
+        if transforms or safe:
+            from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY  # noqa: PLC0415
+            from .transforms import Sanitize  # noqa: PLC0415
+
+            final_transforms: list[TransformSpec] = list(transforms or [])
+
+            # Normalize explicit Sanitize() transforms to use the same default policy
+            # choice as the old safe-output sanitizer (document vs fragment).
+            if final_transforms:
+                default_mode_policy = DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY
+                for i, t in enumerate(final_transforms):
+                    if isinstance(t, Sanitize) and t.policy is None:
+                        final_transforms[i] = Sanitize(
+                            policy=default_mode_policy, enabled=t.enabled, callback=t.callback, report=t.report
+                        )
+
+            # Auto-append a final Sanitize step only if the user didn't include
+            # Sanitize anywhere in their transform list.
+            if safe and not any(isinstance(t, Sanitize) for t in final_transforms):
+                effective_policy = (
+                    policy
+                    if policy is not None
+                    else (DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY)
+                )
+                # Avoid stale collected errors on reused policy objects.
+                if effective_policy.unsafe_handling == "collect":
+                    effective_policy.reset_collected_security_errors()
+                final_transforms.append(Sanitize(policy=effective_policy))
+
+            if final_transforms:
+                compiled_transforms = compile_transforms(tuple(final_transforms))
+                apply_compiled_transforms(self.root, compiled_transforms, errors=transform_errors)
+
+            # Merge collected security errors into the document error list.
+            # This mirrors the old behavior where safe output could feed
+            # security findings into doc.errors.
+            for t in final_transforms:
+                if isinstance(t, Sanitize):
+                    t_policy = t.policy
+                    if t_policy is not None and t_policy.unsafe_handling == "collect":
+                        transform_errors.extend(t_policy.collected_security_errors())
+
+        if should_collect:
+            # Merge errors from both tokenizer and tree builder.
+            # Public API: users expect errors to be ordered by input position.
+            merged_errors = self.tokenizer.errors + self.tree_builder.errors + transform_errors
+            self.errors = self._sorted_errors(merged_errors)
+        else:
+            self.errors = transform_errors
 
         # In strict mode, raise on first error
         if strict and self.errors:
```
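
The transform pipeline encodes the safety model: with `safe=True` and no explicit `Sanitize` step, one is appended automatically with the document-level or fragment-level default policy, so the tree is sanitized exactly once; listing `Sanitize` yourself anywhere in `transforms` suppresses the auto-append. A sketch, assuming the module paths implied by the relative imports above:

```python
from justhtml import JustHTML                 # assumed export path
from justhtml.transforms import Sanitize      # from `.transforms` above
from justhtml.sanitize import DEFAULT_POLICY  # from `.sanitize` above

# Default: safe=True auto-appends a final Sanitize step.
doc = JustHTML("<img src=x onerror=alert(1)>")

# An explicit Sanitize anywhere in transforms takes over; nothing is appended.
doc = JustHTML("<img src=x onerror=alert(1)>",
               transforms=[Sanitize(policy=DEFAULT_POLICY)])

# Per the code above, a policy with unsafe_handling == "collect" routes its
# security findings into doc.errors, mirroring the old safe-output behavior.
```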
```diff
@@ -112,20 +211,44 @@ class JustHTML:
         """Query the document using a CSS selector. Delegates to root.query()."""
         return self.root.query(selector)
 
-    def to_html(self, pretty: bool = True, indent_size: int = 2) -> str:
-        """Serialize the document to HTML. Delegates to root.to_html()."""
-        return self.root.to_html(indent=0, indent_size=indent_size, pretty=pretty)
-
-    def to_text(self, separator: str = " ", strip: bool = True) -> str:
-        """Return the document's concatenated text.
+    @staticmethod
+    def _sorted_errors(errors: list[ParseError]) -> list[ParseError]:
+        indexed_errors = enumerate(errors)
+        return [
+            e
+            for _, e in sorted(
+                indexed_errors,
+                key=lambda t: (
+                    t[1].line if t[1].line is not None else 1_000_000_000,
+                    t[1].column if t[1].column is not None else 1_000_000_000,
+                    t[0],
+                ),
+            )
+        ]
+
+    def to_html(
+        self,
+        pretty: bool = True,
+        indent_size: int = 2,
+    ) -> str:
+        """Serialize the document to HTML.
 
-        Delegates to `root.to_text(separator=..., strip=...)`.
+        Sanitization (when enabled) happens during construction.
         """
+        return self.root.to_html(
+            indent=0,
+            indent_size=indent_size,
+            pretty=pretty,
+        )
+
+    def to_text(
+        self,
+        separator: str = " ",
+        strip: bool = True,
+    ) -> str:
+        """Return the document's concatenated text."""
         return self.root.to_text(separator=separator, strip=strip)
 
     def to_markdown(self) -> str:
-        """Return a GitHub Flavored Markdown representation.
-
-        Delegates to `root.to_markdown()`.
-        """
+        """Return a GitHub Flavored Markdown representation."""
         return self.root.to_markdown()
```
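
The new `_sorted_errors` orders merged errors by input position, pushes errors with unknown positions to the end, and uses the original index as a final tie-breaker so equal positions keep their submission order. A self-contained sketch of the same key, with a stand-in for `ParseError`:

```python
from dataclasses import dataclass

@dataclass
class Err:  # stand-in for ParseError; only line/column matter here
    msg: str
    line: int | None = None
    column: int | None = None

BIG = 1_000_000_000  # sends position-less errors to the end

def sorted_errors(errors: list[Err]) -> list[Err]:
    return [
        e
        for _, e in sorted(
            enumerate(errors),
            key=lambda t: (
                t[1].line if t[1].line is not None else BIG,
                t[1].column if t[1].column is not None else BIG,
                t[0],  # original index: equal positions keep their order
            ),
        )
    ]

errs = [Err("late", 9, 2), Err("no-pos"), Err("early", 1, 5), Err("tie", 1, 5)]
assert [e.msg for e in sorted_errors(errs)] == ["early", "tie", "late", "no-pos"]
```

Since Python's `sorted` is already stable, the trailing index is belt-and-braces, but it makes the intended ordering explicit in the key itself.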