justhtml 0.24.0__py3-none-any.whl → 0.38.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/node.py CHANGED
@@ -3,12 +3,10 @@ from __future__ import annotations
3
3
  from typing import TYPE_CHECKING, Any
4
4
  from urllib.parse import quote
5
5
 
6
- from .sanitize import sanitize
7
6
  from .selector import query
8
7
  from .serialize import to_html
9
8
 
10
9
  if TYPE_CHECKING:
11
- from .sanitize import SanitizationPolicy
12
10
  from .tokens import Doctype
13
11
 
14
12
 
@@ -192,6 +190,7 @@ class SimpleDomNode:
192
190
  "_origin_col",
193
191
  "_origin_line",
194
192
  "_origin_pos",
193
+ "_source_html",
195
194
  "attrs",
196
195
  "children",
197
196
  "data",
@@ -209,6 +208,7 @@ class SimpleDomNode:
209
208
  _origin_pos: int | None
210
209
  _origin_line: int | None
211
210
  _origin_col: int | None
211
+ _source_html: str | None
212
212
 
213
213
  def __init__(
214
214
  self,
@@ -220,6 +220,7 @@ class SimpleDomNode:
220
220
  self.name = name
221
221
  self.parent = None
222
222
  self.data = data
223
+ self._source_html = None
223
224
  self._origin_pos = None
224
225
  self._origin_line = None
225
226
  self._origin_col = None
@@ -271,12 +272,9 @@ class SimpleDomNode:
271
272
  indent: int = 0,
272
273
  indent_size: int = 2,
273
274
  pretty: bool = True,
274
- *,
275
- safe: bool = True,
276
- policy: SanitizationPolicy | None = None,
277
275
  ) -> str:
278
276
  """Convert node to HTML string."""
279
- return to_html(self, indent, indent_size, pretty=pretty, safe=safe, policy=policy)
277
+ return to_html(self, indent, indent_size, pretty=pretty)
280
278
 
281
279
  def query(self, selector: str) -> list[Any]:
282
280
  """
@@ -312,39 +310,27 @@ class SimpleDomNode:
312
310
  self,
313
311
  separator: str = " ",
314
312
  strip: bool = True,
315
- *,
316
- safe: bool = True,
317
- policy: SanitizationPolicy | None = None,
318
313
  ) -> str:
319
314
  """Return the concatenated text of this node's descendants.
320
315
 
321
316
  - `separator` controls how text nodes are joined (default: a single space).
322
317
  - `strip=True` strips each text node and drops empty segments.
323
- - `safe=True` sanitizes untrusted HTML before extracting text.
324
- - `policy` overrides the default sanitization policy.
325
-
326
318
  Template element contents are included via `template_content`.
327
319
  """
328
- node: Any = sanitize(self, policy=policy) if safe else self
320
+ node: Any = self
329
321
  parts: list[str] = []
330
322
  _to_text_collect(node, parts, strip=strip)
331
323
  if not parts:
332
324
  return ""
333
325
  return separator.join(parts)
334
326
 
335
- def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
327
+ def to_markdown(self) -> str:
336
328
  """Return a GitHub Flavored Markdown representation of this subtree.
337
329
 
338
330
  This is a pragmatic HTML->Markdown converter intended for readability.
339
331
  - Tables and images are preserved as raw HTML.
340
332
  - Unknown elements fall back to rendering their children.
341
333
  """
342
- if safe:
343
- node = sanitize(self, policy=policy)
344
- builder = _MarkdownBuilder()
345
- _to_markdown_walk(node, builder, preserve_whitespace=False, list_depth=0)
346
- return builder.finish()
347
-
348
334
  builder = _MarkdownBuilder()
349
335
  _to_markdown_walk(self, builder, preserve_whitespace=False, list_depth=0)
350
336
  return builder.finish()
@@ -405,22 +391,25 @@ class SimpleDomNode:
405
391
  """Return True if this node has children."""
406
392
  return bool(self.children)
407
393
 
408
- def clone_node(self, deep: bool = False) -> SimpleDomNode:
394
+ def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> SimpleDomNode:
409
395
  """
410
396
  Clone this node.
411
397
 
412
398
  Args:
413
399
  deep: If True, recursively clone children.
400
+ override_attrs: Optional dictionary to use as attributes for the clone.
414
401
 
415
402
  Returns:
416
403
  A new node that is a copy of this node.
417
404
  """
405
+ attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else None)
418
406
  clone = SimpleDomNode(
419
407
  self.name,
420
- self.attrs.copy() if self.attrs else None,
408
+ attrs,
421
409
  self.data,
422
410
  self.namespace,
423
411
  )
412
+ clone._source_html = self._source_html
424
413
  clone._origin_pos = self._origin_pos
425
414
  clone._origin_line = self._origin_line
426
415
  clone._origin_col = self._origin_col
@@ -431,11 +420,25 @@ class SimpleDomNode:
431
420
 
432
421
 
433
422
  class ElementNode(SimpleDomNode):
434
- __slots__ = ("template_content",)
423
+ __slots__ = (
424
+ "_end_tag_end",
425
+ "_end_tag_present",
426
+ "_end_tag_start",
427
+ "_self_closing",
428
+ "_start_tag_end",
429
+ "_start_tag_start",
430
+ "template_content",
431
+ )
435
432
 
436
433
  template_content: SimpleDomNode | None
437
434
  children: list[Any]
438
435
  attrs: dict[str, str | None]
436
+ _start_tag_start: int | None
437
+ _start_tag_end: int | None
438
+ _end_tag_start: int | None
439
+ _end_tag_end: int | None
440
+ _end_tag_present: bool
441
+ _self_closing: bool
439
442
 
440
443
  def __init__(self, name: str, attrs: dict[str, str | None] | None, namespace: str | None) -> None:
441
444
  self.name = name
@@ -445,15 +448,30 @@ class ElementNode(SimpleDomNode):
445
448
  self.children = []
446
449
  self.attrs = attrs if attrs is not None else {}
447
450
  self.template_content = None
451
+ self._source_html = None
448
452
  self._origin_pos = None
449
453
  self._origin_line = None
450
454
  self._origin_col = None
451
-
452
- def clone_node(self, deep: bool = False) -> ElementNode:
453
- clone = ElementNode(self.name, self.attrs.copy() if self.attrs else {}, self.namespace)
455
+ self._start_tag_start = None
456
+ self._start_tag_end = None
457
+ self._end_tag_start = None
458
+ self._end_tag_end = None
459
+ self._end_tag_present = False
460
+ self._self_closing = False
461
+
462
+ def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> ElementNode:
463
+ attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
464
+ clone = ElementNode(self.name, attrs, self.namespace)
465
+ clone._source_html = self._source_html
454
466
  clone._origin_pos = self._origin_pos
455
467
  clone._origin_line = self._origin_line
456
468
  clone._origin_col = self._origin_col
469
+ clone._start_tag_start = self._start_tag_start
470
+ clone._start_tag_end = self._start_tag_end
471
+ clone._end_tag_start = self._end_tag_start
472
+ clone._end_tag_end = self._end_tag_end
473
+ clone._end_tag_present = self._end_tag_present
474
+ clone._self_closing = self._self_closing
457
475
  if deep:
458
476
  for child in self.children:
459
477
  clone.append_child(child.clone_node(deep=True))
@@ -476,16 +494,24 @@ class TemplateNode(ElementNode):
476
494
  else:
477
495
  self.template_content = None
478
496
 
479
- def clone_node(self, deep: bool = False) -> TemplateNode:
497
+ def clone_node(self, deep: bool = False, override_attrs: dict[str, str | None] | None = None) -> TemplateNode:
498
+ attrs = override_attrs if override_attrs is not None else (self.attrs.copy() if self.attrs else {})
480
499
  clone = TemplateNode(
481
500
  self.name,
482
- self.attrs.copy() if self.attrs else {},
501
+ attrs,
483
502
  None,
484
503
  self.namespace,
485
504
  )
505
+ clone._source_html = self._source_html
486
506
  clone._origin_pos = self._origin_pos
487
507
  clone._origin_line = self._origin_line
488
508
  clone._origin_col = self._origin_col
509
+ clone._start_tag_start = self._start_tag_start
510
+ clone._start_tag_end = self._start_tag_end
511
+ clone._end_tag_start = self._end_tag_start
512
+ clone._end_tag_end = self._end_tag_end
513
+ clone._end_tag_present = self._end_tag_present
514
+ clone._self_closing = self._self_closing
489
515
  if deep:
490
516
  if self.template_content:
491
517
  clone.template_content = self.template_content.clone_node(deep=True)
@@ -542,15 +568,8 @@ class TextNode:
542
568
  self,
543
569
  separator: str = " ",
544
570
  strip: bool = True,
545
- *,
546
- safe: bool = True,
547
- policy: SanitizationPolicy | None = None,
548
571
  ) -> str:
549
- # Parameters are accepted for API consistency; they don't affect leaf nodes.
550
572
  _ = separator
551
- _ = safe
552
- _ = policy
553
-
554
573
  if self.data is None:
555
574
  return ""
556
575
  if strip:
justhtml/parser.py CHANGED
@@ -7,12 +7,14 @@ from typing import TYPE_CHECKING, Any
7
7
  from .context import FragmentContext
8
8
  from .encoding import decode_html
9
9
  from .tokenizer import Tokenizer, TokenizerOpts
10
+ from .transforms import apply_compiled_transforms, compile_transforms
10
11
  from .treebuilder import TreeBuilder
11
12
 
12
13
  if TYPE_CHECKING:
13
14
  from .node import SimpleDomNode
14
15
  from .sanitize import SanitizationPolicy
15
16
  from .tokens import ParseError
17
+ from .transforms import TransformSpec
16
18
 
17
19
 
18
20
  class StrictModeError(SyntaxError):
@@ -53,6 +55,8 @@ class JustHTML:
53
55
  self,
54
56
  html: str | bytes | bytearray | memoryview | None,
55
57
  *,
58
+ safe: bool = True,
59
+ policy: SanitizationPolicy | None = None,
56
60
  collect_errors: bool = False,
57
61
  track_node_locations: bool = False,
58
62
  debug: bool = False,
@@ -63,6 +67,7 @@ class JustHTML:
63
67
  strict: bool = False,
64
68
  tokenizer_opts: TokenizerOpts | None = None,
65
69
  tree_builder: TreeBuilder | None = None,
70
+ transforms: list[TransformSpec] | None = None,
66
71
  ) -> None:
67
72
  if fragment_context is not None:
68
73
  fragment = True
@@ -70,6 +75,29 @@ class JustHTML:
70
75
  if fragment and fragment_context is None:
71
76
  fragment_context = FragmentContext("div")
72
77
 
78
+ track_tag_spans = False
79
+ has_sanitize_transform = False
80
+ needs_escape_incomplete_tags = False
81
+ if transforms:
82
+ from .sanitize import DEFAULT_POLICY # noqa: PLC0415
83
+ from .transforms import Sanitize # noqa: PLC0415
84
+
85
+ for t in transforms:
86
+ if isinstance(t, Sanitize):
87
+ has_sanitize_transform = True
88
+ effective = t.policy or DEFAULT_POLICY
89
+ if effective.disallowed_tag_handling == "escape":
90
+ track_tag_spans = True
91
+ needs_escape_incomplete_tags = True
92
+ break
93
+
94
+ # If we will auto-sanitize (safe=True and no Sanitize in transforms),
95
+ # escape-mode tag reconstruction may require tracking tag spans.
96
+ if safe and not has_sanitize_transform and policy is not None:
97
+ if policy.disallowed_tag_handling == "escape":
98
+ track_tag_spans = True
99
+ needs_escape_incomplete_tags = True
100
+
73
101
  self.debug = bool(debug)
74
102
  self.fragment_context = fragment_context
75
103
  self.encoding = None
@@ -91,8 +119,11 @@ class JustHTML:
91
119
  fragment_context=fragment_context,
92
120
  iframe_srcdoc=iframe_srcdoc,
93
121
  collect_errors=should_collect,
122
+ track_tag_spans=track_tag_spans,
94
123
  )
95
124
  opts = tokenizer_opts or TokenizerOpts()
125
+ if needs_escape_incomplete_tags:
126
+ opts.emit_bogus_markup_as_text = True
96
127
 
97
128
  # For RAWTEXT fragment contexts, set initial tokenizer state and rawtext tag
98
129
  if fragment_context and not fragment_context.namespace:
@@ -109,6 +140,7 @@ class JustHTML:
109
140
  opts,
110
141
  collect_errors=should_collect,
111
142
  track_node_locations=bool(track_node_locations),
143
+ track_tag_positions=bool(track_node_locations) or track_tag_spans,
112
144
  )
113
145
  # Link tokenizer to tree_builder for position info
114
146
  self.tree_builder.tokenizer = self.tokenizer
@@ -116,11 +148,73 @@ class JustHTML:
116
148
  self.tokenizer.run(html_str)
117
149
  self.root = self.tree_builder.finish()
118
150
 
119
- # Merge errors from both tokenizer and tree builder.
120
- # Public API: users expect errors to be ordered by input position.
121
- merged_errors = self.tokenizer.errors + self.tree_builder.errors
122
- indexed_errors = enumerate(merged_errors)
123
- self.errors = [
151
+ transform_errors: list[ParseError] = []
152
+
153
+ # Apply transforms after parse.
154
+ # Safety model: when safe=True, the in-memory tree is sanitized exactly once
155
+ # during construction by ensuring a Sanitize transform runs.
156
+ if transforms or safe:
157
+ from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY # noqa: PLC0415
158
+ from .transforms import Sanitize # noqa: PLC0415
159
+
160
+ final_transforms: list[TransformSpec] = list(transforms or [])
161
+
162
+ # Normalize explicit Sanitize() transforms to use the same default policy
163
+ # choice as the old safe-output sanitizer (document vs fragment).
164
+ if final_transforms:
165
+ default_mode_policy = DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY
166
+ for i, t in enumerate(final_transforms):
167
+ if isinstance(t, Sanitize) and t.policy is None:
168
+ final_transforms[i] = Sanitize(
169
+ policy=default_mode_policy, enabled=t.enabled, callback=t.callback, report=t.report
170
+ )
171
+
172
+ # Auto-append a final Sanitize step only if the user didn't include
173
+ # Sanitize anywhere in their transform list.
174
+ if safe and not any(isinstance(t, Sanitize) for t in final_transforms):
175
+ effective_policy = (
176
+ policy
177
+ if policy is not None
178
+ else (DEFAULT_DOCUMENT_POLICY if self.root.name == "#document" else DEFAULT_POLICY)
179
+ )
180
+ # Avoid stale collected errors on reused policy objects.
181
+ if effective_policy.unsafe_handling == "collect":
182
+ effective_policy.reset_collected_security_errors()
183
+ final_transforms.append(Sanitize(policy=effective_policy))
184
+
185
+ if final_transforms:
186
+ compiled_transforms = compile_transforms(tuple(final_transforms))
187
+ apply_compiled_transforms(self.root, compiled_transforms, errors=transform_errors)
188
+
189
+ # Merge collected security errors into the document error list.
190
+ # This mirrors the old behavior where safe output could feed
191
+ # security findings into doc.errors.
192
+ for t in final_transforms:
193
+ if isinstance(t, Sanitize):
194
+ t_policy = t.policy
195
+ if t_policy is not None and t_policy.unsafe_handling == "collect":
196
+ transform_errors.extend(t_policy.collected_security_errors())
197
+
198
+ if should_collect:
199
+ # Merge errors from both tokenizer and tree builder.
200
+ # Public API: users expect errors to be ordered by input position.
201
+ merged_errors = self.tokenizer.errors + self.tree_builder.errors + transform_errors
202
+ self.errors = self._sorted_errors(merged_errors)
203
+ else:
204
+ self.errors = transform_errors
205
+
206
+ # In strict mode, raise on first error
207
+ if strict and self.errors:
208
+ raise StrictModeError(self.errors[0])
209
+
210
+ def query(self, selector: str) -> list[Any]:
211
+ """Query the document using a CSS selector. Delegates to root.query()."""
212
+ return self.root.query(selector)
213
+
214
+ @staticmethod
215
+ def _sorted_errors(errors: list[ParseError]) -> list[ParseError]:
216
+ indexed_errors = enumerate(errors)
217
+ return [
124
218
  e
125
219
  for _, e in sorted(
126
220
  indexed_errors,
@@ -132,56 +226,29 @@ class JustHTML:
132
226
  )
133
227
  ]
134
228
 
135
- # In strict mode, raise on first error
136
- if strict and self.errors:
137
- raise StrictModeError(self.errors[0])
138
-
139
- def query(self, selector: str) -> list[Any]:
140
- """Query the document using a CSS selector. Delegates to root.query()."""
141
- return self.root.query(selector)
142
-
143
229
  def to_html(
144
230
  self,
145
231
  pretty: bool = True,
146
232
  indent_size: int = 2,
147
- *,
148
- safe: bool = True,
149
- policy: SanitizationPolicy | None = None,
150
233
  ) -> str:
151
234
  """Serialize the document to HTML.
152
235
 
153
- - `safe=True` sanitizes untrusted content before serialization.
154
- - `policy` overrides the default sanitization policy.
236
+ Sanitization (when enabled) happens during construction.
155
237
  """
156
238
  return self.root.to_html(
157
239
  indent=0,
158
240
  indent_size=indent_size,
159
241
  pretty=pretty,
160
- safe=safe,
161
- policy=policy,
162
242
  )
163
243
 
164
244
  def to_text(
165
245
  self,
166
246
  separator: str = " ",
167
247
  strip: bool = True,
168
- *,
169
- safe: bool = True,
170
- policy: SanitizationPolicy | None = None,
171
248
  ) -> str:
172
- """Return the document's concatenated text.
173
-
174
- - `safe=True` sanitizes untrusted content before text extraction.
175
- - `policy` overrides the default sanitization policy.
249
+ """Return the document's concatenated text."""
250
+ return self.root.to_text(separator=separator, strip=strip)
176
251
 
177
- Delegates to `root.to_text(...)`.
178
- """
179
- return self.root.to_text(separator=separator, strip=strip, safe=safe, policy=policy)
180
-
181
- def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
182
- """Return a GitHub Flavored Markdown representation.
183
-
184
- - `safe=True` sanitizes untrusted content before conversion.
185
- - `policy` overrides the default sanitization policy.
186
- """
187
- return self.root.to_markdown(safe=safe, policy=policy)
252
+ def to_markdown(self) -> str:
253
+ """Return a GitHub Flavored Markdown representation."""
254
+ return self.root.to_markdown()