justhtml-0.12.0-py3-none-any.whl → justhtml-0.24.0-py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/parser.py CHANGED
@@ -4,13 +4,14 @@ from __future__ import annotations
4
4
 
5
5
  from typing import TYPE_CHECKING, Any
6
6
 
7
+ from .context import FragmentContext
7
8
  from .encoding import decode_html
8
9
  from .tokenizer import Tokenizer, TokenizerOpts
9
10
  from .treebuilder import TreeBuilder
10
11
 
11
12
  if TYPE_CHECKING:
12
- from .context import FragmentContext
13
13
  from .node import SimpleDomNode
14
+ from .sanitize import SanitizationPolicy
14
15
  from .tokens import ParseError
15
16
 
16
17
 
@@ -53,14 +54,22 @@ class JustHTML:
53
54
  html: str | bytes | bytearray | memoryview | None,
54
55
  *,
55
56
  collect_errors: bool = False,
57
+ track_node_locations: bool = False,
56
58
  debug: bool = False,
57
59
  encoding: str | None = None,
60
+ fragment: bool = False,
58
61
  fragment_context: FragmentContext | None = None,
59
62
  iframe_srcdoc: bool = False,
60
63
  strict: bool = False,
61
64
  tokenizer_opts: TokenizerOpts | None = None,
62
65
  tree_builder: TreeBuilder | None = None,
63
66
  ) -> None:
67
+ if fragment_context is not None:
68
+ fragment = True
69
+
70
+ if fragment and fragment_context is None:
71
+ fragment_context = FragmentContext("div")
72
+
64
73
  self.debug = bool(debug)
65
74
  self.fragment_context = fragment_context
66
75
  self.encoding = None
@@ -74,7 +83,8 @@ class JustHTML:
74
83
  else:
75
84
  html_str = ""
76
85
 
77
- # Enable error collection if strict mode is on
86
+ # Enable error collection if strict mode is on.
87
+ # Node location tracking is opt-in to avoid slowing down the common case.
78
88
  should_collect = collect_errors or strict
79
89
 
80
90
  self.tree_builder = tree_builder or TreeBuilder(
@@ -94,15 +104,33 @@ class JustHTML:
94
104
  elif tag_name in ("plaintext", "script"):
95
105
  opts.initial_state = Tokenizer.PLAINTEXT
96
106
 
97
- self.tokenizer = Tokenizer(self.tree_builder, opts, collect_errors=should_collect)
107
+ self.tokenizer = Tokenizer(
108
+ self.tree_builder,
109
+ opts,
110
+ collect_errors=should_collect,
111
+ track_node_locations=bool(track_node_locations),
112
+ )
98
113
  # Link tokenizer to tree_builder for position info
99
114
  self.tree_builder.tokenizer = self.tokenizer
100
115
 
101
116
  self.tokenizer.run(html_str)
102
117
  self.root = self.tree_builder.finish()
103
118
 
104
- # Merge errors from both tokenizer and tree builder
105
- self.errors = self.tokenizer.errors + self.tree_builder.errors
119
+ # Merge errors from both tokenizer and tree builder.
120
+ # Public API: users expect errors to be ordered by input position.
121
+ merged_errors = self.tokenizer.errors + self.tree_builder.errors
122
+ indexed_errors = enumerate(merged_errors)
123
+ self.errors = [
124
+ e
125
+ for _, e in sorted(
126
+ indexed_errors,
127
+ key=lambda t: (
128
+ t[1].line if t[1].line is not None else 1_000_000_000,
129
+ t[1].column if t[1].column is not None else 1_000_000_000,
130
+ t[0],
131
+ ),
132
+ )
133
+ ]
106
134
 
107
135
  # In strict mode, raise on first error
108
136
  if strict and self.errors:
@@ -112,20 +140,48 @@ class JustHTML:
112
140
  """Query the document using a CSS selector. Delegates to root.query()."""
113
141
  return self.root.query(selector)
114
142
 
115
- def to_html(self, pretty: bool = True, indent_size: int = 2) -> str:
116
- """Serialize the document to HTML. Delegates to root.to_html()."""
117
- return self.root.to_html(indent=0, indent_size=indent_size, pretty=pretty)
143
+ def to_html(
144
+ self,
145
+ pretty: bool = True,
146
+ indent_size: int = 2,
147
+ *,
148
+ safe: bool = True,
149
+ policy: SanitizationPolicy | None = None,
150
+ ) -> str:
151
+ """Serialize the document to HTML.
152
+
153
+ - `safe=True` sanitizes untrusted content before serialization.
154
+ - `policy` overrides the default sanitization policy.
155
+ """
156
+ return self.root.to_html(
157
+ indent=0,
158
+ indent_size=indent_size,
159
+ pretty=pretty,
160
+ safe=safe,
161
+ policy=policy,
162
+ )
118
163
 
119
- def to_text(self, separator: str = " ", strip: bool = True) -> str:
164
+ def to_text(
165
+ self,
166
+ separator: str = " ",
167
+ strip: bool = True,
168
+ *,
169
+ safe: bool = True,
170
+ policy: SanitizationPolicy | None = None,
171
+ ) -> str:
120
172
  """Return the document's concatenated text.
121
173
 
122
- Delegates to `root.to_text(separator=..., strip=...)`.
174
+ - `safe=True` sanitizes untrusted content before text extraction.
175
+ - `policy` overrides the default sanitization policy.
176
+
177
+ Delegates to `root.to_text(...)`.
123
178
  """
124
- return self.root.to_text(separator=separator, strip=strip)
179
+ return self.root.to_text(separator=separator, strip=strip, safe=safe, policy=policy)
125
180
 
126
- def to_markdown(self) -> str:
181
+ def to_markdown(self, *, safe: bool = True, policy: SanitizationPolicy | None = None) -> str:
127
182
  """Return a GitHub Flavored Markdown representation.
128
183
 
129
- Delegates to `root.to_markdown()`.
184
+ - `safe=True` sanitizes untrusted content before conversion.
185
+ - `policy` overrides the default sanitization policy.
130
186
  """
131
- return self.root.to_markdown()
187
+ return self.root.to_markdown(safe=safe, policy=policy)