justhtml 0.12.0__py3-none-any.whl → 0.24.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/__init__.py CHANGED
@@ -1,16 +1,22 @@
1
1
  from .parser import JustHTML, StrictModeError
2
+ from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy, UrlRule, sanitize
2
3
  from .selector import SelectorError, matches, query
3
4
  from .serialize import to_html, to_test_format
4
5
  from .stream import stream
5
6
  from .tokens import ParseError
6
7
 
7
8
  __all__ = [
9
+ "DEFAULT_DOCUMENT_POLICY",
10
+ "DEFAULT_POLICY",
8
11
  "JustHTML",
9
12
  "ParseError",
13
+ "SanitizationPolicy",
10
14
  "SelectorError",
11
15
  "StrictModeError",
16
+ "UrlRule",
12
17
  "matches",
13
18
  "query",
19
+ "sanitize",
14
20
  "stream",
15
21
  "to_html",
16
22
  "to_test_format",
justhtml/__main__.py CHANGED
@@ -8,9 +8,10 @@ import io
8
8
  import sys
9
9
  from importlib.metadata import PackageNotFoundError, version
10
10
  from pathlib import Path
11
- from typing import cast
11
+ from typing import TextIO, cast
12
12
 
13
13
  from . import JustHTML
14
+ from .context import FragmentContext
14
15
  from .selector import SelectorError
15
16
 
16
17
 
@@ -44,6 +45,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
44
45
  nargs="?",
45
46
  help="HTML file to parse, or '-' to read from stdin",
46
47
  )
48
+ parser.add_argument("--output", help="File to write output to")
47
49
  parser.add_argument(
48
50
  "--selector",
49
51
  help="CSS selector for choosing nodes (defaults to the document root)",
@@ -54,12 +56,24 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
54
56
  default="html",
55
57
  help="Output format (default: html)",
56
58
  )
59
+
60
+ parser.add_argument(
61
+ "--unsafe",
62
+ action="store_true",
63
+ help="Disable sanitization (trusted input only)",
64
+ )
57
65
  parser.add_argument(
58
66
  "--first",
59
67
  action="store_true",
60
68
  help="Only output the first matching node",
61
69
  )
62
70
 
71
+ parser.add_argument(
72
+ "--fragment",
73
+ action="store_true",
74
+ help="Parse input as an HTML fragment (context: <div>)",
75
+ )
76
+
63
77
  parser.add_argument(
64
78
  "--separator",
65
79
  default=" ",
@@ -108,7 +122,8 @@ def _read_html(path: str) -> str | bytes:
108
122
  def main() -> None:
109
123
  args = _parse_args(sys.argv[1:])
110
124
  html = _read_html(args.path)
111
- doc = JustHTML(html)
125
+ fragment_context = FragmentContext("div") if args.fragment else None
126
+ doc = JustHTML(html, fragment_context=fragment_context)
112
127
 
113
128
  try:
114
129
  nodes = doc.query(args.selector) if args.selector else [doc.root]
@@ -122,22 +137,40 @@ def main() -> None:
122
137
  if args.first:
123
138
  nodes = [nodes[0]]
124
139
 
125
- if args.format == "html":
126
- outputs = [node.to_html() for node in nodes]
127
- sys.stdout.write("\n".join(outputs))
128
- sys.stdout.write("\n")
129
- return
130
-
131
- if args.format == "text":
132
- outputs = [node.to_text(separator=args.separator, strip=args.strip) for node in nodes]
133
- sys.stdout.write("\n".join(outputs))
134
- sys.stdout.write("\n")
140
+ def write_output(out: TextIO) -> None:
141
+ safe = not args.unsafe
142
+ if args.format == "html":
143
+ outputs = [node.to_html(safe=safe) for node in nodes]
144
+ out.write("\n".join(outputs))
145
+ out.write("\n")
146
+ return
147
+
148
+ if args.format == "text":
149
+ # Keep these branches explicit so coverage will highlight untested CLI options.
150
+ if args.separator == " ":
151
+ if args.strip:
152
+ outputs = [node.to_text(strip=True, safe=safe) for node in nodes]
153
+ else:
154
+ outputs = [node.to_text(strip=False, safe=safe) for node in nodes]
155
+ else:
156
+ if args.strip:
157
+ outputs = [node.to_text(separator=args.separator, strip=True, safe=safe) for node in nodes]
158
+ else:
159
+ outputs = [node.to_text(separator=args.separator, strip=False, safe=safe) for node in nodes]
160
+ out.write("\n".join(outputs))
161
+ out.write("\n")
162
+ return
163
+
164
+ outputs = [node.to_markdown(safe=safe) for node in nodes]
165
+ out.write("\n\n".join(outputs))
166
+ out.write("\n")
167
+
168
+ if args.output:
169
+ with Path(args.output).open(mode="w", encoding="utf-8") as outfile:
170
+ write_output(outfile)
135
171
  return
136
172
 
137
- outputs = [node.to_markdown() for node in nodes]
138
- sys.stdout.write("\n\n".join(outputs))
139
- sys.stdout.write("\n")
140
- return
173
+ write_output(sys.stdout)
141
174
 
142
175
 
143
176
  if __name__ == "__main__":
justhtml/entities.py CHANGED
@@ -7,6 +7,10 @@ Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3
7
7
  from __future__ import annotations
8
8
 
9
9
  import html.entities
10
+ from typing import TYPE_CHECKING
11
+
12
+ if TYPE_CHECKING:
13
+ from collections.abc import Callable
10
14
 
11
15
  # Use Python's complete HTML5 entity list (2231 entities)
12
16
  # Keys include the trailing semicolon (e.g., "amp;", "lang;")
@@ -168,7 +172,23 @@ NUMERIC_REPLACEMENTS: dict[int, str] = {
168
172
  }
169
173
 
170
174
 
171
- def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
175
+ def _is_control_character(codepoint: int) -> bool:
176
+ # C0 controls and C1 controls
177
+ return (0x00 <= codepoint <= 0x1F) or (0x7F <= codepoint <= 0x9F)
178
+
179
+
180
+ def _is_noncharacter(codepoint: int) -> bool:
181
+ if 0xFDD0 <= codepoint <= 0xFDEF:
182
+ return True
183
+ last = codepoint & 0xFFFF
184
+ return last == 0xFFFE or last == 0xFFFF
185
+
186
+
187
+ def decode_numeric_entity(
188
+ text: str,
189
+ is_hex: bool = False,
190
+ report_error: Callable[[str], None] | None = None,
191
+ ) -> str:
172
192
  """Decode a numeric character reference like &#60; or &#x3C;.
173
193
 
174
194
  Args:
@@ -181,20 +201,30 @@ def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
181
201
  base = 16 if is_hex else 10
182
202
  codepoint = int(text, base)
183
203
 
184
- # Apply HTML5 replacements for certain ranges
185
- if codepoint in NUMERIC_REPLACEMENTS:
186
- return NUMERIC_REPLACEMENTS[codepoint]
187
-
188
204
  # Invalid ranges per HTML5 spec
189
205
  if codepoint > 0x10FFFF:
190
206
  return "\ufffd" # REPLACEMENT CHARACTER
191
207
  if 0xD800 <= codepoint <= 0xDFFF: # Surrogate range
192
208
  return "\ufffd"
193
209
 
210
+ if report_error is not None:
211
+ if _is_control_character(codepoint):
212
+ report_error("control-character-reference")
213
+ if _is_noncharacter(codepoint):
214
+ report_error("noncharacter-character-reference")
215
+
216
+ # Apply HTML5 replacements for certain ranges
217
+ if codepoint in NUMERIC_REPLACEMENTS:
218
+ return NUMERIC_REPLACEMENTS[codepoint]
219
+
194
220
  return chr(codepoint)
195
221
 
196
222
 
197
- def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
223
+ def decode_entities_in_text(
224
+ text: str,
225
+ in_attribute: bool = False,
226
+ report_error: Callable[[str], None] | None = None,
227
+ ) -> str:
198
228
  """Decode all HTML entities in text.
199
229
 
200
230
  This is a simple implementation that handles:
@@ -247,7 +277,9 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
247
277
  digit_text = text[digit_start:j]
248
278
 
249
279
  if digit_text:
250
- result.append(decode_numeric_entity(digit_text, is_hex=is_hex))
280
+ if report_error is not None and not has_semicolon:
281
+ report_error("missing-semicolon-after-character-reference")
282
+ result.append(decode_numeric_entity(digit_text, is_hex=is_hex, report_error=report_error))
251
283
  i = j + 1 if has_semicolon else j
252
284
  continue
253
285
 
@@ -285,6 +317,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
285
317
  best_match_len = k
286
318
  break
287
319
  if best_match:
320
+ if report_error is not None:
321
+ report_error("missing-semicolon-after-character-reference")
288
322
  result.append(best_match)
289
323
  i = i + 1 + best_match_len
290
324
  continue
@@ -302,6 +336,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
302
336
  continue
303
337
 
304
338
  # Decode legacy entity
339
+ if report_error is not None and not has_semicolon:
340
+ report_error("missing-semicolon-after-character-reference")
305
341
  result.append(NAMED_ENTITIES[entity_name])
306
342
  i = j
307
343
  continue
@@ -329,6 +365,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
329
365
  i += 1
330
366
  continue
331
367
 
368
+ if report_error is not None:
369
+ report_error("missing-semicolon-after-character-reference")
332
370
  result.append(best_match)
333
371
  i = i + 1 + best_match_len
334
372
  continue
justhtml/errors.py CHANGED
@@ -75,6 +75,8 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
75
75
  "illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
76
76
  "missing-semicolon-after-character-reference": "Missing semicolon after character reference",
77
77
  "named-entity-without-semicolon": "Named entity used without semicolon",
78
+ "noncharacter-character-reference": "Noncharacter in character reference",
79
+ "noncharacter-in-input-stream": "Noncharacter in input stream",
78
80
  # ================================================================
79
81
  # TREE BUILDER ERRORS
80
82
  # ================================================================
@@ -107,8 +109,11 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
107
109
  # Foster parenting / table errors
108
110
  "foster-parenting-character": "Text content in table requires foster parenting",
109
111
  "foster-parenting-start-tag": "Start tag in table requires foster parenting",
112
+ "unexpected-character-implies-table-voodoo": "Unexpected character in table triggers foster parenting",
110
113
  "unexpected-start-tag-implies-table-voodoo": f"<{tag_name}> start tag in table triggers foster parenting",
111
114
  "unexpected-end-tag-implies-table-voodoo": f"</{tag_name}> end tag in table triggers foster parenting",
115
+ "unexpected-implied-end-tag-in-table-view": "Unexpected implied end tag while closing table",
116
+ "eof-in-table": "Unexpected end of file in table",
112
117
  "unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
113
118
  "unexpected-form-in-table": "Form element not allowed in table context",
114
119
  "unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
@@ -134,6 +139,10 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
134
139
  "adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
135
140
  "non-void-html-element-start-tag-with-trailing-solidus": f"<{tag_name}/> self-closing syntax on non-void element",
136
141
  "image-start-tag": f"Deprecated <{tag_name}> tag (use <img> instead)",
142
+ # Select insertion mode (context-specific taxonomy)
143
+ "unexpected-start-tag-in-select": f"Unexpected <{tag_name}> start tag in <select>",
144
+ "unexpected-end-tag-in-select": f"Unexpected </{tag_name}> end tag in <select>",
145
+ "unexpected-select-in-select": "Unexpected nested <select> in <select>",
137
146
  }
138
147
 
139
148
  # Return message or fall back to the code itself if not found