justhtml-0.12.0-py3-none-any.whl → justhtml-0.38.0-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/__init__.py CHANGED
@@ -1,14 +1,62 @@
1
1
  from .parser import JustHTML, StrictModeError
2
+ from .sanitize import (
3
+ CSS_PRESET_TEXT,
4
+ DEFAULT_DOCUMENT_POLICY,
5
+ DEFAULT_POLICY,
6
+ SanitizationPolicy,
7
+ UnsafeHtmlError,
8
+ UrlPolicy,
9
+ UrlProxy,
10
+ UrlRule,
11
+ )
2
12
  from .selector import SelectorError, matches, query
3
13
  from .serialize import to_html, to_test_format
4
14
  from .stream import stream
5
15
  from .tokens import ParseError
16
+ from .transforms import (
17
+ CollapseWhitespace,
18
+ Decide,
19
+ Drop,
20
+ Edit,
21
+ EditAttrs,
22
+ EditDocument,
23
+ Empty,
24
+ Linkify,
25
+ PruneEmpty,
26
+ RewriteAttrs,
27
+ Sanitize,
28
+ SetAttrs,
29
+ Stage,
30
+ Unwrap,
31
+ )
6
32
 
7
33
  __all__ = [
34
+ "CSS_PRESET_TEXT",
35
+ "DEFAULT_DOCUMENT_POLICY",
36
+ "DEFAULT_POLICY",
37
+ "CollapseWhitespace",
38
+ "Decide",
39
+ "Drop",
40
+ "Edit",
41
+ "EditAttrs",
42
+ "EditDocument",
43
+ "Empty",
8
44
  "JustHTML",
45
+ "Linkify",
9
46
  "ParseError",
47
+ "PruneEmpty",
48
+ "RewriteAttrs",
49
+ "SanitizationPolicy",
50
+ "Sanitize",
10
51
  "SelectorError",
52
+ "SetAttrs",
53
+ "Stage",
11
54
  "StrictModeError",
55
+ "UnsafeHtmlError",
56
+ "Unwrap",
57
+ "UrlPolicy",
58
+ "UrlProxy",
59
+ "UrlRule",
12
60
  "matches",
13
61
  "query",
14
62
  "stream",
justhtml/__main__.py CHANGED
@@ -8,9 +8,10 @@ import io
8
8
  import sys
9
9
  from importlib.metadata import PackageNotFoundError, version
10
10
  from pathlib import Path
11
- from typing import cast
11
+ from typing import TextIO, cast
12
12
 
13
13
  from . import JustHTML
14
+ from .context import FragmentContext
14
15
  from .selector import SelectorError
15
16
 
16
17
 
@@ -31,7 +32,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
31
32
  " curl -s https://example.com | justhtml -\n"
32
33
  " justhtml page.html --selector 'main p' --format text\n"
33
34
  " justhtml page.html --selector 'a' --format html\n"
34
- " justhtml page.html --selector 'article' --format markdown\n"
35
+ " justhtml page.html --selector 'article' --allow-tags article --format markdown\n"
35
36
  "\n"
36
37
  "If you don't have the 'justhtml' command available, use:\n"
37
38
  " python -m justhtml ...\n"
@@ -44,6 +45,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
44
45
  nargs="?",
45
46
  help="HTML file to parse, or '-' to read from stdin",
46
47
  )
48
+ parser.add_argument("--output", help="File to write output to")
47
49
  parser.add_argument(
48
50
  "--selector",
49
51
  help="CSS selector for choosing nodes (defaults to the document root)",
@@ -54,12 +56,32 @@ def _parse_args(argv: list[str]) -> argparse.Namespace:
54
56
  default="html",
55
57
  help="Output format (default: html)",
56
58
  )
59
+
60
+ parser.add_argument(
61
+ "--unsafe",
62
+ action="store_true",
63
+ help="Disable sanitization (trusted input only)",
64
+ )
65
+
66
+ parser.add_argument(
67
+ "--allow-tags",
68
+ help=(
69
+ "Safe mode: allow these additional tags during sanitization (comma-separated). "
70
+ "Example: --allow-tags article,section"
71
+ ),
72
+ )
57
73
  parser.add_argument(
58
74
  "--first",
59
75
  action="store_true",
60
76
  help="Only output the first matching node",
61
77
  )
62
78
 
79
+ parser.add_argument(
80
+ "--fragment",
81
+ action="store_true",
82
+ help="Parse input as an HTML fragment (context: <div>)",
83
+ )
84
+
63
85
  parser.add_argument(
64
86
  "--separator",
65
87
  default=" ",
@@ -108,7 +130,37 @@ def _read_html(path: str) -> str | bytes:
108
130
  def main() -> None:
109
131
  args = _parse_args(sys.argv[1:])
110
132
  html = _read_html(args.path)
111
- doc = JustHTML(html)
133
+ fragment_context = FragmentContext("div") if args.fragment else None
134
+ safe = not args.unsafe
135
+
136
+ policy = None
137
+ if safe and args.allow_tags:
138
+ from .sanitize import DEFAULT_DOCUMENT_POLICY, DEFAULT_POLICY, SanitizationPolicy # noqa: PLC0415
139
+
140
+ extra_tags: set[str] = set()
141
+ for part in str(args.allow_tags).replace(" ", ",").split(","):
142
+ tag = part.strip().lower()
143
+ if tag:
144
+ extra_tags.add(tag)
145
+
146
+ base = DEFAULT_POLICY if fragment_context is not None else DEFAULT_DOCUMENT_POLICY
147
+ allowed = set(base.allowed_tags)
148
+ allowed.update(extra_tags)
149
+ policy = SanitizationPolicy(
150
+ allowed_tags=allowed,
151
+ allowed_attributes=base.allowed_attributes,
152
+ url_policy=base.url_policy,
153
+ drop_comments=base.drop_comments,
154
+ drop_doctype=base.drop_doctype,
155
+ drop_foreign_namespaces=base.drop_foreign_namespaces,
156
+ drop_content_tags=base.drop_content_tags,
157
+ allowed_css_properties=base.allowed_css_properties,
158
+ force_link_rel=base.force_link_rel,
159
+ unsafe_handling=base.unsafe_handling,
160
+ disallowed_tag_handling=base.disallowed_tag_handling,
161
+ )
162
+
163
+ doc = JustHTML(html, fragment_context=fragment_context, safe=safe, policy=policy)
112
164
 
113
165
  try:
114
166
  nodes = doc.query(args.selector) if args.selector else [doc.root]
@@ -122,22 +174,39 @@ def main() -> None:
122
174
  if args.first:
123
175
  nodes = [nodes[0]]
124
176
 
125
- if args.format == "html":
126
- outputs = [node.to_html() for node in nodes]
127
- sys.stdout.write("\n".join(outputs))
128
- sys.stdout.write("\n")
129
- return
130
-
131
- if args.format == "text":
132
- outputs = [node.to_text(separator=args.separator, strip=args.strip) for node in nodes]
133
- sys.stdout.write("\n".join(outputs))
134
- sys.stdout.write("\n")
177
+ def write_output(out: TextIO) -> None:
178
+ if args.format == "html":
179
+ outputs = [node.to_html() for node in nodes]
180
+ out.write("\n".join(outputs))
181
+ out.write("\n")
182
+ return
183
+
184
+ if args.format == "text":
185
+ # Keep these branches explicit so coverage will highlight untested CLI options.
186
+ if args.separator == " ":
187
+ if args.strip:
188
+ outputs = [node.to_text(strip=True) for node in nodes]
189
+ else:
190
+ outputs = [node.to_text(strip=False) for node in nodes]
191
+ else:
192
+ if args.strip:
193
+ outputs = [node.to_text(separator=args.separator, strip=True) for node in nodes]
194
+ else:
195
+ outputs = [node.to_text(separator=args.separator, strip=False) for node in nodes]
196
+ out.write("\n".join(outputs))
197
+ out.write("\n")
198
+ return
199
+
200
+ outputs = [node.to_markdown() for node in nodes]
201
+ out.write("\n\n".join(outputs))
202
+ out.write("\n")
203
+
204
+ if args.output:
205
+ with Path(args.output).open(mode="w", encoding="utf-8") as outfile:
206
+ write_output(outfile)
135
207
  return
136
208
 
137
- outputs = [node.to_markdown() for node in nodes]
138
- sys.stdout.write("\n\n".join(outputs))
139
- sys.stdout.write("\n")
140
- return
209
+ write_output(sys.stdout)
141
210
 
142
211
 
143
212
  if __name__ == "__main__":
justhtml/constants.py CHANGED
@@ -184,6 +184,18 @@ HTML4_PUBLIC_PREFIXES = (
184
184
 
185
185
  HEADING_ELEMENTS = {"h1", "h2", "h3", "h4", "h5", "h6"}
186
186
 
187
+ # Elements where pretty-printing and whitespace-collapsing transforms should
188
+ # preserve text node whitespace.
189
+ WHITESPACE_PRESERVING_ELEMENTS: Final[frozenset[str]] = frozenset(
190
+ {
191
+ "code",
192
+ "pre",
193
+ "script",
194
+ "style",
195
+ "textarea",
196
+ }
197
+ )
198
+
187
199
  FORMATTING_ELEMENTS = {
188
200
  "a",
189
201
  "b",
justhtml/entities.py CHANGED
@@ -7,6 +7,10 @@ Supports both named entities (&amp;, &nbsp;) and numeric references (&#60;, &#x3
7
7
  from __future__ import annotations
8
8
 
9
9
  import html.entities
10
+ from typing import TYPE_CHECKING
11
+
12
+ if TYPE_CHECKING:
13
+ from collections.abc import Callable
10
14
 
11
15
  # Use Python's complete HTML5 entity list (2231 entities)
12
16
  # Keys include the trailing semicolon (e.g., "amp;", "lang;")
@@ -168,7 +172,23 @@ NUMERIC_REPLACEMENTS: dict[int, str] = {
168
172
  }
169
173
 
170
174
 
171
- def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
175
+ def _is_control_character(codepoint: int) -> bool:
176
+ # C0 controls and C1 controls
177
+ return (0x00 <= codepoint <= 0x1F) or (0x7F <= codepoint <= 0x9F)
178
+
179
+
180
+ def _is_noncharacter(codepoint: int) -> bool:
181
+ if 0xFDD0 <= codepoint <= 0xFDEF:
182
+ return True
183
+ last = codepoint & 0xFFFF
184
+ return last == 0xFFFE or last == 0xFFFF
185
+
186
+
187
+ def decode_numeric_entity(
188
+ text: str,
189
+ is_hex: bool = False,
190
+ report_error: Callable[[str], None] | None = None,
191
+ ) -> str:
172
192
  """Decode a numeric character reference like &#60; or &#x3C;.
173
193
 
174
194
  Args:
@@ -181,20 +201,30 @@ def decode_numeric_entity(text: str, is_hex: bool = False) -> str:
181
201
  base = 16 if is_hex else 10
182
202
  codepoint = int(text, base)
183
203
 
184
- # Apply HTML5 replacements for certain ranges
185
- if codepoint in NUMERIC_REPLACEMENTS:
186
- return NUMERIC_REPLACEMENTS[codepoint]
187
-
188
204
  # Invalid ranges per HTML5 spec
189
205
  if codepoint > 0x10FFFF:
190
206
  return "\ufffd" # REPLACEMENT CHARACTER
191
207
  if 0xD800 <= codepoint <= 0xDFFF: # Surrogate range
192
208
  return "\ufffd"
193
209
 
210
+ if report_error is not None:
211
+ if _is_control_character(codepoint):
212
+ report_error("control-character-reference")
213
+ if _is_noncharacter(codepoint):
214
+ report_error("noncharacter-character-reference")
215
+
216
+ # Apply HTML5 replacements for certain ranges
217
+ if codepoint in NUMERIC_REPLACEMENTS:
218
+ return NUMERIC_REPLACEMENTS[codepoint]
219
+
194
220
  return chr(codepoint)
195
221
 
196
222
 
197
- def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
223
+ def decode_entities_in_text(
224
+ text: str,
225
+ in_attribute: bool = False,
226
+ report_error: Callable[[str], None] | None = None,
227
+ ) -> str:
198
228
  """Decode all HTML entities in text.
199
229
 
200
230
  This is a simple implementation that handles:
@@ -247,7 +277,9 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
247
277
  digit_text = text[digit_start:j]
248
278
 
249
279
  if digit_text:
250
- result.append(decode_numeric_entity(digit_text, is_hex=is_hex))
280
+ if report_error is not None and not has_semicolon:
281
+ report_error("missing-semicolon-after-character-reference")
282
+ result.append(decode_numeric_entity(digit_text, is_hex=is_hex, report_error=report_error))
251
283
  i = j + 1 if has_semicolon else j
252
284
  continue
253
285
 
@@ -285,6 +317,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
285
317
  best_match_len = k
286
318
  break
287
319
  if best_match:
320
+ if report_error is not None:
321
+ report_error("missing-semicolon-after-character-reference")
288
322
  result.append(best_match)
289
323
  i = i + 1 + best_match_len
290
324
  continue
@@ -302,6 +336,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
302
336
  continue
303
337
 
304
338
  # Decode legacy entity
339
+ if report_error is not None and not has_semicolon:
340
+ report_error("missing-semicolon-after-character-reference")
305
341
  result.append(NAMED_ENTITIES[entity_name])
306
342
  i = j
307
343
  continue
@@ -329,6 +365,8 @@ def decode_entities_in_text(text: str, in_attribute: bool = False) -> str:
329
365
  i += 1
330
366
  continue
331
367
 
368
+ if report_error is not None:
369
+ report_error("missing-semicolon-after-character-reference")
332
370
  result.append(best_match)
333
371
  i = i + 1 + best_match_len
334
372
  continue
justhtml/errors.py CHANGED
@@ -1,7 +1,8 @@
1
- """Centralized error message definitions and helpers for HTML parsing errors.
1
+ """Centralized error message definitions and helpers for JustHTML errors.
2
2
 
3
- This module provides human-readable error messages for all parse error codes
4
- emitted by both the tokenizer and tree builder during HTML parsing.
3
+ This module provides human-readable error messages for parse error codes
4
+ emitted by the tokenizer and tree builder during HTML parsing, plus selected
5
+ security findings emitted by the sanitizer.
5
6
  """
6
7
 
7
8
  from __future__ import annotations
@@ -75,6 +76,8 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
75
76
  "illegal-codepoint-for-numeric-entity": "Invalid codepoint in numeric character reference",
76
77
  "missing-semicolon-after-character-reference": "Missing semicolon after character reference",
77
78
  "named-entity-without-semicolon": "Named entity used without semicolon",
79
+ "noncharacter-character-reference": "Noncharacter in character reference",
80
+ "noncharacter-in-input-stream": "Noncharacter in input stream",
78
81
  # ================================================================
79
82
  # TREE BUILDER ERRORS
80
83
  # ================================================================
@@ -107,8 +110,11 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
107
110
  # Foster parenting / table errors
108
111
  "foster-parenting-character": "Text content in table requires foster parenting",
109
112
  "foster-parenting-start-tag": "Start tag in table requires foster parenting",
113
+ "unexpected-character-implies-table-voodoo": "Unexpected character in table triggers foster parenting",
110
114
  "unexpected-start-tag-implies-table-voodoo": f"<{tag_name}> start tag in table triggers foster parenting",
111
115
  "unexpected-end-tag-implies-table-voodoo": f"</{tag_name}> end tag in table triggers foster parenting",
116
+ "unexpected-implied-end-tag-in-table-view": "Unexpected implied end tag while closing table",
117
+ "eof-in-table": "Unexpected end of file in table",
112
118
  "unexpected-cell-in-table-body": "Unexpected table cell outside of table row",
113
119
  "unexpected-form-in-table": "Form element not allowed in table context",
114
120
  "unexpected-hidden-input-in-table": "Hidden input in table triggers foster parenting",
@@ -134,6 +140,14 @@ def generate_error_message(code: str, tag_name: str | None = None) -> str:
134
140
  "adoption-agency-1.3": "Misnested tags require adoption agency algorithm",
135
141
  "non-void-html-element-start-tag-with-trailing-solidus": f"<{tag_name}/> self-closing syntax on non-void element",
136
142
  "image-start-tag": f"Deprecated <{tag_name}> tag (use <img> instead)",
143
+ # Select insertion mode (context-specific taxonomy)
144
+ "unexpected-start-tag-in-select": f"Unexpected <{tag_name}> start tag in <select>",
145
+ "unexpected-end-tag-in-select": f"Unexpected </{tag_name}> end tag in <select>",
146
+ "unexpected-select-in-select": "Unexpected nested <select> in <select>",
147
+ # ================================================================
148
+ # SECURITY ERRORS
149
+ # ================================================================
150
+ "unsafe-html": "Unsafe HTML detected by sanitization policy",
137
151
  }
138
152
 
139
153
  # Return message or fall back to the code itself if not found