justhtml 2.0.0__tar.gz → 2.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (148) hide show
  1. {justhtml-2.0.0 → justhtml-2.2.0}/CHANGELOG.md +29 -0
  2. justhtml-2.2.0/PKG-INFO +187 -0
  3. justhtml-2.2.0/README.md +133 -0
  4. {justhtml-2.0.0 → justhtml-2.2.0}/SECURITY.md +2 -1
  5. justhtml-2.2.0/assets/justhtml-readme-explainer.png +0 -0
  6. {justhtml-2.0.0 → justhtml-2.2.0}/benchmarks/correctness.py +2 -2
  7. justhtml-2.2.0/docs/comparison.md +82 -0
  8. {justhtml-2.0.0 → justhtml-2.2.0}/docs/index.md +1 -0
  9. {justhtml-2.0.0 → justhtml-2.2.0}/pyproject.toml +1 -1
  10. {justhtml-2.0.0 → justhtml-2.2.0}/scripts/release.py +45 -7
  11. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/parser/__init__.py +9 -6
  12. justhtml-2.2.0/src/justhtml/parser/stream.py +206 -0
  13. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/__init__.py +1 -0
  14. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/runtime.py +4 -0
  15. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/spec.py +1 -0
  16. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/serializer/html.py +3 -1
  17. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/tokenizer/html.py +48 -4
  18. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/__init__.py +19 -0
  19. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/compile.py +33 -50
  20. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/runtime.py +73 -13
  21. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/treebuilder/core.py +128 -57
  22. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/treebuilder/modes.py +109 -58
  23. {justhtml-2.0.0 → justhtml-2.2.0}/test-summary.txt +4 -4
  24. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/branch_coverage.dat +3 -3
  25. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_precommit_coverage.py +11 -4
  26. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_sanitize.py +97 -0
  27. justhtml-2.2.0/tests/test_stream.py +273 -0
  28. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_transforms.py +53 -0
  29. justhtml-2.2.0/tests/test_treebuilder.py +640 -0
  30. justhtml-2.0.0/PKG-INFO +0 -271
  31. justhtml-2.0.0/README.md +0 -217
  32. justhtml-2.0.0/src/justhtml/parser/stream.py +0 -110
  33. justhtml-2.0.0/tests/test_stream.py +0 -65
  34. justhtml-2.0.0/tests/test_treebuilder.py +0 -241
  35. {justhtml-2.0.0 → justhtml-2.2.0}/.github/copilot-instructions.md +0 -0
  36. {justhtml-2.0.0 → justhtml-2.2.0}/.github/workflows/ci.yml +0 -0
  37. {justhtml-2.0.0 → justhtml-2.2.0}/.github/workflows/publish.yml +0 -0
  38. {justhtml-2.0.0 → justhtml-2.2.0}/.gitignore +0 -0
  39. {justhtml-2.0.0 → justhtml-2.2.0}/.pre-commit-config.yaml +0 -0
  40. {justhtml-2.0.0 → justhtml-2.2.0}/CODE_OF_CONDUCT.md +0 -0
  41. {justhtml-2.0.0 → justhtml-2.2.0}/CONTRIBUTING.md +0 -0
  42. {justhtml-2.0.0 → justhtml-2.2.0}/LICENSE +0 -0
  43. {justhtml-2.0.0 → justhtml-2.2.0}/Makefile +0 -0
  44. {justhtml-2.0.0 → justhtml-2.2.0}/benchmarks/fuzz.py +0 -0
  45. {justhtml-2.0.0 → justhtml-2.2.0}/benchmarks/performance.py +0 -0
  46. {justhtml-2.0.0 → justhtml-2.2.0}/benchmarks/profile.py +0 -0
  47. {justhtml-2.0.0 → justhtml-2.2.0}/docs/_config.yml +0 -0
  48. {justhtml-2.0.0 → justhtml-2.2.0}/docs/_layouts/default.html +0 -0
  49. {justhtml-2.0.0 → justhtml-2.2.0}/docs/api.md +0 -0
  50. {justhtml-2.0.0 → justhtml-2.2.0}/docs/assets/search.js +0 -0
  51. {justhtml-2.0.0 → justhtml-2.2.0}/docs/bleach-migration.md +0 -0
  52. {justhtml-2.0.0 → justhtml-2.2.0}/docs/building.md +0 -0
  53. {justhtml-2.0.0 → justhtml-2.2.0}/docs/cli.md +0 -0
  54. {justhtml-2.0.0 → justhtml-2.2.0}/docs/correctness.md +0 -0
  55. {justhtml-2.0.0 → justhtml-2.2.0}/docs/encoding.md +0 -0
  56. {justhtml-2.0.0 → justhtml-2.2.0}/docs/errors.md +0 -0
  57. {justhtml-2.0.0 → justhtml-2.2.0}/docs/fragments.md +0 -0
  58. {justhtml-2.0.0 → justhtml-2.2.0}/docs/html-cleaning.md +0 -0
  59. {justhtml-2.0.0 → justhtml-2.2.0}/docs/linkify.md +0 -0
  60. {justhtml-2.0.0 → justhtml-2.2.0}/docs/migration-examples.md +0 -0
  61. {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/app.js +0 -0
  62. {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/index.html +0 -0
  63. {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/py/__init__.py +0 -0
  64. {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/py/install_latest_justhtml.py +0 -0
  65. {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/py/render.py +0 -0
  66. {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/py/use_local_repo.py +0 -0
  67. {justhtml-2.0.0 → justhtml-2.2.0}/docs/quickstart.md +0 -0
  68. {justhtml-2.0.0 → justhtml-2.2.0}/docs/sanitization.md +0 -0
  69. {justhtml-2.0.0 → justhtml-2.2.0}/docs/selectors.md +0 -0
  70. {justhtml-2.0.0 → justhtml-2.2.0}/docs/streaming.md +0 -0
  71. {justhtml-2.0.0 → justhtml-2.2.0}/docs/text.md +0 -0
  72. {justhtml-2.0.0 → justhtml-2.2.0}/docs/transforms.md +0 -0
  73. {justhtml-2.0.0 → justhtml-2.2.0}/docs/unsafe-handling.md +0 -0
  74. {justhtml-2.0.0 → justhtml-2.2.0}/docs/url-cleaning.md +0 -0
  75. {justhtml-2.0.0 → justhtml-2.2.0}/llms.txt +0 -0
  76. {justhtml-2.0.0 → justhtml-2.2.0}/run_tests.py +0 -0
  77. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/__init__.py +0 -0
  78. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/__main__.py +0 -0
  79. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/__init__.py +0 -0
  80. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/constants.py +0 -0
  81. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/entities.py +0 -0
  82. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/errors.py +0 -0
  83. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/rawtext.py +0 -0
  84. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/dom/__init__.py +0 -0
  85. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/dom/builder.py +0 -0
  86. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/parser/context.py +0 -0
  87. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/parser/encoding.py +0 -0
  88. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/py.typed +0 -0
  89. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/__init__.py +0 -0
  90. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/css.py +0 -0
  91. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/dom.py +0 -0
  92. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/policy.py +0 -0
  93. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/policy_defaults.py +0 -0
  94. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/rawtext.py +0 -0
  95. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/policy.py +0 -0
  96. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/selector/__init__.py +0 -0
  97. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/selector/core.py +0 -0
  98. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/serializer/__init__.py +0 -0
  99. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/serializer/markdown.py +0 -0
  100. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/tokenizer/__init__.py +0 -0
  101. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/tokenizer/tokens.py +0 -0
  102. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/linkify.py +0 -0
  103. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/linkify_core.py +0 -0
  104. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/spec.py +0 -0
  105. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/treebuilder/__init__.py +0 -0
  106. {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/treebuilder/utils.py +0 -0
  107. {justhtml-2.0.0 → justhtml-2.2.0}/tests/README.md +0 -0
  108. {justhtml-2.0.0 → justhtml-2.2.0}/tests/__init__.py +0 -0
  109. {justhtml-2.0.0 → justhtml-2.2.0}/tests/data/wikipedia.html +0 -0
  110. {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/__init__.py +0 -0
  111. {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/encoding.py +0 -0
  112. {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/regressions.py +0 -0
  113. {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/reporter.py +0 -0
  114. {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/serializer.py +0 -0
  115. {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/tokenizer.py +0 -0
  116. {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/tree.py +0 -0
  117. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-sanitize-tests/cases.json +0 -0
  118. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/coverage_gaps.test +0 -0
  119. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/empty_stack_edge_cases.dat +0 -0
  120. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/entities.test +0 -0
  121. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/iframe_srcdoc.dat +0 -0
  122. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/tokenizer_edge_cases.test +0 -0
  123. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/treebuilder_coverage.dat +0 -0
  124. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/xml_coercion.dat +0 -0
  125. {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/xml_coercion_coverage.test +0 -0
  126. {justhtml-2.0.0 → justhtml-2.2.0}/tests/linkify-it/LICENSE.txt +0 -0
  127. {justhtml-2.0.0 → justhtml-2.2.0}/tests/linkify-it/README.md +0 -0
  128. {justhtml-2.0.0 → justhtml-2.2.0}/tests/linkify-it/fixtures/links.txt +0 -0
  129. {justhtml-2.0.0 → justhtml-2.2.0}/tests/linkify-it/fixtures/not_links.txt +0 -0
  130. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_builder.py +0 -0
  131. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_cli.py +0 -0
  132. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_docs_examples.py +0 -0
  133. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_encoding.py +0 -0
  134. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_errors.py +0 -0
  135. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_linkify_internals.py +0 -0
  136. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_linkify_it.py +0 -0
  137. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_linkify_transform.py +0 -0
  138. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_node.py +0 -0
  139. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_playground_local_repo_file_list.py +0 -0
  140. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_sanitize_integration.py +0 -0
  141. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_sanitize_transform.py +0 -0
  142. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_selector.py +0 -0
  143. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_serialize.py +0 -0
  144. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_tokenizer.py +0 -0
  145. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_transforms_compiler.py +0 -0
  146. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_transforms_edge_cases.py +0 -0
  147. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_transforms_sanitize_integration.py +0 -0
  148. {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_wikipedia.py +0 -0
@@ -5,6 +5,35 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [Unreleased]
9
+
10
+ ## [2.2.0] - 2026-06-07
11
+
12
+ ### Fixed
13
+ - Handle `<select><selectedcontent></selectedcontent></select>` without crashing when no `<option>` is present, replace selectedcontent fallback content during parser finalization, and avoid repeated selectedcontent subtree scans.
14
+ - Preserve source order and tag text when escape-mode sanitization handles disallowed rawtext/RCDATA elements with attributed or self-closing end tags.
15
+ - Make `stream()` use namespace-aware tokenizer context for SVG/MathML CDATA, rawtext decisions, self-closing foreign tags, and foreign end-tag stack updates.
16
+ - Use the correct initial tokenizer states for HTML fragment contexts such as `<title>`, `<textarea>`, `<script>`, `<style>`, and scripting-disabled `<noscript>`.
17
+ - Use HTML rawtext/RCDATA tokenizer states for text-like elements inside SVG/MathML HTML integration points and MathML text integration points.
18
+ - Generate implied end tags before removing `<form>` on `</form>` so following controls do not remain inside still-open descendants.
19
+ - Keep `<form>` elements inside `<template>` from claiming the global form pointer, including table-template form insertion.
20
+ - Close open `<p>` elements correctly around `<option>`, `<optgroup>`, `<hr>`, `<p>`, and `<div>` starts in `<select>` parsing.
21
+ - Close `<template>` correctly when `</template>` is seen while parsing inside `<select>`.
22
+ - Keep `</p>` and `</br>` foreign-content breakouts inside MathML text integration points such as `<mi>` and `<mtext>`.
23
+ - Align customizable `<select>` parsing with Chromium for phantom `</p>` handling and generic custom child elements.
24
+
25
+ ### Security
26
+ - (Severity: Low) Strip invisible Unicode during URL sink validation even when general invisible-Unicode stripping is disabled. Previously, custom policies using `strip_invisible_unicode=False` could preserve scheme-obfuscated values such as `javascript\u200b:` in otherwise URL-validated attributes.
27
+
28
+ ## [2.1.0] - 2026-06-06
29
+
30
+ ### Performance
31
+ - Avoid quadratic work for deeply nested HTML during default sanitization by carrying foreign-content context through the transform traversal instead of rescanning each node's ancestors, and by short-circuiting repeated `<p>` scope checks when no `<p>` is open.
32
+ - Speed up default sanitization by constructing selector matchers only for transforms that actually need selector matching.
33
+ - Speed up sanitizer attribute and text cleanup by skipping URL-sink resolution for non-URL attributes and bypassing invisible-Unicode regex scans for ASCII values.
34
+ - Avoid unnecessary default work by skipping selectedcontent finalization when no `<select>` was parsed and by bypassing invisible-Unicode transform helper calls for ASCII node values.
35
+ - Skip the sanitizer rawtext hardening pass when the active policy cannot preserve `<script>` or `<style>` elements.
36
+
8
37
  ## [2.0.0] - 2026-05-24
9
38
 
10
39
  ### Changed
@@ -0,0 +1,187 @@
1
+ Metadata-Version: 2.4
2
+ Name: justhtml
3
+ Version: 2.2.0
4
+ Summary: A pure Python HTML5 parser that just works.
5
+ Project-URL: Homepage, https://github.com/emilstenstrom/justhtml
6
+ Project-URL: Issues, https://github.com/emilstenstrom/justhtml/issues
7
+ Author-email: Emil Stenström <emil@emilstenstrom.se>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2025 Emil Stenström (JustHTML)
11
+ Copyright (c) 2014-2017, The html5ever Project Developers (html5ever inspiration)
12
+ Copyright (c) 2006-2013 James Graham, Sam Sneddon, and
13
+ other contributors (html5lib-tests)
14
+
15
+ Permission is hereby granted, free of charge, to any person obtaining a copy
16
+ of this software and associated documentation files (the "Software"), to deal
17
+ in the Software without restriction, including without limitation the rights
18
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ copies of the Software, and to permit persons to whom the Software is
20
+ furnished to do so, subject to the following conditions:
21
+
22
+ The above copyright notice and this permission notice shall be included in all
23
+ copies or substantial portions of the Software.
24
+
25
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
+ SOFTWARE.
32
+ License-File: LICENSE
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Operating System :: OS Independent
35
+ Classifier: Programming Language :: Python :: 3
36
+ Requires-Python: >=3.10
37
+ Provides-Extra: benchmark
38
+ Requires-Dist: beautifulsoup4; extra == 'benchmark'
39
+ Requires-Dist: html5-parser; extra == 'benchmark'
40
+ Requires-Dist: html5lib; extra == 'benchmark'
41
+ Requires-Dist: lxml; extra == 'benchmark'
42
+ Requires-Dist: markupever; extra == 'benchmark'
43
+ Requires-Dist: psutil; extra == 'benchmark'
44
+ Requires-Dist: selectolax>=0.4.8; extra == 'benchmark'
45
+ Requires-Dist: zstandard; extra == 'benchmark'
46
+ Provides-Extra: dev
47
+ Requires-Dist: build; extra == 'dev'
48
+ Requires-Dist: coverage; extra == 'dev'
49
+ Requires-Dist: mypy>=1.0; (platform_python_implementation != 'PyPy') and extra == 'dev'
50
+ Requires-Dist: pre-commit; extra == 'dev'
51
+ Requires-Dist: ruff==0.14.7; extra == 'dev'
52
+ Requires-Dist: twine; extra == 'dev'
53
+ Description-Content-Type: text/markdown
54
+
55
+ # JustHTML
56
+
57
+ HTML from the real web is messy. It is often malformed, user supplied, scraped from unknown pages, or headed for a browser where small parsing differences can become security bugs.
58
+
59
+ JustHTML gives Python projects one small dependency for the common HTML jobs:
60
+
61
+ - parse HTML like a browser, including broken markup
62
+ - sanitize untrusted HTML by default
63
+ - query with CSS selectors
64
+ - transform, serialize, extract text, or convert to Markdown
65
+ - run anywhere Python runs, with no C extension and no system package to install
66
+
67
+ ```bash
68
+ pip install justhtml
69
+ ```
70
+
71
+ Requires Python 3.10 or later.
72
+
73
+ [Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
74
+
75
+ ![JustHTML turns messy unsafe HTML into a sanitized, queryable DOM, then serializes it to text, Markdown, or HTML.](assets/justhtml-readme-explainer.png)
76
+
77
+ ## Why Use It?
78
+
79
+ Most Python HTML libraries optimize for one part of the problem.
80
+
81
+ `html.parser` is built in, but not HTML5-correct. BeautifulSoup is convenient, but depends heavily on the parser underneath. `lxml` and C/Rust-backed parsers are fast, but usually leave sanitization as a separate concern. `html5lib` and Bleach shaped the Python ecosystem, but neither is the obvious foundation for new projects anymore.
82
+
83
+ JustHTML is for applications that want a boring, inspectable, pure-Python default:
84
+
85
+ - **Correct parsing:** browser-style HTML5 recovery, tested against the official html5lib fixtures.
86
+ - **Safe by default:** `JustHTML(html)` sanitizes before you query or serialize.
87
+ - **One DOM:** parse once, then sanitize, query, transform, serialize, extract text, or produce Markdown.
88
+ - **Easy deployment:** zero runtime dependencies, no compiler, works on PyPy and Pyodide.
89
+ - **Honest tradeoff:** if you are parsing terabytes of trusted HTML, use a C/Rust parser. If you need reliable handling of untrusted or malformed HTML inside a Python app, use JustHTML.
90
+
91
+ ## Quick Start
92
+
93
+ ```python
94
+ from justhtml import JustHTML
95
+
96
+ doc = JustHTML(
97
+ "<p>Hello<script>alert(1)</script> "
98
+ "<a href='javascript:alert(1)'>bad</a> "
99
+ "<a href='https://example.com'>ok</a></p>",
100
+ fragment=True,
101
+ )
102
+
103
+ print(doc.to_html(pretty=False))
104
+ # => <p>Hello <a>bad</a> <a href="https://example.com">ok</a></p>
105
+ ```
106
+
107
+ Sanitization is enabled by default. Disable it only for trusted input:
108
+
109
+ ```python
110
+ doc = JustHTML("<main><p class='intro'>Hello</p></main>", sanitize=False)
111
+ intro = doc.query_one("p.intro")
112
+
113
+ print(intro.to_text())
114
+ # => Hello
115
+ ```
116
+
117
+ ## What You Can Do
118
+
119
+ ```python
120
+ from justhtml import JustHTML, Linkify, SetAttrs, Unwrap
121
+
122
+ doc = JustHTML(
123
+ "<p>Hello <span>world</span> example.com</p>",
124
+ fragment=True,
125
+ sanitize=False,
126
+ transforms=[
127
+ Unwrap("span"),
128
+ Linkify(),
129
+ SetAttrs("a", rel="nofollow"),
130
+ ],
131
+ )
132
+
133
+ print(doc.to_html(pretty=False))
134
+ # => <p>Hello world <a href="http://example.com" rel="nofollow">example.com</a></p>
135
+ ```
136
+
137
+ JustHTML includes:
138
+
139
+ - [CSS selectors](docs/selectors.md): `query()` and `query_one()`
140
+ - [Sanitization](docs/sanitization.md): allowlisted HTML cleaning, URL policies, inline CSS controls
141
+ - [Transforms](docs/transforms.md): unwrap, drop, edit attributes, linkify, compose cleanup pipelines
142
+ - [Text output](docs/text.md): `to_text()` and Markdown generation
143
+ - [Builder API](docs/building.md): construct nodes directly from Python
144
+ - [Streaming](docs/streaming.md): process large inputs incrementally
145
+ - [Bleach migration guide](docs/bleach-migration.md): move existing sanitizer code to JustHTML policies
146
+
147
+ ## Command Line
148
+
149
+ ```bash
150
+ # Pretty-print an HTML file
151
+ justhtml index.html
152
+
153
+ # Parse from stdin
154
+ curl -s https://example.com | justhtml -
155
+
156
+ # Extract text from selected nodes
157
+ justhtml index.html --selector "main p" --format text
158
+
159
+ # Convert selected HTML to Markdown
160
+ justhtml index.html --selector "article" --format markdown
161
+ ```
162
+
163
+ ## Correctness
164
+
165
+ JustHTML is tested against the official html5lib tokenizer, tree-construction, serializer, and encoding fixtures, plus project-specific sanitizer, selector, transform, CLI, and regression tests.
166
+
167
+ The test suite currently has 10,257 passing tests with 100% line and branch coverage. See [Correctness Testing](docs/correctness.md) for details.
168
+
169
+ ## Documentation
170
+
171
+ - [Quickstart](docs/quickstart.md)
172
+ - [Comparison](docs/comparison.md)
173
+ - [API Reference](docs/api.md)
174
+ - [Sanitization & Security](docs/sanitization.md)
175
+ - [Migrating from Bleach](docs/bleach-migration.md)
176
+ - [Command Line](docs/cli.md)
177
+ - [Full documentation site](https://emilstenstrom.github.io/justhtml/)
178
+
179
+ ## Security
180
+
181
+ JustHTML sanitizes by default, but output safety still depends on where you put it. HTML body output is not automatically safe inside JavaScript, CSS, URL attributes, or other contexts.
182
+
183
+ For the supported-version policy and vulnerability reporting, see [SECURITY.md](SECURITY.md).
184
+
185
+ ## License
186
+
187
+ MIT. Free to use for commercial and non-commercial projects.
@@ -0,0 +1,133 @@
1
+ # JustHTML
2
+
3
+ HTML from the real web is messy. It is often malformed, user supplied, scraped from unknown pages, or headed for a browser where small parsing differences can become security bugs.
4
+
5
+ JustHTML gives Python projects one small dependency for the common HTML jobs:
6
+
7
+ - parse HTML like a browser, including broken markup
8
+ - sanitize untrusted HTML by default
9
+ - query with CSS selectors
10
+ - transform, serialize, extract text, or convert to Markdown
11
+ - run anywhere Python runs, with no C extension and no system package to install
12
+
13
+ ```bash
14
+ pip install justhtml
15
+ ```
16
+
17
+ Requires Python 3.10 or later.
18
+
19
+ [Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
20
+
21
+ ![JustHTML turns messy unsafe HTML into a sanitized, queryable DOM, then serializes it to text, Markdown, or HTML.](assets/justhtml-readme-explainer.png)
22
+
23
+ ## Why Use It?
24
+
25
+ Most Python HTML libraries optimize for one part of the problem.
26
+
27
+ `html.parser` is built in, but not HTML5-correct. BeautifulSoup is convenient, but depends heavily on the parser underneath. `lxml` and C/Rust-backed parsers are fast, but usually leave sanitization as a separate concern. `html5lib` and Bleach shaped the Python ecosystem, but neither is the obvious foundation for new projects anymore.
28
+
29
+ JustHTML is for applications that want a boring, inspectable, pure-Python default:
30
+
31
+ - **Correct parsing:** browser-style HTML5 recovery, tested against the official html5lib fixtures.
32
+ - **Safe by default:** `JustHTML(html)` sanitizes before you query or serialize.
33
+ - **One DOM:** parse once, then sanitize, query, transform, serialize, extract text, or produce Markdown.
34
+ - **Easy deployment:** zero runtime dependencies, no compiler, works on PyPy and Pyodide.
35
+ - **Honest tradeoff:** if you are parsing terabytes of trusted HTML, use a C/Rust parser. If you need reliable handling of untrusted or malformed HTML inside a Python app, use JustHTML.
36
+
37
+ ## Quick Start
38
+
39
+ ```python
40
+ from justhtml import JustHTML
41
+
42
+ doc = JustHTML(
43
+ "<p>Hello<script>alert(1)</script> "
44
+ "<a href='javascript:alert(1)'>bad</a> "
45
+ "<a href='https://example.com'>ok</a></p>",
46
+ fragment=True,
47
+ )
48
+
49
+ print(doc.to_html(pretty=False))
50
+ # => <p>Hello <a>bad</a> <a href="https://example.com">ok</a></p>
51
+ ```
52
+
53
+ Sanitization is enabled by default. Disable it only for trusted input:
54
+
55
+ ```python
56
+ doc = JustHTML("<main><p class='intro'>Hello</p></main>", sanitize=False)
57
+ intro = doc.query_one("p.intro")
58
+
59
+ print(intro.to_text())
60
+ # => Hello
61
+ ```
62
+
63
+ ## What You Can Do
64
+
65
+ ```python
66
+ from justhtml import JustHTML, Linkify, SetAttrs, Unwrap
67
+
68
+ doc = JustHTML(
69
+ "<p>Hello <span>world</span> example.com</p>",
70
+ fragment=True,
71
+ sanitize=False,
72
+ transforms=[
73
+ Unwrap("span"),
74
+ Linkify(),
75
+ SetAttrs("a", rel="nofollow"),
76
+ ],
77
+ )
78
+
79
+ print(doc.to_html(pretty=False))
80
+ # => <p>Hello world <a href="http://example.com" rel="nofollow">example.com</a></p>
81
+ ```
82
+
83
+ JustHTML includes:
84
+
85
+ - [CSS selectors](docs/selectors.md): `query()` and `query_one()`
86
+ - [Sanitization](docs/sanitization.md): allowlisted HTML cleaning, URL policies, inline CSS controls
87
+ - [Transforms](docs/transforms.md): unwrap, drop, edit attributes, linkify, compose cleanup pipelines
88
+ - [Text output](docs/text.md): `to_text()` and Markdown generation
89
+ - [Builder API](docs/building.md): construct nodes directly from Python
90
+ - [Streaming](docs/streaming.md): process large inputs incrementally
91
+ - [Bleach migration guide](docs/bleach-migration.md): move existing sanitizer code to JustHTML policies
92
+
93
+ ## Command Line
94
+
95
+ ```bash
96
+ # Pretty-print an HTML file
97
+ justhtml index.html
98
+
99
+ # Parse from stdin
100
+ curl -s https://example.com | justhtml -
101
+
102
+ # Extract text from selected nodes
103
+ justhtml index.html --selector "main p" --format text
104
+
105
+ # Convert selected HTML to Markdown
106
+ justhtml index.html --selector "article" --format markdown
107
+ ```
108
+
109
+ ## Correctness
110
+
111
+ JustHTML is tested against the official html5lib tokenizer, tree-construction, serializer, and encoding fixtures, plus project-specific sanitizer, selector, transform, CLI, and regression tests.
112
+
113
+ The test suite currently has 10,257 passing tests with 100% line and branch coverage. See [Correctness Testing](docs/correctness.md) for details.
114
+
115
+ ## Documentation
116
+
117
+ - [Quickstart](docs/quickstart.md)
118
+ - [Comparison](docs/comparison.md)
119
+ - [API Reference](docs/api.md)
120
+ - [Sanitization & Security](docs/sanitization.md)
121
+ - [Migrating from Bleach](docs/bleach-migration.md)
122
+ - [Command Line](docs/cli.md)
123
+ - [Full documentation site](https://emilstenstrom.github.io/justhtml/)
124
+
125
+ ## Security
126
+
127
+ JustHTML sanitizes by default, but output safety still depends on where you put it. HTML body output is not automatically safe inside JavaScript, CSS, URL attributes, or other contexts.
128
+
129
+ For the supported-version policy and vulnerability reporting, see [SECURITY.md](SECURITY.md).
130
+
131
+ ## License
132
+
133
+ MIT. Free to use for commercial and non-commercial projects.
@@ -15,7 +15,8 @@
15
15
 
16
16
  | Version | Supported |
17
17
  | ------- | ---------------------------------------- |
18
- | 1.x | :white_check_mark: (until 2.0 is released) |
18
+ | 2.x | :white_check_mark: (until 3.0 is released) |
19
+ | 1.x | :x: |
19
20
  | < 1.0 | :x: |
20
21
 
21
22
  ## Security Domains
@@ -18,8 +18,8 @@ from enum import Enum
18
18
  from pathlib import Path
19
19
 
20
20
  from justhtml import JustHTML
21
- from justhtml.context import FragmentContext
22
- from justhtml.serialize import to_test_format
21
+ from justhtml.parser.context import FragmentContext
22
+ from justhtml.serializer import to_test_format
23
23
 
24
24
  # Available parsers
25
25
  PARSERS = ["justhtml", "html5lib", "html5_parser", "lxml", "bs4", "html.parser", "selectolax", "markupever"]
@@ -0,0 +1,82 @@
1
+ [← Back to docs](index.md)
2
+
3
+ # Comparison
4
+
5
+ Use JustHTML when you want browser-grade HTML parsing, safe-by-default sanitization, CSS selectors, transforms, text extraction, and serialization in one pure-Python package.
6
+
7
+ Use a different tool when one narrow requirement matters more than the whole pipeline: maximum throughput, a BeautifulSoup-specific API, XPath-heavy XML work, or integration with an existing lxml tree.
8
+
9
+ ## At a Glance
10
+
11
+ | Tool | HTML5 parsing [1][2] | Speed | Query | Build | Sanitize | Notes |
12
+ |------|------------------------------------------|-------|----------|-------|------------------|-------|
13
+ | **JustHTML**<br>Pure Python | ✅&nbsp;100% | ⚡ Fast | ✅ CSS selectors | ✅ `element()` | ✅ Built-in | Correct, secure, easy to install, and fast enough. |
14
+ | **`selectolax`**<br>Python wrapper of C-based Lexbor | ✅&nbsp;100% | 🚀 Very Fast | ✅ CSS selectors | ✅ `create_node()` | ❌ Needs sanitization | Very fast and spec-compliant. |
15
+ | **Chromium**<br>browser engine | ✅&nbsp;99.5% | 🚀&nbsp;Very&nbsp;Fast | — | — | — | — |
16
+ | **WebKit**<br>browser engine | ✅ 98.4% | 🚀 Very Fast | — | — | — | — |
17
+ | **Firefox**<br>browser engine | ✅ 97.6% | 🚀 Very Fast | — | — | — | — |
18
+ | **`markupever`**<br>Python wrapper of Rust-based html5ever | 🟡 89% | 🚀 Very Fast | ✅ CSS selectors | ✅ `TreeDom .create_*()` | ❌ Needs sanitization | Fast and mostly correct, but missing benchmarked capabilities count against compliance. |
19
+ | **`html5lib`**<br>Pure Python | 🟡 86% | 🐢 Slow | 🟡 XPath (lxml) | 🟡 Tree API | 🔴 [Deprecated](https://github.com/html5lib/html5lib-python/issues/443) | Unmaintained reference implementation; incomplete coverage of the tree-construction fixtures. |
20
+ | **`html5_parser`**<br>Python wrapper of C-based Gumbo | 🔴 49% | 🚀 Very Fast | 🟡 XPath (lxml) | 🟡 `etree` (lxml) | ❌ Needs sanitization | Fast, but its public tree API loses information needed by many fixtures. |
21
+ | **`BeautifulSoup`**<br>Pure Python | 🔴 <1% (default) | 🐢 Slow | 🟡 Custom API | ✅ `new_tag()` API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
22
+ | **`html.parser`**<br>Python stdlib | 🔴 <1% | ⚡ Fast | ❌ None | ❌ None | ❌ Needs sanitization | Standard library. Chokes on malformed HTML. |
23
+ | **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 <1% | 🚀 Very Fast | 🟡 XPath | ✅ `etree` / E-factory | ❌ Needs sanitization | Fast but not HTML5 compliant. Context-fragment cases are skipped; supported cases still perform poorly. Don't use the old lxml.html.clean module! |
24
+
25
+ [1]: Parser compliance scores are from a strict run of the [html5lib-tests](https://github.com/html5lib/html5lib-tests) tree-construction fixtures (1,743 non-script tests). The score is `pass / (pass + fail + error)`; unsupported public API capabilities count as failures rather than being faked. The benchmark may compose multiple public APIs from the same parser, but does not use testcase-specific shims or synthetic adapters when an API surface is missing. See [Correctness Testing](correctness.md) for details.
26
+
27
+ [2]: Browser numbers are from a local rerun of [`justhtml-html5lib-tests-bench`](https://github.com/EmilStenstrom/justhtml-html5lib-tests-bench) against this repo's `tests/html5lib-tests-tree/*.dat` corpus: Chromium 1762/1770, WebKit 1742/1770, Firefox 1728/1770, with 12 skipped scripting-enabled cases per engine.
28
+
29
+ ## Why JustHTML
30
+
31
+ Most Python HTML projects start simple and then accumulate extra tools:
32
+
33
+ - a parser for broken HTML
34
+ - a sanitizer for user input
35
+ - a selector engine
36
+ - a serializer
37
+ - linkification or cleanup filters
38
+ - text or Markdown extraction
39
+
40
+ JustHTML keeps those operations on one DOM. That makes the behavior easier to reason about, especially when the input is untrusted.
41
+
42
+ ```python
43
+ from justhtml import JustHTML
44
+
45
+ doc = JustHTML("<p>Hello<script>alert(1)</script><a href='javascript:x'>link</a></p>", fragment=True)
46
+
47
+ print(doc.to_html(pretty=False))
48
+ # <p>Hello<a>link</a></p>
49
+ ```
50
+
51
+ Sanitization happens before you query or serialize unless you explicitly disable it with `sanitize=False`.
52
+
53
+ ## When to Choose Another Tool
54
+
55
+ Choose **selectolax** when raw speed is the main requirement and the HTML is trusted or sanitized elsewhere.
56
+
57
+ Choose **markupever** or **html5_parser** when you specifically want their underlying parser engines or tree APIs and can accept their compatibility tradeoffs.
58
+
59
+ Choose **BeautifulSoup** when you want its forgiving, familiar scraping API and parser correctness is not the main risk.
60
+
61
+ Choose **lxml** when your project is already built around XPath, etree, or XML-style processing.
62
+
63
+ Choose **nh3** when you only need fast sanitization and are happy with a Rust-backed dependency.
64
+
65
+ Choose **html.parser** when you need a tiny stdlib-only script for trusted input and HTML5 correctness does not matter.
66
+
67
+ Choose **Bleach** only for existing codebases that already depend on it. For new projects, prefer an actively maintained sanitizer path. See [Migrating from Bleach](bleach-migration.md).
68
+
69
+ ## Tradeoffs
70
+
71
+ JustHTML is pure Python. That makes it easy to install, inspect, debug, and run in environments like Pyodide, but it will not beat C or Rust parsers on raw throughput.
72
+
73
+ JustHTML sanitizes HTML output by default. That is the right default for user-generated content, CMS snippets, comments, scraped fragments, and transform pipelines that eventually return to a browser. If all of your input is trusted, pass `sanitize=False`.
74
+
75
+ JustHTML's sanitizer emits HTML-only output. SVG and MathML can still be parsed when sanitization is disabled, but sanitized output drops foreign-namespace content to keep the security model smaller and more reviewable.
76
+
77
+ ## Related Pages
78
+
79
+ - [Correctness Testing](correctness.md)
80
+ - [Sanitization & Security](sanitization.md)
81
+ - [Migrating from Bleach](bleach-migration.md)
82
+ - [Performance Benchmark](../benchmarks/performance.py)
@@ -21,6 +21,7 @@ A pure Python HTML5 parser that just works.
21
21
  ## Contents
22
22
 
23
23
  - **[Quickstart](quickstart.md)** - Get up and running in 2 minutes
24
+ - **[Comparison](comparison.md)** - How JustHTML compares with other Python HTML tools
24
25
  - **[Learn by examples](migration-examples.md)** - Real-world StackOverflow tasks rewritten with JustHTML
25
26
  - **[API Reference](api.md)** - Complete public API documentation
26
27
  - **[Command Line](cli.md)** - Use `justhtml` to extract HTML, text, or Markdown
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "justhtml"
3
3
  authors = [{ name = "Emil Stenström", email = "emil@emilstenstrom.se" }]
4
- version = "2.0.0"
4
+ version = "2.2.0"
5
5
  description = "A pure Python HTML5 parser that just works."
6
6
  readme = "README.md"
7
7
  license = { file = "LICENSE" }
@@ -2,18 +2,18 @@
2
2
  """Interactive release helper for JustHTML.
3
3
 
4
4
  What it does (in order):
5
- 1) Bumps version in pyproject.toml ([project].version)
6
- 2) Commits the change
7
- 3) Creates an annotated git tag
8
- 4) Pushes commit + tag
9
- 5) Creates a GitHub release (marked as latest) via `gh`
10
-
11
- This script is intentionally minimal and uses only `git` and the GitHub CLI (`gh`).
5
+ 1) Runs the repo-wide pre-commit checks that CI runs for releases (mypy skipped)
6
+ 2) Bumps version in pyproject.toml ([project].version)
7
+ 3) Commits the change
8
+ 4) Creates an annotated git tag
9
+ 5) Pushes commit + tag
10
+ 6) Creates a GitHub release (marked as latest) via `gh`
12
11
  """
13
12
 
14
13
  from __future__ import annotations
15
14
 
16
15
  import argparse
16
+ import os
17
17
  import re
18
18
  import shlex
19
19
  import subprocess
@@ -55,6 +55,29 @@ def _run(cmd: list[str], *, check: bool = True) -> CmdResult:
55
55
  return out
56
56
 
57
57
 
58
+ def _run_with_env(cmd: list[str], *, env: dict[str, str], check: bool = True) -> CmdResult:
59
+ merged_env = os.environ.copy()
60
+ merged_env.update(env)
61
+ p = subprocess.run( # noqa: S603
62
+ cmd,
63
+ check=False,
64
+ text=True,
65
+ stdout=subprocess.PIPE,
66
+ stderr=subprocess.STDOUT,
67
+ env=merged_env,
68
+ )
69
+ out = CmdResult(stdout=p.stdout, returncode=p.returncode)
70
+ if check and p.returncode != 0:
71
+ raise RuntimeError(
72
+ "Command failed (exit {code}): {cmd}\n{out}".format(
73
+ code=p.returncode,
74
+ cmd=_quote_cmd(cmd),
75
+ out=(p.stdout or "").rstrip(),
76
+ )
77
+ )
78
+ return out
79
+
80
+
58
81
  def _run_quiet_ok(cmd: list[str]) -> bool:
59
82
  p = subprocess.run( # noqa: S603
60
83
  cmd,
@@ -248,6 +271,13 @@ def _default_repo_from_remote(remote: str) -> str:
248
271
  return f"{m.group('owner')}/{m.group('repo')}"
249
272
 
250
273
 
274
+ def _run_release_checks() -> None:
275
+ print("Running release checks: SKIP=mypy pre-commit run --all-files")
276
+ out = _run_with_env(["pre-commit", "run", "--all-files"], env={"SKIP": "mypy"}).stdout
277
+ if out.strip():
278
+ print(out.rstrip())
279
+
280
+
251
281
  def main(argv: list[str] | None = None) -> int:
252
282
  parser = argparse.ArgumentParser(description="Bump version, tag, and create a GitHub release.")
253
283
  parser.add_argument("--version", help="New version, e.g. 0.21.0 (will be tagged as v0.21.0 unless --tag is set)")
@@ -294,12 +324,20 @@ def main(argv: list[str] | None = None) -> int:
294
324
  action="store_true",
295
325
  help="Do not prompt for confirmation before push/release.",
296
326
  )
327
+ parser.add_argument(
328
+ "--skip-checks",
329
+ action="store_true",
330
+ help="Skip repo-wide pre-commit validation. Use only when you have already validated the exact release commit.",
331
+ )
297
332
 
298
333
  args = parser.parse_args(argv)
299
334
 
300
335
  try:
301
336
  _require_clean_git()
302
337
 
338
+ if not args.skip_checks:
339
+ _run_release_checks()
340
+
303
341
  py_text = PYPROJECT_PATH.read_text(encoding="utf-8")
304
342
  current_version = _read_current_version(py_text)
305
343
 
@@ -163,14 +163,17 @@ class JustHTML:
163
163
  if needs_escape_incomplete_tags:
164
164
  opts.emit_bogus_markup_as_text = True
165
165
 
166
- # For RAWTEXT fragment contexts, set initial tokenizer state and rawtext tag
167
- if fragment_context and not fragment_context.namespace:
168
- rawtext_elements = {"textarea", "title", "style"}
166
+ # For text-like HTML fragment contexts, set the initial tokenizer state
167
+ # to match the context element.
168
+ if fragment_context and fragment_context.namespace in {None, "html"}:
169
169
  tag_name = fragment_context.tag_name.lower()
170
- if tag_name in rawtext_elements:
170
+ if tag_name in {"textarea", "title"}:
171
+ opts.initial_state = Tokenizer.RCDATA
172
+ elif tag_name in {"iframe", "noembed", "noframes", "script", "style", "xmp"} or (
173
+ tag_name == "noscript" and opts.scripting_enabled
174
+ ):
171
175
  opts.initial_state = Tokenizer.RAWTEXT
172
- opts.initial_rawtext_tag = tag_name
173
- elif tag_name in ("plaintext", "script"):
176
+ elif tag_name == "plaintext":
174
177
  opts.initial_state = Tokenizer.PLAINTEXT
175
178
 
176
179
  self.tokenizer = Tokenizer(