justhtml 2.0.0__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. {justhtml-2.0.0 → justhtml-2.1.0}/CHANGELOG.md +11 -0
  2. justhtml-2.1.0/PKG-INFO +185 -0
  3. justhtml-2.1.0/README.md +131 -0
  4. {justhtml-2.0.0 → justhtml-2.1.0}/SECURITY.md +2 -1
  5. {justhtml-2.0.0 → justhtml-2.1.0}/benchmarks/correctness.py +2 -2
  6. justhtml-2.1.0/docs/comparison.md +82 -0
  7. {justhtml-2.0.0 → justhtml-2.1.0}/docs/index.md +1 -0
  8. {justhtml-2.0.0 → justhtml-2.1.0}/pyproject.toml +1 -1
  9. {justhtml-2.0.0 → justhtml-2.1.0}/scripts/release.py +45 -7
  10. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/url/__init__.py +1 -0
  11. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/url/runtime.py +2 -0
  12. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/url/spec.py +1 -0
  13. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/transforms/__init__.py +19 -0
  14. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/transforms/compile.py +33 -50
  15. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/transforms/runtime.py +73 -13
  16. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/treebuilder/core.py +42 -3
  17. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/treebuilder/modes.py +6 -0
  18. {justhtml-2.0.0 → justhtml-2.1.0}/test-summary.txt +4 -4
  19. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_precommit_coverage.py +11 -4
  20. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_sanitize.py +11 -0
  21. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_transforms.py +53 -0
  22. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_treebuilder.py +15 -7
  23. justhtml-2.0.0/PKG-INFO +0 -271
  24. justhtml-2.0.0/README.md +0 -217
  25. {justhtml-2.0.0 → justhtml-2.1.0}/.github/copilot-instructions.md +0 -0
  26. {justhtml-2.0.0 → justhtml-2.1.0}/.github/workflows/ci.yml +0 -0
  27. {justhtml-2.0.0 → justhtml-2.1.0}/.github/workflows/publish.yml +0 -0
  28. {justhtml-2.0.0 → justhtml-2.1.0}/.gitignore +0 -0
  29. {justhtml-2.0.0 → justhtml-2.1.0}/.pre-commit-config.yaml +0 -0
  30. {justhtml-2.0.0 → justhtml-2.1.0}/CODE_OF_CONDUCT.md +0 -0
  31. {justhtml-2.0.0 → justhtml-2.1.0}/CONTRIBUTING.md +0 -0
  32. {justhtml-2.0.0 → justhtml-2.1.0}/LICENSE +0 -0
  33. {justhtml-2.0.0 → justhtml-2.1.0}/Makefile +0 -0
  34. {justhtml-2.0.0 → justhtml-2.1.0}/benchmarks/fuzz.py +0 -0
  35. {justhtml-2.0.0 → justhtml-2.1.0}/benchmarks/performance.py +0 -0
  36. {justhtml-2.0.0 → justhtml-2.1.0}/benchmarks/profile.py +0 -0
  37. {justhtml-2.0.0 → justhtml-2.1.0}/docs/_config.yml +0 -0
  38. {justhtml-2.0.0 → justhtml-2.1.0}/docs/_layouts/default.html +0 -0
  39. {justhtml-2.0.0 → justhtml-2.1.0}/docs/api.md +0 -0
  40. {justhtml-2.0.0 → justhtml-2.1.0}/docs/assets/search.js +0 -0
  41. {justhtml-2.0.0 → justhtml-2.1.0}/docs/bleach-migration.md +0 -0
  42. {justhtml-2.0.0 → justhtml-2.1.0}/docs/building.md +0 -0
  43. {justhtml-2.0.0 → justhtml-2.1.0}/docs/cli.md +0 -0
  44. {justhtml-2.0.0 → justhtml-2.1.0}/docs/correctness.md +0 -0
  45. {justhtml-2.0.0 → justhtml-2.1.0}/docs/encoding.md +0 -0
  46. {justhtml-2.0.0 → justhtml-2.1.0}/docs/errors.md +0 -0
  47. {justhtml-2.0.0 → justhtml-2.1.0}/docs/fragments.md +0 -0
  48. {justhtml-2.0.0 → justhtml-2.1.0}/docs/html-cleaning.md +0 -0
  49. {justhtml-2.0.0 → justhtml-2.1.0}/docs/linkify.md +0 -0
  50. {justhtml-2.0.0 → justhtml-2.1.0}/docs/migration-examples.md +0 -0
  51. {justhtml-2.0.0 → justhtml-2.1.0}/docs/playground/app.js +0 -0
  52. {justhtml-2.0.0 → justhtml-2.1.0}/docs/playground/index.html +0 -0
  53. {justhtml-2.0.0 → justhtml-2.1.0}/docs/playground/py/__init__.py +0 -0
  54. {justhtml-2.0.0 → justhtml-2.1.0}/docs/playground/py/install_latest_justhtml.py +0 -0
  55. {justhtml-2.0.0 → justhtml-2.1.0}/docs/playground/py/render.py +0 -0
  56. {justhtml-2.0.0 → justhtml-2.1.0}/docs/playground/py/use_local_repo.py +0 -0
  57. {justhtml-2.0.0 → justhtml-2.1.0}/docs/quickstart.md +0 -0
  58. {justhtml-2.0.0 → justhtml-2.1.0}/docs/sanitization.md +0 -0
  59. {justhtml-2.0.0 → justhtml-2.1.0}/docs/selectors.md +0 -0
  60. {justhtml-2.0.0 → justhtml-2.1.0}/docs/streaming.md +0 -0
  61. {justhtml-2.0.0 → justhtml-2.1.0}/docs/text.md +0 -0
  62. {justhtml-2.0.0 → justhtml-2.1.0}/docs/transforms.md +0 -0
  63. {justhtml-2.0.0 → justhtml-2.1.0}/docs/unsafe-handling.md +0 -0
  64. {justhtml-2.0.0 → justhtml-2.1.0}/docs/url-cleaning.md +0 -0
  65. {justhtml-2.0.0 → justhtml-2.1.0}/llms.txt +0 -0
  66. {justhtml-2.0.0 → justhtml-2.1.0}/run_tests.py +0 -0
  67. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/__init__.py +0 -0
  68. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/__main__.py +0 -0
  69. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/core/__init__.py +0 -0
  70. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/core/constants.py +0 -0
  71. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/core/entities.py +0 -0
  72. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/core/errors.py +0 -0
  73. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/core/rawtext.py +0 -0
  74. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/dom/__init__.py +0 -0
  75. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/dom/builder.py +0 -0
  76. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/parser/__init__.py +0 -0
  77. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/parser/context.py +0 -0
  78. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/parser/encoding.py +0 -0
  79. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/parser/stream.py +0 -0
  80. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/py.typed +0 -0
  81. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/__init__.py +0 -0
  82. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/css.py +0 -0
  83. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/dom.py +0 -0
  84. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/policy.py +0 -0
  85. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/policy_defaults.py +0 -0
  86. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/rawtext.py +0 -0
  87. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/sanitizer/url/policy.py +0 -0
  88. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/selector/__init__.py +0 -0
  89. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/selector/core.py +0 -0
  90. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/serializer/__init__.py +0 -0
  91. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/serializer/html.py +0 -0
  92. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/serializer/markdown.py +0 -0
  93. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/tokenizer/__init__.py +0 -0
  94. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/tokenizer/html.py +0 -0
  95. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/tokenizer/tokens.py +0 -0
  96. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/transforms/linkify.py +0 -0
  97. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/transforms/linkify_core.py +0 -0
  98. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/transforms/spec.py +0 -0
  99. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/treebuilder/__init__.py +0 -0
  100. {justhtml-2.0.0 → justhtml-2.1.0}/src/justhtml/treebuilder/utils.py +0 -0
  101. {justhtml-2.0.0 → justhtml-2.1.0}/tests/README.md +0 -0
  102. {justhtml-2.0.0 → justhtml-2.1.0}/tests/__init__.py +0 -0
  103. {justhtml-2.0.0 → justhtml-2.1.0}/tests/data/wikipedia.html +0 -0
  104. {justhtml-2.0.0 → justhtml-2.1.0}/tests/harness/__init__.py +0 -0
  105. {justhtml-2.0.0 → justhtml-2.1.0}/tests/harness/encoding.py +0 -0
  106. {justhtml-2.0.0 → justhtml-2.1.0}/tests/harness/regressions.py +0 -0
  107. {justhtml-2.0.0 → justhtml-2.1.0}/tests/harness/reporter.py +0 -0
  108. {justhtml-2.0.0 → justhtml-2.1.0}/tests/harness/serializer.py +0 -0
  109. {justhtml-2.0.0 → justhtml-2.1.0}/tests/harness/tokenizer.py +0 -0
  110. {justhtml-2.0.0 → justhtml-2.1.0}/tests/harness/tree.py +0 -0
  111. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-sanitize-tests/cases.json +0 -0
  112. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-tests/branch_coverage.dat +0 -0
  113. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-tests/coverage_gaps.test +0 -0
  114. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-tests/empty_stack_edge_cases.dat +0 -0
  115. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-tests/entities.test +0 -0
  116. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-tests/iframe_srcdoc.dat +0 -0
  117. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-tests/tokenizer_edge_cases.test +0 -0
  118. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-tests/treebuilder_coverage.dat +0 -0
  119. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-tests/xml_coercion.dat +0 -0
  120. {justhtml-2.0.0 → justhtml-2.1.0}/tests/justhtml-tests/xml_coercion_coverage.test +0 -0
  121. {justhtml-2.0.0 → justhtml-2.1.0}/tests/linkify-it/LICENSE.txt +0 -0
  122. {justhtml-2.0.0 → justhtml-2.1.0}/tests/linkify-it/README.md +0 -0
  123. {justhtml-2.0.0 → justhtml-2.1.0}/tests/linkify-it/fixtures/links.txt +0 -0
  124. {justhtml-2.0.0 → justhtml-2.1.0}/tests/linkify-it/fixtures/not_links.txt +0 -0
  125. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_builder.py +0 -0
  126. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_cli.py +0 -0
  127. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_docs_examples.py +0 -0
  128. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_encoding.py +0 -0
  129. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_errors.py +0 -0
  130. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_linkify_internals.py +0 -0
  131. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_linkify_it.py +0 -0
  132. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_linkify_transform.py +0 -0
  133. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_node.py +0 -0
  134. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_playground_local_repo_file_list.py +0 -0
  135. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_sanitize_integration.py +0 -0
  136. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_sanitize_transform.py +0 -0
  137. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_selector.py +0 -0
  138. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_serialize.py +0 -0
  139. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_stream.py +0 -0
  140. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_tokenizer.py +0 -0
  141. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_transforms_compiler.py +0 -0
  142. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_transforms_edge_cases.py +0 -0
  143. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_transforms_sanitize_integration.py +0 -0
  144. {justhtml-2.0.0 → justhtml-2.1.0}/tests/test_wikipedia.py +0 -0
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
5
5
  The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
6
6
  and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
7
 
8
+ ## [Unreleased]
9
+
10
+ ## [2.1.0] - 2026-06-06
11
+
12
+ ### Performance
13
+ - Avoid quadratic work for deeply nested HTML during default sanitization by carrying foreign-content context through the transform traversal instead of rescanning each node's ancestors, and by short-circuiting repeated `<p>` scope checks when no `<p>` is open.
14
+ - Speed up default sanitization by constructing selector matchers only for transforms that actually need selector matching.
15
+ - Speed up sanitizer attribute and text cleanup by skipping URL-sink resolution for non-URL attributes and bypassing invisible-Unicode regex scans for ASCII values.
16
+ - Avoid unnecessary default work by skipping selectedcontent finalization when no `<select>` was parsed and by bypassing invisible-Unicode transform helper calls for ASCII node values.
17
+ - Skip the sanitizer rawtext hardening pass when the active policy cannot preserve `<script>` or `<style>` elements.
18
+
8
19
  ## [2.0.0] - 2026-05-24
9
20
 
10
21
  ### Changed
@@ -0,0 +1,185 @@
1
+ Metadata-Version: 2.4
2
+ Name: justhtml
3
+ Version: 2.1.0
4
+ Summary: A pure Python HTML5 parser that just works.
5
+ Project-URL: Homepage, https://github.com/emilstenstrom/justhtml
6
+ Project-URL: Issues, https://github.com/emilstenstrom/justhtml/issues
7
+ Author-email: Emil Stenström <emil@emilstenstrom.se>
8
+ License: MIT License
9
+
10
+ Copyright (c) 2025 Emil Stenström (JustHTML)
11
+ Copyright (c) 2014-2017, The html5ever Project Developers (html5ever inspiration)
12
+ Copyright (c) 2006-2013 James Graham, Sam Sneddon, and
13
+ other contributors (html5lib-tests)
14
+
15
+ Permission is hereby granted, free of charge, to any person obtaining a copy
16
+ of this software and associated documentation files (the "Software"), to deal
17
+ in the Software without restriction, including without limitation the rights
18
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
19
+ copies of the Software, and to permit persons to whom the Software is
20
+ furnished to do so, subject to the following conditions:
21
+
22
+ The above copyright notice and this permission notice shall be included in all
23
+ copies or substantial portions of the Software.
24
+
25
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
26
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
28
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
29
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
30
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31
+ SOFTWARE.
32
+ License-File: LICENSE
33
+ Classifier: License :: OSI Approved :: MIT License
34
+ Classifier: Operating System :: OS Independent
35
+ Classifier: Programming Language :: Python :: 3
36
+ Requires-Python: >=3.10
37
+ Provides-Extra: benchmark
38
+ Requires-Dist: beautifulsoup4; extra == 'benchmark'
39
+ Requires-Dist: html5-parser; extra == 'benchmark'
40
+ Requires-Dist: html5lib; extra == 'benchmark'
41
+ Requires-Dist: lxml; extra == 'benchmark'
42
+ Requires-Dist: markupever; extra == 'benchmark'
43
+ Requires-Dist: psutil; extra == 'benchmark'
44
+ Requires-Dist: selectolax>=0.4.8; extra == 'benchmark'
45
+ Requires-Dist: zstandard; extra == 'benchmark'
46
+ Provides-Extra: dev
47
+ Requires-Dist: build; extra == 'dev'
48
+ Requires-Dist: coverage; extra == 'dev'
49
+ Requires-Dist: mypy>=1.0; (platform_python_implementation != 'PyPy') and extra == 'dev'
50
+ Requires-Dist: pre-commit; extra == 'dev'
51
+ Requires-Dist: ruff==0.14.7; extra == 'dev'
52
+ Requires-Dist: twine; extra == 'dev'
53
+ Description-Content-Type: text/markdown
54
+
55
+ # JustHTML
56
+
57
+ HTML from the real web is messy. It is often malformed, user supplied, scraped from unknown pages, or headed for a browser where small parsing differences can become security bugs.
58
+
59
+ JustHTML gives Python projects one small dependency for the common HTML jobs:
60
+
61
+ - parse HTML like a browser, including broken markup
62
+ - sanitize untrusted HTML by default
63
+ - query with CSS selectors
64
+ - transform, serialize, extract text, or convert to Markdown
65
+ - run anywhere Python runs, with no C extension and no system package to install
66
+
67
+ ```bash
68
+ pip install justhtml
69
+ ```
70
+
71
+ Requires Python 3.10 or later.
72
+
73
+ [Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
74
+
75
+ ## Why Use It?
76
+
77
+ Most Python HTML libraries optimize for one part of the problem.
78
+
79
+ `html.parser` is built in, but not HTML5-correct. BeautifulSoup is convenient, but depends heavily on the parser underneath. `lxml` and C/Rust-backed parsers are fast, but usually leave sanitization as a separate concern. `html5lib` and Bleach shaped the Python ecosystem, but neither is still the obvious foundation for new projects.
80
+
81
+ JustHTML is for applications that want a boring, inspectable, pure-Python default:
82
+
83
+ - **Correct parsing:** browser-style HTML5 recovery, tested against the official html5lib fixtures.
84
+ - **Safe by default:** `JustHTML(html)` sanitizes before you query or serialize.
85
+ - **One DOM:** parse once, then sanitize, query, transform, serialize, extract text, or produce Markdown.
86
+ - **Easy deployment:** zero runtime dependencies, no compiler, works on PyPy and Pyodide.
87
+ - **Honest tradeoff:** if you are parsing terabytes of trusted HTML, use a C/Rust parser. If you need reliable handling of untrusted or malformed HTML inside a Python app, use JustHTML.
88
+
89
+ ## Quick Start
90
+
91
+ ```python
92
+ from justhtml import JustHTML
93
+
94
+ doc = JustHTML(
95
+ "<p>Hello<script>alert(1)</script> "
96
+ "<a href='javascript:alert(1)'>bad</a> "
97
+ "<a href='https://example.com'>ok</a></p>",
98
+ fragment=True,
99
+ )
100
+
101
+ print(doc.to_html(pretty=False))
102
+ # => <p>Hello <a>bad</a> <a href="https://example.com">ok</a></p>
103
+ ```
104
+
105
+ Sanitization is enabled by default. Disable it only for trusted input:
106
+
107
+ ```python
108
+ doc = JustHTML("<main><p class='intro'>Hello</p></main>", sanitize=False)
109
+ intro = doc.query_one("p.intro")
110
+
111
+ print(intro.to_text())
112
+ # => Hello
113
+ ```
114
+
115
+ ## What You Can Do
116
+
117
+ ```python
118
+ from justhtml import JustHTML, Linkify, SetAttrs, Unwrap
119
+
120
+ doc = JustHTML(
121
+ "<p>Hello <span>world</span> example.com</p>",
122
+ fragment=True,
123
+ sanitize=False,
124
+ transforms=[
125
+ Unwrap("span"),
126
+ Linkify(),
127
+ SetAttrs("a", rel="nofollow"),
128
+ ],
129
+ )
130
+
131
+ print(doc.to_html(pretty=False))
132
+ # => <p>Hello world <a href="http://example.com" rel="nofollow">example.com</a></p>
133
+ ```
134
+
135
+ JustHTML includes:
136
+
137
+ - [CSS selectors](docs/selectors.md): `query()` and `query_one()`
138
+ - [Sanitization](docs/sanitization.md): allowlisted HTML cleaning, URL policies, inline CSS controls
139
+ - [Transforms](docs/transforms.md): unwrap, drop, edit attributes, linkify, compose cleanup pipelines
140
+ - [Text output](docs/text.md): `to_text()` and Markdown generation
141
+ - [Builder API](docs/building.md): construct nodes directly from Python
142
+ - [Streaming](docs/streaming.md): process large inputs incrementally
143
+ - [Bleach migration guide](docs/bleach-migration.md): move existing sanitizer code to JustHTML policies
144
+
145
+ ## Command Line
146
+
147
+ ```bash
148
+ # Pretty-print an HTML file
149
+ justhtml index.html
150
+
151
+ # Parse from stdin
152
+ curl -s https://example.com | justhtml -
153
+
154
+ # Extract text from selected nodes
155
+ justhtml index.html --selector "main p" --format text
156
+
157
+ # Convert selected HTML to Markdown
158
+ justhtml index.html --selector "article" --format markdown
159
+ ```
160
+
161
+ ## Correctness
162
+
163
+ JustHTML is tested against the official html5lib tokenizer, tree-construction, serializer, and encoding fixtures, plus project-specific sanitizer, selector, transform, CLI, and regression tests.
164
+
165
+ The current test summary is 10,257 passing tests with 100% line and branch coverage. See [Correctness Testing](docs/correctness.md) for details.
166
+
167
+ ## Documentation
168
+
169
+ - [Quickstart](docs/quickstart.md)
170
+ - [Comparison](docs/comparison.md)
171
+ - [API Reference](docs/api.md)
172
+ - [Sanitization & Security](docs/sanitization.md)
173
+ - [Migrating from Bleach](docs/bleach-migration.md)
174
+ - [Command Line](docs/cli.md)
175
+ - [Full documentation site](https://emilstenstrom.github.io/justhtml/)
176
+
177
+ ## Security
178
+
179
+ JustHTML sanitizes by default, but output safety still depends on where you put it. HTML body output is not automatically safe inside JavaScript, CSS, URL attributes, or other contexts.
180
+
181
+ For the supported-version policy and vulnerability reporting, see [SECURITY.md](SECURITY.md).
182
+
183
+ ## License
184
+
185
+ MIT. Free to use for commercial and non-commercial projects.
@@ -0,0 +1,131 @@
1
+ # JustHTML
2
+
3
+ HTML from the real web is messy. It is often malformed, user supplied, scraped from unknown pages, or headed for a browser where small parsing differences can become security bugs.
4
+
5
+ JustHTML gives Python projects one small dependency for the common HTML jobs:
6
+
7
+ - parse HTML like a browser, including broken markup
8
+ - sanitize untrusted HTML by default
9
+ - query with CSS selectors
10
+ - transform, serialize, extract text, or convert to Markdown
11
+ - run anywhere Python runs, with no C extension and no system package to install
12
+
13
+ ```bash
14
+ pip install justhtml
15
+ ```
16
+
17
+ Requires Python 3.10 or later.
18
+
19
+ [Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
20
+
21
+ ## Why Use It?
22
+
23
+ Most Python HTML libraries optimize for one part of the problem.
24
+
25
+ `html.parser` is built in, but not HTML5-correct. BeautifulSoup is convenient, but depends heavily on the parser underneath. `lxml` and C/Rust-backed parsers are fast, but usually leave sanitization as a separate concern. `html5lib` and Bleach shaped the Python ecosystem, but neither is still the obvious foundation for new projects.
26
+
27
+ JustHTML is for applications that want a boring, inspectable, pure-Python default:
28
+
29
+ - **Correct parsing:** browser-style HTML5 recovery, tested against the official html5lib fixtures.
30
+ - **Safe by default:** `JustHTML(html)` sanitizes before you query or serialize.
31
+ - **One DOM:** parse once, then sanitize, query, transform, serialize, extract text, or produce Markdown.
32
+ - **Easy deployment:** zero runtime dependencies, no compiler, works on PyPy and Pyodide.
33
+ - **Honest tradeoff:** if you are parsing terabytes of trusted HTML, use a C/Rust parser. If you need reliable handling of untrusted or malformed HTML inside a Python app, use JustHTML.
34
+
35
+ ## Quick Start
36
+
37
+ ```python
38
+ from justhtml import JustHTML
39
+
40
+ doc = JustHTML(
41
+ "<p>Hello<script>alert(1)</script> "
42
+ "<a href='javascript:alert(1)'>bad</a> "
43
+ "<a href='https://example.com'>ok</a></p>",
44
+ fragment=True,
45
+ )
46
+
47
+ print(doc.to_html(pretty=False))
48
+ # => <p>Hello <a>bad</a> <a href="https://example.com">ok</a></p>
49
+ ```
50
+
51
+ Sanitization is enabled by default. Disable it only for trusted input:
52
+
53
+ ```python
54
+ doc = JustHTML("<main><p class='intro'>Hello</p></main>", sanitize=False)
55
+ intro = doc.query_one("p.intro")
56
+
57
+ print(intro.to_text())
58
+ # => Hello
59
+ ```
60
+
61
+ ## What You Can Do
62
+
63
+ ```python
64
+ from justhtml import JustHTML, Linkify, SetAttrs, Unwrap
65
+
66
+ doc = JustHTML(
67
+ "<p>Hello <span>world</span> example.com</p>",
68
+ fragment=True,
69
+ sanitize=False,
70
+ transforms=[
71
+ Unwrap("span"),
72
+ Linkify(),
73
+ SetAttrs("a", rel="nofollow"),
74
+ ],
75
+ )
76
+
77
+ print(doc.to_html(pretty=False))
78
+ # => <p>Hello world <a href="http://example.com" rel="nofollow">example.com</a></p>
79
+ ```
80
+
81
+ JustHTML includes:
82
+
83
+ - [CSS selectors](docs/selectors.md): `query()` and `query_one()`
84
+ - [Sanitization](docs/sanitization.md): allowlisted HTML cleaning, URL policies, inline CSS controls
85
+ - [Transforms](docs/transforms.md): unwrap, drop, edit attributes, linkify, compose cleanup pipelines
86
+ - [Text output](docs/text.md): `to_text()` and Markdown generation
87
+ - [Builder API](docs/building.md): construct nodes directly from Python
88
+ - [Streaming](docs/streaming.md): process large inputs incrementally
89
+ - [Bleach migration guide](docs/bleach-migration.md): move existing sanitizer code to JustHTML policies
90
+
91
+ ## Command Line
92
+
93
+ ```bash
94
+ # Pretty-print an HTML file
95
+ justhtml index.html
96
+
97
+ # Parse from stdin
98
+ curl -s https://example.com | justhtml -
99
+
100
+ # Extract text from selected nodes
101
+ justhtml index.html --selector "main p" --format text
102
+
103
+ # Convert selected HTML to Markdown
104
+ justhtml index.html --selector "article" --format markdown
105
+ ```
106
+
107
+ ## Correctness
108
+
109
+ JustHTML is tested against the official html5lib tokenizer, tree-construction, serializer, and encoding fixtures, plus project-specific sanitizer, selector, transform, CLI, and regression tests.
110
+
111
+ The current test summary is 10,257 passing tests with 100% line and branch coverage. See [Correctness Testing](docs/correctness.md) for details.
112
+
113
+ ## Documentation
114
+
115
+ - [Quickstart](docs/quickstart.md)
116
+ - [Comparison](docs/comparison.md)
117
+ - [API Reference](docs/api.md)
118
+ - [Sanitization & Security](docs/sanitization.md)
119
+ - [Migrating from Bleach](docs/bleach-migration.md)
120
+ - [Command Line](docs/cli.md)
121
+ - [Full documentation site](https://emilstenstrom.github.io/justhtml/)
122
+
123
+ ## Security
124
+
125
+ JustHTML sanitizes by default, but output safety still depends on where you put it. HTML body output is not automatically safe inside JavaScript, CSS, URL attributes, or other contexts.
126
+
127
+ For the supported-version policy and vulnerability reporting, see [SECURITY.md](SECURITY.md).
128
+
129
+ ## License
130
+
131
+ MIT. Free to use for commercial and non-commercial projects.
@@ -15,7 +15,8 @@
15
15
 
16
16
  | Version | Supported |
17
17
  | ------- | ---------------------------------------- |
18
- | 1.x | :white_check_mark: (until 2.0 is released) |
18
+ | 2.x | :white_check_mark: (until 3.0 is released) |
19
+ | 1.x | :x: |
19
20
  | < 1.0 | :x: |
20
21
 
21
22
  ## Security Domains
@@ -18,8 +18,8 @@ from enum import Enum
18
18
  from pathlib import Path
19
19
 
20
20
  from justhtml import JustHTML
21
- from justhtml.context import FragmentContext
22
- from justhtml.serialize import to_test_format
21
+ from justhtml.parser.context import FragmentContext
22
+ from justhtml.serializer import to_test_format
23
23
 
24
24
  # Available parsers
25
25
  PARSERS = ["justhtml", "html5lib", "html5_parser", "lxml", "bs4", "html.parser", "selectolax", "markupever"]
@@ -0,0 +1,82 @@
1
+ [← Back to docs](index.md)
2
+
3
+ # Comparison
4
+
5
+ Use JustHTML when you want browser-grade HTML parsing, safe-by-default sanitization, CSS selectors, transforms, text extraction, and serialization in one pure-Python package.
6
+
7
+ Use a different tool when one narrow requirement matters more than the whole pipeline: maximum throughput, a BeautifulSoup-specific API, XPath-heavy XML work, or integration with an existing lxml tree.
8
+
9
+ ## At a Glance
10
+
11
+ | Tool | HTML5 parsing [1][2] | Speed | Query | Build | Sanitize | Notes |
12
+ |------|------------------------------------------|-------|----------|-------|------------------|-------|
13
+ | **JustHTML**<br>Pure Python | ✅&nbsp;100% | ⚡ Fast | ✅ CSS selectors | ✅ `element()` | ✅ Built-in | Correct, secure, easy to install, and fast enough. |
14
+ | **`selectolax`**<br>Python wrapper of C-based Lexbor | ✅&nbsp;100% | 🚀 Very Fast | ✅ CSS selectors | ✅ `create_node()` | ❌ Needs sanitization | Very fast and spec-compliant. |
15
+ | **Chromium**<br>browser engine | ✅&nbsp;99.5% | 🚀&nbsp;Very&nbsp;Fast | — | — | — | — |
16
+ | **WebKit**<br>browser engine | ✅ 98.4% | 🚀 Very Fast | — | — | — | — |
17
+ | **Firefox**<br>browser engine | ✅ 97.6% | 🚀 Very Fast | — | — | — | — |
18
+ | **`markupever`**<br>Python wrapper of Rust-based html5ever | 🟡 89% | 🚀 Very Fast | ✅ CSS selectors | ✅ `TreeDom .create_*()` | ❌ Needs sanitization | Fast and mostly correct, but missing benchmarked capabilities count against compliance. |
19
+ | **`html5lib`**<br>Pure Python | 🟡 86% | 🐢 Slow | 🟡 XPath (lxml) | 🟡 Tree API | 🔴 [Deprecated](https://github.com/html5lib/html5lib-python/issues/443) | Unmaintained reference implementation; incomplete coverage of the tree-construction fixtures. |
20
+ | **`html5_parser`**<br>Python wrapper of C-based Gumbo | 🔴 49% | 🚀 Very Fast | 🟡 XPath (lxml) | 🟡 `etree` (lxml) | ❌ Needs sanitization | Fast, but its public tree API loses information needed by many fixtures. |
21
+ | **`BeautifulSoup`**<br>Pure Python | 🔴 <1% (default) | 🐢 Slow | 🟡 Custom API | ✅ `new_tag()` API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
22
+ | **`html.parser`**<br>Python stdlib | 🔴 <1% | ⚡ Fast | ❌ None | ❌ None | ❌ Needs sanitization | Standard library. Chokes on malformed HTML. |
23
+ | **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 <1% | 🚀 Very Fast | 🟡 XPath | ✅ `etree` / E-factory | ❌ Needs sanitization | Fast but not HTML5 compliant. Context-fragment cases are skipped; supported cases still perform poorly. Don't use the old lxml.html.clean module! |
24
+
25
+ [1]: Parser compliance scores are from a strict run of the [html5lib-tests](https://github.com/html5lib/html5lib-tests) tree-construction fixtures (1,743 non-script tests). The score is `pass / (pass + fail + error)`; unsupported public API capabilities count as failures rather than being faked. The benchmark may compose multiple public APIs from the same parser, but does not use testcase-specific shims or synthetic adapters when an API surface is missing. See [Correctness Testing](correctness.md) for details.
26
+
27
+ [2]: Browser numbers are from a local rerun of [`justhtml-html5lib-tests-bench`](https://github.com/EmilStenstrom/justhtml-html5lib-tests-bench) against this repo's `tests/html5lib-tests-tree/*.dat` corpus: Chromium 1762/1770, WebKit 1742/1770, Firefox 1728/1770, with 12 skipped scripting-enabled cases per engine.
28
+
29
+ ## Why JustHTML
30
+
31
+ Most Python HTML projects start simple and then accumulate extra tools:
32
+
33
+ - a parser for broken HTML
34
+ - a sanitizer for user input
35
+ - a selector engine
36
+ - a serializer
37
+ - linkification or cleanup filters
38
+ - text or Markdown extraction
39
+
40
+ JustHTML keeps those operations on one DOM. That makes the behavior easier to reason about, especially when the input is untrusted.
41
+
42
+ ```python
43
+ from justhtml import JustHTML
44
+
45
+ doc = JustHTML("<p>Hello<script>alert(1)</script><a href='javascript:x'>link</a></p>", fragment=True)
46
+
47
+ print(doc.to_html(pretty=False))
48
+ # <p>Hello<a>link</a></p>
49
+ ```
50
+
51
+ Sanitization happens before you query or serialize unless you explicitly disable it with `sanitize=False`.
52
+
53
+ ## When to Choose Another Tool
54
+
55
+ Choose **selectolax** when raw speed is the main requirement and the HTML is trusted or sanitized elsewhere.
56
+
57
+ Choose **markupever** or **html5_parser** when you specifically want their underlying parser engines or tree APIs and can accept their compatibility tradeoffs.
58
+
59
+ Choose **BeautifulSoup** when you want its forgiving, familiar scraping API and parser correctness is not the main risk.
60
+
61
+ Choose **lxml** when your project is already built around XPath, etree, or XML-style processing.
62
+
63
+ Choose **nh3** when you only need fast sanitization and are happy with a Rust-backed dependency.
64
+
65
+ Choose **html.parser** when you need a tiny stdlib-only script for trusted input and HTML5 correctness does not matter.
66
+
67
+ Choose **Bleach** only for existing codebases that already depend on it. For new projects, prefer an actively maintained sanitizer path. See [Migrating from Bleach](bleach-migration.md).
68
+
69
+ ## Tradeoffs
70
+
71
+ JustHTML is pure Python. That makes it easy to install, inspect, debug, and run in environments like Pyodide, but it will not beat C or Rust parsers on raw throughput.
72
+
73
+ JustHTML sanitizes HTML output by default. That is the right default for user-generated content, CMS snippets, comments, scraped fragments, and transform pipelines that eventually return to a browser. If all of your input is trusted, pass `sanitize=False`.
74
+
75
+ JustHTML's sanitizer emits HTML-only output. SVG and MathML can still be parsed when sanitization is disabled, but sanitized output drops foreign-namespace content to keep the security model smaller and more reviewable.
76
+
77
+ ## Related Pages
78
+
79
+ - [Correctness Testing](correctness.md)
80
+ - [Sanitization & Security](sanitization.md)
81
+ - [Migrating from Bleach](bleach-migration.md)
82
+ - [Performance Benchmark](../benchmarks/performance.py)
@@ -21,6 +21,7 @@ A pure Python HTML5 parser that just works.
21
21
  ## Contents
22
22
 
23
23
  - **[Quickstart](quickstart.md)** - Get up and running in 2 minutes
24
+ - **[Comparison](comparison.md)** - How JustHTML compares with other Python HTML tools
24
25
  - **[Learn by examples](migration-examples.md)** - Real-world StackOverflow tasks rewritten with JustHTML
25
26
  - **[API Reference](api.md)** - Complete public API documentation
26
27
  - **[Command Line](cli.md)** - Use `justhtml` to extract HTML, text, or Markdown
@@ -1,7 +1,7 @@
1
1
  [project]
2
2
  name = "justhtml"
3
3
  authors = [{ name = "Emil Stenström", email = "emil@emilstenstrom.se" }]
4
- version = "2.0.0"
4
+ version = "2.1.0"
5
5
  description = "A pure Python HTML5 parser that just works."
6
6
  readme = "README.md"
7
7
  license = { file = "LICENSE" }
@@ -2,18 +2,18 @@
2
2
  """Interactive release helper for JustHTML.
3
3
 
4
4
  What it does (in order):
5
- 1) Bumps version in pyproject.toml ([project].version)
6
- 2) Commits the change
7
- 3) Creates an annotated git tag
8
- 4) Pushes commit + tag
9
- 5) Creates a GitHub release (marked as latest) via `gh`
10
-
11
- This script is intentionally minimal and uses only `git` and the GitHub CLI (`gh`).
5
+ 1) Runs the repo-wide pre-commit checks that CI runs for releases
6
+ 2) Bumps version in pyproject.toml ([project].version)
7
+ 3) Commits the change
8
+ 4) Creates an annotated git tag
9
+ 5) Pushes commit + tag
10
+ 6) Creates a GitHub release (marked as latest) via `gh`
12
11
  """
13
12
 
14
13
  from __future__ import annotations
15
14
 
16
15
  import argparse
16
+ import os
17
17
  import re
18
18
  import shlex
19
19
  import subprocess
@@ -55,6 +55,29 @@ def _run(cmd: list[str], *, check: bool = True) -> CmdResult:
55
55
  return out
56
56
 
57
57
 
58
+ def _run_with_env(cmd: list[str], *, env: dict[str, str], check: bool = True) -> CmdResult:
59
+ merged_env = os.environ.copy()
60
+ merged_env.update(env)
61
+ p = subprocess.run( # noqa: S603
62
+ cmd,
63
+ check=False,
64
+ text=True,
65
+ stdout=subprocess.PIPE,
66
+ stderr=subprocess.STDOUT,
67
+ env=merged_env,
68
+ )
69
+ out = CmdResult(stdout=p.stdout, returncode=p.returncode)
70
+ if check and p.returncode != 0:
71
+ raise RuntimeError(
72
+ "Command failed (exit {code}): {cmd}\n{out}".format(
73
+ code=p.returncode,
74
+ cmd=_quote_cmd(cmd),
75
+ out=(p.stdout or "").rstrip(),
76
+ )
77
+ )
78
+ return out
79
+
80
+
58
81
  def _run_quiet_ok(cmd: list[str]) -> bool:
59
82
  p = subprocess.run( # noqa: S603
60
83
  cmd,
@@ -248,6 +271,13 @@ def _default_repo_from_remote(remote: str) -> str:
248
271
  return f"{m.group('owner')}/{m.group('repo')}"
249
272
 
250
273
 
274
+ def _run_release_checks() -> None:
275
+ print("Running release checks: SKIP=mypy pre-commit run --all-files")
276
+ out = _run_with_env(["pre-commit", "run", "--all-files"], env={"SKIP": "mypy"}).stdout
277
+ if out.strip():
278
+ print(out.rstrip())
279
+
280
+
251
281
  def main(argv: list[str] | None = None) -> int:
252
282
  parser = argparse.ArgumentParser(description="Bump version, tag, and create a GitHub release.")
253
283
  parser.add_argument("--version", help="New version, e.g. 0.21.0 (will be tagged as v0.21.0 unless --tag is set)")
@@ -294,12 +324,20 @@ def main(argv: list[str] | None = None) -> int:
294
324
  action="store_true",
295
325
  help="Do not prompt for confirmation before push/release.",
296
326
  )
327
+ parser.add_argument(
328
+ "--skip-checks",
329
+ action="store_true",
330
+ help="Skip repo-wide pre-commit validation. Use only when you have already validated the exact release commit.",
331
+ )
297
332
 
298
333
  args = parser.parse_args(argv)
299
334
 
300
335
  try:
301
336
  _require_clean_git()
302
337
 
338
+ if not args.skip_checks:
339
+ _run_release_checks()
340
+
303
341
  py_text = PYPROJECT_PATH.read_text(encoding="utf-8")
304
342
  current_version = _read_current_version(py_text)
305
343
 
@@ -18,6 +18,7 @@ _url_policy_signature = _url_policy._url_policy_signature
18
18
  _url_rule_signature = _url_policy._url_rule_signature
19
19
  _URL_BEARING_PARAM_NAMES = _url_spec._URL_BEARING_PARAM_NAMES
20
20
  _URL_LIKE_ATTRS = _url_spec._URL_LIKE_ATTRS
21
+ _URL_SINK_ATTRS = _url_spec._URL_SINK_ATTRS
21
22
  _url_sink_kind_for_attr = _url_spec._url_sink_kind_for_attr
22
23
  _effective_allow_relative = _url_runtime._effective_allow_relative
23
24
  _effective_proxy = _url_runtime._effective_proxy
@@ -51,6 +51,8 @@ def _normalize_url_for_checking(value: str) -> str:
51
51
 
52
52
 
53
53
  def _strip_invisible_unicode(value: str) -> str:
54
+ if value.isascii():
55
+ return value
54
56
  if not _INVISIBLE_UNICODE_STRIP_REGEX.search(value):
55
57
  return value
56
58
  return _INVISIBLE_UNICODE_STRIP_REGEX.sub("", value)
@@ -84,6 +84,7 @@ _URL_SINKS: tuple[UrlSink, ...] = (
84
84
  _URL_SINKS_BY_ATTR: Mapping[str, tuple[UrlSink, ...]] = {
85
85
  attr: tuple(sink for sink in _URL_SINKS if sink.attr == attr) for attr in {sink.attr for sink in _URL_SINKS}
86
86
  }
87
+ _URL_SINK_ATTRS: frozenset[str] = frozenset(_URL_SINKS_BY_ATTR)
87
88
 
88
89
 
89
90
  def _url_sink_kind_for_attr(*, tag: str, attr: str, attrs: Mapping[str, str | None]) -> UrlSinkKind | None:
@@ -368,6 +368,13 @@ class _CompiledDecideElementsChain:
368
368
  self.callbacks = callbacks
369
369
 
370
370
 
371
+ @dataclass(frozen=True, slots=True)
372
+ class _CompiledDropForeignNamespacesTransform:
373
+ kind: Literal["drop_foreign_namespaces"]
374
+ callback: NodeCallback | None
375
+ report: ReportCallback | None
376
+
377
+
371
378
  @dataclass(frozen=True, slots=True)
372
379
  class _CompiledDropCommentsTransform:
373
380
  kind: Literal["drop_comments"]
@@ -406,11 +413,20 @@ class _CompiledHardenRawtextTransform:
406
413
  policy: SanitizationPolicy
407
414
 
408
415
 
416
+ @dataclass(frozen=True, slots=True)
417
+ class _CompiledSelectorLimitsTransform:
418
+ """No-op boundary that carries sanitizer selector limits when no terminal sanitizer pass is needed."""
419
+
420
+ kind: Literal["selector_limits"]
421
+ selector_limits: SelectorLimits
422
+
423
+
409
424
  CompiledTransform = (
410
425
  _CompiledSelectorTransform
411
426
  | _CompiledDecideTransform
412
427
  | _CompiledDecideChain
413
428
  | _CompiledDecideElementsChain
429
+ | _CompiledDropForeignNamespacesTransform
414
430
  | _CompiledEditAttrsTransform
415
431
  | _CompiledEditAttrsChain
416
432
  | _CompiledStripInvisibleUnicodeTransform
@@ -424,6 +440,7 @@ CompiledTransform = (
424
440
  | _CompiledStageHookTransform
425
441
  | _CompiledStageBoundary
426
442
  | _CompiledHardenRawtextTransform
443
+ | _CompiledSelectorLimitsTransform
427
444
  )
428
445
 
429
446
 
@@ -431,6 +448,8 @@ def _selector_limits_from_compiled(
431
448
  compiled: list[CompiledTransform] | tuple[CompiledTransform, ...],
432
449
  ) -> SelectorLimits:
433
450
  for t in reversed(compiled):
451
+ if isinstance(t, _CompiledSelectorLimitsTransform):
452
+ return t.selector_limits
434
453
  if isinstance(t, _CompiledHardenRawtextTransform):
435
454
  return t.policy.selector_limits
436
455