justhtml 2.0.0__tar.gz → 2.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {justhtml-2.0.0 → justhtml-2.2.0}/CHANGELOG.md +29 -0
- justhtml-2.2.0/PKG-INFO +187 -0
- justhtml-2.2.0/README.md +133 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/SECURITY.md +2 -1
- justhtml-2.2.0/assets/justhtml-readme-explainer.png +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/benchmarks/correctness.py +2 -2
- justhtml-2.2.0/docs/comparison.md +82 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/index.md +1 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/pyproject.toml +1 -1
- {justhtml-2.0.0 → justhtml-2.2.0}/scripts/release.py +45 -7
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/parser/__init__.py +9 -6
- justhtml-2.2.0/src/justhtml/parser/stream.py +206 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/__init__.py +1 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/runtime.py +4 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/spec.py +1 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/serializer/html.py +3 -1
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/tokenizer/html.py +48 -4
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/__init__.py +19 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/compile.py +33 -50
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/runtime.py +73 -13
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/treebuilder/core.py +128 -57
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/treebuilder/modes.py +109 -58
- {justhtml-2.0.0 → justhtml-2.2.0}/test-summary.txt +4 -4
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/branch_coverage.dat +3 -3
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_precommit_coverage.py +11 -4
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_sanitize.py +97 -0
- justhtml-2.2.0/tests/test_stream.py +273 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_transforms.py +53 -0
- justhtml-2.2.0/tests/test_treebuilder.py +640 -0
- justhtml-2.0.0/PKG-INFO +0 -271
- justhtml-2.0.0/README.md +0 -217
- justhtml-2.0.0/src/justhtml/parser/stream.py +0 -110
- justhtml-2.0.0/tests/test_stream.py +0 -65
- justhtml-2.0.0/tests/test_treebuilder.py +0 -241
- {justhtml-2.0.0 → justhtml-2.2.0}/.github/copilot-instructions.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/.github/workflows/ci.yml +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/.github/workflows/publish.yml +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/.gitignore +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/.pre-commit-config.yaml +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/CODE_OF_CONDUCT.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/CONTRIBUTING.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/LICENSE +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/Makefile +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/benchmarks/fuzz.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/benchmarks/performance.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/benchmarks/profile.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/_config.yml +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/_layouts/default.html +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/api.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/assets/search.js +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/bleach-migration.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/building.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/cli.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/correctness.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/encoding.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/errors.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/fragments.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/html-cleaning.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/linkify.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/migration-examples.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/app.js +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/index.html +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/py/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/py/install_latest_justhtml.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/py/render.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/playground/py/use_local_repo.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/quickstart.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/sanitization.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/selectors.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/streaming.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/text.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/transforms.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/unsafe-handling.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/docs/url-cleaning.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/llms.txt +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/run_tests.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/__main__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/constants.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/entities.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/errors.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/core/rawtext.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/dom/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/dom/builder.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/parser/context.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/parser/encoding.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/py.typed +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/css.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/dom.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/policy.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/policy_defaults.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/rawtext.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/sanitizer/url/policy.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/selector/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/selector/core.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/serializer/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/serializer/markdown.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/tokenizer/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/tokenizer/tokens.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/linkify.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/linkify_core.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/transforms/spec.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/treebuilder/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/src/justhtml/treebuilder/utils.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/README.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/data/wikipedia.html +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/__init__.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/encoding.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/regressions.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/reporter.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/serializer.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/tokenizer.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/harness/tree.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-sanitize-tests/cases.json +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/coverage_gaps.test +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/empty_stack_edge_cases.dat +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/entities.test +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/iframe_srcdoc.dat +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/tokenizer_edge_cases.test +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/treebuilder_coverage.dat +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/xml_coercion.dat +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/justhtml-tests/xml_coercion_coverage.test +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/linkify-it/LICENSE.txt +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/linkify-it/README.md +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/linkify-it/fixtures/links.txt +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/linkify-it/fixtures/not_links.txt +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_builder.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_cli.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_docs_examples.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_encoding.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_errors.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_linkify_internals.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_linkify_it.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_linkify_transform.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_node.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_playground_local_repo_file_list.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_sanitize_integration.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_sanitize_transform.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_selector.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_serialize.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_tokenizer.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_transforms_compiler.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_transforms_edge_cases.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_transforms_sanitize_integration.py +0 -0
- {justhtml-2.0.0 → justhtml-2.2.0}/tests/test_wikipedia.py +0 -0
|
@@ -5,6 +5,35 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [2.2.0] - 2026-06-07
|
|
11
|
+
|
|
12
|
+
### Fixed
|
|
13
|
+
- Handle `<select><selectedcontent></selectedcontent></select>` without crashing when no `<option>` is present, replace selectedcontent fallback content during parser finalization, and avoid repeated selectedcontent subtree scans.
|
|
14
|
+
- Preserve source order and tag text when escape-mode sanitization handles disallowed rawtext/RCDATA elements with attributed or self-closing end tags.
|
|
15
|
+
- Make `stream()` use namespace-aware tokenizer context for SVG/MathML CDATA, rawtext decisions, self-closing foreign tags, and foreign end-tag stack updates.
|
|
16
|
+
- Use the correct initial tokenizer states for HTML fragment contexts such as `<title>`, `<textarea>`, `<script>`, `<style>`, and scripting-disabled `<noscript>`.
|
|
17
|
+
- Use HTML rawtext/RCDATA tokenizer states for text-like elements inside SVG/MathML HTML integration points and MathML text integration points.
|
|
18
|
+
- Generate implied end tags before removing `<form>` on `</form>` so following controls do not remain inside still-open descendants.
|
|
19
|
+
- Keep `<form>` elements inside `<template>` from claiming the global form pointer, including table-template form insertion.
|
|
20
|
+
- Close open `<p>` elements correctly around `<option>`, `<optgroup>`, `<hr>`, `<p>`, and `<div>` starts in `<select>` parsing.
|
|
21
|
+
- Close `<template>` correctly when `</template>` is seen while parsing inside `<select>`.
|
|
22
|
+
- Keep `</p>` and `</br>` foreign-content breakouts inside MathML text integration points such as `<mi>` and `<mtext>`.
|
|
23
|
+
- Align customizable `<select>` parsing with Chromium for phantom `</p>` handling and generic custom child elements.
|
|
24
|
+
|
|
25
|
+
### Security
|
|
26
|
+
- (Severity: Low) Strip invisible Unicode during URL sink validation even when general invisible-Unicode stripping is disabled. Previously, custom policies using `strip_invisible_unicode=False` could preserve scheme-obfuscated values such as `javascript\u200b:` in otherwise URL-validated attributes.
|
|
27
|
+
|
|
28
|
+
## [2.1.0] - 2026-06-06
|
|
29
|
+
|
|
30
|
+
### Performance
|
|
31
|
+
- Avoid quadratic work for deeply nested HTML during default sanitization by carrying foreign-content context through the transform traversal instead of rescanning each node's ancestors, and by short-circuiting repeated `<p>` scope checks when no `<p>` is open.
|
|
32
|
+
- Speed up default sanitization by constructing selector matchers only for transforms that actually need selector matching.
|
|
33
|
+
- Speed up sanitizer attribute and text cleanup by skipping URL-sink resolution for non-URL attributes and bypassing invisible-Unicode regex scans for ASCII values.
|
|
34
|
+
- Avoid unnecessary default work by skipping selectedcontent finalization when no `<select>` was parsed and by bypassing invisible-Unicode transform helper calls for ASCII node values.
|
|
35
|
+
- Skip the sanitizer rawtext hardening pass when the active policy cannot preserve `<script>` or `<style>` elements.
|
|
36
|
+
|
|
8
37
|
## [2.0.0] - 2026-05-24
|
|
9
38
|
|
|
10
39
|
### Changed
|
justhtml-2.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: justhtml
|
|
3
|
+
Version: 2.2.0
|
|
4
|
+
Summary: A pure Python HTML5 parser that just works.
|
|
5
|
+
Project-URL: Homepage, https://github.com/emilstenstrom/justhtml
|
|
6
|
+
Project-URL: Issues, https://github.com/emilstenstrom/justhtml/issues
|
|
7
|
+
Author-email: Emil Stenström <emil@emilstenstrom.se>
|
|
8
|
+
License: MIT License
|
|
9
|
+
|
|
10
|
+
Copyright (c) 2025 Emil Stenström (JustHTML)
|
|
11
|
+
Copyright (c) 2014-2017, The html5ever Project Developers (html5ever inspiration)
|
|
12
|
+
Copyright (c) 2006-2013 James Graham, Sam Sneddon, and
|
|
13
|
+
other contributors (html5lib-tests)
|
|
14
|
+
|
|
15
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
16
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
17
|
+
in the Software without restriction, including without limitation the rights
|
|
18
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
19
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
20
|
+
furnished to do so, subject to the following conditions:
|
|
21
|
+
|
|
22
|
+
The above copyright notice and this permission notice shall be included in all
|
|
23
|
+
copies or substantial portions of the Software.
|
|
24
|
+
|
|
25
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
26
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
27
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
28
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
29
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
30
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
31
|
+
SOFTWARE.
|
|
32
|
+
License-File: LICENSE
|
|
33
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
34
|
+
Classifier: Operating System :: OS Independent
|
|
35
|
+
Classifier: Programming Language :: Python :: 3
|
|
36
|
+
Requires-Python: >=3.10
|
|
37
|
+
Provides-Extra: benchmark
|
|
38
|
+
Requires-Dist: beautifulsoup4; extra == 'benchmark'
|
|
39
|
+
Requires-Dist: html5-parser; extra == 'benchmark'
|
|
40
|
+
Requires-Dist: html5lib; extra == 'benchmark'
|
|
41
|
+
Requires-Dist: lxml; extra == 'benchmark'
|
|
42
|
+
Requires-Dist: markupever; extra == 'benchmark'
|
|
43
|
+
Requires-Dist: psutil; extra == 'benchmark'
|
|
44
|
+
Requires-Dist: selectolax>=0.4.8; extra == 'benchmark'
|
|
45
|
+
Requires-Dist: zstandard; extra == 'benchmark'
|
|
46
|
+
Provides-Extra: dev
|
|
47
|
+
Requires-Dist: build; extra == 'dev'
|
|
48
|
+
Requires-Dist: coverage; extra == 'dev'
|
|
49
|
+
Requires-Dist: mypy>=1.0; (platform_python_implementation != 'PyPy') and extra == 'dev'
|
|
50
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
51
|
+
Requires-Dist: ruff==0.14.7; extra == 'dev'
|
|
52
|
+
Requires-Dist: twine; extra == 'dev'
|
|
53
|
+
Description-Content-Type: text/markdown
|
|
54
|
+
|
|
55
|
+
# JustHTML
|
|
56
|
+
|
|
57
|
+
HTML from the real web is messy. It is often malformed, user supplied, scraped from unknown pages, or headed for a browser where small parsing differences can become security bugs.
|
|
58
|
+
|
|
59
|
+
JustHTML gives Python projects one small dependency for the common HTML jobs:
|
|
60
|
+
|
|
61
|
+
- parse HTML like a browser, including broken markup
|
|
62
|
+
- sanitize untrusted HTML by default
|
|
63
|
+
- query with CSS selectors
|
|
64
|
+
- transform, serialize, extract text, or convert to Markdown
|
|
65
|
+
- run anywhere Python runs, with no C extension and no system package to install
|
|
66
|
+
|
|
67
|
+
```bash
|
|
68
|
+
pip install justhtml
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
Requires Python 3.10 or later.
|
|
72
|
+
|
|
73
|
+
[Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
|
|
74
|
+
|
|
75
|
+
![JustHTML README explainer](assets/justhtml-readme-explainer.png)
|
|
76
|
+
|
|
77
|
+
## Why Use It?
|
|
78
|
+
|
|
79
|
+
Most Python HTML libraries optimize for one part of the problem.
|
|
80
|
+
|
|
81
|
+
`html.parser` is built in, but not HTML5-correct. BeautifulSoup is convenient, but depends heavily on the parser underneath. `lxml` and C/Rust-backed parsers are fast, but usually leave sanitization as a separate concern. `html5lib` and Bleach shaped the Python ecosystem, but neither is still the obvious foundation for new projects.
|
|
82
|
+
|
|
83
|
+
JustHTML is for applications that want a boring, inspectable, pure-Python default:
|
|
84
|
+
|
|
85
|
+
- **Correct parsing:** browser-style HTML5 recovery, tested against the official html5lib fixtures.
|
|
86
|
+
- **Safe by default:** `JustHTML(html)` sanitizes before you query or serialize.
|
|
87
|
+
- **One DOM:** parse once, then sanitize, query, transform, serialize, extract text, or produce Markdown.
|
|
88
|
+
- **Easy deployment:** zero runtime dependencies, no compiler, works on PyPy and Pyodide.
|
|
89
|
+
- **Honest tradeoff:** if you are parsing terabytes of trusted HTML, use a C/Rust parser. If you need reliable handling of untrusted or malformed HTML inside a Python app, use JustHTML.
|
|
90
|
+
|
|
91
|
+
## Quick Start
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
from justhtml import JustHTML
|
|
95
|
+
|
|
96
|
+
doc = JustHTML(
|
|
97
|
+
"<p>Hello<script>alert(1)</script> "
|
|
98
|
+
"<a href='javascript:alert(1)'>bad</a> "
|
|
99
|
+
"<a href='https://example.com'>ok</a></p>",
|
|
100
|
+
fragment=True,
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
print(doc.to_html(pretty=False))
|
|
104
|
+
# => <p>Hello <a>bad</a> <a href="https://example.com">ok</a></p>
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Sanitization is enabled by default. Disable it only for trusted input:
|
|
108
|
+
|
|
109
|
+
```python
|
|
110
|
+
doc = JustHTML("<main><p class='intro'>Hello</p></main>", sanitize=False)
|
|
111
|
+
intro = doc.query_one("p.intro")
|
|
112
|
+
|
|
113
|
+
print(intro.to_text())
|
|
114
|
+
# => Hello
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## What You Can Do
|
|
118
|
+
|
|
119
|
+
```python
|
|
120
|
+
from justhtml import JustHTML, Linkify, SetAttrs, Unwrap
|
|
121
|
+
|
|
122
|
+
doc = JustHTML(
|
|
123
|
+
"<p>Hello <span>world</span> example.com</p>",
|
|
124
|
+
fragment=True,
|
|
125
|
+
sanitize=False,
|
|
126
|
+
transforms=[
|
|
127
|
+
Unwrap("span"),
|
|
128
|
+
Linkify(),
|
|
129
|
+
SetAttrs("a", rel="nofollow"),
|
|
130
|
+
],
|
|
131
|
+
)
|
|
132
|
+
|
|
133
|
+
print(doc.to_html(pretty=False))
|
|
134
|
+
# => <p>Hello world <a href="http://example.com" rel="nofollow">example.com</a></p>
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
JustHTML includes:
|
|
138
|
+
|
|
139
|
+
- [CSS selectors](docs/selectors.md): `query()` and `query_one()`
|
|
140
|
+
- [Sanitization](docs/sanitization.md): allowlisted HTML cleaning, URL policies, inline CSS controls
|
|
141
|
+
- [Transforms](docs/transforms.md): unwrap, drop, edit attributes, linkify, compose cleanup pipelines
|
|
142
|
+
- [Text output](docs/text.md): `to_text()` and Markdown generation
|
|
143
|
+
- [Builder API](docs/building.md): construct nodes directly from Python
|
|
144
|
+
- [Streaming](docs/streaming.md): process large inputs incrementally
|
|
145
|
+
- [Bleach migration guide](docs/bleach-migration.md): move existing sanitizer code to JustHTML policies
|
|
146
|
+
|
|
147
|
+
## Command Line
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# Pretty-print an HTML file
|
|
151
|
+
justhtml index.html
|
|
152
|
+
|
|
153
|
+
# Parse from stdin
|
|
154
|
+
curl -s https://example.com | justhtml -
|
|
155
|
+
|
|
156
|
+
# Extract text from selected nodes
|
|
157
|
+
justhtml index.html --selector "main p" --format text
|
|
158
|
+
|
|
159
|
+
# Convert selected HTML to Markdown
|
|
160
|
+
justhtml index.html --selector "article" --format markdown
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
## Correctness
|
|
164
|
+
|
|
165
|
+
JustHTML is tested against the official html5lib tokenizer, tree-construction, serializer, and encoding fixtures, plus project-specific sanitizer, selector, transform, CLI, and regression tests.
|
|
166
|
+
|
|
167
|
+
The current test summary is 10,257 passing tests with 100% line and branch coverage. See [Correctness Testing](docs/correctness.md) for details.
|
|
168
|
+
|
|
169
|
+
## Documentation
|
|
170
|
+
|
|
171
|
+
- [Quickstart](docs/quickstart.md)
|
|
172
|
+
- [Comparison](docs/comparison.md)
|
|
173
|
+
- [API Reference](docs/api.md)
|
|
174
|
+
- [Sanitization & Security](docs/sanitization.md)
|
|
175
|
+
- [Migrating from Bleach](docs/bleach-migration.md)
|
|
176
|
+
- [Command Line](docs/cli.md)
|
|
177
|
+
- [Full documentation site](https://emilstenstrom.github.io/justhtml/)
|
|
178
|
+
|
|
179
|
+
## Security
|
|
180
|
+
|
|
181
|
+
JustHTML sanitizes by default, but output safety still depends on where you put it. HTML body output is not automatically safe inside JavaScript, CSS, URL attributes, or other contexts.
|
|
182
|
+
|
|
183
|
+
For the supported-version policy and vulnerability reporting, see [SECURITY.md](SECURITY.md).
|
|
184
|
+
|
|
185
|
+
## License
|
|
186
|
+
|
|
187
|
+
MIT. Free to use for commercial and non-commercial projects.
|
justhtml-2.2.0/README.md
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# JustHTML
|
|
2
|
+
|
|
3
|
+
HTML from the real web is messy. It is often malformed, user supplied, scraped from unknown pages, or headed for a browser where small parsing differences can become security bugs.
|
|
4
|
+
|
|
5
|
+
JustHTML gives Python projects one small dependency for the common HTML jobs:
|
|
6
|
+
|
|
7
|
+
- parse HTML like a browser, including broken markup
|
|
8
|
+
- sanitize untrusted HTML by default
|
|
9
|
+
- query with CSS selectors
|
|
10
|
+
- transform, serialize, extract text, or convert to Markdown
|
|
11
|
+
- run anywhere Python runs, with no C extension and no system package to install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install justhtml
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Requires Python 3.10 or later.
|
|
18
|
+
|
|
19
|
+
[Documentation](https://emilstenstrom.github.io/justhtml/) | [Comparison](docs/comparison.md) | [Playground](https://emilstenstrom.github.io/justhtml/playground/) | [Security policy](SECURITY.md)
|
|
20
|
+
|
|
21
|
+
![JustHTML README explainer](assets/justhtml-readme-explainer.png)
|
|
22
|
+
|
|
23
|
+
## Why Use It?
|
|
24
|
+
|
|
25
|
+
Most Python HTML libraries optimize for one part of the problem.
|
|
26
|
+
|
|
27
|
+
`html.parser` is built in, but not HTML5-correct. BeautifulSoup is convenient, but depends heavily on the parser underneath. `lxml` and C/Rust-backed parsers are fast, but usually leave sanitization as a separate concern. `html5lib` and Bleach shaped the Python ecosystem, but neither is still the obvious foundation for new projects.
|
|
28
|
+
|
|
29
|
+
JustHTML is for applications that want a boring, inspectable, pure-Python default:
|
|
30
|
+
|
|
31
|
+
- **Correct parsing:** browser-style HTML5 recovery, tested against the official html5lib fixtures.
|
|
32
|
+
- **Safe by default:** `JustHTML(html)` sanitizes before you query or serialize.
|
|
33
|
+
- **One DOM:** parse once, then sanitize, query, transform, serialize, extract text, or produce Markdown.
|
|
34
|
+
- **Easy deployment:** zero runtime dependencies, no compiler, works on PyPy and Pyodide.
|
|
35
|
+
- **Honest tradeoff:** if you are parsing terabytes of trusted HTML, use a C/Rust parser. If you need reliable handling of untrusted or malformed HTML inside a Python app, use JustHTML.
|
|
36
|
+
|
|
37
|
+
## Quick Start
|
|
38
|
+
|
|
39
|
+
```python
|
|
40
|
+
from justhtml import JustHTML
|
|
41
|
+
|
|
42
|
+
doc = JustHTML(
|
|
43
|
+
"<p>Hello<script>alert(1)</script> "
|
|
44
|
+
"<a href='javascript:alert(1)'>bad</a> "
|
|
45
|
+
"<a href='https://example.com'>ok</a></p>",
|
|
46
|
+
fragment=True,
|
|
47
|
+
)
|
|
48
|
+
|
|
49
|
+
print(doc.to_html(pretty=False))
|
|
50
|
+
# => <p>Hello <a>bad</a> <a href="https://example.com">ok</a></p>
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Sanitization is enabled by default. Disable it only for trusted input:
|
|
54
|
+
|
|
55
|
+
```python
|
|
56
|
+
doc = JustHTML("<main><p class='intro'>Hello</p></main>", sanitize=False)
|
|
57
|
+
intro = doc.query_one("p.intro")
|
|
58
|
+
|
|
59
|
+
print(intro.to_text())
|
|
60
|
+
# => Hello
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## What You Can Do
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from justhtml import JustHTML, Linkify, SetAttrs, Unwrap
|
|
67
|
+
|
|
68
|
+
doc = JustHTML(
|
|
69
|
+
"<p>Hello <span>world</span> example.com</p>",
|
|
70
|
+
fragment=True,
|
|
71
|
+
sanitize=False,
|
|
72
|
+
transforms=[
|
|
73
|
+
Unwrap("span"),
|
|
74
|
+
Linkify(),
|
|
75
|
+
SetAttrs("a", rel="nofollow"),
|
|
76
|
+
],
|
|
77
|
+
)
|
|
78
|
+
|
|
79
|
+
print(doc.to_html(pretty=False))
|
|
80
|
+
# => <p>Hello world <a href="http://example.com" rel="nofollow">example.com</a></p>
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
JustHTML includes:
|
|
84
|
+
|
|
85
|
+
- [CSS selectors](docs/selectors.md): `query()` and `query_one()`
|
|
86
|
+
- [Sanitization](docs/sanitization.md): allowlisted HTML cleaning, URL policies, inline CSS controls
|
|
87
|
+
- [Transforms](docs/transforms.md): unwrap, drop, edit attributes, linkify, compose cleanup pipelines
|
|
88
|
+
- [Text output](docs/text.md): `to_text()` and Markdown generation
|
|
89
|
+
- [Builder API](docs/building.md): construct nodes directly from Python
|
|
90
|
+
- [Streaming](docs/streaming.md): process large inputs incrementally
|
|
91
|
+
- [Bleach migration guide](docs/bleach-migration.md): move existing sanitizer code to JustHTML policies
|
|
92
|
+
|
|
93
|
+
## Command Line
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# Pretty-print an HTML file
|
|
97
|
+
justhtml index.html
|
|
98
|
+
|
|
99
|
+
# Parse from stdin
|
|
100
|
+
curl -s https://example.com | justhtml -
|
|
101
|
+
|
|
102
|
+
# Extract text from selected nodes
|
|
103
|
+
justhtml index.html --selector "main p" --format text
|
|
104
|
+
|
|
105
|
+
# Convert selected HTML to Markdown
|
|
106
|
+
justhtml index.html --selector "article" --format markdown
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
## Correctness
|
|
110
|
+
|
|
111
|
+
JustHTML is tested against the official html5lib tokenizer, tree-construction, serializer, and encoding fixtures, plus project-specific sanitizer, selector, transform, CLI, and regression tests.
|
|
112
|
+
|
|
113
|
+
The current test summary is 10,257 passing tests with 100% line and branch coverage. See [Correctness Testing](docs/correctness.md) for details.
|
|
114
|
+
|
|
115
|
+
## Documentation
|
|
116
|
+
|
|
117
|
+
- [Quickstart](docs/quickstart.md)
|
|
118
|
+
- [Comparison](docs/comparison.md)
|
|
119
|
+
- [API Reference](docs/api.md)
|
|
120
|
+
- [Sanitization & Security](docs/sanitization.md)
|
|
121
|
+
- [Migrating from Bleach](docs/bleach-migration.md)
|
|
122
|
+
- [Command Line](docs/cli.md)
|
|
123
|
+
- [Full documentation site](https://emilstenstrom.github.io/justhtml/)
|
|
124
|
+
|
|
125
|
+
## Security
|
|
126
|
+
|
|
127
|
+
JustHTML sanitizes by default, but output safety still depends on where you put it. HTML body output is not automatically safe inside JavaScript, CSS, URL attributes, or other contexts.
|
|
128
|
+
|
|
129
|
+
For the supported-version policy and vulnerability reporting, see [SECURITY.md](SECURITY.md).
|
|
130
|
+
|
|
131
|
+
## License
|
|
132
|
+
|
|
133
|
+
MIT. Free to use for commercial and non-commercial projects.
|
|
@@ -15,7 +15,8 @@
|
|
|
15
15
|
|
|
16
16
|
| Version | Supported |
|
|
17
17
|
| ------- | ---------------------------------------- |
|
|
18
|
-
|
|
|
18
|
+
| 2.x | :white_check_mark: (until 3.0 is released) |
|
|
19
|
+
| 1.x | :x: |
|
|
19
20
|
| < 1.0 | :x: |
|
|
20
21
|
|
|
21
22
|
## Security Domains
|
|
Binary file
|
|
@@ -18,8 +18,8 @@ from enum import Enum
|
|
|
18
18
|
from pathlib import Path
|
|
19
19
|
|
|
20
20
|
from justhtml import JustHTML
|
|
21
|
-
from justhtml.context import FragmentContext
|
|
22
|
-
from justhtml.
|
|
21
|
+
from justhtml.parser.context import FragmentContext
|
|
22
|
+
from justhtml.serializer import to_test_format
|
|
23
23
|
|
|
24
24
|
# Available parsers
|
|
25
25
|
PARSERS = ["justhtml", "html5lib", "html5_parser", "lxml", "bs4", "html.parser", "selectolax", "markupever"]
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
[← Back to docs](index.md)
|
|
2
|
+
|
|
3
|
+
# Comparison
|
|
4
|
+
|
|
5
|
+
Use JustHTML when you want browser-grade HTML parsing, safe-by-default sanitization, CSS selectors, transforms, text extraction, and serialization in one pure-Python package.
|
|
6
|
+
|
|
7
|
+
Use a different tool when one narrow requirement matters more than the whole pipeline: maximum throughput, a BeautifulSoup-specific API, XPath-heavy XML work, or integration with an existing lxml tree.
|
|
8
|
+
|
|
9
|
+
## At a Glance
|
|
10
|
+
|
|
11
|
+
| Tool | HTML5 parsing [1][2] | Speed | Query | Build | Sanitize | Notes |
|
|
12
|
+
|------|------------------------------------------|-------|----------|-------|------------------|-------|
|
|
13
|
+
| **JustHTML**<br>Pure Python | ✅ 100% | ⚡ Fast | ✅ CSS selectors | ✅ `element()` | ✅ Built-in | Correct, secure, easy to install, and fast enough. |
|
|
14
|
+
| **`selectolax`**<br>Python wrapper of C-based Lexbor | ✅ 100% | 🚀 Very Fast | ✅ CSS selectors | ✅ `create_node()` | ❌ Needs sanitization | Very fast and spec-compliant. |
|
|
15
|
+
| **Chromium**<br>browser engine | ✅ 99.5% | 🚀 Very Fast | — | — | — | — |
|
|
16
|
+
| **WebKit**<br>browser engine | ✅ 98.4% | 🚀 Very Fast | — | — | — | — |
|
|
17
|
+
| **Firefox**<br>browser engine | ✅ 97.6% | 🚀 Very Fast | — | — | — | — |
|
|
18
|
+
| **`markupever`**<br>Python wrapper of Rust-based html5ever | 🟡 89% | 🚀 Very Fast | ✅ CSS selectors | ✅ `TreeDom .create_*()` | ❌ Needs sanitization | Fast and mostly correct, but capabilities missing from its public API count as failures in the compliance score. |
|
|
19
|
+
| **`html5lib`**<br>Pure Python | 🟡 86% | 🐢 Slow | 🟡 XPath (lxml) | 🟡 Tree API | 🔴 [Deprecated](https://github.com/html5lib/html5lib-python/issues/443) | Unmaintained reference implementation; incomplete coverage of the tree-construction fixtures. |
|
|
20
|
+
| **`html5_parser`**<br>Python wrapper of C-based Gumbo | 🔴 49% | 🚀 Very Fast | 🟡 XPath (lxml) | 🟡 `etree` (lxml) | ❌ Needs sanitization | Fast, but its public tree API loses information needed by many fixtures. |
|
|
21
|
+
| **`BeautifulSoup`**<br>Pure Python | 🔴 <1% (default) | 🐢 Slow | 🟡 Custom API | ✅ `new_tag()` API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
|
|
22
|
+
| **`html.parser`**<br>Python stdlib | 🔴 <1% | ⚡ Fast | ❌ None | ❌ None | ❌ Needs sanitization | Standard library. Chokes on malformed HTML. |
|
|
23
|
+
| **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 <1% | 🚀 Very Fast | 🟡 XPath | ✅ `etree` / E-factory | ❌ Needs sanitization | Fast but not HTML5 compliant. Context-fragment cases are skipped; supported cases still perform poorly. Don't use the old lxml.html.clean module! |
|
|
24
|
+
|
|
25
|
+
[1]: Parser compliance scores are from a strict run of the [html5lib-tests](https://github.com/html5lib/html5lib-tests) tree-construction fixtures (1,743 non-script tests). The score is `pass / (pass + fail + error)`; unsupported public API capabilities count as failures rather than being faked. The benchmark may compose multiple public APIs from the same parser, but does not use testcase-specific shims or synthetic adapters when an API surface is missing. See [Correctness Testing](correctness.md) for details.
|
|
26
|
+
|
|
27
|
+
[2]: Browser numbers are from a local rerun of [`justhtml-html5lib-tests-bench`](https://github.com/EmilStenstrom/justhtml-html5lib-tests-bench) against this repo's `tests/html5lib-tests-tree/*.dat` corpus: Chromium 1762/1770, WebKit 1742/1770, Firefox 1728/1770, with 12 skipped scripting-enabled cases per engine.
|
|
28
|
+
|
|
29
|
+
## Why JustHTML
|
|
30
|
+
|
|
31
|
+
Most Python HTML projects start simple and then accumulate extra tools:
|
|
32
|
+
|
|
33
|
+
- a parser for broken HTML
|
|
34
|
+
- a sanitizer for user input
|
|
35
|
+
- a selector engine
|
|
36
|
+
- a serializer
|
|
37
|
+
- linkification or cleanup filters
|
|
38
|
+
- text or Markdown extraction
|
|
39
|
+
|
|
40
|
+
JustHTML keeps those operations on one DOM. That makes the behavior easier to reason about, especially when the input is untrusted.
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from justhtml import JustHTML
|
|
44
|
+
|
|
45
|
+
doc = JustHTML("<p>Hello<script>alert(1)</script><a href='javascript:x'>link</a></p>", fragment=True)
|
|
46
|
+
|
|
47
|
+
print(doc.to_html(pretty=False))
|
|
48
|
+
# <p>Hello<a>link</a></p>
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Sanitization happens before you query or serialize unless you explicitly disable it with `sanitize=False`.
|
|
52
|
+
|
|
53
|
+
## When to Choose Another Tool
|
|
54
|
+
|
|
55
|
+
Choose **selectolax** when raw speed is the main requirement and the HTML is trusted or sanitized elsewhere.
|
|
56
|
+
|
|
57
|
+
Choose **markupever** or **html5_parser** when you specifically want their underlying parser engines or tree APIs and can accept their compatibility tradeoffs.
|
|
58
|
+
|
|
59
|
+
Choose **BeautifulSoup** when you want its forgiving, familiar scraping API and parser correctness is not the main risk.
|
|
60
|
+
|
|
61
|
+
Choose **lxml** when your project is already built around XPath, etree, or XML-style processing.
|
|
62
|
+
|
|
63
|
+
Choose **nh3** when you only need fast sanitization and are happy with a Rust-backed dependency.
|
|
64
|
+
|
|
65
|
+
Choose **html.parser** when you need a tiny stdlib-only script for trusted input and HTML5 correctness does not matter.
|
|
66
|
+
|
|
67
|
+
Choose **Bleach** only for existing codebases that already depend on it. For new projects, prefer an actively maintained sanitizer. See [Migrating from Bleach](bleach-migration.md).
|
|
68
|
+
|
|
69
|
+
## Tradeoffs
|
|
70
|
+
|
|
71
|
+
JustHTML is pure Python. That makes it easy to install, inspect, debug, and run in environments like Pyodide, but it will not beat C or Rust parsers on raw throughput.
|
|
72
|
+
|
|
73
|
+
JustHTML sanitizes HTML output by default. That is the right default for user-generated content, CMS snippets, comments, scraped fragments, and transform pipelines that eventually return to a browser. If all of your input is trusted, pass `sanitize=False`.
|
|
74
|
+
|
|
75
|
+
JustHTML's sanitizer emits HTML-only output. SVG and MathML can still be parsed when sanitization is disabled, but sanitized output drops foreign-namespace content to keep the security model smaller and more reviewable.
|
|
76
|
+
|
|
77
|
+
## Related Pages
|
|
78
|
+
|
|
79
|
+
- [Correctness Testing](correctness.md)
|
|
80
|
+
- [Sanitization & Security](sanitization.md)
|
|
81
|
+
- [Migrating from Bleach](bleach-migration.md)
|
|
82
|
+
- [Performance Benchmark](../benchmarks/performance.py)
|
|
@@ -21,6 +21,7 @@ A pure Python HTML5 parser that just works.
|
|
|
21
21
|
## Contents
|
|
22
22
|
|
|
23
23
|
- **[Quickstart](quickstart.md)** - Get up and running in 2 minutes
|
|
24
|
+
- **[Comparison](comparison.md)** - How JustHTML compares with other Python HTML tools
|
|
24
25
|
- **[Learn by examples](migration-examples.md)** - Real-world StackOverflow tasks rewritten with JustHTML
|
|
25
26
|
- **[API Reference](api.md)** - Complete public API documentation
|
|
26
27
|
- **[Command Line](cli.md)** - Use `justhtml` to extract HTML, text, or Markdown
|
|
@@ -2,18 +2,18 @@
|
|
|
2
2
|
"""Interactive release helper for JustHTML.
|
|
3
3
|
|
|
4
4
|
What it does (in order):
|
|
5
|
-
1)
|
|
6
|
-
2)
|
|
7
|
-
3)
|
|
8
|
-
4)
|
|
9
|
-
5)
|
|
10
|
-
|
|
11
|
-
This script is intentionally minimal and uses only `git` and the GitHub CLI (`gh`).
|
|
5
|
+
1) Runs the repo-wide pre-commit checks that CI runs for releases
|
|
6
|
+
2) Bumps version in pyproject.toml ([project].version)
|
|
7
|
+
3) Commits the change
|
|
8
|
+
4) Creates an annotated git tag
|
|
9
|
+
5) Pushes commit + tag
|
|
10
|
+
6) Creates a GitHub release (marked as latest) via `gh`
|
|
12
11
|
"""
|
|
13
12
|
|
|
14
13
|
from __future__ import annotations
|
|
15
14
|
|
|
16
15
|
import argparse
|
|
16
|
+
import os
|
|
17
17
|
import re
|
|
18
18
|
import shlex
|
|
19
19
|
import subprocess
|
|
@@ -55,6 +55,29 @@ def _run(cmd: list[str], *, check: bool = True) -> CmdResult:
|
|
|
55
55
|
return out
|
|
56
56
|
|
|
57
57
|
|
|
58
|
+
def _run_with_env(cmd: list[str], *, env: dict[str, str], check: bool = True) -> CmdResult:
|
|
59
|
+
merged_env = os.environ.copy()
|
|
60
|
+
merged_env.update(env)
|
|
61
|
+
p = subprocess.run( # noqa: S603
|
|
62
|
+
cmd,
|
|
63
|
+
check=False,
|
|
64
|
+
text=True,
|
|
65
|
+
stdout=subprocess.PIPE,
|
|
66
|
+
stderr=subprocess.STDOUT,
|
|
67
|
+
env=merged_env,
|
|
68
|
+
)
|
|
69
|
+
out = CmdResult(stdout=p.stdout, returncode=p.returncode)
|
|
70
|
+
if check and p.returncode != 0:
|
|
71
|
+
raise RuntimeError(
|
|
72
|
+
"Command failed (exit {code}): {cmd}\n{out}".format(
|
|
73
|
+
code=p.returncode,
|
|
74
|
+
cmd=_quote_cmd(cmd),
|
|
75
|
+
out=(p.stdout or "").rstrip(),
|
|
76
|
+
)
|
|
77
|
+
)
|
|
78
|
+
return out
|
|
79
|
+
|
|
80
|
+
|
|
58
81
|
def _run_quiet_ok(cmd: list[str]) -> bool:
|
|
59
82
|
p = subprocess.run( # noqa: S603
|
|
60
83
|
cmd,
|
|
@@ -248,6 +271,13 @@ def _default_repo_from_remote(remote: str) -> str:
|
|
|
248
271
|
return f"{m.group('owner')}/{m.group('repo')}"
|
|
249
272
|
|
|
250
273
|
|
|
274
|
+
def _run_release_checks() -> None:
|
|
275
|
+
print("Running release checks: SKIP=mypy pre-commit run --all-files")
|
|
276
|
+
out = _run_with_env(["pre-commit", "run", "--all-files"], env={"SKIP": "mypy"}).stdout
|
|
277
|
+
if out.strip():
|
|
278
|
+
print(out.rstrip())
|
|
279
|
+
|
|
280
|
+
|
|
251
281
|
def main(argv: list[str] | None = None) -> int:
|
|
252
282
|
parser = argparse.ArgumentParser(description="Bump version, tag, and create a GitHub release.")
|
|
253
283
|
parser.add_argument("--version", help="New version, e.g. 0.21.0 (will be tagged as v0.21.0 unless --tag is set)")
|
|
@@ -294,12 +324,20 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
294
324
|
action="store_true",
|
|
295
325
|
help="Do not prompt for confirmation before push/release.",
|
|
296
326
|
)
|
|
327
|
+
parser.add_argument(
|
|
328
|
+
"--skip-checks",
|
|
329
|
+
action="store_true",
|
|
330
|
+
help="Skip repo-wide pre-commit validation. Use only when you have already validated the exact release commit.",
|
|
331
|
+
)
|
|
297
332
|
|
|
298
333
|
args = parser.parse_args(argv)
|
|
299
334
|
|
|
300
335
|
try:
|
|
301
336
|
_require_clean_git()
|
|
302
337
|
|
|
338
|
+
if not args.skip_checks:
|
|
339
|
+
_run_release_checks()
|
|
340
|
+
|
|
303
341
|
py_text = PYPROJECT_PATH.read_text(encoding="utf-8")
|
|
304
342
|
current_version = _read_current_version(py_text)
|
|
305
343
|
|
|
@@ -163,14 +163,17 @@ class JustHTML:
|
|
|
163
163
|
if needs_escape_incomplete_tags:
|
|
164
164
|
opts.emit_bogus_markup_as_text = True
|
|
165
165
|
|
|
166
|
-
# For
|
|
167
|
-
|
|
168
|
-
|
|
166
|
+
# For text-like HTML fragment contexts, set the initial tokenizer state
|
|
167
|
+
# to match the context element.
|
|
168
|
+
if fragment_context and fragment_context.namespace in {None, "html"}:
|
|
169
169
|
tag_name = fragment_context.tag_name.lower()
|
|
170
|
-
if tag_name in
|
|
170
|
+
if tag_name in {"textarea", "title"}:
|
|
171
|
+
opts.initial_state = Tokenizer.RCDATA
|
|
172
|
+
elif tag_name in {"iframe", "noembed", "noframes", "script", "style", "xmp"} or (
|
|
173
|
+
tag_name == "noscript" and opts.scripting_enabled
|
|
174
|
+
):
|
|
171
175
|
opts.initial_state = Tokenizer.RAWTEXT
|
|
172
|
-
|
|
173
|
-
elif tag_name in ("plaintext", "script"):
|
|
176
|
+
elif tag_name == "plaintext":
|
|
174
177
|
opts.initial_state = Tokenizer.PLAINTEXT
|
|
175
178
|
|
|
176
179
|
self.tokenizer = Tokenizer(
|