justhtml 0.12.0__py3-none-any.whl → 0.38.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +48 -0
- justhtml/__main__.py +86 -17
- justhtml/constants.py +12 -0
- justhtml/entities.py +45 -7
- justhtml/errors.py +17 -3
- justhtml/linkify.py +438 -0
- justhtml/node.py +385 -97
- justhtml/parser.py +139 -16
- justhtml/sanitize.py +992 -0
- justhtml/selector.py +117 -19
- justhtml/serialize.py +671 -41
- justhtml/tokenizer.py +364 -194
- justhtml/tokens.py +28 -5
- justhtml/transforms.py +2568 -0
- justhtml/treebuilder.py +297 -204
- justhtml/treebuilder_modes.py +208 -138
- justhtml-0.38.0.dist-info/METADATA +213 -0
- justhtml-0.38.0.dist-info/RECORD +26 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/licenses/LICENSE +4 -1
- justhtml-0.12.0.dist-info/METADATA +0 -164
- justhtml-0.12.0.dist-info/RECORD +0 -23
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/WHEEL +0 -0
- {justhtml-0.12.0.dist-info → justhtml-0.38.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: justhtml
|
|
3
|
+
Version: 0.38.0
|
|
4
|
+
Summary: A pure Python HTML5 parser that just works.
|
|
5
|
+
Project-URL: Homepage, https://github.com/emilstenstrom/justhtml
|
|
6
|
+
Project-URL: Issues, https://github.com/emilstenstrom/justhtml/issues
|
|
7
|
+
Author-email: Emil Stenström <emil@emilstenstrom.se>
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Provides-Extra: benchmark
|
|
14
|
+
Requires-Dist: beautifulsoup4; extra == 'benchmark'
|
|
15
|
+
Requires-Dist: html5-parser; extra == 'benchmark'
|
|
16
|
+
Requires-Dist: html5lib; extra == 'benchmark'
|
|
17
|
+
Requires-Dist: lxml; extra == 'benchmark'
|
|
18
|
+
Requires-Dist: psutil; extra == 'benchmark'
|
|
19
|
+
Requires-Dist: selectolax; extra == 'benchmark'
|
|
20
|
+
Requires-Dist: zstandard; extra == 'benchmark'
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: build; extra == 'dev'
|
|
23
|
+
Requires-Dist: coverage; extra == 'dev'
|
|
24
|
+
Requires-Dist: mypy>=1.0; (platform_python_implementation != 'PyPy') and extra == 'dev'
|
|
25
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
26
|
+
Requires-Dist: ruff==0.14.7; extra == 'dev'
|
|
27
|
+
Requires-Dist: twine; extra == 'dev'
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
|
|
30
|
+
# JustHTML
|
|
31
|
+
|
|
32
|
+
A pure Python HTML5 parser that just works. No C extensions to compile. No system dependencies to install. No complex API to learn.
|
|
33
|
+
|
|
34
|
+
[📖 Full documentation](docs/index.md) | [🛝 Try it in the Playground](https://emilstenstrom.github.io/justhtml/playground/)
|
|
35
|
+
|
|
36
|
+
## Why use JustHTML?
|
|
37
|
+
|
|
38
|
+
- **Just... Correct ✅** — Spec-perfect HTML5 parsing with browser-grade error recovery — passes the official 9k+ [html5lib-tests](https://github.com/html5lib/html5lib-tests) suite, with 100% line+branch coverage. ([Correctness](docs/correctness.md))
|
|
39
|
+
|
|
40
|
+
```python
|
|
41
|
+
JustHTML("<p><b>Hi<i>there</b>!", fragment=True).to_html()
|
|
42
|
+
# => <p><b>Hi<i>there</i></b><i>!</i></p>
|
|
43
|
+
|
|
44
|
+
# Note: fragment=True parses snippets (no <html>/<body> needed)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
- **Just... Python 🐍** — Pure Python, zero dependencies — no C extensions or system libraries, easy to debug, and works anywhere Python runs, including PyPy and Pyodide. ([Run in the browser](https://emilstenstrom.github.io/justhtml/playground/))
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
python -m pip show justhtml | grep -E '^Requires:'
|
|
52
|
+
# Requires: [intentionally left blank]
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
- **Just... Secure 🔒** — Safe-by-default sanitization at construction time — built-in Bleach-style allowlist sanitization on `JustHTML(...)` (disable with `safe=False`). Can sanitize inline CSS rules. ([Sanitization & Security](docs/sanitization.md))
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
JustHTML(
|
|
59
|
+
"<p>Hello<script>alert(1)</script> "
|
|
60
|
+
"<a href=\"javascript:alert(1)\">bad</a> "
|
|
61
|
+
"<a href=\"https://example.com/?a=1&b=2\">ok</a></p>",
|
|
62
|
+
fragment=True,
|
|
63
|
+
).to_html()
|
|
64
|
+
# => <p>Hello <a>bad</a> <a href="https://example.com/?a=1&amp;b=2">ok</a></p>
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
- **Just... Query 🔍** — CSS selectors out of the box — one method (`query()`), familiar syntax (combinators, groups, pseudo-classes), and plain Python nodes as results. ([CSS Selectors](docs/selectors.md))
|
|
68
|
+
|
|
69
|
+
```python
|
|
70
|
+
JustHTML(
|
|
71
|
+
"<main><p class=\"x\">Hi</p><p>Bye</p></main>",
|
|
72
|
+
fragment=True,
|
|
73
|
+
).query("main p.x")[0].to_html()
|
|
74
|
+
# => <p class="x">Hi</p>
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
- **Just... Transform 🏗️** — Built-in DOM transforms to drop/unwrap nodes, rewrite attributes, linkify text, and compose safe pipelines. ([Transforms](docs/transforms.md))
|
|
78
|
+
|
|
79
|
+
```python
|
|
80
|
+
from justhtml import JustHTML, Linkify, SetAttrs, Unwrap
|
|
81
|
+
|
|
82
|
+
doc = JustHTML(
|
|
83
|
+
"<p>Hello <span class=\"x\">world</span> example.com</p>",
|
|
84
|
+
transforms=[
|
|
85
|
+
Unwrap("span.x"),
|
|
86
|
+
Linkify(),
|
|
87
|
+
SetAttrs("a", rel="nofollow"),
|
|
88
|
+
],
|
|
89
|
+
)
|
|
90
|
+
print(doc.to_html(pretty=False))
|
|
91
|
+
# => <p>Hello world <a href="https://example.com" rel="nofollow">example.com</a></p>
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
- **Just... Fast Enough ⚡** — Fast for the common case (fastest pure-Python HTML5 parser available); for terabytes, use a C/Rust parser like `html5ever`. ([Benchmarks](benchmarks/performance.py))
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
/usr/bin/time -f '%e s' bash -lc \
|
|
98
|
+
"curl -Ls https://en.wikipedia.org/wiki/HTML | python -m justhtml - > /dev/null"
|
|
99
|
+
# 0.41 s
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Comparison
|
|
103
|
+
|
|
104
|
+
| Tool | HTML5 parsing [1][2] | Speed | CSS query | Sanitizes output | Notes |
|
|
105
|
+
|------|------------------------------------------|-------|----------|------------------|-------|
|
|
106
|
+
| **JustHTML**<br>Pure Python | ✅ **100%** | ⚡ Fast | ✅ CSS selectors | ✅ Built-in (`safe=True`) | Correct, easy to install, and fast enough. |
|
|
107
|
+
| **Chromium**<br>browser engine | ✅ **99%** | 🚀 Very Fast | — | — | — |
|
|
108
|
+
| **WebKit**<br>browser engine | ✅ **98%** | 🚀 Very Fast | — | — | — |
|
|
109
|
+
| **Firefox**<br>browser engine | ✅ **97%** | 🚀 Very Fast | — | — | — |
|
|
110
|
+
| **`html5lib`**<br>Pure Python | 🟡 88% | 🐢 Slow | 🟡 XPath (lxml) | 🔴 [Deprecated](https://github.com/html5lib/html5lib-python/issues/443) | Unmaintained. Reference implementation; correct but quite slow. |
|
|
111
|
+
| **`html5_parser`**<br>Python wrapper of C-based Gumbo | 🟡 84% | 🚀 Very Fast | 🟡 XPath (lxml) | ❌ Needs sanitization | Fast and mostly correct. |
|
|
112
|
+
| **`selectolax`**<br>Python wrapper of C-based Lexbor | 🟡 68% | 🚀 Very Fast | ✅ CSS selectors | ❌ Needs sanitization | Very fast but less compliant. |
|
|
113
|
+
| **`html.parser`**<br>Python stdlib | 🔴 4% | ⚡ Fast | ❌ None | ❌ Needs sanitization | Standard library. Chokes on malformed HTML. |
|
|
114
|
+
| **`BeautifulSoup`**<br>Pure Python | 🔴 4% (default) | 🐢 Slow | 🟡 Custom API | ❌ Needs sanitization | Wraps `html.parser` (default). Can use lxml or html5lib. |
|
|
115
|
+
| **`lxml`**<br>Python wrapper of C-based libxml2 | 🔴 1% | 🚀 Very Fast | 🟡 XPath | ❌ Needs sanitization | Fast but not HTML5 compliant. Don't use the old lxml.html.clean module! |
|
|
116
|
+
|
|
117
|
+
[1]: Parser compliance scores are from a strict run of the [html5lib-tests](https://github.com/html5lib/html5lib-tests) tree-construction fixtures (1,743 non-script tests). See [docs/correctness.md](docs/correctness.md) for details.
|
|
118
|
+
|
|
119
|
+
[2]: Browser numbers are from [`justhtml-html5lib-tests-bench`](https://github.com/EmilStenstrom/justhtml-html5lib-tests-bench) on the upstream `html5lib-tests/tree-construction` corpus (excluding 12 scripting-enabled cases).
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
## Installation
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
pip install justhtml
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
Next: [Quickstart Guide](docs/quickstart.md), [CSS Selectors](docs/selectors.md), [Sanitization & Security](docs/sanitization.md), or [try the Playground](https://emilstenstrom.github.io/justhtml/playground/).
|
|
129
|
+
|
|
130
|
+
Requires Python 3.10 or later.
|
|
131
|
+
|
|
132
|
+
## Quick Example
|
|
133
|
+
|
|
134
|
+
```python
|
|
135
|
+
from justhtml import JustHTML
|
|
136
|
+
|
|
137
|
+
doc = JustHTML("<html><body><p class='intro'>Hello!</p></body></html>")
|
|
138
|
+
|
|
139
|
+
# Query with CSS selectors
|
|
140
|
+
for p in doc.query("p.intro"):
|
|
141
|
+
    print(p.name) # "p"
|
|
142
|
+
    print(p.attrs) # {"class": "intro"}
|
|
143
|
+
    print(p.to_html()) # <p class="intro">Hello!</p>
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
See the **[Quickstart Guide](docs/quickstart.md)** for more examples including tree traversal, streaming, and strict mode.
|
|
147
|
+
|
|
148
|
+
## Command Line
|
|
149
|
+
|
|
150
|
+
If you installed JustHTML (for example with `pip install justhtml` or `pip install -e .`), you can use the `justhtml` command.
|
|
151
|
+
If you don't have it available, use the equivalent `python -m justhtml ...` form instead.
|
|
152
|
+
|
|
153
|
+
```bash
|
|
154
|
+
# Pretty-print an HTML file
|
|
155
|
+
justhtml index.html
|
|
156
|
+
|
|
157
|
+
# Parse from stdin
|
|
158
|
+
curl -s https://example.com | justhtml -
|
|
159
|
+
|
|
160
|
+
# Select nodes and output text
|
|
161
|
+
justhtml index.html --selector "main p" --format text
|
|
162
|
+
|
|
163
|
+
# Select nodes and output Markdown (subset of GFM)
|
|
164
|
+
justhtml index.html --selector "article" --format markdown
|
|
165
|
+
|
|
166
|
+
# Select nodes and output HTML
|
|
167
|
+
justhtml index.html --selector "a" --format html
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
# Example: extract Markdown from GitHub README HTML
|
|
172
|
+
curl -s https://github.com/EmilStenstrom/justhtml/ | justhtml - --selector '.markdown-body' --format markdown | head -n 15
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Output:
|
|
176
|
+
|
|
177
|
+
```text
|
|
178
|
+
# JustHTML
|
|
179
|
+
|
|
180
|
+
[](#justhtml)
|
|
181
|
+
|
|
182
|
+
A pure Python HTML5 parser that just works. No C extensions to compile. No system dependencies to install. No complex API to learn.
|
|
183
|
+
|
|
184
|
+
**[📖 Read the full documentation here](/EmilStenstrom/justhtml/blob/main/docs/index.md)**
|
|
185
|
+
|
|
186
|
+
## Why use JustHTML?
|
|
187
|
+
|
|
188
|
+
- **Just... Correct ✅** — Spec-perfect HTML5 parsing with browser-grade error recovery — passes the official 9k+ [html5lib-tests](https://github.com/html5lib/html5lib-tests) suite, with 100% line+branch coverage. ([Correctness](/EmilStenstrom/justhtml/blob/main/docs/correctness.md))
|
|
189
|
+
- **Just... Python 🐍** — Pure Python, zero dependencies — no C extensions or system libraries, easy to debug, and works anywhere Python runs (including PyPy and Pyodide). ([Quickstart](/EmilStenstrom/justhtml/blob/main/docs/quickstart.md))
|
|
190
|
+
- **Just... Secure 🔒** — Safe-by-default sanitization at construction time — built-in Bleach-style allowlist sanitization on `JustHTML(...)` (disable with `safe=False`), plus URL/CSS rules. ([Sanitization & Security](/EmilStenstrom/justhtml/blob/main/docs/sanitization.md))
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
## Security
|
|
194
|
+
|
|
195
|
+
For security policy and vulnerability reporting, please see [SECURITY.md](SECURITY.md).
|
|
196
|
+
|
|
197
|
+
## Contributing
|
|
198
|
+
|
|
199
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines.
|
|
200
|
+
|
|
201
|
+
## Acknowledgments
|
|
202
|
+
|
|
203
|
+
JustHTML started as a Python port of [html5ever](https://github.com/servo/html5ever), the HTML5 parser from Mozilla's Servo browser engine. While the codebase has since evolved significantly, html5ever's clean architecture and spec-compliant approach were invaluable as a starting point. Thank you to the Servo team for their excellent work.
|
|
204
|
+
|
|
205
|
+
Correctness and conformance work is heavily guided by the [html5lib](https://github.com/html5lib/html5lib-python) ecosystem and especially the official [html5lib-tests](https://github.com/html5lib/html5lib-tests) fixtures used across implementations.
|
|
206
|
+
|
|
207
|
+
The sanitization API and threat-model expectations are informed by established Python sanitizers like [Bleach](https://github.com/mozilla/bleach) and [nh3](https://github.com/messense/nh3).
|
|
208
|
+
|
|
209
|
+
The CSS selector query API is inspired by the ergonomics of [lxml.cssselect](https://lxml.de/cssselect.html).
|
|
210
|
+
|
|
211
|
+
## License
|
|
212
|
+
|
|
213
|
+
MIT. Free for both commercial and non-commercial use.
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
justhtml/__init__.py,sha256=cyFtwOsxM_m-xG3vNdO4YvBQvEp0HOWUN3EnfGwGotc,1183
|
|
2
|
+
justhtml/__main__.py,sha256=aupMvpS2_C4b11GcSNm5_JdlDkllaQLE3_CR8ttUmmk,6559
|
|
3
|
+
justhtml/constants.py,sha256=85cNNHS3fCSwvFGsQSV7uk_G1Ce0llHBkg3sW8k7WZ8,11881
|
|
4
|
+
justhtml/context.py,sha256=Ac4mV-a3ZgJILQbstFu-EB6bRA5oYlSkHqpTxMlMfk0,293
|
|
5
|
+
justhtml/encoding.py,sha256=9mscoXtBb57zehG_BxzN6aTTJHaNfywk5gwxrnH92K8,11310
|
|
6
|
+
justhtml/entities.py,sha256=_cQ3MBrV2hJwAUPVF8JJf7zbrdrxycKOe3Z_thg93Ng,11161
|
|
7
|
+
justhtml/errors.py,sha256=XVTgiXmfh1tX3PjGKBuhiCQ-72gNVuimBUXexHW9pKo,11045
|
|
8
|
+
justhtml/linkify.py,sha256=qTrEJ4UeSC8fVbryst6HfZkgAs69YvaNWkM2sB3zS74,14112
|
|
9
|
+
justhtml/node.py,sha256=A9IetRR8_MC2QCmmcEiAV5nIg97rorUnlDZ9-LfkjOM,27857
|
|
10
|
+
justhtml/parser.py,sha256=STLG33TkMvb0Z_RH5gUDmcsEjWF_QQ2aabRDXhuUF1I,9984
|
|
11
|
+
justhtml/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
justhtml/sanitize.py,sha256=D0aOgy_iFtCnyNZjFaxoAoZvLvoHhUWgeL02p_M9d7k,33188
|
|
13
|
+
justhtml/selector.py,sha256=FLW-rOZwJxGf4uD6ZdHYI7QcEGzstBOOrf-Ubo-37uA,36015
|
|
14
|
+
justhtml/serialize.py,sha256=AZIGuIFJ8oLfpzz938svNN4wGgxYNA_EGheMzuwoi2s,32766
|
|
15
|
+
justhtml/stream.py,sha256=n8pKtVAivG0VerCWEcXSEBwzj8Tm1ltEAL7F46RGUVM,3431
|
|
16
|
+
justhtml/tokenizer.py,sha256=_v3dpjAuq89gjPJMbZLOKgrTc6GmV-QhuDSKGQA_3Pk,107171
|
|
17
|
+
justhtml/tokens.py,sha256=mk3VBdiula7voCKahRFJ45F14_Qh9Ega-XQ4wwavjMg,7695
|
|
18
|
+
justhtml/transforms.py,sha256=ptHXJ26AtbGTz0zZNIZQP47JphbATme3TyKK7x-qzw4,95289
|
|
19
|
+
justhtml/treebuilder.py,sha256=7RQCtHhRTj4uGlALPZtIzVD-ZoEK0ezyn1-Tto9yw3k,60972
|
|
20
|
+
justhtml/treebuilder_modes.py,sha256=8xupHR4IMaCyLGwKX6lcGDMwalMFlgne3B_fhMvyAE0,98887
|
|
21
|
+
justhtml/treebuilder_utils.py,sha256=LjK9tg9sNYR-sJdXKemJCzzzgh6lQW1KBqyvhpWtaoQ,2912
|
|
22
|
+
justhtml-0.38.0.dist-info/METADATA,sha256=yU5XJ-gqssbudTodF55FvNeRQchuPgTu3bFvI7Y9OuU,10171
|
|
23
|
+
justhtml-0.38.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
24
|
+
justhtml-0.38.0.dist-info/entry_points.txt,sha256=UN06mPn7J0cBM1dqyf245FvmU9mF3ivgplSr5ppdp6g,52
|
|
25
|
+
justhtml-0.38.0.dist-info/licenses/LICENSE,sha256=_IBvKQiU5PIZRnE1-yHzMEj41agX8PgoQkbXLaKdVy4,1256
|
|
26
|
+
justhtml-0.38.0.dist-info/RECORD,,
|
|
@@ -1,6 +1,9 @@
|
|
|
1
1
|
MIT License
|
|
2
2
|
|
|
3
|
-
Copyright (c) 2025 Emil Stenström
|
|
3
|
+
Copyright (c) 2025 Emil Stenström (JustHTML)
|
|
4
|
+
Copyright (c) 2014-2017, The html5ever Project Developers (html5ever inspiration)
|
|
5
|
+
Copyright (c) 2006-2013 James Graham, Sam Sneddon, and
|
|
6
|
+
other contributors (html5lib-tests)
|
|
4
7
|
|
|
5
8
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
9
|
of this software and associated documentation files (the "Software"), to deal
|
|
@@ -1,164 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: justhtml
|
|
3
|
-
Version: 0.12.0
|
|
4
|
-
Summary: A pure Python HTML5 parser that just works.
|
|
5
|
-
Project-URL: Homepage, https://github.com/emilstenstrom/justhtml
|
|
6
|
-
Project-URL: Issues, https://github.com/emilstenstrom/justhtml/issues
|
|
7
|
-
Author-email: Emil Stenström <emil@emilstenstrom.se>
|
|
8
|
-
License-File: LICENSE
|
|
9
|
-
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
-
Classifier: Operating System :: OS Independent
|
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Requires-Python: >=3.10
|
|
13
|
-
Provides-Extra: benchmark
|
|
14
|
-
Requires-Dist: beautifulsoup4; extra == 'benchmark'
|
|
15
|
-
Requires-Dist: html5-parser; extra == 'benchmark'
|
|
16
|
-
Requires-Dist: html5lib; extra == 'benchmark'
|
|
17
|
-
Requires-Dist: lxml; extra == 'benchmark'
|
|
18
|
-
Requires-Dist: psutil; extra == 'benchmark'
|
|
19
|
-
Requires-Dist: selectolax; extra == 'benchmark'
|
|
20
|
-
Requires-Dist: zstandard; extra == 'benchmark'
|
|
21
|
-
Provides-Extra: dev
|
|
22
|
-
Requires-Dist: build; extra == 'dev'
|
|
23
|
-
Requires-Dist: coverage; extra == 'dev'
|
|
24
|
-
Requires-Dist: mypy>=1.0; (platform_python_implementation != 'PyPy') and extra == 'dev'
|
|
25
|
-
Requires-Dist: pre-commit; extra == 'dev'
|
|
26
|
-
Requires-Dist: ruff==0.14.7; extra == 'dev'
|
|
27
|
-
Requires-Dist: twine; extra == 'dev'
|
|
28
|
-
Description-Content-Type: text/markdown
|
|
29
|
-
|
|
30
|
-
# JustHTML
|
|
31
|
-
|
|
32
|
-
A pure Python HTML5 parser that just works. No C extensions to compile. No system dependencies to install. No complex API to learn.
|
|
33
|
-
|
|
34
|
-
**[📖 Read the full documentation here](docs/index.md)**
|
|
35
|
-
|
|
36
|
-
## Why use JustHTML?
|
|
37
|
-
|
|
38
|
-
### 1. Just... Correct ✅
|
|
39
|
-
It implements the official WHATWG HTML5 specification exactly. If a browser can parse it, JustHTML can parse it. It handles all the complex error-handling rules that browsers use.
|
|
40
|
-
|
|
41
|
-
- **Verified Compliance**: Passes all 9k+ tests in the official [html5lib-tests](https://github.com/html5lib/html5lib-tests) suite (used by browser vendors).
|
|
42
|
-
- **100% Coverage**: Every line and branch of code is covered by integration tests.
|
|
43
|
-
- **Fuzz Tested**: Has parsed 6 million randomized broken HTML documents to ensure it never crashes or hangs (see benchmarks/fuzz.py).
|
|
44
|
-
- **Living Standard**: It tracks the living standard, not a snapshot from 2012.
|
|
45
|
-
|
|
46
|
-
### 2. Just... Python 🐍
|
|
47
|
-
JustHTML has **zero dependencies**. It's pure Python.
|
|
48
|
-
|
|
49
|
-
- **Just Install**: No C extensions to compile, no system libraries (like libxml2) required. Works on PyPy, WASM (Pyodide) (yes, it's in the test matrix), and anywhere Python runs.
|
|
50
|
-
- **No dependency upgrade hassle**: Some libraries depend on a large set of other libraries, all of which require upgrades to avoid security issues.
|
|
51
|
-
- **Debuggable**: It's just Python code. You can step through it with a debugger to understand exactly how your HTML is being parsed.
|
|
52
|
-
- **Returns plain python objects**: Other parsers return lxml or etree trees which means you have another API to learn. JustHTML returns a set of nested objects you can iterate over. Simple.
|
|
53
|
-
|
|
54
|
-
### 3. Just... Query 🔍
|
|
55
|
-
Find elements with CSS selectors. Just one method to learn - `query()` - and it uses CSS syntax you already know.
|
|
56
|
-
|
|
57
|
-
```python
|
|
58
|
-
doc.query("div.container > p.intro") # Familiar CSS syntax
|
|
59
|
-
doc.query("#main, .sidebar") # Selector groups
|
|
60
|
-
doc.query("li:nth-child(2n+1)") # Pseudo-classes
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
### 4. Just... Fast Enough ⚡
|
|
64
|
-
|
|
65
|
-
If you need to parse terabytes of data, use a C or Rust parser (like `html5ever`). They are 10x-20x faster.
|
|
66
|
-
|
|
67
|
-
But for most use cases, JustHTML is **fast enough**. It parses the Wikipedia homepage in ~0.1s. It is the fastest pure-Python HTML5 parser available, outperforming `html5lib` and `BeautifulSoup`.
|
|
68
|
-
|
|
69
|
-
## Comparison to other parsers
|
|
70
|
-
|
|
71
|
-
| Parser | HTML5 Compliance | Pure Python? | Speed | Query API | Notes |
|
|
72
|
-
|--------|:----------------:|:------------:|-------|-----------|-------|
|
|
73
|
-
| **JustHTML** | ✅ **100%** | ✅ Yes | ⚡ Fast | ✅ CSS selectors | It just works. Correct, easy to install, and fast enough. |
|
|
74
|
-
| `html5lib` | 🟡 88% | ✅ Yes | 🐢 Slow | ❌ None | The reference implementation. Very correct but quite slow. |
|
|
75
|
-
| `html5_parser` | 🟡 84% | ❌ No | 🚀 Very Fast | 🟡 XPath (lxml) | C-based (Gumbo). Fast and mostly correct. |
|
|
76
|
-
| `selectolax` | 🟡 68% | ❌ No | 🚀 Very Fast | ✅ CSS selectors | C-based (Lexbor). Very fast but less compliant. |
|
|
77
|
-
| `BeautifulSoup` | 🔴 4% | ✅ Yes | 🐢 Slow | 🟡 Custom API | Wrapper around `html.parser`. Not spec compliant. |
|
|
78
|
-
| `html.parser` | 🔴 4% | ✅ Yes | ⚡ Fast | ❌ None | Standard library. Chokes on malformed HTML. |
|
|
79
|
-
| `lxml` | 🔴 1% | ❌ No | 🚀 Very Fast | 🟡 XPath | C-based (libxml2). Fast but not HTML5 compliant. |
|
|
80
|
-
|
|
81
|
-
*Compliance scores from running the [html5lib-tests](https://github.com/html5lib/html5lib-tests) suite (1,743 tree-construction tests). See `benchmarks/correctness.py`.*
|
|
82
|
-
|
|
83
|
-
## Installation
|
|
84
|
-
|
|
85
|
-
Requires Python 3.10 or later.
|
|
86
|
-
|
|
87
|
-
```bash
|
|
88
|
-
pip install justhtml
|
|
89
|
-
```
|
|
90
|
-
|
|
91
|
-
## Quick Example
|
|
92
|
-
|
|
93
|
-
```python
|
|
94
|
-
from justhtml import JustHTML
|
|
95
|
-
|
|
96
|
-
doc = JustHTML("<html><body><p class='intro'>Hello!</p></body></html>")
|
|
97
|
-
|
|
98
|
-
# Query with CSS selectors
|
|
99
|
-
for p in doc.query("p.intro"):
|
|
100
|
-
print(p.name) # "p"
|
|
101
|
-
print(p.attrs) # {"class": "intro"}
|
|
102
|
-
print(p.to_html()) # <p class="intro">Hello!</p>
|
|
103
|
-
```
|
|
104
|
-
|
|
105
|
-
See the **[Quickstart Guide](docs/quickstart.md)** for more examples including tree traversal, streaming, and strict mode.
|
|
106
|
-
|
|
107
|
-
## Command Line
|
|
108
|
-
|
|
109
|
-
If you installed JustHTML (for example with `pip install justhtml` or `pip install -e .`), you can use the `justhtml` command.
|
|
110
|
-
If you don't have it available, use the equivalent `python -m justhtml ...` form instead.
|
|
111
|
-
|
|
112
|
-
```bash
|
|
113
|
-
# Pretty-print an HTML file
|
|
114
|
-
justhtml index.html
|
|
115
|
-
|
|
116
|
-
# Parse from stdin
|
|
117
|
-
curl -s https://example.com | justhtml -
|
|
118
|
-
|
|
119
|
-
# Select nodes and output text
|
|
120
|
-
justhtml index.html --selector "main p" --format text
|
|
121
|
-
|
|
122
|
-
# Select nodes and output Markdown (subset of GFM)
|
|
123
|
-
justhtml index.html --selector "article" --format markdown
|
|
124
|
-
|
|
125
|
-
# Select nodes and output HTML
|
|
126
|
-
justhtml index.html --selector "a" --format html
|
|
127
|
-
```
|
|
128
|
-
|
|
129
|
-
```bash
|
|
130
|
-
# Example: extract Markdown from GitHub README HTML
|
|
131
|
-
curl -s https://github.com/EmilStenstrom/justhtml/ | justhtml - --selector '.markdown-body' --format markdown | head -n 15
|
|
132
|
-
```
|
|
133
|
-
|
|
134
|
-
Output:
|
|
135
|
-
|
|
136
|
-
```text
|
|
137
|
-
# JustHTML
|
|
138
|
-
|
|
139
|
-
[](#justhtml)
|
|
140
|
-
|
|
141
|
-
A pure Python HTML5 parser that just works. No C extensions to compile. No system dependencies to install. No complex API to learn.
|
|
142
|
-
|
|
143
|
-
**[📖 Read the full documentation here](/EmilStenstrom/justhtml/blob/main/docs/index.md)**
|
|
144
|
-
|
|
145
|
-
## Why use JustHTML?
|
|
146
|
-
|
|
147
|
-
[](#why-use-justhtml)
|
|
148
|
-
|
|
149
|
-
### 1. Just... Correct ✅
|
|
150
|
-
|
|
151
|
-
[](#1-just-correct-)
|
|
152
|
-
```
|
|
153
|
-
|
|
154
|
-
## Contributing
|
|
155
|
-
|
|
156
|
-
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines.
|
|
157
|
-
|
|
158
|
-
## Acknowledgments
|
|
159
|
-
|
|
160
|
-
JustHTML started as a Python port of [html5ever](https://github.com/servo/html5ever), the HTML5 parser from Mozilla's Servo browser engine. While the codebase has since evolved significantly, html5ever's clean architecture and spec-compliant approach were invaluable as a starting point. Thank you to the Servo team for their excellent work.
|
|
161
|
-
|
|
162
|
-
## License
|
|
163
|
-
|
|
164
|
-
MIT. Free to use both for commercial and non-commercial use.
|
justhtml-0.12.0.dist-info/RECORD
DELETED
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
justhtml/__init__.py,sha256=rsc4X1uTsJziqKtZxWQsIqwuC5DI0cvfYw5q_FtEOCo,375
|
|
2
|
-
justhtml/__main__.py,sha256=o-ur6qdyt8EmYDKjCP3waHjuvmD38j6mpAx-CJZtAXs,3876
|
|
3
|
-
justhtml/constants.py,sha256=-UATvXXQ7ueFWxJHW79c2eMmMWaSKoqwwcNIGesTAj0,11603
|
|
4
|
-
justhtml/context.py,sha256=Ac4mV-a3ZgJILQbstFu-EB6bRA5oYlSkHqpTxMlMfk0,293
|
|
5
|
-
justhtml/encoding.py,sha256=9mscoXtBb57zehG_BxzN6aTTJHaNfywk5gwxrnH92K8,11310
|
|
6
|
-
justhtml/entities.py,sha256=qLijZDS2n6Cc8UqbVZYNv2XzKVahysFBlAbyHkerD7c,9808
|
|
7
|
-
justhtml/errors.py,sha256=kt8uA9pH43PEiL2ccrV6zJMXK5iZvbtDtIhIWIzS8FU,10004
|
|
8
|
-
justhtml/node.py,sha256=NEDQzD7VBuch-BIZixNIUyZNCHydnThF_eSwXeA5OuA,19056
|
|
9
|
-
justhtml/parser.py,sha256=S_mbXd3YcNGJSZYR_34wkI1mW6SZNQw5d0_LOtjHCRM,4628
|
|
10
|
-
justhtml/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
justhtml/selector.py,sha256=_L9XmJTVJVuAZF12B_nex9iytR4a_xlkIePwUVVvHUk,32884
|
|
12
|
-
justhtml/serialize.py,sha256=Cezl3KVy9RT4sPFr6QJmtWiVjwAvwZSuBoRUP-E3yKM,8606
|
|
13
|
-
justhtml/stream.py,sha256=n8pKtVAivG0VerCWEcXSEBwzj8Tm1ltEAL7F46RGUVM,3431
|
|
14
|
-
justhtml/tokenizer.py,sha256=auDP8svHMlB0PxohVcq3VxhN3WjXUs2A8ckHN78SjKc,100623
|
|
15
|
-
justhtml/tokens.py,sha256=WNGpQ6nHQECY7J8jBvs3XAP2Lxy_xExxRMlqBn_MexI,6836
|
|
16
|
-
justhtml/treebuilder.py,sha256=Mjhyp3Ral5ACCGDi4GL-clHeg65TUrQdoiW2ZunZqNY,55606
|
|
17
|
-
justhtml/treebuilder_modes.py,sha256=VVAbHWbLq1DP3KxNYQa2m2DjJ5jEVxBufV7eSTE_gfA,94040
|
|
18
|
-
justhtml/treebuilder_utils.py,sha256=LjK9tg9sNYR-sJdXKemJCzzzgh6lQW1KBqyvhpWtaoQ,2912
|
|
19
|
-
justhtml-0.12.0.dist-info/METADATA,sha256=8Ex7MpHIhbJaGJvyOvIkEvMBS48K4z4zTlm7_nRTPkk,6922
|
|
20
|
-
justhtml-0.12.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
21
|
-
justhtml-0.12.0.dist-info/entry_points.txt,sha256=UN06mPn7J0cBM1dqyf245FvmU9mF3ivgplSr5ppdp6g,52
|
|
22
|
-
justhtml-0.12.0.dist-info/licenses/LICENSE,sha256=jM1KAJ1VQZAo7SCGVK1jtVj11zgIc5_BxZAUhXq01V8,1072
|
|
23
|
-
justhtml-0.12.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|