justhtml 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +29 -0
- justhtml/constants.py +441 -0
- justhtml/context.py +6 -0
- justhtml/entities.py +342 -0
- justhtml/errors.py +138 -0
- justhtml/node.py +208 -0
- justhtml/parser.py +86 -0
- justhtml/selector.py +925 -0
- justhtml/serialize.py +201 -0
- justhtml/stream.py +83 -0
- justhtml/tokenizer.py +2590 -0
- justhtml/tokens.py +175 -0
- justhtml/treebuilder.py +1231 -0
- justhtml/treebuilder_modes.py +2012 -0
- justhtml/treebuilder_utils.py +86 -0
- justhtml-0.6.0.dist-info/METADATA +126 -0
- justhtml-0.6.0.dist-info/RECORD +20 -0
- justhtml-0.6.0.dist-info/WHEEL +4 -0
- justhtml-0.6.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import enum
|
|
2
|
+
|
|
3
|
+
from .constants import (
|
|
4
|
+
HTML4_PUBLIC_PREFIXES,
|
|
5
|
+
LIMITED_QUIRKY_PUBLIC_PREFIXES,
|
|
6
|
+
QUIRKY_PUBLIC_MATCHES,
|
|
7
|
+
QUIRKY_PUBLIC_PREFIXES,
|
|
8
|
+
QUIRKY_SYSTEM_MATCHES,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class InsertionMode(enum.IntEnum):
    """Integer-valued identifiers for the tree builder's insertion modes.

    Members are plain ints (``IntEnum``), numbered contiguously from 0 to 21
    in declaration order.
    """

    # Document prologue and <head> handling.
    INITIAL = 0
    BEFORE_HTML = 1
    BEFORE_HEAD = 2
    IN_HEAD = 3
    IN_HEAD_NOSCRIPT = 4
    AFTER_HEAD = 5

    # Raw text and <body> handling.
    TEXT = 6
    IN_BODY = 7
    AFTER_BODY = 8
    AFTER_AFTER_BODY = 9

    # Table-related modes.
    IN_TABLE = 10
    IN_TABLE_TEXT = 11
    IN_CAPTION = 12
    IN_COLUMN_GROUP = 13
    IN_TABLE_BODY = 14
    IN_ROW = 15
    IN_CELL = 16

    # Frameset-related modes.
    IN_FRAMESET = 17
    AFTER_FRAMESET = 18
    AFTER_AFTER_FRAMESET = 19

    # Remaining element-specific modes.
    IN_SELECT = 20
    IN_TEMPLATE = 21
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def is_all_whitespace(text):
    """Return True when *text* is empty or holds only tab, LF, FF, CR or space.

    Only these five characters count as whitespace here; other characters that
    ``str.isspace`` accepts (for example vertical tab or NBSP) do not.
    """
    remainder = text.strip("\t\n\f\r ")
    # Anything left after stripping is a non-whitespace character.
    return not remainder
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def contains_prefix(haystack, needle):
    """Return True if *needle* starts with any of the prefixes in *haystack*.

    Args:
        haystack: Iterable of candidate prefix strings.
        needle: String whose beginning is tested against each prefix.

    Returns:
        True on the first matching prefix, False if none match (including
        when *haystack* is empty).
    """
    for candidate in haystack:
        if needle.startswith(candidate):
            return True
    return False
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def doctype_error_and_quirks(doctype, iframe_srcdoc=False):
    """Decide whether a DOCTYPE is a parse error and which quirks mode applies.

    Args:
        doctype: Object exposing ``name``, ``public_id``, ``system_id`` and
            ``force_quirks`` attributes.
        iframe_srcdoc: When true (and ``force_quirks`` is not set), the result
            is always ``"no-quirks"``.

    Returns:
        A ``(parse_error, quirks_mode)`` tuple. ``parse_error`` is True unless
        the (lower-cased name, public id, system id) triple is one of the
        accepted combinations listed below. ``quirks_mode`` is one of
        ``"quirks"``, ``"limited-quirks"`` or ``"no-quirks"``.

    NOTE(review): this appears to follow the WHATWG "initial" insertion mode
    rules for DOCTYPE tokens -- confirm against the spec before changing the
    order of the checks.
    """
    raw_name = doctype.name
    name = raw_name.lower() if raw_name else None
    public_id = doctype.public_id
    system_id = doctype.system_id

    # Only these exact (name, public, system) triples are accepted silently;
    # the identifiers are compared in their original case.
    acceptable = (
        ("html", None, None),
        ("html", None, "about:legacy-compat"),
        ("html", "-//W3C//DTD HTML 4.0//EN", None),
        ("html", "-//W3C//DTD HTML 4.0//EN", "http://www.w3.org/TR/REC-html40/strict.dtd"),
        ("html", "-//W3C//DTD HTML 4.01//EN", None),
        ("html", "-//W3C//DTD HTML 4.01//EN", "http://www.w3.org/TR/html4/strict.dtd"),
        ("html", "-//W3C//DTD XHTML 1.0 Strict//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"),
        ("html", "-//W3C//DTD XHTML 1.1//EN", "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"),
    )
    parse_error = (name, public_id, system_id) not in acceptable

    # Quirks detection is case-insensitive; empty identifiers count as absent.
    public_lower = public_id.lower() if public_id else None
    system_lower = system_id.lower() if system_id else None

    if doctype.force_quirks:
        return parse_error, "quirks"
    if iframe_srcdoc:
        return parse_error, "no-quirks"

    # Wrong name, or an exact match on a known quirky identifier.
    if (
        name != "html"
        or public_lower in QUIRKY_PUBLIC_MATCHES
        or system_lower in QUIRKY_SYSTEM_MATCHES
    ):
        return parse_error, "quirks"

    # Prefix-based rules only apply when a public identifier is present.
    if public_lower:
        if contains_prefix(QUIRKY_PUBLIC_PREFIXES, public_lower):
            return parse_error, "quirks"
        if contains_prefix(LIMITED_QUIRKY_PUBLIC_PREFIXES, public_lower):
            return parse_error, "limited-quirks"
        if contains_prefix(HTML4_PUBLIC_PREFIXES, public_lower):
            # These public ids are quirky only without a system identifier.
            if system_lower is None:
                return parse_error, "quirks"
            return parse_error, "limited-quirks"

    return parse_error, "no-quirks"
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: justhtml
|
|
3
|
+
Version: 0.6.0
|
|
4
|
+
Summary: A pure Python HTML5 parser that just works.
|
|
5
|
+
Project-URL: Homepage, https://github.com/emilstenstrom/justhtml
|
|
6
|
+
Project-URL: Issues, https://github.com/emilstenstrom/justhtml/issues
|
|
7
|
+
Author-email: Emil Stenström <emil@emilstenstrom.se>
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Requires-Python: >=3.10
|
|
13
|
+
Provides-Extra: benchmark
|
|
14
|
+
Requires-Dist: beautifulsoup4; extra == 'benchmark'
|
|
15
|
+
Requires-Dist: html5-parser; extra == 'benchmark'
|
|
16
|
+
Requires-Dist: html5lib; extra == 'benchmark'
|
|
17
|
+
Requires-Dist: lxml; extra == 'benchmark'
|
|
18
|
+
Requires-Dist: psutil; extra == 'benchmark'
|
|
19
|
+
Requires-Dist: selectolax; extra == 'benchmark'
|
|
20
|
+
Requires-Dist: zstandard; extra == 'benchmark'
|
|
21
|
+
Provides-Extra: dev
|
|
22
|
+
Requires-Dist: build; extra == 'dev'
|
|
23
|
+
Requires-Dist: coverage; extra == 'dev'
|
|
24
|
+
Requires-Dist: pre-commit; extra == 'dev'
|
|
25
|
+
Requires-Dist: ruff==0.14.7; extra == 'dev'
|
|
26
|
+
Requires-Dist: twine; extra == 'dev'
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# JustHTML
|
|
30
|
+
|
|
31
|
+
A pure Python HTML5 parser that just works. No C extensions to compile. No system dependencies to install. No complex API to learn.
|
|
32
|
+
|
|
33
|
+
**[📖 Read the full documentation here](docs/index.md)**
|
|
34
|
+
|
|
35
|
+
## Why use JustHTML?
|
|
36
|
+
|
|
37
|
+
### 1. Just... Correct ✅
|
|
38
|
+
It implements the official WHATWG HTML5 specification exactly. If a browser can parse it, JustHTML can parse it. It handles all the complex error-handling rules that browsers use.
|
|
39
|
+
|
|
40
|
+
- **Verified Compliance**: Passes all 8,500+ tests in the official [html5lib-tests](https://github.com/html5lib/html5lib-tests) suite (used by browser vendors).
|
|
41
|
+
- **100% Coverage**: Every line and branch of code is covered by integration tests.
|
|
42
|
+
- **Fuzz Tested**: Has parsed 3 million randomized broken HTML documents to ensure it never crashes or hangs (see benchmarks/fuzz.py).
|
|
43
|
+
- **Living Standard**: It tracks the living standard, not a snapshot from 2012.
|
|
44
|
+
|
|
45
|
+
### 2. Just... Python 🐍
|
|
46
|
+
JustHTML has **zero dependencies**. It's pure Python.
|
|
47
|
+
|
|
48
|
+
- **Just Install**: No C extensions to compile, no system libraries (like libxml2) required. Works on PyPy, WASM (Pyodide) (yes, it's in the test matrix), and anywhere Python runs.
|
|
49
|
+
- **No dependency upgrade hassle**: Some libraries depend on a large set of libraries, all of which require upgrades to avoid security issues.
|
|
50
|
+
- **Debuggable**: It's just Python code. You can step through it with a debugger to understand exactly how your HTML is being parsed.
|
|
51
|
+
- **Returns plain Python objects**: Other parsers return lxml or etree trees, which means you have another API to learn. JustHTML returns a set of nested objects you can iterate over. Simple.
|
|
52
|
+
|
|
53
|
+
### 3. Just... Query 🔍
|
|
54
|
+
Find elements with CSS selectors. Just one method to learn - `query()` - and it uses CSS syntax you already know.
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
doc.query("div.container > p.intro") # Familiar CSS syntax
|
|
58
|
+
doc.query("#main, .sidebar") # Selector groups
|
|
59
|
+
doc.query("li:nth-child(2n+1)") # Pseudo-classes
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### 4. Just... Fast Enough ⚡
|
|
63
|
+
|
|
64
|
+
If you need to parse terabytes of data, use a C or Rust parser (like `html5ever`). They are 10x-20x faster.
|
|
65
|
+
|
|
66
|
+
But for most use cases, JustHTML is **fast enough**. It parses the Wikipedia homepage in ~0.1s. It is the fastest pure-Python HTML5 parser available, outperforming `html5lib` and `BeautifulSoup`.
|
|
67
|
+
|
|
68
|
+
## Comparison to other parsers
|
|
69
|
+
|
|
70
|
+
| Parser | HTML5 Compliance | Pure Python? | Speed | Query API | Notes |
|
|
71
|
+
|--------|:----------------:|:------------:|-------|-----------|-------|
|
|
72
|
+
| **JustHTML** | ✅ **100%** | ✅ Yes | ⚡ Fast | ✅ CSS selectors | It just works. Correct, easy to install, and fast enough. |
|
|
73
|
+
| `html5lib` | 🟡 88% | ✅ Yes | 🐢 Slow | ❌ None | The reference implementation. Very correct but quite slow. |
|
|
74
|
+
| `html5_parser` | 🟡 84% | ❌ No | 🚀 Very Fast | 🟡 XPath (lxml) | C-based (Gumbo). Fast and mostly correct. |
|
|
75
|
+
| `selectolax` | 🟡 68% | ❌ No | 🚀 Very Fast | ✅ CSS selectors | C-based (Lexbor). Very fast but less compliant. |
|
|
76
|
+
| `BeautifulSoup` | 🔴 4% | ✅ Yes | 🐢 Slow | 🟡 Custom API | Wrapper around `html.parser`. Not spec compliant. |
|
|
77
|
+
| `html.parser` | 🔴 4% | ✅ Yes | ⚡ Fast | ❌ None | Standard library. Chokes on malformed HTML. |
|
|
78
|
+
| `lxml` | 🔴 1% | ❌ No | 🚀 Very Fast | 🟡 XPath | C-based (libxml2). Fast but not HTML5 compliant. |
|
|
79
|
+
|
|
80
|
+
*Compliance scores from running the [html5lib-tests](https://github.com/html5lib/html5lib-tests) suite (1,743 tree-construction tests). See `benchmarks/correctness.py`.*
|
|
81
|
+
|
|
82
|
+
## Installation
|
|
83
|
+
|
|
84
|
+
Requires Python 3.10 or later.
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
pip install justhtml
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Quick Example
|
|
91
|
+
|
|
92
|
+
```python
|
|
93
|
+
from justhtml import JustHTML
|
|
94
|
+
|
|
95
|
+
doc = JustHTML("<html><body><p class='intro'>Hello!</p></body></html>")
|
|
96
|
+
|
|
97
|
+
# Query with CSS selectors
|
|
98
|
+
for p in doc.query("p.intro"):
|
|
99
|
+
print(p.name) # "p"
|
|
100
|
+
print(p.attrs) # {"class": "intro"}
|
|
101
|
+
print(p.to_html()) # <p class="intro">Hello!</p>
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
See the **[Quickstart Guide](docs/quickstart.md)** for more examples including tree traversal, streaming, and strict mode.
|
|
105
|
+
|
|
106
|
+
## Command Line
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
# Pretty-print an HTML file
|
|
110
|
+
python -m justhtml index.html
|
|
111
|
+
|
|
112
|
+
# Parse from stdin
|
|
113
|
+
curl -s https://example.com | python -m justhtml -
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## Contributing
|
|
117
|
+
|
|
118
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines.
|
|
119
|
+
|
|
120
|
+
## Acknowledgments
|
|
121
|
+
|
|
122
|
+
JustHTML started as a Python port of [html5ever](https://github.com/servo/html5ever), the HTML5 parser from Mozilla's Servo browser engine. While the codebase has since evolved significantly, html5ever's clean architecture and spec-compliant approach were invaluable as a starting point. Thank you to the Servo team for their excellent work.
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
MIT. Free to use both for commercial and non-commercial use.
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
justhtml/__init__.py,sha256=rsc4X1uTsJziqKtZxWQsIqwuC5DI0cvfYw5q_FtEOCo,375
|
|
2
|
+
justhtml/__main__.py,sha256=VEaIacoiUvKNXA1HxePLsKaSEP44x7fCdTFJ8wKwnG4,577
|
|
3
|
+
justhtml/constants.py,sha256=gmQ2Jtujtj06oivuJ_K_JyqFWRNWsaT-izvPTuA5B2U,11526
|
|
4
|
+
justhtml/context.py,sha256=tiR5UKj1VUuq5_1W_CRAVGl06QNlh_ecy4GejG58IFo,184
|
|
5
|
+
justhtml/entities.py,sha256=g26GxmmS7BHhogkskzTlSIihKrutlMu_SX1EdE2yvJY,9644
|
|
6
|
+
justhtml/errors.py,sha256=_ITld5kWoedZPZ7sPqogJwSVW6d_pdXSmkEMhZlUMVg,9942
|
|
7
|
+
justhtml/node.py,sha256=_QmkO4UveAu_PtbxX4TIPxsF6zo5aOwXD34yqqr64YQ,6026
|
|
8
|
+
justhtml/parser.py,sha256=W1CduxrWYUgoS-HgJOwRdAar_hsV9NvtMbP76hDO4Sk,3110
|
|
9
|
+
justhtml/selector.py,sha256=E7ZOBiVHWdmSDQG64-c2SOzDtmMOijlN8tIisZWChs4,30896
|
|
10
|
+
justhtml/serialize.py,sha256=GSmevQjm2GIECYgcUCx7Ki0LrtNaV4l8p9DgD_Jdv_Y,6489
|
|
11
|
+
justhtml/stream.py,sha256=jB5cNNS4J2463E9aAvpKfaZwv8wmzghrrvjoMsSUMGY,2764
|
|
12
|
+
justhtml/tokenizer.py,sha256=1q7vXaJ1_Wf48b3Mc23WC3-2KfDlmXRk1iBFwDMKqCs,97728
|
|
13
|
+
justhtml/tokens.py,sha256=gqoMAFlti-P0mhuHORwDsiWYi3VTHCdLexlUGY6At3E,5872
|
|
14
|
+
justhtml/treebuilder.py,sha256=WFriVhIPngpUcg4zQcmXztkl74jjkbr-7renrcLRiqI,52365
|
|
15
|
+
justhtml/treebuilder_modes.py,sha256=H1TRl6qTEk-N5IzbSH-ExZcp_MlH2O_LBrzcWa30Z8k,93009
|
|
16
|
+
justhtml/treebuilder_utils.py,sha256=O-uGSNhz2M1oxTMu3I8rPtVaZlU1zc9veqbMbioiqmk,2636
|
|
17
|
+
justhtml-0.6.0.dist-info/METADATA,sha256=Xk0gTzxQCaIpaXaBlW2ZlWZVjOYIN9rfxAM7vLyvnNE,5802
|
|
18
|
+
justhtml-0.6.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
19
|
+
justhtml-0.6.0.dist-info/licenses/LICENSE,sha256=jM1KAJ1VQZAo7SCGVK1jtVj11zgIc5_BxZAUhXq01V8,1072
|
|
20
|
+
justhtml-0.6.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Emil Stenström
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|