justhtml 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of justhtml might be problematic. Click here for more details.
- justhtml/__init__.py +17 -0
- justhtml/__main__.py +144 -0
- justhtml/constants.py +445 -0
- justhtml/context.py +12 -0
- justhtml/encoding.py +405 -0
- justhtml/entities.py +344 -0
- justhtml/errors.py +140 -0
- justhtml/node.py +632 -0
- justhtml/parser.py +131 -0
- justhtml/py.typed +0 -0
- justhtml/selector.py +965 -0
- justhtml/serialize.py +258 -0
- justhtml/stream.py +107 -0
- justhtml/tokenizer.py +2647 -0
- justhtml/tokens.py +223 -0
- justhtml/treebuilder.py +1279 -0
- justhtml/treebuilder_modes.py +2016 -0
- justhtml/treebuilder_utils.py +93 -0
- justhtml-0.12.0.dist-info/METADATA +164 -0
- justhtml-0.12.0.dist-info/RECORD +23 -0
- justhtml-0.12.0.dist-info/WHEEL +4 -0
- justhtml-0.12.0.dist-info/entry_points.txt +2 -0
- justhtml-0.12.0.dist-info/licenses/LICENSE +21 -0
justhtml/parser.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""Minimal JustHTML parser entry point."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING, Any
|
|
6
|
+
|
|
7
|
+
from .encoding import decode_html
|
|
8
|
+
from .tokenizer import Tokenizer, TokenizerOpts
|
|
9
|
+
from .treebuilder import TreeBuilder
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from .context import FragmentContext
|
|
13
|
+
from .node import SimpleDomNode
|
|
14
|
+
from .tokens import ParseError
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class StrictModeError(SyntaxError):
    """Parse failure surfaced as an exception when strict parsing is enabled.

    Subclassing SyntaxError lets Python 3.11+ render the enhanced error
    display, highlighting the source location of the problem.

    Attributes:
        error: The ParseError this exception was built from.
    """

    error: ParseError

    def __init__(self, error: ParseError) -> None:
        """Build the exception from *error*, mirroring its SyntaxError form."""
        self.error = error
        # The ParseError knows how to render itself as a SyntaxError; reuse
        # that rendering for both the message and the location details.
        template = error.as_exception()
        super().__init__(template.msg)
        # Location attributes that every SyntaxError carries.
        for attr in ("filename", "lineno", "offset", "text"):
            setattr(self, attr, getattr(template, attr))
        # End positions may be absent on the template; default them to None.
        for attr in ("end_lineno", "end_offset"):
            setattr(self, attr, getattr(template, attr, None))
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class JustHTML:
    """Parse HTML on construction and expose the resulting tree.

    Creating an instance runs the whole pipeline: bytes-like input is decoded,
    the text is tokenized, and the tree builder produces ``root``. The public
    methods are thin wrappers that delegate to ``root``.

    Attributes:
        debug: The ``debug`` flag passed to the constructor, coerced to bool.
        encoding: Encoding reported by ``decode_html`` for bytes-like input;
            ``None`` when the input was text or ``None``.
        errors: Tokenizer errors followed by tree-builder errors.
        fragment_context: The fragment context passed to the constructor.
        root: The node returned by ``tree_builder.finish()``.
        tokenizer: The tokenizer that processed the input.
        tree_builder: The tree builder that received the tokens.
    """

    __slots__ = ("debug", "encoding", "errors", "fragment_context", "root", "tokenizer", "tree_builder")

    debug: bool
    encoding: str | None
    errors: list[ParseError]
    fragment_context: FragmentContext | None
    root: SimpleDomNode
    tokenizer: Tokenizer
    tree_builder: TreeBuilder

    def __init__(
        self,
        html: str | bytes | bytearray | memoryview | None,
        *,
        collect_errors: bool = False,
        debug: bool = False,
        encoding: str | None = None,
        fragment_context: FragmentContext | None = None,
        iframe_srcdoc: bool = False,
        strict: bool = False,
        tokenizer_opts: TokenizerOpts | None = None,
        tree_builder: TreeBuilder | None = None,
    ) -> None:
        """Parse *html* and populate ``root`` and ``errors``.

        Args:
            html: Markup to parse. Bytes-like input is decoded with
                ``decode_html``; ``None`` is treated as an empty document.
            collect_errors: Ask the tokenizer and the default tree builder to
                collect parse errors.
            debug: Stored on the instance as ``self.debug``.
            encoding: Passed to ``decode_html`` as ``transport_encoding``.
                Only used for bytes-like input.
            fragment_context: Context element for fragment parsing.
            iframe_srcdoc: Forwarded to the default tree builder.
            strict: Collect errors and raise if any were recorded.
            tokenizer_opts: Tokenizer options. Never mutated: when a fragment
                context needs a different initial state, a copy is adjusted.
            tree_builder: Tree builder to use instead of the default one. The
                ``fragment_context``, ``iframe_srcdoc`` and error-collection
                settings are only applied when the default builder is created.

        Raises:
            StrictModeError: If ``strict`` is true and at least one parse
                error was recorded. The first entry of ``errors`` is reported.
        """
        self.debug = bool(debug)
        self.fragment_context = fragment_context
        self.encoding = None

        html_str: str
        if isinstance(html, (bytes, bytearray, memoryview)):
            html_str, chosen = decode_html(bytes(html), transport_encoding=encoding)
            self.encoding = chosen
        elif html is not None:
            html_str = str(html)
        else:
            html_str = ""

        # Strict mode needs the errors in order to raise, so it implies collection.
        should_collect = collect_errors or strict

        # Compare against None so a caller-supplied builder is never discarded
        # merely because it happens to evaluate as falsy.
        if tree_builder is None:
            tree_builder = TreeBuilder(
                fragment_context=fragment_context,
                iframe_srcdoc=iframe_srcdoc,
                collect_errors=should_collect,
            )
        self.tree_builder = tree_builder

        opts = self._tokenizer_opts_for(tokenizer_opts, fragment_context)

        self.tokenizer = Tokenizer(self.tree_builder, opts, collect_errors=should_collect)
        # Link tokenizer to tree_builder for position info
        self.tree_builder.tokenizer = self.tokenizer

        self.tokenizer.run(html_str)
        self.root = self.tree_builder.finish()

        # Merge errors from both tokenizer and tree builder
        self.errors = self.tokenizer.errors + self.tree_builder.errors

        # In strict mode, raise on the first entry of the merged list
        if strict and self.errors:
            raise StrictModeError(self.errors[0])

    @staticmethod
    def _tokenizer_opts_for(
        tokenizer_opts: TokenizerOpts | None,
        fragment_context: FragmentContext | None,
    ) -> TokenizerOpts:
        """Return the tokenizer options to use, adjusted for *fragment_context*.

        The caller's *tokenizer_opts* object is returned unchanged when no
        adjustment is needed, and is copied (never mutated) otherwise.
        """
        opts = TokenizerOpts() if tokenizer_opts is None else tokenizer_opts

        # Only fragment contexts without a namespace select a special initial state.
        if not fragment_context or fragment_context.namespace:
            return opts

        tag_name = fragment_context.tag_name.lower()
        if tag_name in ("textarea", "title", "style"):
            initial_state = Tokenizer.RAWTEXT
            rawtext_tag = tag_name
        elif tag_name in ("plaintext", "script"):
            initial_state = Tokenizer.PLAINTEXT
            rawtext_tag = None
        else:
            return opts

        if tokenizer_opts is not None:
            # Work on a shallow copy so an options object the caller shares
            # between parses does not keep this fragment's initial state.
            import copy

            opts = copy.copy(tokenizer_opts)

        opts.initial_state = initial_state
        if rawtext_tag is not None:
            opts.initial_rawtext_tag = rawtext_tag
        return opts

    def query(self, selector: str) -> list[Any]:
        """Query the document using a CSS selector. Delegates to root.query()."""
        return self.root.query(selector)

    def to_html(self, pretty: bool = True, indent_size: int = 2) -> str:
        """Serialize the document to HTML. Delegates to root.to_html()."""
        return self.root.to_html(indent=0, indent_size=indent_size, pretty=pretty)

    def to_text(self, separator: str = " ", strip: bool = True) -> str:
        """Return the document's concatenated text.

        Delegates to `root.to_text(separator=..., strip=...)`.
        """
        return self.root.to_text(separator=separator, strip=strip)

    def to_markdown(self) -> str:
        """Return a GitHub Flavored Markdown representation.

        Delegates to `root.to_markdown()`.
        """
        return self.root.to_markdown()
|
justhtml/py.typed
ADDED
|
File without changes
|