justhtml 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of justhtml might be problematic. Click here for more details.

justhtml/parser.py ADDED
@@ -0,0 +1,131 @@
1
+ """Minimal JustHTML parser entry point."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import TYPE_CHECKING, Any
6
+
7
+ from .encoding import decode_html
8
+ from .tokenizer import Tokenizer, TokenizerOpts
9
+ from .treebuilder import TreeBuilder
10
+
11
+ if TYPE_CHECKING:
12
+ from .context import FragmentContext
13
+ from .node import SimpleDomNode
14
+ from .tokens import ParseError
15
+
16
+
17
class StrictModeError(SyntaxError):
    """Exception raised when strict-mode parsing hits a parse error.

    Subclasses SyntaxError so that Python 3.11+ renders the failure with its
    enhanced display, including source location highlighting.

    Attributes:
        error: The ParseError this exception was created from.
    """

    error: ParseError

    def __init__(self, error: ParseError) -> None:
        """Wrap *error*, mirroring the SyntaxError it produces.

        Args:
            error: Parse error to wrap; its ``as_exception()`` result supplies
                the message and the source-location attributes.
        """
        self.error = error
        # The ParseError knows how to render itself as a SyntaxError; reuse
        # that rendering instead of rebuilding message and location here.
        template = error.as_exception()
        super().__init__(template.msg)
        # These location fields are read directly from the template.
        for field in ("filename", "lineno", "offset", "text"):
            setattr(self, field, getattr(template, field))
        # End positions fall back to None when the template lacks them.
        for field in ("end_lineno", "end_offset"):
            setattr(self, field, getattr(template, field, None))
38
+
39
+
40
class JustHTML:
    """Parse HTML on construction and expose the resulting tree.

    Instantiating the class runs the whole pipeline: bytes-like input is
    decoded, the tokenizer is run over the text, the tree builder is
    finished, and the errors reported by both stages are merged.

    Attributes:
        debug: The ``debug`` flag coerced to ``bool``; only stored here.
        encoding: Encoding chosen by ``decode_html`` for bytes-like input,
            otherwise ``None``.
        errors: ``tokenizer.errors`` followed by ``tree_builder.errors``.
        fragment_context: The fragment context passed to the constructor.
        root: The node returned by ``tree_builder.finish()``.
        tokenizer: The Tokenizer that processed the input.
        tree_builder: The TreeBuilder that received the tokens.
    """

    __slots__ = ("debug", "encoding", "errors", "fragment_context", "root", "tokenizer", "tree_builder")

    debug: bool
    encoding: str | None
    errors: list[ParseError]
    fragment_context: FragmentContext | None
    root: SimpleDomNode
    tokenizer: Tokenizer
    tree_builder: TreeBuilder

    def __init__(
        self,
        html: str | bytes | bytearray | memoryview | None,
        *,
        collect_errors: bool = False,
        debug: bool = False,
        encoding: str | None = None,
        fragment_context: FragmentContext | None = None,
        iframe_srcdoc: bool = False,
        strict: bool = False,
        tokenizer_opts: TokenizerOpts | None = None,
        tree_builder: TreeBuilder | None = None,
    ) -> None:
        """Parse *html* and populate ``root`` and ``errors``.

        Args:
            html: Markup to parse. Bytes-like input is decoded with
                ``decode_html``; ``None`` is treated as an empty document;
                anything else is converted with ``str()``.
            collect_errors: Ask the tokenizer and tree builder to collect
                parse errors.
            debug: Stored on the instance as a ``bool``.
            encoding: Forwarded to ``decode_html`` as ``transport_encoding``
                for bytes-like input; unused for other input.
            fragment_context: Context for fragment parsing, if any.
            iframe_srcdoc: Forwarded to the default TreeBuilder.
            strict: Raise on the first collected error. Implies error
                collection.
            tokenizer_opts: Tokenizer options. The object passed in is never
                modified; a copy is adjusted when the fragment context
                requires a different initial tokenizer state.
            tree_builder: Tree builder to use instead of a default one.

        Raises:
            StrictModeError: If *strict* is true and any error was collected.
        """
        self.debug = bool(debug)
        self.fragment_context = fragment_context
        self.encoding = None

        html_str: str
        if isinstance(html, (bytes, bytearray, memoryview)):
            html_str, chosen = decode_html(bytes(html), transport_encoding=encoding)
            self.encoding = chosen
        elif html is not None:
            html_str = str(html)
        else:
            html_str = ""

        # Strict mode needs the errors in order to raise on them.
        should_collect = collect_errors or strict

        # Compare against None rather than relying on truthiness so that a
        # caller-supplied builder is never silently replaced.
        if tree_builder is None:
            tree_builder = TreeBuilder(
                fragment_context=fragment_context,
                iframe_srcdoc=iframe_srcdoc,
                collect_errors=should_collect,
            )
        self.tree_builder = tree_builder

        opts = self._resolve_tokenizer_opts(tokenizer_opts, fragment_context)

        self.tokenizer = Tokenizer(self.tree_builder, opts, collect_errors=should_collect)
        # Link tokenizer to tree_builder for position info
        self.tree_builder.tokenizer = self.tokenizer

        self.tokenizer.run(html_str)
        self.root = self.tree_builder.finish()

        # Merge errors from both stages: tokenizer errors first, then the
        # tree builder's.
        self.errors = self.tokenizer.errors + self.tree_builder.errors

        # NOTE(review): "first" means first in the merged list above, which
        # may not be the earliest error by input position - confirm intent.
        if strict and self.errors:
            raise StrictModeError(self.errors[0])

    @staticmethod
    def _resolve_tokenizer_opts(
        tokenizer_opts: TokenizerOpts | None,
        fragment_context: FragmentContext | None,
    ) -> TokenizerOpts:
        """Return the tokenizer options to use for *fragment_context*.

        A caller-supplied options object is shallow-copied before any field
        is overridden, so reusing one options object across parses does not
        leak a fragment-specific initial state into later parses.
        """
        overrides: dict[str, Any] = {}
        # Only fragment contexts without a namespace alter the initial state.
        if fragment_context and not fragment_context.namespace:
            tag_name = fragment_context.tag_name.lower()
            if tag_name in ("textarea", "title", "style"):
                overrides = {
                    "initial_state": Tokenizer.RAWTEXT,
                    "initial_rawtext_tag": tag_name,
                }
            elif tag_name in ("plaintext", "script"):
                overrides = {"initial_state": Tokenizer.PLAINTEXT}

        if tokenizer_opts is None:
            opts = TokenizerOpts()
        elif overrides:
            import copy

            opts = copy.copy(tokenizer_opts)
        else:
            opts = tokenizer_opts

        for name, value in overrides.items():
            setattr(opts, name, value)
        return opts

    def query(self, selector: str) -> list[Any]:
        """Query the document using a CSS selector. Delegates to root.query()."""
        return self.root.query(selector)

    def to_html(self, pretty: bool = True, indent_size: int = 2) -> str:
        """Serialize the document to HTML.

        Delegates to ``root.to_html()`` with ``indent=0``, passing *pretty*
        and *indent_size* through unchanged.
        """
        return self.root.to_html(indent=0, indent_size=indent_size, pretty=pretty)

    def to_text(self, separator: str = " ", strip: bool = True) -> str:
        """Return the document's concatenated text.

        Delegates to `root.to_text(separator=..., strip=...)`.
        """
        return self.root.to_text(separator=separator, strip=strip)

    def to_markdown(self) -> str:
        """Return a GitHub Flavored Markdown representation.

        Delegates to `root.to_markdown()`.
        """
        return self.root.to_markdown()
justhtml/py.typed ADDED
File without changes