justhtml 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
justhtml/parser.py ADDED
@@ -0,0 +1,86 @@
1
+ """Minimal JustHTML parser entry point."""
2
+
3
+ from .tokenizer import Tokenizer, TokenizerOpts
4
+ from .treebuilder import TreeBuilder
5
+
6
+
7
class StrictModeError(SyntaxError):
    """Exception raised by strict mode when a parse error is found.

    Subclasses SyntaxError so that Python 3.11+ can render the failure with
    its enhanced, source-highlighting error display.
    """

    def __init__(self, error):
        """Wrap a parse error.

        ``error`` is kept on ``self.error``. It must provide an
        ``as_exception()`` method returning a SyntaxError-like object; that
        object's message and location fields are mirrored onto this instance.
        """
        self.error = error
        # Let the parse error build its own SyntaxError-shaped description.
        source = error.as_exception()
        super().__init__(source.msg)

        # Location fields that must be present on the source exception.
        for field in ("filename", "lineno", "offset", "text"):
            setattr(self, field, getattr(source, field))

        # End-of-range fields may be absent on the source exception; fall
        # back to None in that case.
        for field in ("end_lineno", "end_offset"):
            setattr(self, field, getattr(source, field, None))
26
+
27
+
28
class JustHTML:
    """Parse an HTML string and hold the resulting tree.

    Constructing an instance runs the whole pipeline: a tokenizer is wired to
    a tree builder, the input is tokenized, and the finished tree is stored
    on ``root``. Errors reported by both stages are gathered on ``errors``.
    """

    __slots__ = ("debug", "errors", "fragment_context", "root", "tokenizer", "tree_builder")

    def __init__(
        self,
        html,
        *,
        collect_errors=False,
        debug=False,
        fragment_context=None,
        iframe_srcdoc=False,
        strict=False,
        tokenizer_opts=None,
        tree_builder=None,
    ):
        """Parse ``html`` immediately.

        Args:
            html: Markup to parse; a falsy value is parsed as the empty string.
            collect_errors: Forwarded to the tokenizer and to the default
                tree builder. Forced on when ``strict`` is true.
            debug: Stored on ``self.debug`` as a bool.
            fragment_context: Optional fragment context. Its ``namespace``
                and ``tag_name`` attributes are read here to choose the
                tokenizer's initial state.
            iframe_srcdoc: Forwarded to the default tree builder.
            strict: When true, raise StrictModeError if any error was
                collected.
            tokenizer_opts: Optional tokenizer options. The object passed in
                is never modified; if a fragment context requires a different
                initial state, a shallow copy is adjusted instead.
            tree_builder: Optional pre-built tree builder. When supplied it
                is used as-is: ``fragment_context``, ``iframe_srcdoc`` and
                the error-collection setting are not applied to it here.

        Raises:
            StrictModeError: In strict mode, wrapping ``self.errors[0]``.
        """
        self.debug = bool(debug)
        self.fragment_context = fragment_context

        # Strict mode needs the errors to exist, so it implies collection.
        should_collect = collect_errors or strict

        self.tree_builder = tree_builder or TreeBuilder(
            fragment_context=fragment_context,
            iframe_srcdoc=iframe_srcdoc,
            collect_errors=should_collect,
        )
        opts = tokenizer_opts or TokenizerOpts()

        # Fragments rooted in certain non-namespaced elements must start the
        # tokenizer in a raw-text style state instead of the default one.
        if fragment_context and not fragment_context.namespace:
            tag_name = fragment_context.tag_name.lower()
            is_rawtext = tag_name in ("textarea", "title", "style")
            is_plaintext = tag_name in ("plaintext", "script")
            if is_rawtext or is_plaintext:
                if opts is tokenizer_opts:
                    # Do not write into the caller's options object: reusing
                    # it for a later parse would otherwise inherit this
                    # fragment's initial state. Assumes the options object
                    # supports a shallow copy.
                    import copy

                    opts = copy.copy(opts)
                if is_rawtext:
                    opts.initial_state = Tokenizer.RAWTEXT
                    opts.initial_rawtext_tag = tag_name
                else:
                    opts.initial_state = Tokenizer.PLAINTEXT

        self.tokenizer = Tokenizer(self.tree_builder, opts, collect_errors=should_collect)
        # Link tokenizer to tree_builder for position info
        self.tree_builder.tokenizer = self.tokenizer

        self.tokenizer.run(html or "")
        self.root = self.tree_builder.finish()

        # Tokenizer errors come first, followed by tree-builder errors.
        self.errors = self.tokenizer.errors + self.tree_builder.errors

        # Strict mode reports the first entry of the merged list above.
        if strict and self.errors:
            raise StrictModeError(self.errors[0])

    def query(self, selector):
        """Query the document using a CSS selector. Delegates to root.query()."""
        return self.root.query(selector)

    def to_html(self, pretty=True, indent_size=2):
        """Serialize the document to HTML. Delegates to root.to_html().

        Serialization always starts at indent level 0; ``pretty`` and
        ``indent_size`` are passed through unchanged.
        """
        return self.root.to_html(indent=0, indent_size=indent_size, pretty=pretty)