html2pic 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
html2pic/__init__.py ADDED
@@ -0,0 +1,35 @@
1
+ """
2
+ html2pic: Convert HTML + CSS to images using PicTex
3
+
4
+ A Python library that translates a subset of HTML and CSS to beautiful images
5
+ without requiring a browser engine. Built on top of PicTex for high-quality
6
+ rendering with flexbox-like layout support.
7
+
8
+ Example:
9
+ ```python
10
+ from html2pic import Html2Pic
11
+
12
+ html = '<div class="card"><h1>Hello World</h1></div>'
13
+ css = '.card { padding: 20px; background: blue; color: white; }'
14
+
15
+ renderer = Html2Pic(html, css)
16
+ image = renderer.render()
17
+ image.save("output.png")
18
+ ```
19
+ """
20
+
21
+ from .core import Html2Pic
22
+ from .exceptions import Html2PicError, ParseError, RenderError
23
+ from .warnings_system import (
24
+ get_warning_collector, reset_warnings, WarningCategory,
25
+ Html2PicWarning, UnsupportedFeatureWarning, StyleApplicationWarning,
26
+ TranslationWarning, ParsingWarning
27
+ )
28
+
29
# Version of the html2pic package.
__version__ = "0.1.1"

# Public API of the package: every name listed here is imported above.
__all__ = [
    # Main entry point
    "Html2Pic",
    # Exceptions
    "Html2PicError",
    "ParseError",
    "RenderError",
    # Warning system helpers
    "get_warning_collector",
    "reset_warnings",
    "WarningCategory",
    # Warning classes
    "Html2PicWarning",
    "UnsupportedFeatureWarning",
    "StyleApplicationWarning",
    "TranslationWarning",
    "ParsingWarning",
]
html2pic/core.py ADDED
@@ -0,0 +1,185 @@
1
+ """
2
+ Core Html2Pic class and main API
3
+ """
4
+
5
+ from typing import Dict, Any
6
+ import pictex
7
+ from .html_parser import HtmlParser
8
+ from .css_parser import CssParser
9
+ from .style_engine import StyleEngine
10
+ from .translator import PicTexTranslator
11
+ from .exceptions import ParseError, RenderError
12
+ from .warnings_system import get_warning_collector, reset_warnings
13
+
14
class Html2Pic:
    """
    Main class for converting HTML + CSS to images using PicTex.

    This class orchestrates the entire conversion process. Each stage is
    computed lazily on first access and cached on the instance:
    1. Parse HTML into a DOM tree (``dom_tree``)
    2. Parse CSS into style rules (``style_rules``)
    3. Apply styles to DOM nodes (``styled_tree``)
    4. Translate styled DOM tree to PicTex builders
    5. Render using PicTex

    Example:
    ```python
    html = '<div class="card"><h1>Hello</h1><p>World</p></div>'
    css = '''
    .card {
        display: flex;
        flex-direction: column;
        padding: 20px;
        background: #f0f0f0;
    }
    h1 { font-size: 24px; color: blue; }
    p { color: gray; }
    '''

    renderer = Html2Pic(html, css)
    image = renderer.render()
    image.save("output.png")
    ```
    """

    def __init__(self, html: str, css: str = "", base_font_size: int = 16):
        """
        Initialize the HTML to image converter.

        Creating an instance resets the shared warning collector, so warnings
        reported afterwards belong to this conversion.

        Args:
            html: HTML content as a string
            css: CSS content as a string
            base_font_size: Base font size for relative units (default: 16px)
        """
        self.html = html
        self.css = css
        self.base_font_size = base_font_size

        # Reset warnings BEFORE any component grabs the collector. Whether
        # reset_warnings() clears the collector in place or swaps in a new
        # one, every collaborator below then holds the post-reset collector.
        reset_warnings()

        # Initialize parsers and engines
        self.html_parser = HtmlParser()
        self.css_parser = CssParser()
        self.style_engine = StyleEngine(base_font_size=base_font_size)
        self.translator = PicTexTranslator()
        self.warnings = get_warning_collector()

        # Parsed content (lazy loaded, see the properties below)
        self._dom_tree = None
        self._style_rules = None
        self._styled_tree = None

    @property
    def dom_tree(self):
        """
        Lazily parse and return the DOM tree.

        Raises:
            ParseError: If the HTML cannot be parsed
        """
        if self._dom_tree is None:
            try:
                self._dom_tree = self.html_parser.parse(self.html)
            except ParseError:
                # HtmlParser.parse already raises ParseError with the
                # "Failed to parse HTML" prefix; do not wrap it a second time.
                raise
            except Exception as e:
                raise ParseError(f"Failed to parse HTML: {e}") from e
        return self._dom_tree

    @property
    def style_rules(self):
        """
        Lazily parse and return the CSS style rules.

        Raises:
            ParseError: If the CSS cannot be parsed
        """
        if self._style_rules is None:
            try:
                self._style_rules = self.css_parser.parse(self.css)
            except ParseError:
                # CssParser.parse already raises ParseError with the
                # "Failed to parse CSS" prefix; do not wrap it a second time.
                raise
            except Exception as e:
                raise ParseError(f"Failed to parse CSS: {e}") from e
        return self._style_rules

    @property
    def styled_tree(self):
        """
        Lazily compute and return the styled DOM tree.

        Raises:
            RenderError: If parsing or style application fails (parse
                failures are wrapped, with the original error chained)
        """
        if self._styled_tree is None:
            try:
                self._styled_tree = self.style_engine.apply_styles(
                    self.dom_tree,
                    self.style_rules
                )
            except Exception as e:
                raise RenderError(f"Failed to apply styles: {e}") from e
        return self._styled_tree

    def _translate(self):
        """
        Translate the styled tree into PicTex builders.

        Returns:
            A ``(canvas, content)`` pair. ``content`` is the translated root
            element, or an empty string when the translator produced no root
            element (empty document), so the bare canvas is rendered.
        """
        canvas, root_element = self.translator.translate(self.styled_tree)
        return canvas, ("" if root_element is None else root_element)

    def render(self, crop_mode: pictex.CropMode = pictex.CropMode.SMART) -> pictex.BitmapImage:
        """
        Render the HTML + CSS to a bitmap image.

        Args:
            crop_mode: How to crop the final image (SMART, CONTENT_BOX, or NONE)

        Returns:
            A PicTex BitmapImage object

        Raises:
            RenderError: If any step in the conversion process fails; the
                underlying error is available through ``__cause__``
        """
        try:
            canvas, content = self._translate()
            return canvas.render(content, crop_mode=crop_mode)
        except Exception as e:
            raise RenderError(f"Failed to render image: {e}") from e

    def render_as_svg(self, embed_font: bool = True) -> pictex.VectorImage:
        """
        Render the HTML + CSS to an SVG vector image.

        Args:
            embed_font: Whether to embed fonts in the SVG (default: True)

        Returns:
            A PicTex VectorImage object

        Raises:
            RenderError: If any step in the conversion process fails; the
                underlying error is available through ``__cause__``
        """
        try:
            canvas, content = self._translate()
            return canvas.render_as_svg(content, embed_font=embed_font)
        except Exception as e:
            raise RenderError(f"Failed to render SVG: {e}") from e

    def debug_info(self) -> Dict[str, Any]:
        """
        Get debugging information about the conversion process.

        Accessing the trees forces the lazy parsing/styling stages to run, so
        this can raise the same errors as the corresponding properties.

        Returns:
            Dictionary containing DOM tree, style rules, styled tree, and warnings info
        """
        return {
            "dom_tree": self.dom_tree,
            "style_rules": self.style_rules,
            "styled_tree": self.styled_tree,
            "base_font_size": self.base_font_size,
            "warnings": self.get_warnings(),
            "warnings_summary": self.get_warnings_summary()
        }

    def get_warnings(self) -> list:
        """Get all warnings from the conversion process"""
        return self.warnings.get_warnings()

    def get_warnings_summary(self) -> dict:
        """Get a summary of warnings from the conversion process"""
        return self.warnings.get_summary()

    def print_warnings(self):
        """Print a formatted summary of all warnings"""
        self.warnings.print_summary()
html2pic/css_parser.py ADDED
@@ -0,0 +1,290 @@
1
+ """
2
+ CSS parser using tinycss2 to extract style rules
3
+ """
4
+
5
+ from typing import List, Dict
6
+ import tinycss2
7
+ from .models import CSSRule, ParsedSelector, SelectorType
8
+ from .exceptions import ParseError
9
+ from .warnings_system import get_warning_collector, WarningCategory
10
+
11
class CssParser:
    """
    Parses CSS content into structured rules for the style engine.

    Uses tinycss2 for robust CSS parsing and extracts:
    - Selectors (class, id, tag)
    - Declarations (property: value pairs)
    - Specificity calculations for cascade resolution

    At-rules (``@media``, ``@import``, ...) are not supported: they are
    reported through the warning collector and skipped.
    """

    # Properties we fully support
    _SUPPORTED_PROPERTIES = frozenset({
        # Layout
        'display', 'flex-direction', 'justify-content', 'align-items', 'gap',
        # Box model
        'width', 'height', 'padding', 'padding-top', 'padding-right', 'padding-bottom', 'padding-left',
        'margin', 'margin-top', 'margin-right', 'margin-bottom', 'margin-left',
        'border', 'border-width', 'border-style', 'border-color', 'border-radius',
        # Visual
        'background-color', 'background-image', 'background-size', 'box-shadow', 'text-shadow',
        # Typography
        'color', 'font-family', 'font-size', 'font-weight', 'font-style',
        'text-align', 'line-height', 'text-decoration', 'text-wrap',
        # Positioning (absolute only)
        'position', 'left', 'top',
    })

    # Properties we partially support or have limitations (name -> limitation note)
    _PARTIALLY_SUPPORTED = {
        'background': 'Use background-color or background-image instead',
        'font': 'Use individual font properties instead',
        'border-radius': 'Percentage values supported, individual corners not yet',
        'right': 'Only left/top positioning supported with position absolute',
        'bottom': 'Only left/top positioning supported with position absolute',
    }

    # CSS <line-style> keywords accepted in the border shorthand.
    # NOTE(review): the original list was solid/dashed/dotted/none; the other
    # keywords used to be misfiled as border-color. Confirm how the translator
    # handles styles it cannot draw.
    _BORDER_STYLES = frozenset({
        'none', 'hidden', 'solid', 'dashed', 'dotted',
        'double', 'groove', 'ridge', 'inset', 'outset',
    })

    # Unit suffixes that always mark a border-width token
    _BORDER_WIDTH_UNITS = ('px', 'em', 'rem', '%')

    # Characters that may trail the numeric part of a length ("2pt", "50%")
    _UNIT_CHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ%'

    def __init__(self):
        self.warnings = get_warning_collector()

    def parse(self, css_content: str) -> List["CSSRule"]:
        """
        Parse CSS string into a list of CSS rules.

        Args:
            css_content: CSS content as string (None or blank yields no rules)

        Returns:
            List of CSSRule objects, in stylesheet order

        Raises:
            ParseError: If CSS parsing fails
        """
        if not css_content or not css_content.strip():
            return []

        try:
            # Parse CSS with tinycss2
            stylesheet = tinycss2.parse_stylesheet(css_content)

            rules = []
            for node in stylesheet:
                node_type = getattr(node, 'type', None)
                if node_type == 'qualified-rule':
                    # Selector + declarations
                    rules.extend(self._process_rule(node))
                elif node_type == 'at-rule':
                    # At-rules also carry prelude/content, so they must be
                    # told apart by type. Their content may be None
                    # (e.g. "@import ...;") or hold nested rules.
                    keyword = getattr(node, 'at_keyword', 'unknown')
                    self.warnings.warn(
                        f"CSS at-rule '@{keyword}' is not supported and was ignored",
                        WarningCategory.CSS_PARSING,
                        {'at_rule': keyword}
                    )
                elif node_type == 'error':
                    self.warnings.warn(
                        f"CSS parsing error in stylesheet: {getattr(node, 'message', 'unknown error')}",
                        WarningCategory.CSS_PARSING,
                        {'error_type': 'stylesheet_error'}
                    )
                # Comments and whitespace between rules carry no style data.

            return rules

        except Exception as e:
            raise ParseError(f"Failed to parse CSS: {e}") from e

    def _process_rule(self, rule) -> List["CSSRule"]:
        """
        Process a tinycss2 QualifiedRule into our CSSRule objects.

        Args:
            rule: tinycss2 QualifiedRule object

        Returns:
            List of CSSRule objects (one per selector if multiple selectors)
        """
        selectors = self._extract_selectors(rule.prelude)
        declarations = self._extract_declarations(rule.content)

        # Each rule gets its own copy of the declarations, so mutating one
        # rule later cannot silently change its comma-separated siblings.
        return [
            CSSRule(
                selector=selector.strip(),
                declarations=dict(declarations),
                specificity=self._calculate_specificity(selector)
            )
            for selector in selectors
        ]

    def _extract_selectors(self, prelude) -> List[str]:
        """
        Extract selector strings from rule prelude.

        Handles multiple selectors separated by commas.
        For now, we only support simple selectors.

        Returns:
            Non-empty selector strings; ['*'] if the prelude yields none
        """
        # Group the prelude tokens on top-level commas
        groups = [[]]
        for token in prelude:
            if token.type == 'literal' and token.value == ',':
                groups.append([])
            else:
                groups[-1].append(token)

        serialized = (''.join(t.serialize() for t in group).strip() for group in groups)
        selectors = [text for text in serialized if text]

        return selectors if selectors else ['*']  # Fallback to universal selector

    def _extract_declarations(self, content) -> Dict[str, str]:
        """
        Extract property: value declarations from rule content.

        The padding, margin and border shorthands are expanded into their
        longhand properties; every other property is stored as written.
        Later declarations override earlier ones.

        Args:
            content: tinycss2 rule content tokens

        Returns:
            Dictionary mapping property names to values
        """
        declarations = {}

        for item in tinycss2.parse_declaration_list(content):
            if hasattr(item, 'name') and hasattr(item, 'value'):
                # This is a Declaration
                property_name = item.name.lower()
                property_value = ''.join(token.serialize() for token in item.value).strip()

                # Warn about unsupported / partially supported properties
                self._check_unsupported_property(property_name, property_value)

                # Handle shorthand properties
                if property_name == 'padding':
                    declarations.update(
                        self._expand_padding(self._parse_shorthand_values(property_value))
                    )
                elif property_name == 'margin':
                    declarations.update(
                        self._expand_margin(self._parse_shorthand_values(property_value))
                    )
                elif property_name == 'border':
                    declarations.update(self._parse_border_shorthand(property_value))
                else:
                    declarations[property_name] = property_value
            elif hasattr(item, 'type') and item.type == 'error':
                # CSS parsing error
                self.warnings.warn(
                    f"CSS parsing error in declaration: {getattr(item, 'message', 'unknown error')}",
                    WarningCategory.CSS_PARSING,
                    {'error_type': 'declaration_error'}
                )

        return declarations

    @staticmethod
    def _split_top_level(value: str) -> List[str]:
        """
        Split a CSS value on whitespace that is not inside parentheses.

        Keeps functional values such as 'rgb(0, 0, 0)' or 'calc(1px + 2px)'
        together as one component. For values without parentheses the result
        equals ``value.split()``.
        """
        parts = []
        current = []
        depth = 0
        for ch in value:
            if ch.isspace() and depth == 0:
                if current:
                    parts.append(''.join(current))
                    current = []
                continue
            if ch == '(':
                depth += 1
            elif ch == ')' and depth > 0:
                depth -= 1
            current.append(ch)
        if current:
            parts.append(''.join(current))
        return parts

    def _parse_shorthand_values(self, value: str) -> List[str]:
        """
        Parse shorthand values like '10px 20px' into [top, right, bottom, left].

        Follows the CSS 1-to-4 value expansion; values beyond the fourth are
        ignored and an empty value yields four '0px' entries.
        """
        values = self._split_top_level(value)

        if not values:
            return ['0px'] * 4
        if len(values) == 1:
            # all sides same
            return [values[0]] * 4
        if len(values) == 2:
            # vertical, horizontal
            return [values[0], values[1], values[0], values[1]]
        if len(values) == 3:
            # top, horizontal, bottom
            return [values[0], values[1], values[2], values[1]]
        # top, right, bottom, left
        return values[:4]

    def _expand_padding(self, values: List[str]) -> Dict[str, str]:
        """Expand [top, right, bottom, left] into individual padding properties."""
        return {
            'padding-top': values[0],
            'padding-right': values[1],
            'padding-bottom': values[2],
            'padding-left': values[3]
        }

    def _expand_margin(self, values: List[str]) -> Dict[str, str]:
        """Expand [top, right, bottom, left] into individual margin properties."""
        return {
            'margin-top': values[0],
            'margin-right': values[1],
            'margin-bottom': values[2],
            'margin-left': values[3]
        }

    def _is_border_width(self, token: str) -> bool:
        """Return True if token is a length: known unit suffix or a number with optional unit."""
        if token.endswith(self._BORDER_WIDTH_UNITS):
            return True
        # Covers the unitless "0" and other units such as "2pt"; colour
        # names and "#hex" values leave nothing numeric behind.
        number = token.rstrip(self._UNIT_CHARS)
        try:
            float(number)
        except ValueError:
            return False
        return True

    def _parse_border_shorthand(self, value: str) -> Dict[str, str]:
        """
        Parse border shorthand like '1px solid black'.

        Components may appear in any order. Each one is classified as a width
        (a length), a style (a CSS line-style keyword) or, failing both, a
        colour. Functional colours such as 'rgb(0, 0, 0)' stay intact.

        Returns:
            Dictionary with any of 'border-width', 'border-style',
            'border-color' that were present in the value
        """
        declarations = {}

        for part in self._split_top_level(value):
            if self._is_border_width(part):
                declarations['border-width'] = part
            elif part in self._BORDER_STYLES:
                declarations['border-style'] = part
            else:
                # Assume it's a color
                declarations['border-color'] = part

        return declarations

    def _calculate_specificity(self, selector: str) -> int:
        """
        Calculate CSS specificity for a selector.

        Simplified specificity calculation:
        - ID: 100 points
        - Class: 10 points
        - Tag: 1 point
        - Universal (*): 0 points

        Args:
            selector: CSS selector string

        Returns:
            Specificity score as integer (1 if the selector cannot be parsed)
        """
        try:
            parsed = ParsedSelector.parse(selector)
        except Exception:
            # Fallback for complex selectors we don't support yet
            return 1

        if parsed.selector_type == SelectorType.ID:
            return 100
        if parsed.selector_type == SelectorType.CLASS:
            return 10
        if parsed.selector_type == SelectorType.TAG:
            return 1
        return 0  # UNIVERSAL

    def _check_unsupported_property(self, property_name: str, property_value: str):
        """
        Warn if a CSS property is unsupported or only partially supported.

        A property listed as partially supported always produces the
        limitation warning, even if it is also in the supported set
        (this is the case for 'border-radius').
        """
        limitation = self._PARTIALLY_SUPPORTED.get(property_name)
        if limitation is not None:
            self.warnings.warn(
                f"CSS property '{property_name}' has limited support: {limitation}",
                WarningCategory.CSS_PARSING,
                {'property': property_name, 'value': property_value, 'limitation': limitation}
            )
        elif property_name not in self._SUPPORTED_PROPERTIES:
            self.warnings.warn_unsupported_css_property(
                property_name,
                property_value,
                "Property not supported in html2pic"
            )
html2pic/exceptions.py ADDED
@@ -0,0 +1,19 @@
1
+ """
2
+ Exception classes for html2pic
3
+ """
4
+
5
class Html2PicError(Exception):
    """Common base class of every exception defined by html2pic."""


class ParseError(Html2PicError):
    """Signals that HTML or CSS input could not be parsed."""


class RenderError(Html2PicError):
    """Signals a failure while rendering through PicTex."""


class UnsupportedFeatureError(Html2PicError):
    """Signals that an unsupported HTML/CSS feature was encountered."""
html2pic/html_parser.py ADDED
@@ -0,0 +1,167 @@
1
+ """
2
+ HTML parser using BeautifulSoup to create DOM tree
3
+ """
4
+
5
+ from typing import Optional
6
+ from bs4 import BeautifulSoup, Tag, NavigableString, Comment
7
+ from .models import DOMNode, NodeType
8
+ from .exceptions import ParseError
9
+ from .warnings_system import get_warning_collector, WarningCategory
10
+
11
class HtmlParser:
    """
    Parses HTML content into our internal DOM tree representation.

    Uses BeautifulSoup under the hood for robust HTML parsing,
    then converts to our own DOMNode structure for easier processing.
    """

    # Elements that don't contribute to visual layout; they are dropped
    # together with their whole subtree.
    _SKIP_TAGS = frozenset({
        'script', 'style', 'meta', 'link', 'title', 'head',
        'base', 'noscript', 'template',
    })

    # Elements that are recognized
    _SUPPORTED_TAGS = frozenset({
        'div', 'span', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'section', 'article', 'header', 'footer', 'main', 'nav', 'aside',
        'img', 'br', 'hr', 'strong', 'em', 'b', 'i', 'u', 's',
        'ul', 'ol', 'li', 'a',
    })

    # Known elements that are kept in the tree but may not render correctly
    _UNSUPPORTED_TAGS = frozenset({
        'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot',
        'form', 'input', 'button', 'select', 'textarea', 'label',
        'video', 'audio', 'canvas', 'svg', 'iframe', 'embed', 'object',
    })

    def __init__(self):
        self.parser = "html.parser"  # Use Python's built-in parser
        self.warnings = get_warning_collector()

    def parse(self, html_content: str) -> "DOMNode":
        """
        Parse HTML string into a DOM tree.

        Args:
            html_content: HTML content as string

        Returns:
            Root DOMNode representing the document. It is a virtual element
            tagged "__root__" that holds all top-level elements and text.

        Raises:
            ParseError: If HTML parsing fails
        """
        try:
            # Parse HTML with BeautifulSoup
            soup = BeautifulSoup(html_content, self.parser)

            # Virtual root node to hold all top-level elements
            root = DOMNode(
                node_type=NodeType.ELEMENT,
                tag="__root__",  # Special tag for root node
                attributes={},
                text_content="",
                children=[],
                parent=None
            )

            self._append_children(root, soup.contents)
            return root

        except Exception as e:
            raise ParseError(f"Failed to parse HTML: {e}") from e

    def _append_children(self, parent: "DOMNode", contents) -> None:
        """
        Convert BeautifulSoup child nodes and attach them to ``parent``.

        Tags are converted recursively (skipped elements are omitted).
        Non-blank text becomes a TEXT node holding the stripped text.
        Comments, doctypes, declarations, processing instructions and CDATA
        sections are ignored.

        Args:
            parent: DOMNode receiving the converted children
            contents: Iterable of BeautifulSoup nodes (``.contents`` of a tag)
        """
        # Local import so the module's import block stays untouched. These
        # are NavigableString subclasses like Comment, so without this
        # filter "<!DOCTYPE html>" becomes a visible text node "html".
        from bs4 import CData, Declaration, Doctype, ProcessingInstruction
        non_text_strings = (Comment, Doctype, Declaration, ProcessingInstruction, CData)

        for child in contents:
            if isinstance(child, Tag):
                child_node = self._convert_element(child)
                if child_node:
                    child_node.parent = parent
                    parent.children.append(child_node)
            elif isinstance(child, NavigableString) and not isinstance(child, non_text_strings):
                text_content = str(child).strip()
                if text_content:
                    parent.children.append(DOMNode(
                        node_type=NodeType.TEXT,
                        text_content=text_content,
                        parent=parent
                    ))

    def _convert_element(self, bs_element: "Tag") -> Optional["DOMNode"]:
        """
        Convert a BeautifulSoup Tag to our DOMNode.

        Args:
            bs_element: BeautifulSoup Tag element

        Returns:
            DOMNode or None if element should be skipped
        """
        tag_name = bs_element.name

        # Skip script, style, and other non-visual elements
        if self._should_skip_element(tag_name):
            self.warnings.warn(
                f"Skipping non-visual element '<{tag_name}>'",
                WarningCategory.HTML_PARSING,
                {'tag': tag_name, 'reason': 'non-visual element'}
            )
            return None

        # Warn about elements we do not explicitly support; they are still
        # converted and kept in the tree.
        if tag_name not in self._SUPPORTED_TAGS:
            if tag_name in self._UNSUPPORTED_TAGS:
                self.warnings.warn_unsupported_html_tag(
                    tag_name,
                    "May not render correctly - consider using div with appropriate styling"
                )
            else:
                # Completely unrecognized element
                self.warnings.warn_unsupported_html_tag(
                    tag_name,
                    "Unrecognized HTML element - will be treated as a div container"
                )

        # Create element node
        node = DOMNode(
            node_type=NodeType.ELEMENT,
            tag=tag_name,
            attributes=dict(bs_element.attrs) if bs_element.attrs else {},
            text_content="",
            children=[],
            parent=None
        )

        # Process children
        self._append_children(node, bs_element.contents)

        return node

    def _should_skip_element(self, tag_name: str) -> bool:
        """
        Determine if an HTML element should be skipped during parsing.

        Args:
            tag_name: HTML tag name (matched case-insensitively)

        Returns:
            True if element should be skipped
        """
        return tag_name.lower() in self._SKIP_TAGS