html2pic 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- html2pic/__init__.py +35 -0
- html2pic/core.py +185 -0
- html2pic/css_parser.py +290 -0
- html2pic/exceptions.py +19 -0
- html2pic/html_parser.py +167 -0
- html2pic/models.py +168 -0
- html2pic/style_engine.py +442 -0
- html2pic/translator.py +944 -0
- html2pic/warnings_system.py +192 -0
- html2pic-0.1.1.dist-info/METADATA +347 -0
- html2pic-0.1.1.dist-info/RECORD +12 -0
- html2pic-0.1.1.dist-info/WHEEL +4 -0
html2pic/__init__.py
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
"""
|
2
|
+
html2pic: Convert HTML + CSS to images using PicTex
|
3
|
+
|
4
|
+
A Python library that translates a subset of HTML and CSS to beautiful images
|
5
|
+
without requiring a browser engine. Built on top of PicTex for high-quality
|
6
|
+
rendering with flexbox-like layout support.
|
7
|
+
|
8
|
+
Example:
|
9
|
+
```python
|
10
|
+
from html2pic import Html2Pic
|
11
|
+
|
12
|
+
html = '<div class="card"><h1>Hello World</h1></div>'
|
13
|
+
css = '.card { padding: 20px; background: blue; color: white; }'
|
14
|
+
|
15
|
+
renderer = Html2Pic(html, css)
|
16
|
+
image = renderer.render()
|
17
|
+
image.save("output.png")
|
18
|
+
```
|
19
|
+
"""
|
20
|
+
|
21
|
+
from .core import Html2Pic
|
22
|
+
from .exceptions import Html2PicError, ParseError, RenderError
|
23
|
+
from .warnings_system import (
|
24
|
+
get_warning_collector, reset_warnings, WarningCategory,
|
25
|
+
Html2PicWarning, UnsupportedFeatureWarning, StyleApplicationWarning,
|
26
|
+
TranslationWarning, ParsingWarning
|
27
|
+
)
|
28
|
+
|
29
|
+
# Package version string.
__version__ = "0.1.1"

# Public API of the package: everything imported above is re-exported here.
__all__ = [
    # Main entry point
    "Html2Pic",
    # Exceptions
    "Html2PicError",
    "ParseError",
    "RenderError",
    # Warning collector helpers
    "get_warning_collector",
    "reset_warnings",
    "WarningCategory",
    # Warning classes
    "Html2PicWarning",
    "UnsupportedFeatureWarning",
    "StyleApplicationWarning",
    "TranslationWarning",
    "ParsingWarning",
]
|
html2pic/core.py
ADDED
@@ -0,0 +1,185 @@
|
|
1
|
+
"""
|
2
|
+
Core Html2Pic class and main API
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import Dict, Any
|
6
|
+
import pictex
|
7
|
+
from .html_parser import HtmlParser
|
8
|
+
from .css_parser import CssParser
|
9
|
+
from .style_engine import StyleEngine
|
10
|
+
from .translator import PicTexTranslator
|
11
|
+
from .exceptions import ParseError, RenderError
|
12
|
+
from .warnings_system import get_warning_collector, reset_warnings
|
13
|
+
|
14
|
+
class Html2Pic:
    """
    Main class for converting HTML + CSS to images using PicTex.

    This class orchestrates the entire conversion process:
    1. Parse HTML into a DOM tree
    2. Parse CSS into style rules
    3. Apply styles to DOM nodes (cascading, specificity, inheritance)
    4. Translate styled DOM tree to PicTex builders
    5. Render using PicTex

    Steps 1-3 are computed lazily and cached on first access, so rendering
    the same instance twice does not re-parse the inputs.

    Example:
        ```python
        html = '<div class="card"><h1>Hello</h1><p>World</p></div>'
        css = '''
            .card {
                display: flex;
                flex-direction: column;
                padding: 20px;
                background: #f0f0f0;
            }
            h1 { font-size: 24px; color: blue; }
            p { color: gray; }
        '''

        renderer = Html2Pic(html, css)
        image = renderer.render()
        image.save("output.png")
        ```
    """

    def __init__(self, html: str, css: str = "", base_font_size: int = 16):
        """
        Initialize the HTML to image converter.

        Args:
            html: HTML content as a string
            css: CSS content as a string
            base_font_size: Base font size for relative units (default: 16px)
        """
        self.html = html
        self.css = css
        self.base_font_size = base_font_size

        # Reset warnings for this new instance *before* anything asks for the
        # collector. HtmlParser and CssParser grab the collector in their
        # constructors, so resetting afterwards could leave them (and
        # self.warnings) pointing at a collector that is no longer current.
        reset_warnings()

        # Initialize parsers and engines
        self.html_parser = HtmlParser()
        self.css_parser = CssParser()
        self.style_engine = StyleEngine(base_font_size=base_font_size)
        self.translator = PicTexTranslator()
        self.warnings = get_warning_collector()

        # Parsed content (lazy loaded)
        self._dom_tree = None
        self._style_rules = None
        self._styled_tree = None

    @property
    def dom_tree(self):
        """
        Lazily parse and return the DOM tree.

        Raises:
            ParseError: If the HTML parser raises any exception
        """
        if self._dom_tree is None:
            try:
                self._dom_tree = self.html_parser.parse(self.html)
            except Exception as e:
                raise ParseError(f"Failed to parse HTML: {e}") from e
        return self._dom_tree

    @property
    def style_rules(self):
        """
        Lazily parse and return the CSS style rules.

        Raises:
            ParseError: If the CSS parser raises any exception
        """
        if self._style_rules is None:
            try:
                self._style_rules = self.css_parser.parse(self.css)
            except Exception as e:
                raise ParseError(f"Failed to parse CSS: {e}") from e
        return self._style_rules

    @property
    def styled_tree(self):
        """
        Lazily compute and return the styled DOM tree.

        Raises:
            RenderError: If styling fails. Because ``dom_tree`` and
                ``style_rules`` are evaluated inside the same ``try``, a
                ParseError from either of them is also re-raised as a
                RenderError (with the ParseError chained as its cause).
        """
        if self._styled_tree is None:
            try:
                self._styled_tree = self.style_engine.apply_styles(
                    self.dom_tree,
                    self.style_rules
                )
            except Exception as e:
                raise RenderError(f"Failed to apply styles: {e}") from e
        return self._styled_tree

    def _build_render_inputs(self):
        """
        Translate the styled tree and return ``(canvas, content)``.

        ``content`` is the translated root element, or an empty string when
        the translator reports no root element (empty document), so callers
        can pass it straight to the canvas.
        """
        canvas, root_element = self.translator.translate(self.styled_tree)
        content = "" if root_element is None else root_element
        return canvas, content

    def render(self, crop_mode: pictex.CropMode = pictex.CropMode.SMART) -> pictex.BitmapImage:
        """
        Render the HTML + CSS to a bitmap image.

        Args:
            crop_mode: How to crop the final image (SMART, CONTENT_BOX, or NONE)

        Returns:
            A PicTex BitmapImage object

        Raises:
            RenderError: If any step in the conversion process fails; the
                original exception is chained as the cause
        """
        try:
            canvas, content = self._build_render_inputs()
            return canvas.render(content, crop_mode=crop_mode)
        except Exception as e:
            raise RenderError(f"Failed to render image: {e}") from e

    def render_as_svg(self, embed_font: bool = True) -> pictex.VectorImage:
        """
        Render the HTML + CSS to an SVG vector image.

        Args:
            embed_font: Whether to embed fonts in the SVG (default: True)

        Returns:
            A PicTex VectorImage object

        Raises:
            RenderError: If any step in the conversion process fails; the
                original exception is chained as the cause
        """
        try:
            canvas, content = self._build_render_inputs()
            return canvas.render_as_svg(content, embed_font=embed_font)
        except Exception as e:
            raise RenderError(f"Failed to render SVG: {e}") from e

    def debug_info(self) -> Dict[str, Any]:
        """
        Get debugging information about the conversion process.

        Accessing the lazy properties here triggers parsing and styling if
        they have not run yet, so this can raise the same errors they do.

        Returns:
            Dictionary containing DOM tree, style rules, styled tree, and warnings info
        """
        return {
            "dom_tree": self.dom_tree,
            "style_rules": self.style_rules,
            "styled_tree": self.styled_tree,
            "base_font_size": self.base_font_size,
            "warnings": self.get_warnings(),
            "warnings_summary": self.get_warnings_summary()
        }

    def get_warnings(self) -> list:
        """Get all warnings from the conversion process"""
        return self.warnings.get_warnings()

    def get_warnings_summary(self) -> dict:
        """Get a summary of warnings from the conversion process"""
        return self.warnings.get_summary()

    def print_warnings(self):
        """Print a formatted summary of all warnings"""
        self.warnings.print_summary()
|
html2pic/css_parser.py
ADDED
@@ -0,0 +1,290 @@
|
|
1
|
+
"""
|
2
|
+
CSS parser using tinycss2 to extract style rules
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import List, Dict
|
6
|
+
import tinycss2
|
7
|
+
from .models import CSSRule, ParsedSelector, SelectorType
|
8
|
+
from .exceptions import ParseError
|
9
|
+
from .warnings_system import get_warning_collector, WarningCategory
|
10
|
+
|
11
|
+
class CssParser:
    """
    Parses CSS content into structured rules for the style engine.

    Uses tinycss2 for tokenising and extracts:
    - Selectors (comma-separated lists yield one rule per selector)
    - Declarations (property: value pairs; padding, margin and border
      shorthands are expanded into their longhand properties)
    - Specificity calculations for cascade resolution
    """

    # Properties we fully support
    _SUPPORTED_PROPERTIES = frozenset({
        # Layout
        'display', 'flex-direction', 'justify-content', 'align-items', 'gap',
        # Box model
        'width', 'height', 'padding', 'padding-top', 'padding-right', 'padding-bottom', 'padding-left',
        'margin', 'margin-top', 'margin-right', 'margin-bottom', 'margin-left',
        'border', 'border-width', 'border-style', 'border-color', 'border-radius',
        # Visual
        'background-color', 'background-image', 'background-size', 'box-shadow', 'text-shadow',
        # Typography
        'color', 'font-family', 'font-size', 'font-weight', 'font-style',
        'text-align', 'line-height', 'text-decoration', 'text-wrap',
        # Positioning (absolute only)
        'position', 'left', 'top',
    })

    # Properties we partially support, mapped to the limitation to report.
    # A property listed here always produces a "limited support" warning,
    # even if it also appears in _SUPPORTED_PROPERTIES.
    _PARTIALLY_SUPPORTED = {
        'background': 'Use background-color or background-image instead',
        'font': 'Use individual font properties instead',
        'border-radius': 'Percentage values supported, individual corners not yet',
        'right': 'Only left/top positioning supported with position absolute',
        'bottom': 'Only left/top positioning supported with position absolute',
    }

    # Suffixes that mark a border shorthand component as a width.
    _BORDER_WIDTH_UNITS = ('px', 'em', 'rem', '%')

    # CSS <line-style> keywords; anything else non-numeric is taken as a colour.
    _BORDER_STYLES = frozenset({
        'none', 'hidden', 'dotted', 'dashed', 'solid',
        'double', 'groove', 'ridge', 'inset', 'outset',
    })

    def __init__(self):
        self.warnings = get_warning_collector()

    def parse(self, css_content: str) -> "List[CSSRule]":
        """
        Parse CSS string into a list of CSS rules.

        Only qualified rules (selector + declaration block) produce output.
        At-rules such as ``@import`` or ``@media`` and stylesheet-level parse
        errors are reported through the warning collector and skipped.

        Args:
            css_content: CSS content as string; ``None``, empty and
                whitespace-only input all yield an empty list

        Returns:
            List of CSSRule objects

        Raises:
            ParseError: If CSS parsing fails
        """
        if not css_content or not css_content.strip():
            return []

        try:
            rules = []
            for node in tinycss2.parse_stylesheet(css_content):
                node_type = getattr(node, 'type', None)

                if node_type == 'qualified-rule':
                    rules.extend(self._process_rule(node))
                elif node_type == 'at-rule':
                    # At-rules also carry prelude/content, but content may be
                    # None (e.g. @import) and a block body is not a plain
                    # declaration list, so they must not be handled as rules.
                    keyword = getattr(node, 'at_keyword', 'unknown')
                    self.warnings.warn(
                        f"CSS at-rule '@{keyword}' is not supported and was ignored",
                        WarningCategory.CSS_PARSING,
                        {'at_rule': keyword}
                    )
                elif node_type == 'error':
                    self.warnings.warn(
                        f"CSS parsing error in stylesheet: {getattr(node, 'message', 'unknown error')}",
                        WarningCategory.CSS_PARSING,
                        {'error_type': 'stylesheet_error'}
                    )
                # Comments and whitespace between rules need no handling.

            return rules

        except Exception as e:
            raise ParseError(f"Failed to parse CSS: {e}") from e

    def _process_rule(self, rule) -> "List[CSSRule]":
        """
        Process a tinycss2 QualifiedRule into our CSSRule objects.

        Args:
            rule: tinycss2 QualifiedRule object

        Returns:
            List of CSSRule objects (one per selector if multiple selectors)
        """
        selectors = self._extract_selectors(rule.prelude)
        declarations = self._extract_declarations(rule.content)

        # Each rule gets its own copy of the declarations so that mutating
        # one rule later cannot silently change its comma-separated siblings.
        return [
            CSSRule(
                selector=selector.strip(),
                declarations=dict(declarations),
                specificity=self._calculate_specificity(selector)
            )
            for selector in selectors
        ]

    def _extract_selectors(self, prelude) -> List[str]:
        """
        Extract selector strings from rule prelude.

        Handles multiple selectors separated by commas.
        For now, we only support simple selectors.

        Args:
            prelude: Iterable of tinycss2 tokens preceding the rule's block

        Returns:
            Non-empty list of selector strings; ``['*']`` when the prelude
            contains no selector text at all
        """
        selectors = []
        pending = []

        def flush():
            # Serialise the tokens gathered so far into one selector string.
            text = ''.join(token.serialize() for token in pending).strip()
            if text:
                selectors.append(text)
            pending.clear()

        for token in prelude:
            if token.type == 'literal' and token.value == ',':
                flush()  # a comma ends the current selector
            else:
                pending.append(token)
        flush()  # the last selector has no trailing comma

        return selectors if selectors else ['*']  # Fallback to universal selector

    def _extract_declarations(self, content) -> Dict[str, str]:
        """
        Extract property: value declarations from rule content.

        Every declaration is checked against the supported-property tables
        (which may emit warnings). ``padding``, ``margin`` and ``border`` are
        replaced by their expanded longhand properties; all other properties
        are stored as written. Later declarations overwrite earlier ones.

        Args:
            content: tinycss2 rule content tokens

        Returns:
            Dictionary mapping lower-cased property names to values
        """
        declarations = {}

        for item in tinycss2.parse_declaration_list(content):
            if hasattr(item, 'name') and hasattr(item, 'value'):
                # This is a Declaration
                property_name = item.name.lower()
                property_value = ''.join(token.serialize() for token in item.value).strip()

                self._check_unsupported_property(property_name, property_value)

                # Handle shorthand properties
                if property_name == 'padding':
                    sides = self._parse_shorthand_values(property_value)
                    declarations.update(self._expand_padding(sides))
                elif property_name == 'margin':
                    sides = self._parse_shorthand_values(property_value)
                    declarations.update(self._expand_margin(sides))
                elif property_name == 'border':
                    declarations.update(self._parse_border_shorthand(property_value))
                else:
                    declarations[property_name] = property_value
            elif hasattr(item, 'type') and item.type == 'error':
                # CSS parsing error
                self.warnings.warn(
                    f"CSS parsing error in declaration: {getattr(item, 'message', 'unknown error')}",
                    WarningCategory.CSS_PARSING,
                    {'error_type': 'declaration_error'}
                )

        return declarations

    @staticmethod
    def _split_top_level(value: str) -> List[str]:
        """
        Split a value on whitespace that is not inside parentheses.

        Behaves like ``str.split()`` for plain values, but keeps functional
        notation such as ``rgb(0, 0, 0)`` or ``calc(1px + 2px)`` in one piece.
        """
        parts = []
        current = []
        depth = 0

        for char in value:
            if char == '(':
                depth += 1
            elif char == ')' and depth:
                depth -= 1

            if char.isspace() and depth == 0:
                if current:
                    parts.append(''.join(current))
                    current = []
            else:
                current.append(char)

        if current:
            parts.append(''.join(current))
        return parts

    def _parse_shorthand_values(self, value: str) -> List[str]:
        """
        Parse shorthand values like '10px 20px' into individual values.

        Returns:
            Four values in top, right, bottom, left order, following the CSS
            1-to-4 value expansion; extra values beyond the fourth are
            dropped and an empty input yields four ``'0px'`` entries
        """
        values = self._split_top_level(value)
        count = len(values)

        if count == 0:
            return ['0px'] * 4
        if count == 1:
            # all sides same
            return [values[0]] * 4
        if count == 2:
            # vertical, horizontal
            return [values[0], values[1], values[0], values[1]]
        if count == 3:
            # top, horizontal, bottom
            return [values[0], values[1], values[2], values[1]]
        # top, right, bottom, left
        return values[:4]

    def _expand_padding(self, values: List[str]) -> Dict[str, str]:
        """Expand padding shorthand (top, right, bottom, left) into individual properties."""
        return {
            'padding-top': values[0],
            'padding-right': values[1],
            'padding-bottom': values[2],
            'padding-left': values[3]
        }

    def _expand_margin(self, values: List[str]) -> Dict[str, str]:
        """Expand margin shorthand (top, right, bottom, left) into individual properties."""
        return {
            'margin-top': values[0],
            'margin-right': values[1],
            'margin-bottom': values[2],
            'margin-left': values[3]
        }

    def _is_border_width(self, part: str) -> bool:
        """Return True for a length with a known unit or a bare number such as '0'."""
        if part.endswith(self._BORDER_WIDTH_UNITS):
            return True
        digits = part.lstrip('+-').replace('.', '', 1)
        return digits.isdigit()

    def _parse_border_shorthand(self, value: str) -> Dict[str, str]:
        """
        Parse border shorthand like '1px solid black'.

        Components may appear in any order. Each one is classified as a width
        (known unit suffix or bare number), a line style keyword, or otherwise
        a colour. Functional colours such as ``rgb(0, 0, 0)`` are kept whole.

        Returns:
            Dictionary with whichever of ``border-width``, ``border-style``
            and ``border-color`` were present in the value
        """
        declarations = {}

        for part in self._split_top_level(value):
            if self._is_border_width(part):
                declarations['border-width'] = part
            elif part in self._BORDER_STYLES:
                declarations['border-style'] = part
            else:
                # Assume it's a color
                declarations['border-color'] = part

        return declarations

    def _calculate_specificity(self, selector: str) -> int:
        """
        Calculate CSS specificity for a selector.

        Simplified specificity calculation:
        - ID: 100 points
        - Class: 10 points
        - Tag: 1 point
        - Universal (*): 0 points

        Args:
            selector: CSS selector string

        Returns:
            Specificity score as integer; 1 when the selector cannot be parsed
        """
        try:
            parsed = ParsedSelector.parse(selector)

            if parsed.selector_type == SelectorType.ID:
                return 100
            elif parsed.selector_type == SelectorType.CLASS:
                return 10
            elif parsed.selector_type == SelectorType.TAG:
                return 1
            else:  # UNIVERSAL
                return 0

        except Exception:
            # Fallback for complex selectors we don't support yet
            return 1

    def _check_unsupported_property(self, property_name: str, property_value: str):
        """
        Check if a CSS property is unsupported and warn if so.

        Emits a "limited support" warning for partially supported properties,
        an "unsupported" warning for unknown ones, and nothing otherwise.
        """
        limitation = self._PARTIALLY_SUPPORTED.get(property_name)

        if limitation is not None:
            self.warnings.warn(
                f"CSS property '{property_name}' has limited support: {limitation}",
                WarningCategory.CSS_PARSING,
                {'property': property_name, 'value': property_value, 'limitation': limitation}
            )
        elif property_name not in self._SUPPORTED_PROPERTIES:
            self.warnings.warn_unsupported_css_property(
                property_name,
                property_value,
                "Property not supported in html2pic"
            )
|
html2pic/exceptions.py
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
"""
|
2
|
+
Exception classes for html2pic
|
3
|
+
"""
|
4
|
+
|
5
|
+
class Html2PicError(Exception):
    """Root of the html2pic exception hierarchy; catch this to handle any library error."""
|
8
|
+
|
9
|
+
class ParseError(Html2PicError):
    """Signals that the HTML or CSS input could not be parsed."""
|
12
|
+
|
13
|
+
class RenderError(Html2PicError):
    """Signals that rendering through PicTex failed."""
|
16
|
+
|
17
|
+
class UnsupportedFeatureError(Html2PicError):
    """Signals that the input uses an HTML/CSS feature the library does not support."""
|
html2pic/html_parser.py
ADDED
@@ -0,0 +1,167 @@
|
|
1
|
+
"""
|
2
|
+
HTML parser using BeautifulSoup to create DOM tree
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import Optional
|
6
|
+
from bs4 import BeautifulSoup, Tag, NavigableString, Comment
|
7
|
+
from .models import DOMNode, NodeType
|
8
|
+
from .exceptions import ParseError
|
9
|
+
from .warnings_system import get_warning_collector, WarningCategory
|
10
|
+
|
11
|
+
class HtmlParser:
    """
    Parses HTML content into our internal DOM tree representation.

    Uses BeautifulSoup under the hood for robust HTML parsing,
    then converts to our own DOMNode structure for easier processing.
    """

    # Elements that don't contribute to visual layout; they are dropped
    # together with everything inside them.
    _SKIP_TAGS = frozenset({
        'script', 'style', 'meta', 'link', 'title', 'head',
        'base', 'noscript', 'template',
    })

    # Elements that are converted without any warning.
    _SUPPORTED_TAGS = frozenset({
        'div', 'span', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'section', 'article', 'header', 'footer', 'main', 'nav', 'aside',
        'img', 'br', 'hr', 'strong', 'em', 'b', 'i', 'u', 's',
        'ul', 'ol', 'li', 'a',
    })

    # Known elements that are converted but flagged as likely to render badly.
    _UNSUPPORTED_TAGS = frozenset({
        'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot',
        'form', 'input', 'button', 'select', 'textarea', 'label',
        'video', 'audio', 'canvas', 'svg', 'iframe', 'embed', 'object',
    })

    def __init__(self):
        self.parser = "html.parser"  # Use Python's built-in parser
        self.warnings = get_warning_collector()

    def parse(self, html_content: str) -> "DOMNode":
        """
        Parse HTML string into a DOM tree.

        Args:
            html_content: HTML content as string

        Returns:
            Root DOMNode representing the document. It is a virtual element
            tagged ``__root__`` whose children are the top-level nodes.

        Raises:
            ParseError: If HTML parsing fails
        """
        try:
            soup = BeautifulSoup(html_content, self.parser)

            # We create a virtual root node to hold all top-level elements
            root = DOMNode(
                node_type=NodeType.ELEMENT,
                tag="__root__",  # Special tag for root node
                attributes={},
                text_content="",
                children=[],
                parent=None
            )
            self._append_children(root, soup.contents)
            return root

        except Exception as e:
            raise ParseError(f"Failed to parse HTML: {e}") from e

    @staticmethod
    def _is_renderable_text(bs_node) -> bool:
        """
        Return True if a BeautifulSoup node is ordinary document text.

        Comments, doctypes, declarations and processing instructions are all
        NavigableString subclasses, but none of them is visible content.
        Without this filter ``<!DOCTYPE html>`` would become a text node
        reading "html".
        """
        from bs4.element import Declaration, Doctype, ProcessingInstruction

        if not isinstance(bs_node, NavigableString):
            return False
        return not isinstance(
            bs_node, (Comment, Doctype, Declaration, ProcessingInstruction)
        )

    def _append_children(self, parent, bs_children) -> None:
        """
        Convert BeautifulSoup child nodes and attach them to ``parent``.

        Tags are converted recursively (skipped tags are omitted). Text is
        stripped of leading/trailing whitespace and dropped if nothing is
        left.
        NOTE(review): stripping also removes the space between text and an
        adjacent inline element ("Hello <b>world</b>"); confirm the translator
        compensates before relying on inline spacing.
        """
        for child in bs_children:
            if isinstance(child, Tag):
                child_node = self._convert_element(child)
                if child_node:
                    child_node.parent = parent
                    parent.children.append(child_node)
            elif self._is_renderable_text(child):
                text_content = str(child).strip()
                if text_content:
                    parent.children.append(DOMNode(
                        node_type=NodeType.TEXT,
                        text_content=text_content,
                        parent=parent
                    ))

    def _convert_element(self, bs_element: "Tag") -> "Optional[DOMNode]":
        """
        Convert a BeautifulSoup Tag to our DOMNode.

        Emits a warning for skipped, known-unsupported and unrecognized tags.

        Args:
            bs_element: BeautifulSoup Tag element

        Returns:
            DOMNode or None if element should be skipped
        """
        tag_name = bs_element.name

        # Skip script, style, and other non-visual elements
        if self._should_skip_element(tag_name):
            self.warnings.warn(
                f"Skipping non-visual element '<{tag_name}>'",
                WarningCategory.HTML_PARSING,
                {'tag': tag_name, 'reason': 'non-visual element'}
            )
            return None

        if tag_name not in self._SUPPORTED_TAGS:
            if tag_name in self._UNSUPPORTED_TAGS:
                self.warnings.warn_unsupported_html_tag(
                    tag_name,
                    "May not render correctly - consider using div with appropriate styling"
                )
            else:
                # Completely unrecognized element
                self.warnings.warn_unsupported_html_tag(
                    tag_name,
                    "Unrecognized HTML element - will be treated as a div container"
                )

        # Create element node
        node = DOMNode(
            node_type=NodeType.ELEMENT,
            tag=tag_name,
            attributes=dict(bs_element.attrs) if bs_element.attrs else {},
            text_content="",
            children=[],
            parent=None
        )

        # Process children
        self._append_children(node, bs_element.contents)
        return node

    def _should_skip_element(self, tag_name: str) -> bool:
        """
        Determine if an HTML element should be skipped during parsing.

        Args:
            tag_name: HTML tag name (compared case-insensitively)

        Returns:
            True if element should be skipped
        """
        return tag_name.lower() in self._SKIP_TAGS
|