textblockrenderer 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,55 @@
1
+ """Text Block Renderer - A library for parsing HTML text, splitting text into blocks, and rendering blocks to PNG images."""
2
+
3
+ from .models import (
4
+ ColoredSpan,
5
+ ColoredWord,
6
+ FontSpec,
7
+ RenderConstraint,
8
+ SubtitleBlock,
9
+ ColoredSubtitleBlock,
10
+ SplitConfig,
11
+ )
12
+ from .htmlparser import (
13
+ ColorHTMLParser,
14
+ parse_html_text,
15
+ split_html_paragraphs,
16
+ )
17
+ from .splitter import (
18
+ TextMeasurer,
19
+ find_semantic_break,
20
+ split_paragraph_to_blocks,
21
+ find_colored_semantic_break,
22
+ split_html_to_colored_blocks,
23
+ )
24
+ from .renderer import (
25
+ optimize_polygon_x_alignment,
26
+ round_polygon_corners,
27
+ render_colored_block,
28
+ )
29
+
30
+ __version__ = "1.1.0"
31
+
32
+ __all__ = [
33
+ # Models
34
+ "ColoredSpan",
35
+ "ColoredWord",
36
+ "FontSpec",
37
+ "RenderConstraint",
38
+ "SubtitleBlock",
39
+ "ColoredSubtitleBlock",
40
+ "SplitConfig",
41
+ # HTML Parser
42
+ "ColorHTMLParser",
43
+ "parse_html_text",
44
+ "split_html_paragraphs",
45
+ # Splitter
46
+ "TextMeasurer",
47
+ "find_semantic_break",
48
+ "split_paragraph_to_blocks",
49
+ "find_colored_semantic_break",
50
+ "split_html_to_colored_blocks",
51
+ # Renderer
52
+ "optimize_polygon_x_alignment",
53
+ "round_polygon_corners",
54
+ "render_colored_block",
55
+ ]
@@ -0,0 +1,211 @@
1
+ """HTML parsing utilities for extracting text with color and font-size styles."""
2
+
3
+ from typing import List, Optional, Tuple
4
+ from html.parser import HTMLParser
5
+ import re
6
+
7
+ from .models import ColoredSpan, ColoredWord
8
+
9
+
10
class ColorHTMLParser(HTMLParser):
    """Extract text runs, with their color and font size, from an HTML fragment.

    Feed markup with ``feed()``; every non-blank text node is recorded as a
    ``ColoredSpan`` in ``colored_spans`` together with the color and (already
    scaled) font size in effect at that point. ``get_colored_words()`` then
    flattens the spans into per-word ``ColoredWord`` records.

    Styles are tracked with two parallel stacks, one frame per open styled
    tag, so that closing a tag restores the enclosing style.
    """

    # Tags that open a style frame; every other tag is ignored entirely.
    _STYLE_TAGS = ("span", "font", "p", "div", "b", "i", "strong", "em")

    # Leading number plus optional CSS unit, e.g. "24px", "1.5em", "120%".
    _FONT_SIZE_RE = re.compile(r"([\d.]+)(px|pt|em|rem|%)?")

    # Inline-style declarations. The lookbehind keeps "color" from matching
    # the tail of "background-color" / "border-color", and "font-size" from
    # matching inside a longer hyphenated property name.
    _COLOR_DECL_RE = re.compile(r"(?<![\w-])color\s*:\s*([^;]+)")
    _FONT_SIZE_DECL_RE = re.compile(r"(?<![\w-])font-size\s*:\s*([^;]+)")

    def __init__(self, base_font_size: int = 16, target_font_size: int = 36):
        """Create a parser.

        Args:
            base_font_size: Reference font size used by the HTML input.
            target_font_size: Font size actually used for rendering; HTML
                sizes are scaled by ``target_font_size / base_font_size``.
        """
        super().__init__()
        self.base_font_size = base_font_size
        self.target_font_size = target_font_size
        self.colored_spans: List[ColoredSpan] = []
        self.current_color: Optional[str] = None
        # Holds the size after scaling to the target font size.
        self.current_font_size: Optional[int] = None
        self.color_stack: List[Optional[str]] = []
        self.font_size_stack: List[Optional[int]] = []

    def _parse_font_size(self, value: str) -> Optional[int]:
        """Parse a font-size value and scale it to the target font size.

        Scaling rule: ``result = (html_px / base_font_size) * target_font_size``
        truncated to an int. A bare number is treated as pixels.

        Args:
            value: Size text such as ``"24px"``, ``"12pt"``, ``"1.5em"``,
                ``"120%"`` or ``"24"``.

        Returns:
            The scaled size, or ``None`` when ``value`` does not start with a
            valid number.
        """
        match = self._FONT_SIZE_RE.match(value.strip().lower())
        if match is None:
            return None

        try:
            number = float(match.group(1))
        except ValueError:
            # The pattern also accepts malformed numbers such as "1.2.3" or
            # "."; treat those as "no size given" instead of crashing.
            return None

        unit = match.group(2) or "px"

        # Convert to HTML pixels first.
        if unit == "pt":
            html_px = number * 1.333
        elif unit in ("em", "rem"):
            html_px = number * self.base_font_size
        elif unit == "%":
            html_px = number / 100 * self.base_font_size
        else:  # "px"
            html_px = number

        # Then scale proportionally to the rendering font size.
        return int((html_px / self.base_font_size) * self.target_font_size)

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Open a style frame for a styled tag.

        Reads ``color``, ``size`` and ``style`` attributes. When several
        attributes set the same property, the last one processed wins. A
        ``size`` attribute is parsed like a CSS length (a bare number counts
        as pixels).
        """
        if tag not in self._STYLE_TAGS:
            return

        color = None
        font_size = None
        for attr_name, attr_value in attrs:
            if attr_value is None:
                # HTMLParser reports valueless attributes (``<span style>``)
                # with a None value; there is nothing to parse.
                continue
            if attr_name == "color":
                color = attr_value
            elif attr_name == "size":
                font_size = self._parse_font_size(attr_value)
            elif attr_name == "style":
                # e.g. style="color: red; font-size: 24px"
                color_decl = self._COLOR_DECL_RE.search(attr_value)
                if color_decl:
                    color = color_decl.group(1).strip()
                size_decl = self._FONT_SIZE_DECL_RE.search(attr_value)
                if size_decl:
                    font_size = self._parse_font_size(size_decl.group(1))

        # Push the effective style: the tag's own value if it set one,
        # otherwise the value inherited from the enclosing tag.
        self.color_stack.append(color if color else self.current_color)
        self.font_size_stack.append(
            font_size if font_size else self.current_font_size
        )
        if color:
            self.current_color = color
        if font_size:
            self.current_font_size = font_size

    def handle_endtag(self, tag: str):
        """Close the innermost style frame and restore the enclosing style."""
        if tag not in self._STYLE_TAGS or not self.color_stack:
            return

        self.color_stack.pop()
        self.font_size_stack.pop()
        self.current_color = self.color_stack[-1] if self.color_stack else None
        self.current_font_size = (
            self.font_size_stack[-1] if self.font_size_stack else None
        )

    def handle_data(self, data: str):
        """Record a text node with the current style.

        Whitespace-only nodes are dropped; other text is stored unmodified.
        """
        if not data.strip():
            return
        self.colored_spans.append(
            ColoredSpan(
                text=data,
                color=self.current_color,
                font_size=self.current_font_size,
            )
        )

    def get_colored_words(self) -> "List[ColoredWord]":
        """Flatten the recorded spans into styled words, honoring newlines.

        Each span is split on newline characters and each resulting line on
        whitespace:

        - the last word of a line that is followed by a newline gets
          ``force_newline=True``;
        - a line with no words that is followed by a newline yields a marker
          word with empty text and ``force_newline=True``.

        Returns:
            The words in document order, carrying their span's color and
            font size.
        """
        words = []
        for span in self.colored_spans:
            lines = span.text.split("\n")
            final_line_idx = len(lines) - 1

            for line_idx, line in enumerate(lines):
                followed_by_newline = line_idx < final_line_idx
                tokens = line.split()

                if not tokens:
                    # Blank line: emit a break marker, except for the tail
                    # segment after the span's last newline.
                    if followed_by_newline:
                        words.append(
                            ColoredWord(
                                word="",
                                color=span.color,
                                font_size=span.font_size,
                                force_newline=True,
                            )
                        )
                    continue

                final_token_idx = len(tokens) - 1
                for token_idx, token in enumerate(tokens):
                    words.append(
                        ColoredWord(
                            word=token,
                            color=span.color,
                            font_size=span.font_size,
                            force_newline=(
                                token_idx == final_token_idx
                                and followed_by_newline
                            ),
                        )
                    )
        return words
149
+
150
+
151
def parse_html_text(
    html_text: str,
    base_font_size: int = 16,
    target_font_size: int = 36,
) -> Tuple[str, List[ColoredWord]]:
    """Parse HTML text into plain text plus a list of styled words.

    Args:
        html_text: Text in HTML format.
        base_font_size: Reference font size used by the HTML (default 16).
        target_font_size: Font size used for rendering (default 36).

    Returns:
        A ``(plain_text, colored_words)`` tuple, where ``plain_text`` is the
        words' text joined with single spaces and ``colored_words`` is the
        list produced by ``ColorHTMLParser.get_colored_words()``.
    """
    html_parser = ColorHTMLParser(
        base_font_size=base_font_size,
        target_font_size=target_font_size,
    )
    html_parser.feed(html_text)

    styled_words = html_parser.get_colored_words()
    joined_text = " ".join([item.word for item in styled_words])
    return joined_text, styled_words
171
+
172
+
173
+ def split_html_paragraphs(html_text: str, combine: bool = False) -> str | List[str]:
174
+ """
175
+ 按HTML段落标签或换行符分割文本
176
+ 支持 <p>, <br>, <div> 等标签作为分隔符
177
+ 如果文本不包含HTML标签,则按 \\n 分割
178
+
179
+ Args:
180
+ html_text: HTML 格式的文本
181
+ combine: 如果为 True,将所有段落用换行符组合成单一字符串返回
182
+
183
+ Returns:
184
+ 分割后的段落列表(每个段落保留内部标签如span)
185
+ """
186
+ # 检测是否包含 HTML 标签
187
+ has_html_tags = bool(re.search(r"<[a-zA-Z][^>]*>", html_text))
188
+
189
+ if not has_html_tags:
190
+ # 纯文本模式:按换行符分割
191
+ paragraphs = html_text.split("\n")
192
+ result = [p.strip() for p in paragraphs if p.strip()]
193
+ else:
194
+ # HTML 模式:匹配 </p><p> 或 <br> 或 </div><div> 等作为段落分隔
195
+ paragraphs = re.split(
196
+ r"</p>\s*(?:<p[^>]*>)?|<br\s*/?>|</div>\s*(?:<div[^>]*>)?", html_text
197
+ )
198
+
199
+ result = []
200
+ for p in paragraphs:
201
+ # 清理开头的 <p> 或 <div> 标签
202
+ cleaned = re.sub(r"^<(p|div)[^>]*>", "", p.strip())
203
+ # 清理结尾的 </p> 或 </div>
204
+ cleaned = re.sub(r"</(p|div)>$", "", cleaned)
205
+ # 去除纯空白段落
206
+ # if cleaned.strip():
207
+ result.append(cleaned)
208
+
209
+ if combine:
210
+ return ["\n".join(result)]
211
+ return result