textblockrenderer 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- textblockrenderer/__init__.py +55 -0
- textblockrenderer/htmlparser.py +211 -0
- textblockrenderer/lang_constants.py +468 -0
- textblockrenderer/models.py +80 -0
- textblockrenderer/renderer.py +339 -0
- textblockrenderer/splitter.py +505 -0
- textblockrenderer-1.1.0.dist-info/METADATA +231 -0
- textblockrenderer-1.1.0.dist-info/RECORD +10 -0
- textblockrenderer-1.1.0.dist-info/WHEEL +4 -0
- textblockrenderer-1.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Text Block Renderer - A library for parsing HTML text, splitting text into blocks, and rendering blocks to PNG images."""
|
|
2
|
+
|
|
3
|
+
from .models import (
|
|
4
|
+
ColoredSpan,
|
|
5
|
+
ColoredWord,
|
|
6
|
+
FontSpec,
|
|
7
|
+
RenderConstraint,
|
|
8
|
+
SubtitleBlock,
|
|
9
|
+
ColoredSubtitleBlock,
|
|
10
|
+
SplitConfig,
|
|
11
|
+
)
|
|
12
|
+
from .htmlparser import (
|
|
13
|
+
ColorHTMLParser,
|
|
14
|
+
parse_html_text,
|
|
15
|
+
split_html_paragraphs,
|
|
16
|
+
)
|
|
17
|
+
from .splitter import (
|
|
18
|
+
TextMeasurer,
|
|
19
|
+
find_semantic_break,
|
|
20
|
+
split_paragraph_to_blocks,
|
|
21
|
+
find_colored_semantic_break,
|
|
22
|
+
split_html_to_colored_blocks,
|
|
23
|
+
)
|
|
24
|
+
from .renderer import (
|
|
25
|
+
optimize_polygon_x_alignment,
|
|
26
|
+
round_polygon_corners,
|
|
27
|
+
render_colored_block,
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
# Package version string.
__version__ = "1.1.0"

# Public names exported by ``from textblockrenderer import *``; every entry
# is one of the names imported from the submodules above.
__all__ = [
    # Models
    "ColoredSpan",
    "ColoredWord",
    "FontSpec",
    "RenderConstraint",
    "SubtitleBlock",
    "ColoredSubtitleBlock",
    "SplitConfig",
    # HTML Parser
    "ColorHTMLParser",
    "parse_html_text",
    "split_html_paragraphs",
    # Splitter
    "TextMeasurer",
    "find_semantic_break",
    "split_paragraph_to_blocks",
    "find_colored_semantic_break",
    "split_html_to_colored_blocks",
    # Renderer
    "optimize_polygon_x_alignment",
    "round_polygon_corners",
    "render_colored_block",
]
|
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
"""HTML parsing utilities for extracting text with color and font-size styles."""
|
|
2
|
+
|
|
3
|
+
from typing import List, Optional, Tuple
|
|
4
|
+
from html.parser import HTMLParser
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from .models import ColoredSpan, ColoredWord
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ColorHTMLParser(HTMLParser):
    """Parse HTML and collect text runs with their color and font size.

    Feed markup with ``feed()``. Every non-blank text node is recorded in
    ``colored_spans`` together with the color and the (already scaled) font
    size in effect at that point. ``get_colored_words()`` flattens those
    spans into per-word records.
    """

    # Tags that push one entry onto the style stacks when opened and pop it
    # again when closed.
    _STYLED_TAGS = frozenset(
        ("span", "font", "p", "div", "b", "i", "strong", "em")
    )
    # ``color`` / ``font-size`` declarations inside a ``style`` attribute.
    # The lookbehind keeps ``color`` from matching inside longer property
    # names such as ``background-color`` or ``border-color``.
    _COLOR_RE = re.compile(r"(?<![\w-])color\s*:\s*([^;]+)", re.IGNORECASE)
    _FONT_SIZE_RE = re.compile(
        r"(?<![\w-])font-size\s*:\s*([^;]+)", re.IGNORECASE
    )
    # Legacy ``size`` attribute value: an integer with an optional sign.
    _LEGACY_SIZE_RE = re.compile(r"([+-]?)(\d+)")
    # De-facto browser pixel sizes for legacy sizes 1..7 (x-small through
    # xxx-large) when the medium size is 16px.
    _LEGACY_SIZE_PX = (10, 13, 16, 18, 24, 32, 48)

    def __init__(self, base_font_size: int = 16, target_font_size: int = 36):
        """
        Args:
            base_font_size: Reference font size used by the HTML input.
            target_font_size: Font size that the reference size maps to
                when rendering.
        """
        super().__init__()
        self.base_font_size = base_font_size
        self.target_font_size = target_font_size
        self.colored_spans: List[ColoredSpan] = []
        self.current_color: Optional[str] = None
        self.current_font_size: Optional[int] = None  # already scaled
        # One entry per open styled tag, holding the effective style inside it.
        self.color_stack: List[Optional[str]] = []
        self.font_size_stack: List[Optional[int]] = []

    def _parse_font_size(self, value: str) -> Optional[int]:
        """Parse a CSS-like font size and scale it to the target size.

        Scaling: ``result = (html_px / base_font_size) * target_font_size``,
        truncated to an int. A value without a unit is treated as pixels.

        Returns:
            The scaled size, or None when ``value`` does not start with a
            valid number.
        """
        value = value.strip().lower()
        match = re.match(r"([\d.]+)(px|pt|em|rem|%)?", value)
        if not match:
            return None
        try:
            num = float(match.group(1))
        except ValueError:
            # The pattern also accepts things like "1.2.3" or "."; treat
            # them as "no size" instead of aborting the whole parse.
            return None
        unit = match.group(2) or "px"

        # Convert to HTML pixels.
        if unit == "pt":
            html_px = num * 4 / 3  # 1pt = 4/3 px, so 12pt is exactly 16px
        elif unit in ("em", "rem"):
            html_px = num * self.base_font_size
        elif unit == "%":
            html_px = num / 100 * self.base_font_size
        else:
            html_px = num

        # Scale proportionally to the target font size.
        scaled = (html_px / self.base_font_size) * self.target_font_size
        return int(scaled)

    def _parse_legacy_font_size(self, value: str) -> Optional[int]:
        """Parse a legacy ``size`` attribute level (``1``..``7``, ``+n``, ``-n``).

        Signed values are relative to level 3 and the result is clamped to
        1..7. Level 3 maps to ``target_font_size``.

        Returns:
            The scaled size, or None when ``value`` is not a legacy level
            (for example ``"24"`` or ``"24px"``).
        """
        match = self._LEGACY_SIZE_RE.fullmatch(value.strip())
        if match is None:
            return None
        sign, digits = match.groups()
        level = int(digits)
        if sign == "+":
            level = 3 + level
        elif sign == "-":
            level = 3 - level
        elif not 1 <= level <= 7:
            # Unsigned values outside 1..7 keep being read as a pixel length.
            return None
        level = max(1, min(7, level))
        return int(self._LEGACY_SIZE_PX[level - 1] / 16 * self.target_font_size)

    def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]):
        """Push the style in effect inside ``tag`` onto the stacks."""
        if tag not in self._STYLED_TAGS:
            return

        color = None
        font_size = None
        for attr_name, attr_value in attrs:
            if attr_value is None:
                # Valueless attribute such as ``<span style>``: nothing to parse.
                continue
            if attr_name == "color":
                color = attr_value
            elif attr_name == "size":
                # <font size="5"> is a legacy level, not a pixel length.
                font_size = self._parse_legacy_font_size(attr_value)
                if font_size is None:
                    font_size = self._parse_font_size(attr_value)
            elif attr_name == "style":
                # e.g. style="color: red; font-size: 24px"
                color_match = self._COLOR_RE.search(attr_value)
                if color_match:
                    color = color_match.group(1).strip()
                size_match = self._FONT_SIZE_RE.search(attr_value)
                if size_match:
                    font_size = self._parse_font_size(size_match.group(1))

        # Push onto the stacks, inheriting whatever the tag does not set.
        self.color_stack.append(color if color else self.current_color)
        self.font_size_stack.append(
            font_size if font_size else self.current_font_size
        )
        if color:
            self.current_color = color
        if font_size:
            self.current_font_size = font_size

    def handle_endtag(self, tag: str):
        """Pop the closed tag's style and restore the enclosing one."""
        if tag in self._STYLED_TAGS and self.color_stack:
            self.color_stack.pop()
            self.font_size_stack.pop()
            self.current_color = self.color_stack[-1] if self.color_stack else None
            self.current_font_size = (
                self.font_size_stack[-1] if self.font_size_stack else None
            )

    def handle_data(self, data: str):
        """Record a text node with the current style.

        Whitespace-only nodes are dropped, so a bare newline sitting between
        two tags does not produce a span.
        """
        if data.strip():
            self.colored_spans.append(
                ColoredSpan(
                    text=data,
                    color=self.current_color,
                    font_size=self.current_font_size,
                )
            )

    def get_colored_words(self) -> "List[ColoredWord]":
        """Flatten the collected spans into styled words, honouring ``\\n``.

        Within a span's text:
        - the last word before a ``\\n`` gets ``force_newline=True``;
        - an empty line followed by another ``\\n`` yields a marker word with
          ``word=""`` and ``force_newline=True``.
        """
        words = []
        for span in self.colored_spans:
            # Split on newlines so the line structure is known per word.
            lines = span.text.split("\n")
            last_line_idx = len(lines) - 1
            for line_idx, line in enumerate(lines):
                line_words = line.split()
                is_not_last_line = line_idx < last_line_idx

                if not line_words:
                    # Empty line: emit a newline marker, except for the
                    # final (trailing) segment of the span.
                    if is_not_last_line:
                        words.append(
                            ColoredWord(
                                word="",
                                color=span.color,
                                font_size=span.font_size,
                                force_newline=True,
                            )
                        )
                    continue

                last_word_idx = len(line_words) - 1
                for word_idx, word in enumerate(line_words):
                    # Only the last word of a non-final line forces a break.
                    force_newline = word_idx == last_word_idx and is_not_last_line
                    words.append(
                        ColoredWord(
                            word=word,
                            color=span.color,
                            font_size=span.font_size,
                            force_newline=force_newline,
                        )
                    )
        return words
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def parse_html_text(
    html_text: str,
    base_font_size: int = 16,
    target_font_size: int = 36,
) -> Tuple[str, List[ColoredWord]]:
    """Parse HTML text into plain text plus a list of styled words.

    Args:
        html_text: Text in HTML format.
        base_font_size: Reference font size used by the HTML (default 16).
        target_font_size: Font size the reference size maps to when
            rendering (default 36).

    Returns:
        ``(plain_text, colored_words)`` where ``plain_text`` is the words of
        ``colored_words`` joined with single spaces.
    """
    parser = ColorHTMLParser(base_font_size, target_font_size)
    parser.feed(html_text)
    # feed() can hold back trailing input while waiting for more data (for
    # example text ending in an unterminated "&..." sequence such as "AT&T",
    # or a dangling "<"). close() forces that buffered text to be processed.
    parser.close()
    colored_words = parser.get_colored_words()
    plain_text = " ".join(w.word for w in colored_words)
    return plain_text, colored_words
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def split_html_paragraphs(html_text: str, combine: bool = False) -> List[str]:
    """Split text into paragraphs on HTML block boundaries or on newlines.

    If ``html_text`` contains no HTML tag it is split on ``\\n``; each line is
    stripped and blank lines are dropped. Otherwise it is split on ``</p>``,
    ``<br>`` and ``</div>`` (a closing tag may be followed directly by the
    next opening ``<p>``/``<div>``), and a leading or trailing ``<p>``/``<div>``
    tag is removed from every piece. Inline tags such as ``<span>`` are kept.
    Tag matching is case-insensitive.

    Args:
        html_text: HTML or plain text.
        combine: If True, join all paragraphs with ``\\n`` and return the
            joined string as the only element of the list.

    Returns:
        A list of paragraph strings. In HTML mode empty pieces are kept,
        e.g. the empty piece that follows a final ``</p>``.
    """
    has_html_tags = re.search(r"<[a-zA-Z][^>]*>", html_text) is not None

    if not has_html_tags:
        # Plain-text mode: one paragraph per non-blank line.
        stripped_lines = (line.strip() for line in html_text.split("\n"))
        result = [line for line in stripped_lines if line]
    else:
        # HTML mode. ``\b`` after the tag name keeps ``<p``/``<br`` from
        # matching longer tag names such as ``<pre>`` or ``<picture>``.
        pieces = re.split(
            r"</p\s*>\s*(?:<p\b[^>]*>)?"
            r"|<br\b[^>]*>"
            r"|</div\s*>\s*(?:<div\b[^>]*>)?",
            html_text,
            flags=re.IGNORECASE,
        )

        result = []
        for piece in pieces:
            # Drop a leading <p ...> / <div ...> wrapper tag.
            cleaned = re.sub(
                r"^<(?:p|div)\b[^>]*>", "", piece.strip(), flags=re.IGNORECASE
            )
            # Drop a trailing </p> / </div>.
            cleaned = re.sub(
                r"</(?:p|div)\s*>$", "", cleaned, flags=re.IGNORECASE
            )
            # NOTE(review): blank pieces are not filtered here, unlike the
            # plain-text branch; presumably so blank lines survive. Confirm
            # with callers before changing.
            result.append(cleaned)

    if combine:
        return ["\n".join(result)]
    return result
|