@birthday8/doc-mcp 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.js +2 -11
- package/package.json +2 -2
- package/python/docx_converter.py +1220 -428
- package/python/html_fixer.py +125 -0
- package/python/html_validator.py +389 -0
- package/python/sample/example.html +407 -0
- package/python/server.py +193 -120
- package/python/test_error_detection.py +84 -0
- package/python/__pycache__/docx_converter.cpython-313.pyc +0 -0
package/python/docx_converter.py
CHANGED
|
@@ -4,13 +4,230 @@ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
|
|
|
4
4
|
from docx.enum.section import WD_SECTION
|
|
5
5
|
from docx.oxml.ns import qn
|
|
6
6
|
from docx.oxml import OxmlElement
|
|
7
|
+
from docx.enum.shape import WD_INLINE_SHAPE
|
|
7
8
|
from bs4 import BeautifulSoup
|
|
8
9
|
import os
|
|
9
10
|
import re
|
|
11
|
+
import json
|
|
12
|
+
|
|
13
|
+
# 尝试导入 math2docx
|
|
14
|
+
try:
|
|
15
|
+
import math2docx
|
|
16
|
+
|
|
17
|
+
HAS_MATH2DOCX = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
HAS_MATH2DOCX = False
|
|
20
|
+
print("Warning: math2docx not installed. Formula support will be limited.")
|
|
21
|
+
|
|
22
|
+
# ==================== Precompiled regular expressions ====================
# Inline-style parsing. Every pattern captures the declaration value (the text
# up to the next ";") in its last group.
TEXT_ALIGN_RE = re.compile(r"text-align:\s*([^;]+)")
LINE_HEIGHT_RE = re.compile(r"line-height:\s*([^;]+)")
# The lookbehind rejects every "<prefix>-color" property (background-color,
# border-color, outline-color, ...), not just background-color, so that
# "border-color: red" is never mistaken for the text colour.
COLOR_RE = re.compile(r"(?<![\w-])color:\s*([^;]+)")
BACKGROUND_COLOR_RE = re.compile(r"background-color:\s*([^;]+)")
FONT_FAMILY_RE = re.compile(r"font-family:\s*([^;]+)")
FONT_SIZE_RE = re.compile(r"font-size:\s*([^;]+)")
FONT_WEIGHT_RE = re.compile(r"font-weight:\s*([^;]+)")
FONT_STYLE_RE = re.compile(r"font-style:\s*([^;]+)")
TEXT_DECORATION_RE = re.compile(r"text-decoration:\s*([^;]+)")
# Group 1 is the optional side (top/bottom/left/right), group 2 the value.
MARGIN_RE = re.compile(r"margin(?:-(top|bottom|left|right))?:\s*([^;]+)")
PADDING_RE = re.compile(r"padding(?:-(top|bottom|left|right))?:\s*([^;]+)")

# Formula detection: group 1 is the body of a $$...$$ formula, group 2 the
# body of a $...$ formula.
LATEX_FORMULA_RE = re.compile(r"\$\$(.*?)\$\$|\$(.*?)\$")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ==================== 常量配置 ====================
|
|
41
|
+
class ConverterConfig:
    """Constant configuration values for the converter."""

    # Default font family and font size.
    DEFAULT_FONT = "微软雅黑"
    DEFAULT_SIZE = 12

    # Page setup. The names indicate centimetres; 29.7 x 21.0 matches A4.
    PAGE_HEIGHT_CM = 29.7
    PAGE_WIDTH_CM = 21.0
    MARGIN_CM = 2.54

    # Font size per heading level (keys 1-6).
    HEADING_SIZES = {
        1: 18,
        2: 16,
        3: 14,
        4: 14,
        5: 14,
        6: 14,
    }

    # Text colour per heading level (keys 1-6).
    HEADING_COLORS = {
        1: RGBColor(74, 63, 107),
        2: RGBColor(91, 78, 140),
        3: RGBColor(107, 91, 149),
        4: RGBColor(122, 104, 161),
        5: RGBColor(137, 117, 173),
        6: RGBColor(152, 130, 185),
    }

    # Text colours selected by special CSS class names.
    CLASS_COLORS = {
        "red": RGBColor(255, 0, 0),
        "blue": RGBColor(0, 0, 255),
        "green": RGBColor(0, 128, 0),
        "purple": RGBColor(128, 0, 128),
    }

    # Callout-box colours as hex strings without a leading "#":
    # "bg" is the background, "border" the border colour.
    INFO_COLORS = {
        "bg": "E3F2FD",
        "border": "2196F3",
    }
    WARNING_COLORS = {
        "bg": "FFF3CD",
        "border": "FFC107",
    }
    SUCCESS_COLORS = {
        "bg": "D4EDDA",
        "border": "28A745",
    }

    # Colour name -> hex string (no leading "#").
    COLOR_MAP = {
        "red": "FF0000",
        "green": "008000",
        "blue": "0000FF",
        "yellow": "FFFF00",
        "orange": "FFA500",
        "purple": "800080",
        "pink": "FFC0CB",
        "brown": "A52A2A",
        "gray": "808080",
        "black": "000000",
        "white": "FFFFFF",
    }
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def add_image(doc, image_path, width=None, height=None, align="center"):
    """Insert an image into the document as its own paragraph.

    When the file does not exist, or the picture cannot be embedded, a grey
    "[图片: <file name>]" placeholder run is written instead so the gap stays
    visible in the output.

    Args:
        doc: Word document object; must provide ``add_paragraph()``.
        image_path: Path of the image file. It is used as given for the
            existence check and for ``add_picture``.
        width: Optional image width in inches.
        height: Optional image height in inches.
        align: "center" or "right"; any other value left-aligns the image.

    Returns:
        True if the picture was embedded, False if a placeholder was written.
    """

    def _write_placeholder(target_para):
        # Grey marker text naming the image that could not be embedded.
        marker = target_para.add_run(f"[图片: {os.path.basename(image_path)}]")
        marker.font.color.rgb = RGBColor(150, 150, 150)

    if not os.path.exists(image_path):
        print(f"Warning: Image file not found: {image_path}")
        _write_placeholder(doc.add_paragraph())
        return False

    para = doc.add_paragraph()
    try:
        if align == "center":
            para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        elif align == "right":
            para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
        else:
            para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

        # A missing (falsy) dimension is passed as None, which is the
        # add_picture default, so one call covers all four combinations.
        para.add_run().add_picture(
            image_path,
            width=Inches(width) if width else None,
            height=Inches(height) if height else None,
        )
        return True

    except Exception as e:
        print(f"Warning: Failed to add image {image_path}: {e}")
        import traceback

        traceback.print_exc()
        # Reuse the paragraph created above for the placeholder instead of
        # leaving it behind as a stray empty paragraph and adding a second one.
        _write_placeholder(para)
        return False
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def latex_to_unicode_formula(latex_formula):
    """Replace a small set of LaTeX constructs with Unicode math characters.

    Handles the scripts ``^2``, ``^3``, ``_2``, ``_3`` and a fixed list of
    backslash commands (operators, relations and Greek letters). Everything
    else, including braces and unknown commands, is left untouched.

    Args:
        latex_formula: LaTeX source text.

    Returns:
        The text with the known constructs replaced. Empty or ``None`` input
        is returned unchanged.
    """
    if not latex_formula:
        return latex_formula

    # Plain substrings: the keys are literal text, not regex patterns.
    scripts = {
        "^2": "²",
        "^3": "³",
        "_2": "₂",
        "_3": "₃",
    }
    # Command names without the leading backslash.
    commands = {
        "cdot": "·",
        "times": "×",
        "div": "÷",
        "neq": "≠",
        "leq": "≤",
        "geq": "≥",
        "pm": "±",
        "sqrt": "√",
        "pi": "π",
        "alpha": "α",
        "beta": "β",
        "gamma": "γ",
        "delta": "δ",
        "theta": "θ",
        "lambda": "λ",
        "mu": "μ",
        "sigma": "σ",
        "phi": "φ",
        "omega": "ω",
        "infty": "∞",
    }

    result = latex_formula
    for latex, unicode_char in scripts.items():
        result = result.replace(latex, unicode_char)

    # A single pass over all commands. The lookahead stops a short command
    # from matching the start of a longer one (e.g. \pi inside \pitchfork).
    command_re = re.compile(
        r"\\(" + "|".join(sorted(commands, key=len, reverse=True)) + r")(?![A-Za-z])"
    )
    return command_re.sub(lambda match: commands[match.group(1)], result)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def add_native_formula(
    para,
    latex_formula,
    color=None,
    font_name=None,
    font_size=None,
    bold=False,
    italic=False,
):
    """Insert a LaTeX formula into a paragraph as a native Word equation.

    The styling parameters (color, font_name, font_size, bold, italic) are
    accepted but not used by this function.

    Args:
        para: Paragraph that receives the formula.
        latex_formula: LaTeX source handed to ``math2docx.add_math``.

    Returns:
        True when the formula was added; False when math2docx is unavailable
        or the conversion raised.
    """
    # Nothing can be done without the optional dependency.
    if not HAS_MATH2DOCX:
        return False

    try:
        math2docx.add_math(para, latex_formula)
    except Exception as e:
        print(f"Warning: Failed to add native formula: {e}")
        import traceback

        traceback.print_exc()
        return False
    return True
|
|
226
|
+
|
|
10
227
|
|
|
11
228
|
def parse_color(color_str):
|
|
12
229
|
"""解析颜色字符串为RGBColor"""
|
|
13
|
-
if not color_str or not color_str.startswith(
|
|
230
|
+
if not color_str or not color_str.startswith("#"):
|
|
14
231
|
return None
|
|
15
232
|
try:
|
|
16
233
|
r = int(color_str[1:3], 16)
|
|
@@ -20,185 +237,112 @@ def parse_color(color_str):
|
|
|
20
237
|
except:
|
|
21
238
|
return None
|
|
22
239
|
|
|
23
|
-
|
|
24
|
-
|
|
240
|
+
|
|
241
|
+
def set_font(
    run,
    font_name="微软雅黑",
    size=12,
    color=None,
    bold=False,
    italic=False,
    underline=False,
    strike=False,
    highlight_color=None,
):
    """Apply font family, size and character formatting to a run.

    Args:
        run: The python-docx run to format.
        font_name: Font family; also written to the East Asian font slot.
        size: Font size in points.
        color: Optional value assigned to ``run.font.color.rgb``.
        bold: Whether the run is bold.
        italic: Whether the run is italic.
        underline: When true, underline the run.
        strike: When true, strike the run through.
        highlight_color: Optional highlight colour name (see the table
            below); unknown names are ignored.
    """
    run.font.name = font_name
    # Also set the East Asian font slot so CJK text uses the same face.
    run._element.rPr.rFonts.set(qn("w:eastAsia"), font_name)
    run.font.size = Pt(size)
    run.font.bold = bold
    run.font.italic = italic

    if color:
        run.font.color.rgb = color

    if underline:
        run.font.underline = True

    if strike:
        run.font.strike = True

    if highlight_color:
        from docx.enum.text import WD_COLOR_INDEX

        # Only members that WD_COLOR_INDEX actually defines may appear here:
        # the enum has no CYAN, MAGENTA or ORANGE, and referencing one raises
        # AttributeError while this dict is being built. Those three names
        # are mapped to the nearest available highlight colours instead.
        color_map = {
            "yellow": WD_COLOR_INDEX.YELLOW,
            "green": WD_COLOR_INDEX.BRIGHT_GREEN,
            "cyan": WD_COLOR_INDEX.TURQUOISE,
            "magenta": WD_COLOR_INDEX.VIOLET,
            "blue": WD_COLOR_INDEX.TURQUOISE,
            "red": WD_COLOR_INDEX.RED,
            "darkblue": WD_COLOR_INDEX.BLUE,
            "orange": WD_COLOR_INDEX.YELLOW,
            "gray": WD_COLOR_INDEX.GRAY_25,
        }
        if highlight_color in color_map:
            run.font.highlight_color = color_map[highlight_color]
|
|
56
284
|
|
|
57
|
-
def process_inline_elements(element, parent_run=None):
    """Walk the children of an element and handle inline formatting tags.

    With ``parent_run`` set, text is appended to that run and formatting
    flags are set on the run itself. Without it, a list of dicts describing
    each piece of text and its formatting is built and returned.

    Args:
        element: Parsed HTML node whose ``children`` are iterated.
        parent_run: Optional run that receives text and formatting.

    Returns:
        The run-description dicts collected at this level.
    """
    from docx.text.paragraph import Paragraph

    runs = []

    for child in element.children:
        if child.name is None:  # text node
            text = str(child).strip()
            if text:
                if parent_run:
                    parent_run.add_text(text)
                else:
                    runs.append({'text': text})
        elif child.name == 'strong' or child.name == 'b':
            if parent_run:
                # The flag is set on the shared run, so it applies to all of
                # the run's text, not only to this child.
                parent_run.bold = True
                process_inline_elements(child, parent_run)
            else:
                runs.append({'text': child.get_text(), 'bold': True})
        elif child.name == 'em' or child.name == 'i':
            if parent_run:
                parent_run.italic = True
                process_inline_elements(child, parent_run)
            else:
                runs.append({'text': child.get_text(), 'italic': True})
        elif child.name == 'u':
            if parent_run:
                parent_run.underline = True
                process_inline_elements(child, parent_run)
            else:
                runs.append({'text': child.get_text(), 'underline': True})
        elif child.name == 's' or child.name == 'del':
            if parent_run:
                parent_run.strike = True
                process_inline_elements(child, parent_run)
            else:
                runs.append({'text': child.get_text(), 'strike': True})
        elif child.name == 'sup':
            if parent_run:
                parent_run.font.superscript = True
                process_inline_elements(child, parent_run)
            else:
                runs.append({'text': child.get_text(), 'superscript': True})
        elif child.name == 'sub':
            if parent_run:
                parent_run.font.subscript = True
                process_inline_elements(child, parent_run)
            else:
                runs.append({'text': child.get_text(), 'subscript': True})
        elif child.name == 'code':
            code_text = child.get_text()
            if parent_run:
                parent_run.font.name = 'Consolas'
                parent_run.font.size = Pt(10)
                parent_run.add_text(code_text)
            else:
                runs.append({'text': code_text, 'font': 'Consolas', 'size': 10})
        elif child.name == 'a':
            link_text = child.get_text()
            href = child.get('href', '')
            if parent_run:
                # Only the link text is written; href is not used here.
                parent_run.add_text(link_text)
            else:
                runs.append({'text': link_text, 'link': href})
        elif child.name == 'span':
            style = child.get('style', '')
            color_match = re.search(r'color:\s*([^;]+)', style)
            bg_match = re.search(r'background(?:-color)?:\s*([^;]+)', style)

            props = {'text': child.get_text()}
            if color_match:
                color = parse_color(color_match.group(1).strip())
                if color:
                    props['color'] = color
            if bg_match:
                bg_color = bg_match.group(1).strip()
                # Only "#"-prefixed background values are recorded.
                if bg_color.startswith('#'):
                    bg_rgb = parse_color(bg_color)
                    if bg_rgb:
                        props['highlight'] = str(bg_rgb)

            if parent_run:
                if 'color' in props:
                    parent_run.font.color.rgb = props['color']
                process_inline_elements(child, parent_run)
            else:
                runs.append(props)
        else:
            # NOTE(review): the list returned by this recursive call is
            # discarded, so when parent_run is None the content of unknown
            # tags does not reach `runs` — confirm whether that is intended.
            process_inline_elements(child, parent_run)

    return runs
|
|
149
285
|
|
|
150
286
|
def _apply_highlight(run, bg_color):
    """Apply a background colour to a run.

    Colour names are mapped to one of Word's predefined highlight colours.
    "#RGB" and "#RRGGBB" values are written as run shading (``w:shd``), which
    accepts arbitrary colours. Empty, unknown or malformed values leave the
    run unchanged.

    Args:
        run: The python-docx run to modify.
        bg_color: Colour name or "#"-prefixed hex string; matching is
            case-insensitive and ignores surrounding whitespace.
    """
    from docx.enum.text import WD_COLOR_INDEX

    if not bg_color:
        return

    # Colour name -> WD_COLOR_INDEX member (only members the enum defines).
    # Names without an exact counterpart use the nearest highlight colour.
    color_map = {
        "yellow": WD_COLOR_INDEX.YELLOW,
        "green": WD_COLOR_INDEX.GREEN,
        "brightgreen": WD_COLOR_INDEX.BRIGHT_GREEN,
        "blue": WD_COLOR_INDEX.BLUE,
        "darkblue": WD_COLOR_INDEX.DARK_BLUE,
        "red": WD_COLOR_INDEX.RED,
        "darkred": WD_COLOR_INDEX.DARK_RED,
        "darkyellow": WD_COLOR_INDEX.DARK_YELLOW,
        "lightgray": WD_COLOR_INDEX.GRAY_25,
        "gray": WD_COLOR_INDEX.GRAY_50,
        "black": WD_COLOR_INDEX.BLACK,
        "white": WD_COLOR_INDEX.WHITE,
        "pink": WD_COLOR_INDEX.PINK,
        "teal": WD_COLOR_INDEX.TEAL,
        "turquoise": WD_COLOR_INDEX.TURQUOISE,
        "violet": WD_COLOR_INDEX.VIOLET,
        "cyan": WD_COLOR_INDEX.TURQUOISE,
        "magenta": WD_COLOR_INDEX.VIOLET,
        # Approximations for common CSS names.
        "lightblue": WD_COLOR_INDEX.TURQUOISE,
        "lightyellow": WD_COLOR_INDEX.YELLOW,
        "lightgreen": WD_COLOR_INDEX.BRIGHT_GREEN,
        "orange": WD_COLOR_INDEX.YELLOW,  # orange is mapped to yellow
        "purple": WD_COLOR_INDEX.VIOLET,
        "brown": WD_COLOR_INDEX.DARK_YELLOW,
    }

    # Normalise the colour value.
    bg_lower = bg_color.lower().strip()

    if bg_lower in color_map:
        run.font.highlight_color = color_map[bg_lower]
        return

    if not bg_lower.startswith("#"):
        return

    hex_digits = bg_lower[1:]
    if len(hex_digits) == 3:
        # Expand CSS shorthand: "#abc" means "#aabbcc".
        hex_digits = "".join(digit * 2 for digit in hex_digits)
    if len(hex_digits) != 6 or any(d not in "0123456789abcdef" for d in hex_digits):
        # Not a usable hex colour; writing it would produce an invalid fill.
        return

    shading_elm = OxmlElement("w:shd")
    # w:val is a required attribute of w:shd; "clear" means a solid fill with
    # no pattern, so only w:fill determines the visible colour.
    shading_elm.set(qn("w:val"), "clear")
    shading_elm.set(qn("w:color"), "auto")
    shading_elm.set(qn("w:fill"), hex_digits.upper())
    run._element.get_or_add_rPr().append(shading_elm)
|
|
199
335
|
|
|
200
|
-
|
|
201
|
-
|
|
336
|
+
|
|
337
|
+
def process_paragraph(
|
|
338
|
+
paragraph,
|
|
339
|
+
doc,
|
|
340
|
+
default_font="微软雅黑",
|
|
341
|
+
default_size=12,
|
|
342
|
+
indent=None,
|
|
343
|
+
align=None,
|
|
344
|
+
line_spacing=None,
|
|
345
|
+
):
|
|
202
346
|
"""处理段落及其内联元素"""
|
|
203
347
|
para = doc.add_paragraph()
|
|
204
348
|
|
|
@@ -211,7 +355,7 @@ def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=
|
|
|
211
355
|
para.paragraph_format.first_line_indent = Inches(indent)
|
|
212
356
|
else:
|
|
213
357
|
# 从data-indent属性读取缩进(单位:em)
|
|
214
|
-
data_indent = paragraph.get(
|
|
358
|
+
data_indent = paragraph.get("data-indent", "")
|
|
215
359
|
if data_indent:
|
|
216
360
|
try:
|
|
217
361
|
em_count = float(data_indent)
|
|
@@ -224,247 +368,551 @@ def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=
|
|
|
224
368
|
if line_spacing:
|
|
225
369
|
para.paragraph_format.line_spacing = line_spacing
|
|
226
370
|
|
|
371
|
+
# 解析段落的样式(包括行距和段距)
|
|
372
|
+
style = paragraph.get("style", "")
|
|
373
|
+
|
|
374
|
+
# 解析对齐方式
|
|
375
|
+
text_align_match = TEXT_ALIGN_RE.search(style)
|
|
376
|
+
if text_align_match:
|
|
377
|
+
align_str = text_align_match.group(1).strip().lower()
|
|
378
|
+
if align_str == "left":
|
|
379
|
+
para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
|
|
380
|
+
elif align_str == "center":
|
|
381
|
+
para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
382
|
+
elif align_str == "right":
|
|
383
|
+
para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
|
|
384
|
+
elif align_str == "justify":
|
|
385
|
+
para.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
|
|
386
|
+
|
|
387
|
+
# 解析行距
|
|
388
|
+
line_height_match = LINE_HEIGHT_RE.search(style)
|
|
389
|
+
if line_height_match:
|
|
390
|
+
line_height_str = line_height_match.group(1).strip()
|
|
391
|
+
if line_height_str.endswith("pt"):
|
|
392
|
+
# 固定行距
|
|
393
|
+
para.paragraph_format.line_spacing = float(line_height_str[:-2])
|
|
394
|
+
elif line_height_str.endswith("px"):
|
|
395
|
+
# px转换为pt
|
|
396
|
+
para.paragraph_format.line_spacing = float(line_height_str[:-2]) * 0.75
|
|
397
|
+
elif line_height_str.endswith("em"):
|
|
398
|
+
# em转换为pt(基于段落字号)
|
|
399
|
+
para.paragraph_format.line_spacing = para_size * float(line_height_str[:-2])
|
|
400
|
+
else:
|
|
401
|
+
# 尝试作为倍数处理
|
|
402
|
+
try:
|
|
403
|
+
line_spacing_value = float(line_height_str)
|
|
404
|
+
para.paragraph_format.line_spacing = line_spacing_value
|
|
405
|
+
except:
|
|
406
|
+
pass
|
|
407
|
+
|
|
408
|
+
# 解析段后距
|
|
409
|
+
margin_bottom_match = MARGIN_RE.search(style)
|
|
410
|
+
if margin_bottom_match:
|
|
411
|
+
margin_bottom_str = margin_bottom_match.group(2).strip()
|
|
412
|
+
if margin_bottom_str.endswith("pt"):
|
|
413
|
+
para.paragraph_format.space_after = Pt(float(margin_bottom_str[:-2]))
|
|
414
|
+
elif margin_bottom_str.endswith("px"):
|
|
415
|
+
# px转换为pt
|
|
416
|
+
para.paragraph_format.space_after = Pt(float(margin_bottom_str[:-2]) * 0.75)
|
|
417
|
+
elif margin_bottom_str.endswith("em"):
|
|
418
|
+
# em转换为pt(基于段落字号)
|
|
419
|
+
para.paragraph_format.space_after = Pt(
|
|
420
|
+
para_size * float(margin_bottom_str[:-2])
|
|
421
|
+
)
|
|
422
|
+
else:
|
|
423
|
+
# 尝试作为pt处理
|
|
424
|
+
try:
|
|
425
|
+
para.paragraph_format.space_after = Pt(float(margin_bottom_str))
|
|
426
|
+
except:
|
|
427
|
+
pass
|
|
428
|
+
|
|
429
|
+
# 解析段前距
|
|
430
|
+
margin_top_match = MARGIN_RE.search(style)
|
|
431
|
+
if margin_top_match:
|
|
432
|
+
margin_top_str = margin_top_match.group(2).strip()
|
|
433
|
+
if margin_top_str.endswith("pt"):
|
|
434
|
+
para.paragraph_format.space_before = Pt(float(margin_top_str[:-2]))
|
|
435
|
+
elif margin_top_str.endswith("px"):
|
|
436
|
+
# px转换为pt
|
|
437
|
+
para.paragraph_format.space_before = Pt(float(margin_top_str[:-2]) * 0.75)
|
|
438
|
+
elif margin_top_str.endswith("em"):
|
|
439
|
+
# em转换为pt(基于段落字号)
|
|
440
|
+
para.paragraph_format.space_before = Pt(
|
|
441
|
+
para_size * float(margin_top_str[:-2])
|
|
442
|
+
)
|
|
443
|
+
else:
|
|
444
|
+
# 尝试作为pt处理
|
|
445
|
+
try:
|
|
446
|
+
para.paragraph_format.space_before = Pt(float(margin_top_str))
|
|
447
|
+
except:
|
|
448
|
+
pass
|
|
449
|
+
|
|
450
|
+
# 解析段落的字号
|
|
451
|
+
para_size = default_size
|
|
452
|
+
style = paragraph.get("style", "")
|
|
453
|
+
size_match = FONT_SIZE_RE.search(style)
|
|
454
|
+
if size_match:
|
|
455
|
+
size_str = size_match.group(1).strip()
|
|
456
|
+
# 处理不同单位:pt, px, em等
|
|
457
|
+
if size_str.endswith("pt"):
|
|
458
|
+
para_size = float(size_str[:-2])
|
|
459
|
+
elif size_str.endswith("px"):
|
|
460
|
+
# px转换为pt (1px ≈ 0.75pt)
|
|
461
|
+
para_size = float(size_str[:-2]) * 0.75
|
|
462
|
+
elif size_str.endswith("em"):
|
|
463
|
+
# em转换为pt (假设基础字号为12pt)
|
|
464
|
+
para_size = float(size_str[:-2]) * 12
|
|
465
|
+
else:
|
|
466
|
+
# 尝试直接解析为数字
|
|
467
|
+
try:
|
|
468
|
+
para_size = float(size_str)
|
|
469
|
+
except:
|
|
470
|
+
pass
|
|
471
|
+
|
|
227
472
|
# 处理段落内容 - 递归处理所有子元素
|
|
228
|
-
_process_element_to_runs(paragraph, para, default_font,
|
|
473
|
+
_process_element_to_runs(paragraph, para, default_font, para_size)
|
|
229
474
|
|
|
230
475
|
return para
|
|
231
476
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
477
|
+
|
|
478
|
+
def _css_font_size_to_pt(size_str, fallback):
    """Convert a CSS font-size value to points, or return ``fallback``."""
    value = size_str.strip().lower()
    factor = 1.0  # unit-less values are taken as points
    # "rem" must be tested before "em": every "rem" value also ends in "em".
    # px uses 1px ~ 0.75pt; em/rem are resolved against a fixed 12pt base.
    for suffix, unit_factor in (("pt", 1.0), ("px", 0.75), ("rem", 12.0), ("em", 12.0)):
        if value.endswith(suffix):
            value = value[: -len(suffix)]
            factor = unit_factor
            break
    try:
        return float(value) * factor
    except ValueError:
        # Keywords, percentages and other unsupported values.
        return fallback


def _process_element_to_runs(
    element,
    para,
    default_font="微软雅黑",
    default_size=12,
    bold=False,
    italic=False,
    underline=False,
    strike=False,
    color=None,
    bg_color=None,
    font_name=None,
    font_size=None,
):
    """Recursively turn an element's children into separately formatted runs.

    Formatting is accumulated while descending: each inline tag adds its own
    attribute to the inherited set, and every text node becomes one run that
    carries the accumulated formatting.

    Args:
        element: Parsed HTML node whose ``children`` are processed.
        para: Paragraph that receives the runs.
        default_font: Font used when no ``font_name`` is inherited.
        default_size: Font size used when no ``font_size`` is inherited.
        bold, italic, underline, strike: Inherited character formatting.
        color: Inherited text colour, or None.
        bg_color: Inherited background colour (name or "#" hex), or None.
        font_name: Inherited font override, or None.
        font_size: Inherited size override, or None.
    """
    current_font = font_name or default_font
    current_size = font_size or default_size

    def styled_run(text):
        # One run carrying the formatting inherited at this level.
        run = para.add_run(text)
        set_font(
            run,
            font_name=current_font,
            size=current_size,
            bold=bold,
            italic=italic,
            underline=underline,
            strike=strike,
        )
        if color:
            run.font.color.rgb = color
        if bg_color:
            _apply_highlight(run, bg_color)
        return run

    def descend(node, **overrides):
        # Recurse with the inherited formatting plus any overrides. The font
        # overrides are forwarded too, so a span's font-family / font-size
        # survives nested tags such as <b> or <i>.
        inherited = {
            "bold": bold,
            "italic": italic,
            "underline": underline,
            "strike": strike,
            "color": color,
            "bg_color": bg_color,
            "font_name": font_name,
            "font_size": font_size,
        }
        inherited.update(overrides)
        _process_element_to_runs(node, para, default_font, default_size, **inherited)

    def descend_as_script(node, font_attribute):
        # Process the element normally, then flag exactly the runs it added
        # as superscript / subscript. Nested tags keep their own formatting.
        first_new = len(para.runs)
        descend(node)
        for run in para.runs[first_new:]:
            setattr(run.font, font_attribute, True)

    for child in element.children:
        tag = child.name

        if tag is None:  # text node
            # Collapse every whitespace sequence (newlines and tabs included)
            # into a single space; whitespace-only nodes produce no run.
            text = " ".join(str(child).split())
            if text:
                styled_run(text)

        elif tag in ("math", "latex"):
            # LaTeX formula tags become native Word equations. The equation
            # does not inherit the surrounding formatting.
            latex_formula = child.get_text().strip()
            if latex_formula and HAS_MATH2DOCX:
                if add_native_formula(para, latex_formula):
                    continue  # native equation added; nothing else to do
            # Fallback: show the LaTeX source as code-styled text.
            run = para.add_run(latex_formula)
            set_font(run, font_name="Consolas", size=10, color=RGBColor(0, 0, 128))

        elif tag in ("strong", "b"):
            descend(child, bold=True)

        elif tag in ("em", "i"):
            descend(child, italic=True)

        elif tag == "u":
            descend(child, underline=True)

        elif tag in ("s", "del"):
            descend(child, strike=True)

        elif tag == "sup":
            descend_as_script(child, "superscript")

        elif tag == "sub":
            descend_as_script(child, "subscript")

        elif tag == "code":
            run = para.add_run(child.get_text())
            set_font(run, font_name="Consolas", size=10)

        elif tag == "a":
            # Links are rendered as blue underlined text; the target is not
            # written to the document.
            run = para.add_run(child.get_text())
            set_font(run, font_name=current_font, size=current_size)
            run.font.underline = True
            run.font.color.rgb = RGBColor(0, 0, 255)

        elif tag == "span":
            style = child.get("style", "")
            class_set = set(child.get("class", []))

            # Start from the inherited values; the span may override each.
            span_color = color
            span_bg = bg_color
            span_font = current_font
            span_size = current_size

            color_match = COLOR_RE.search(style)
            if color_match:
                parsed = parse_color(color_match.group(1).strip())
                if parsed:
                    span_color = parsed

            font_match = FONT_FAMILY_RE.search(style)
            if font_match:
                # A font-family value may be a comma-separated fallback list;
                # only the first family can be used as the run's font.
                first_family = font_match.group(1).split(",")[0]
                first_family = first_family.strip().strip("'\"").strip()
                if first_family:
                    span_font = first_family

            size_match = FONT_SIZE_RE.search(style)
            if size_match:
                span_size = _css_font_size_to_pt(size_match.group(1), span_size)

            # A colour class takes precedence over the inline style colour;
            # the first match in this order wins.
            for class_name in ("red", "blue", "green", "purple"):
                if class_name in class_set:
                    span_color = ConverterConfig.CLASS_COLORS[class_name]
                    break

            bg_match = BACKGROUND_COLOR_RE.search(style)
            if bg_match:
                span_bg = bg_match.group(1).strip()
            if "highlight" in class_set:
                span_bg = "yellow"

            descend(
                child,
                color=span_color,
                bg_color=span_bg,
                font_name=span_font,
                font_size=span_size,
            )

        else:
            # Any other tag: keep the inherited formatting and recurse.
            descend(child)
|
|
348
767
|
|
|
349
|
-
|
|
768
|
+
|
|
769
|
+
def process_list_items(
|
|
770
|
+
items, doc, ordered=False, default_font="微软雅黑", default_size=12, level=0
|
|
771
|
+
):
|
|
350
772
|
"""处理列表项,支持嵌套"""
|
|
351
773
|
for item in items:
|
|
352
774
|
# 创建列表项段落
|
|
353
775
|
if ordered:
|
|
354
|
-
para = doc.add_paragraph(style=
|
|
776
|
+
para = doc.add_paragraph(style="List Number")
|
|
355
777
|
else:
|
|
356
|
-
para = doc.add_paragraph(style=
|
|
357
|
-
|
|
778
|
+
para = doc.add_paragraph(style="List Bullet")
|
|
779
|
+
|
|
358
780
|
# 设置缩进:每级增加 0.25 英寸
|
|
359
781
|
para.paragraph_format.left_indent = Inches(0.25 * (level + 1))
|
|
360
|
-
|
|
782
|
+
|
|
361
783
|
# 查找嵌套列表
|
|
362
|
-
nested_ul = item.find(
|
|
363
|
-
nested_ol = item.find(
|
|
364
|
-
|
|
784
|
+
nested_ul = item.find("ul", recursive=False)
|
|
785
|
+
nested_ol = item.find("ol", recursive=False)
|
|
786
|
+
|
|
365
787
|
# 处理列表项的文本内容(排除嵌套列表)
|
|
366
788
|
# 创建一个临时副本用于提取文本
|
|
367
|
-
item_copy = BeautifulSoup(str(item),
|
|
789
|
+
item_copy = BeautifulSoup(str(item), "html.parser").find("li")
|
|
368
790
|
if item_copy:
|
|
369
791
|
# 移除嵌套列表
|
|
370
|
-
for nested in item_copy.find_all([
|
|
792
|
+
for nested in item_copy.find_all(["ul", "ol"], recursive=False):
|
|
371
793
|
nested.decompose()
|
|
372
|
-
|
|
794
|
+
|
|
373
795
|
# 处理剩余内容
|
|
374
796
|
if item_copy.get_text().strip():
|
|
375
797
|
_process_element_to_runs(item_copy, para, default_font, default_size)
|
|
376
|
-
|
|
798
|
+
|
|
377
799
|
# 递归处理嵌套列表
|
|
378
800
|
if nested_ul:
|
|
379
|
-
nested_items = nested_ul.find_all(
|
|
380
|
-
process_list_items(
|
|
381
|
-
|
|
801
|
+
nested_items = nested_ul.find_all("li", recursive=False)
|
|
802
|
+
process_list_items(
|
|
803
|
+
nested_items,
|
|
804
|
+
doc,
|
|
805
|
+
ordered=False,
|
|
806
|
+
default_font=default_font,
|
|
807
|
+
default_size=default_size,
|
|
808
|
+
level=level + 1,
|
|
809
|
+
)
|
|
382
810
|
if nested_ol:
|
|
383
|
-
nested_items = nested_ol.find_all(
|
|
384
|
-
process_list_items(
|
|
385
|
-
|
|
811
|
+
nested_items = nested_ol.find_all("li", recursive=False)
|
|
812
|
+
process_list_items(
|
|
813
|
+
nested_items,
|
|
814
|
+
doc,
|
|
815
|
+
ordered=True,
|
|
816
|
+
default_font=default_font,
|
|
817
|
+
default_size=default_size,
|
|
818
|
+
level=level + 1,
|
|
819
|
+
)
|
|
820
|
+
|
|
386
821
|
|
|
387
822
|
def _parse_style(style_str):
|
|
388
823
|
"""解析style字符串为字典"""
|
|
389
824
|
styles = {}
|
|
390
825
|
if not style_str:
|
|
391
826
|
return styles
|
|
392
|
-
for item in style_str.split(
|
|
393
|
-
if
|
|
394
|
-
key, value = item.split(
|
|
827
|
+
for item in style_str.split(";"):
|
|
828
|
+
if ":" in item:
|
|
829
|
+
key, value = item.split(":", 1)
|
|
395
830
|
styles[key.strip()] = value.strip()
|
|
396
831
|
return styles
|
|
397
832
|
|
|
833
|
+
|
|
398
834
|
def _apply_cell_style(cell_elem, style_dict):
|
|
399
835
|
"""应用单元格样式"""
|
|
400
836
|
# 背景色
|
|
401
|
-
bg_color = style_dict.get(
|
|
837
|
+
bg_color = style_dict.get("background-color", "")
|
|
402
838
|
if bg_color:
|
|
403
839
|
# 处理颜色值
|
|
404
|
-
if bg_color.startswith(
|
|
405
|
-
shading_elm = OxmlElement(
|
|
406
|
-
shading_elm.set(qn(
|
|
840
|
+
if bg_color.startswith("#"):
|
|
841
|
+
shading_elm = OxmlElement("w:shd")
|
|
842
|
+
shading_elm.set(qn("w:fill"), bg_color[1:].upper())
|
|
407
843
|
cell_elem._element.get_or_add_tcPr().append(shading_elm)
|
|
408
|
-
|
|
844
|
+
|
|
409
845
|
# 文字颜色
|
|
410
|
-
color = style_dict.get(
|
|
846
|
+
color = style_dict.get("color", "")
|
|
411
847
|
if color:
|
|
412
|
-
rgb = parse_color(color) if color.startswith(
|
|
848
|
+
rgb = parse_color(color) if color.startswith("#") else None
|
|
413
849
|
if rgb:
|
|
414
850
|
for run in cell_elem.paragraphs[0].runs:
|
|
415
851
|
run.font.color.rgb = rgb
|
|
416
852
|
|
|
417
|
-
|
|
853
|
+
|
|
854
|
+
def process_table(table, doc, default_font="微软雅黑", default_size=11):
|
|
418
855
|
"""处理表格,支持内联样式"""
|
|
419
|
-
rows = table.find_all(
|
|
856
|
+
rows = table.find_all("tr")
|
|
420
857
|
if not rows:
|
|
421
858
|
return
|
|
422
|
-
|
|
859
|
+
|
|
423
860
|
# 获取列数
|
|
424
|
-
cols = max(len(row.find_all([
|
|
425
|
-
|
|
861
|
+
cols = max(len(row.find_all(["td", "th"])) for row in rows)
|
|
862
|
+
|
|
426
863
|
# 创建表格
|
|
427
864
|
word_table = doc.add_table(rows=len(rows), cols=cols)
|
|
428
|
-
word_table.style =
|
|
429
|
-
|
|
865
|
+
word_table.style = "Table Grid"
|
|
866
|
+
|
|
430
867
|
for row_idx, row in enumerate(rows):
|
|
431
868
|
# 处理行样式(如背景色)
|
|
432
|
-
row_style = _parse_style(row.get(
|
|
433
|
-
row_bg = row_style.get(
|
|
434
|
-
|
|
435
|
-
cells = row.find_all([
|
|
869
|
+
row_style = _parse_style(row.get("style", ""))
|
|
870
|
+
row_bg = row_style.get("background-color", "")
|
|
871
|
+
|
|
872
|
+
cells = row.find_all(["td", "th"])
|
|
436
873
|
for col_idx, cell in enumerate(cells):
|
|
437
874
|
if col_idx < cols:
|
|
438
875
|
cell_elem = word_table.rows[row_idx].cells[col_idx]
|
|
439
|
-
|
|
440
|
-
|
|
876
|
+
|
|
441
877
|
# 解析单元格样式
|
|
442
|
-
cell_style = _parse_style(cell.get(
|
|
443
|
-
|
|
878
|
+
cell_style = _parse_style(cell.get("style", ""))
|
|
879
|
+
|
|
880
|
+
# 清空默认段落
|
|
881
|
+
cell_elem.paragraphs[0].clear()
|
|
882
|
+
|
|
883
|
+
# 使用 _process_element_to_runs 处理单元格内容,保留格式
|
|
884
|
+
_process_element_to_runs(
|
|
885
|
+
cell,
|
|
886
|
+
cell_elem.paragraphs[0],
|
|
887
|
+
default_font=default_font,
|
|
888
|
+
default_size=default_size,
|
|
889
|
+
)
|
|
890
|
+
|
|
444
891
|
# 表头加粗
|
|
445
|
-
if cell.name ==
|
|
892
|
+
if cell.name == "th":
|
|
446
893
|
for run in cell_elem.paragraphs[0].runs:
|
|
447
894
|
run.font.bold = True
|
|
448
|
-
|
|
895
|
+
|
|
449
896
|
# 设置单元格对齐
|
|
450
|
-
align = cell_style.get(
|
|
451
|
-
if align ==
|
|
897
|
+
align = cell_style.get("text-align", "center")
|
|
898
|
+
if align == "center":
|
|
452
899
|
cell_elem.paragraphs[0].alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
453
|
-
elif align ==
|
|
900
|
+
elif align == "left":
|
|
454
901
|
cell_elem.paragraphs[0].alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
|
|
455
|
-
elif align ==
|
|
902
|
+
elif align == "right":
|
|
456
903
|
cell_elem.paragraphs[0].alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
|
|
457
|
-
|
|
904
|
+
|
|
458
905
|
# 应用单元格样式(背景色、文字颜色)
|
|
459
906
|
_apply_cell_style(cell_elem, cell_style)
|
|
460
|
-
|
|
907
|
+
|
|
461
908
|
# 如果行有背景色且单元格没有单独设置,应用行背景色
|
|
462
|
-
if row_bg and not cell_style.get(
|
|
463
|
-
if row_bg.startswith(
|
|
464
|
-
shading_elm = OxmlElement(
|
|
465
|
-
shading_elm.set(qn(
|
|
909
|
+
if row_bg and not cell_style.get("background-color"):
|
|
910
|
+
if row_bg.startswith("#"):
|
|
911
|
+
shading_elm = OxmlElement("w:shd")
|
|
912
|
+
shading_elm.set(qn("w:fill"), row_bg[1:].upper())
|
|
466
913
|
cell_elem._element.get_or_add_tcPr().append(shading_elm)
|
|
467
914
|
|
|
915
|
+
|
|
468
916
|
def set_section_columns(section, cols_num=2, space=720):
|
|
469
917
|
"""设置节的多栏布局
|
|
470
918
|
|
|
@@ -474,11 +922,12 @@ def set_section_columns(section, cols_num=2, space=720):
|
|
|
474
922
|
space: 栏间距(单位:twips,1英寸=1440twips),默认720(0.5英寸)
|
|
475
923
|
"""
|
|
476
924
|
sectPr = section._sectPr
|
|
477
|
-
cols = OxmlElement(
|
|
478
|
-
cols.set(qn(
|
|
479
|
-
cols.set(qn(
|
|
925
|
+
cols = OxmlElement("w:cols")
|
|
926
|
+
cols.set(qn("w:num"), str(cols_num))
|
|
927
|
+
cols.set(qn("w:space"), str(space))
|
|
480
928
|
sectPr.append(cols)
|
|
481
929
|
|
|
930
|
+
|
|
482
931
|
def add_columns_section(doc, cols_num=2, space=720):
|
|
483
932
|
"""添加连续分节符并设置多栏布局(不换页)
|
|
484
933
|
|
|
@@ -496,312 +945,655 @@ def add_columns_section(doc, cols_num=2, space=720):
|
|
|
496
945
|
set_section_columns(section, cols_num, space)
|
|
497
946
|
return section
|
|
498
947
|
|
|
948
|
+
|
|
499
949
|
def _process_blockquote(blockquote_elem, doc, level=0):
|
|
500
950
|
"""递归处理嵌套引用"""
|
|
501
|
-
#
|
|
502
|
-
|
|
951
|
+
# 检查是否有直接内容(不包括嵌套引用)
|
|
952
|
+
has_content = False
|
|
503
953
|
for child in blockquote_elem.children:
|
|
504
954
|
if child.name is None: # 文本节点
|
|
505
|
-
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
955
|
+
if str(child).strip():
|
|
956
|
+
has_content = True
|
|
957
|
+
break
|
|
958
|
+
elif child.name != "blockquote" and child.get_text().strip():
|
|
959
|
+
has_content = True
|
|
960
|
+
break
|
|
510
961
|
|
|
511
|
-
#
|
|
512
|
-
if
|
|
962
|
+
# 如果有直接内容,创建段落
|
|
963
|
+
if has_content:
|
|
513
964
|
para = doc.add_paragraph()
|
|
514
|
-
run = para.add_run(direct_text)
|
|
515
|
-
set_font(run, italic=True, color=RGBColor(100, 100, 100))
|
|
516
965
|
# 根据层级设置缩进
|
|
517
966
|
para.paragraph_format.left_indent = Inches(0.3 * level)
|
|
518
967
|
para.paragraph_format.right_indent = Inches(0.5)
|
|
519
968
|
# 添加灰色左边框
|
|
520
|
-
pBdr = OxmlElement(
|
|
521
|
-
left_border = OxmlElement(
|
|
522
|
-
left_border.set(qn(
|
|
523
|
-
left_border.set(qn(
|
|
524
|
-
left_border.set(qn(
|
|
969
|
+
pBdr = OxmlElement("w:pBdr")
|
|
970
|
+
left_border = OxmlElement("w:left")
|
|
971
|
+
left_border.set(qn("w:val"), "single")
|
|
972
|
+
left_border.set(qn("w:sz"), "18")
|
|
973
|
+
left_border.set(qn("w:color"), "CCCCCC")
|
|
525
974
|
pBdr.append(left_border)
|
|
526
975
|
para.paragraph_format._element.get_or_add_pPr().append(pBdr)
|
|
527
976
|
|
|
977
|
+
# 创建一个临时元素来包含所有非blockquote的子元素
|
|
978
|
+
from bs4 import BeautifulSoup
|
|
979
|
+
|
|
980
|
+
temp_soup = BeautifulSoup("<div></div>", "html.parser")
|
|
981
|
+
temp_div = temp_soup.div
|
|
982
|
+
|
|
983
|
+
# 复制所有非blockquote的子元素
|
|
984
|
+
for child in blockquote_elem.children:
|
|
985
|
+
if child.name != "blockquote":
|
|
986
|
+
temp_div.append(
|
|
987
|
+
child.__copy__() if hasattr(child, "__copy__") else child
|
|
988
|
+
)
|
|
989
|
+
|
|
990
|
+
# 使用 _process_element_to_runs 处理格式化内容
|
|
991
|
+
# 注意:引用内容默认斜体和灰色
|
|
992
|
+
_process_element_to_runs(
|
|
993
|
+
temp_div,
|
|
994
|
+
para,
|
|
995
|
+
default_font="微软雅黑",
|
|
996
|
+
default_size=12,
|
|
997
|
+
italic=True,
|
|
998
|
+
color=RGBColor(100, 100, 100),
|
|
999
|
+
)
|
|
1000
|
+
|
|
528
1001
|
# 递归处理嵌套引用
|
|
529
|
-
nested_quotes = blockquote_elem.find_all(
|
|
1002
|
+
nested_quotes = blockquote_elem.find_all("blockquote", recursive=False)
|
|
530
1003
|
for nested in nested_quotes:
|
|
531
1004
|
_process_blockquote(nested, doc, level + 1)
|
|
532
1005
|
|
|
1006
|
+
|
|
533
1007
|
def add_page_break(doc):
|
|
534
1008
|
"""添加分页符"""
|
|
535
1009
|
doc.add_page_break()
|
|
536
1010
|
|
|
1011
|
+
|
|
537
1012
|
def add_horizontal_rule(doc):
|
|
538
1013
|
"""添加水平线"""
|
|
539
1014
|
para = doc.add_paragraph()
|
|
540
1015
|
para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
541
|
-
run = para.add_run(
|
|
1016
|
+
run = para.add_run("_" * 50)
|
|
542
1017
|
run.font.color.rgb = RGBColor(200, 200, 200)
|
|
543
1018
|
|
|
544
|
-
|
|
1019
|
+
|
|
1020
|
+
# ==================== 辅助函数 ====================
|
|
1021
|
+
def _init_document(default_font, default_size):
|
|
1022
|
+
"""初始化Word文档"""
|
|
1023
|
+
doc = Document()
|
|
1024
|
+
doc.styles["Normal"].font.name = default_font
|
|
1025
|
+
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
|
|
1026
|
+
doc.styles["Normal"].font.size = Pt(default_size)
|
|
1027
|
+
|
|
1028
|
+
# 处理页面设置
|
|
1029
|
+
section = doc.sections[0]
|
|
1030
|
+
section.page_height = Cm(ConverterConfig.PAGE_HEIGHT_CM)
|
|
1031
|
+
section.page_width = Cm(ConverterConfig.PAGE_WIDTH_CM)
|
|
1032
|
+
section.left_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1033
|
+
section.right_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1034
|
+
section.top_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1035
|
+
section.bottom_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1036
|
+
|
|
1037
|
+
return doc
|
|
1038
|
+
|
|
1039
|
+
|
|
1040
|
+
def _read_html_file(html_file):
|
|
1041
|
+
"""读取HTML文件"""
|
|
1042
|
+
with open(html_file, "r", encoding="utf-8") as f:
|
|
1043
|
+
return f.read()
|
|
1044
|
+
|
|
1045
|
+
|
|
1046
|
+
def _parse_html(html_content):
|
|
1047
|
+
"""解析HTML内容"""
|
|
1048
|
+
return BeautifulSoup(html_content, "html.parser")
|
|
1049
|
+
|
|
1050
|
+
|
|
1051
|
+
def _process_heading(element, doc, default_font):
|
|
1052
|
+
"""处理标题元素"""
|
|
1053
|
+
level = int(element.name[1])
|
|
1054
|
+
heading = doc.add_heading(element.get_text().strip(), level=level)
|
|
1055
|
+
heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
|
|
1056
|
+
|
|
1057
|
+
# 标题样式
|
|
1058
|
+
for run in heading.runs:
|
|
1059
|
+
run.font.name = default_font
|
|
1060
|
+
run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
|
|
1061
|
+
run.font.size = Pt(ConverterConfig.HEADING_SIZES.get(level, 14))
|
|
1062
|
+
run.font.bold = True
|
|
1063
|
+
run.font.color.rgb = ConverterConfig.HEADING_COLORS.get(
|
|
1064
|
+
level, RGBColor(107, 91, 149)
|
|
1065
|
+
)
|
|
1066
|
+
if level == 1:
|
|
1067
|
+
heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
1068
|
+
|
|
1069
|
+
|
|
1070
|
+
def _process_paragraph_element(element, doc, default_font, default_size):
|
|
1071
|
+
"""处理段落元素"""
|
|
1072
|
+
classes = element.get("class", [])
|
|
1073
|
+
class_set = set(classes)
|
|
1074
|
+
|
|
1075
|
+
if "center" in class_set:
|
|
1076
|
+
para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER)
|
|
1077
|
+
elif "right" in class_set:
|
|
1078
|
+
para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT)
|
|
1079
|
+
elif "dialogue" in class_set:
|
|
1080
|
+
para = process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
|
|
1081
|
+
for run in para.runs:
|
|
1082
|
+
set_font(run, italic=True, color=RGBColor(107, 91, 122))
|
|
1083
|
+
elif "quote" in class_set or element.get("style", "").find("background") != -1:
|
|
1084
|
+
para = process_paragraph(element, doc)
|
|
1085
|
+
para.paragraph_format.left_indent = Inches(1)
|
|
1086
|
+
para.paragraph_format.right_indent = Inches(1)
|
|
1087
|
+
from docx.enum.text import WD_BORDER
|
|
1088
|
+
|
|
1089
|
+
for border in para.paragraph_format._element.xpath("./w:pBdr"):
|
|
1090
|
+
border.getparent().remove(border)
|
|
1091
|
+
# 添加边框效果(使用浅灰色背景模拟)
|
|
1092
|
+
shading_elm = OxmlElement("w:shd")
|
|
1093
|
+
shading_elm.set(qn("w:fill"), "F5F5F5")
|
|
1094
|
+
para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
|
|
1095
|
+
else:
|
|
1096
|
+
process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
|
|
1097
|
+
|
|
1098
|
+
|
|
1099
|
+
def _process_list_element(element, doc, ordered):
|
|
1100
|
+
"""处理列表元素"""
|
|
1101
|
+
items = element.find_all("li", recursive=False)
|
|
1102
|
+
process_list_items(items, doc, ordered=ordered)
|
|
1103
|
+
|
|
1104
|
+
|
|
1105
|
+
def _process_image_element(element, doc, html_file):
|
|
1106
|
+
"""处理图片元素"""
|
|
1107
|
+
src = element.get("src", "")
|
|
1108
|
+
if src:
|
|
1109
|
+
# 解析宽度、高度和对齐方式
|
|
1110
|
+
width = element.get("width")
|
|
1111
|
+
height = element.get("height")
|
|
1112
|
+
style = element.get("style", "")
|
|
1113
|
+
align = element.get("align", "center")
|
|
1114
|
+
|
|
1115
|
+
# 从 style 中提取对齐方式
|
|
1116
|
+
if "text-align: right" in style or "float: right" in style:
|
|
1117
|
+
align = "right"
|
|
1118
|
+
elif "text-align: left" in style or "float: left" in style:
|
|
1119
|
+
align = "left"
|
|
1120
|
+
elif "text-align: center" in style:
|
|
1121
|
+
align = "center"
|
|
1122
|
+
|
|
1123
|
+
# 处理宽度高度(支持像素转英寸)
|
|
1124
|
+
width_inch = None
|
|
1125
|
+
height_inch = None
|
|
1126
|
+
if width:
|
|
1127
|
+
try:
|
|
1128
|
+
width_px = float(width)
|
|
1129
|
+
width_inch = width_px / 96 # 假设96 DPI
|
|
1130
|
+
except:
|
|
1131
|
+
pass
|
|
1132
|
+
if height:
|
|
1133
|
+
try:
|
|
1134
|
+
height_px = float(height)
|
|
1135
|
+
height_inch = height_px / 96
|
|
1136
|
+
except:
|
|
1137
|
+
pass
|
|
1138
|
+
|
|
1139
|
+
# 处理相对路径(相对于HTML文件)
|
|
1140
|
+
html_dir = os.path.dirname(html_file)
|
|
1141
|
+
image_path = os.path.join(html_dir, src) if not os.path.isabs(src) else src
|
|
1142
|
+
|
|
1143
|
+
# 添加图片
|
|
1144
|
+
add_image(doc, image_path, width_inch, height_inch, align)
|
|
1145
|
+
|
|
1146
|
+
|
|
1147
|
+
def _process_div_element(element, doc, default_font, default_size):
|
|
1148
|
+
"""处理div元素"""
|
|
1149
|
+
classes = element.get("class", [])
|
|
1150
|
+
class_set = set(classes)
|
|
1151
|
+
|
|
1152
|
+
if "chapter" in class_set:
|
|
1153
|
+
# 处理章节
|
|
1154
|
+
h2 = element.find("h2")
|
|
1155
|
+
if h2:
|
|
1156
|
+
heading = doc.add_heading(h2.get_text().strip(), level=2)
|
|
1157
|
+
for run in heading.runs:
|
|
1158
|
+
run.font.color.rgb = RGBColor(91, 78, 140)
|
|
1159
|
+
run.font.size = Pt(16)
|
|
1160
|
+
run.font.name = default_font
|
|
1161
|
+
run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
|
|
1162
|
+
|
|
1163
|
+
paragraphs = element.find_all("p")
|
|
1164
|
+
for p in paragraphs:
|
|
1165
|
+
first_span = p.find("span", class_="first-line")
|
|
1166
|
+
if first_span:
|
|
1167
|
+
# 处理首字下沉效果
|
|
1168
|
+
para = doc.add_paragraph()
|
|
1169
|
+
para.paragraph_format.first_line_indent = Inches(0)
|
|
1170
|
+
|
|
1171
|
+
first_char_run = para.add_run(first_span.text)
|
|
1172
|
+
set_font(
|
|
1173
|
+
first_char_run, size=20, bold=True, color=RGBColor(102, 126, 234)
|
|
1174
|
+
)
|
|
1175
|
+
remaining_text = p.get_text().replace(first_span.text, "", 1)
|
|
1176
|
+
run = para.add_run(remaining_text)
|
|
1177
|
+
set_font(run)
|
|
1178
|
+
else:
|
|
1179
|
+
process_paragraph(p, doc, indent=0.5, line_spacing=1.5)
|
|
1180
|
+
|
|
1181
|
+
elif "ending" in class_set:
|
|
1182
|
+
para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER)
|
|
1183
|
+
for run in para.runs:
|
|
1184
|
+
set_font(run, italic=True, size=14, color=RGBColor(91, 78, 140))
|
|
1185
|
+
|
|
1186
|
+
elif "page-break" in class_set:
|
|
1187
|
+
add_page_break(doc)
|
|
1188
|
+
|
|
1189
|
+
elif "columns" in class_set:
|
|
1190
|
+
# 处理多栏布局(使用连续分节符,不换页)
|
|
1191
|
+
cols_num = int(element.get("data-cols", "2"))
|
|
1192
|
+
# 添加连续分节符并设置栏数
|
|
1193
|
+
add_columns_section(doc, cols_num)
|
|
1194
|
+
# 处理其中的段落
|
|
1195
|
+
for p in element.find_all("p", recursive=False):
|
|
1196
|
+
process_paragraph(
|
|
1197
|
+
p, doc, default_font=default_font, default_size=default_size
|
|
1198
|
+
)
|
|
1199
|
+
|
|
1200
|
+
elif "info" in class_set or "warning" in class_set or "success" in class_set:
|
|
1201
|
+
# 处理提示框
|
|
1202
|
+
para = doc.add_paragraph()
|
|
1203
|
+
para.paragraph_format.right_indent = Inches(0.3)
|
|
1204
|
+
|
|
1205
|
+
# 设置背景色和左边框颜色
|
|
1206
|
+
if "info" in class_set:
|
|
1207
|
+
bg_color = ConverterConfig.INFO_COLORS["bg"]
|
|
1208
|
+
border_color = ConverterConfig.INFO_COLORS["border"]
|
|
1209
|
+
elif "warning" in class_set:
|
|
1210
|
+
bg_color = ConverterConfig.WARNING_COLORS["bg"]
|
|
1211
|
+
border_color = ConverterConfig.WARNING_COLORS["border"]
|
|
1212
|
+
else: # success
|
|
1213
|
+
bg_color = ConverterConfig.SUCCESS_COLORS["bg"]
|
|
1214
|
+
border_color = ConverterConfig.SUCCESS_COLORS["border"]
|
|
1215
|
+
|
|
1216
|
+
# 处理内容
|
|
1217
|
+
_process_element_to_runs(element, para, default_font, default_size)
|
|
1218
|
+
|
|
1219
|
+
# 添加背景色
|
|
1220
|
+
shading_elm = OxmlElement("w:shd")
|
|
1221
|
+
shading_elm.set(qn("w:fill"), bg_color)
|
|
1222
|
+
para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
|
|
1223
|
+
|
|
1224
|
+
# 添加左边框
|
|
1225
|
+
pPr = para.paragraph_format._element.get_or_add_pPr()
|
|
1226
|
+
pBdr = OxmlElement("w:pBdr")
|
|
1227
|
+
left = OxmlElement("w:left")
|
|
1228
|
+
left.set(qn("w:val"), "single")
|
|
1229
|
+
left.set(qn("w:sz"), "4")
|
|
1230
|
+
left.set(qn("w:color"), border_color)
|
|
1231
|
+
pBdr.append(left)
|
|
1232
|
+
pPr.append(pBdr)
|
|
1233
|
+
|
|
1234
|
+
para.paragraph_format.space_after = Pt(6)
|
|
1235
|
+
|
|
1236
|
+
|
|
1237
|
+
def _process_horizontal_rule_element(element, doc):
|
|
1238
|
+
"""处理水平线元素"""
|
|
1239
|
+
classes = element.get("class", [])
|
|
1240
|
+
style = element.get("style", "")
|
|
1241
|
+
class_set = set(classes)
|
|
1242
|
+
if "page-break" in class_set or "page-break-after" in style:
|
|
1243
|
+
add_page_break(doc)
|
|
1244
|
+
else:
|
|
1245
|
+
add_horizontal_rule(doc)
|
|
1246
|
+
|
|
1247
|
+
|
|
1248
|
+
def _process_elements(soup, doc, html_file, default_font, default_size):
|
|
1249
|
+
"""处理所有HTML元素"""
|
|
1250
|
+
for element in soup.body.find_all(recursive=False):
|
|
1251
|
+
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
|
1252
|
+
_process_heading(element, doc, default_font)
|
|
1253
|
+
elif element.name == "p":
|
|
1254
|
+
_process_paragraph_element(element, doc, default_font, default_size)
|
|
1255
|
+
elif element.name == "ul":
|
|
1256
|
+
_process_list_element(element, doc, ordered=False)
|
|
1257
|
+
elif element.name == "ol":
|
|
1258
|
+
_process_list_element(element, doc, ordered=True)
|
|
1259
|
+
elif element.name == "table":
|
|
1260
|
+
process_table(element, doc)
|
|
1261
|
+
elif element.name == "img":
|
|
1262
|
+
_process_image_element(element, doc, html_file)
|
|
1263
|
+
elif element.name == "div":
|
|
1264
|
+
_process_div_element(element, doc, default_font, default_size)
|
|
1265
|
+
elif element.name == "hr":
|
|
1266
|
+
_process_horizontal_rule_element(element, doc)
|
|
1267
|
+
|
|
1268
|
+
|
|
1269
|
+
def convert_html_to_docx(
|
|
1270
|
+
html_file, output_file, default_font="微软雅黑", default_size=12
|
|
1271
|
+
):
|
|
545
1272
|
"""将HTML文件转换为DOCX文件"""
|
|
546
1273
|
# 读取HTML文件
|
|
547
|
-
with open(html_file,
|
|
1274
|
+
with open(html_file, "r", encoding="utf-8") as f:
|
|
548
1275
|
html_content = f.read()
|
|
549
|
-
|
|
1276
|
+
|
|
550
1277
|
# 解析HTML
|
|
551
|
-
soup = BeautifulSoup(html_content,
|
|
552
|
-
|
|
1278
|
+
soup = BeautifulSoup(html_content, "html.parser")
|
|
1279
|
+
|
|
553
1280
|
# 创建Word文档
|
|
554
1281
|
doc = Document()
|
|
555
|
-
|
|
1282
|
+
|
|
556
1283
|
# 设置默认字体
|
|
557
|
-
doc.styles[
|
|
558
|
-
doc.styles[
|
|
559
|
-
doc.styles[
|
|
560
|
-
|
|
1284
|
+
doc.styles["Normal"].font.name = default_font
|
|
1285
|
+
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
|
|
1286
|
+
doc.styles["Normal"].font.size = Pt(default_size)
|
|
1287
|
+
|
|
561
1288
|
# 处理页面设置
|
|
562
1289
|
section = doc.sections[0]
|
|
563
|
-
section.page_height = Cm(
|
|
564
|
-
section.page_width = Cm(
|
|
565
|
-
section.left_margin = Cm(
|
|
566
|
-
section.right_margin = Cm(
|
|
567
|
-
section.top_margin = Cm(
|
|
568
|
-
section.bottom_margin = Cm(
|
|
569
|
-
|
|
1290
|
+
section.page_height = Cm(ConverterConfig.PAGE_HEIGHT_CM)
|
|
1291
|
+
section.page_width = Cm(ConverterConfig.PAGE_WIDTH_CM)
|
|
1292
|
+
section.left_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1293
|
+
section.right_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1294
|
+
section.top_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1295
|
+
section.bottom_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1296
|
+
|
|
570
1297
|
# 遍历所有顶级元素
|
|
571
1298
|
for element in soup.body.find_all(recursive=False):
|
|
572
|
-
if element.name in [
|
|
1299
|
+
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
|
573
1300
|
level = int(element.name[1])
|
|
574
1301
|
heading = doc.add_heading(element.get_text().strip(), level=level)
|
|
575
1302
|
heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
|
|
576
|
-
|
|
1303
|
+
|
|
577
1304
|
# 标题样式
|
|
578
1305
|
for run in heading.runs:
|
|
579
1306
|
run.font.name = default_font
|
|
580
|
-
run._element.rPr.rFonts.set(qn(
|
|
1307
|
+
run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
|
|
1308
|
+
run.font.size = Pt(ConverterConfig.HEADING_SIZES.get(level, 14))
|
|
1309
|
+
run.font.bold = True
|
|
1310
|
+
run.font.color.rgb = ConverterConfig.HEADING_COLORS.get(
|
|
1311
|
+
level, RGBColor(107, 91, 149)
|
|
1312
|
+
)
|
|
581
1313
|
if level == 1:
|
|
582
|
-
run.font.size = Pt(18)
|
|
583
|
-
run.font.bold = True
|
|
584
|
-
run.font.color.rgb = RGBColor(74, 63, 107)
|
|
585
1314
|
heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
run.font.bold = True
|
|
589
|
-
run.font.color.rgb = RGBColor(91, 78, 140)
|
|
590
|
-
else:
|
|
591
|
-
run.font.size = Pt(14)
|
|
592
|
-
run.font.bold = True
|
|
593
|
-
|
|
594
|
-
elif element.name == 'p':
|
|
1315
|
+
|
|
1316
|
+
elif element.name == "p":
|
|
595
1317
|
# 检查特殊段落样式
|
|
596
|
-
classes = element.get(
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
1318
|
+
classes = element.get("class", [])
|
|
1319
|
+
class_set = set(classes) # 转换为集合提高查找性能
|
|
1320
|
+
|
|
1321
|
+
if "center" in class_set:
|
|
1322
|
+
para = process_paragraph(
|
|
1323
|
+
element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
1324
|
+
)
|
|
1325
|
+
elif "right" in class_set:
|
|
1326
|
+
para = process_paragraph(
|
|
1327
|
+
element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT
|
|
1328
|
+
)
|
|
1329
|
+
elif "dialogue" in class_set:
|
|
603
1330
|
para = process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
|
|
604
1331
|
for run in para.runs:
|
|
605
1332
|
set_font(run, italic=True, color=RGBColor(107, 91, 122))
|
|
606
|
-
elif
|
|
1333
|
+
elif (
|
|
1334
|
+
"quote" in class_set
|
|
1335
|
+
or element.get("style", "").find("background") != -1
|
|
1336
|
+
):
|
|
607
1337
|
para = process_paragraph(element, doc)
|
|
608
1338
|
para.paragraph_format.left_indent = Inches(1)
|
|
609
1339
|
para.paragraph_format.right_indent = Inches(1)
|
|
610
1340
|
from docx.enum.text import WD_BORDER
|
|
611
|
-
|
|
1341
|
+
|
|
1342
|
+
for border in para.paragraph_format._element.xpath("./w:pBdr"):
|
|
612
1343
|
border.getparent().remove(border)
|
|
613
1344
|
# 添加边框效果(使用浅灰色背景模拟)
|
|
614
|
-
shading_elm = OxmlElement(
|
|
615
|
-
shading_elm.set(qn(
|
|
1345
|
+
shading_elm = OxmlElement("w:shd")
|
|
1346
|
+
shading_elm.set(qn("w:fill"), "F5F5F5")
|
|
616
1347
|
para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
|
|
617
1348
|
else:
|
|
618
1349
|
process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
|
|
619
|
-
|
|
620
|
-
elif element.name ==
|
|
621
|
-
items = element.find_all(
|
|
1350
|
+
|
|
1351
|
+
elif element.name == "ul":
|
|
1352
|
+
items = element.find_all("li", recursive=False)
|
|
622
1353
|
process_list_items(items, doc, ordered=False)
|
|
623
|
-
|
|
624
|
-
elif element.name ==
|
|
625
|
-
items = element.find_all(
|
|
1354
|
+
|
|
1355
|
+
elif element.name == "ol":
|
|
1356
|
+
items = element.find_all("li", recursive=False)
|
|
626
1357
|
process_list_items(items, doc, ordered=True)
|
|
627
|
-
|
|
628
|
-
elif element.name ==
|
|
1358
|
+
|
|
1359
|
+
elif element.name == "blockquote":
|
|
629
1360
|
# 递归处理嵌套引用
|
|
630
1361
|
_process_blockquote(element, doc, level=0)
|
|
631
|
-
|
|
632
|
-
elif element.name ==
|
|
1362
|
+
|
|
1363
|
+
elif element.name == "pre":
|
|
633
1364
|
code_text = element.get_text()
|
|
634
1365
|
para = doc.add_paragraph()
|
|
635
1366
|
para.paragraph_format.left_indent = Inches(0.5)
|
|
636
1367
|
run = para.add_run(code_text)
|
|
637
|
-
set_font(run, font_name=
|
|
1368
|
+
set_font(run, font_name="Consolas", size=10, color=RGBColor(0, 0, 128))
|
|
638
1369
|
# 添加灰色背景
|
|
639
|
-
shading_elm = OxmlElement(
|
|
640
|
-
shading_elm.set(qn(
|
|
1370
|
+
shading_elm = OxmlElement("w:shd")
|
|
1371
|
+
shading_elm.set(qn("w:fill"), "F0F0F0")
|
|
641
1372
|
para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
|
|
642
|
-
|
|
643
|
-
elif element.name ==
|
|
1373
|
+
|
|
1374
|
+
elif element.name == "hr":
|
|
644
1375
|
# 检查是否有分页符class或style
|
|
645
|
-
classes = element.get(
|
|
646
|
-
style = element.get(
|
|
647
|
-
|
|
1376
|
+
classes = element.get("class", [])
|
|
1377
|
+
style = element.get("style", "")
|
|
1378
|
+
class_set = set(classes)
|
|
1379
|
+
if "page-break" in class_set or "page-break-after" in style:
|
|
648
1380
|
add_page_break(doc)
|
|
649
1381
|
else:
|
|
650
1382
|
add_horizontal_rule(doc)
|
|
651
|
-
|
|
652
|
-
elif element.name ==
|
|
1383
|
+
|
|
1384
|
+
elif element.name == "table":
|
|
653
1385
|
process_table(element, doc)
|
|
654
|
-
|
|
655
|
-
elif element.name ==
|
|
1386
|
+
|
|
1387
|
+
elif element.name == "img":
|
|
1388
|
+
# 处理图片
|
|
1389
|
+
src = element.get("src", "")
|
|
1390
|
+
alt = element.get("alt", "")
|
|
1391
|
+
|
|
1392
|
+
if src:
|
|
1393
|
+
# 解析宽度、高度和对齐方式
|
|
1394
|
+
width = element.get("width")
|
|
1395
|
+
height = element.get("height")
|
|
1396
|
+
style = element.get("style", "")
|
|
1397
|
+
align = element.get("align", "center")
|
|
1398
|
+
|
|
1399
|
+
# 从 style 中提取对齐方式
|
|
1400
|
+
if "text-align: right" in style or "float: right" in style:
|
|
1401
|
+
align = "right"
|
|
1402
|
+
elif "text-align: left" in style or "float: left" in style:
|
|
1403
|
+
align = "left"
|
|
1404
|
+
elif "text-align: center" in style:
|
|
1405
|
+
align = "center"
|
|
1406
|
+
|
|
1407
|
+
# 处理宽度高度(支持像素转英寸)
|
|
1408
|
+
width_inch = None
|
|
1409
|
+
height_inch = None
|
|
1410
|
+
if width:
|
|
1411
|
+
try:
|
|
1412
|
+
width_px = float(width)
|
|
1413
|
+
width_inch = width_px / 96 # 假设96 DPI
|
|
1414
|
+
except:
|
|
1415
|
+
pass
|
|
1416
|
+
if height:
|
|
1417
|
+
try:
|
|
1418
|
+
height_px = float(height)
|
|
1419
|
+
height_inch = height_px / 96
|
|
1420
|
+
except:
|
|
1421
|
+
pass
|
|
1422
|
+
|
|
1423
|
+
# 处理相对路径(相对于HTML文件)
|
|
1424
|
+
html_dir = os.path.dirname(html_file)
|
|
1425
|
+
image_path = (
|
|
1426
|
+
os.path.join(html_dir, src) if not os.path.isabs(src) else src
|
|
1427
|
+
)
|
|
1428
|
+
|
|
1429
|
+
# 添加图片
|
|
1430
|
+
add_image(doc, image_path, width_inch, height_inch, align)
|
|
1431
|
+
|
|
1432
|
+
elif element.name == "div":
|
|
656
1433
|
# 检查是否是特殊div
|
|
657
|
-
classes = element.get(
|
|
658
|
-
|
|
1434
|
+
classes = element.get("class", [])
|
|
1435
|
+
class_set = set(classes)
|
|
1436
|
+
|
|
1437
|
+
if "chapter" in class_set:
|
|
659
1438
|
# 处理章节
|
|
660
|
-
h2 = element.find(
|
|
1439
|
+
h2 = element.find("h2")
|
|
661
1440
|
if h2:
|
|
662
1441
|
heading = doc.add_heading(h2.get_text().strip(), level=2)
|
|
663
1442
|
for run in heading.runs:
|
|
664
1443
|
run.font.color.rgb = RGBColor(91, 78, 140)
|
|
665
1444
|
run.font.size = Pt(16)
|
|
666
1445
|
run.font.name = default_font
|
|
667
|
-
run._element.rPr.rFonts.set(qn(
|
|
668
|
-
|
|
669
|
-
paragraphs = element.find_all(
|
|
1446
|
+
run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
|
|
1447
|
+
|
|
1448
|
+
paragraphs = element.find_all("p")
|
|
670
1449
|
for p in paragraphs:
|
|
671
|
-
first_span = p.find(
|
|
1450
|
+
first_span = p.find("span", class_="first-line")
|
|
672
1451
|
if first_span:
|
|
673
1452
|
para = doc.add_paragraph()
|
|
674
1453
|
first_char_run = para.add_run(first_span.text)
|
|
675
|
-
set_font(
|
|
676
|
-
|
|
1454
|
+
set_font(
|
|
1455
|
+
first_char_run,
|
|
1456
|
+
size=20,
|
|
1457
|
+
bold=True,
|
|
1458
|
+
color=RGBColor(102, 126, 234),
|
|
1459
|
+
)
|
|
1460
|
+
remaining_text = p.get_text().replace(first_span.text, "", 1)
|
|
677
1461
|
run = para.add_run(remaining_text)
|
|
678
1462
|
set_font(run)
|
|
679
1463
|
else:
|
|
680
1464
|
process_paragraph(p, doc, indent=0.5, line_spacing=1.5)
|
|
681
|
-
|
|
682
|
-
elif
|
|
683
|
-
para = process_paragraph(
|
|
1465
|
+
|
|
1466
|
+
elif "ending" in class_set:
|
|
1467
|
+
para = process_paragraph(
|
|
1468
|
+
element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
1469
|
+
)
|
|
684
1470
|
for run in para.runs:
|
|
685
1471
|
set_font(run, italic=True, size=14, color=RGBColor(91, 78, 140))
|
|
686
|
-
|
|
687
|
-
elif
|
|
1472
|
+
|
|
1473
|
+
elif "page-break" in class_set:
|
|
688
1474
|
add_page_break(doc)
|
|
689
1475
|
|
|
690
|
-
elif
|
|
1476
|
+
elif "columns" in class_set:
|
|
691
1477
|
# 处理多栏布局(使用连续分节符,不换页)
|
|
692
|
-
cols_num = int(element.get(
|
|
1478
|
+
cols_num = int(element.get("data-cols", "2"))
|
|
693
1479
|
# 添加连续分节符并设置栏数
|
|
694
1480
|
add_columns_section(doc, cols_num)
|
|
695
1481
|
# 处理其中的段落
|
|
696
|
-
for p in element.find_all(
|
|
697
|
-
process_paragraph(
|
|
1482
|
+
for p in element.find_all("p", recursive=False):
|
|
1483
|
+
process_paragraph(
|
|
1484
|
+
p, doc, default_font=default_font, default_size=default_size
|
|
1485
|
+
)
|
|
698
1486
|
|
|
699
|
-
elif
|
|
1487
|
+
elif (
|
|
1488
|
+
"info" in class_set or "warning" in class_set or "success" in class_set
|
|
1489
|
+
):
|
|
700
1490
|
# 处理提示框
|
|
701
1491
|
para = doc.add_paragraph()
|
|
702
1492
|
para.paragraph_format.right_indent = Inches(0.3)
|
|
703
|
-
|
|
1493
|
+
|
|
704
1494
|
# 设置背景色和左边框颜色
|
|
705
|
-
if
|
|
706
|
-
bg_color =
|
|
707
|
-
border_color =
|
|
708
|
-
elif
|
|
709
|
-
bg_color =
|
|
710
|
-
border_color =
|
|
1495
|
+
if "info" in class_set:
|
|
1496
|
+
bg_color = ConverterConfig.INFO_COLORS["bg"]
|
|
1497
|
+
border_color = ConverterConfig.INFO_COLORS["border"]
|
|
1498
|
+
elif "warning" in class_set:
|
|
1499
|
+
bg_color = ConverterConfig.WARNING_COLORS["bg"]
|
|
1500
|
+
border_color = ConverterConfig.WARNING_COLORS["border"]
|
|
711
1501
|
else: # success
|
|
712
|
-
bg_color =
|
|
713
|
-
border_color =
|
|
714
|
-
|
|
1502
|
+
bg_color = ConverterConfig.SUCCESS_COLORS["bg"]
|
|
1503
|
+
border_color = ConverterConfig.SUCCESS_COLORS["border"]
|
|
1504
|
+
|
|
715
1505
|
# 处理内容
|
|
716
1506
|
_process_element_to_runs(element, para, default_font, default_size)
|
|
717
|
-
|
|
1507
|
+
|
|
718
1508
|
# 添加背景色
|
|
719
|
-
shading_elm = OxmlElement(
|
|
720
|
-
shading_elm.set(qn(
|
|
1509
|
+
shading_elm = OxmlElement("w:shd")
|
|
1510
|
+
shading_elm.set(qn("w:fill"), bg_color)
|
|
721
1511
|
para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
|
|
722
|
-
|
|
1512
|
+
|
|
723
1513
|
# 添加左边框
|
|
724
|
-
pBdr = OxmlElement(
|
|
725
|
-
left_border = OxmlElement(
|
|
726
|
-
left_border.set(qn(
|
|
727
|
-
left_border.set(qn(
|
|
728
|
-
left_border.set(qn(
|
|
1514
|
+
pBdr = OxmlElement("w:pBdr")
|
|
1515
|
+
left_border = OxmlElement("w:left")
|
|
1516
|
+
left_border.set(qn("w:val"), "single")
|
|
1517
|
+
left_border.set(qn("w:sz"), "24") # 边框粗细
|
|
1518
|
+
left_border.set(qn("w:color"), border_color)
|
|
729
1519
|
pBdr.append(left_border)
|
|
730
1520
|
para.paragraph_format._element.get_or_add_pPr().append(pBdr)
|
|
731
|
-
|
|
1521
|
+
|
|
732
1522
|
else:
|
|
733
1523
|
# 处理普通div,检查是否有内联样式(如提示框)
|
|
734
|
-
style = element.get(
|
|
1524
|
+
style = element.get("style", "")
|
|
735
1525
|
style_dict = _parse_style(style)
|
|
736
|
-
|
|
1526
|
+
|
|
737
1527
|
# 检查是否有背景色和左边框(提示框特征)
|
|
738
|
-
bg_color = style_dict.get(
|
|
739
|
-
border_left = style_dict.get(
|
|
740
|
-
|
|
1528
|
+
bg_color = style_dict.get("background-color", "")
|
|
1529
|
+
border_left = style_dict.get("border-left", "")
|
|
1530
|
+
|
|
741
1531
|
if bg_color and border_left:
|
|
742
1532
|
# 这是提示框
|
|
743
1533
|
para = doc.add_paragraph()
|
|
744
1534
|
para.paragraph_format.right_indent = Inches(0.3)
|
|
745
|
-
|
|
1535
|
+
|
|
746
1536
|
# 处理内容
|
|
747
1537
|
_process_element_to_runs(element, para, default_font, default_size)
|
|
748
|
-
|
|
1538
|
+
|
|
749
1539
|
# 添加背景色
|
|
750
|
-
if bg_color.startswith(
|
|
751
|
-
shading_elm = OxmlElement(
|
|
752
|
-
shading_elm.set(qn(
|
|
753
|
-
para.paragraph_format._element.get_or_add_pPr().append(
|
|
754
|
-
|
|
1540
|
+
if bg_color.startswith("#"):
|
|
1541
|
+
shading_elm = OxmlElement("w:shd")
|
|
1542
|
+
shading_elm.set(qn("w:fill"), bg_color[1:].upper())
|
|
1543
|
+
para.paragraph_format._element.get_or_add_pPr().append(
|
|
1544
|
+
shading_elm
|
|
1545
|
+
)
|
|
1546
|
+
|
|
755
1547
|
# 解析左边框颜色
|
|
756
|
-
border_color =
|
|
757
|
-
if
|
|
1548
|
+
border_color = ""
|
|
1549
|
+
if "solid" in border_left:
|
|
758
1550
|
parts = border_left.split()
|
|
759
1551
|
for i, part in enumerate(parts):
|
|
760
|
-
if part.startswith(
|
|
1552
|
+
if part.startswith("#"):
|
|
761
1553
|
border_color = part[1:]
|
|
762
1554
|
break
|
|
763
|
-
|
|
1555
|
+
|
|
764
1556
|
# 添加左边框
|
|
765
1557
|
if border_color:
|
|
766
|
-
pBdr = OxmlElement(
|
|
767
|
-
left_border = OxmlElement(
|
|
768
|
-
left_border.set(qn(
|
|
769
|
-
left_border.set(qn(
|
|
770
|
-
left_border.set(qn(
|
|
1558
|
+
pBdr = OxmlElement("w:pBdr")
|
|
1559
|
+
left_border = OxmlElement("w:left")
|
|
1560
|
+
left_border.set(qn("w:val"), "single")
|
|
1561
|
+
left_border.set(qn("w:sz"), "24")
|
|
1562
|
+
left_border.set(qn("w:color"), border_color.upper())
|
|
771
1563
|
pBdr.append(left_border)
|
|
772
1564
|
para.paragraph_format._element.get_or_add_pPr().append(pBdr)
|
|
773
1565
|
else:
|
|
774
1566
|
# 普通div,处理其中的段落
|
|
775
|
-
for p in element.find_all(
|
|
1567
|
+
for p in element.find_all("p", recursive=False):
|
|
776
1568
|
process_paragraph(p, doc)
|
|
777
|
-
|
|
778
|
-
elif element.name ==
|
|
779
|
-
src = element.get(
|
|
780
|
-
alt = element.get(
|
|
1569
|
+
|
|
1570
|
+
elif element.name == "img":
|
|
1571
|
+
src = element.get("src", "")
|
|
1572
|
+
alt = element.get("alt", "图片")
|
|
781
1573
|
if src and os.path.exists(src):
|
|
782
1574
|
try:
|
|
783
1575
|
doc.add_picture(src, width=Inches(5))
|
|
784
1576
|
last_para = doc.paragraphs[-1]
|
|
785
1577
|
last_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
786
1578
|
except:
|
|
787
|
-
para = doc.add_paragraph(f
|
|
1579
|
+
para = doc.add_paragraph(f"[图片: {alt}]")
|
|
788
1580
|
para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
789
1581
|
else:
|
|
790
|
-
para = doc.add_paragraph(f
|
|
1582
|
+
para = doc.add_paragraph(f"[图片: {alt} - 路径: {src}]")
|
|
791
1583
|
para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
792
|
-
|
|
1584
|
+
|
|
793
1585
|
# 保存文档
|
|
794
1586
|
doc.save(output_file)
|
|
795
|
-
print(f"转换完成!文件已保存为 {output_file}")
|
|
796
1587
|
|
|
797
|
-
|
|
1588
|
+
|
|
1589
|
+
if __name__ == "__main__":
    import sys

    # Command-line entry point: exactly two positional arguments are
    # expected after the script name -- the input HTML path and the
    # output .docx path.
    if len(sys.argv) == 3:
        html_file, output_file = sys.argv[1:]
        convert_html_to_docx(html_file, output_file)
    else:
        # Wrong argument count: print the usage line and exit non-zero.
        print("用法: python docx_converter.py <html_file> <output_file>")
        sys.exit(1)
|