@birthday8/doc-mcp 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/index.js +61 -65
- package/install.js +45 -35
- package/package.json +2 -4
- package/python/docx_converter.py +1152 -428
- package/python/html_fixer.py +125 -0
- package/python/html_rules.py +570 -0
- package/python/html_validator.py +174 -0
- package/python/html_validator_strict.py +428 -0
- package/python/sample/example.html +407 -0
- package/python/sample/html_schema.py +283 -0
- package/python/server.py +233 -123
- package/python/test_error_detection.py +84 -0
- package/python/test_strict_validation.py +118 -0
package/python/docx_converter.py
CHANGED
|
@@ -4,13 +4,225 @@ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
|
|
|
4
4
|
from docx.enum.section import WD_SECTION
|
|
5
5
|
from docx.oxml.ns import qn
|
|
6
6
|
from docx.oxml import OxmlElement
|
|
7
|
+
from docx.enum.shape import WD_INLINE_SHAPE
|
|
7
8
|
from bs4 import BeautifulSoup
|
|
8
9
|
import os
|
|
9
10
|
import re
|
|
11
|
+
import json
|
|
12
|
+
|
|
13
|
+
# 尝试导入 math2docx
|
|
14
|
+
try:
|
|
15
|
+
import math2docx
|
|
16
|
+
|
|
17
|
+
HAS_MATH2DOCX = True
|
|
18
|
+
except ImportError:
|
|
19
|
+
HAS_MATH2DOCX = False
|
|
20
|
+
print("Warning: math2docx not installed. Formula support will be limited.")
|
|
21
|
+
|
|
22
|
+
# ==================== Precompiled regular expressions ====================
# Inline-style parsing. Every pattern captures the raw property value (all text
# up to the next ";") in its last group; callers are expected to strip it.
TEXT_ALIGN_RE = re.compile(r"text-align:\s*([^;]+)")
LINE_HEIGHT_RE = re.compile(r"line-height:\s*([^;]+)")
# The lookbehind rejects every hyphenated "...-color" property (background-color,
# border-color, text-decoration-color, ...) so only the plain text colour matches.
COLOR_RE = re.compile(r"(?<![\w-])color:\s*([^;]+)")
BACKGROUND_COLOR_RE = re.compile(r"background-color:\s*([^;]+)")
FONT_FAMILY_RE = re.compile(r"font-family:\s*([^;]+)")
FONT_SIZE_RE = re.compile(r"font-size:\s*([^;]+)")
FONT_WEIGHT_RE = re.compile(r"font-weight:\s*([^;]+)")
FONT_STYLE_RE = re.compile(r"font-style:\s*([^;]+)")
TEXT_DECORATION_RE = re.compile(r"text-decoration:\s*([^;]+)")
# Group 1 is the optional side (top/bottom/left/right, None for the shorthand),
# group 2 is the value.
MARGIN_RE = re.compile(r"margin(?:-(top|bottom|left|right))?:\s*([^;]+)")
PADDING_RE = re.compile(r"padding(?:-(top|bottom|left|right))?:\s*([^;]+)")

# Formula parsing: group 1 holds a $$...$$ body, group 2 a $...$ body.
LATEX_FORMULA_RE = re.compile(r"\$\$(.*?)\$\$|\$(.*?)\$")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# ==================== 常量配置 ====================
|
|
41
|
+
class ConverterConfig:
    """Namespace of constant settings used by the converter."""

    # Fallback typeface and size (presumably points; confirm against callers).
    DEFAULT_FONT = "微软雅黑"
    DEFAULT_SIZE = 12

    # Page geometry in centimetres; height and width equal A4 portrait.
    PAGE_HEIGHT_CM = 29.7
    PAGE_WIDTH_CM = 21.0
    MARGIN_CM = 2.54

    # Font size keyed by heading number 1-6.
    HEADING_SIZES = {1: 18, 2: 16, **dict.fromkeys((3, 4, 5, 6), 14)}

    # Text colour keyed by heading number 1-6.
    HEADING_COLORS = dict(
        enumerate(
            (
                RGBColor(74, 63, 107),
                RGBColor(91, 78, 140),
                RGBColor(107, 91, 149),
                RGBColor(122, 104, 161),
                RGBColor(137, 117, 173),
                RGBColor(152, 130, 185),
            ),
            start=1,
        )
    )

    # Text colour for the special CSS class names.
    CLASS_COLORS = dict(
        red=RGBColor(255, 0, 0),
        blue=RGBColor(0, 0, 255),
        green=RGBColor(0, 128, 0),
        purple=RGBColor(128, 0, 128),
    )

    # Callout-box palettes: background fill and border as RRGGBB hex strings.
    INFO_COLORS = dict(bg="E3F2FD", border="2196F3")
    WARNING_COLORS = dict(bg="FFF3CD", border="FFC107")
    SUCCESS_COLORS = dict(bg="D4EDDA", border="28A745")

    # Colour name -> RRGGBB hex string.
    COLOR_MAP = dict(
        red="FF0000",
        green="008000",
        blue="0000FF",
        yellow="FFFF00",
        orange="FFA500",
        purple="800080",
        pink="FFC0CB",
        brown="A52A2A",
        gray="808080",
        black="000000",
        white="FFFFFF",
    )
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def add_image(doc, image_path, width=None, height=None, align="center"):
    """Insert an image into the document as a paragraph of its own.

    When the file does not exist, or embedding it raises, a grey
    ``[图片: <file name>]`` placeholder paragraph is added instead.

    Args:
        doc: Word document object; only ``add_paragraph()`` is called on it.
        image_path: Path of the image file, checked with ``os.path.exists``.
        width: Optional picture width in inches.
        height: Optional picture height in inches.
        align: ``"center"`` or ``"right"``; any other value left-aligns.

    Returns:
        True when the picture was embedded, False when the placeholder was
        written instead.
    """

    def _add_placeholder():
        # Grey text standing in for the picture that could not be embedded.
        placeholder = doc.add_paragraph()
        label = placeholder.add_run(f"[图片: {os.path.basename(image_path)}]")
        label.font.color.rgb = RGBColor(150, 150, 150)

    if not os.path.exists(image_path):
        print(f"Warning: Image file not found: {image_path}")
        _add_placeholder()
        return False

    try:
        picture_para = doc.add_paragraph()
        if align == "center":
            alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        elif align == "right":
            alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
        else:
            alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
        picture_para.alignment = alignment

        # Pass only the dimensions that were actually supplied (falsy values,
        # including 0, count as "not supplied").
        picture_run = picture_para.add_run()
        dimensions = {}
        if width:
            dimensions["width"] = Inches(width)
        if height:
            dimensions["height"] = Inches(height)
        picture_run.add_picture(image_path, **dimensions)
        return True

    except Exception as e:
        print(f"Warning: Failed to add image {image_path}: {e}")
        import traceback

        traceback.print_exc()
        _add_placeholder()
        return False
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# Power/index shorthands that have a dedicated Unicode character.
_LATEX_SCRIPT_SYMBOLS = {
    "^2": "²",
    "^3": "³",
    "_2": "₂",
    "_3": "₃",
}

# LaTeX command name (without the backslash) -> Unicode symbol.
_LATEX_COMMAND_SYMBOLS = {
    "cdot": "·",
    "times": "×",
    "div": "÷",
    "neq": "≠",
    "leq": "≤",
    "geq": "≥",
    "pm": "±",
    "sqrt": "√",
    "pi": "π",
    "alpha": "α",
    "beta": "β",
    "gamma": "γ",
    "delta": "δ",
    "theta": "θ",
    "lambda": "λ",
    "mu": "μ",
    "sigma": "σ",
    "phi": "φ",
    "omega": "ω",
    "infty": "∞",
}

# Either a whole command name (so "\pitchfork" is never mistaken for "\pi"),
# or a ^2/^3/_2/_3 script that is not the start of a longer number.
# A doubled backslash before a command and a single backslash before a script
# are also accepted, because those are the literal spellings the previous
# str.replace-based table happened to match.
_LATEX_SYMBOL_RE = re.compile(
    r"\\{1,2}(?P<command>[A-Za-z]+)|\\?(?P<script>[\^_][23])(?!\d)"
)


def latex_to_unicode_formula(latex_formula):
    """Replace common LaTeX commands and scripts with Unicode symbols.

    The earlier implementation stored regex-escaped keys (``\\^2``,
    ``\\\\cdot``) but applied them with ``str.replace``, so ordinary input
    such as ``x^2`` or ``\\cdot`` was returned unchanged. This version does a
    single regex pass over the text instead.

    Args:
        latex_formula: LaTeX source text. Empty or ``None`` is returned as is.

    Returns:
        The text with every known command/script replaced; unknown commands
        and everything else are left untouched.
    """
    if not latex_formula:
        return latex_formula

    def _substitute(match):
        command = match.group("command")
        if command is not None:
            # Unknown commands keep their original spelling.
            return _LATEX_COMMAND_SYMBOLS.get(command, match.group(0))
        return _LATEX_SCRIPT_SYMBOLS[match.group("script")]

    return _LATEX_SYMBOL_RE.sub(_substitute, latex_formula)
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def add_native_formula(
    para,
    latex_formula,
):
    """Insert ``latex_formula`` into ``para`` through ``math2docx``.

    Returns:
        True when ``math2docx.add_math`` completed, False when math2docx is
        not available or the call raised (the error is printed together with
        its traceback).
    """
    if not HAS_MATH2DOCX:
        return False

    try:
        math2docx.add_math(para, latex_formula)
    except Exception as e:
        print(f"Warning: Failed to add native formula: {e}")
        import traceback

        traceback.print_exc()
        return False
    return True
|
|
221
|
+
|
|
10
222
|
|
|
11
223
|
def parse_color(color_str):
|
|
12
224
|
"""解析颜色字符串为RGBColor"""
|
|
13
|
-
if not color_str or not color_str.startswith(
|
|
225
|
+
if not color_str or not color_str.startswith("#"):
|
|
14
226
|
return None
|
|
15
227
|
try:
|
|
16
228
|
r = int(color_str[1:3], 16)
|
|
@@ -19,186 +231,114 @@ def parse_color(color_str):
|
|
|
19
231
|
return RGBColor(r, g, b)
|
|
20
232
|
except:
|
|
21
233
|
return None
|
|
234
|
+
raise
|
|
235
|
+
|
|
22
236
|
|
|
23
|
-
def set_font(
    run,
    font_name="微软雅黑",
    size=12,
    color=None,
    bold=False,
    italic=False,
    underline=False,
    strike=False,
    highlight_color=None,
):
    """Apply font settings to a run.

    Args:
        run: The run to format.
        font_name: Typeface name; also written to the run's East Asian font
            attribute.
        size: Font size in points.
        color: Value assigned to ``run.font.color.rgb``; skipped when falsy.
        bold: Assigned to ``run.font.bold``.
        italic: Assigned to ``run.font.italic``.
        underline: Underline is switched on when truthy (never switched off).
        strike: Strikethrough is switched on when truthy (never switched off).
        highlight_color: Optional highlight name (one of the keys of the map
            below); names that are not in the map are ignored.
    """
    run.font.name = font_name
    # Also fill the East Asian font slot so CJK text uses the same typeface.
    run._element.rPr.rFonts.set(qn("w:eastAsia"), font_name)
    run.font.size = Pt(size)
    run.font.bold = bold
    run.font.italic = italic

    if color:
        run.font.color.rgb = color

    if underline:
        run.font.underline = True

    if strike:
        run.font.strike = True

    if highlight_color:
        from docx.enum.text import WD_COLOR_INDEX

        # WD_COLOR_INDEX has no CYAN, MAGENTA or ORANGE members; looking them
        # up raised AttributeError as soon as any highlight was requested.
        # TURQUOISE and PINK are the members python-docx serialises as "cyan"
        # and "magenta"; orange has no counterpart, so it falls back to
        # YELLOW as _apply_highlight in this file does.
        color_map = {
            "yellow": WD_COLOR_INDEX.YELLOW,
            "green": WD_COLOR_INDEX.BRIGHT_GREEN,
            "cyan": WD_COLOR_INDEX.TURQUOISE,
            "magenta": WD_COLOR_INDEX.PINK,
            "blue": WD_COLOR_INDEX.TURQUOISE,
            "red": WD_COLOR_INDEX.RED,
            "darkblue": WD_COLOR_INDEX.BLUE,
            "orange": WD_COLOR_INDEX.YELLOW,
            "gray": WD_COLOR_INDEX.GRAY_25,
        }
        if highlight_color in color_map:
            run.font.highlight_color = color_map[highlight_color]
|
|
56
280
|
|
|
57
|
-
def process_inline_elements(element, parent_run=None):
|
|
58
|
-
"""处理内联元素"""
|
|
59
|
-
from docx.text.paragraph import Paragraph
|
|
60
|
-
|
|
61
|
-
runs = []
|
|
62
|
-
|
|
63
|
-
for child in element.children:
|
|
64
|
-
if child.name is None: # 文本节点
|
|
65
|
-
text = str(child).strip()
|
|
66
|
-
if text:
|
|
67
|
-
if parent_run:
|
|
68
|
-
parent_run.add_text(text)
|
|
69
|
-
else:
|
|
70
|
-
runs.append({'text': text})
|
|
71
|
-
elif child.name == 'strong' or child.name == 'b':
|
|
72
|
-
if parent_run:
|
|
73
|
-
parent_run.bold = True
|
|
74
|
-
process_inline_elements(child, parent_run)
|
|
75
|
-
else:
|
|
76
|
-
runs.append({'text': child.get_text(), 'bold': True})
|
|
77
|
-
elif child.name == 'em' or child.name == 'i':
|
|
78
|
-
if parent_run:
|
|
79
|
-
parent_run.italic = True
|
|
80
|
-
process_inline_elements(child, parent_run)
|
|
81
|
-
else:
|
|
82
|
-
runs.append({'text': child.get_text(), 'italic': True})
|
|
83
|
-
elif child.name == 'u':
|
|
84
|
-
if parent_run:
|
|
85
|
-
parent_run.underline = True
|
|
86
|
-
process_inline_elements(child, parent_run)
|
|
87
|
-
else:
|
|
88
|
-
runs.append({'text': child.get_text(), 'underline': True})
|
|
89
|
-
elif child.name == 's' or child.name == 'del':
|
|
90
|
-
if parent_run:
|
|
91
|
-
parent_run.strike = True
|
|
92
|
-
process_inline_elements(child, parent_run)
|
|
93
|
-
else:
|
|
94
|
-
runs.append({'text': child.get_text(), 'strike': True})
|
|
95
|
-
elif child.name == 'sup':
|
|
96
|
-
if parent_run:
|
|
97
|
-
parent_run.font.superscript = True
|
|
98
|
-
process_inline_elements(child, parent_run)
|
|
99
|
-
else:
|
|
100
|
-
runs.append({'text': child.get_text(), 'superscript': True})
|
|
101
|
-
elif child.name == 'sub':
|
|
102
|
-
if parent_run:
|
|
103
|
-
parent_run.font.subscript = True
|
|
104
|
-
process_inline_elements(child, parent_run)
|
|
105
|
-
else:
|
|
106
|
-
runs.append({'text': child.get_text(), 'subscript': True})
|
|
107
|
-
elif child.name == 'code':
|
|
108
|
-
code_text = child.get_text()
|
|
109
|
-
if parent_run:
|
|
110
|
-
parent_run.font.name = 'Consolas'
|
|
111
|
-
parent_run.font.size = Pt(10)
|
|
112
|
-
parent_run.add_text(code_text)
|
|
113
|
-
else:
|
|
114
|
-
runs.append({'text': code_text, 'font': 'Consolas', 'size': 10})
|
|
115
|
-
elif child.name == 'a':
|
|
116
|
-
link_text = child.get_text()
|
|
117
|
-
href = child.get('href', '')
|
|
118
|
-
if parent_run:
|
|
119
|
-
parent_run.add_text(link_text)
|
|
120
|
-
else:
|
|
121
|
-
runs.append({'text': link_text, 'link': href})
|
|
122
|
-
elif child.name == 'span':
|
|
123
|
-
style = child.get('style', '')
|
|
124
|
-
color_match = re.search(r'color:\s*([^;]+)', style)
|
|
125
|
-
bg_match = re.search(r'background(?:-color)?:\s*([^;]+)', style)
|
|
126
|
-
|
|
127
|
-
props = {'text': child.get_text()}
|
|
128
|
-
if color_match:
|
|
129
|
-
color = parse_color(color_match.group(1).strip())
|
|
130
|
-
if color:
|
|
131
|
-
props['color'] = color
|
|
132
|
-
if bg_match:
|
|
133
|
-
bg_color = bg_match.group(1).strip()
|
|
134
|
-
if bg_color.startswith('#'):
|
|
135
|
-
bg_rgb = parse_color(bg_color)
|
|
136
|
-
if bg_rgb:
|
|
137
|
-
props['highlight'] = str(bg_rgb)
|
|
138
|
-
|
|
139
|
-
if parent_run:
|
|
140
|
-
if 'color' in props:
|
|
141
|
-
parent_run.font.color.rgb = props['color']
|
|
142
|
-
process_inline_elements(child, parent_run)
|
|
143
|
-
else:
|
|
144
|
-
runs.append(props)
|
|
145
|
-
else:
|
|
146
|
-
process_inline_elements(child, parent_run)
|
|
147
|
-
|
|
148
|
-
return runs
|
|
149
281
|
|
|
150
282
|
def _apply_highlight(run, bg_color):
    """Apply a background colour to ``run``.

    Colour names are mapped onto ``WD_COLOR_INDEX`` highlight values. Hex
    values (``#RGB`` or ``#RRGGBB``) are written as run shading (``w:shd``)
    instead. Anything else leaves the run untouched.

    Args:
        run: The run to modify.
        bg_color: Colour name or hex string; case and surrounding whitespace
            are ignored.
    """
    from docx.enum.text import WD_COLOR_INDEX

    # Colour name -> WD_COLOR_INDEX member (only members that exist are used).
    color_map = {
        "yellow": WD_COLOR_INDEX.YELLOW,
        "green": WD_COLOR_INDEX.GREEN,
        "brightgreen": WD_COLOR_INDEX.BRIGHT_GREEN,
        "blue": WD_COLOR_INDEX.BLUE,
        "darkblue": WD_COLOR_INDEX.DARK_BLUE,
        "red": WD_COLOR_INDEX.RED,
        "darkred": WD_COLOR_INDEX.DARK_RED,
        "darkyellow": WD_COLOR_INDEX.DARK_YELLOW,
        "lightgray": WD_COLOR_INDEX.GRAY_25,
        "gray": WD_COLOR_INDEX.GRAY_50,
        "black": WD_COLOR_INDEX.BLACK,
        "white": WD_COLOR_INDEX.WHITE,
        "pink": WD_COLOR_INDEX.PINK,
        "teal": WD_COLOR_INDEX.TEAL,
        "turquoise": WD_COLOR_INDEX.TURQUOISE,
        "violet": WD_COLOR_INDEX.VIOLET,
        "cyan": WD_COLOR_INDEX.TURQUOISE,
        "magenta": WD_COLOR_INDEX.VIOLET,
    }

    # Normalise the colour name before any lookup.
    bg_lower = bg_color.lower().strip()

    if bg_lower in color_map:
        run.font.highlight_color = color_map[bg_lower]
    elif bg_lower.startswith("#"):
        hex_digits = bg_lower[1:]
        if len(hex_digits) == 3:
            # CSS shorthand: "#abc" means "#aabbcc".
            hex_digits = "".join(digit * 2 for digit in hex_digits)
        # Only a well-formed RRGGBB value is written; a malformed fill would
        # otherwise end up verbatim in the document XML.
        if len(hex_digits) == 6 and all(
            digit in "0123456789abcdef" for digit in hex_digits
        ):
            shading_elm = OxmlElement("w:shd")
            # w:val is a required attribute of w:shd; "clear" means a plain
            # fill with no pattern.
            shading_elm.set(qn("w:val"), "clear")
            shading_elm.set(qn("w:color"), "auto")
            shading_elm.set(qn("w:fill"), hex_digits.upper())
            run._element.get_or_add_rPr().append(shading_elm)
    else:
        # Other common names are approximated by the nearest highlight value.
        similar_colors = {
            "lightblue": WD_COLOR_INDEX.TURQUOISE,
            "lightyellow": WD_COLOR_INDEX.YELLOW,
            "lightgreen": WD_COLOR_INDEX.BRIGHT_GREEN,
            "orange": WD_COLOR_INDEX.YELLOW,  # orange is shown as yellow
            "purple": WD_COLOR_INDEX.VIOLET,
            "brown": WD_COLOR_INDEX.DARK_YELLOW,
        }
        if bg_lower in similar_colors:
            run.font.highlight_color = similar_colors[bg_lower]
|
|
199
331
|
|
|
200
|
-
|
|
201
|
-
|
|
332
|
+
|
|
333
|
+
def process_paragraph(
|
|
334
|
+
paragraph,
|
|
335
|
+
doc,
|
|
336
|
+
default_font="微软雅黑",
|
|
337
|
+
default_size=12,
|
|
338
|
+
indent=None,
|
|
339
|
+
align=None,
|
|
340
|
+
line_spacing=None,
|
|
341
|
+
):
|
|
202
342
|
"""处理段落及其内联元素"""
|
|
203
343
|
para = doc.add_paragraph()
|
|
204
344
|
|
|
@@ -211,7 +351,7 @@ def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=
|
|
|
211
351
|
para.paragraph_format.first_line_indent = Inches(indent)
|
|
212
352
|
else:
|
|
213
353
|
# 从data-indent属性读取缩进(单位:em)
|
|
214
|
-
data_indent = paragraph.get(
|
|
354
|
+
data_indent = paragraph.get("data-indent", "")
|
|
215
355
|
if data_indent:
|
|
216
356
|
try:
|
|
217
357
|
em_count = float(data_indent)
|
|
@@ -224,247 +364,536 @@ def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=
|
|
|
224
364
|
if line_spacing:
|
|
225
365
|
para.paragraph_format.line_spacing = line_spacing
|
|
226
366
|
|
|
367
|
+
# 解析段落的样式(包括行距和段距)
|
|
368
|
+
style = paragraph.get("style", "")
|
|
369
|
+
|
|
370
|
+
# 解析对齐方式
|
|
371
|
+
text_align_match = TEXT_ALIGN_RE.search(style)
|
|
372
|
+
if text_align_match:
|
|
373
|
+
align_str = text_align_match.group(1).strip().lower()
|
|
374
|
+
if align_str == "left":
|
|
375
|
+
para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
|
|
376
|
+
elif align_str == "center":
|
|
377
|
+
para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
378
|
+
elif align_str == "right":
|
|
379
|
+
para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
|
|
380
|
+
elif align_str == "justify":
|
|
381
|
+
para.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
|
|
382
|
+
|
|
383
|
+
# 解析行距
|
|
384
|
+
line_height_match = LINE_HEIGHT_RE.search(style)
|
|
385
|
+
if line_height_match:
|
|
386
|
+
line_height_str = line_height_match.group(1).strip()
|
|
387
|
+
if line_height_str.endswith("pt"):
|
|
388
|
+
# 固定行距
|
|
389
|
+
para.paragraph_format.line_spacing = float(line_height_str[:-2])
|
|
390
|
+
elif line_height_str.endswith("px"):
|
|
391
|
+
# px转换为pt
|
|
392
|
+
para.paragraph_format.line_spacing = float(line_height_str[:-2]) * 0.75
|
|
393
|
+
elif line_height_str.endswith("em"):
|
|
394
|
+
# em转换为pt(基于段落字号)
|
|
395
|
+
para.paragraph_format.line_spacing = para_size * float(line_height_str[:-2])
|
|
396
|
+
else:
|
|
397
|
+
# 尝试作为倍数处理
|
|
398
|
+
line_spacing_value = float(line_height_str)
|
|
399
|
+
para.paragraph_format.line_spacing = line_spacing_value
|
|
400
|
+
|
|
401
|
+
# 解析段后距
|
|
402
|
+
margin_bottom_match = MARGIN_RE.search(style)
|
|
403
|
+
if margin_bottom_match:
|
|
404
|
+
margin_bottom_str = margin_bottom_match.group(2).strip()
|
|
405
|
+
if margin_bottom_str.endswith("pt"):
|
|
406
|
+
para.paragraph_format.space_after = Pt(float(margin_bottom_str[:-2]))
|
|
407
|
+
elif margin_bottom_str.endswith("px"):
|
|
408
|
+
# px转换为pt
|
|
409
|
+
para.paragraph_format.space_after = Pt(float(margin_bottom_str[:-2]) * 0.75)
|
|
410
|
+
elif margin_bottom_str.endswith("em"):
|
|
411
|
+
# em转换为pt(基于段落字号)
|
|
412
|
+
para.paragraph_format.space_after = Pt(
|
|
413
|
+
para_size * float(margin_bottom_str[:-2])
|
|
414
|
+
)
|
|
415
|
+
else:
|
|
416
|
+
# 尝试作为pt处理
|
|
417
|
+
para.paragraph_format.space_after = Pt(float(margin_bottom_str))
|
|
418
|
+
|
|
419
|
+
# 解析段前距
|
|
420
|
+
margin_top_match = MARGIN_RE.search(style)
|
|
421
|
+
if margin_top_match:
|
|
422
|
+
margin_top_str = margin_top_match.group(2).strip()
|
|
423
|
+
if margin_top_str.endswith("pt"):
|
|
424
|
+
para.paragraph_format.space_before = Pt(float(margin_top_str[:-2]))
|
|
425
|
+
elif margin_top_str.endswith("px"):
|
|
426
|
+
# px转换为pt
|
|
427
|
+
para.paragraph_format.space_before = Pt(float(margin_top_str[:-2]) * 0.75)
|
|
428
|
+
elif margin_top_str.endswith("em"):
|
|
429
|
+
# em转换为pt(基于段落字号)
|
|
430
|
+
para.paragraph_format.space_before = Pt(
|
|
431
|
+
para_size * float(margin_top_str[:-2])
|
|
432
|
+
)
|
|
433
|
+
else:
|
|
434
|
+
# 尝试作为pt处理
|
|
435
|
+
para.paragraph_format.space_before = Pt(float(margin_top_str))
|
|
436
|
+
|
|
437
|
+
# 解析段落的字号
|
|
438
|
+
para_size = default_size
|
|
439
|
+
style = paragraph.get("style", "")
|
|
440
|
+
size_match = FONT_SIZE_RE.search(style)
|
|
441
|
+
if size_match:
|
|
442
|
+
size_str = size_match.group(1).strip()
|
|
443
|
+
# 处理不同单位:pt, px, em等
|
|
444
|
+
if size_str.endswith("pt"):
|
|
445
|
+
para_size = float(size_str[:-2])
|
|
446
|
+
elif size_str.endswith("px"):
|
|
447
|
+
# px转换为pt (1px ≈ 0.75pt)
|
|
448
|
+
para_size = float(size_str[:-2]) * 0.75
|
|
449
|
+
elif size_str.endswith("em"):
|
|
450
|
+
# em转换为pt (假设基础字号为12pt)
|
|
451
|
+
para_size = float(size_str[:-2]) * 12
|
|
452
|
+
else:
|
|
453
|
+
# 尝试直接解析为数字
|
|
454
|
+
para_size = float(size_str)
|
|
455
|
+
|
|
227
456
|
# 处理段落内容 - 递归处理所有子元素
|
|
228
|
-
_process_element_to_runs(paragraph, para, default_font,
|
|
457
|
+
_process_element_to_runs(paragraph, para, default_font, para_size)
|
|
229
458
|
|
|
230
459
|
return para
|
|
231
460
|
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
461
|
+
|
|
462
|
+
def _font_size_to_pt(size_str, fallback):
    """Convert a CSS ``font-size`` value to points, or return ``fallback``."""
    value = size_str.strip().lower()
    try:
        if value.endswith("pt"):
            return float(value[:-2])
        if value.endswith("px"):
            # 1px is treated as 0.75pt.
            return float(value[:-2]) * 0.75
        if value.endswith("em"):
            # em is resolved against a fixed 12pt base.
            return float(value[:-2]) * 12
        # A bare number is taken as points.
        return float(value)
    except ValueError:
        # Keywords, percentages, "rem", "!important", ...: keep the inherited
        # size instead of aborting the whole conversion.
        return fallback


def _process_element_to_runs(
    element,
    para,
    default_font="微软雅黑",
    default_size=12,
    bold=False,
    italic=False,
    underline=False,
    strike=False,
    color=None,
    bg_color=None,
    font_name=None,
    font_size=None,
):
    """Walk the children of ``element`` and add a run to ``para`` per text piece.

    Formatting is accumulated on the way down: every inline tag adds its own
    attribute to the inherited state, and each text node becomes one run
    carrying the state that is active at that point.

    Args:
        element: Parsed HTML element whose ``children`` are processed.
        para: Paragraph that receives the runs.
        default_font: Typeface used when no ``font_name`` is inherited.
        default_size: Size used when no ``font_size`` is inherited.
        bold, italic, underline, strike: Inherited character formatting.
        color: Inherited text colour (assigned to ``run.font.color.rgb``).
        bg_color: Inherited background colour, handed to ``_apply_highlight``.
        font_name: Inherited typeface override, or None.
        font_size: Inherited size override, or None.
    """
    current_font = font_name or default_font
    current_size = font_size or default_size

    def add_styled_run(text):
        # One run carrying the formatting state active at this level.
        run = para.add_run(text)
        set_font(
            run,
            font_name=current_font,
            size=current_size,
            bold=bold,
            italic=italic,
            underline=underline,
            strike=strike,
        )
        if color:
            run.font.color.rgb = color
        if bg_color:
            _apply_highlight(run, bg_color)
        return run

    def recurse(node, **overrides):
        # Descend with the full inherited state. font_name/font_size are
        # forwarded as well, so a font set by an enclosing <span> survives
        # nested <b>/<i>/<u>/... tags instead of reverting to the defaults.
        inherited = {
            "bold": bold,
            "italic": italic,
            "underline": underline,
            "strike": strike,
            "color": color,
            "bg_color": bg_color,
            "font_name": font_name,
            "font_size": font_size,
        }
        inherited.update(overrides)
        _process_element_to_runs(
            node, para, default_font, default_size, **inherited
        )

    def handle_script(node, attribute):
        # <sup>/<sub>: every run created for the contents gets the flag.
        for inner in node.children:
            first_new = len(para.runs)
            if inner.name is None:
                # Text directly inside sup/sub is kept verbatim.
                add_styled_run(str(inner))
            else:
                # Dispatch on the nested tag itself so that e.g.
                # <sup><b>x</b></sup> is both bold and superscript.
                handle_node(inner)
            # Mark exactly the runs added for this child. Counting runs is
            # reliable; counting descendant tags is not.
            for run in para.runs[first_new:]:
                setattr(run.font, attribute, True)

    def handle_span(node):
        style = node.get("style", "")
        class_set = set(node.get("class", []))

        # Text colour from the inline style ...
        span_color = color
        color_match = COLOR_RE.search(style)
        if color_match:
            parsed = parse_color(color_match.group(1).strip())
            if parsed:
                span_color = parsed

        # Typeface: the first entry of the CSS font-family fallback list.
        span_font = current_font
        font_match = FONT_FAMILY_RE.search(style)
        if font_match:
            family = font_match.group(1).split(",")[0].strip().strip("'\"").strip()
            if family:
                span_font = family

        span_size = current_size
        size_match = FONT_SIZE_RE.search(style)
        if size_match:
            span_size = _font_size_to_pt(size_match.group(1), current_size)

        # ... overridden by the first matching colour class.
        for class_name in ("red", "blue", "green", "purple"):
            if class_name in class_set:
                span_color = ConverterConfig.CLASS_COLORS[class_name]
                break

        # Background colour; the "highlight" class wins over the inline style.
        span_bg = bg_color
        bg_match = BACKGROUND_COLOR_RE.search(style)
        if bg_match:
            span_bg = bg_match.group(1).strip()
        if "highlight" in class_set:
            span_bg = "yellow"

        recurse(
            node,
            color=span_color,
            bg_color=span_bg,
            font_name=span_font,
            font_size=span_size,
        )

    def handle_node(child):
        tag = child.name
        if tag is None:
            # Text node: collapse every whitespace run to a single space.
            # NOTE(review): this also drops leading/trailing spaces of each
            # text node, so words on either side of an inline tag are joined
            # without a space - confirm this is intended.
            text = " ".join(str(child).split())
            if text:
                add_styled_run(text)
        elif tag in ("math", "latex"):
            # Native Word formula. It does not receive the inherited
            # colour/bold/italic state.
            latex_formula = child.get_text().strip()
            if latex_formula and HAS_MATH2DOCX:
                if add_native_formula(para, latex_formula):
                    return
            # Fallback: show the LaTeX source as code-like text.
            run = para.add_run(latex_formula)
            set_font(run, font_name="Consolas", size=10, color=RGBColor(0, 0, 128))
        elif tag in ("strong", "b"):
            recurse(child, bold=True)
        elif tag in ("em", "i"):
            recurse(child, italic=True)
        elif tag == "u":
            recurse(child, underline=True)
        elif tag in ("s", "del"):
            recurse(child, strike=True)
        elif tag == "sup":
            handle_script(child, "superscript")
        elif tag == "sub":
            handle_script(child, "subscript")
        elif tag == "code":
            run = para.add_run(child.get_text())
            set_font(run, font_name="Consolas", size=10)
        elif tag == "a":
            # Links are rendered as blue underlined text only.
            run = para.add_run(child.get_text())
            set_font(run, font_name=current_font, size=current_size)
            run.font.underline = True
            run.font.color.rgb = RGBColor(0, 0, 255)
        elif tag == "span":
            handle_span(child)
        else:
            # Any other tag: process its children with the unchanged state.
            recurse(child)

    for child in element.children:
        handle_node(child)
|
|
748
|
+
|
|
348
749
|
|
|
349
|
-
def process_list_items(
|
|
750
|
+
def process_list_items(
    items, doc, ordered=False, default_font="微软雅黑", default_size=12, level=0
):
    """Render HTML list items as Word list paragraphs, recursing into nested lists.

    Args:
        items: Iterable of ``<li>`` tags (BeautifulSoup elements).
        doc: Target document; one paragraph is added per item.
        ordered: Use the "List Number" style when true, "List Bullet" otherwise.
        default_font: Font name forwarded to ``_process_element_to_runs``.
        default_size: Font size forwarded to ``_process_element_to_runs``.
        level: Zero-based nesting depth; each level indents a further 0.25 inch.
    """
    list_style = "List Number" if ordered else "List Bullet"

    for item in items:
        para = doc.add_paragraph(style=list_style)
        para.paragraph_format.left_indent = Inches(0.25 * (level + 1))

        # Work on a re-parsed copy so that stripping the nested lists does not
        # mutate the caller's tree; the original sub-lists are needed below.
        item_copy = BeautifulSoup(str(item), "html.parser").find("li")
        if item_copy is not None:
            for nested in item_copy.find_all(["ul", "ol"], recursive=False):
                nested.decompose()

            # Only emit runs when something other than whitespace remains.
            if item_copy.get_text().strip():
                _process_element_to_runs(item_copy, para, default_font, default_size)

        # Recurse into every direct child list in document order. Looking up
        # only the first <ul> and the first <ol> would drop additional
        # sub-lists and always emit bullets before numbers.
        for nested_list in item.find_all(["ul", "ol"], recursive=False):
            process_list_items(
                nested_list.find_all("li", recursive=False),
                doc,
                ordered=nested_list.name == "ol",
                default_font=default_font,
                default_size=default_size,
                level=level + 1,
            )
|
|
801
|
+
|
|
386
802
|
|
|
387
803
|
def _parse_style(style_str):
|
|
388
804
|
"""解析style字符串为字典"""
|
|
389
805
|
styles = {}
|
|
390
806
|
if not style_str:
|
|
391
807
|
return styles
|
|
392
|
-
for item in style_str.split(
|
|
393
|
-
if
|
|
394
|
-
key, value = item.split(
|
|
808
|
+
for item in style_str.split(";"):
|
|
809
|
+
if ":" in item:
|
|
810
|
+
key, value = item.split(":", 1)
|
|
395
811
|
styles[key.strip()] = value.strip()
|
|
396
812
|
return styles
|
|
397
813
|
|
|
814
|
+
|
|
398
815
|
def _apply_cell_style(cell_elem, style_dict):
    """Apply an inline background colour and text colour to a table cell.

    Only ``#``-prefixed hex colours are honoured; any other notation is
    skipped without error.

    Args:
        cell_elem: Table cell whose shading and first-paragraph runs are styled.
        style_dict: Parsed inline style (property name -> value).
    """
    # Background colour -> cell shading.
    bg_color = style_dict.get("background-color", "")
    if bg_color.startswith("#"):
        fill = bg_color[1:].upper()
        if len(fill) == 3:
            # Expand CSS shorthand (#abc -> AABBCC); the fill attribute is
            # written as a six-digit hex value.
            fill = "".join(digit * 2 for digit in fill)
        shading_elm = OxmlElement("w:shd")
        shading_elm.set(qn("w:fill"), fill)
        cell_elem._element.get_or_add_tcPr().append(shading_elm)

    # Text colour -> every run of the cell's first paragraph.
    color = style_dict.get("color", "")
    if color.startswith("#"):
        rgb = parse_color(color)
        if rgb:
            for run in cell_elem.paragraphs[0].runs:
                run.font.color.rgb = rgb
|
|
416
833
|
|
|
417
|
-
|
|
834
|
+
|
|
835
|
+
def process_table(table, doc, default_font="微软雅黑", default_size=11):
    """Convert an HTML ``<table>`` into a Word table, honouring inline styles.

    Cell content is rendered through ``_process_element_to_runs`` so inline
    formatting is kept. ``<th>`` cells are bold, cells are centred unless
    ``text-align`` says otherwise, and a row-level ``background-color`` is
    used for cells that do not set their own.

    Args:
        table: The ``<table>`` tag (BeautifulSoup element).
        doc: Target document; the table is appended to it.
        default_font: Font name forwarded to ``_process_element_to_runs``.
        default_size: Font size forwarded to ``_process_element_to_runs``.
    """
    # find_all() is recursive, so keep only the rows whose nearest enclosing
    # table is this one; rows of nested tables must not become outer rows.
    rows = [tr for tr in table.find_all("tr") if tr.find_parent("table") is table]
    if not rows:
        return

    # Direct cells only, for the same reason; collected once and reused.
    cells_by_row = [row.find_all(["td", "th"], recursive=False) for row in rows]
    cols = max(len(cells) for cells in cells_by_row)
    if cols == 0:
        # Rows without any cells: there is nothing to lay out.
        return

    word_table = doc.add_table(rows=len(rows), cols=cols)
    word_table.style = "Table Grid"

    alignments = {
        "center": WD_PARAGRAPH_ALIGNMENT.CENTER,
        "left": WD_PARAGRAPH_ALIGNMENT.LEFT,
        "right": WD_PARAGRAPH_ALIGNMENT.RIGHT,
    }

    for row_idx, (row, cells) in enumerate(zip(rows, cells_by_row)):
        # Row-level background colour, used as a fallback for its cells.
        row_bg = _parse_style(row.get("style", "")).get("background-color", "")

        # Fetch the Word cells once per row instead of once per cell.
        word_cells = word_table.rows[row_idx].cells
        for col_idx, cell in enumerate(cells):
            cell_elem = word_cells[col_idx]
            cell_style = _parse_style(cell.get("style", ""))

            # Reuse the cell's default paragraph for the rendered content.
            paragraph = cell_elem.paragraphs[0]
            paragraph.clear()
            _process_element_to_runs(
                cell,
                paragraph,
                default_font=default_font,
                default_size=default_size,
            )

            # Header cells are bold.
            if cell.name == "th":
                for run in paragraph.runs:
                    run.font.bold = True

            # Centre by default; unrecognised values leave alignment untouched.
            align = cell_style.get("text-align", "center")
            if align in alignments:
                paragraph.alignment = alignments[align]

            # Cell-level background and text colour.
            _apply_cell_style(cell_elem, cell_style)

            # Fall back to the row background when the cell sets none.
            if (
                row_bg.startswith("#")
                and not cell_style.get("background-color")
            ):
                fill = row_bg[1:].upper()
                if len(fill) == 3:
                    # Expand CSS shorthand (#abc -> AABBCC).
                    fill = "".join(digit * 2 for digit in fill)
                shading_elm = OxmlElement("w:shd")
                shading_elm.set(qn("w:fill"), fill)
                cell_elem._element.get_or_add_tcPr().append(shading_elm)
|
|
467
895
|
|
|
896
|
+
|
|
468
897
|
def set_section_columns(section, cols_num=2, space=720):
    """Configure a multi-column layout on a section.

    Args:
        section: Section whose ``w:sectPr`` element is modified in place.
        cols_num: Number of columns.
        space: Gap between columns in twips (1 inch = 1440 twips); the
            default of 720 is 0.5 inch.
    """
    sectPr = section._sectPr

    # Reuse an existing <w:cols> when the section already has one (for
    # example after an earlier call); appending unconditionally would leave
    # two <w:cols> children on the same sectPr.
    cols = sectPr.find(qn("w:cols"))
    if cols is None:
        cols = OxmlElement("w:cols")
        sectPr.append(cols)

    cols.set(qn("w:num"), str(cols_num))
    cols.set(qn("w:space"), str(space))
|
|
481
910
|
|
|
911
|
+
|
|
482
912
|
def add_columns_section(doc, cols_num=2, space=720):
|
|
483
913
|
"""添加连续分节符并设置多栏布局(不换页)
|
|
484
914
|
|
|
@@ -496,312 +926,606 @@ def add_columns_section(doc, cols_num=2, space=720):
|
|
|
496
926
|
set_section_columns(section, cols_num, space)
|
|
497
927
|
return section
|
|
498
928
|
|
|
929
|
+
|
|
499
930
|
def _process_blockquote(blockquote_elem, doc, level=0):
    """Render a ``<blockquote>`` and, recursively, the quotes nested inside it.

    The quote's own content (everything except directly nested
    ``<blockquote>`` children) becomes one italic grey paragraph with a grey
    left border, indented by 0.3 inch per nesting level. Nested quotes follow
    as further paragraphs at ``level + 1``.

    Args:
        blockquote_elem: The ``<blockquote>`` tag (BeautifulSoup element).
        doc: Target document.
        level: Zero-based nesting depth of this quote.
    """
    import copy  # local import so the block does not depend on file-level imports

    # Snapshot the direct children that are not nested quotes, so the tree is
    # never touched while it is being iterated.
    direct_children = [
        child for child in blockquote_elem.children if child.name != "blockquote"
    ]

    def _has_text(node):
        # Text nodes have no tag name; tags are judged by their text content.
        if node.name is None:
            return bool(str(node).strip())
        return bool(node.get_text().strip())

    # Only create a paragraph when the quote has content of its own.
    if any(_has_text(child) for child in direct_children):
        para = doc.add_paragraph()
        para.paragraph_format.left_indent = Inches(0.3 * level)
        para.paragraph_format.right_indent = Inches(0.5)

        # Grey left border.
        pBdr = OxmlElement("w:pBdr")
        left_border = OxmlElement("w:left")
        left_border.set(qn("w:val"), "single")
        left_border.set(qn("w:sz"), "18")
        left_border.set(qn("w:color"), "CCCCCC")
        pBdr.append(left_border)
        para.paragraph_format._element.get_or_add_pPr().append(pBdr)

        # Collect copies of the direct content in a scratch <div>. Copies are
        # required because appending a live node would move it out of the
        # source tree.
        temp_soup = BeautifulSoup("<div></div>", "html.parser")
        temp_div = temp_soup.div
        for child in direct_children:
            temp_div.append(copy.copy(child))

        # Quote text defaults to italic grey; inline formatting is preserved.
        _process_element_to_runs(
            temp_div,
            para,
            default_font="微软雅黑",
            default_size=12,
            italic=True,
            color=RGBColor(100, 100, 100),
        )

    # Directly nested quotes are rendered one level deeper.
    for nested in blockquote_elem.find_all("blockquote", recursive=False):
        _process_blockquote(nested, doc, level + 1)
|
|
532
986
|
|
|
987
|
+
|
|
533
988
|
def add_page_break(doc):
    """Append a page break to ``doc``."""
    doc.add_page_break()
|
|
536
991
|
|
|
992
|
+
|
|
537
993
|
def add_horizontal_rule(doc):
    """Add a horizontal rule: a centred, light-grey line of 50 underscores."""
    rule_paragraph = doc.add_paragraph()
    rule_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    rule_run = rule_paragraph.add_run("_" * 50)
    rule_run.font.color.rgb = RGBColor(200, 200, 200)
|
|
543
999
|
|
|
544
|
-
|
|
1000
|
+
|
|
1001
|
+
# ==================== 辅助函数 ====================
|
|
1002
|
+
def _init_document(default_font, default_size):
    """Create a new document with the default font and page setup applied.

    Args:
        default_font: Font name for the "Normal" style (also written to the
            style's ``w:eastAsia`` font attribute).
        default_size: Font size of the "Normal" style, in points.

    Returns:
        The new document; its first section uses the page size and margins
        defined on ``ConverterConfig``.
    """
    doc = Document()

    # Default font for body text.
    normal_style = doc.styles["Normal"]
    normal_style.font.name = default_font
    normal_style._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
    normal_style.font.size = Pt(default_size)

    # Page size, then the same margin on all four sides.
    first_section = doc.sections[0]
    first_section.page_height = Cm(ConverterConfig.PAGE_HEIGHT_CM)
    first_section.page_width = Cm(ConverterConfig.PAGE_WIDTH_CM)
    for side in ("left", "right", "top", "bottom"):
        setattr(first_section, f"{side}_margin", Cm(ConverterConfig.MARGIN_CM))

    return doc
|
|
1019
|
+
|
|
1020
|
+
|
|
1021
|
+
def _read_html_file(html_file):
|
|
1022
|
+
"""读取HTML文件"""
|
|
1023
|
+
with open(html_file, "r", encoding="utf-8") as f:
|
|
1024
|
+
return f.read()
|
|
1025
|
+
|
|
1026
|
+
|
|
1027
|
+
def _parse_html(html_content):
    """Parse HTML text with the ``html.parser`` backend and return the soup."""
    soup = BeautifulSoup(html_content, "html.parser")
    return soup
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
def _process_heading(element, doc, default_font):
    """Add an ``<h1>``-``<h6>`` element to the document as a styled heading.

    The heading level is taken from the digit in the tag name. Level-1
    headings are centred, all others left-aligned. Font size and colour come
    from ``ConverterConfig.HEADING_SIZES`` / ``HEADING_COLORS``, falling back
    to 14 pt and RGB(107, 91, 149) for levels without an entry.

    Args:
        element: The heading tag (BeautifulSoup element).
        doc: Target document.
        default_font: Font name applied to every run of the heading.
    """
    level = int(element.name[1])
    heading = doc.add_heading(element.get_text().strip(), level=level)

    # Decide the alignment once, outside the run loop, so it is applied even
    # when the heading ends up with no runs to iterate over.
    heading.alignment = (
        WD_PARAGRAPH_ALIGNMENT.CENTER if level == 1 else WD_PARAGRAPH_ALIGNMENT.LEFT
    )

    # Loop-invariant style values.
    font_size = Pt(ConverterConfig.HEADING_SIZES.get(level, 14))
    font_color = ConverterConfig.HEADING_COLORS.get(level, RGBColor(107, 91, 149))

    for run in heading.runs:
        run.font.name = default_font
        run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
        run.font.size = font_size
        run.font.bold = True
        run.font.color.rgb = font_color
|
|
1049
|
+
|
|
1050
|
+
|
|
1051
|
+
def _process_paragraph_element(element, doc):
    """Add a ``<p>`` element to the document, styled according to its classes.

    Class handling, first match wins:
      * ``center`` / ``right``: aligned paragraph.
      * ``dialogue``: indented, 1.5 line spacing, italic coloured runs.
      * ``quote`` (or any inline style mentioning ``background``): 1 inch
        indent on both sides with a light-grey shading.
      * anything else: indented paragraph with 1.5 line spacing.

    Args:
        element: The ``<p>`` tag (BeautifulSoup element).
        doc: Target document.
    """
    class_set = set(element.get("class", []))

    if "center" in class_set:
        process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER)
    elif "right" in class_set:
        process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT)
    elif "dialogue" in class_set:
        para = process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
        for run in para.runs:
            set_font(run, italic=True, color=RGBColor(107, 91, 122))
    elif "quote" in class_set or "background" in element.get("style", ""):
        para = process_paragraph(element, doc)
        para.paragraph_format.left_indent = Inches(1)
        para.paragraph_format.right_indent = Inches(1)

        # Drop any existing paragraph border. The border element lives under
        # w:pPr, so it has to be addressed through that child; a bare
        # "./w:pBdr" evaluated on the paragraph element never matches.
        # The former unused `WD_BORDER` import was removed: nothing referenced
        # it, and an import that fails would abort the conversion here.
        for border in para.paragraph_format._element.xpath("./w:pPr/w:pBdr"):
            border.getparent().remove(border)

        # Light-grey background standing in for a bordered box.
        shading_elm = OxmlElement("w:shd")
        shading_elm.set(qn("w:fill"), "F5F5F5")
        para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
    else:
        process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
|
|
1078
|
+
|
|
1079
|
+
|
|
1080
|
+
def _process_list_element(element, doc, ordered):
    """Render the direct ``<li>`` children of a list element.

    Args:
        element: The ``<ul>`` or ``<ol>`` tag (BeautifulSoup element).
        doc: Target document.
        ordered: Forwarded to ``process_list_items`` as its ``ordered`` flag.
    """
    direct_items = element.find_all("li", recursive=False)
    process_list_items(direct_items, doc, ordered=ordered)
|
|
1084
|
+
|
|
1085
|
+
|
|
1086
|
+
def _image_dimension_to_inches(value):
    """Convert an HTML ``width``/``height`` attribute to inches at 96 DPI.

    Accepts plain numbers and ``px`` values ("200", "200px", "12.5 px").
    Returns None for missing values and for anything that is not a pixel
    length (percentages, "auto", other units) so that no size is forced.
    """
    if not value:
        return None
    match = re.fullmatch(
        r"\s*(\d+(?:\.\d*)?|\.\d+)\s*(?:px)?\s*", str(value), re.IGNORECASE
    )
    if match is None:
        return None
    return float(match.group(1)) / 96  # assumes 96 DPI


def _process_image_element(element, doc, html_file):
    """Add an ``<img>`` element to the document.

    Size comes from the ``width``/``height`` attributes (pixels at 96 DPI),
    alignment from the ``align`` attribute (default "center") unless the
    inline style declares ``text-align`` or ``float``. Relative ``src`` paths
    are resolved against the directory of the HTML file. Elements without a
    ``src`` are ignored.

    Args:
        element: The ``<img>`` tag (BeautifulSoup element).
        doc: Target document.
        html_file: Path of the HTML file being converted.
    """
    src = element.get("src", "")
    if not src:
        return

    style = element.get("style", "")
    align = element.get("align", "center")

    # Alignment declared in the inline style overrides the attribute. The
    # patterns tolerate any whitespace around the colon and any letter case.
    # Precedence is right, then left, then center.
    text_align = re.search(
        r"text-align\s*:\s*(left|right|center)\b", style, re.IGNORECASE
    )
    float_side = re.search(
        r"(?<![\w-])float\s*:\s*(left|right)\b", style, re.IGNORECASE
    )
    declared = {m.group(1).lower() for m in (text_align, float_side) if m}
    if "right" in declared:
        align = "right"
    elif "left" in declared:
        align = "left"
    elif "center" in declared:
        align = "center"

    # Non-pixel sizes ("50%", "auto") yield None instead of raising.
    width_inch = _image_dimension_to_inches(element.get("width"))
    height_inch = _image_dimension_to_inches(element.get("height"))

    # Resolve relative paths against the HTML file's directory.
    html_dir = os.path.dirname(html_file)
    image_path = src if os.path.isabs(src) else os.path.join(html_dir, src)

    add_image(doc, image_path, width_inch, height_inch, align)
|
|
1121
|
+
|
|
1122
|
+
|
|
1123
|
+
def _process_div_element(element, doc, default_font, default_size):
    """Add a ``<div>`` to the document according to its CSS classes.

    Recognised classes, first match wins:
      * ``chapter``: optional ``<h2>`` heading followed by every ``<p>``
        inside the div; a ``span.first-line`` is rendered as a large bold
        coloured opening run.
      * ``ending``: centred italic closing paragraph.
      * ``page-break``: page break.
      * ``columns``: multi-column section (``data-cols`` columns, default 2)
        holding the div's direct ``<p>`` children.
      * ``info`` / ``warning`` / ``success``: callout paragraph with a
        background and left border taken from ``ConverterConfig``.

    A div with none of these classes produces no output here.

    Args:
        element: The ``<div>`` tag (BeautifulSoup element).
        doc: Target document.
        default_font: Default font name.
        default_size: Default font size.
    """
    class_set = set(element.get("class", []))

    if "chapter" in class_set:
        # Chapter heading.
        h2 = element.find("h2")
        if h2:
            heading = doc.add_heading(h2.get_text().strip(), level=2)
            for run in heading.runs:
                run.font.color.rgb = RGBColor(91, 78, 140)
                run.font.size = Pt(16)
                run.font.name = default_font
                run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)

        for p in element.find_all("p"):
            first_span = p.find("span", class_="first-line")
            if first_span:
                # Emphasised opening: the span text as a large bold run,
                # then the rest of the paragraph in the default font.
                para = doc.add_paragraph()
                para.paragraph_format.first_line_indent = Inches(0)

                first_char_run = para.add_run(first_span.text)
                set_font(
                    first_char_run, size=20, bold=True, color=RGBColor(102, 126, 234)
                )
                remaining_text = p.get_text().replace(first_span.text, "", 1)
                set_font(para.add_run(remaining_text))
            else:
                process_paragraph(p, doc, indent=0.5, line_spacing=1.5)

    elif "ending" in class_set:
        para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER)
        for run in para.runs:
            set_font(run, italic=True, size=14, color=RGBColor(91, 78, 140))

    elif "page-break" in class_set:
        add_page_break(doc)

    elif "columns" in class_set:
        # A malformed data-cols value must not abort the whole conversion:
        # fall back to the default of two columns, and never go below one.
        try:
            cols_num = int(element.get("data-cols", "2"))
        except (TypeError, ValueError):
            cols_num = 2
        cols_num = max(cols_num, 1)

        # Continuous section break with the requested column count.
        add_columns_section(doc, cols_num)
        for p in element.find_all("p", recursive=False):
            process_paragraph(
                p, doc, default_font=default_font, default_size=default_size
            )

    elif "info" in class_set or "warning" in class_set or "success" in class_set:
        # Callout box.
        para = doc.add_paragraph()
        para.paragraph_format.right_indent = Inches(0.3)

        if "info" in class_set:
            palette = ConverterConfig.INFO_COLORS
        elif "warning" in class_set:
            palette = ConverterConfig.WARNING_COLORS
        else:  # success
            palette = ConverterConfig.SUCCESS_COLORS
        bg_color = palette["bg"]
        border_color = palette["border"]

        _process_element_to_runs(element, para, default_font, default_size)

        pPr = para.paragraph_format._element.get_or_add_pPr()

        # Background shading.
        shading_elm = OxmlElement("w:shd")
        shading_elm.set(qn("w:fill"), bg_color)
        pPr.append(shading_elm)

        # Left border.
        # NOTE(review): convert_html_to_docx draws the same callout border
        # with w:sz "24"; confirm which width is intended.
        pBdr = OxmlElement("w:pBdr")
        left = OxmlElement("w:left")
        left.set(qn("w:val"), "single")
        left.set(qn("w:sz"), "4")
        left.set(qn("w:color"), border_color)
        pBdr.append(left)
        pPr.append(pBdr)

        para.paragraph_format.space_after = Pt(6)
|
|
1211
|
+
|
|
1212
|
+
|
|
1213
|
+
def _process_horizontal_rule_element(element, doc):
    """Render an ``<hr>`` as a page break or as a visible horizontal rule.

    A page break is emitted when the element has the ``page-break`` class or
    its inline style mentions ``page-break-after``; otherwise a rule is drawn.

    Args:
        element: The ``<hr>`` tag (BeautifulSoup element).
        doc: Target document.
    """
    class_names = set(element.get("class", []))
    inline_style = element.get("style", "")

    wants_page_break = (
        "page-break" in class_names or "page-break-after" in inline_style
    )
    render = add_page_break if wants_page_break else add_horizontal_rule
    render(doc)
|
|
1222
|
+
|
|
1223
|
+
|
|
1224
|
+
def _process_elements(soup, doc, html_file, default_font, default_size):
    """Dispatch every top-level HTML element to its handler.

    Args:
        soup: Parsed HTML document.
        doc: Target document.
        html_file: Path of the HTML file (used to resolve image paths).
        default_font: Default font name.
        default_size: Default font size.
    """
    # html.parser does not synthesise a <body> for fragments, so fall back to
    # the soup itself instead of failing on a missing body.
    root = soup.body if soup.body is not None else soup

    for element in root.find_all(recursive=False):
        tag = element.name
        if tag in ("h1", "h2", "h3", "h4", "h5", "h6"):
            _process_heading(element, doc, default_font)
        elif tag == "p":
            _process_paragraph_element(element, doc)
        elif tag == "ul":
            _process_list_element(element, doc, ordered=False)
        elif tag == "ol":
            _process_list_element(element, doc, ordered=True)
        elif tag == "blockquote":
            # Handled the same way as in convert_html_to_docx.
            _process_blockquote(element, doc, level=0)
        elif tag == "pre":
            # Code block, rendered as in convert_html_to_docx: indented
            # Consolas text on a grey background.
            para = doc.add_paragraph()
            para.paragraph_format.left_indent = Inches(0.5)
            run = para.add_run(element.get_text())
            set_font(run, font_name="Consolas", size=10, color=RGBColor(0, 0, 128))
            shading_elm = OxmlElement("w:shd")
            shading_elm.set(qn("w:fill"), "F0F0F0")
            para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
        elif tag == "table":
            process_table(element, doc)
        elif tag == "img":
            _process_image_element(element, doc, html_file)
        elif tag == "div":
            _process_div_element(element, doc, default_font, default_size)
        elif tag == "hr":
            _process_horizontal_rule_element(element, doc)
|
|
1243
|
+
|
|
1244
|
+
|
|
1245
|
+
def convert_html_to_docx(
|
|
1246
|
+
html_file, output_file, default_font="微软雅黑", default_size=12
|
|
1247
|
+
):
|
|
545
1248
|
"""将HTML文件转换为DOCX文件"""
|
|
546
1249
|
# 读取HTML文件
|
|
547
|
-
with open(html_file,
|
|
1250
|
+
with open(html_file, "r", encoding="utf-8") as f:
|
|
548
1251
|
html_content = f.read()
|
|
549
|
-
|
|
1252
|
+
|
|
550
1253
|
# 解析HTML
|
|
551
|
-
soup = BeautifulSoup(html_content,
|
|
552
|
-
|
|
1254
|
+
soup = BeautifulSoup(html_content, "html.parser")
|
|
1255
|
+
|
|
553
1256
|
# 创建Word文档
|
|
554
1257
|
doc = Document()
|
|
555
|
-
|
|
1258
|
+
|
|
556
1259
|
# 设置默认字体
|
|
557
|
-
doc.styles[
|
|
558
|
-
doc.styles[
|
|
559
|
-
doc.styles[
|
|
560
|
-
|
|
1260
|
+
doc.styles["Normal"].font.name = default_font
|
|
1261
|
+
doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
|
|
1262
|
+
doc.styles["Normal"].font.size = Pt(default_size)
|
|
1263
|
+
|
|
561
1264
|
# 处理页面设置
|
|
562
1265
|
section = doc.sections[0]
|
|
563
|
-
section.page_height = Cm(
|
|
564
|
-
section.page_width = Cm(
|
|
565
|
-
section.left_margin = Cm(
|
|
566
|
-
section.right_margin = Cm(
|
|
567
|
-
section.top_margin = Cm(
|
|
568
|
-
section.bottom_margin = Cm(
|
|
569
|
-
|
|
1266
|
+
section.page_height = Cm(ConverterConfig.PAGE_HEIGHT_CM)
|
|
1267
|
+
section.page_width = Cm(ConverterConfig.PAGE_WIDTH_CM)
|
|
1268
|
+
section.left_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1269
|
+
section.right_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1270
|
+
section.top_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1271
|
+
section.bottom_margin = Cm(ConverterConfig.MARGIN_CM)
|
|
1272
|
+
|
|
570
1273
|
# 遍历所有顶级元素
|
|
571
1274
|
for element in soup.body.find_all(recursive=False):
|
|
572
|
-
if element.name in [
|
|
1275
|
+
if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
|
|
573
1276
|
level = int(element.name[1])
|
|
574
1277
|
heading = doc.add_heading(element.get_text().strip(), level=level)
|
|
575
1278
|
heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
|
|
576
|
-
|
|
1279
|
+
|
|
577
1280
|
# 标题样式
|
|
578
1281
|
for run in heading.runs:
|
|
579
1282
|
run.font.name = default_font
|
|
580
|
-
run._element.rPr.rFonts.set(qn(
|
|
1283
|
+
run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
|
|
1284
|
+
run.font.size = Pt(ConverterConfig.HEADING_SIZES.get(level, 14))
|
|
1285
|
+
run.font.bold = True
|
|
1286
|
+
run.font.color.rgb = ConverterConfig.HEADING_COLORS.get(
|
|
1287
|
+
level, RGBColor(107, 91, 149)
|
|
1288
|
+
)
|
|
581
1289
|
if level == 1:
|
|
582
|
-
run.font.size = Pt(18)
|
|
583
|
-
run.font.bold = True
|
|
584
|
-
run.font.color.rgb = RGBColor(74, 63, 107)
|
|
585
1290
|
heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
run.font.bold = True
|
|
589
|
-
run.font.color.rgb = RGBColor(91, 78, 140)
|
|
590
|
-
else:
|
|
591
|
-
run.font.size = Pt(14)
|
|
592
|
-
run.font.bold = True
|
|
593
|
-
|
|
594
|
-
elif element.name == 'p':
|
|
1291
|
+
|
|
1292
|
+
elif element.name == "p":
|
|
595
1293
|
# 检查特殊段落样式
|
|
596
|
-
classes = element.get(
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
1294
|
+
classes = element.get("class", [])
|
|
1295
|
+
class_set = set(classes) # 转换为集合提高查找性能
|
|
1296
|
+
|
|
1297
|
+
if "center" in class_set:
|
|
1298
|
+
para = process_paragraph(
|
|
1299
|
+
element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
1300
|
+
)
|
|
1301
|
+
elif "right" in class_set:
|
|
1302
|
+
para = process_paragraph(
|
|
1303
|
+
element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT
|
|
1304
|
+
)
|
|
1305
|
+
elif "dialogue" in class_set:
|
|
603
1306
|
para = process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
|
|
604
1307
|
for run in para.runs:
|
|
605
1308
|
set_font(run, italic=True, color=RGBColor(107, 91, 122))
|
|
606
|
-
elif
|
|
1309
|
+
elif (
|
|
1310
|
+
"quote" in class_set
|
|
1311
|
+
or element.get("style", "").find("background") != -1
|
|
1312
|
+
):
|
|
607
1313
|
para = process_paragraph(element, doc)
|
|
608
1314
|
para.paragraph_format.left_indent = Inches(1)
|
|
609
1315
|
para.paragraph_format.right_indent = Inches(1)
|
|
610
1316
|
from docx.enum.text import WD_BORDER
|
|
611
|
-
|
|
1317
|
+
|
|
1318
|
+
for border in para.paragraph_format._element.xpath("./w:pBdr"):
|
|
612
1319
|
border.getparent().remove(border)
|
|
613
1320
|
# 添加边框效果(使用浅灰色背景模拟)
|
|
614
|
-
shading_elm = OxmlElement(
|
|
615
|
-
shading_elm.set(qn(
|
|
1321
|
+
shading_elm = OxmlElement("w:shd")
|
|
1322
|
+
shading_elm.set(qn("w:fill"), "F5F5F5")
|
|
616
1323
|
para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
|
|
617
1324
|
else:
|
|
618
1325
|
process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
|
|
619
|
-
|
|
620
|
-
elif element.name ==
|
|
621
|
-
items = element.find_all(
|
|
1326
|
+
|
|
1327
|
+
elif element.name == "ul":
|
|
1328
|
+
items = element.find_all("li", recursive=False)
|
|
622
1329
|
process_list_items(items, doc, ordered=False)
|
|
623
|
-
|
|
624
|
-
elif element.name ==
|
|
625
|
-
items = element.find_all(
|
|
1330
|
+
|
|
1331
|
+
elif element.name == "ol":
|
|
1332
|
+
items = element.find_all("li", recursive=False)
|
|
626
1333
|
process_list_items(items, doc, ordered=True)
|
|
627
|
-
|
|
628
|
-
elif element.name ==
|
|
1334
|
+
|
|
1335
|
+
elif element.name == "blockquote":
|
|
629
1336
|
# 递归处理嵌套引用
|
|
630
1337
|
_process_blockquote(element, doc, level=0)
|
|
631
|
-
|
|
632
|
-
elif element.name ==
|
|
1338
|
+
|
|
1339
|
+
elif element.name == "pre":
|
|
633
1340
|
code_text = element.get_text()
|
|
634
1341
|
para = doc.add_paragraph()
|
|
635
1342
|
para.paragraph_format.left_indent = Inches(0.5)
|
|
636
1343
|
run = para.add_run(code_text)
|
|
637
|
-
set_font(run, font_name=
|
|
1344
|
+
set_font(run, font_name="Consolas", size=10, color=RGBColor(0, 0, 128))
|
|
638
1345
|
# 添加灰色背景
|
|
639
|
-
shading_elm = OxmlElement(
|
|
640
|
-
shading_elm.set(qn(
|
|
1346
|
+
shading_elm = OxmlElement("w:shd")
|
|
1347
|
+
shading_elm.set(qn("w:fill"), "F0F0F0")
|
|
641
1348
|
para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
|
|
642
|
-
|
|
643
|
-
elif element.name ==
|
|
1349
|
+
|
|
1350
|
+
elif element.name == "hr":
|
|
644
1351
|
# 检查是否有分页符class或style
|
|
645
|
-
classes = element.get(
|
|
646
|
-
style = element.get(
|
|
647
|
-
|
|
1352
|
+
classes = element.get("class", [])
|
|
1353
|
+
style = element.get("style", "")
|
|
1354
|
+
class_set = set(classes)
|
|
1355
|
+
if "page-break" in class_set or "page-break-after" in style:
|
|
648
1356
|
add_page_break(doc)
|
|
649
1357
|
else:
|
|
650
1358
|
add_horizontal_rule(doc)
|
|
651
|
-
|
|
652
|
-
elif element.name ==
|
|
1359
|
+
|
|
1360
|
+
elif element.name == "table":
|
|
653
1361
|
process_table(element, doc)
|
|
654
|
-
|
|
655
|
-
elif element.name ==
|
|
1362
|
+
|
|
1363
|
+
elif element.name == "div":
|
|
656
1364
|
# 检查是否是特殊div
|
|
657
|
-
classes = element.get(
|
|
658
|
-
|
|
1365
|
+
classes = element.get("class", [])
|
|
1366
|
+
class_set = set(classes)
|
|
1367
|
+
|
|
1368
|
+
if "chapter" in class_set:
|
|
659
1369
|
# 处理章节
|
|
660
|
-
h2 = element.find(
|
|
1370
|
+
h2 = element.find("h2")
|
|
661
1371
|
if h2:
|
|
662
1372
|
heading = doc.add_heading(h2.get_text().strip(), level=2)
|
|
663
1373
|
for run in heading.runs:
|
|
664
1374
|
run.font.color.rgb = RGBColor(91, 78, 140)
|
|
665
1375
|
run.font.size = Pt(16)
|
|
666
1376
|
run.font.name = default_font
|
|
667
|
-
run._element.rPr.rFonts.set(qn(
|
|
668
|
-
|
|
669
|
-
paragraphs = element.find_all(
|
|
1377
|
+
run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
|
|
1378
|
+
|
|
1379
|
+
paragraphs = element.find_all("p")
|
|
670
1380
|
for p in paragraphs:
|
|
671
|
-
first_span = p.find(
|
|
1381
|
+
first_span = p.find("span", class_="first-line")
|
|
672
1382
|
if first_span:
|
|
673
1383
|
para = doc.add_paragraph()
|
|
674
1384
|
first_char_run = para.add_run(first_span.text)
|
|
675
|
-
set_font(
|
|
676
|
-
|
|
1385
|
+
set_font(
|
|
1386
|
+
first_char_run,
|
|
1387
|
+
size=20,
|
|
1388
|
+
bold=True,
|
|
1389
|
+
color=RGBColor(102, 126, 234),
|
|
1390
|
+
)
|
|
1391
|
+
remaining_text = p.get_text().replace(first_span.text, "", 1)
|
|
677
1392
|
run = para.add_run(remaining_text)
|
|
678
1393
|
set_font(run)
|
|
679
1394
|
else:
|
|
680
1395
|
process_paragraph(p, doc, indent=0.5, line_spacing=1.5)
|
|
681
|
-
|
|
682
|
-
elif
|
|
683
|
-
para = process_paragraph(
|
|
1396
|
+
|
|
1397
|
+
elif "ending" in class_set:
|
|
1398
|
+
para = process_paragraph(
|
|
1399
|
+
element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
1400
|
+
)
|
|
684
1401
|
for run in para.runs:
|
|
685
1402
|
set_font(run, italic=True, size=14, color=RGBColor(91, 78, 140))
|
|
686
|
-
|
|
687
|
-
elif
|
|
1403
|
+
|
|
1404
|
+
elif "page-break" in class_set:
|
|
688
1405
|
add_page_break(doc)
|
|
689
1406
|
|
|
690
|
-
elif
|
|
1407
|
+
elif "columns" in class_set:
|
|
691
1408
|
# 处理多栏布局(使用连续分节符,不换页)
|
|
692
|
-
cols_num = int(element.get(
|
|
1409
|
+
cols_num = int(element.get("data-cols", "2"))
|
|
693
1410
|
# 添加连续分节符并设置栏数
|
|
694
1411
|
add_columns_section(doc, cols_num)
|
|
695
1412
|
# 处理其中的段落
|
|
696
|
-
for p in element.find_all(
|
|
697
|
-
process_paragraph(
|
|
1413
|
+
for p in element.find_all("p", recursive=False):
|
|
1414
|
+
process_paragraph(
|
|
1415
|
+
p, doc, default_font=default_font, default_size=default_size
|
|
1416
|
+
)
|
|
698
1417
|
|
|
699
|
-
elif
|
|
1418
|
+
elif (
|
|
1419
|
+
"info" in class_set or "warning" in class_set or "success" in class_set
|
|
1420
|
+
):
|
|
700
1421
|
# 处理提示框
|
|
701
1422
|
para = doc.add_paragraph()
|
|
702
1423
|
para.paragraph_format.right_indent = Inches(0.3)
|
|
703
|
-
|
|
1424
|
+
|
|
704
1425
|
# 设置背景色和左边框颜色
|
|
705
|
-
if
|
|
706
|
-
bg_color =
|
|
707
|
-
border_color =
|
|
708
|
-
elif
|
|
709
|
-
bg_color =
|
|
710
|
-
border_color =
|
|
1426
|
+
if "info" in class_set:
|
|
1427
|
+
bg_color = ConverterConfig.INFO_COLORS["bg"]
|
|
1428
|
+
border_color = ConverterConfig.INFO_COLORS["border"]
|
|
1429
|
+
elif "warning" in class_set:
|
|
1430
|
+
bg_color = ConverterConfig.WARNING_COLORS["bg"]
|
|
1431
|
+
border_color = ConverterConfig.WARNING_COLORS["border"]
|
|
711
1432
|
else: # success
|
|
712
|
-
bg_color =
|
|
713
|
-
border_color =
|
|
714
|
-
|
|
1433
|
+
bg_color = ConverterConfig.SUCCESS_COLORS["bg"]
|
|
1434
|
+
border_color = ConverterConfig.SUCCESS_COLORS["border"]
|
|
1435
|
+
|
|
715
1436
|
# 处理内容
|
|
716
1437
|
_process_element_to_runs(element, para, default_font, default_size)
|
|
717
|
-
|
|
1438
|
+
|
|
718
1439
|
# 添加背景色
|
|
719
|
-
shading_elm = OxmlElement(
|
|
720
|
-
shading_elm.set(qn(
|
|
1440
|
+
shading_elm = OxmlElement("w:shd")
|
|
1441
|
+
shading_elm.set(qn("w:fill"), bg_color)
|
|
721
1442
|
para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
|
|
722
|
-
|
|
1443
|
+
|
|
723
1444
|
# 添加左边框
|
|
724
|
-
pBdr = OxmlElement(
|
|
725
|
-
left_border = OxmlElement(
|
|
726
|
-
left_border.set(qn(
|
|
727
|
-
left_border.set(qn(
|
|
728
|
-
left_border.set(qn(
|
|
1445
|
+
pBdr = OxmlElement("w:pBdr")
|
|
1446
|
+
left_border = OxmlElement("w:left")
|
|
1447
|
+
left_border.set(qn("w:val"), "single")
|
|
1448
|
+
left_border.set(qn("w:sz"), "24") # 边框粗细
|
|
1449
|
+
left_border.set(qn("w:color"), border_color)
|
|
729
1450
|
pBdr.append(left_border)
|
|
730
1451
|
para.paragraph_format._element.get_or_add_pPr().append(pBdr)
|
|
731
|
-
|
|
1452
|
+
|
|
732
1453
|
else:
|
|
733
1454
|
# 处理普通div,检查是否有内联样式(如提示框)
|
|
734
|
-
style = element.get(
|
|
1455
|
+
style = element.get("style", "")
|
|
735
1456
|
style_dict = _parse_style(style)
|
|
736
|
-
|
|
1457
|
+
|
|
737
1458
|
# 检查是否有背景色和左边框(提示框特征)
|
|
738
|
-
bg_color = style_dict.get(
|
|
739
|
-
border_left = style_dict.get(
|
|
740
|
-
|
|
1459
|
+
bg_color = style_dict.get("background-color", "")
|
|
1460
|
+
border_left = style_dict.get("border-left", "")
|
|
1461
|
+
|
|
741
1462
|
if bg_color and border_left:
|
|
742
1463
|
# 这是提示框
|
|
743
1464
|
para = doc.add_paragraph()
|
|
744
1465
|
para.paragraph_format.right_indent = Inches(0.3)
|
|
745
|
-
|
|
1466
|
+
|
|
746
1467
|
# 处理内容
|
|
747
1468
|
_process_element_to_runs(element, para, default_font, default_size)
|
|
748
|
-
|
|
1469
|
+
|
|
749
1470
|
# 添加背景色
|
|
750
|
-
if bg_color.startswith(
|
|
751
|
-
shading_elm = OxmlElement(
|
|
752
|
-
shading_elm.set(qn(
|
|
753
|
-
para.paragraph_format._element.get_or_add_pPr().append(
|
|
754
|
-
|
|
1471
|
+
if bg_color.startswith("#"):
|
|
1472
|
+
shading_elm = OxmlElement("w:shd")
|
|
1473
|
+
shading_elm.set(qn("w:fill"), bg_color[1:].upper())
|
|
1474
|
+
para.paragraph_format._element.get_or_add_pPr().append(
|
|
1475
|
+
shading_elm
|
|
1476
|
+
)
|
|
1477
|
+
|
|
755
1478
|
# 解析左边框颜色
|
|
756
|
-
border_color =
|
|
757
|
-
if
|
|
1479
|
+
border_color = ""
|
|
1480
|
+
if "solid" in border_left:
|
|
758
1481
|
parts = border_left.split()
|
|
759
1482
|
for i, part in enumerate(parts):
|
|
760
|
-
if part.startswith(
|
|
1483
|
+
if part.startswith("#"):
|
|
761
1484
|
border_color = part[1:]
|
|
762
1485
|
break
|
|
763
|
-
|
|
1486
|
+
|
|
764
1487
|
# 添加左边框
|
|
765
1488
|
if border_color:
|
|
766
|
-
pBdr = OxmlElement(
|
|
767
|
-
left_border = OxmlElement(
|
|
768
|
-
left_border.set(qn(
|
|
769
|
-
left_border.set(qn(
|
|
770
|
-
left_border.set(qn(
|
|
1489
|
+
pBdr = OxmlElement("w:pBdr")
|
|
1490
|
+
left_border = OxmlElement("w:left")
|
|
1491
|
+
left_border.set(qn("w:val"), "single")
|
|
1492
|
+
left_border.set(qn("w:sz"), "24")
|
|
1493
|
+
left_border.set(qn("w:color"), border_color.upper())
|
|
771
1494
|
pBdr.append(left_border)
|
|
772
1495
|
para.paragraph_format._element.get_or_add_pPr().append(pBdr)
|
|
773
1496
|
else:
|
|
774
1497
|
# 普通div,处理其中的段落
|
|
775
|
-
for p in element.find_all(
|
|
1498
|
+
for p in element.find_all("p", recursive=False):
|
|
776
1499
|
process_paragraph(p, doc)
|
|
777
|
-
|
|
778
|
-
elif element.name ==
|
|
779
|
-
src = element.get(
|
|
780
|
-
alt = element.get(
|
|
1500
|
+
|
|
1501
|
+
elif element.name == "img":
|
|
1502
|
+
src = element.get("src", "")
|
|
1503
|
+
alt = element.get("alt", "图片")
|
|
781
1504
|
if src and os.path.exists(src):
|
|
782
1505
|
try:
|
|
783
1506
|
doc.add_picture(src, width=Inches(5))
|
|
784
1507
|
last_para = doc.paragraphs[-1]
|
|
785
1508
|
last_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
786
1509
|
except:
|
|
787
|
-
para = doc.add_paragraph(f
|
|
1510
|
+
para = doc.add_paragraph(f"[图片: {alt}]")
|
|
788
1511
|
para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
1512
|
+
raise
|
|
789
1513
|
else:
|
|
790
|
-
para = doc.add_paragraph(f
|
|
1514
|
+
para = doc.add_paragraph(f"[图片: {alt} - 路径: {src}]")
|
|
791
1515
|
para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
|
|
792
|
-
|
|
1516
|
+
|
|
793
1517
|
# 保存文档
|
|
794
1518
|
doc.save(output_file)
|
|
795
|
-
print(f"转换完成!文件已保存为 {output_file}")
|
|
796
1519
|
|
|
797
|
-
|
|
1520
|
+
|
|
1521
|
+
if __name__ == "__main__":
|
|
798
1522
|
import sys
|
|
799
|
-
|
|
800
|
-
if len(sys.argv)
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
convert_html_to_docx(html_file, output_file)
|
|
1523
|
+
|
|
1524
|
+
if len(sys.argv) != 3:
|
|
1525
|
+
print("用法: python docx_converter.py <html_file> <output_file>")
|
|
1526
|
+
sys.exit(1)
|
|
1527
|
+
|
|
1528
|
+
html_file = sys.argv[1]
|
|
1529
|
+
output_file = sys.argv[2]
|
|
1530
|
+
|
|
1531
|
+
convert_html_to_docx(html_file, output_file)
|