@birthday8/doc-mcp 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -0
- package/index.js +159 -0
- package/install.js +96 -0
- package/package.json +32 -0
- package/python/__pycache__/docx_converter.cpython-313.pyc +0 -0
- package/python/docx_converter.py +807 -0
- package/python/requirements.txt +4 -0
- package/python/server.py +320 -0
|
@@ -0,0 +1,807 @@
|
|
|
1
|
+
from docx import Document
|
|
2
|
+
from docx.shared import Pt, RGBColor, Inches, Cm
|
|
3
|
+
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
|
|
4
|
+
from docx.enum.section import WD_SECTION
|
|
5
|
+
from docx.oxml.ns import qn
|
|
6
|
+
from docx.oxml import OxmlElement
|
|
7
|
+
from bs4 import BeautifulSoup
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
|
|
11
|
+
def parse_color(color_str):
    """Parse a CSS hex colour string into an ``RGBColor``.

    Accepts ``#RRGGBB`` and the ``#RGB`` shorthand (each digit is doubled).
    Characters after the first six hex digits (for example the alpha pair of
    ``#RRGGBBAA``) are ignored.

    Args:
        color_str: Colour text such as ``"#1a2b3c"`` or ``"#abc"``.

    Returns:
        An ``RGBColor``, or ``None`` when the value is empty, is not a string,
        does not start with ``#`` or does not contain valid hex digits.
    """
    if not isinstance(color_str, str):
        return None

    value = color_str.strip()
    if not value.startswith('#'):
        return None

    digits = value[1:]
    if len(digits) == 3:
        # CSS shorthand: "#abc" means "#aabbcc".
        digits = ''.join(ch * 2 for ch in digits)
    elif len(digits) < 6:
        return None
    digits = digits[:6]

    # Validate explicitly: int(..., 16) would also accept signs and spaces.
    if any(ch not in '0123456789abcdefABCDEF' for ch in digits):
        return None

    return RGBColor(int(digits[0:2], 16), int(digits[2:4], 16), int(digits[4:6], 16))
|
|
22
|
+
|
|
23
|
+
def set_font(run, font_name='微软雅黑', size=12, color=None, bold=False, italic=False,
             underline=False, strike=False, highlight_color=None):
    """Apply font settings to a python-docx run.

    Args:
        run: The run to style.
        font_name: Typeface name, written to the run font and to its
            East Asian font slot.
        size: Font size in points.
        color: Optional ``RGBColor`` for the text; left unchanged when falsy.
        bold: Bold flag, always written (``False`` explicitly turns bold off).
        italic: Italic flag, always written.
        underline: When true, underline the run.
        strike: When true, strike the run through.
        highlight_color: Optional highlight colour name (``'yellow'``,
            ``'green'``, ``'cyan'``, ``'magenta'``, ``'blue'``, ``'red'``,
            ``'darkblue'``, ``'orange'``, ``'gray'``). Unknown names are
            ignored.
    """
    font = run.font
    font.name = font_name
    # Also set the East Asian font slot so CJK text uses the same typeface.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
    font.size = Pt(size)
    font.bold = bold
    font.italic = italic

    if color:
        font.color.rgb = color

    if underline:
        font.underline = True

    if strike:
        font.strike = True

    if highlight_color:
        from docx.enum.text import WD_COLOR_INDEX

        # WD_COLOR_INDEX has no CYAN, MAGENTA or ORANGE members; referencing
        # them raised AttributeError as soon as any highlight was requested.
        # The substitutes mirror the ones _apply_highlight uses.
        highlight_map = {
            'yellow': WD_COLOR_INDEX.YELLOW,
            'green': WD_COLOR_INDEX.BRIGHT_GREEN,
            'cyan': WD_COLOR_INDEX.TURQUOISE,
            'magenta': WD_COLOR_INDEX.VIOLET,
            'blue': WD_COLOR_INDEX.TURQUOISE,
            'red': WD_COLOR_INDEX.RED,
            'darkblue': WD_COLOR_INDEX.BLUE,
            'orange': WD_COLOR_INDEX.YELLOW,
            'gray': WD_COLOR_INDEX.GRAY_25,
        }
        if highlight_color in highlight_map:
            font.highlight_color = highlight_map[highlight_color]
|
|
56
|
+
|
|
57
|
+
def process_inline_elements(element, parent_run=None):
    """Walk the inline children of an HTML element.

    Two modes are supported:

    * ``parent_run`` given: text is appended to that run and formatting tags
      switch the matching property on for the whole run.
    * ``parent_run`` omitted: nothing is written; a list of run descriptions
      is returned instead.

    Args:
        element: A BeautifulSoup tag whose children are inspected.
        parent_run: Optional python-docx run that receives text and
            formatting.

    Returns:
        A list of dicts, each with a ``'text'`` key plus optional ``'bold'``,
        ``'italic'``, ``'underline'``, ``'strike'``, ``'superscript'``,
        ``'subscript'``, ``'font'``, ``'size'``, ``'link'``, ``'color'`` and
        ``'highlight'`` keys. Empty when ``parent_run`` is given.
    """
    # Tags that simply switch one boolean font property on.
    flag_tags = {
        'strong': 'bold', 'b': 'bold',
        'em': 'italic', 'i': 'italic',
        'u': 'underline',
        's': 'strike', 'del': 'strike',
        'sup': 'superscript',
        'sub': 'subscript',
    }
    has_run = parent_run is not None
    runs = []

    for child in element.children:
        tag = child.name

        if tag is None:  # text node
            text = str(child).strip()
            if not text:
                continue
            if has_run:
                parent_run.add_text(text)
            else:
                runs.append({'text': text})

        elif tag in flag_tags:
            flag = flag_tags[tag]
            if has_run:
                # Set via run.font: Run itself has no `strike` property, so
                # assigning parent_run.strike had no effect on the document.
                setattr(parent_run.font, flag, True)
                process_inline_elements(child, parent_run)
            else:
                runs.append({'text': child.get_text(), flag: True})

        elif tag == 'code':
            code_text = child.get_text()
            if has_run:
                parent_run.font.name = 'Consolas'
                parent_run.font.size = Pt(10)
                parent_run.add_text(code_text)
            else:
                runs.append({'text': code_text, 'font': 'Consolas', 'size': 10})

        elif tag == 'a':
            link_text = child.get_text()
            if has_run:
                parent_run.add_text(link_text)
            else:
                runs.append({'text': link_text, 'link': child.get('href', '')})

        elif tag == 'span':
            style = child.get('style', '')
            # The look-behind keeps "background-color:" from being read as
            # the text colour.
            color_match = re.search(r'(?<![-\w])color:\s*([^;]+)', style)
            bg_match = re.search(r'background(?:-color)?:\s*([^;]+)', style)

            props = {'text': child.get_text()}
            if color_match:
                text_color = parse_color(color_match.group(1).strip())
                if text_color:
                    props['color'] = text_color
            if bg_match:
                bg_rgb = parse_color(bg_match.group(1).strip())
                if bg_rgb:
                    props['highlight'] = str(bg_rgb)

            if has_run:
                if 'color' in props:
                    parent_run.font.color.rgb = props['color']
                process_inline_elements(child, parent_run)
            else:
                runs.append(props)

        else:
            # Unknown wrapper tag: keep whatever its children produce rather
            # than discarding the recursive result.
            runs.extend(process_inline_elements(child, parent_run))

    return runs
|
|
149
|
+
|
|
150
|
+
def _apply_highlight(run, bg_color):
    """Give a run a background colour.

    Colour names are mapped to Word's fixed highlight palette; ``#`` hex
    values are written as character shading (``w:shd``) so any RGB colour can
    be used. Values that are neither a known name nor valid hex are ignored.

    Args:
        run: The python-docx run to modify.
        bg_color: A colour name (case-insensitive) or a ``#RGB`` /
            ``#RRGGBB`` hex string. Falsy values are a no-op.
    """
    if not bg_color:
        return

    from docx.enum.text import WD_COLOR_INDEX

    # Only members that exist on WD_COLOR_INDEX are used; names without an
    # exact counterpart fall back to the nearest palette entry.
    named_highlights = {
        'yellow': WD_COLOR_INDEX.YELLOW,
        'green': WD_COLOR_INDEX.GREEN,
        'brightgreen': WD_COLOR_INDEX.BRIGHT_GREEN,
        'blue': WD_COLOR_INDEX.BLUE,
        'darkblue': WD_COLOR_INDEX.DARK_BLUE,
        'red': WD_COLOR_INDEX.RED,
        'darkred': WD_COLOR_INDEX.DARK_RED,
        'darkyellow': WD_COLOR_INDEX.DARK_YELLOW,
        'lightgray': WD_COLOR_INDEX.GRAY_25,
        'gray': WD_COLOR_INDEX.GRAY_50,
        'black': WD_COLOR_INDEX.BLACK,
        'white': WD_COLOR_INDEX.WHITE,
        'pink': WD_COLOR_INDEX.PINK,
        'teal': WD_COLOR_INDEX.TEAL,
        'turquoise': WD_COLOR_INDEX.TURQUOISE,
        'violet': WD_COLOR_INDEX.VIOLET,
        'cyan': WD_COLOR_INDEX.TURQUOISE,
        'magenta': WD_COLOR_INDEX.VIOLET,
        # Approximations for common names outside the palette.
        'lightblue': WD_COLOR_INDEX.TURQUOISE,
        'lightyellow': WD_COLOR_INDEX.YELLOW,
        'lightgreen': WD_COLOR_INDEX.BRIGHT_GREEN,
        'orange': WD_COLOR_INDEX.YELLOW,
        'purple': WD_COLOR_INDEX.VIOLET,
        'brown': WD_COLOR_INDEX.DARK_YELLOW,
    }

    key = str(bg_color).lower().strip()

    if key in named_highlights:
        run.font.highlight_color = named_highlights[key]
        return

    if not key.startswith('#'):
        return

    digits = key[1:]
    if len(digits) == 3:
        # CSS shorthand: "#abc" means "#aabbcc".
        digits = ''.join(ch * 2 for ch in digits)
    elif len(digits) == 8:
        # Drop the alpha pair of "#RRGGBBAA".
        digits = digits[:6]
    if len(digits) != 6 or any(ch not in '0123456789abcdef' for ch in digits):
        # Writing a malformed value into w:fill would produce invalid markup.
        return

    rPr = run._element.get_or_add_rPr()
    # Replace earlier shading instead of stacking several w:shd elements.
    for stale in rPr.findall(qn('w:shd')):
        rPr.remove(stale)

    shading_elm = OxmlElement('w:shd')
    # w:val is a required attribute of w:shd; "clear" is a plain fill.
    shading_elm.set(qn('w:val'), 'clear')
    shading_elm.set(qn('w:color'), 'auto')
    shading_elm.set(qn('w:fill'), digits.upper())
    rPr.append(shading_elm)
|
|
199
|
+
|
|
200
|
+
def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=12,
                      indent=None, align=None, line_spacing=None):
    """Append an HTML paragraph to the document, preserving inline formatting.

    Args:
        paragraph: BeautifulSoup tag whose children become runs.
        doc: python-docx document that receives the new paragraph.
        default_font: Font name used for the generated runs.
        default_size: Font size in points; also the size of one ``em`` when a
            ``data-indent`` attribute is converted.
        indent: First-line indent in inches. When ``None``, the element's
            ``data-indent`` attribute (a number of ems) is used if present.
        align: Optional ``WD_PARAGRAPH_ALIGNMENT`` member.
        line_spacing: Optional line spacing passed to python-docx unchanged.

    Returns:
        The newly created python-docx paragraph.
    """
    para = doc.add_paragraph()

    # Compare against None: WD_PARAGRAPH_ALIGNMENT.LEFT has the value 0, so
    # a plain truthiness test silently dropped explicit left alignment.
    if align is not None:
        para.alignment = align

    if indent is not None:
        para.paragraph_format.first_line_indent = Inches(indent)
    else:
        data_indent = paragraph.get('data-indent', '')
        if data_indent:
            try:
                # indent = font size (pt) x number of ems
                para.paragraph_format.first_line_indent = Pt(default_size * float(data_indent))
            except (ValueError, OverflowError):
                # Not a usable number (text, nan, inf): leave the indent unset.
                pass

    if line_spacing:
        para.paragraph_format.line_spacing = line_spacing

    # Recursively turn the element's children into formatted runs.
    _process_element_to_runs(paragraph, para, default_font, default_size)

    return para
|
|
231
|
+
|
|
232
|
+
def _process_element_to_runs(element, para, default_font='微软雅黑', default_size=12,
                             bold=False, italic=False, underline=False, strike=False,
                             color=None, bg_color=None, font_name=None, font_size=None,
                             superscript=False, subscript=False):
    """Recursively convert an element's children into formatted runs.

    Each text node becomes its own run carrying the formatting accumulated
    from the enclosing tags (``strong``/``b``, ``em``/``i``, ``u``,
    ``s``/``del``, ``sup``, ``sub``, ``span``). ``code`` and ``a`` elements
    are emitted as a single run with a fixed style. Any other tag is treated
    as a transparent wrapper.

    Args:
        element: BeautifulSoup tag whose children are processed.
        para: python-docx paragraph that receives the runs.
        default_font: Font used when ``font_name`` is not given.
        default_size: Size in points used when ``font_size`` is not given.
        bold: Inherited bold state.
        italic: Inherited italic state.
        underline: Inherited underline state.
        strike: Inherited strike-through state.
        color: Inherited text colour (``RGBColor``) or ``None``.
        bg_color: Inherited background (colour name or ``#`` hex) or ``None``.
        font_name: Font override for this subtree.
        font_size: Size override (points) for this subtree.
        superscript: Inherited superscript state.
        subscript: Inherited subscript state.
    """
    current_font = font_name or default_font
    current_size = font_size or default_size

    def descend(node, **changes):
        """Recurse into ``node`` with the current state plus ``changes``."""
        state = {
            'bold': bold, 'italic': italic, 'underline': underline, 'strike': strike,
            'color': color, 'bg_color': bg_color,
            'font_name': font_name, 'font_size': font_size,
            'superscript': superscript, 'subscript': subscript,
        }
        state.update(changes)
        _process_element_to_runs(node, para, default_font, default_size, **state)

    # Tags that only switch one formatting flag on for their subtree.
    flag_tags = {
        'strong': 'bold', 'b': 'bold',
        'em': 'italic', 'i': 'italic',
        'u': 'underline',
        's': 'strike', 'del': 'strike',
        'sup': 'superscript',
        'sub': 'subscript',
    }
    # Colour classes recognised on <span>, in priority order.
    class_colors = (
        ('red', (255, 0, 0)),
        ('blue', (0, 0, 255)),
        ('green', (0, 128, 0)),
        ('purple', (128, 0, 128)),
    )

    children = list(element.children)
    # Indexes of children that are not whitespace-only text. Whitespace is
    # trimmed only outside this range so that a space separating two inline
    # elements ("Hello <b>world</b>") survives instead of fusing the words.
    significant = [
        index for index, node in enumerate(children)
        if not (node.name is None and not str(node).strip())
    ]
    if not significant:
        return
    first_significant, last_significant = significant[0], significant[-1]

    for index, child in enumerate(children):
        tag = child.name

        if tag is None:  # text node
            # Collapse every whitespace run (newlines, tabs) to one space.
            text = re.sub(r'\s+', ' ', str(child))
            if index <= first_significant:
                text = text.lstrip()
            if index >= last_significant:
                text = text.rstrip()
            if not text:
                continue

            run = para.add_run(text)
            set_font(run, font_name=current_font, size=current_size,
                     bold=bold, italic=italic, underline=underline, strike=strike)
            if color is not None:
                run.font.color.rgb = color
            if bg_color:
                _apply_highlight(run, bg_color)
            if superscript:
                run.font.superscript = True
            if subscript:
                run.font.subscript = True

        elif tag in flag_tags:
            # Recurse into the tag itself; nested markup inside <sup>/<sub>
            # is handled by the normal recursion, so text is not duplicated.
            descend(child, **{flag_tags[tag]: True})

        elif tag == 'code':
            run = para.add_run(child.get_text())
            set_font(run, font_name='Consolas', size=10)

        elif tag == 'a':
            # Rendered as blue underlined text; the target URL is not kept.
            run = para.add_run(child.get_text())
            set_font(run, font_name=current_font, size=current_size)
            run.font.underline = True
            run.font.color.rgb = RGBColor(0, 0, 255)

        elif tag == 'span':
            style = child.get('style', '')
            classes = child.get('class', [])

            span_color = color
            span_bg = bg_color

            # The look-behind keeps "background-color:" from being read as
            # the text colour.
            color_match = re.search(r'(?<![-\w])color:\s*([^;]+)', style)
            if color_match:
                parsed = parse_color(color_match.group(1).strip())
                if parsed:
                    span_color = parsed

            # A colour class overrides the inline style colour.
            for class_name, rgb in class_colors:
                if class_name in classes:
                    span_color = RGBColor(*rgb)
                    break

            bg_match = re.search(r'background(?:-color)?:\s*([^;]+)', style)
            if bg_match:
                span_bg = bg_match.group(1).strip()
            if 'highlight' in classes:
                span_bg = 'yellow'

            descend(child, color=span_color, bg_color=span_bg)

        else:
            # Any other tag is a transparent wrapper.
            descend(child)
|
|
348
|
+
|
|
349
|
+
def process_list_items(items, doc, ordered=False, default_font='微软雅黑', default_size=12, level=0):
    """Append list items, including nested lists, to the document.

    Every ``<li>`` becomes one paragraph in the ``List Number`` or
    ``List Bullet`` style, indented 0.25 inch per nesting level. Nested
    ``<ul>``/``<ol>`` elements are rendered after their parent item.

    Args:
        items: Iterable of BeautifulSoup ``<li>`` tags.
        doc: python-docx document that receives the paragraphs.
        ordered: Use the numbered style instead of bullets.
        default_font: Font name for the item text.
        default_size: Font size in points for the item text.
        level: Zero-based nesting depth of ``items``.
    """
    style_name = 'List Number' if ordered else 'List Bullet'

    for item in items:
        para = doc.add_paragraph(style=style_name)
        para.paragraph_format.left_indent = Inches(0.25 * (level + 1))

        # Work on a re-parsed copy so nested lists can be stripped from the
        # item's own text without mutating the caller's tree.
        item_copy = BeautifulSoup(str(item), 'html.parser').find('li')
        if item_copy is not None:
            for nested in item_copy.find_all(['ul', 'ol'], recursive=False):
                nested.decompose()
            if item_copy.get_text().strip():
                _process_element_to_runs(item_copy, para, default_font, default_size)

        # Render every directly nested list in document order; previously
        # only the first <ul> and the first <ol> were handled.
        for nested_list in item.find_all(['ul', 'ol'], recursive=False):
            process_list_items(
                nested_list.find_all('li', recursive=False), doc,
                ordered=(nested_list.name == 'ol'),
                default_font=default_font, default_size=default_size,
                level=level + 1,
            )
|
|
386
|
+
|
|
387
|
+
def _parse_style(style_str):
|
|
388
|
+
"""解析style字符串为字典"""
|
|
389
|
+
styles = {}
|
|
390
|
+
if not style_str:
|
|
391
|
+
return styles
|
|
392
|
+
for item in style_str.split(';'):
|
|
393
|
+
if ':' in item:
|
|
394
|
+
key, value = item.split(':', 1)
|
|
395
|
+
styles[key.strip()] = value.strip()
|
|
396
|
+
return styles
|
|
397
|
+
|
|
398
|
+
def _apply_cell_style(cell_elem, style_dict):
    """Apply background and text colour from parsed CSS to a table cell.

    Only ``#`` hex colours are honoured; other values (names, ``rgb()``)
    are ignored.

    Args:
        cell_elem: python-docx table cell to style.
        style_dict: Mapping of CSS property names to values; the
            ``'background-color'`` and ``'color'`` entries are read.
    """
    # Route the value through parse_color so malformed hex never reaches the
    # XML; str() of an RGBColor is its "RRGGBB" form.
    background = parse_color(style_dict.get('background-color', ''))
    if background is not None:
        tcPr = cell_elem._element.get_or_add_tcPr()
        # Replace existing shading instead of stacking w:shd elements.
        for stale in tcPr.findall(qn('w:shd')):
            tcPr.remove(stale)
        shading_elm = OxmlElement('w:shd')
        # w:val is a required attribute of w:shd; "clear" is a plain fill.
        shading_elm.set(qn('w:val'), 'clear')
        shading_elm.set(qn('w:color'), 'auto')
        shading_elm.set(qn('w:fill'), str(background))
        tcPr.append(shading_elm)

    text_color = parse_color(style_dict.get('color', ''))
    if text_color is not None:
        for paragraph in cell_elem.paragraphs:
            for run in paragraph.runs:
                run.font.color.rgb = text_color
|
|
416
|
+
|
|
417
|
+
def process_table(table, doc, default_font='微软雅黑', default_size=11):
    """Append an HTML table to the document as a ``Table Grid`` table.

    Cell text is taken from ``get_text()``, so inline markup inside cells is
    flattened. ``<th>`` cells are bold. Inline ``text-align``,
    ``background-color`` and ``color`` styles on cells are honoured, and a
    row-level ``background-color`` applies to cells that do not set their
    own. ``colspan``/``rowspan`` are not interpreted.

    Args:
        table: BeautifulSoup ``<table>`` tag.
        doc: python-docx document that receives the table.
        default_font: Font name applied to the cell text.
        default_size: Font size in points applied to the cell text.
    """
    # Only rows and cells that belong to this table; a table nested inside
    # a cell must not contribute rows or columns to the outer grid.
    rows = [row for row in table.find_all('tr') if row.find_parent('table') is table]
    if not rows:
        return

    row_cells = [row.find_all(['td', 'th'], recursive=False) for row in rows]
    cols = max(len(cells) for cells in row_cells)
    if cols == 0:
        # Rows without any cells: nothing to render.
        return

    word_table = doc.add_table(rows=len(rows), cols=cols)
    word_table.style = 'Table Grid'

    alignments = {
        'center': WD_PARAGRAPH_ALIGNMENT.CENTER,
        'left': WD_PARAGRAPH_ALIGNMENT.LEFT,
        'right': WD_PARAGRAPH_ALIGNMENT.RIGHT,
        'justify': WD_PARAGRAPH_ALIGNMENT.JUSTIFY,
    }

    for row_idx, (row, cells) in enumerate(zip(rows, row_cells)):
        row_bg = _parse_style(row.get('style', '')).get('background-color', '')

        for col_idx, cell in enumerate(cells):
            cell_elem = word_table.rows[row_idx].cells[col_idx]
            paragraph = cell_elem.paragraphs[0]
            paragraph.text = cell.get_text().strip()

            # Apply the requested font (previously both parameters were
            # accepted but never used); header cells are bold.
            for run in paragraph.runs:
                set_font(run, font_name=default_font, size=default_size,
                         bold=(cell.name == 'th'))

            cell_style = _parse_style(cell.get('style', ''))

            # Cells are centred unless the style says otherwise.
            align = cell_style.get('text-align', 'center').lower()
            if align in alignments:
                paragraph.alignment = alignments[align]

            # A row background applies only when the cell has none itself.
            effective_style = dict(cell_style)
            if row_bg and not effective_style.get('background-color'):
                effective_style['background-color'] = row_bg
            _apply_cell_style(cell_elem, effective_style)
|
|
467
|
+
|
|
468
|
+
def set_section_columns(section, cols_num=2, space=720):
    """Configure a multi-column layout on a section.

    Args:
        section: ``docx.section.Section`` to modify.
        cols_num: Number of columns. Defaults to 2.
        space: Gap between columns in twips (1 inch = 1440 twips).
            Defaults to 720 (0.5 inch).
    """
    sectPr = section._sectPr

    # Update an existing w:cols element when the section already has one.
    # Appending a second element would leave two conflicting column
    # definitions in the same section properties.
    # NOTE(review): sections created from python-docx's default template
    # appear to carry a w:cols child already - confirm.
    cols = sectPr.find(qn('w:cols'))
    if cols is None:
        cols = OxmlElement('w:cols')
        sectPr.append(cols)

    cols.set(qn('w:num'), str(cols_num))
    cols.set(qn('w:space'), str(space))
|
|
481
|
+
|
|
482
|
+
def add_columns_section(doc, cols_num=2, space=720):
    """Start a continuous section (no page break) laid out in columns.

    Args:
        doc: python-docx ``Document`` to extend.
        cols_num: Number of columns. Defaults to 2.
        space: Gap between columns in twips. Defaults to 720 (0.5 inch).

    Returns:
        The newly created section object.
    """
    # A continuous break keeps the following content on the same page.
    new_section = doc.add_section(start_type=WD_SECTION.CONTINUOUS)
    set_section_columns(new_section, cols_num, space)
    return new_section
|
|
498
|
+
|
|
499
|
+
def _process_blockquote(blockquote_elem, doc, level=0):
    """Render a blockquote and, recursively, the blockquotes nested in it.

    The quote's own text (everything except nested ``<blockquote>``
    children) becomes one italic grey paragraph with a grey left border,
    indented 0.3 inch per nesting level.

    Args:
        blockquote_elem: BeautifulSoup ``<blockquote>`` tag.
        doc: python-docx document that receives the paragraphs.
        level: Zero-based nesting depth.
    """
    # Gather the text of every child that is not itself a blockquote.
    fragments = [
        str(node) if node.name is None else node.get_text()
        for node in blockquote_elem.children
        if node.name != 'blockquote'
    ]
    quote_text = ' '.join(''.join(fragments).split())

    if quote_text:
        quote_para = doc.add_paragraph()
        quote_run = quote_para.add_run(quote_text)
        set_font(quote_run, italic=True, color=RGBColor(100, 100, 100))

        fmt = quote_para.paragraph_format
        fmt.left_indent = Inches(0.3 * level)
        fmt.right_indent = Inches(0.5)

        # Grey bar along the left edge.
        bar = OxmlElement('w:left')
        for attr, value in (('w:val', 'single'), ('w:sz', '18'), ('w:color', 'CCCCCC')):
            bar.set(qn(attr), value)
        borders = OxmlElement('w:pBdr')
        borders.append(bar)
        fmt._element.get_or_add_pPr().append(borders)

    # Directly nested quotes are rendered one level deeper.
    for inner in blockquote_elem.find_all('blockquote', recursive=False):
        _process_blockquote(inner, doc, level + 1)
|
|
532
|
+
|
|
533
|
+
def add_page_break(doc):
    """Append a page break to ``doc`` via its ``add_page_break`` method."""
    doc.add_page_break()
|
|
536
|
+
|
|
537
|
+
def add_horizontal_rule(doc):
    """Append a horizontal rule: a centred line of 50 light-grey underscores."""
    rule_para = doc.add_paragraph()
    rule_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    rule_run = rule_para.add_run('_' * 50)
    rule_run.font.color.rgb = RGBColor(200, 200, 200)
|
|
543
|
+
|
|
544
|
+
def _shade_paragraph(para, fill):
    """Give ``para`` a solid background; ``fill`` is an ``RRGGBB`` string."""
    shading_elm = OxmlElement('w:shd')
    # w:val is a required attribute of w:shd; "clear" is a plain fill.
    shading_elm.set(qn('w:val'), 'clear')
    shading_elm.set(qn('w:color'), 'auto')
    shading_elm.set(qn('w:fill'), fill)
    para.paragraph_format._element.get_or_add_pPr().append(shading_elm)


def _add_left_border(para, color, size='24'):
    """Draw a single-line left border on ``para`` in ``RRGGBB`` colour ``color``."""
    left_border = OxmlElement('w:left')
    left_border.set(qn('w:val'), 'single')
    left_border.set(qn('w:sz'), size)
    left_border.set(qn('w:color'), color)
    pBdr = OxmlElement('w:pBdr')
    pBdr.append(left_border)
    para.paragraph_format._element.get_or_add_pPr().append(pBdr)


def _css_hex(value):
    """Return a CSS ``#`` colour as ``RRGGBB`` text, or ``None`` if unusable."""
    rgb = parse_color(value)
    return str(rgb) if rgb is not None else None


def _add_callout(doc, element, fill, border_color, default_font, default_size):
    """Render ``element`` as one paragraph with optional shading and left bar."""
    para = doc.add_paragraph()
    para.paragraph_format.right_indent = Inches(0.3)
    _process_element_to_runs(element, para, default_font, default_size)
    if fill:
        _shade_paragraph(para, fill)
    if border_color:
        _add_left_border(para, border_color)
    return para


def _convert_heading(doc, element, default_font):
    """Render an ``<h1>``-``<h6>`` element as a styled Word heading."""
    level = int(element.name[1])
    heading = doc.add_heading(element.get_text().strip(), level=level)
    heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

    for run in heading.runs:
        run.font.name = default_font
        run._element.rPr.rFonts.set(qn('w:eastAsia'), default_font)
        run.font.bold = True
        if level == 1:
            run.font.size = Pt(18)
            run.font.color.rgb = RGBColor(74, 63, 107)
            heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        elif level == 2:
            run.font.size = Pt(16)
            run.font.color.rgb = RGBColor(91, 78, 140)
        else:
            run.font.size = Pt(14)


def _convert_paragraph(doc, element, default_font, default_size):
    """Render a top-level ``<p>``, honouring its layout classes."""
    classes = element.get('class', [])
    fonts = {'default_font': default_font, 'default_size': default_size}

    if 'center' in classes:
        process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER, **fonts)
    elif 'right' in classes:
        process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT, **fonts)
    elif 'dialogue' in classes:
        para = process_paragraph(element, doc, indent=0.5, line_spacing=1.5, **fonts)
        for run in para.runs:
            set_font(run, font_name=default_font, size=default_size,
                     italic=True, color=RGBColor(107, 91, 122))
    elif 'quote' in classes or 'background' in element.get('style', ''):
        para = process_paragraph(element, doc, **fonts)
        para.paragraph_format.left_indent = Inches(1)
        para.paragraph_format.right_indent = Inches(1)
        # A light grey background stands in for a quote box. The former
        # `from docx.enum.text import WD_BORDER` here was removed: that name
        # does not exist, so every quote paragraph raised ImportError.
        _shade_paragraph(para, 'F5F5F5')
    else:
        process_paragraph(element, doc, indent=0.5, line_spacing=1.5, **fonts)


def _convert_div(doc, element, default_font, default_size):
    """Render a top-level ``<div>`` according to its class or inline style."""
    classes = element.get('class', [])
    fonts = {'default_font': default_font, 'default_size': default_size}

    if 'chapter' in classes:
        h2 = element.find('h2')
        if h2:
            heading = doc.add_heading(h2.get_text().strip(), level=2)
            for run in heading.runs:
                run.font.color.rgb = RGBColor(91, 78, 140)
                run.font.size = Pt(16)
                run.font.name = default_font
                run._element.rPr.rFonts.set(qn('w:eastAsia'), default_font)

        for p in element.find_all('p'):
            first_span = p.find('span', class_='first-line')
            if first_span:
                # Enlarged, coloured lead-in followed by the remaining text.
                para = doc.add_paragraph()
                lead_run = para.add_run(first_span.text)
                set_font(lead_run, font_name=default_font, size=20, bold=True,
                         color=RGBColor(102, 126, 234))
                remaining_text = p.get_text().replace(first_span.text, '', 1)
                set_font(para.add_run(remaining_text),
                         font_name=default_font, size=default_size)
            else:
                process_paragraph(p, doc, indent=0.5, line_spacing=1.5, **fonts)

    elif 'ending' in classes:
        para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER, **fonts)
        for run in para.runs:
            set_font(run, font_name=default_font, italic=True, size=14,
                     color=RGBColor(91, 78, 140))

    elif 'page-break' in classes:
        add_page_break(doc)

    elif 'columns' in classes:
        try:
            cols_num = int(element.get('data-cols', '2'))
        except (TypeError, ValueError):
            cols_num = 2
        if cols_num < 1:
            cols_num = 2

        # A continuous section break starts the columns without a new page.
        add_columns_section(doc, cols_num)
        for p in element.find_all('p', recursive=False):
            process_paragraph(p, doc, **fonts)
        # Close the column block so content after the div is laid out in a
        # single column again instead of inheriting the column layout.
        add_columns_section(doc, 1)

    elif 'info' in classes or 'warning' in classes or 'success' in classes:
        # (background, left border) per callout type; 'info' wins over
        # 'warning', which wins over 'success'.
        if 'info' in classes:
            fill, border = 'E3F2FD', '2196F3'
        elif 'warning' in classes:
            fill, border = 'FFF3CD', 'FFC107'
        else:
            fill, border = 'D4EDDA', '28A745'
        _add_callout(doc, element, fill, border, default_font, default_size)

    else:
        style_dict = _parse_style(element.get('style', ''))
        bg_color = style_dict.get('background-color', '')
        border_left = style_dict.get('border-left', '')

        if bg_color and border_left:
            # Background plus left border marks an inline-styled callout.
            border_color = None
            if 'solid' in border_left:
                for part in border_left.split():
                    if part.startswith('#'):
                        border_color = _css_hex(part)
                        break
            _add_callout(doc, element, _css_hex(bg_color), border_color,
                         default_font, default_size)
        else:
            # Plain container: render its direct paragraphs.
            for p in element.find_all('p', recursive=False):
                process_paragraph(p, doc, **fonts)


def _convert_image(doc, element, html_dir):
    """Insert an ``<img>``, or a centred placeholder when it cannot be loaded."""
    src = element.get('src', '')
    alt = element.get('alt', '图片')

    # Try the path as written, then relative to the HTML file's directory.
    image_path = None
    if src:
        for candidate in (src, os.path.join(html_dir, src)):
            if os.path.exists(candidate):
                image_path = candidate
                break

    if image_path is None:
        para = doc.add_paragraph(f'[图片: {alt} - 路径: {src}]')
        para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
        return

    try:
        doc.add_picture(image_path, width=Inches(5))
        doc.paragraphs[-1].alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    except Exception:
        # Unsupported or corrupt image: fall back to a text placeholder.
        para = doc.add_paragraph(f'[图片: {alt}]')
        para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER


def convert_html_to_docx(html_file, output_file, default_font='微软雅黑', default_size=12):
    """Convert an HTML file into a DOCX file.

    Only the direct children of ``<body>`` (or of the document root when
    there is no ``<body>``) are converted. Supported elements are headings,
    ``p``, ``ul``/``ol``, ``blockquote``, ``pre``, ``hr``, ``table``, ``div``
    and ``img``; other top-level elements are skipped. The page is set to A4
    with 2.54 cm margins.

    Args:
        html_file: Path of the UTF-8 encoded HTML input.
        output_file: Path of the DOCX file to write.
        default_font: Font name for body text, also set as the East Asian
            font.
        default_size: Body font size in points.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    doc = Document()

    # Document-wide default font.
    normal_style = doc.styles['Normal']
    normal_style.font.name = default_font
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), default_font)
    normal_style.font.size = Pt(default_size)

    # A4 page with 2.54 cm margins.
    section = doc.sections[0]
    section.page_height = Cm(29.7)
    section.page_width = Cm(21)
    section.left_margin = Cm(2.54)
    section.right_margin = Cm(2.54)
    section.top_margin = Cm(2.54)
    section.bottom_margin = Cm(2.54)

    # HTML fragments have no <body>; walk the parsed root in that case.
    root = soup.body if soup.body is not None else soup
    html_dir = os.path.dirname(os.path.abspath(html_file))

    for element in root.find_all(recursive=False):
        tag = element.name

        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            _convert_heading(doc, element, default_font)

        elif tag == 'p':
            _convert_paragraph(doc, element, default_font, default_size)

        elif tag in ('ul', 'ol'):
            process_list_items(element.find_all('li', recursive=False), doc,
                               ordered=(tag == 'ol'),
                               default_font=default_font, default_size=default_size)

        elif tag == 'blockquote':
            _process_blockquote(element, doc, level=0)

        elif tag == 'pre':
            para = doc.add_paragraph()
            para.paragraph_format.left_indent = Inches(0.5)
            run = para.add_run(element.get_text())
            set_font(run, font_name='Consolas', size=10, color=RGBColor(0, 0, 128))
            _shade_paragraph(para, 'F0F0F0')

        elif tag == 'hr':
            # A page-break class or style turns the rule into a page break.
            if ('page-break' in element.get('class', [])
                    or 'page-break-after' in element.get('style', '')):
                add_page_break(doc)
            else:
                add_horizontal_rule(doc)

        elif tag == 'table':
            # The size is left at process_table's own default.
            process_table(element, doc, default_font=default_font)

        elif tag == 'div':
            _convert_div(doc, element, default_font, default_size)

        elif tag == 'img':
            _convert_image(doc, element, html_dir)

    doc.save(output_file)
    print(f"转换完成!文件已保存为 {output_file}")
|
|
796
|
+
|
|
797
|
+
if __name__ == '__main__':
    import sys

    # Both paths are required. The previous fallback pointed at files on
    # the original developer's machine and failed everywhere else.
    if len(sys.argv) < 3:
        print(f"usage: {sys.argv[0]} <input.html> <output.docx>", file=sys.stderr)
        sys.exit(2)

    convert_html_to_docx(sys.argv[1], sys.argv[2])
|