@birthday8/doc-mcp 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,807 @@
1
+ from docx import Document
2
+ from docx.shared import Pt, RGBColor, Inches, Cm
3
+ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
4
+ from docx.enum.section import WD_SECTION
5
+ from docx.oxml.ns import qn
6
+ from docx.oxml import OxmlElement
7
+ from bs4 import BeautifulSoup
8
+ import os
9
+ import re
10
+
11
def parse_color(color_str):
    """Parse a CSS hex colour string into an ``RGBColor``.

    Accepts ``#RRGGBB`` and the three-digit shorthand ``#RGB`` (each digit is
    doubled). Anything after the first six hex digits is ignored, so values
    such as ``#RRGGBBAA`` still resolve to their RGB part.

    Args:
        color_str: Colour text, e.g. ``"#ff8800"``. May be ``None`` or empty.

    Returns:
        An ``RGBColor`` instance, or ``None`` when the value is empty, does not
        start with ``#``, or does not contain valid hex digits.
    """
    if not color_str:
        return None

    value = color_str.strip()
    if not value.startswith('#'):
        return None

    digits = value[1:]
    if len(digits) == 3:
        # CSS shorthand: "#abc" means "#aabbcc".
        digits = ''.join(ch * 2 for ch in digits)
    else:
        digits = digits[:6]

    # Validate explicitly: int(..., 16) alone would also accept signs and
    # surrounding whitespace, which are not valid in a colour literal.
    if len(digits) != 6 or any(ch not in '0123456789abcdefABCDEF' for ch in digits):
        return None

    r, g, b = (int(digits[i:i + 2], 16) for i in (0, 2, 4))
    return RGBColor(r, g, b)
22
+
23
def set_font(run, font_name='微软雅黑', size=12, color=None, bold=False, italic=False,
             underline=False, strike=False, highlight_color=None):
    """Apply font face, size and basic character formatting to a run.

    Args:
        run: The run to format.
        font_name: Font face; also written to the East Asian font slot.
        size: Font size in points.
        color: Optional colour assigned to ``run.font.color.rgb``; skipped when falsy.
        bold: Assigned to ``run.font.bold`` unconditionally.
        italic: Assigned to ``run.font.italic`` unconditionally.
        underline: When true, underline is switched on; otherwise left untouched.
        strike: When true, strikethrough is switched on; otherwise left untouched.
        highlight_color: Optional highlight name (``'yellow'``, ``'green'``,
            ``'cyan'``, ``'magenta'``, ``'blue'``, ``'red'``, ``'darkblue'``,
            ``'orange'``, ``'gray'``). Unknown names are ignored.
    """
    run.font.name = font_name
    # Also set the East Asian font slot so CJK characters use the same face.
    run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
    run.font.size = Pt(size)
    run.font.bold = bold
    run.font.italic = italic

    if color:
        run.font.color.rgb = color

    if underline:
        run.font.underline = True

    if strike:
        run.font.strike = True

    if highlight_color:
        from docx.enum.text import WD_COLOR_INDEX

        # Only members that WD_COLOR_INDEX actually defines are used here.
        # It has no CYAN, MAGENTA or ORANGE member, so those names are mapped
        # to TURQUOISE, PINK and YELLOW respectively.
        color_map = {
            'yellow': WD_COLOR_INDEX.YELLOW,
            'green': WD_COLOR_INDEX.BRIGHT_GREEN,
            'cyan': WD_COLOR_INDEX.TURQUOISE,
            'magenta': WD_COLOR_INDEX.PINK,
            'blue': WD_COLOR_INDEX.TURQUOISE,
            'red': WD_COLOR_INDEX.RED,
            'darkblue': WD_COLOR_INDEX.BLUE,
            'orange': WD_COLOR_INDEX.YELLOW,
            'gray': WD_COLOR_INDEX.GRAY_25,
        }
        if highlight_color in color_map:
            run.font.highlight_color = color_map[highlight_color]
56
+
57
def process_inline_elements(element, parent_run=None):
    """Flatten the inline children of *element*.

    The function works in one of two modes:

    * ``parent_run`` is ``None``: nothing is written; a list of dicts is
      returned, each with a ``'text'`` key plus optional formatting keys
      (``'bold'``, ``'italic'``, ``'underline'``, ``'strike'``,
      ``'superscript'``, ``'subscript'``, ``'font'``, ``'size'``, ``'link'``,
      ``'color'``, ``'highlight'``).
    * ``parent_run`` is given: text is appended to that run and formatting
      flags are switched on for the whole run; the returned list is empty.

    Args:
        element: Parsed HTML element whose ``children`` are walked.
        parent_run: Optional run that receives text and formatting.

    Returns:
        The list of run descriptions (empty in ``parent_run`` mode).
    """
    # Tags that only toggle one boolean font property.
    flag_for_tag = {
        'strong': 'bold', 'b': 'bold',
        'em': 'italic', 'i': 'italic',
        'u': 'underline',
        's': 'strike', 'del': 'strike',
        'sup': 'superscript',
        'sub': 'subscript',
    }

    runs = []

    for child in element.children:
        name = child.name

        if name is None:  # text node
            text = str(child).strip()
            if not text:
                continue
            if parent_run is not None:
                parent_run.add_text(text)
            else:
                runs.append({'text': text})

        elif name in flag_for_tag:
            flag = flag_for_tag[name]
            if parent_run is not None:
                # Set the flag on run.font: strike/superscript/subscript only
                # exist there, and bold/italic/underline are available too.
                setattr(parent_run.font, flag, True)
                process_inline_elements(child, parent_run)
            else:
                runs.append({'text': child.get_text(), flag: True})

        elif name == 'code':
            code_text = child.get_text()
            if parent_run is not None:
                parent_run.font.name = 'Consolas'
                parent_run.font.size = Pt(10)
                parent_run.add_text(code_text)
            else:
                runs.append({'text': code_text, 'font': 'Consolas', 'size': 10})

        elif name == 'a':
            link_text = child.get_text()
            href = child.get('href', '')
            if parent_run is not None:
                parent_run.add_text(link_text)
            else:
                runs.append({'text': link_text, 'link': href})

        elif name == 'span':
            style = child.get('style', '')
            color_match = re.search(r'color:\s*([^;]+)', style)
            bg_match = re.search(r'background(?:-color)?:\s*([^;]+)', style)

            props = {'text': child.get_text()}
            if color_match:
                color = parse_color(color_match.group(1).strip())
                if color:
                    props['color'] = color
            if bg_match:
                bg_color = bg_match.group(1).strip()
                if bg_color.startswith('#'):
                    bg_rgb = parse_color(bg_color)
                    if bg_rgb:
                        props['highlight'] = str(bg_rgb)

            if parent_run is not None:
                if 'color' in props:
                    parent_run.font.color.rgb = props['color']
                process_inline_elements(child, parent_run)
            else:
                runs.append(props)

        else:
            # Unknown wrapper tag: keep whatever its children produce instead
            # of dropping the nested result.
            runs.extend(process_inline_elements(child, parent_run))

    return runs
149
+
150
def _apply_highlight(run, bg_color):
    """Apply a background colour to *run*.

    Named colours are mapped to the nearest ``WD_COLOR_INDEX`` highlight.
    Hex colours (``#RGB``, ``#RRGGBB`` or ``#RRGGBBAA``) are written as a
    ``w:shd`` fill on the run properties. Any other value is ignored.

    Args:
        run: The run to decorate.
        bg_color: CSS colour name or hex string.
    """
    from docx.enum.text import WD_COLOR_INDEX

    # Colour name -> highlight index. Names without an exact counterpart are
    # mapped to a visually close member.
    named_colors = {
        'yellow': WD_COLOR_INDEX.YELLOW,
        'green': WD_COLOR_INDEX.GREEN,
        'brightgreen': WD_COLOR_INDEX.BRIGHT_GREEN,
        'blue': WD_COLOR_INDEX.BLUE,
        'darkblue': WD_COLOR_INDEX.DARK_BLUE,
        'red': WD_COLOR_INDEX.RED,
        'darkred': WD_COLOR_INDEX.DARK_RED,
        'darkyellow': WD_COLOR_INDEX.DARK_YELLOW,
        'lightgray': WD_COLOR_INDEX.GRAY_25,
        'gray': WD_COLOR_INDEX.GRAY_50,
        'black': WD_COLOR_INDEX.BLACK,
        'white': WD_COLOR_INDEX.WHITE,
        'pink': WD_COLOR_INDEX.PINK,
        'teal': WD_COLOR_INDEX.TEAL,
        'turquoise': WD_COLOR_INDEX.TURQUOISE,
        'violet': WD_COLOR_INDEX.VIOLET,
        'cyan': WD_COLOR_INDEX.TURQUOISE,
        # PINK is the bright magenta highlight; VIOLET is the dark variant.
        'magenta': WD_COLOR_INDEX.PINK,
        # Approximations for common names with no direct highlight.
        'lightblue': WD_COLOR_INDEX.TURQUOISE,
        'lightyellow': WD_COLOR_INDEX.YELLOW,
        'lightgreen': WD_COLOR_INDEX.BRIGHT_GREEN,
        'orange': WD_COLOR_INDEX.YELLOW,
        'purple': WD_COLOR_INDEX.VIOLET,
        'brown': WD_COLOR_INDEX.DARK_YELLOW,
    }

    bg_lower = bg_color.lower().strip()

    if bg_lower in named_colors:
        run.font.highlight_color = named_colors[bg_lower]
        return

    if not bg_lower.startswith('#'):
        return

    digits = bg_lower[1:]
    if len(digits) == 3:
        # Expand CSS shorthand "#abc" to "aabbcc".
        digits = ''.join(ch * 2 for ch in digits)
    elif len(digits) == 8:
        # Drop the alpha channel of "#rrggbbaa".
        digits = digits[:6]

    # Only write a well-formed six-digit hex value into the document.
    if len(digits) != 6 or any(ch not in '0123456789abcdef' for ch in digits):
        return

    shading_elm = OxmlElement('w:shd')
    shading_elm.set(qn('w:fill'), digits.upper())
    run._element.get_or_add_rPr().append(shading_elm)
199
+
200
def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=12,
                      indent=None, align=None, line_spacing=None):
    """Append a paragraph to *doc* built from an HTML element and its inline content.

    Args:
        paragraph: Parsed HTML element to convert.
        doc: Document that receives the new paragraph.
        default_font: Font face used for text without an explicit font.
        default_size: Font size in points; also the size of one ``em`` when
            ``data-indent`` is used.
        indent: First-line indent in inches. When ``None``, the element's
            ``data-indent`` attribute (a number of ems) is used if present.
        align: Optional paragraph alignment value.
        line_spacing: Optional line spacing; skipped when falsy.

    Returns:
        The newly created paragraph.
    """
    para = doc.add_paragraph()
    fmt = para.paragraph_format

    # Compare against None: the LEFT alignment member has integer value 0 and
    # would be skipped by a plain truthiness test.
    if align is not None:
        para.alignment = align

    if indent is not None:
        fmt.first_line_indent = Inches(indent)
    else:
        data_indent = paragraph.get('data-indent', '')
        if data_indent:
            try:
                # indent = font size (pt) x number of ems
                fmt.first_line_indent = Pt(default_size * float(data_indent))
            except (ValueError, OverflowError):
                # Non-numeric or non-finite data-indent: leave the indent unset.
                pass

    if line_spacing:
        fmt.line_spacing = line_spacing

    # Walk the element recursively and create one run per formatted fragment.
    _process_element_to_runs(paragraph, para, default_font, default_size)

    return para
231
+
232
def _process_element_to_runs(element, para, default_font='微软雅黑', default_size=12,
                             bold=False, italic=False, underline=False, strike=False,
                             color=None, bg_color=None, font_name=None, font_size=None,
                             superscript=False, subscript=False):
    """Recursively convert *element*'s children into separately formatted runs.

    Each text node becomes its own run carrying the formatting accumulated
    from the enclosing tags. Runs of whitespace are collapsed to one space;
    leading whitespace is dropped at the start of the paragraph and after a
    run that already ends with a space. HTML comments are skipped.

    Args:
        element: Parsed HTML element whose ``children`` are walked.
        para: Paragraph that receives the runs.
        default_font: Font face used when ``font_name`` is not given.
        default_size: Font size (pt) used when ``font_size`` is not given.
        bold, italic, underline, strike: Inherited character formatting.
        color: Inherited text colour (``RGBColor``) or ``None``.
        bg_color: Inherited background colour string or ``None``.
        font_name: Explicit font face overriding ``default_font``.
        font_size: Explicit font size overriding ``default_size``.
        superscript: Inherited superscript flag (set by ``<sup>``).
        subscript: Inherited subscript flag (set by ``<sub>``).
    """
    from bs4 import Comment

    current_font = font_name or default_font
    current_size = font_size or default_size

    # Formatting state handed down to nested elements.
    inherited = {
        'bold': bold, 'italic': italic, 'underline': underline, 'strike': strike,
        'color': color, 'bg_color': bg_color,
        'font_name': font_name, 'font_size': font_size,
        'superscript': superscript, 'subscript': subscript,
    }

    def descend(node, **overrides):
        """Recurse into *node* with the inherited state plus *overrides*."""
        _process_element_to_runs(node, para, default_font, default_size,
                                 **dict(inherited, **overrides))

    # Tags that only switch formatting flags for their content.
    flag_tags = {
        'strong': {'bold': True},
        'b': {'bold': True},
        'em': {'italic': True},
        'i': {'italic': True},
        'u': {'underline': True},
        's': {'strike': True},
        'del': {'strike': True},
        'sup': {'superscript': True, 'subscript': False},
        'sub': {'subscript': True, 'superscript': False},
    }

    for child in element.children:
        name = child.name

        if name is None:  # text node
            if isinstance(child, Comment):
                # HTML comments are not document content.
                continue

            # Collapse whitespace but keep a single separating space so that
            # adjacent inline fragments are not glued together.
            text = re.sub(r'\s+', ' ', str(child))
            existing_runs = para.runs
            if not existing_runs or existing_runs[-1].text.endswith(' '):
                text = text.lstrip(' ')
            if not text:
                continue

            run = para.add_run(text)
            set_font(run, font_name=current_font, size=current_size,
                     bold=bold, italic=italic, underline=underline, strike=strike)
            if superscript:
                run.font.superscript = True
            if subscript:
                run.font.subscript = True
            if color:
                run.font.color.rgb = color
            if bg_color:
                _apply_highlight(run, bg_color)

        elif name in flag_tags:
            descend(child, **flag_tags[name])

        elif name == 'code':
            run = para.add_run(child.get_text())
            set_font(run, font_name='Consolas', size=10)

        elif name == 'a':
            # Links are rendered as blue underlined text; the href is not kept.
            run = para.add_run(child.get_text())
            set_font(run, font_name=current_font, size=current_size)
            run.font.underline = True
            run.font.color.rgb = RGBColor(0, 0, 255)

        elif name == 'span':
            style = child.get('style', '')
            classes = child.get('class', [])

            span_color = color
            span_bg = bg_color

            # Text colour from the inline style.
            color_match = re.search(r'color:\s*([^;]+)', style)
            if color_match:
                parsed = parse_color(color_match.group(1).strip())
                if parsed:
                    span_color = parsed

            # Colour classes take precedence over the inline style.
            if 'red' in classes:
                span_color = RGBColor(255, 0, 0)
            elif 'blue' in classes:
                span_color = RGBColor(0, 0, 255)
            elif 'green' in classes:
                span_color = RGBColor(0, 128, 0)
            elif 'purple' in classes:
                span_color = RGBColor(128, 0, 128)

            # Background colour from the inline style or the highlight class.
            bg_match = re.search(r'background(?:-color)?:\s*([^;]+)', style)
            if bg_match:
                span_bg = bg_match.group(1).strip()
            if 'highlight' in classes:
                span_bg = 'yellow'

            descend(child, color=span_color, bg_color=span_bg)

        else:
            # Any other tag: keep the current formatting and recurse.
            descend(child)
348
+
349
def process_list_items(items, doc, ordered=False, default_font='微软雅黑', default_size=12, level=0):
    """Append list items to *doc*, recursing into nested lists.

    Every ``<ul>``/``<ol>`` that is a direct child of an item is processed, in
    document order, one nesting level deeper.

    Args:
        items: Iterable of ``<li>`` elements.
        doc: Document that receives the list paragraphs.
        ordered: Use the ``'List Number'`` style when true, else ``'List Bullet'``.
        default_font: Font face for the item text.
        default_size: Font size (pt) for the item text.
        level: Zero-based nesting depth; each level adds 0.25 inch of left indent.
    """
    style_name = 'List Number' if ordered else 'List Bullet'

    for item in items:
        para = doc.add_paragraph(style=style_name)
        para.paragraph_format.left_indent = Inches(0.25 * (level + 1))

        # Work on a re-parsed copy so nested lists can be removed from the
        # item's own text without mutating the source tree.
        item_copy = BeautifulSoup(str(item), 'html.parser').find('li')
        if item_copy:
            for nested in item_copy.find_all(['ul', 'ol'], recursive=False):
                nested.decompose()

            if item_copy.get_text().strip():
                _process_element_to_runs(item_copy, para, default_font, default_size)

        # Handle every directly nested list, preserving source order.
        for nested_list in item.find_all(['ul', 'ol'], recursive=False):
            process_list_items(nested_list.find_all('li', recursive=False), doc,
                               ordered=(nested_list.name == 'ol'),
                               default_font=default_font, default_size=default_size,
                               level=level + 1)
386
+
387
+ def _parse_style(style_str):
388
+ """解析style字符串为字典"""
389
+ styles = {}
390
+ if not style_str:
391
+ return styles
392
+ for item in style_str.split(';'):
393
+ if ':' in item:
394
+ key, value = item.split(':', 1)
395
+ styles[key.strip()] = value.strip()
396
+ return styles
397
+
398
+ def _apply_cell_style(cell_elem, style_dict):
399
+ """应用单元格样式"""
400
+ # 背景色
401
+ bg_color = style_dict.get('background-color', '')
402
+ if bg_color:
403
+ # 处理颜色值
404
+ if bg_color.startswith('#'):
405
+ shading_elm = OxmlElement('w:shd')
406
+ shading_elm.set(qn('w:fill'), bg_color[1:].upper())
407
+ cell_elem._element.get_or_add_tcPr().append(shading_elm)
408
+
409
+ # 文字颜色
410
+ color = style_dict.get('color', '')
411
+ if color:
412
+ rgb = parse_color(color) if color.startswith('#') else None
413
+ if rgb:
414
+ for run in cell_elem.paragraphs[0].runs:
415
+ run.font.color.rgb = rgb
416
+
417
def process_table(table, doc, default_font='微软雅黑', default_size=11):
    """Append a Word table built from an HTML ``<table>`` element.

    Only rows and cells that belong to *table* itself are used; rows of
    tables nested inside a cell are not turned into extra rows (their text
    still appears as part of the enclosing cell's text). Cell text is plain
    text formatted with ``default_font``/``default_size``; ``<th>`` cells are
    bold. Inline ``text-align``, ``background-color`` and ``color`` styles
    are honoured, and a row-level ``background-color`` is used for cells
    without their own.

    Args:
        table: Parsed ``<table>`` element.
        doc: Document that receives the table.
        default_font: Font face for the cell text.
        default_size: Font size (pt) for the cell text.
    """
    def is_own_row(tr):
        """Return True unless *tr* sits inside a table nested within *table*."""
        for ancestor in tr.parents:
            if ancestor is table:
                return True
            if ancestor.name == 'table':
                return False
        return True

    rows = [tr for tr in table.find_all('tr') if is_own_row(tr)]
    if not rows:
        return

    # Direct children only, so cells of nested tables are not counted.
    row_cells = [row.find_all(['td', 'th'], recursive=False) for row in rows]
    cols = max(len(cells) for cells in row_cells)
    if cols == 0:
        return

    word_table = doc.add_table(rows=len(rows), cols=cols)
    word_table.style = 'Table Grid'

    alignments = {
        'center': WD_PARAGRAPH_ALIGNMENT.CENTER,
        'left': WD_PARAGRAPH_ALIGNMENT.LEFT,
        'right': WD_PARAGRAPH_ALIGNMENT.RIGHT,
    }

    for row_idx, (row, cells) in enumerate(zip(rows, row_cells)):
        row_bg = _parse_style(row.get('style', '')).get('background-color', '')
        word_cells = word_table.rows[row_idx].cells

        for col_idx, cell in enumerate(cells):
            cell_elem = word_cells[col_idx]
            first_para = cell_elem.paragraphs[0]
            first_para.text = cell.get_text().strip()

            cell_style = _parse_style(cell.get('style', ''))

            # Apply the requested font; header cells are bold.
            for run in first_para.runs:
                set_font(run, font_name=default_font, size=default_size,
                         bold=(cell.name == 'th'))

            # Cells are centred unless the style says otherwise; unknown
            # text-align values leave the alignment untouched.
            align = cell_style.get('text-align', 'center')
            if align in alignments:
                first_para.alignment = alignments[align]

            # Cell-level background and text colour.
            _apply_cell_style(cell_elem, cell_style)

            # Fall back to the row background when the cell has none.
            if row_bg and not cell_style.get('background-color'):
                if row_bg.startswith('#'):
                    shading_elm = OxmlElement('w:shd')
                    shading_elm.set(qn('w:fill'), row_bg[1:].upper())
                    cell_elem._element.get_or_add_tcPr().append(shading_elm)
467
+
468
def set_section_columns(section, cols_num=2, space=720):
    """Configure a multi-column layout on a section.

    An existing ``w:cols`` child of the section properties is updated in
    place; one is appended only when none exists, so the section never ends
    up with two ``w:cols`` elements.

    Args:
        section: Section object whose ``_sectPr`` element is modified.
        cols_num: Number of columns (default 2).
        space: Gap between columns in twips (1 inch = 1440 twips); the
            default 720 is 0.5 inch.
    """
    sectPr = section._sectPr
    cols = sectPr.find(qn('w:cols'))
    if cols is None:
        cols = OxmlElement('w:cols')
        sectPr.append(cols)
    cols.set(qn('w:num'), str(cols_num))
    cols.set(qn('w:space'), str(space))
481
+
482
def add_columns_section(doc, cols_num=2, space=720):
    """Start a new continuous section (no page break) laid out in columns.

    Args:
        doc: Document to extend.
        cols_num: Number of columns (default 2).
        space: Gap between columns in twips (default 720, i.e. 0.5 inch).

    Returns:
        The section that was just added.
    """
    # A continuous section break keeps the following content on the same page.
    new_section = doc.add_section(start_type=WD_SECTION.CONTINUOUS)
    set_section_columns(new_section, cols_num, space)
    return new_section
498
+
499
+ def _process_blockquote(blockquote_elem, doc, level=0):
500
+ """递归处理嵌套引用"""
501
+ # 获取当前引用的直接文本内容(不包括嵌套引用)
502
+ direct_text = ''
503
+ for child in blockquote_elem.children:
504
+ if child.name is None: # 文本节点
505
+ direct_text += str(child)
506
+ elif child.name != 'blockquote': # 其他非引用标签
507
+ direct_text += child.get_text()
508
+
509
+ direct_text = ' '.join(direct_text.split())
510
+
511
+ # 如果有直接文本,创建段落
512
+ if direct_text:
513
+ para = doc.add_paragraph()
514
+ run = para.add_run(direct_text)
515
+ set_font(run, italic=True, color=RGBColor(100, 100, 100))
516
+ # 根据层级设置缩进
517
+ para.paragraph_format.left_indent = Inches(0.3 * level)
518
+ para.paragraph_format.right_indent = Inches(0.5)
519
+ # 添加灰色左边框
520
+ pBdr = OxmlElement('w:pBdr')
521
+ left_border = OxmlElement('w:left')
522
+ left_border.set(qn('w:val'), 'single')
523
+ left_border.set(qn('w:sz'), '18')
524
+ left_border.set(qn('w:color'), 'CCCCCC')
525
+ pBdr.append(left_border)
526
+ para.paragraph_format._element.get_or_add_pPr().append(pBdr)
527
+
528
+ # 递归处理嵌套引用
529
+ nested_quotes = blockquote_elem.find_all('blockquote', recursive=False)
530
+ for nested in nested_quotes:
531
+ _process_blockquote(nested, doc, level + 1)
532
+
533
def add_page_break(doc):
    """Append a page break to *doc* by delegating to ``doc.add_page_break()``."""
    doc.add_page_break()
536
+
537
def add_horizontal_rule(doc):
    """Append a horizontal rule, drawn as a centred light-grey row of underscores."""
    rule_para = doc.add_paragraph()
    rule_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    rule_run = rule_para.add_run('_' * 50)
    rule_run.font.color.rgb = RGBColor(200, 200, 200)
543
+
544
def _shade_paragraph(para, fill_hex):
    """Append a ``w:shd`` element with fill *fill_hex* to the paragraph properties."""
    shading_elm = OxmlElement('w:shd')
    shading_elm.set(qn('w:fill'), fill_hex)
    para.paragraph_format._element.get_or_add_pPr().append(shading_elm)


def _add_left_border(para, color_hex, size='24'):
    """Append a single-line left border of the given colour and size to *para*."""
    pBdr = OxmlElement('w:pBdr')
    left_border = OxmlElement('w:left')
    left_border.set(qn('w:val'), 'single')
    left_border.set(qn('w:sz'), size)
    left_border.set(qn('w:color'), color_hex)
    pBdr.append(left_border)
    para.paragraph_format._element.get_or_add_pPr().append(pBdr)


def convert_html_to_docx(html_file, output_file, default_font='微软雅黑', default_size=12):
    """Convert an HTML file into a DOCX file.

    Only the top-level children of ``<body>`` (or of the document itself when
    there is no ``<body>``) are converted: headings, paragraphs, lists,
    blockquotes, ``<pre>``, ``<hr>``, tables, ``<div>`` containers and
    images. Other top-level tags are ignored.

    Args:
        html_file: Path of the UTF-8 encoded HTML file to read.
        output_file: Path of the DOCX file to write.
        default_font: Font face for body text.
        default_size: Font size (pt) for body text.
    """
    with open(html_file, 'r', encoding='utf-8') as f:
        html_content = f.read()

    soup = BeautifulSoup(html_content, 'html.parser')
    doc = Document()

    # Default font, including the East Asian slot, on the Normal style.
    normal_style = doc.styles['Normal']
    normal_style.font.name = default_font
    normal_style._element.rPr.rFonts.set(qn('w:eastAsia'), default_font)
    normal_style.font.size = Pt(default_size)

    # Page setup: 21 x 29.7 cm with 2.54 cm margins.
    section = doc.sections[0]
    section.page_height = Cm(29.7)
    section.page_width = Cm(21)
    section.left_margin = Cm(2.54)
    section.right_margin = Cm(2.54)
    section.top_margin = Cm(2.54)
    section.bottom_margin = Cm(2.54)

    # HTML fragments have no <body>; fall back to the document root.
    root = soup.body if soup.body is not None else soup
    # Relative image paths are also tried relative to the HTML file.
    html_dir = os.path.dirname(os.path.abspath(html_file))
    # Font settings forwarded to the paragraph/list helpers.
    fonts = {'default_font': default_font, 'default_size': default_size}

    for element in root.find_all(recursive=False):
        if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            level = int(element.name[1])
            heading = doc.add_heading(element.get_text().strip(), level=level)
            heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

            for run in heading.runs:
                run.font.name = default_font
                run._element.rPr.rFonts.set(qn('w:eastAsia'), default_font)
                if level == 1:
                    run.font.size = Pt(18)
                    run.font.bold = True
                    run.font.color.rgb = RGBColor(74, 63, 107)
                    heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
                elif level == 2:
                    run.font.size = Pt(16)
                    run.font.bold = True
                    run.font.color.rgb = RGBColor(91, 78, 140)
                else:
                    run.font.size = Pt(14)
                    run.font.bold = True

        elif element.name == 'p':
            classes = element.get('class', [])

            if 'center' in classes:
                process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER, **fonts)
            elif 'right' in classes:
                process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT, **fonts)
            elif 'dialogue' in classes:
                para = process_paragraph(element, doc, indent=0.5, line_spacing=1.5, **fonts)
                # Set only italic and colour so inline bold/code formatting
                # produced for the runs is preserved.
                for run in para.runs:
                    run.font.italic = True
                    run.font.color.rgb = RGBColor(107, 91, 122)
            elif 'quote' in classes or 'background' in element.get('style', ''):
                para = process_paragraph(element, doc, **fonts)
                para.paragraph_format.left_indent = Inches(1)
                para.paragraph_format.right_indent = Inches(1)
                # Light grey background stands in for a boxed quote.
                _shade_paragraph(para, 'F5F5F5')
            else:
                process_paragraph(element, doc, indent=0.5, line_spacing=1.5, **fonts)

        elif element.name == 'ul':
            items = element.find_all('li', recursive=False)
            process_list_items(items, doc, ordered=False, **fonts)

        elif element.name == 'ol':
            items = element.find_all('li', recursive=False)
            process_list_items(items, doc, ordered=True, **fonts)

        elif element.name == 'blockquote':
            _process_blockquote(element, doc, level=0)

        elif element.name == 'pre':
            code_text = element.get_text()
            para = doc.add_paragraph()
            para.paragraph_format.left_indent = Inches(0.5)
            run = para.add_run(code_text)
            set_font(run, font_name='Consolas', size=10, color=RGBColor(0, 0, 128))
            _shade_paragraph(para, 'F0F0F0')

        elif element.name == 'hr':
            # An <hr> marked as a page break becomes a real page break.
            classes = element.get('class', [])
            style = element.get('style', '')
            if 'page-break' in classes or 'page-break-after' in style:
                add_page_break(doc)
            else:
                add_horizontal_rule(doc)

        elif element.name == 'table':
            # The table keeps its own default size; only the font is forwarded.
            process_table(element, doc, default_font=default_font)

        elif element.name == 'div':
            classes = element.get('class', [])

            if 'chapter' in classes:
                h2 = element.find('h2')
                if h2:
                    heading = doc.add_heading(h2.get_text().strip(), level=2)
                    for run in heading.runs:
                        run.font.color.rgb = RGBColor(91, 78, 140)
                        run.font.size = Pt(16)
                        run.font.name = default_font
                        run._element.rPr.rFonts.set(qn('w:eastAsia'), default_font)

                for p in element.find_all('p'):
                    first_span = p.find('span', class_='first-line')
                    if first_span:
                        # Emphasised opening: large coloured lead-in, then the rest.
                        para = doc.add_paragraph()
                        first_char_run = para.add_run(first_span.text)
                        set_font(first_char_run, font_name=default_font, size=20, bold=True,
                                 color=RGBColor(102, 126, 234))
                        remaining_text = p.get_text().replace(first_span.text, '', 1)
                        run = para.add_run(remaining_text)
                        set_font(run, font_name=default_font, size=default_size)
                    else:
                        process_paragraph(p, doc, indent=0.5, line_spacing=1.5, **fonts)

            elif 'ending' in classes:
                para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER,
                                         **fonts)
                # Adjust only italic/size/colour to keep inline formatting.
                for run in para.runs:
                    run.font.italic = True
                    run.font.size = Pt(14)
                    run.font.color.rgb = RGBColor(91, 78, 140)

            elif 'page-break' in classes:
                add_page_break(doc)

            elif 'columns' in classes:
                # Continuous section break (no page break) with N columns.
                try:
                    cols_num = int(element.get('data-cols', '2'))
                except (TypeError, ValueError):
                    cols_num = 2
                add_columns_section(doc, cols_num)
                # NOTE(review): nothing switches back to one column after
                # this div, so later content presumably stays in columns.
                for p in element.find_all('p', recursive=False):
                    process_paragraph(p, doc, **fonts)

            elif 'info' in classes or 'warning' in classes or 'success' in classes:
                # Callout box: tinted background plus coloured left border.
                para = doc.add_paragraph()
                para.paragraph_format.right_indent = Inches(0.3)

                if 'info' in classes:
                    bg_color = 'E3F2FD'      # light blue
                    border_color = '2196F3'  # blue
                elif 'warning' in classes:
                    bg_color = 'FFF3CD'      # light yellow
                    border_color = 'FFC107'  # yellow
                else:  # success
                    bg_color = 'D4EDDA'      # light green
                    border_color = '28A745'  # green

                _process_element_to_runs(element, para, default_font, default_size)
                _shade_paragraph(para, bg_color)
                _add_left_border(para, border_color)

            else:
                # Plain div: an inline background plus left border marks a callout.
                style_dict = _parse_style(element.get('style', ''))
                bg_color = style_dict.get('background-color', '')
                border_left = style_dict.get('border-left', '')

                if bg_color and border_left:
                    para = doc.add_paragraph()
                    para.paragraph_format.right_indent = Inches(0.3)

                    _process_element_to_runs(element, para, default_font, default_size)

                    if bg_color.startswith('#'):
                        _shade_paragraph(para, bg_color[1:].upper())

                    # Border colour: first '#...' token of a solid border-left.
                    border_color = ''
                    if 'solid' in border_left:
                        border_color = next(
                            (part[1:] for part in border_left.split() if part.startswith('#')),
                            '')

                    if border_color:
                        _add_left_border(para, border_color.upper())
                else:
                    # Ordinary container: convert its direct paragraphs.
                    for p in element.find_all('p', recursive=False):
                        process_paragraph(p, doc, **fonts)

        elif element.name == 'img':
            src = element.get('src', '')
            alt = element.get('alt', '图片')

            # Try the path as given first, then relative to the HTML file.
            image_path = src
            if src and not os.path.exists(src):
                candidate = os.path.join(html_dir, src)
                if os.path.exists(candidate):
                    image_path = candidate

            if image_path and os.path.exists(image_path):
                try:
                    doc.add_picture(image_path, width=Inches(5))
                    last_para = doc.paragraphs[-1]
                    last_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
                except Exception:
                    # Best effort: an unreadable image becomes a placeholder.
                    para = doc.add_paragraph(f'[图片: {alt}]')
                    para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
            else:
                para = doc.add_paragraph(f'[图片: {alt} - 路径: {src}]')
                para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER

    doc.save(output_file)
    print(f"转换完成!文件已保存为 {output_file}")
796
+
797
if __name__ == '__main__':
    import sys

    # Both paths are required; there is no machine-specific default to fall
    # back to. sys.exit() with a string prints it to stderr and exits with
    # status 1.
    if len(sys.argv) < 3:
        sys.exit(f"usage: {sys.argv[0]} <input.html> <output.docx>")

    convert_html_to_docx(sys.argv[1], sys.argv[2])