@birthday8/doc-mcp 1.0.0 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,13 +4,230 @@ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
4
4
  from docx.enum.section import WD_SECTION
5
5
  from docx.oxml.ns import qn
6
6
  from docx.oxml import OxmlElement
7
+ from docx.enum.shape import WD_INLINE_SHAPE
7
8
  from bs4 import BeautifulSoup
8
9
  import os
9
10
  import re
11
+ import json
12
+
13
# Optional dependency: math2docx is only needed for native Word equations.
# HAS_MATH2DOCX records whether the import succeeded so formula code can
# fall back to plain text when it did not.
try:
    import math2docx

    HAS_MATH2DOCX = True
except ImportError:
    import sys

    HAS_MATH2DOCX = False
    # Diagnostics go to stderr so that stdout carries only real program output.
    # NOTE(review): this package is an MCP tool; if stdout is the protocol /
    # result channel, a stray warning there would corrupt it -- confirm.
    print(
        "Warning: math2docx not installed. Formula support will be limited.",
        file=sys.stderr,
    )

# ==================== Precompiled regular expressions ====================
# Inline-style parsing. Each pattern captures the declaration value (everything
# up to the next ';') in its last group.
TEXT_ALIGN_RE = re.compile(r"text-align:\s*([^;]+)")
LINE_HEIGHT_RE = re.compile(r"line-height:\s*([^;]+)")
# The negative lookbehind keeps "background-color" from matching as "color".
COLOR_RE = re.compile(r"(?<!background-)color:\s*([^;]+)")
BACKGROUND_COLOR_RE = re.compile(r"background-color:\s*([^;]+)")
FONT_FAMILY_RE = re.compile(r"font-family:\s*([^;]+)")
FONT_SIZE_RE = re.compile(r"font-size:\s*([^;]+)")
FONT_WEIGHT_RE = re.compile(r"font-weight:\s*([^;]+)")
FONT_STYLE_RE = re.compile(r"font-style:\s*([^;]+)")
TEXT_DECORATION_RE = re.compile(r"text-decoration:\s*([^;]+)")
# Group 1 is the optional side (top/bottom/left/right, None for the shorthand),
# group 2 is the value. Callers must check group 1 to know which side matched.
MARGIN_RE = re.compile(r"margin(?:-(top|bottom|left|right))?:\s*([^;]+)")
PADDING_RE = re.compile(r"padding(?:-(top|bottom|left|right))?:\s*([^;]+)")

# Formulas: group 1 is a display formula ($$...$$), group 2 an inline one ($...$).
LATEX_FORMULA_RE = re.compile(r"\$\$(.*?)\$\$|\$(.*?)\$")
38
+
39
+
40
+ # ==================== 常量配置 ====================
41
class ConverterConfig:
    """Constant values shared by the converter (fonts, page geometry, colours)."""

    # Fallback font family and font size for body text.
    DEFAULT_FONT = "微软雅黑"
    DEFAULT_SIZE = 12

    # Page geometry in centimetres (29.7 x 21.0 is A4 portrait).
    PAGE_HEIGHT_CM = 29.7
    PAGE_WIDTH_CM = 21.0
    MARGIN_CM = 2.54

    # Heading level -> font size (presumably points, like DEFAULT_SIZE -- confirm
    # against the heading code that consumes this table).
    HEADING_SIZES = {1: 18, 2: 16, 3: 14, 4: 14, 5: 14, 6: 14}

    # Heading level -> text colour.
    HEADING_COLORS = {
        1: RGBColor(74, 63, 107),
        2: RGBColor(91, 78, 140),
        3: RGBColor(107, 91, 149),
        4: RGBColor(122, 104, 161),
        5: RGBColor(137, 117, 173),
        6: RGBColor(152, 130, 185),
    }

    # CSS class name -> text colour applied to elements carrying that class.
    CLASS_COLORS = dict(
        red=RGBColor(255, 0, 0),
        blue=RGBColor(0, 0, 255),
        green=RGBColor(0, 128, 0),
        purple=RGBColor(128, 0, 128),
    )

    # Callout-box palettes: background fill and border, as RRGGBB hex strings.
    INFO_COLORS = dict(bg="E3F2FD", border="2196F3")
    WARNING_COLORS = dict(bg="FFF3CD", border="FFC107")
    SUCCESS_COLORS = dict(bg="D4EDDA", border="28A745")

    # Colour name -> RRGGBB hex string.
    COLOR_MAP = dict(
        red="FF0000",
        green="008000",
        blue="0000FF",
        yellow="FFFF00",
        orange="FFA500",
        purple="800080",
        pink="FFC0CB",
        brown="A52A2A",
        gray="808080",
        black="000000",
        white="FFFFFF",
    )
108
+
109
+
110
def add_image(doc, image_path, width=None, height=None, align="center"):
    """Append an image to the document in its own paragraph.

    When the file is missing or python-docx fails to embed it, a grey
    "[图片: <file name>]" placeholder paragraph is added instead so the
    document still shows where the image belonged.

    Args:
        doc: Word document object to append to.
        image_path: Path of the image file on disk.
        width: Image width in inches (optional).
        height: Image height in inches (optional). When only one of
            width/height is given, only that one is passed to add_picture.
        align: "left", "center" or "right"; any other value aligns left.

    Returns:
        True if the picture was embedded, False if a placeholder was written.
    """
    import sys
    import traceback

    def _add_placeholder():
        # Grey text marker standing in for an image that could not be embedded.
        para = doc.add_paragraph()
        run = para.add_run(f"[图片: {os.path.basename(image_path)}]")
        run.font.color.rgb = RGBColor(150, 150, 150)

    if not os.path.exists(image_path):
        # Warnings share stderr with the traceback below; stdout stays clean.
        print(f"Warning: Image file not found: {image_path}", file=sys.stderr)
        _add_placeholder()
        return False

    try:
        para = doc.add_paragraph()
        alignments = {
            "center": WD_PARAGRAPH_ALIGNMENT.CENTER,
            "right": WD_PARAGRAPH_ALIGNMENT.RIGHT,
        }
        para.alignment = alignments.get(align, WD_PARAGRAPH_ALIGNMENT.LEFT)

        # Only forward the dimensions the caller actually supplied.
        size_kwargs = {}
        if width:
            size_kwargs["width"] = Inches(width)
        if height:
            size_kwargs["height"] = Inches(height)

        para.add_run().add_picture(image_path, **size_kwargs)
        return True

    except Exception as e:
        # Best effort: an unreadable image must not abort the whole conversion.
        print(f"Warning: Failed to add image {image_path}: {e}", file=sys.stderr)
        traceback.print_exc()
        _add_placeholder()
        return False
165
+
166
+
167
def latex_to_unicode_formula(latex_formula):
    """Approximate a LaTeX formula with Unicode math characters.

    Only a small, fixed vocabulary is translated: a handful of operator and
    Greek-letter commands plus the single-digit scripts ^2, ^3, _2 and _3.
    Everything else is returned unchanged.

    Args:
        latex_formula: LaTeX source text, e.g. r"a \\times b^2".

    Returns:
        The text with known commands and scripts replaced. Empty or None
        input is returned as-is.
    """
    if not latex_formula:
        return latex_formula

    # Command name (without the backslash) -> Unicode replacement.
    commands = {
        "cdot": "·",
        "times": "×",
        "div": "÷",
        "neq": "≠",
        "leq": "≤",
        "geq": "≥",
        "pm": "±",
        "sqrt": "√",
        "pi": "π",
        "alpha": "α",
        "beta": "β",
        "gamma": "γ",
        "delta": "δ",
        "theta": "θ",
        "lambda": "λ",
        "mu": "μ",
        "sigma": "σ",
        "phi": "φ",
        "omega": "ω",
        "infty": "∞",
    }
    scripts = {"^2": "²", "^3": "³", "_2": "₂", "_3": "₃"}

    def _swap_command(match):
        # Unknown commands are left exactly as written.
        return commands.get(match.group(1), match.group(0))

    # Match whole command names so that e.g. \pitchfork is not mangled by \pi.
    result = re.sub(r"\\([A-Za-z]+)", _swap_command, latex_formula)

    # Plain substring replacement: the keys are literal text, not regexes.
    for marker, glyph in scripts.items():
        result = result.replace(marker, glyph)

    return result
202
+
203
+
204
def add_native_formula(
    para,
    latex_formula,
    color=None,
    font_name=None,
    font_size=None,
    bold=False,
    italic=False,
):
    """Insert a LaTeX formula into a paragraph as a native Word equation.

    Args:
        para: Paragraph that receives the equation.
        latex_formula: LaTeX source handed to math2docx.add_math.
        color, font_name, font_size, bold, italic: Accepted for signature
            compatibility but currently unused; no styling is applied to
            the equation.

    Returns:
        True if math2docx added the equation; False if math2docx is not
        installed or raised while converting.
    """
    if not HAS_MATH2DOCX:
        return False

    try:
        math2docx.add_math(para, latex_formula)
    except Exception as e:
        import sys
        import traceback

        # Best effort: report on stderr (same stream as the traceback) and let
        # the caller fall back to a plain-text rendering.
        print(f"Warning: Failed to add native formula: {e}", file=sys.stderr)
        traceback.print_exc()
        return False
    return True
226
+
10
227
 
11
228
  def parse_color(color_str):
12
229
  """解析颜色字符串为RGBColor"""
13
- if not color_str or not color_str.startswith('#'):
230
+ if not color_str or not color_str.startswith("#"):
14
231
  return None
15
232
  try:
16
233
  r = int(color_str[1:3], 16)
@@ -20,185 +237,112 @@ def parse_color(color_str):
20
237
  except:
21
238
  return None
22
239
 
23
def set_font(
    run,
    font_name="微软雅黑",
    size=12,
    color=None,
    bold=False,
    italic=False,
    underline=False,
    strike=False,
    highlight_color=None,
):
    """Apply font settings to a run.

    Args:
        run: The run to format.
        font_name: Font family; also written to the run's East Asian font slot.
        size: Font size in points.
        color: Text colour value assigned to font.color.rgb, or None to skip.
        bold: Value assigned to font.bold.
        italic: Value assigned to font.italic.
        underline: When truthy, underline is switched on (never switched off).
        strike: When truthy, strikethrough is switched on (never switched off).
        highlight_color: Name of a highlight colour; unknown names are ignored.
    """
    font = run.font
    font.name = font_name
    # python-docx only sets the Latin font; CJK text needs w:eastAsia as well.
    run._element.rPr.rFonts.set(qn("w:eastAsia"), font_name)
    font.size = Pt(size)
    font.bold = bold
    font.italic = italic

    if color:
        font.color.rgb = color
    if underline:
        font.underline = True
    if strike:
        font.strike = True

    if not highlight_color:
        return

    from docx.enum.text import WD_COLOR_INDEX

    # Highlight name -> Word highlight index.
    named_highlights = {
        "yellow": WD_COLOR_INDEX.YELLOW,
        "green": WD_COLOR_INDEX.BRIGHT_GREEN,
        "cyan": WD_COLOR_INDEX.CYAN,
        "magenta": WD_COLOR_INDEX.MAGENTA,
        "blue": WD_COLOR_INDEX.TURQUOISE,
        "red": WD_COLOR_INDEX.RED,
        "darkblue": WD_COLOR_INDEX.BLUE,
        "orange": WD_COLOR_INDEX.ORANGE,
        "gray": WD_COLOR_INDEX.GRAY_25,
    }
    chosen = named_highlights.get(highlight_color)
    if chosen is not None:
        font.highlight_color = chosen
56
284
 
57
- def process_inline_elements(element, parent_run=None):
58
- """处理内联元素"""
59
- from docx.text.paragraph import Paragraph
60
-
61
- runs = []
62
-
63
- for child in element.children:
64
- if child.name is None: # 文本节点
65
- text = str(child).strip()
66
- if text:
67
- if parent_run:
68
- parent_run.add_text(text)
69
- else:
70
- runs.append({'text': text})
71
- elif child.name == 'strong' or child.name == 'b':
72
- if parent_run:
73
- parent_run.bold = True
74
- process_inline_elements(child, parent_run)
75
- else:
76
- runs.append({'text': child.get_text(), 'bold': True})
77
- elif child.name == 'em' or child.name == 'i':
78
- if parent_run:
79
- parent_run.italic = True
80
- process_inline_elements(child, parent_run)
81
- else:
82
- runs.append({'text': child.get_text(), 'italic': True})
83
- elif child.name == 'u':
84
- if parent_run:
85
- parent_run.underline = True
86
- process_inline_elements(child, parent_run)
87
- else:
88
- runs.append({'text': child.get_text(), 'underline': True})
89
- elif child.name == 's' or child.name == 'del':
90
- if parent_run:
91
- parent_run.strike = True
92
- process_inline_elements(child, parent_run)
93
- else:
94
- runs.append({'text': child.get_text(), 'strike': True})
95
- elif child.name == 'sup':
96
- if parent_run:
97
- parent_run.font.superscript = True
98
- process_inline_elements(child, parent_run)
99
- else:
100
- runs.append({'text': child.get_text(), 'superscript': True})
101
- elif child.name == 'sub':
102
- if parent_run:
103
- parent_run.font.subscript = True
104
- process_inline_elements(child, parent_run)
105
- else:
106
- runs.append({'text': child.get_text(), 'subscript': True})
107
- elif child.name == 'code':
108
- code_text = child.get_text()
109
- if parent_run:
110
- parent_run.font.name = 'Consolas'
111
- parent_run.font.size = Pt(10)
112
- parent_run.add_text(code_text)
113
- else:
114
- runs.append({'text': code_text, 'font': 'Consolas', 'size': 10})
115
- elif child.name == 'a':
116
- link_text = child.get_text()
117
- href = child.get('href', '')
118
- if parent_run:
119
- parent_run.add_text(link_text)
120
- else:
121
- runs.append({'text': link_text, 'link': href})
122
- elif child.name == 'span':
123
- style = child.get('style', '')
124
- color_match = re.search(r'color:\s*([^;]+)', style)
125
- bg_match = re.search(r'background(?:-color)?:\s*([^;]+)', style)
126
-
127
- props = {'text': child.get_text()}
128
- if color_match:
129
- color = parse_color(color_match.group(1).strip())
130
- if color:
131
- props['color'] = color
132
- if bg_match:
133
- bg_color = bg_match.group(1).strip()
134
- if bg_color.startswith('#'):
135
- bg_rgb = parse_color(bg_color)
136
- if bg_rgb:
137
- props['highlight'] = str(bg_rgb)
138
-
139
- if parent_run:
140
- if 'color' in props:
141
- parent_run.font.color.rgb = props['color']
142
- process_inline_elements(child, parent_run)
143
- else:
144
- runs.append(props)
145
- else:
146
- process_inline_elements(child, parent_run)
147
-
148
- return runs
149
285
 
150
286
  def _apply_highlight(run, bg_color):
151
287
  """为run应用背景色/高亮"""
152
288
  from docx.enum.text import WD_COLOR_INDEX
153
-
289
+
154
290
  # 颜色名称到WD_COLOR_INDEX的映射(只使用可用的枚举值)
155
291
  color_map = {
156
- 'yellow': WD_COLOR_INDEX.YELLOW,
157
- 'green': WD_COLOR_INDEX.GREEN,
158
- 'brightgreen': WD_COLOR_INDEX.BRIGHT_GREEN,
159
- 'blue': WD_COLOR_INDEX.BLUE,
160
- 'darkblue': WD_COLOR_INDEX.DARK_BLUE,
161
- 'red': WD_COLOR_INDEX.RED,
162
- 'darkred': WD_COLOR_INDEX.DARK_RED,
163
- 'darkyellow': WD_COLOR_INDEX.DARK_YELLOW,
164
- 'lightgray': WD_COLOR_INDEX.GRAY_25,
165
- 'gray': WD_COLOR_INDEX.GRAY_50,
166
- 'black': WD_COLOR_INDEX.BLACK,
167
- 'white': WD_COLOR_INDEX.WHITE,
168
- 'pink': WD_COLOR_INDEX.PINK,
169
- 'teal': WD_COLOR_INDEX.TEAL,
170
- 'turquoise': WD_COLOR_INDEX.TURQUOISE,
171
- 'violet': WD_COLOR_INDEX.VIOLET,
172
- 'cyan': WD_COLOR_INDEX.TURQUOISE,
173
- 'magenta': WD_COLOR_INDEX.VIOLET,
292
+ "yellow": WD_COLOR_INDEX.YELLOW,
293
+ "green": WD_COLOR_INDEX.GREEN,
294
+ "brightgreen": WD_COLOR_INDEX.BRIGHT_GREEN,
295
+ "blue": WD_COLOR_INDEX.BLUE,
296
+ "darkblue": WD_COLOR_INDEX.DARK_BLUE,
297
+ "red": WD_COLOR_INDEX.RED,
298
+ "darkred": WD_COLOR_INDEX.DARK_RED,
299
+ "darkyellow": WD_COLOR_INDEX.DARK_YELLOW,
300
+ "lightgray": WD_COLOR_INDEX.GRAY_25,
301
+ "gray": WD_COLOR_INDEX.GRAY_50,
302
+ "black": WD_COLOR_INDEX.BLACK,
303
+ "white": WD_COLOR_INDEX.WHITE,
304
+ "pink": WD_COLOR_INDEX.PINK,
305
+ "teal": WD_COLOR_INDEX.TEAL,
306
+ "turquoise": WD_COLOR_INDEX.TURQUOISE,
307
+ "violet": WD_COLOR_INDEX.VIOLET,
308
+ "cyan": WD_COLOR_INDEX.TURQUOISE,
309
+ "magenta": WD_COLOR_INDEX.VIOLET,
174
310
  }
175
-
311
+
176
312
  # 标准化颜色名称
177
313
  bg_lower = bg_color.lower().strip()
178
-
314
+
179
315
  if bg_lower in color_map:
180
316
  # 使用预定义的高亮色
181
317
  run.font.highlight_color = color_map[bg_lower]
182
- elif bg_lower.startswith('#'):
318
+ elif bg_lower.startswith("#"):
183
319
  # 十六进制颜色,直接使用字符串
184
- shading_elm = OxmlElement('w:shd')
185
- shading_elm.set(qn('w:fill'), bg_lower[1:].upper())
320
+ shading_elm = OxmlElement("w:shd")
321
+ shading_elm.set(qn("w:fill"), bg_lower[1:].upper())
186
322
  run._element.get_or_add_rPr().append(shading_elm)
187
323
  else:
188
324
  # 尝试其他常见颜色名称映射到相近的预定义颜色
189
325
  similar_colors = {
190
- 'lightblue': WD_COLOR_INDEX.TURQUOISE,
191
- 'lightyellow': WD_COLOR_INDEX.YELLOW,
192
- 'lightgreen': WD_COLOR_INDEX.BRIGHT_GREEN,
193
- 'orange': WD_COLOR_INDEX.YELLOW, # 橙色映射到黄色
194
- 'purple': WD_COLOR_INDEX.VIOLET,
195
- 'brown': WD_COLOR_INDEX.DARK_YELLOW,
326
+ "lightblue": WD_COLOR_INDEX.TURQUOISE,
327
+ "lightyellow": WD_COLOR_INDEX.YELLOW,
328
+ "lightgreen": WD_COLOR_INDEX.BRIGHT_GREEN,
329
+ "orange": WD_COLOR_INDEX.YELLOW, # 橙色映射到黄色
330
+ "purple": WD_COLOR_INDEX.VIOLET,
331
+ "brown": WD_COLOR_INDEX.DARK_YELLOW,
196
332
  }
197
333
  if bg_lower in similar_colors:
198
334
  run.font.highlight_color = similar_colors[bg_lower]
199
335
 
200
- def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=12,
201
- indent=None, align=None, line_spacing=None):
336
+
337
+ def process_paragraph(
338
+ paragraph,
339
+ doc,
340
+ default_font="微软雅黑",
341
+ default_size=12,
342
+ indent=None,
343
+ align=None,
344
+ line_spacing=None,
345
+ ):
202
346
  """处理段落及其内联元素"""
203
347
  para = doc.add_paragraph()
204
348
 
@@ -211,7 +355,7 @@ def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=
211
355
  para.paragraph_format.first_line_indent = Inches(indent)
212
356
  else:
213
357
  # 从data-indent属性读取缩进(单位:em)
214
- data_indent = paragraph.get('data-indent', '')
358
+ data_indent = paragraph.get("data-indent", "")
215
359
  if data_indent:
216
360
  try:
217
361
  em_count = float(data_indent)
@@ -224,247 +368,551 @@ def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=
224
368
  if line_spacing:
225
369
  para.paragraph_format.line_spacing = line_spacing
226
370
 
371
+ # 解析段落的样式(包括行距和段距)
372
+ style = paragraph.get("style", "")
373
+
374
+ # 解析对齐方式
375
+ text_align_match = TEXT_ALIGN_RE.search(style)
376
+ if text_align_match:
377
+ align_str = text_align_match.group(1).strip().lower()
378
+ if align_str == "left":
379
+ para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
380
+ elif align_str == "center":
381
+ para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
382
+ elif align_str == "right":
383
+ para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
384
+ elif align_str == "justify":
385
+ para.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
386
+
387
+ # 解析行距
388
+ line_height_match = LINE_HEIGHT_RE.search(style)
389
+ if line_height_match:
390
+ line_height_str = line_height_match.group(1).strip()
391
+ if line_height_str.endswith("pt"):
392
+ # 固定行距
393
+ para.paragraph_format.line_spacing = float(line_height_str[:-2])
394
+ elif line_height_str.endswith("px"):
395
+ # px转换为pt
396
+ para.paragraph_format.line_spacing = float(line_height_str[:-2]) * 0.75
397
+ elif line_height_str.endswith("em"):
398
+ # em转换为pt(基于段落字号)
399
+ para.paragraph_format.line_spacing = para_size * float(line_height_str[:-2])
400
+ else:
401
+ # 尝试作为倍数处理
402
+ try:
403
+ line_spacing_value = float(line_height_str)
404
+ para.paragraph_format.line_spacing = line_spacing_value
405
+ except:
406
+ pass
407
+
408
+ # 解析段后距
409
+ margin_bottom_match = MARGIN_RE.search(style)
410
+ if margin_bottom_match:
411
+ margin_bottom_str = margin_bottom_match.group(2).strip()
412
+ if margin_bottom_str.endswith("pt"):
413
+ para.paragraph_format.space_after = Pt(float(margin_bottom_str[:-2]))
414
+ elif margin_bottom_str.endswith("px"):
415
+ # px转换为pt
416
+ para.paragraph_format.space_after = Pt(float(margin_bottom_str[:-2]) * 0.75)
417
+ elif margin_bottom_str.endswith("em"):
418
+ # em转换为pt(基于段落字号)
419
+ para.paragraph_format.space_after = Pt(
420
+ para_size * float(margin_bottom_str[:-2])
421
+ )
422
+ else:
423
+ # 尝试作为pt处理
424
+ try:
425
+ para.paragraph_format.space_after = Pt(float(margin_bottom_str))
426
+ except:
427
+ pass
428
+
429
+ # 解析段前距
430
+ margin_top_match = MARGIN_RE.search(style)
431
+ if margin_top_match:
432
+ margin_top_str = margin_top_match.group(2).strip()
433
+ if margin_top_str.endswith("pt"):
434
+ para.paragraph_format.space_before = Pt(float(margin_top_str[:-2]))
435
+ elif margin_top_str.endswith("px"):
436
+ # px转换为pt
437
+ para.paragraph_format.space_before = Pt(float(margin_top_str[:-2]) * 0.75)
438
+ elif margin_top_str.endswith("em"):
439
+ # em转换为pt(基于段落字号)
440
+ para.paragraph_format.space_before = Pt(
441
+ para_size * float(margin_top_str[:-2])
442
+ )
443
+ else:
444
+ # 尝试作为pt处理
445
+ try:
446
+ para.paragraph_format.space_before = Pt(float(margin_top_str))
447
+ except:
448
+ pass
449
+
450
+ # 解析段落的字号
451
+ para_size = default_size
452
+ style = paragraph.get("style", "")
453
+ size_match = FONT_SIZE_RE.search(style)
454
+ if size_match:
455
+ size_str = size_match.group(1).strip()
456
+ # 处理不同单位:pt, px, em等
457
+ if size_str.endswith("pt"):
458
+ para_size = float(size_str[:-2])
459
+ elif size_str.endswith("px"):
460
+ # px转换为pt (1px ≈ 0.75pt)
461
+ para_size = float(size_str[:-2]) * 0.75
462
+ elif size_str.endswith("em"):
463
+ # em转换为pt (假设基础字号为12pt)
464
+ para_size = float(size_str[:-2]) * 12
465
+ else:
466
+ # 尝试直接解析为数字
467
+ try:
468
+ para_size = float(size_str)
469
+ except:
470
+ pass
471
+
227
472
  # 处理段落内容 - 递归处理所有子元素
228
- _process_element_to_runs(paragraph, para, default_font, default_size)
473
+ _process_element_to_runs(paragraph, para, default_font, para_size)
229
474
 
230
475
  return para
231
476
 
232
# Inline tags that switch on one boolean text attribute for their whole subtree.
_INLINE_FLAG_TAGS = {
    "strong": "bold",
    "b": "bold",
    "em": "italic",
    "i": "italic",
    "u": "underline",
    "s": "strike",
    "del": "strike",
}


def _font_size_to_pt(size_str):
    """Convert a CSS font-size value to points, or return None if unparseable.

    pt is taken as-is, px is scaled by 0.75, em and rem are scaled against a
    12pt base, and a bare number is read as points.
    """
    text = size_str.strip().lower()
    try:
        if text.endswith("pt"):
            return float(text[:-2])
        if text.endswith("px"):
            return float(text[:-2]) * 0.75
        # Check "rem" first: it also ends with "em".
        if text.endswith("rem"):
            return float(text[:-3]) * 12
        if text.endswith("em"):
            return float(text[:-2]) * 12
        return float(text)
    except ValueError:
        return None


def _process_element_to_runs(
    element,
    para,
    default_font="微软雅黑",
    default_size=12,
    bold=False,
    italic=False,
    underline=False,
    strike=False,
    color=None,
    bg_color=None,
    font_name=None,
    font_size=None,
):
    """Walk an element's children and append one run per formatted text piece.

    Formatting accumulates on the way down: each nested inline tag adds its
    own attribute to what it inherited, and every text node becomes a run
    carrying the combined formatting.

    Args:
        element: Parsed HTML element whose children are rendered.
        para: Paragraph that receives the runs.
        default_font: Font used when no font_name is inherited.
        default_size: Size in points used when no font_size is inherited.
        bold, italic, underline, strike: Inherited text attributes.
        color: Inherited text colour, or None.
        bg_color: Inherited background colour string, or None.
        font_name: Inherited font family override, or None.
        font_size: Inherited font size override in points, or None.
    """
    current_font = font_name or default_font
    current_size = font_size or default_size

    # Everything a nested element inherits. Font name and size are included so
    # that e.g. bold text inside a resized span keeps the span's size.
    inherited = {
        "bold": bold,
        "italic": italic,
        "underline": underline,
        "strike": strike,
        "color": color,
        "bg_color": bg_color,
        "font_name": current_font,
        "font_size": current_size,
    }

    def _recurse(node, **overrides):
        _process_element_to_runs(
            node, para, default_font, default_size, **{**inherited, **overrides}
        )

    for child in element.children:
        tag = child.name

        if tag is None:  # text node
            # Collapse every whitespace run to one space and trim the ends.
            text = " ".join(str(child).split())
            if text:
                run = para.add_run(text)
                set_font(
                    run,
                    font_name=current_font,
                    size=current_size,
                    bold=bold,
                    italic=italic,
                    underline=underline,
                    strike=strike,
                )
                if color:
                    run.font.color.rgb = color
                if bg_color:
                    _apply_highlight(run, bg_color)

        elif tag in ("math", "latex"):
            # Formulas do not inherit the surrounding text formatting.
            latex_formula = child.get_text().strip()
            if latex_formula and HAS_MATH2DOCX:
                if add_native_formula(para, latex_formula):
                    continue
            # Fallback: show the LaTeX source as code-styled text.
            run = para.add_run(latex_formula)
            set_font(run, font_name="Consolas", size=10, color=RGBColor(0, 0, 128))

        elif tag in _INLINE_FLAG_TAGS:
            _recurse(child, **{_INLINE_FLAG_TAGS[tag]: True})

        elif tag in ("sup", "sub"):
            # Render the subtree normally, then raise/lower exactly the runs it
            # produced. Counting runs before and after covers nested tags too.
            first_new_run = len(para.runs)
            _recurse(child)
            for run in para.runs[first_new_run:]:
                if tag == "sup":
                    run.font.superscript = True
                else:
                    run.font.subscript = True

        elif tag == "code":
            run = para.add_run(child.get_text())
            set_font(run, font_name="Consolas", size=10)

        elif tag == "a":
            # Rendered as blue underlined text; no real hyperlink is created.
            run = para.add_run(child.get_text())
            set_font(run, font_name=current_font, size=current_size)
            run.font.underline = True
            run.font.color.rgb = RGBColor(0, 0, 255)

        elif tag == "span":
            style = child.get("style", "")
            class_set = set(child.get("class", []))

            span_color = color
            span_bg = bg_color
            span_font = current_font
            span_size = current_size

            color_match = COLOR_RE.search(style)
            if color_match:
                parsed_color = parse_color(color_match.group(1).strip())
                if parsed_color:
                    span_color = parsed_color

            font_match = FONT_FAMILY_RE.search(style)
            if font_match:
                # Drop surrounding quotes from the family name.
                font_family = font_match.group(1).strip().strip("'\"").strip()
                if font_family:
                    span_font = font_family

            size_match = FONT_SIZE_RE.search(style)
            if size_match:
                parsed_size = _font_size_to_pt(size_match.group(1))
                # Unparseable or non-positive sizes keep the inherited size.
                if parsed_size is not None and parsed_size > 0:
                    span_size = parsed_size

            # A colour class takes precedence over an inline colour; the first
            # matching entry of CLASS_COLORS wins.
            for class_name, class_color in ConverterConfig.CLASS_COLORS.items():
                if class_name in class_set:
                    span_color = class_color
                    break

            bg_match = BACKGROUND_COLOR_RE.search(style)
            if bg_match:
                span_bg = bg_match.group(1).strip()
            if "highlight" in class_set:
                span_bg = "yellow"

            _recurse(
                child,
                color=span_color,
                bg_color=span_bg,
                font_name=span_font,
                font_size=span_size,
            )

        else:
            # Any other tag is a transparent container.
            _recurse(child)
348
767
 
349
def process_list_items(
    items, doc, ordered=False, default_font="微软雅黑", default_size=12, level=0
):
    """Render list items as list paragraphs, recursing into nested lists.

    Args:
        items: The <li> elements of one list.
        doc: Document that receives the paragraphs.
        ordered: True uses the "List Number" style, False "List Bullet".
        default_font: Font for the item text.
        default_size: Font size in points for the item text.
        level: Nesting depth, 0 for a top-level list; each level indents the
            paragraph by a further 0.25 inch.
    """
    style_name = "List Number" if ordered else "List Bullet"

    for item in items:
        para = doc.add_paragraph(style=style_name)
        para.paragraph_format.left_indent = Inches(0.25 * (level + 1))

        # Render the item's own content from a re-parsed copy so that nested
        # lists can be stripped without mutating the caller's tree.
        item_copy = BeautifulSoup(str(item), "html.parser").find("li")
        if item_copy:
            for nested in item_copy.find_all(["ul", "ol"], recursive=False):
                nested.decompose()
            if item_copy.get_text().strip():
                _process_element_to_runs(item_copy, para, default_font, default_size)

        # Render every directly nested list, in document order.
        for nested_list in item.find_all(["ul", "ol"], recursive=False):
            process_list_items(
                nested_list.find_all("li", recursive=False),
                doc,
                ordered=nested_list.name == "ol",
                default_font=default_font,
                default_size=default_size,
                level=level + 1,
            )
820
+
386
821
 
387
822
  def _parse_style(style_str):
388
823
  """解析style字符串为字典"""
389
824
  styles = {}
390
825
  if not style_str:
391
826
  return styles
392
- for item in style_str.split(';'):
393
- if ':' in item:
394
- key, value = item.split(':', 1)
827
+ for item in style_str.split(";"):
828
+ if ":" in item:
829
+ key, value = item.split(":", 1)
395
830
  styles[key.strip()] = value.strip()
396
831
  return styles
397
832
 
833
+
398
834
  def _apply_cell_style(cell_elem, style_dict):
399
835
  """应用单元格样式"""
400
836
  # 背景色
401
- bg_color = style_dict.get('background-color', '')
837
+ bg_color = style_dict.get("background-color", "")
402
838
  if bg_color:
403
839
  # 处理颜色值
404
- if bg_color.startswith('#'):
405
- shading_elm = OxmlElement('w:shd')
406
- shading_elm.set(qn('w:fill'), bg_color[1:].upper())
840
+ if bg_color.startswith("#"):
841
+ shading_elm = OxmlElement("w:shd")
842
+ shading_elm.set(qn("w:fill"), bg_color[1:].upper())
407
843
  cell_elem._element.get_or_add_tcPr().append(shading_elm)
408
-
844
+
409
845
  # 文字颜色
410
- color = style_dict.get('color', '')
846
+ color = style_dict.get("color", "")
411
847
  if color:
412
- rgb = parse_color(color) if color.startswith('#') else None
848
+ rgb = parse_color(color) if color.startswith("#") else None
413
849
  if rgb:
414
850
  for run in cell_elem.paragraphs[0].runs:
415
851
  run.font.color.rgb = rgb
416
852
 
417
def process_table(table, doc, default_font="微软雅黑", default_size=11):
    """Render an HTML ``<table>`` as a Word table, honouring inline styles.

    Args:
        table: The ``<table>`` element to convert.
        doc: Target document; the new table is appended to it.
        default_font: Font name forwarded to ``_process_element_to_runs``.
        default_size: Font size forwarded to ``_process_element_to_runs``.

    Returns:
        None. Nothing is added when the table has no rows or no cells.

    Rows and cells that belong to a nested table are not laid out in the
    outer grid. ``colspan`` / ``rowspan`` attributes are not read.
    """
    # find_all is recursive (rows may sit under thead/tbody), so keep only
    # the rows whose nearest enclosing table is this one.
    rows = [row for row in table.find_all("tr") if row.find_parent("table") is table]
    if not rows:
        return

    # Direct children only, so cells of nested tables are not counted.
    cells_per_row = [row.find_all(["td", "th"], recursive=False) for row in rows]
    cols = max(len(cells) for cells in cells_per_row)
    if cols == 0:
        return

    word_table = doc.add_table(rows=len(rows), cols=cols)
    word_table.style = "Table Grid"

    alignments = {
        "center": WD_PARAGRAPH_ALIGNMENT.CENTER,
        "left": WD_PARAGRAPH_ALIGNMENT.LEFT,
        "right": WD_PARAGRAPH_ALIGNMENT.RIGHT,
    }

    for row, cells, word_row in zip(rows, cells_per_row, word_table.rows):
        # Row-level background, used for cells that do not set their own.
        row_bg = _parse_style(row.get("style", "")).get("background-color", "")

        # Resolve the row's cells once instead of once per column.
        word_cells = word_row.cells
        for cell, cell_elem in zip(cells, word_cells):
            cell_style = _parse_style(cell.get("style", ""))
            paragraph = cell_elem.paragraphs[0]

            # Empty the default paragraph, then emit the cell content as
            # formatted runs so inline formatting is kept.
            paragraph.clear()
            _process_element_to_runs(
                cell,
                paragraph,
                default_font=default_font,
                default_size=default_size,
            )

            # Header cells are bold.
            if cell.name == "th":
                for run in paragraph.runs:
                    run.font.bold = True

            # Cell alignment defaults to centred; unknown values leave the
            # paragraph alignment untouched.
            align = cell_style.get("text-align", "center").strip().lower()
            if align in alignments:
                paragraph.alignment = alignments[align]

            # Cell background and text colour.
            _apply_cell_style(cell_elem, cell_style)

            # Fall back to the row background when the cell has none.
            if row_bg and not cell_style.get("background-color"):
                _apply_cell_style(cell_elem, {"background-color": row_bg})
467
914
 
915
+
468
916
  def set_section_columns(section, cols_num=2, space=720):
469
917
  """设置节的多栏布局
470
918
 
@@ -474,11 +922,12 @@ def set_section_columns(section, cols_num=2, space=720):
474
922
  space: 栏间距(单位:twips,1英寸=1440twips),默认720(0.5英寸)
475
923
  """
476
924
  sectPr = section._sectPr
477
- cols = OxmlElement('w:cols')
478
- cols.set(qn('w:num'), str(cols_num))
479
- cols.set(qn('w:space'), str(space))
925
+ cols = OxmlElement("w:cols")
926
+ cols.set(qn("w:num"), str(cols_num))
927
+ cols.set(qn("w:space"), str(space))
480
928
  sectPr.append(cols)
481
929
 
930
+
482
931
  def add_columns_section(doc, cols_num=2, space=720):
483
932
  """添加连续分节符并设置多栏布局(不换页)
484
933
 
@@ -496,312 +945,655 @@ def add_columns_section(doc, cols_num=2, space=720):
496
945
  set_section_columns(section, cols_num, space)
497
946
  return section
498
947
 
948
+
499
949
  def _process_blockquote(blockquote_elem, doc, level=0):
500
950
  """递归处理嵌套引用"""
501
- # 获取当前引用的直接文本内容(不包括嵌套引用)
502
- direct_text = ''
951
+ # 检查是否有直接内容(不包括嵌套引用)
952
+ has_content = False
503
953
  for child in blockquote_elem.children:
504
954
  if child.name is None: # 文本节点
505
- direct_text += str(child)
506
- elif child.name != 'blockquote': # 其他非引用标签
507
- direct_text += child.get_text()
508
-
509
- direct_text = ' '.join(direct_text.split())
955
+ if str(child).strip():
956
+ has_content = True
957
+ break
958
+ elif child.name != "blockquote" and child.get_text().strip():
959
+ has_content = True
960
+ break
510
961
 
511
- # 如果有直接文本,创建段落
512
- if direct_text:
962
+ # 如果有直接内容,创建段落
963
+ if has_content:
513
964
  para = doc.add_paragraph()
514
- run = para.add_run(direct_text)
515
- set_font(run, italic=True, color=RGBColor(100, 100, 100))
516
965
  # 根据层级设置缩进
517
966
  para.paragraph_format.left_indent = Inches(0.3 * level)
518
967
  para.paragraph_format.right_indent = Inches(0.5)
519
968
  # 添加灰色左边框
520
- pBdr = OxmlElement('w:pBdr')
521
- left_border = OxmlElement('w:left')
522
- left_border.set(qn('w:val'), 'single')
523
- left_border.set(qn('w:sz'), '18')
524
- left_border.set(qn('w:color'), 'CCCCCC')
969
+ pBdr = OxmlElement("w:pBdr")
970
+ left_border = OxmlElement("w:left")
971
+ left_border.set(qn("w:val"), "single")
972
+ left_border.set(qn("w:sz"), "18")
973
+ left_border.set(qn("w:color"), "CCCCCC")
525
974
  pBdr.append(left_border)
526
975
  para.paragraph_format._element.get_or_add_pPr().append(pBdr)
527
976
 
977
+ # 创建一个临时元素来包含所有非blockquote的子元素
978
+ from bs4 import BeautifulSoup
979
+
980
+ temp_soup = BeautifulSoup("<div></div>", "html.parser")
981
+ temp_div = temp_soup.div
982
+
983
+ # 复制所有非blockquote的子元素
984
+ for child in blockquote_elem.children:
985
+ if child.name != "blockquote":
986
+ temp_div.append(
987
+ child.__copy__() if hasattr(child, "__copy__") else child
988
+ )
989
+
990
+ # 使用 _process_element_to_runs 处理格式化内容
991
+ # 注意:引用内容默认斜体和灰色
992
+ _process_element_to_runs(
993
+ temp_div,
994
+ para,
995
+ default_font="微软雅黑",
996
+ default_size=12,
997
+ italic=True,
998
+ color=RGBColor(100, 100, 100),
999
+ )
1000
+
528
1001
  # 递归处理嵌套引用
529
- nested_quotes = blockquote_elem.find_all('blockquote', recursive=False)
1002
+ nested_quotes = blockquote_elem.find_all("blockquote", recursive=False)
530
1003
  for nested in nested_quotes:
531
1004
  _process_blockquote(nested, doc, level + 1)
532
1005
 
1006
+
533
1007
def add_page_break(doc):
    """Append a page break to ``doc``; returns nothing."""
    doc.add_page_break()
    return None
536
1010
 
1011
+
537
1012
def add_horizontal_rule(doc):
    """Append a centred paragraph of 50 light-grey underscores as a rule."""
    rule = doc.add_paragraph()
    rule.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    underscores = rule.add_run("_" * 50)
    underscores.font.color.rgb = RGBColor(200, 200, 200)
543
1018
 
544
- def convert_html_to_docx(html_file, output_file, default_font='微软雅黑', default_size=12):
1019
+
1020
+ # ==================== 辅助函数 ====================
1021
def _init_document(default_font, default_size):
    """Create an empty document with the default font and page setup applied.

    Args:
        default_font: Font name written to the ``Normal`` style, for both
            the regular and the East Asian font slots.
        default_size: Font size of the ``Normal`` style, in points.

    Returns:
        The new document. Page size and the four margins of its first
        section come from ``ConverterConfig``.
    """
    doc = Document()

    # Default font lives on the "Normal" style.
    normal = doc.styles["Normal"]
    normal.font.name = default_font
    normal._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
    normal.font.size = Pt(default_size)

    # Page setup of the first section.
    first_section = doc.sections[0]
    first_section.page_height = Cm(ConverterConfig.PAGE_HEIGHT_CM)
    first_section.page_width = Cm(ConverterConfig.PAGE_WIDTH_CM)
    for side in ("left_margin", "right_margin", "top_margin", "bottom_margin"):
        setattr(first_section, side, Cm(ConverterConfig.MARGIN_CM))

    return doc
1038
+
1039
+
1040
+ def _read_html_file(html_file):
1041
+ """读取HTML文件"""
1042
+ with open(html_file, "r", encoding="utf-8") as f:
1043
+ return f.read()
1044
+
1045
+
1046
def _parse_html(html_content):
    """Parse HTML text with the ``html.parser`` backend and return the soup."""
    soup = BeautifulSoup(html_content, "html.parser")
    return soup
1049
+
1050
+
1051
def _process_heading(element, doc, default_font):
    """Append an ``<h1>``..``<h6>`` element to the document as a heading.

    Args:
        element: Heading element; the digit in its tag name is the level.
        doc: Target document.
        default_font: Font name applied to every run of the heading.

    Level-1 headings are centred, all others left-aligned. Size and colour
    come from ``ConverterConfig.HEADING_SIZES`` / ``HEADING_COLORS``, with
    14 pt and RGB(107, 91, 149) as fallbacks for unlisted levels.
    """
    level = int(element.name[1])
    heading = doc.add_heading(element.get_text().strip(), level=level)

    # Alignment depends only on the level, so set it once, outside the run
    # loop; this also covers a heading that ends up with no runs.
    if level == 1:
        heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    else:
        heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT

    # Heading run style; the lookups are the same for every run.
    size = Pt(ConverterConfig.HEADING_SIZES.get(level, 14))
    color = ConverterConfig.HEADING_COLORS.get(level, RGBColor(107, 91, 149))
    for run in heading.runs:
        run.font.name = default_font
        run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
        run.font.size = size
        run.font.bold = True
        run.font.color.rgb = color
1068
+
1069
+
1070
def _process_paragraph_element(element, doc, default_font, default_size):
    """Append a top-level ``<p>`` to the document, styled by its CSS classes.

    Args:
        element: The ``<p>`` element.
        doc: Target document.
        default_font: Font name forwarded to ``process_paragraph``.
        default_size: Font size forwarded to ``process_paragraph``.

    Class handling, first match wins:
        * ``center`` / ``right``: aligned paragraph.
        * ``dialogue``: indented, 1.5 line spacing, italic coloured runs.
        * ``quote``, or a ``style`` attribute containing ``background``:
          one-inch side indents on a light grey (F5F5F5) background.
        * anything else: indented body paragraph with 1.5 line spacing.
    """
    class_set = set(element.get("class", []))
    # The caller's font settings were previously accepted but not used.
    font_kwargs = {"default_font": default_font, "default_size": default_size}

    if "center" in class_set:
        process_paragraph(
            element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER, **font_kwargs
        )
    elif "right" in class_set:
        process_paragraph(
            element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT, **font_kwargs
        )
    elif "dialogue" in class_set:
        para = process_paragraph(
            element, doc, indent=0.5, line_spacing=1.5, **font_kwargs
        )
        for run in para.runs:
            set_font(run, italic=True, color=RGBColor(107, 91, 122))
    elif "quote" in class_set or "background" in element.get("style", ""):
        para = process_paragraph(element, doc, **font_kwargs)
        para.paragraph_format.left_indent = Inches(1)
        para.paragraph_format.right_indent = Inches(1)

        # Drop any paragraph borders. They live under w:pPr, so the path
        # has to go through it to find them.
        p_elm = para.paragraph_format._element
        for border in p_elm.xpath("./w:pPr/w:pBdr"):
            border.getparent().remove(border)

        # Light grey shading stands in for a framed box.
        shading_elm = OxmlElement("w:shd")
        shading_elm.set(qn("w:fill"), "F5F5F5")
        p_elm.get_or_add_pPr().append(shading_elm)
    else:
        process_paragraph(element, doc, indent=0.5, line_spacing=1.5, **font_kwargs)
1097
+
1098
+
1099
def _process_list_element(element, doc, ordered):
    """Render the direct ``<li>`` children of a list element.

    Args:
        element: The ``<ul>`` / ``<ol>`` element.
        doc: Target document.
        ordered: Passed through to ``process_list_items`` as ``ordered``.
    """
    process_list_items(element.find_all("li", recursive=False), doc, ordered=ordered)
1103
+
1104
+
1105
+ def _process_image_element(element, doc, html_file):
1106
+ """处理图片元素"""
1107
+ src = element.get("src", "")
1108
+ if src:
1109
+ # 解析宽度、高度和对齐方式
1110
+ width = element.get("width")
1111
+ height = element.get("height")
1112
+ style = element.get("style", "")
1113
+ align = element.get("align", "center")
1114
+
1115
+ # 从 style 中提取对齐方式
1116
+ if "text-align: right" in style or "float: right" in style:
1117
+ align = "right"
1118
+ elif "text-align: left" in style or "float: left" in style:
1119
+ align = "left"
1120
+ elif "text-align: center" in style:
1121
+ align = "center"
1122
+
1123
+ # 处理宽度高度(支持像素转英寸)
1124
+ width_inch = None
1125
+ height_inch = None
1126
+ if width:
1127
+ try:
1128
+ width_px = float(width)
1129
+ width_inch = width_px / 96 # 假设96 DPI
1130
+ except:
1131
+ pass
1132
+ if height:
1133
+ try:
1134
+ height_px = float(height)
1135
+ height_inch = height_px / 96
1136
+ except:
1137
+ pass
1138
+
1139
+ # 处理相对路径(相对于HTML文件)
1140
+ html_dir = os.path.dirname(html_file)
1141
+ image_path = os.path.join(html_dir, src) if not os.path.isabs(src) else src
1142
+
1143
+ # 添加图片
1144
+ add_image(doc, image_path, width_inch, height_inch, align)
1145
+
1146
+
1147
+ def _process_div_element(element, doc, default_font, default_size):
1148
+ """处理div元素"""
1149
+ classes = element.get("class", [])
1150
+ class_set = set(classes)
1151
+
1152
+ if "chapter" in class_set:
1153
+ # 处理章节
1154
+ h2 = element.find("h2")
1155
+ if h2:
1156
+ heading = doc.add_heading(h2.get_text().strip(), level=2)
1157
+ for run in heading.runs:
1158
+ run.font.color.rgb = RGBColor(91, 78, 140)
1159
+ run.font.size = Pt(16)
1160
+ run.font.name = default_font
1161
+ run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
1162
+
1163
+ paragraphs = element.find_all("p")
1164
+ for p in paragraphs:
1165
+ first_span = p.find("span", class_="first-line")
1166
+ if first_span:
1167
+ # 处理首字下沉效果
1168
+ para = doc.add_paragraph()
1169
+ para.paragraph_format.first_line_indent = Inches(0)
1170
+
1171
+ first_char_run = para.add_run(first_span.text)
1172
+ set_font(
1173
+ first_char_run, size=20, bold=True, color=RGBColor(102, 126, 234)
1174
+ )
1175
+ remaining_text = p.get_text().replace(first_span.text, "", 1)
1176
+ run = para.add_run(remaining_text)
1177
+ set_font(run)
1178
+ else:
1179
+ process_paragraph(p, doc, indent=0.5, line_spacing=1.5)
1180
+
1181
+ elif "ending" in class_set:
1182
+ para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER)
1183
+ for run in para.runs:
1184
+ set_font(run, italic=True, size=14, color=RGBColor(91, 78, 140))
1185
+
1186
+ elif "page-break" in class_set:
1187
+ add_page_break(doc)
1188
+
1189
+ elif "columns" in class_set:
1190
+ # 处理多栏布局(使用连续分节符,不换页)
1191
+ cols_num = int(element.get("data-cols", "2"))
1192
+ # 添加连续分节符并设置栏数
1193
+ add_columns_section(doc, cols_num)
1194
+ # 处理其中的段落
1195
+ for p in element.find_all("p", recursive=False):
1196
+ process_paragraph(
1197
+ p, doc, default_font=default_font, default_size=default_size
1198
+ )
1199
+
1200
+ elif "info" in class_set or "warning" in class_set or "success" in class_set:
1201
+ # 处理提示框
1202
+ para = doc.add_paragraph()
1203
+ para.paragraph_format.right_indent = Inches(0.3)
1204
+
1205
+ # 设置背景色和左边框颜色
1206
+ if "info" in class_set:
1207
+ bg_color = ConverterConfig.INFO_COLORS["bg"]
1208
+ border_color = ConverterConfig.INFO_COLORS["border"]
1209
+ elif "warning" in class_set:
1210
+ bg_color = ConverterConfig.WARNING_COLORS["bg"]
1211
+ border_color = ConverterConfig.WARNING_COLORS["border"]
1212
+ else: # success
1213
+ bg_color = ConverterConfig.SUCCESS_COLORS["bg"]
1214
+ border_color = ConverterConfig.SUCCESS_COLORS["border"]
1215
+
1216
+ # 处理内容
1217
+ _process_element_to_runs(element, para, default_font, default_size)
1218
+
1219
+ # 添加背景色
1220
+ shading_elm = OxmlElement("w:shd")
1221
+ shading_elm.set(qn("w:fill"), bg_color)
1222
+ para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
1223
+
1224
+ # 添加左边框
1225
+ pPr = para.paragraph_format._element.get_or_add_pPr()
1226
+ pBdr = OxmlElement("w:pBdr")
1227
+ left = OxmlElement("w:left")
1228
+ left.set(qn("w:val"), "single")
1229
+ left.set(qn("w:sz"), "4")
1230
+ left.set(qn("w:color"), border_color)
1231
+ pBdr.append(left)
1232
+ pPr.append(pBdr)
1233
+
1234
+ para.paragraph_format.space_after = Pt(6)
1235
+
1236
+
1237
def _process_horizontal_rule_element(element, doc):
    """Render an ``<hr>`` as a page break or as a visual rule.

    A ``page-break`` class, or ``page-break-after`` anywhere in the
    ``style`` attribute, yields a page break; otherwise a rule is drawn.
    """
    class_set = set(element.get("class", []))
    style = element.get("style", "")
    is_page_break = "page-break" in class_set or "page-break-after" in style

    render = add_page_break if is_page_break else add_horizontal_rule
    render(doc)
1246
+
1247
+
1248
+ def _process_elements(soup, doc, html_file, default_font, default_size):
1249
+ """处理所有HTML元素"""
1250
+ for element in soup.body.find_all(recursive=False):
1251
+ if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
1252
+ _process_heading(element, doc, default_font)
1253
+ elif element.name == "p":
1254
+ _process_paragraph_element(element, doc, default_font, default_size)
1255
+ elif element.name == "ul":
1256
+ _process_list_element(element, doc, ordered=False)
1257
+ elif element.name == "ol":
1258
+ _process_list_element(element, doc, ordered=True)
1259
+ elif element.name == "table":
1260
+ process_table(element, doc)
1261
+ elif element.name == "img":
1262
+ _process_image_element(element, doc, html_file)
1263
+ elif element.name == "div":
1264
+ _process_div_element(element, doc, default_font, default_size)
1265
+ elif element.name == "hr":
1266
+ _process_horizontal_rule_element(element, doc)
1267
+
1268
+
1269
def _set_paragraph_fill(para, fill):
    """Append a ``w:shd`` element with the given hex fill to the paragraph properties."""
    shading_elm = OxmlElement("w:shd")
    shading_elm.set(qn("w:fill"), fill)
    para.paragraph_format._element.get_or_add_pPr().append(shading_elm)


def _set_paragraph_left_border(para, color, size="24"):
    """Append a single-line left border of the given hex colour to the paragraph properties."""
    pBdr = OxmlElement("w:pBdr")
    left_border = OxmlElement("w:left")
    left_border.set(qn("w:val"), "single")
    left_border.set(qn("w:sz"), size)  # border thickness
    left_border.set(qn("w:color"), color)
    pBdr.append(left_border)
    para.paragraph_format._element.get_or_add_pPr().append(pBdr)


def _convert_top_level_div(element, doc, default_font, default_size):
    """Render one top-level ``<div>``: structural classes, callouts, or plain content."""
    class_set = set(element.get("class", []))

    if class_set & {"chapter", "ending", "page-break", "columns"}:
        # Structural divs are handled by the shared helper, which checks
        # these classes in the same order. It additionally pins the
        # first-line indent of drop-cap paragraphs to zero.
        _process_div_element(element, doc, default_font, default_size)
        return

    if class_set & {"info", "warning", "success"}:
        # Class-based callout box.
        para = doc.add_paragraph()
        para.paragraph_format.right_indent = Inches(0.3)

        if "info" in class_set:
            palette = ConverterConfig.INFO_COLORS
        elif "warning" in class_set:
            palette = ConverterConfig.WARNING_COLORS
        else:  # success
            palette = ConverterConfig.SUCCESS_COLORS

        _process_element_to_runs(element, para, default_font, default_size)
        _set_paragraph_fill(para, palette["bg"])
        _set_paragraph_left_border(para, palette["border"])
        return

    # Unclassed div: an inline background colour together with a left
    # border marks it as a callout box.
    style_dict = _parse_style(element.get("style", ""))
    bg_color = style_dict.get("background-color", "")
    border_left = style_dict.get("border-left", "")

    if not (bg_color and border_left):
        # Ordinary div: render its direct paragraphs.
        for p in element.find_all("p", recursive=False):
            process_paragraph(p, doc)
        return

    para = doc.add_paragraph()
    para.paragraph_format.right_indent = Inches(0.3)
    _process_element_to_runs(element, para, default_font, default_size)

    if bg_color.startswith("#"):
        _set_paragraph_fill(para, bg_color[1:].upper())

    # The border colour is the first "#..." token of a solid border value.
    border_color = ""
    if "solid" in border_left:
        for part in border_left.split():
            if part.startswith("#"):
                border_color = part[1:]
                break
    if border_color:
        _set_paragraph_left_border(para, border_color.upper())


def convert_html_to_docx(
    html_file, output_file, default_font="微软雅黑", default_size=12
):
    """Convert an HTML file to a DOCX file.

    Args:
        html_file: Path of the UTF-8 encoded HTML input.
        output_file: Path the DOCX document is saved to.
        default_font: Default font name for the document.
        default_size: Default font size in points.

    Only the top-level elements are dispatched: headings, paragraphs,
    lists, blockquotes, ``<pre>`` blocks, horizontal rules, tables, images
    and divs. Other tags are skipped. When the HTML has no ``<body>``, the
    direct children of the parsed document are used instead.
    """
    # Read and parse the HTML, then create the pre-configured document.
    soup = BeautifulSoup(_read_html_file(html_file), "html.parser")
    doc = _init_document(default_font, default_size)

    # Walk the top-level elements in document order.
    root = soup.body if soup.body is not None else soup
    for element in root.find_all(recursive=False):
        name = element.name
        if name in ("h1", "h2", "h3", "h4", "h5", "h6"):
            _process_heading(element, doc, default_font)

        elif name == "p":
            _process_paragraph_element(element, doc, default_font, default_size)

        elif name == "ul":
            _process_list_element(element, doc, ordered=False)

        elif name == "ol":
            _process_list_element(element, doc, ordered=True)

        elif name == "blockquote":
            # Nested quotes are handled recursively.
            _process_blockquote(element, doc, level=0)

        elif name == "pre":
            # Code block: indented monospace run on a grey background.
            para = doc.add_paragraph()
            para.paragraph_format.left_indent = Inches(0.5)
            run = para.add_run(element.get_text())
            set_font(run, font_name="Consolas", size=10, color=RGBColor(0, 0, 128))
            _set_paragraph_fill(para, "F0F0F0")

        elif name == "hr":
            _process_horizontal_rule_element(element, doc)

        elif name == "table":
            process_table(element, doc)

        elif name == "img":
            _process_image_element(element, doc, html_file)

        elif name == "div":
            _convert_top_level_div(element, doc, default_font, default_size)

    # Save the document.
    doc.save(output_file)
796
1587
 
797
- if __name__ == '__main__':
1588
+
1589
if __name__ == "__main__":
    import sys

    # Exactly two arguments are required: the HTML input and the DOCX output.
    if len(sys.argv) != 3:
        print("用法: python docx_converter.py <html_file> <output_file>")
        sys.exit(1)

    html_file, output_file = sys.argv[1], sys.argv[2]
    convert_html_to_docx(html_file, output_file)