@birthday8/doc-mcp 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -4,13 +4,225 @@ from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_LINE_SPACING
4
4
  from docx.enum.section import WD_SECTION
5
5
  from docx.oxml.ns import qn
6
6
  from docx.oxml import OxmlElement
7
+ from docx.enum.shape import WD_INLINE_SHAPE
7
8
  from bs4 import BeautifulSoup
8
9
  import os
9
10
  import re
11
+ import json
12
+
13
+ # 尝试导入 math2docx
14
+ try:
15
+ import math2docx
16
+
17
+ HAS_MATH2DOCX = True
18
+ except ImportError:
19
+ HAS_MATH2DOCX = False
20
+ print("Warning: math2docx not installed. Formula support will be limited.")
21
+
# ==================== Precompiled regular expressions ====================
# Style parsing: each pattern pulls the value of one CSS declaration out of an
# inline ``style`` attribute. Group numbering is part of the contract with the
# callers: the value is group(1), except for MARGIN_RE / PADDING_RE where
# group(1) is the optional side (top/bottom/left/right) and group(2) the value.


def _css_property_re(name_pattern):
    """Compile a pattern matching ``<property>: <value>`` up to the next ``;``.

    The negative lookbehind stops a property name from matching as the tail of
    a longer one (``color`` inside ``border-color`` or ``background-color``).
    Matching is case-insensitive and tolerates whitespace before the colon,
    because CSS property names are case-insensitive.
    """
    return re.compile(
        r"(?<![\w-])" + name_pattern + r"\s*:\s*([^;]+)", re.IGNORECASE
    )


TEXT_ALIGN_RE = _css_property_re(r"text-align")
LINE_HEIGHT_RE = _css_property_re(r"line-height")
COLOR_RE = _css_property_re(r"color")
BACKGROUND_COLOR_RE = _css_property_re(r"background-color")
FONT_FAMILY_RE = _css_property_re(r"font-family")
FONT_SIZE_RE = _css_property_re(r"font-size")
FONT_WEIGHT_RE = _css_property_re(r"font-weight")
FONT_STYLE_RE = _css_property_re(r"font-style")
TEXT_DECORATION_RE = _css_property_re(r"text-decoration")
MARGIN_RE = _css_property_re(r"margin(?:-(top|bottom|left|right))?")
PADDING_RE = _css_property_re(r"padding(?:-(top|bottom|left|right))?")

# Formulas: ``$$...$$`` is captured in group(1), ``$...$`` in group(2).
LATEX_FORMULA_RE = re.compile(r"\$\$(.*?)\$\$|\$(.*?)\$")
38
+
39
+
40
+ # ==================== 常量配置 ====================
class ConverterConfig:
    """Constant tables used by the converter (fonts, page size, colours)."""

    # Font name and size (pt) used when nothing more specific is given.
    DEFAULT_FONT = "微软雅黑"
    DEFAULT_SIZE = 12

    # Page setup in centimetres (29.7 x 21.0 are the A4 dimensions).
    PAGE_HEIGHT_CM = 29.7
    PAGE_WIDTH_CM = 21.0
    MARGIN_CM = 2.54

    # Font size (pt) per heading level: h1 and h2 are larger, h3-h6 share 14.
    HEADING_SIZES = {1: 18, 2: 16, **dict.fromkeys(range(3, 7), 14)}

    # Text colour per heading level 1..6.
    HEADING_COLORS = {
        level: RGBColor(*rgb)
        for level, rgb in enumerate(
            [
                (74, 63, 107),
                (91, 78, 140),
                (107, 91, 149),
                (122, 104, 161),
                (137, 117, 173),
                (152, 130, 185),
            ],
            start=1,
        )
    }

    # Text colour selected by a CSS class name.
    CLASS_COLORS = {
        name: RGBColor(*rgb)
        for name, rgb in (
            ("red", (255, 0, 0)),
            ("blue", (0, 0, 255)),
            ("green", (0, 128, 0)),
            ("purple", (128, 0, 128)),
        )
    }

    # Call-out boxes: background and border as hex strings without '#'.
    INFO_COLORS = dict(bg="E3F2FD", border="2196F3")
    WARNING_COLORS = dict(bg="FFF3CD", border="FFC107")
    SUCCESS_COLORS = dict(bg="D4EDDA", border="28A745")

    # Colour name -> hex string without '#'.
    COLOR_MAP = dict(
        red="FF0000",
        green="008000",
        blue="0000FF",
        yellow="FFFF00",
        orange="FFA500",
        purple="800080",
        pink="FFC0CB",
        brown="A52A2A",
        gray="808080",
        black="000000",
        white="FFFFFF",
    )
108
+
109
+
def _add_image_placeholder(doc, image_path, para=None):
    """Write a grey ``[图片: <file name>]`` marker in place of a missing image.

    The marker goes into ``para`` when one is given, otherwise into a new
    paragraph appended to ``doc``.
    """
    if para is None:
        para = doc.add_paragraph()
    run = para.add_run(f"[图片: {os.path.basename(image_path)}]")
    run.font.color.rgb = RGBColor(150, 150, 150)


def add_image(doc, image_path, width=None, height=None, align="center"):
    """Append an image to the document in its own paragraph.

    Args:
        doc: Word document object.
        image_path: Path of the image file (absolute, or relative to the
            HTML file according to the caller).
        width: Optional image width in inches.
        height: Optional image height in inches.
        align: 'left', 'center' or 'right'; any other value means left.

    Returns:
        True when the picture was inserted. False when the file does not
        exist or insertion failed; a grey placeholder is written instead.
    """
    if not os.path.exists(image_path):
        print(f"Warning: Image file not found: {image_path}")
        _add_image_placeholder(doc, image_path)
        return False

    alignments = {
        "center": WD_PARAGRAPH_ALIGNMENT.CENTER,
        "right": WD_PARAGRAPH_ALIGNMENT.RIGHT,
    }
    para = doc.add_paragraph()
    para.alignment = alignments.get(align, WD_PARAGRAPH_ALIGNMENT.LEFT)

    try:
        # Pass only the dimensions that were supplied; add_picture works out
        # the rest itself.
        size_kwargs = {}
        if width:
            size_kwargs["width"] = Inches(width)
        if height:
            size_kwargs["height"] = Inches(height)
        para.add_run().add_picture(image_path, **size_kwargs)
    except Exception as e:
        print(f"Warning: Failed to add image {image_path}: {e}")
        import traceback

        traceback.print_exc()
        # Reuse the paragraph created above so a failed insert does not leave
        # an empty paragraph in front of the placeholder.
        _add_image_placeholder(doc, image_path, para=para)
        return False

    return True
165
+
166
+
# LaTeX command name (without the backslash) -> Unicode symbol.
_LATEX_COMMANDS = {
    "cdot": "·",
    "times": "×",
    "div": "÷",
    "neq": "≠",
    "leq": "≤",
    "geq": "≥",
    "pm": "±",
    "sqrt": "√",
    "pi": "π",
    "alpha": "α",
    "beta": "β",
    "gamma": "γ",
    "delta": "δ",
    "theta": "θ",
    "lambda": "λ",
    "mu": "μ",
    "sigma": "σ",
    "phi": "φ",
    "omega": "ω",
    "infty": "∞",
}

# Single-digit superscripts / subscripts that have a Unicode form.
_LATEX_SCRIPTS = {"^2": "²", "^3": "³", "_2": "₂", "_3": "₃"}

# Alternative 1: a known command, not followed by a letter, so that "\pi"
# does not match inside a longer command name. Alternative 2: a script.
# One or two backslashes before a command and an optional backslash before a
# script are accepted so that the escaped spellings the previous table was
# keyed on ("\\cdot", "\^2") keep converting.
_LATEX_TOKEN_RE = re.compile(
    r"\\{1,2}(" + "|".join(_LATEX_COMMANDS) + r")(?![A-Za-z])"
    r"|\\?([\^_][23])"
)


def latex_to_unicode_formula(latex_formula):
    """Replace common LaTeX commands and scripts with Unicode symbols.

    The earlier version stored regex-escaped keys (``r"\\cdot"``, ``r"\^2"``)
    but applied them with ``str.replace``, so ordinary input such as
    ``\cdot`` or ``x^2`` was never converted. Everything not in the tables
    is returned unchanged.

    Args:
        latex_formula: LaTeX source text; an empty or None value yields "".

    Returns:
        The text with recognised tokens replaced.
    """
    if not latex_formula:
        return ""

    def _to_unicode(match):
        command, script = match.groups()
        if command:
            return _LATEX_COMMANDS[command]
        return _LATEX_SCRIPTS[script]

    return _LATEX_TOKEN_RE.sub(_to_unicode, latex_formula)
202
+
203
+
def add_native_formula(para, latex_formula):
    """Try to insert ``latex_formula`` into ``para`` through math2docx.

    Returns:
        True when ``math2docx.add_math`` completed. False when math2docx is
        not installed or the call raised; in the latter case a warning and
        the traceback are printed.
    """
    if not HAS_MATH2DOCX:
        return False

    try:
        math2docx.add_math(para, latex_formula)
    except Exception as e:
        print(f"Warning: Failed to add native formula: {e}")
        import traceback

        traceback.print_exc()
        return False

    return True
221
+
10
222
 
11
223
  def parse_color(color_str):
12
224
  """解析颜色字符串为RGBColor"""
13
- if not color_str or not color_str.startswith('#'):
225
+ if not color_str or not color_str.startswith("#"):
14
226
  return None
15
227
  try:
16
228
  r = int(color_str[1:3], 16)
@@ -19,186 +231,114 @@ def parse_color(color_str):
19
231
  return RGBColor(r, g, b)
20
232
  except:
21
233
  return None
234
+ raise
235
+
22
236
 
def set_font(
    run,
    font_name="微软雅黑",
    size=12,
    color=None,
    bold=False,
    italic=False,
    underline=False,
    strike=False,
    highlight_color=None,
):
    """Apply font settings to a run.

    Args:
        run: The run to format.
        font_name: Font family, also written to the East Asian font slot.
        size: Font size in points.
        color: Optional RGBColor for the text.
        bold, italic: Always written to the run (True or False).
        underline, strike: Only written when truthy.
        highlight_color: Optional colour name; unknown names are ignored.
    """
    font = run.font
    font.name = font_name
    # font.name only covers Latin text; CJK glyphs use the w:eastAsia slot.
    run._element.rPr.rFonts.set(qn("w:eastAsia"), font_name)
    font.size = Pt(size)
    font.bold = bold
    font.italic = italic

    if color:
        font.color.rgb = color

    if underline:
        font.underline = True

    if strike:
        font.strike = True

    if highlight_color:
        from docx.enum.text import WD_COLOR_INDEX

        # Only members that exist on WD_COLOR_INDEX may appear here. The
        # previous table referenced CYAN, MAGENTA and ORANGE, which are not
        # members, so building the dict raised AttributeError for every
        # highlight request. TURQUOISE and PINK are the members python-docx
        # serialises as "cyan" and "magenta"; there is no orange highlight,
        # so the nearest colour (yellow) is used.
        color_map = {
            "yellow": WD_COLOR_INDEX.YELLOW,
            "green": WD_COLOR_INDEX.BRIGHT_GREEN,
            "cyan": WD_COLOR_INDEX.TURQUOISE,
            "magenta": WD_COLOR_INDEX.PINK,
            "blue": WD_COLOR_INDEX.TURQUOISE,
            "red": WD_COLOR_INDEX.RED,
            "darkblue": WD_COLOR_INDEX.BLUE,
            "orange": WD_COLOR_INDEX.YELLOW,
            "gray": WD_COLOR_INDEX.GRAY_25,
        }
        if highlight_color in color_map:
            font.highlight_color = color_map[highlight_color]
56
280
 
57
- def process_inline_elements(element, parent_run=None):
58
- """处理内联元素"""
59
- from docx.text.paragraph import Paragraph
60
-
61
- runs = []
62
-
63
- for child in element.children:
64
- if child.name is None: # 文本节点
65
- text = str(child).strip()
66
- if text:
67
- if parent_run:
68
- parent_run.add_text(text)
69
- else:
70
- runs.append({'text': text})
71
- elif child.name == 'strong' or child.name == 'b':
72
- if parent_run:
73
- parent_run.bold = True
74
- process_inline_elements(child, parent_run)
75
- else:
76
- runs.append({'text': child.get_text(), 'bold': True})
77
- elif child.name == 'em' or child.name == 'i':
78
- if parent_run:
79
- parent_run.italic = True
80
- process_inline_elements(child, parent_run)
81
- else:
82
- runs.append({'text': child.get_text(), 'italic': True})
83
- elif child.name == 'u':
84
- if parent_run:
85
- parent_run.underline = True
86
- process_inline_elements(child, parent_run)
87
- else:
88
- runs.append({'text': child.get_text(), 'underline': True})
89
- elif child.name == 's' or child.name == 'del':
90
- if parent_run:
91
- parent_run.strike = True
92
- process_inline_elements(child, parent_run)
93
- else:
94
- runs.append({'text': child.get_text(), 'strike': True})
95
- elif child.name == 'sup':
96
- if parent_run:
97
- parent_run.font.superscript = True
98
- process_inline_elements(child, parent_run)
99
- else:
100
- runs.append({'text': child.get_text(), 'superscript': True})
101
- elif child.name == 'sub':
102
- if parent_run:
103
- parent_run.font.subscript = True
104
- process_inline_elements(child, parent_run)
105
- else:
106
- runs.append({'text': child.get_text(), 'subscript': True})
107
- elif child.name == 'code':
108
- code_text = child.get_text()
109
- if parent_run:
110
- parent_run.font.name = 'Consolas'
111
- parent_run.font.size = Pt(10)
112
- parent_run.add_text(code_text)
113
- else:
114
- runs.append({'text': code_text, 'font': 'Consolas', 'size': 10})
115
- elif child.name == 'a':
116
- link_text = child.get_text()
117
- href = child.get('href', '')
118
- if parent_run:
119
- parent_run.add_text(link_text)
120
- else:
121
- runs.append({'text': link_text, 'link': href})
122
- elif child.name == 'span':
123
- style = child.get('style', '')
124
- color_match = re.search(r'color:\s*([^;]+)', style)
125
- bg_match = re.search(r'background(?:-color)?:\s*([^;]+)', style)
126
-
127
- props = {'text': child.get_text()}
128
- if color_match:
129
- color = parse_color(color_match.group(1).strip())
130
- if color:
131
- props['color'] = color
132
- if bg_match:
133
- bg_color = bg_match.group(1).strip()
134
- if bg_color.startswith('#'):
135
- bg_rgb = parse_color(bg_color)
136
- if bg_rgb:
137
- props['highlight'] = str(bg_rgb)
138
-
139
- if parent_run:
140
- if 'color' in props:
141
- parent_run.font.color.rgb = props['color']
142
- process_inline_elements(child, parent_run)
143
- else:
144
- runs.append(props)
145
- else:
146
- process_inline_elements(child, parent_run)
147
-
148
- return runs
149
281
 
def _apply_highlight(run, bg_color):
    """Give a run a background colour.

    Colour names are mapped to a Word highlight colour. ``#rgb`` and
    ``#rrggbb`` values are written as run shading, because highlight colours
    are limited to a fixed palette. Anything else is ignored.

    Args:
        run: The run to colour.
        bg_color: Colour name or hex string; case and surrounding whitespace
            are ignored.
    """
    from docx.enum.text import WD_COLOR_INDEX

    # Colour name -> highlight index. Names without an exact highlight
    # equivalent (from "lightblue" on) map to the nearest available colour.
    named_highlights = {
        "yellow": WD_COLOR_INDEX.YELLOW,
        "green": WD_COLOR_INDEX.GREEN,
        "brightgreen": WD_COLOR_INDEX.BRIGHT_GREEN,
        "blue": WD_COLOR_INDEX.BLUE,
        "darkblue": WD_COLOR_INDEX.DARK_BLUE,
        "red": WD_COLOR_INDEX.RED,
        "darkred": WD_COLOR_INDEX.DARK_RED,
        "darkyellow": WD_COLOR_INDEX.DARK_YELLOW,
        "lightgray": WD_COLOR_INDEX.GRAY_25,
        "gray": WD_COLOR_INDEX.GRAY_50,
        "black": WD_COLOR_INDEX.BLACK,
        "white": WD_COLOR_INDEX.WHITE,
        "pink": WD_COLOR_INDEX.PINK,
        "teal": WD_COLOR_INDEX.TEAL,
        "turquoise": WD_COLOR_INDEX.TURQUOISE,
        "violet": WD_COLOR_INDEX.VIOLET,
        "cyan": WD_COLOR_INDEX.TURQUOISE,
        "magenta": WD_COLOR_INDEX.VIOLET,
        "lightblue": WD_COLOR_INDEX.TURQUOISE,
        "lightyellow": WD_COLOR_INDEX.YELLOW,
        "lightgreen": WD_COLOR_INDEX.BRIGHT_GREEN,
        "orange": WD_COLOR_INDEX.YELLOW,
        "purple": WD_COLOR_INDEX.VIOLET,
        "brown": WD_COLOR_INDEX.DARK_YELLOW,
    }

    bg_lower = bg_color.lower().strip()

    if bg_lower in named_highlights:
        run.font.highlight_color = named_highlights[bg_lower]
        return

    if not bg_lower.startswith("#"):
        return

    digits = bg_lower[1:]
    if len(digits) == 3:
        # Expand CSS shorthand: #abc -> #aabbcc.
        digits = "".join(ch * 2 for ch in digits)
    if len(digits) != 6 or not set(digits) <= set("0123456789abcdef"):
        # w:fill must be exactly six hex digits; skip anything else rather
        # than write an invalid attribute into the document.
        return

    shading_elm = OxmlElement("w:shd")
    # w:val is a required attribute of w:shd; "clear" means a solid fill
    # with no pattern.
    shading_elm.set(qn("w:val"), "clear")
    shading_elm.set(qn("w:color"), "auto")
    shading_elm.set(qn("w:fill"), digits.upper())
    run._element.get_or_add_rPr().append(shading_elm)
199
331
 
200
- def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=12,
201
- indent=None, align=None, line_spacing=None):
332
+
333
+ def process_paragraph(
334
+ paragraph,
335
+ doc,
336
+ default_font="微软雅黑",
337
+ default_size=12,
338
+ indent=None,
339
+ align=None,
340
+ line_spacing=None,
341
+ ):
202
342
  """处理段落及其内联元素"""
203
343
  para = doc.add_paragraph()
204
344
 
@@ -211,7 +351,7 @@ def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=
211
351
  para.paragraph_format.first_line_indent = Inches(indent)
212
352
  else:
213
353
  # 从data-indent属性读取缩进(单位:em)
214
- data_indent = paragraph.get('data-indent', '')
354
+ data_indent = paragraph.get("data-indent", "")
215
355
  if data_indent:
216
356
  try:
217
357
  em_count = float(data_indent)
@@ -224,247 +364,536 @@ def process_paragraph(paragraph, doc, default_font='微软雅黑', default_size=
224
364
  if line_spacing:
225
365
  para.paragraph_format.line_spacing = line_spacing
226
366
 
367
+ # 解析段落的样式(包括行距和段距)
368
+ style = paragraph.get("style", "")
369
+
370
+ # 解析对齐方式
371
+ text_align_match = TEXT_ALIGN_RE.search(style)
372
+ if text_align_match:
373
+ align_str = text_align_match.group(1).strip().lower()
374
+ if align_str == "left":
375
+ para.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
376
+ elif align_str == "center":
377
+ para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
378
+ elif align_str == "right":
379
+ para.alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
380
+ elif align_str == "justify":
381
+ para.alignment = WD_PARAGRAPH_ALIGNMENT.JUSTIFY
382
+
383
+ # 解析行距
384
+ line_height_match = LINE_HEIGHT_RE.search(style)
385
+ if line_height_match:
386
+ line_height_str = line_height_match.group(1).strip()
387
+ if line_height_str.endswith("pt"):
388
+ # 固定行距
389
+ para.paragraph_format.line_spacing = float(line_height_str[:-2])
390
+ elif line_height_str.endswith("px"):
391
+ # px转换为pt
392
+ para.paragraph_format.line_spacing = float(line_height_str[:-2]) * 0.75
393
+ elif line_height_str.endswith("em"):
394
+ # em转换为pt(基于段落字号)
395
+ para.paragraph_format.line_spacing = para_size * float(line_height_str[:-2])
396
+ else:
397
+ # 尝试作为倍数处理
398
+ line_spacing_value = float(line_height_str)
399
+ para.paragraph_format.line_spacing = line_spacing_value
400
+
401
+ # 解析段后距
402
+ margin_bottom_match = MARGIN_RE.search(style)
403
+ if margin_bottom_match:
404
+ margin_bottom_str = margin_bottom_match.group(2).strip()
405
+ if margin_bottom_str.endswith("pt"):
406
+ para.paragraph_format.space_after = Pt(float(margin_bottom_str[:-2]))
407
+ elif margin_bottom_str.endswith("px"):
408
+ # px转换为pt
409
+ para.paragraph_format.space_after = Pt(float(margin_bottom_str[:-2]) * 0.75)
410
+ elif margin_bottom_str.endswith("em"):
411
+ # em转换为pt(基于段落字号)
412
+ para.paragraph_format.space_after = Pt(
413
+ para_size * float(margin_bottom_str[:-2])
414
+ )
415
+ else:
416
+ # 尝试作为pt处理
417
+ para.paragraph_format.space_after = Pt(float(margin_bottom_str))
418
+
419
+ # 解析段前距
420
+ margin_top_match = MARGIN_RE.search(style)
421
+ if margin_top_match:
422
+ margin_top_str = margin_top_match.group(2).strip()
423
+ if margin_top_str.endswith("pt"):
424
+ para.paragraph_format.space_before = Pt(float(margin_top_str[:-2]))
425
+ elif margin_top_str.endswith("px"):
426
+ # px转换为pt
427
+ para.paragraph_format.space_before = Pt(float(margin_top_str[:-2]) * 0.75)
428
+ elif margin_top_str.endswith("em"):
429
+ # em转换为pt(基于段落字号)
430
+ para.paragraph_format.space_before = Pt(
431
+ para_size * float(margin_top_str[:-2])
432
+ )
433
+ else:
434
+ # 尝试作为pt处理
435
+ para.paragraph_format.space_before = Pt(float(margin_top_str))
436
+
437
+ # 解析段落的字号
438
+ para_size = default_size
439
+ style = paragraph.get("style", "")
440
+ size_match = FONT_SIZE_RE.search(style)
441
+ if size_match:
442
+ size_str = size_match.group(1).strip()
443
+ # 处理不同单位:pt, px, em等
444
+ if size_str.endswith("pt"):
445
+ para_size = float(size_str[:-2])
446
+ elif size_str.endswith("px"):
447
+ # px转换为pt (1px ≈ 0.75pt)
448
+ para_size = float(size_str[:-2]) * 0.75
449
+ elif size_str.endswith("em"):
450
+ # em转换为pt (假设基础字号为12pt)
451
+ para_size = float(size_str[:-2]) * 12
452
+ else:
453
+ # 尝试直接解析为数字
454
+ para_size = float(size_str)
455
+
227
456
  # 处理段落内容 - 递归处理所有子元素
228
- _process_element_to_runs(paragraph, para, default_font, default_size)
457
+ _process_element_to_runs(paragraph, para, default_font, para_size)
229
458
 
230
459
  return para
231
460
 
# Inline tag name -> the boolean formatting keyword it switches on.
_INLINE_FORMAT_TAGS = {
    "strong": "bold",
    "b": "bold",
    "em": "italic",
    "i": "italic",
    "u": "underline",
    "s": "strike",
    "del": "strike",
}


def _css_font_size_to_pt(size_str, em_base):
    """Convert a CSS font-size value to points.

    Handles ``pt``, ``px`` (1px = 0.75pt), ``em`` (relative to ``em_base``)
    and bare numbers. Returns None for anything that cannot be parsed
    (keywords, percentages, ``rem``, ...), so a bad value does not raise.
    """
    value = size_str.strip().lower()
    try:
        for suffix, factor in (("pt", 1.0), ("px", 0.75), ("em", em_base)):
            if value.endswith(suffix):
                return float(value[: -len(suffix)]) * factor
        return float(value)
    except ValueError:
        return None


def _process_element_to_runs(
    element,
    para,
    default_font="微软雅黑",
    default_size=12,
    bold=False,
    italic=False,
    underline=False,
    strike=False,
    color=None,
    bg_color=None,
    font_name=None,
    font_size=None,
):
    """Walk the children of ``element`` and append formatted runs to ``para``.

    Every text node becomes its own run carrying the formatting inherited
    from its ancestors. The keyword arguments are that inherited state and
    are passed down on each recursive call.

    Changes from the previous implementation:
      * whitespace at the edges of a text node collapses to one space instead
        of being dropped, so ``Hello <b>world</b>`` keeps its space;
      * ``font_name`` / ``font_size`` are forwarded through every nested tag,
        so a span's font survives an inner ``<b>``, ``<i>``, ``<u>``, ``<s>``;
      * ``<sup>`` / ``<sub>`` mark exactly the runs their content produced,
        and nested tags inside them keep their own formatting;
      * an unparseable ``font-size`` is ignored instead of raising;
      * ``em`` sizes are relative to the inherited size;
      * only the first family of a ``font-family`` list is used.

    Args:
        element: Parsed HTML element whose children are processed.
        para: Paragraph that receives the runs.
        default_font, default_size: Used when no explicit font is inherited.
        bold, italic, underline, strike: Inherited character formatting.
        color: Inherited text colour (RGBColor) or None.
        bg_color: Inherited background colour string or None.
        font_name, font_size: Explicit font overrides from an enclosing span.
    """
    current_font = font_name or default_font
    current_size = font_size or default_size

    def recurse(node, **overrides):
        # Descend into `node` with the inherited state plus `overrides`.
        state = dict(
            bold=bold,
            italic=italic,
            underline=underline,
            strike=strike,
            color=color,
            bg_color=bg_color,
            font_name=font_name,
            font_size=font_size,
        )
        state.update(overrides)
        _process_element_to_runs(node, para, default_font, default_size, **state)

    for child in element.children:
        tag = child.name

        if tag is None:  # text node
            text = re.sub(r"\s+", " ", str(child))
            # Drop a leading space at the start of the paragraph or right
            # after a run that already ends in one, so consecutive
            # whitespace never doubles up.
            existing = para.runs
            if not existing or existing[-1].text.endswith(" "):
                text = text.lstrip(" ")
            if not text:
                continue
            run = para.add_run(text)
            set_font(
                run,
                font_name=current_font,
                size=current_size,
                bold=bold,
                italic=italic,
                underline=underline,
                strike=strike,
            )
            if color:
                run.font.color.rgb = color
            if bg_color:
                _apply_highlight(run, bg_color)

        elif tag in ("math", "latex"):
            # Formulas are inserted through math2docx without any of the
            # inherited styling (see the `continue` below).
            latex_formula = child.get_text().strip()
            if not latex_formula:
                continue
            if HAS_MATH2DOCX and add_native_formula(para, latex_formula):
                continue
            # Fallback: show the LaTeX source as code-styled text.
            run = para.add_run(latex_formula)
            set_font(run, font_name="Consolas", size=10, color=RGBColor(0, 0, 128))

        elif tag in _INLINE_FORMAT_TAGS:
            recurse(child, **{_INLINE_FORMAT_TAGS[tag]: True})

        elif tag in ("sup", "sub"):
            # Note where the new runs start, process the content normally,
            # then flag exactly the runs that were added.
            first_new = len(para.runs)
            recurse(child)
            script_attr = "superscript" if tag == "sup" else "subscript"
            for run in para.runs[first_new:]:
                setattr(run.font, script_attr, True)

        elif tag == "code":
            run = para.add_run(child.get_text())
            set_font(run, font_name="Consolas", size=10)

        elif tag == "a":
            # Links are rendered as blue underlined text, not as hyperlinks.
            run = para.add_run(child.get_text())
            set_font(run, font_name=current_font, size=current_size)
            run.font.underline = True
            run.font.color.rgb = RGBColor(0, 0, 255)

        elif tag == "span":
            style = child.get("style", "")
            classes = child.get("class", [])

            span_color = color
            span_bg = bg_color
            span_font = current_font
            span_size = current_size

            color_match = COLOR_RE.search(style)
            if color_match:
                parsed_color = parse_color(color_match.group(1).strip())
                if parsed_color:
                    span_color = parsed_color

            font_match = FONT_FAMILY_RE.search(style)
            if font_match:
                # A font-family value may be a fallback list; a run can only
                # name one font, so take the first entry without its quotes.
                first_family = font_match.group(1).split(",")[0]
                first_family = first_family.strip().strip("'\"").strip()
                if first_family:
                    span_font = first_family

            size_match = FONT_SIZE_RE.search(style)
            if size_match:
                parsed_size = _css_font_size_to_pt(size_match.group(1), current_size)
                if parsed_size:
                    span_size = parsed_size

            # A colour class wins over an inline colour; the first class
            # present in this priority order is used.
            for class_name in ("red", "blue", "green", "purple"):
                if class_name in classes:
                    span_color = ConverterConfig.CLASS_COLORS[class_name]
                    break

            bg_match = BACKGROUND_COLOR_RE.search(style)
            if bg_match:
                span_bg = bg_match.group(1).strip()
            if "highlight" in classes:
                span_bg = "yellow"

            recurse(
                child,
                color=span_color,
                bg_color=span_bg,
                font_name=span_font,
                font_size=span_size,
            )

        else:
            # Any other tag: keep the current formatting and descend.
            recurse(child)
748
+
348
749
 
def process_list_items(
    items, doc, ordered=False, default_font="微软雅黑", default_size=12, level=0
):
    """Append list items to the document, recursing into nested lists.

    Each item becomes a 'List Number' or 'List Bullet' paragraph indented by
    0.25 inch per nesting level. All nested ``<ul>`` / ``<ol>`` elements that
    are direct children of an item are processed in document order; the
    previous version handled only the first ``<ul>`` and the first ``<ol>``,
    and always the ``<ul>`` first.

    Args:
        items: The ``<li>`` elements of one list.
        doc: Word document object that receives the paragraphs.
        ordered: True for a numbered list, False for bullets.
        default_font, default_size: Font passed on to the item text.
        level: Nesting depth, 0 for a top-level list.
    """
    list_style = "List Number" if ordered else "List Bullet"

    for item in items:
        para = doc.add_paragraph(style=list_style)
        para.paragraph_format.left_indent = Inches(0.25 * (level + 1))

        # Work on a re-parsed copy so that removing the nested lists from
        # the text does not alter the tree the caller is iterating over.
        item_copy = BeautifulSoup(str(item), "html.parser").find("li")
        if item_copy:
            for nested in item_copy.find_all(["ul", "ol"], recursive=False):
                nested.decompose()
            if item_copy.get_text().strip():
                _process_element_to_runs(item_copy, para, default_font, default_size)

        for nested_list in item.find_all(["ul", "ol"], recursive=False):
            process_list_items(
                nested_list.find_all("li", recursive=False),
                doc,
                ordered=nested_list.name == "ol",
                default_font=default_font,
                default_size=default_size,
                level=level + 1,
            )
801
+
386
802
 
387
803
  def _parse_style(style_str):
388
804
  """解析style字符串为字典"""
389
805
  styles = {}
390
806
  if not style_str:
391
807
  return styles
392
- for item in style_str.split(';'):
393
- if ':' in item:
394
- key, value = item.split(':', 1)
808
+ for item in style_str.split(";"):
809
+ if ":" in item:
810
+ key, value = item.split(":", 1)
395
811
  styles[key.strip()] = value.strip()
396
812
  return styles
397
813
 
814
+
398
815
  def _apply_cell_style(cell_elem, style_dict):
399
816
  """应用单元格样式"""
400
817
  # 背景色
401
- bg_color = style_dict.get('background-color', '')
818
+ bg_color = style_dict.get("background-color", "")
402
819
  if bg_color:
403
820
  # 处理颜色值
404
- if bg_color.startswith('#'):
405
- shading_elm = OxmlElement('w:shd')
406
- shading_elm.set(qn('w:fill'), bg_color[1:].upper())
821
+ if bg_color.startswith("#"):
822
+ shading_elm = OxmlElement("w:shd")
823
+ shading_elm.set(qn("w:fill"), bg_color[1:].upper())
407
824
  cell_elem._element.get_or_add_tcPr().append(shading_elm)
408
-
825
+
409
826
  # 文字颜色
410
- color = style_dict.get('color', '')
827
+ color = style_dict.get("color", "")
411
828
  if color:
412
- rgb = parse_color(color) if color.startswith('#') else None
829
+ rgb = parse_color(color) if color.startswith("#") else None
413
830
  if rgb:
414
831
  for run in cell_elem.paragraphs[0].runs:
415
832
  run.font.color.rgb = rgb
416
833
 
417
- def process_table(table, doc, default_font='微软雅黑', default_size=11):
834
+
835
+ def process_table(table, doc, default_font="微软雅黑", default_size=11):
418
836
  """处理表格,支持内联样式"""
419
- rows = table.find_all('tr')
837
+ rows = table.find_all("tr")
420
838
  if not rows:
421
839
  return
422
-
840
+
423
841
  # 获取列数
424
- cols = max(len(row.find_all(['td', 'th'])) for row in rows)
425
-
842
+ cols = max(len(row.find_all(["td", "th"])) for row in rows)
843
+
426
844
  # 创建表格
427
845
  word_table = doc.add_table(rows=len(rows), cols=cols)
428
- word_table.style = 'Table Grid'
429
-
846
+ word_table.style = "Table Grid"
847
+
430
848
  for row_idx, row in enumerate(rows):
431
849
  # 处理行样式(如背景色)
432
- row_style = _parse_style(row.get('style', ''))
433
- row_bg = row_style.get('background-color', '')
434
-
435
- cells = row.find_all(['td', 'th'])
850
+ row_style = _parse_style(row.get("style", ""))
851
+ row_bg = row_style.get("background-color", "")
852
+
853
+ cells = row.find_all(["td", "th"])
436
854
  for col_idx, cell in enumerate(cells):
437
855
  if col_idx < cols:
438
856
  cell_elem = word_table.rows[row_idx].cells[col_idx]
439
- cell_elem.paragraphs[0].text = cell.get_text().strip()
440
-
857
+
441
858
  # 解析单元格样式
442
- cell_style = _parse_style(cell.get('style', ''))
443
-
859
+ cell_style = _parse_style(cell.get("style", ""))
860
+
861
+ # 清空默认段落
862
+ cell_elem.paragraphs[0].clear()
863
+
864
+ # 使用 _process_element_to_runs 处理单元格内容,保留格式
865
+ _process_element_to_runs(
866
+ cell,
867
+ cell_elem.paragraphs[0],
868
+ default_font=default_font,
869
+ default_size=default_size,
870
+ )
871
+
444
872
  # 表头加粗
445
- if cell.name == 'th':
873
+ if cell.name == "th":
446
874
  for run in cell_elem.paragraphs[0].runs:
447
875
  run.font.bold = True
448
-
876
+
449
877
  # 设置单元格对齐
450
- align = cell_style.get('text-align', 'center')
451
- if align == 'center':
878
+ align = cell_style.get("text-align", "center")
879
+ if align == "center":
452
880
  cell_elem.paragraphs[0].alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
453
- elif align == 'left':
881
+ elif align == "left":
454
882
  cell_elem.paragraphs[0].alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
455
- elif align == 'right':
883
+ elif align == "right":
456
884
  cell_elem.paragraphs[0].alignment = WD_PARAGRAPH_ALIGNMENT.RIGHT
457
-
885
+
458
886
  # 应用单元格样式(背景色、文字颜色)
459
887
  _apply_cell_style(cell_elem, cell_style)
460
-
888
+
461
889
  # 如果行有背景色且单元格没有单独设置,应用行背景色
462
- if row_bg and not cell_style.get('background-color'):
463
- if row_bg.startswith('#'):
464
- shading_elm = OxmlElement('w:shd')
465
- shading_elm.set(qn('w:fill'), row_bg[1:].upper())
890
+ if row_bg and not cell_style.get("background-color"):
891
+ if row_bg.startswith("#"):
892
+ shading_elm = OxmlElement("w:shd")
893
+ shading_elm.set(qn("w:fill"), row_bg[1:].upper())
466
894
  cell_elem._element.get_or_add_tcPr().append(shading_elm)
467
895
 
896
+
468
897
  def set_section_columns(section, cols_num=2, space=720):
469
898
  """设置节的多栏布局
470
899
 
@@ -474,11 +903,12 @@ def set_section_columns(section, cols_num=2, space=720):
474
903
  space: 栏间距(单位:twips,1英寸=1440twips),默认720(0.5英寸)
475
904
  """
476
905
  sectPr = section._sectPr
477
- cols = OxmlElement('w:cols')
478
- cols.set(qn('w:num'), str(cols_num))
479
- cols.set(qn('w:space'), str(space))
906
+ cols = OxmlElement("w:cols")
907
+ cols.set(qn("w:num"), str(cols_num))
908
+ cols.set(qn("w:space"), str(space))
480
909
  sectPr.append(cols)
481
910
 
911
+
482
912
  def add_columns_section(doc, cols_num=2, space=720):
483
913
  """添加连续分节符并设置多栏布局(不换页)
484
914
 
@@ -496,312 +926,606 @@ def add_columns_section(doc, cols_num=2, space=720):
496
926
  set_section_columns(section, cols_num, space)
497
927
  return section
498
928
 
929
+
499
930
  def _process_blockquote(blockquote_elem, doc, level=0):
500
931
  """递归处理嵌套引用"""
501
- # 获取当前引用的直接文本内容(不包括嵌套引用)
502
- direct_text = ''
932
+ # 检查是否有直接内容(不包括嵌套引用)
933
+ has_content = False
503
934
  for child in blockquote_elem.children:
504
935
  if child.name is None: # 文本节点
505
- direct_text += str(child)
506
- elif child.name != 'blockquote': # 其他非引用标签
507
- direct_text += child.get_text()
508
-
509
- direct_text = ' '.join(direct_text.split())
936
+ if str(child).strip():
937
+ has_content = True
938
+ break
939
+ elif child.name != "blockquote" and child.get_text().strip():
940
+ has_content = True
941
+ break
510
942
 
511
- # 如果有直接文本,创建段落
512
- if direct_text:
943
+ # 如果有直接内容,创建段落
944
+ if has_content:
513
945
  para = doc.add_paragraph()
514
- run = para.add_run(direct_text)
515
- set_font(run, italic=True, color=RGBColor(100, 100, 100))
516
946
  # 根据层级设置缩进
517
947
  para.paragraph_format.left_indent = Inches(0.3 * level)
518
948
  para.paragraph_format.right_indent = Inches(0.5)
519
949
  # 添加灰色左边框
520
- pBdr = OxmlElement('w:pBdr')
521
- left_border = OxmlElement('w:left')
522
- left_border.set(qn('w:val'), 'single')
523
- left_border.set(qn('w:sz'), '18')
524
- left_border.set(qn('w:color'), 'CCCCCC')
950
+ pBdr = OxmlElement("w:pBdr")
951
+ left_border = OxmlElement("w:left")
952
+ left_border.set(qn("w:val"), "single")
953
+ left_border.set(qn("w:sz"), "18")
954
+ left_border.set(qn("w:color"), "CCCCCC")
525
955
  pBdr.append(left_border)
526
956
  para.paragraph_format._element.get_or_add_pPr().append(pBdr)
527
957
 
958
+ # 创建一个临时元素来包含所有非blockquote的子元素
959
+ from bs4 import BeautifulSoup
960
+
961
+ temp_soup = BeautifulSoup("<div></div>", "html.parser")
962
+ temp_div = temp_soup.div
963
+
964
+ # 复制所有非blockquote的子元素
965
+ for child in blockquote_elem.children:
966
+ if child.name != "blockquote":
967
+ temp_div.append(
968
+ child.__copy__() if hasattr(child, "__copy__") else child
969
+ )
970
+
971
+ # 使用 _process_element_to_runs 处理格式化内容
972
+ # 注意:引用内容默认斜体和灰色
973
+ _process_element_to_runs(
974
+ temp_div,
975
+ para,
976
+ default_font="微软雅黑",
977
+ default_size=12,
978
+ italic=True,
979
+ color=RGBColor(100, 100, 100),
980
+ )
981
+
528
982
  # 递归处理嵌套引用
529
- nested_quotes = blockquote_elem.find_all('blockquote', recursive=False)
983
+ nested_quotes = blockquote_elem.find_all("blockquote", recursive=False)
530
984
  for nested in nested_quotes:
531
985
  _process_blockquote(nested, doc, level + 1)
532
986
 
987
+
533
988
  def add_page_break(doc):
534
989
  """添加分页符"""
535
990
  doc.add_page_break()
536
991
 
992
+
537
993
  def add_horizontal_rule(doc):
538
994
  """添加水平线"""
539
995
  para = doc.add_paragraph()
540
996
  para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
541
- run = para.add_run('_' * 50)
997
+ run = para.add_run("_" * 50)
542
998
  run.font.color.rgb = RGBColor(200, 200, 200)
543
999
 
544
- def convert_html_to_docx(html_file, output_file, default_font='微软雅黑', default_size=12):
1000
+
1001
+ # ==================== 辅助函数 ====================
1002
+ def _init_document(default_font, default_size):
1003
+ """初始化Word文档"""
1004
+ doc = Document()
1005
+ doc.styles["Normal"].font.name = default_font
1006
+ doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
1007
+ doc.styles["Normal"].font.size = Pt(default_size)
1008
+
1009
+ # 处理页面设置
1010
+ section = doc.sections[0]
1011
+ section.page_height = Cm(ConverterConfig.PAGE_HEIGHT_CM)
1012
+ section.page_width = Cm(ConverterConfig.PAGE_WIDTH_CM)
1013
+ section.left_margin = Cm(ConverterConfig.MARGIN_CM)
1014
+ section.right_margin = Cm(ConverterConfig.MARGIN_CM)
1015
+ section.top_margin = Cm(ConverterConfig.MARGIN_CM)
1016
+ section.bottom_margin = Cm(ConverterConfig.MARGIN_CM)
1017
+
1018
+ return doc
1019
+
1020
+
1021
+ def _read_html_file(html_file):
1022
+ """读取HTML文件"""
1023
+ with open(html_file, "r", encoding="utf-8") as f:
1024
+ return f.read()
1025
+
1026
+
1027
+ def _parse_html(html_content):
1028
+ """解析HTML内容"""
1029
+ return BeautifulSoup(html_content, "html.parser")
1030
+
1031
+
1032
+ def _process_heading(element, doc, default_font):
1033
+ """处理标题元素"""
1034
+ level = int(element.name[1])
1035
+ heading = doc.add_heading(element.get_text().strip(), level=level)
1036
+ heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
1037
+
1038
+ # 标题样式
1039
+ for run in heading.runs:
1040
+ run.font.name = default_font
1041
+ run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
1042
+ run.font.size = Pt(ConverterConfig.HEADING_SIZES.get(level, 14))
1043
+ run.font.bold = True
1044
+ run.font.color.rgb = ConverterConfig.HEADING_COLORS.get(
1045
+ level, RGBColor(107, 91, 149)
1046
+ )
1047
+ if level == 1:
1048
+ heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
1049
+
1050
+
1051
+ def _process_paragraph_element(element, doc):
1052
+ """处理段落元素"""
1053
+ classes = element.get("class", [])
1054
+ class_set = set(classes)
1055
+
1056
+ if "center" in class_set:
1057
+ para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER)
1058
+ elif "right" in class_set:
1059
+ para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT)
1060
+ elif "dialogue" in class_set:
1061
+ para = process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
1062
+ for run in para.runs:
1063
+ set_font(run, italic=True, color=RGBColor(107, 91, 122))
1064
+ elif "quote" in class_set or element.get("style", "").find("background") != -1:
1065
+ para = process_paragraph(element, doc)
1066
+ para.paragraph_format.left_indent = Inches(1)
1067
+ para.paragraph_format.right_indent = Inches(1)
1068
+ from docx.enum.text import WD_BORDER
1069
+
1070
+ for border in para.paragraph_format._element.xpath("./w:pBdr"):
1071
+ border.getparent().remove(border)
1072
+ # 添加边框效果(使用浅灰色背景模拟)
1073
+ shading_elm = OxmlElement("w:shd")
1074
+ shading_elm.set(qn("w:fill"), "F5F5F5")
1075
+ para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
1076
+ else:
1077
+ process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
1078
+
1079
+
1080
+ def _process_list_element(element, doc, ordered):
1081
+ """处理列表元素"""
1082
+ items = element.find_all("li", recursive=False)
1083
+ process_list_items(items, doc, ordered=ordered)
1084
+
1085
+
1086
+ def _process_image_element(element, doc, html_file):
1087
+ """处理图片元素"""
1088
+ src = element.get("src", "")
1089
+ if src:
1090
+ # 解析宽度、高度和对齐方式
1091
+ width = element.get("width")
1092
+ height = element.get("height")
1093
+ style = element.get("style", "")
1094
+ align = element.get("align", "center")
1095
+
1096
+ # 从 style 中提取对齐方式
1097
+ if "text-align: right" in style or "float: right" in style:
1098
+ align = "right"
1099
+ elif "text-align: left" in style or "float: left" in style:
1100
+ align = "left"
1101
+ elif "text-align: center" in style:
1102
+ align = "center"
1103
+
1104
+ # 处理宽度高度(支持像素转英寸)
1105
+ width_inch = None
1106
+ height_inch = None
1107
+ if width:
1108
+ width_px = float(width)
1109
+ width_inch = width_px / 96 # 假设96 DPI
1110
+
1111
+ if height:
1112
+ height_px = float(height)
1113
+ height_inch = height_px / 96
1114
+
1115
+ # 处理相对路径(相对于HTML文件)
1116
+ html_dir = os.path.dirname(html_file)
1117
+ image_path = os.path.join(html_dir, src) if not os.path.isabs(src) else src
1118
+
1119
+ # 添加图片
1120
+ add_image(doc, image_path, width_inch, height_inch, align)
1121
+
1122
+
1123
+ def _process_div_element(element, doc, default_font, default_size):
1124
+ """处理div元素"""
1125
+ classes = element.get("class", [])
1126
+ class_set = set(classes)
1127
+
1128
+ if "chapter" in class_set:
1129
+ # 处理章节
1130
+ h2 = element.find("h2")
1131
+ if h2:
1132
+ heading = doc.add_heading(h2.get_text().strip(), level=2)
1133
+ for run in heading.runs:
1134
+ run.font.color.rgb = RGBColor(91, 78, 140)
1135
+ run.font.size = Pt(16)
1136
+ run.font.name = default_font
1137
+ run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
1138
+
1139
+ paragraphs = element.find_all("p")
1140
+ for p in paragraphs:
1141
+ first_span = p.find("span", class_="first-line")
1142
+ if first_span:
1143
+ # 处理首字下沉效果
1144
+ para = doc.add_paragraph()
1145
+ para.paragraph_format.first_line_indent = Inches(0)
1146
+
1147
+ first_char_run = para.add_run(first_span.text)
1148
+ set_font(
1149
+ first_char_run, size=20, bold=True, color=RGBColor(102, 126, 234)
1150
+ )
1151
+ remaining_text = p.get_text().replace(first_span.text, "", 1)
1152
+ run = para.add_run(remaining_text)
1153
+ set_font(run)
1154
+ else:
1155
+ process_paragraph(p, doc, indent=0.5, line_spacing=1.5)
1156
+
1157
+ elif "ending" in class_set:
1158
+ para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER)
1159
+ for run in para.runs:
1160
+ set_font(run, italic=True, size=14, color=RGBColor(91, 78, 140))
1161
+
1162
+ elif "page-break" in class_set:
1163
+ add_page_break(doc)
1164
+
1165
+ elif "columns" in class_set:
1166
+ # 处理多栏布局(使用连续分节符,不换页)
1167
+ cols_num = int(element.get("data-cols", "2"))
1168
+ # 添加连续分节符并设置栏数
1169
+ add_columns_section(doc, cols_num)
1170
+ # 处理其中的段落
1171
+ for p in element.find_all("p", recursive=False):
1172
+ process_paragraph(
1173
+ p, doc, default_font=default_font, default_size=default_size
1174
+ )
1175
+
1176
+ elif "info" in class_set or "warning" in class_set or "success" in class_set:
1177
+ # 处理提示框
1178
+ para = doc.add_paragraph()
1179
+ para.paragraph_format.right_indent = Inches(0.3)
1180
+
1181
+ # 设置背景色和左边框颜色
1182
+ if "info" in class_set:
1183
+ bg_color = ConverterConfig.INFO_COLORS["bg"]
1184
+ border_color = ConverterConfig.INFO_COLORS["border"]
1185
+ elif "warning" in class_set:
1186
+ bg_color = ConverterConfig.WARNING_COLORS["bg"]
1187
+ border_color = ConverterConfig.WARNING_COLORS["border"]
1188
+ else: # success
1189
+ bg_color = ConverterConfig.SUCCESS_COLORS["bg"]
1190
+ border_color = ConverterConfig.SUCCESS_COLORS["border"]
1191
+
1192
+ # 处理内容
1193
+ _process_element_to_runs(element, para, default_font, default_size)
1194
+
1195
+ # 添加背景色
1196
+ shading_elm = OxmlElement("w:shd")
1197
+ shading_elm.set(qn("w:fill"), bg_color)
1198
+ para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
1199
+
1200
+ # 添加左边框
1201
+ pPr = para.paragraph_format._element.get_or_add_pPr()
1202
+ pBdr = OxmlElement("w:pBdr")
1203
+ left = OxmlElement("w:left")
1204
+ left.set(qn("w:val"), "single")
1205
+ left.set(qn("w:sz"), "4")
1206
+ left.set(qn("w:color"), border_color)
1207
+ pBdr.append(left)
1208
+ pPr.append(pBdr)
1209
+
1210
+ para.paragraph_format.space_after = Pt(6)
1211
+
1212
+
1213
+ def _process_horizontal_rule_element(element, doc):
1214
+ """处理水平线元素"""
1215
+ classes = element.get("class", [])
1216
+ style = element.get("style", "")
1217
+ class_set = set(classes)
1218
+ if "page-break" in class_set or "page-break-after" in style:
1219
+ add_page_break(doc)
1220
+ else:
1221
+ add_horizontal_rule(doc)
1222
+
1223
+
1224
+ def _process_elements(soup, doc, html_file, default_font, default_size):
1225
+ """处理所有HTML元素"""
1226
+ for element in soup.body.find_all(recursive=False):
1227
+ if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
1228
+ _process_heading(element, doc, default_font)
1229
+ elif element.name == "p":
1230
+ _process_paragraph_element(element, doc)
1231
+ elif element.name == "ul":
1232
+ _process_list_element(element, doc, ordered=False)
1233
+ elif element.name == "ol":
1234
+ _process_list_element(element, doc, ordered=True)
1235
+ elif element.name == "table":
1236
+ process_table(element, doc)
1237
+ elif element.name == "img":
1238
+ _process_image_element(element, doc, html_file)
1239
+ elif element.name == "div":
1240
+ _process_div_element(element, doc, default_font, default_size)
1241
+ elif element.name == "hr":
1242
+ _process_horizontal_rule_element(element, doc)
1243
+
1244
+
1245
+ def convert_html_to_docx(
1246
+ html_file, output_file, default_font="微软雅黑", default_size=12
1247
+ ):
545
1248
  """将HTML文件转换为DOCX文件"""
546
1249
  # 读取HTML文件
547
- with open(html_file, 'r', encoding='utf-8') as f:
1250
+ with open(html_file, "r", encoding="utf-8") as f:
548
1251
  html_content = f.read()
549
-
1252
+
550
1253
  # 解析HTML
551
- soup = BeautifulSoup(html_content, 'html.parser')
552
-
1254
+ soup = BeautifulSoup(html_content, "html.parser")
1255
+
553
1256
  # 创建Word文档
554
1257
  doc = Document()
555
-
1258
+
556
1259
  # 设置默认字体
557
- doc.styles['Normal'].font.name = default_font
558
- doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), default_font)
559
- doc.styles['Normal'].font.size = Pt(default_size)
560
-
1260
+ doc.styles["Normal"].font.name = default_font
1261
+ doc.styles["Normal"]._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
1262
+ doc.styles["Normal"].font.size = Pt(default_size)
1263
+
561
1264
  # 处理页面设置
562
1265
  section = doc.sections[0]
563
- section.page_height = Cm(29.7)
564
- section.page_width = Cm(21)
565
- section.left_margin = Cm(2.54)
566
- section.right_margin = Cm(2.54)
567
- section.top_margin = Cm(2.54)
568
- section.bottom_margin = Cm(2.54)
569
-
1266
+ section.page_height = Cm(ConverterConfig.PAGE_HEIGHT_CM)
1267
+ section.page_width = Cm(ConverterConfig.PAGE_WIDTH_CM)
1268
+ section.left_margin = Cm(ConverterConfig.MARGIN_CM)
1269
+ section.right_margin = Cm(ConverterConfig.MARGIN_CM)
1270
+ section.top_margin = Cm(ConverterConfig.MARGIN_CM)
1271
+ section.bottom_margin = Cm(ConverterConfig.MARGIN_CM)
1272
+
570
1273
  # 遍历所有顶级元素
571
1274
  for element in soup.body.find_all(recursive=False):
572
- if element.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
1275
+ if element.name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
573
1276
  level = int(element.name[1])
574
1277
  heading = doc.add_heading(element.get_text().strip(), level=level)
575
1278
  heading.alignment = WD_PARAGRAPH_ALIGNMENT.LEFT
576
-
1279
+
577
1280
  # 标题样式
578
1281
  for run in heading.runs:
579
1282
  run.font.name = default_font
580
- run._element.rPr.rFonts.set(qn('w:eastAsia'), default_font)
1283
+ run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
1284
+ run.font.size = Pt(ConverterConfig.HEADING_SIZES.get(level, 14))
1285
+ run.font.bold = True
1286
+ run.font.color.rgb = ConverterConfig.HEADING_COLORS.get(
1287
+ level, RGBColor(107, 91, 149)
1288
+ )
581
1289
  if level == 1:
582
- run.font.size = Pt(18)
583
- run.font.bold = True
584
- run.font.color.rgb = RGBColor(74, 63, 107)
585
1290
  heading.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
586
- elif level == 2:
587
- run.font.size = Pt(16)
588
- run.font.bold = True
589
- run.font.color.rgb = RGBColor(91, 78, 140)
590
- else:
591
- run.font.size = Pt(14)
592
- run.font.bold = True
593
-
594
- elif element.name == 'p':
1291
+
1292
+ elif element.name == "p":
595
1293
  # 检查特殊段落样式
596
- classes = element.get('class', [])
597
-
598
- if 'center' in classes:
599
- para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER)
600
- elif 'right' in classes:
601
- para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT)
602
- elif 'dialogue' in classes:
1294
+ classes = element.get("class", [])
1295
+ class_set = set(classes) # 转换为集合提高查找性能
1296
+
1297
+ if "center" in class_set:
1298
+ para = process_paragraph(
1299
+ element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER
1300
+ )
1301
+ elif "right" in class_set:
1302
+ para = process_paragraph(
1303
+ element, doc, align=WD_PARAGRAPH_ALIGNMENT.RIGHT
1304
+ )
1305
+ elif "dialogue" in class_set:
603
1306
  para = process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
604
1307
  for run in para.runs:
605
1308
  set_font(run, italic=True, color=RGBColor(107, 91, 122))
606
- elif 'quote' in classes or element.get('style', '').find('background') != -1:
1309
+ elif (
1310
+ "quote" in class_set
1311
+ or element.get("style", "").find("background") != -1
1312
+ ):
607
1313
  para = process_paragraph(element, doc)
608
1314
  para.paragraph_format.left_indent = Inches(1)
609
1315
  para.paragraph_format.right_indent = Inches(1)
610
1316
  from docx.enum.text import WD_BORDER
611
- for border in para.paragraph_format._element.xpath('./w:pBdr'):
1317
+
1318
+ for border in para.paragraph_format._element.xpath("./w:pBdr"):
612
1319
  border.getparent().remove(border)
613
1320
  # 添加边框效果(使用浅灰色背景模拟)
614
- shading_elm = OxmlElement('w:shd')
615
- shading_elm.set(qn('w:fill'), 'F5F5F5')
1321
+ shading_elm = OxmlElement("w:shd")
1322
+ shading_elm.set(qn("w:fill"), "F5F5F5")
616
1323
  para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
617
1324
  else:
618
1325
  process_paragraph(element, doc, indent=0.5, line_spacing=1.5)
619
-
620
- elif element.name == 'ul':
621
- items = element.find_all('li', recursive=False)
1326
+
1327
+ elif element.name == "ul":
1328
+ items = element.find_all("li", recursive=False)
622
1329
  process_list_items(items, doc, ordered=False)
623
-
624
- elif element.name == 'ol':
625
- items = element.find_all('li', recursive=False)
1330
+
1331
+ elif element.name == "ol":
1332
+ items = element.find_all("li", recursive=False)
626
1333
  process_list_items(items, doc, ordered=True)
627
-
628
- elif element.name == 'blockquote':
1334
+
1335
+ elif element.name == "blockquote":
629
1336
  # 递归处理嵌套引用
630
1337
  _process_blockquote(element, doc, level=0)
631
-
632
- elif element.name == 'pre':
1338
+
1339
+ elif element.name == "pre":
633
1340
  code_text = element.get_text()
634
1341
  para = doc.add_paragraph()
635
1342
  para.paragraph_format.left_indent = Inches(0.5)
636
1343
  run = para.add_run(code_text)
637
- set_font(run, font_name='Consolas', size=10, color=RGBColor(0, 0, 128))
1344
+ set_font(run, font_name="Consolas", size=10, color=RGBColor(0, 0, 128))
638
1345
  # 添加灰色背景
639
- shading_elm = OxmlElement('w:shd')
640
- shading_elm.set(qn('w:fill'), 'F0F0F0')
1346
+ shading_elm = OxmlElement("w:shd")
1347
+ shading_elm.set(qn("w:fill"), "F0F0F0")
641
1348
  para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
642
-
643
- elif element.name == 'hr':
1349
+
1350
+ elif element.name == "hr":
644
1351
  # 检查是否有分页符class或style
645
- classes = element.get('class', [])
646
- style = element.get('style', '')
647
- if 'page-break' in classes or 'page-break-after' in style:
1352
+ classes = element.get("class", [])
1353
+ style = element.get("style", "")
1354
+ class_set = set(classes)
1355
+ if "page-break" in class_set or "page-break-after" in style:
648
1356
  add_page_break(doc)
649
1357
  else:
650
1358
  add_horizontal_rule(doc)
651
-
652
- elif element.name == 'table':
1359
+
1360
+ elif element.name == "table":
653
1361
  process_table(element, doc)
654
-
655
- elif element.name == 'div':
1362
+
1363
+ elif element.name == "div":
656
1364
  # 检查是否是特殊div
657
- classes = element.get('class', [])
658
- if 'chapter' in classes:
1365
+ classes = element.get("class", [])
1366
+ class_set = set(classes)
1367
+
1368
+ if "chapter" in class_set:
659
1369
  # 处理章节
660
- h2 = element.find('h2')
1370
+ h2 = element.find("h2")
661
1371
  if h2:
662
1372
  heading = doc.add_heading(h2.get_text().strip(), level=2)
663
1373
  for run in heading.runs:
664
1374
  run.font.color.rgb = RGBColor(91, 78, 140)
665
1375
  run.font.size = Pt(16)
666
1376
  run.font.name = default_font
667
- run._element.rPr.rFonts.set(qn('w:eastAsia'), default_font)
668
-
669
- paragraphs = element.find_all('p')
1377
+ run._element.rPr.rFonts.set(qn("w:eastAsia"), default_font)
1378
+
1379
+ paragraphs = element.find_all("p")
670
1380
  for p in paragraphs:
671
- first_span = p.find('span', class_='first-line')
1381
+ first_span = p.find("span", class_="first-line")
672
1382
  if first_span:
673
1383
  para = doc.add_paragraph()
674
1384
  first_char_run = para.add_run(first_span.text)
675
- set_font(first_char_run, size=20, bold=True, color=RGBColor(102, 126, 234))
676
- remaining_text = p.get_text().replace(first_span.text, '', 1)
1385
+ set_font(
1386
+ first_char_run,
1387
+ size=20,
1388
+ bold=True,
1389
+ color=RGBColor(102, 126, 234),
1390
+ )
1391
+ remaining_text = p.get_text().replace(first_span.text, "", 1)
677
1392
  run = para.add_run(remaining_text)
678
1393
  set_font(run)
679
1394
  else:
680
1395
  process_paragraph(p, doc, indent=0.5, line_spacing=1.5)
681
-
682
- elif 'ending' in classes:
683
- para = process_paragraph(element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER)
1396
+
1397
+ elif "ending" in class_set:
1398
+ para = process_paragraph(
1399
+ element, doc, align=WD_PARAGRAPH_ALIGNMENT.CENTER
1400
+ )
684
1401
  for run in para.runs:
685
1402
  set_font(run, italic=True, size=14, color=RGBColor(91, 78, 140))
686
-
687
- elif 'page-break' in classes:
1403
+
1404
+ elif "page-break" in class_set:
688
1405
  add_page_break(doc)
689
1406
 
690
- elif 'columns' in classes:
1407
+ elif "columns" in class_set:
691
1408
  # 处理多栏布局(使用连续分节符,不换页)
692
- cols_num = int(element.get('data-cols', '2'))
1409
+ cols_num = int(element.get("data-cols", "2"))
693
1410
  # 添加连续分节符并设置栏数
694
1411
  add_columns_section(doc, cols_num)
695
1412
  # 处理其中的段落
696
- for p in element.find_all('p', recursive=False):
697
- process_paragraph(p, doc, default_font=default_font, default_size=default_size)
1413
+ for p in element.find_all("p", recursive=False):
1414
+ process_paragraph(
1415
+ p, doc, default_font=default_font, default_size=default_size
1416
+ )
698
1417
 
699
- elif 'info' in classes or 'warning' in classes or 'success' in classes:
1418
+ elif (
1419
+ "info" in class_set or "warning" in class_set or "success" in class_set
1420
+ ):
700
1421
  # 处理提示框
701
1422
  para = doc.add_paragraph()
702
1423
  para.paragraph_format.right_indent = Inches(0.3)
703
-
1424
+
704
1425
  # 设置背景色和左边框颜色
705
- if 'info' in classes:
706
- bg_color = 'E3F2FD' # 浅蓝
707
- border_color = '2196F3' # 蓝色
708
- elif 'warning' in classes:
709
- bg_color = 'FFF3CD' # 浅黄
710
- border_color = 'FFC107' # 黄色
1426
+ if "info" in class_set:
1427
+ bg_color = ConverterConfig.INFO_COLORS["bg"]
1428
+ border_color = ConverterConfig.INFO_COLORS["border"]
1429
+ elif "warning" in class_set:
1430
+ bg_color = ConverterConfig.WARNING_COLORS["bg"]
1431
+ border_color = ConverterConfig.WARNING_COLORS["border"]
711
1432
  else: # success
712
- bg_color = 'D4EDDA' # 浅绿
713
- border_color = '28A745' # 绿色
714
-
1433
+ bg_color = ConverterConfig.SUCCESS_COLORS["bg"]
1434
+ border_color = ConverterConfig.SUCCESS_COLORS["border"]
1435
+
715
1436
  # 处理内容
716
1437
  _process_element_to_runs(element, para, default_font, default_size)
717
-
1438
+
718
1439
  # 添加背景色
719
- shading_elm = OxmlElement('w:shd')
720
- shading_elm.set(qn('w:fill'), bg_color)
1440
+ shading_elm = OxmlElement("w:shd")
1441
+ shading_elm.set(qn("w:fill"), bg_color)
721
1442
  para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
722
-
1443
+
723
1444
  # 添加左边框
724
- pBdr = OxmlElement('w:pBdr')
725
- left_border = OxmlElement('w:left')
726
- left_border.set(qn('w:val'), 'single')
727
- left_border.set(qn('w:sz'), '24') # 边框粗细
728
- left_border.set(qn('w:color'), border_color)
1445
+ pBdr = OxmlElement("w:pBdr")
1446
+ left_border = OxmlElement("w:left")
1447
+ left_border.set(qn("w:val"), "single")
1448
+ left_border.set(qn("w:sz"), "24") # 边框粗细
1449
+ left_border.set(qn("w:color"), border_color)
729
1450
  pBdr.append(left_border)
730
1451
  para.paragraph_format._element.get_or_add_pPr().append(pBdr)
731
-
1452
+
732
1453
  else:
733
1454
  # 处理普通div,检查是否有内联样式(如提示框)
734
- style = element.get('style', '')
1455
+ style = element.get("style", "")
735
1456
  style_dict = _parse_style(style)
736
-
1457
+
737
1458
  # 检查是否有背景色和左边框(提示框特征)
738
- bg_color = style_dict.get('background-color', '')
739
- border_left = style_dict.get('border-left', '')
740
-
1459
+ bg_color = style_dict.get("background-color", "")
1460
+ border_left = style_dict.get("border-left", "")
1461
+
741
1462
  if bg_color and border_left:
742
1463
  # 这是提示框
743
1464
  para = doc.add_paragraph()
744
1465
  para.paragraph_format.right_indent = Inches(0.3)
745
-
1466
+
746
1467
  # 处理内容
747
1468
  _process_element_to_runs(element, para, default_font, default_size)
748
-
1469
+
749
1470
  # 添加背景色
750
- if bg_color.startswith('#'):
751
- shading_elm = OxmlElement('w:shd')
752
- shading_elm.set(qn('w:fill'), bg_color[1:].upper())
753
- para.paragraph_format._element.get_or_add_pPr().append(shading_elm)
754
-
1471
+ if bg_color.startswith("#"):
1472
+ shading_elm = OxmlElement("w:shd")
1473
+ shading_elm.set(qn("w:fill"), bg_color[1:].upper())
1474
+ para.paragraph_format._element.get_or_add_pPr().append(
1475
+ shading_elm
1476
+ )
1477
+
755
1478
  # 解析左边框颜色
756
- border_color = ''
757
- if 'solid' in border_left:
1479
+ border_color = ""
1480
+ if "solid" in border_left:
758
1481
  parts = border_left.split()
759
1482
  for i, part in enumerate(parts):
760
- if part.startswith('#'):
1483
+ if part.startswith("#"):
761
1484
  border_color = part[1:]
762
1485
  break
763
-
1486
+
764
1487
  # 添加左边框
765
1488
  if border_color:
766
- pBdr = OxmlElement('w:pBdr')
767
- left_border = OxmlElement('w:left')
768
- left_border.set(qn('w:val'), 'single')
769
- left_border.set(qn('w:sz'), '24')
770
- left_border.set(qn('w:color'), border_color.upper())
1489
+ pBdr = OxmlElement("w:pBdr")
1490
+ left_border = OxmlElement("w:left")
1491
+ left_border.set(qn("w:val"), "single")
1492
+ left_border.set(qn("w:sz"), "24")
1493
+ left_border.set(qn("w:color"), border_color.upper())
771
1494
  pBdr.append(left_border)
772
1495
  para.paragraph_format._element.get_or_add_pPr().append(pBdr)
773
1496
  else:
774
1497
  # 普通div,处理其中的段落
775
- for p in element.find_all('p', recursive=False):
1498
+ for p in element.find_all("p", recursive=False):
776
1499
  process_paragraph(p, doc)
777
-
778
- elif element.name == 'img':
779
- src = element.get('src', '')
780
- alt = element.get('alt', '图片')
1500
+
1501
+ elif element.name == "img":
1502
+ src = element.get("src", "")
1503
+ alt = element.get("alt", "图片")
781
1504
  if src and os.path.exists(src):
782
1505
  try:
783
1506
  doc.add_picture(src, width=Inches(5))
784
1507
  last_para = doc.paragraphs[-1]
785
1508
  last_para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
786
1509
  except:
787
- para = doc.add_paragraph(f'[图片: {alt}]')
1510
+ para = doc.add_paragraph(f"[图片: {alt}]")
788
1511
  para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
1512
+ raise
789
1513
  else:
790
- para = doc.add_paragraph(f'[图片: {alt} - 路径: {src}]')
1514
+ para = doc.add_paragraph(f"[图片: {alt} - 路径: {src}]")
791
1515
  para.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
792
-
1516
+
793
1517
  # 保存文档
794
1518
  doc.save(output_file)
795
- print(f"转换完成!文件已保存为 {output_file}")
796
1519
 
797
- if __name__ == '__main__':
1520
+
1521
+ if __name__ == "__main__":
798
1522
  import sys
799
-
800
- if len(sys.argv) > 2:
801
- html_file = sys.argv[1]
802
- output_file = sys.argv[2]
803
- else:
804
- html_file = r'C:\Users\birth\Desktop\tmp\test\sample.html'
805
- output_file = r'C:\Users\birth\Desktop\tmp\test\sample.docx'
806
-
807
- convert_html_to_docx(html_file, output_file)
1523
+
1524
+ if len(sys.argv) != 3:
1525
+ print("用法: python docx_converter.py <html_file> <output_file>")
1526
+ sys.exit(1)
1527
+
1528
+ html_file = sys.argv[1]
1529
+ output_file = sys.argv[2]
1530
+
1531
+ convert_html_to_docx(html_file, output_file)