md2word 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
md2word/converter.py ADDED
@@ -0,0 +1,1169 @@
1
+ """
2
+ Core converter module for md2word.
3
+ Converts Markdown content to Word documents.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import base64
9
+ import re
10
+ import uuid
11
+ from io import BytesIO
12
+ from pathlib import Path
13
+ from typing import TYPE_CHECKING
14
+
15
+ import httpx
16
+ import markdown2
17
+ from docx import Document
18
+ from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK, WD_LINE_SPACING
19
+ from docx.image.exceptions import UnrecognizedImageError
20
+ from docx.oxml import OxmlElement
21
+ from docx.oxml.ns import qn
22
+ from docx.shared import Inches, Pt, RGBColor
23
+ from html4docx import HtmlToDocx
24
+ from PIL import Image
25
+
26
+ from .config import Config, StyleConfig, TableConfig
27
+ from .latex import extract_latex_formulas, replace_formula_placeholders
28
+
29
+ if TYPE_CHECKING:
30
+ pass
31
+
32
+
33
def print_info(message: str) -> None:
    """Write *message* to standard output behind an ``[INFO]`` tag."""
    tagged = "[INFO] {}".format(message)
    print(tagged)
36
+
37
+
38
def print_error(message: str) -> None:
    """Write *message* to standard output behind an ``[ERROR]`` tag."""
    tagged = "[ERROR] {}".format(message)
    print(tagged)
41
+
42
+
43
def hex_to_rgb(hex_color: str) -> tuple[int, int, int]:
    """Convert a hex color string to an ``(r, g, b)`` tuple of ints in 0-255.

    Accepts ``"RRGGBB"`` and the CSS-style shorthand ``"RGB"``, with or
    without leading ``#`` characters and surrounding whitespace. For inputs
    longer than six digits only the first six are used, as before.

    Args:
        hex_color: Color such as ``"#ff8000"``, ``"ff8000"`` or ``"#f80"``.

    Returns:
        The red, green and blue components.

    Raises:
        ValueError: If the value cannot be read as a hexadecimal color.
    """
    digits = hex_color.strip().lstrip("#")

    # Expand shorthand: "f80" -> "ff8800".
    if len(digits) == 3:
        digits = "".join(ch * 2 for ch in digits)

    if len(digits) < 6:
        raise ValueError(f"Invalid hex color: {hex_color!r}")

    try:
        r, g, b = (int(digits[start:start + 2], 16) for start in (0, 2, 4))
    except ValueError:
        # Re-raise with the offending input instead of int()'s generic message.
        raise ValueError(f"Invalid hex color: {hex_color!r}") from None
    return (r, g, b)
50
+
51
+
52
# Chinese numerals for 0-20; the list index is the numeric value.
CHINESE_NUMBERS = [
    "零",
    "一",
    "二",
    "三",
    "四",
    "五",
    "六",
    "七",
    "八",
    "九",
    "十",
    "十一",
    "十二",
    "十三",
    "十四",
    "十五",
    "十六",
    "十七",
    "十八",
    "十九",
    "二十",
]


def number_to_chinese(n: int) -> str:
    """Return the Chinese numeral for *n*, or its decimal string if out of range.

    Only values covered by ``CHINESE_NUMBERS`` (0-20) are translated. Any
    other value, including negatives, falls back to ``str(n)``.
    """
    # The explicit lower bound matters: a negative index would otherwise
    # silently read from the end of the table (-1 -> "二十").
    if 0 <= n < len(CHINESE_NUMBERS):
        return CHINESE_NUMBERS[n]
    return str(n)
83
+
84
+
85
class HeadingNumbering:
    """Heading numbering manager.

    Keeps one counter per heading level and renders the current count either
    in one of the built-in styles named in ``FORMATS`` or through a
    caller-supplied ``str.format`` template.
    """

    # Templates for the built-in format names; "{n}" receives the rendered numeral.
    FORMATS = {
        "chapter": "第{n}章",
        "section": "第{n}节",
        "chinese": "{n}、",
        "chinese_paren": "({n})",
        "arabic": "{n}.",
        "arabic_paren": "({n})",
        "arabic_bracket": "[{n}]",
        "roman": "{n}.",
        "roman_lower": "{n}.",
        "letter": "{n}.",
        "letter_lower": "{n}.",
        "circle": "{n}",
        "none": "",
    }

    # Index == value; index 0 is an empty placeholder.
    ROMAN_NUMERALS = [
        "",
        "I",
        "II",
        "III",
        "IV",
        "V",
        "VI",
        "VII",
        "VIII",
        "IX",
        "X",
        "XI",
        "XII",
        "XIII",
        "XIV",
        "XV",
        "XVI",
        "XVII",
        "XVIII",
        "XIX",
        "XX",
    ]

    # Index == value, 0 through 20.
    CIRCLE_NUMBERS = [
        "⓪",
        "①",
        "②",
        "③",
        "④",
        "⑤",
        "⑥",
        "⑦",
        "⑧",
        "⑨",
        "⑩",
        "⑪",
        "⑫",
        "⑬",
        "⑭",
        "⑮",
        "⑯",
        "⑰",
        "⑱",
        "⑲",
        "⑳",
    ]

    def __init__(self):
        # heading level -> number of headings counted at that level
        self.counters = {}

    def reset(self, level: int | None = None):
        """Reset counters.

        With no argument every counter is discarded; otherwise the counters
        for *level* and all deeper levels are set back to zero.
        """
        if level is None:
            self.counters = {}
            return
        for lvl in self.counters:
            if lvl >= level:
                self.counters[lvl] = 0

    def get_number(self, level: int, format_name: str | None) -> str:
        """Advance the counter for *level* and return its rendered number.

        Args:
            level: Heading level whose counter is incremented. Counters of
                deeper levels are zeroed as a side effect.
            format_name: A key of ``FORMATS``, or any other string, which is
                then used as a ``str.format`` template receiving ``n`` (the
                integer) and ``cn`` (its Chinese numeral).

        Returns:
            The numbering text. An empty string is returned, and no counter
            is touched, when *format_name* is empty or ``"none"``.
        """
        if not format_name or format_name == "none":
            return ""

        self.counters[level] = self.counters.get(level, 0) + 1

        # A new heading at this level restarts numbering of everything below it.
        for lvl in self.counters:
            if lvl > level:
                self.counters[lvl] = 0

        n = self.counters[level]

        if format_name in ("chapter", "section", "chinese", "chinese_paren"):
            return self.FORMATS[format_name].format(n=number_to_chinese(n))

        if format_name in ("arabic", "arabic_paren", "arabic_bracket"):
            return self.FORMATS[format_name].format(n=n)

        if format_name in ("roman", "roman_lower"):
            # Beyond the lookup table the number is shown in decimal.
            numeral = self.ROMAN_NUMERALS[n] if n < len(self.ROMAN_NUMERALS) else str(n)
            if format_name == "roman_lower":
                numeral = numeral.lower()
            return self.FORMATS[format_name].format(n=numeral)

        if format_name in ("letter", "letter_lower"):
            first = "A" if format_name == "letter" else "a"
            letter = chr(ord(first) + n - 1) if n <= 26 else str(n)
            return self.FORMATS[format_name].format(n=letter)

        if format_name == "circle":
            return self.CIRCLE_NUMBERS[n] if n < len(self.CIRCLE_NUMBERS) else f"({n})"

        # Anything else is treated as a user-supplied template.
        try:
            return format_name.format(n=n, cn=number_to_chinese(n))
        except (LookupError, ValueError, AttributeError, TypeError):
            # str.format raises KeyError for unknown names, IndexError for
            # positional fields such as "{}", ValueError for malformed
            # templates, and AttributeError/TypeError for "{n.x}" / "{n[0]}".
            # All of them mean "unusable template", so fall back to plain
            # arabic numbering instead of aborting the conversion.
            return f"{n}. "
206
+
207
+
208
+ # Image processing functions
209
def process_image_content(image_content: bytes, url: str, local_dir: str = "./images") -> str:
    """Save raw image bytes under *local_dir* as PNG or JPEG and return the path.

    PNG and JPEG input is written to disk unchanged. Any other format is
    re-encoded: to PNG when the image has an alpha channel or palette
    transparency, otherwise to JPEG.

    Args:
        image_content: Encoded image bytes.
        url: Source URL or file name; its last path segment (query string
            removed) supplies the output file stem. A random UUID is used
            when that segment is empty.
        local_dir: Output directory, created if missing.

    Returns:
        Path of the written file, as a string.

    Raises:
        Exception: Whatever ``PIL.Image.open``/``save`` or the file system
            raise; callers in this module catch and log these.
    """
    target_dir = Path(local_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    image = Image.open(BytesIO(image_content))
    original_format = image.format.lower() if image.format else "png"

    if original_format in ("png", "jpeg", "jpg"):
        target_format = original_format
    elif image.mode in ("RGBA", "LA") or "transparency" in image.info:
        target_format = "png"
    else:
        target_format = "jpeg"

    url_filename = url.split("/")[-1].split("?")[0]
    name_without_ext = Path(url_filename).stem if url_filename else str(uuid.uuid4())
    local_path = target_dir / f"{name_without_ext}.{target_format}"

    if original_format == target_format:
        # Already a supported format: keep the original bytes untouched.
        local_path.write_bytes(image_content)
        print_info(f"Downloaded image: {url} -> {local_path}")
        return str(local_path)

    # JPEG is only chosen for images without alpha (RGBA/LA go to PNG above),
    # so no alpha flattening is needed here. What can still reach this point
    # is a mode the JPEG encoder rejects (e.g. palette "P" from a GIF, or
    # "I"/"F"); convert those to RGB so the save does not fail.
    if target_format == "jpeg" and image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")

    image.save(local_path, format=target_format.upper())
    print_info(f"Downloaded and converted image: {url} ({original_format} -> {target_format}) -> {local_path}")
    return str(local_path)
247
+
248
+
249
def download_image(url: str, config: Config) -> str | None:
    """Fetch the image at *url* and store it in the configured image directory.

    The request uses the user agent and timeout from *config* and follows
    redirects. Any failure (network, HTTP status, image processing) is
    logged and reported as ``None``.

    Returns:
        Local path of the stored image, or ``None`` on failure.
    """
    target_dir = config.image_local_dir
    request_headers = {"User-Agent": config.image_user_agent}
    request_timeout = config.image_download_timeout

    try:
        Path(target_dir).mkdir(parents=True, exist_ok=True)

        with httpx.Client() as http:
            reply = http.get(
                url,
                timeout=request_timeout,
                headers=request_headers,
                follow_redirects=True,
            )
            reply.raise_for_status()
            payload = reply.content

        saved_path = process_image_content(payload, url, local_dir=target_dir)
    except Exception as exc:
        print_error(f"Failed to download image {url}: {exc}")
        return None
    return saved_path
267
+
268
+
269
def ensure_local_image_compatible(image_path: str, local_dir: str = "./images") -> str | None:
    """Return a path to a PNG/JPEG version of the local image at *image_path*.

    PNG and JPEG files are returned as-is. Other recognizable formats are
    converted into *local_dir*. Missing, unreadable or unrecognizable files
    are logged and yield ``None``.
    """
    source = Path(image_path)
    if not source.exists():
        print_error(f"Local image not found: {image_path}")
        return None

    try:
        raw_bytes = source.read_bytes()
    except Exception as exc:
        print_error(f"Failed to read local image {image_path}: {exc}")
        return None

    try:
        probe = Image.open(BytesIO(raw_bytes))
        detected_format = probe.format.lower() if probe.format else "png"
        probe.verify()
    except Exception as exc:
        print_error(f"Cannot recognize local image {image_path}: {exc}")
        return None

    needs_conversion = detected_format not in ("png", "jpeg", "jpg")
    if not needs_conversion:
        return str(source)

    try:
        converted = process_image_content(raw_bytes, source.name, local_dir=local_dir)
    except Exception as exc:
        print_error(f"Failed to convert local image {image_path}: {exc}")
        return None
    return converted
298
+
299
+
300
def decode_data_uri_image(data_uri: str, local_dir: str = "./images") -> str | None:
    """Decode a base64 ``data:`` URI and store the image under *local_dir*.

    Returns the stored file's path, or ``None`` when the input is not a
    base64 data URI or when decoding/processing fails (failures are logged).
    """
    marker = "base64,"
    is_base64_data_uri = data_uri.startswith("data:") and marker in data_uri
    if not is_base64_data_uri:
        return None

    try:
        encoded = data_uri.split(marker, 1)[1]
        raw_bytes = base64.b64decode(encoded)
    except Exception as exc:
        print_error(f"Failed to decode data URI: {exc}")
        return None

    try:
        # The URI carries no file name, so a random one is generated.
        generated_name = f"inline_{uuid.uuid4().hex}"
        stored = process_image_content(raw_bytes, generated_name, local_dir=local_dir)
    except Exception as exc:
        print_error(f"Failed to process data URI image: {exc}")
        return None
    return stored
319
+
320
+
321
+ def _extract_img_attr(tag: str, attr: str) -> str | None:
322
+ """Extract attribute from img tag."""
323
+ match = re.search(rf'{attr}\s*=\s*(["\'])(.*?)\1', tag, flags=re.IGNORECASE)
324
+ if match:
325
+ return match.group(2)
326
+ match = re.search(rf"{attr}\s*=\s*([^\s>]+)", tag, flags=re.IGNORECASE)
327
+ if match:
328
+ return match.group(1)
329
+ return None
330
+
331
+
332
+ def _replace_img_src(tag: str, new_src: str) -> str:
333
+ """Replace src attribute in img tag."""
334
+ replacement = f'src="{new_src}"'
335
+ updated = re.sub(r'\bsrc\s*=\s*([\'"])(.*?)\1', lambda m: replacement, tag, flags=re.IGNORECASE)
336
+ if updated != tag:
337
+ return updated
338
+ updated = re.sub(r"\bsrc\s*=\s*([^\s>]+)", lambda m: replacement, tag, flags=re.IGNORECASE)
339
+ if updated != tag:
340
+ return updated
341
+ alt = _extract_img_attr(tag, "alt")
342
+ if alt:
343
+ return f'<img src="{new_src}" alt="{alt}">'
344
+ return f'<img src="{new_src}">'
345
+
346
+
347
def sanitize_html_images(html_content: str, config: Config) -> str:
    """Rewrite every ``<img>`` tag in *html_content* to point at a usable local file.

    Remote images are downloaded, ``data:`` URIs are decoded, and local files
    are checked and converted if needed. A tag whose image cannot be
    obtained, or that has no ``src``, is replaced by its ``alt`` text
    (possibly empty).
    """
    image_dir = config.image_local_dir
    tag_regex = re.compile(r"<img\b[^>]*>", flags=re.IGNORECASE)

    def rewrite_tag(found):
        img_tag = found.group(0)
        source = _extract_img_attr(img_tag, "src")
        fallback_text = _extract_img_attr(img_tag, "alt") or ""

        if not source:
            return fallback_text

        # Choose how to obtain a local file and what to log if that fails.
        if source.startswith(("http://", "https://")):
            resolved = download_image(source, config)
            failure_note = f"Image download failed, skipping: {source}"
        elif source.startswith("data:"):
            resolved = decode_data_uri_image(source, local_dir=image_dir)
            failure_note = "Data URI image processing failed, skipping"
        else:
            resolved = ensure_local_image_compatible(source, local_dir=image_dir)
            failure_note = f"Local image unavailable, skipping: {source}"

        if resolved:
            return _replace_img_src(img_tag, resolved)

        print_info(failure_note)
        return fallback_text

    return tag_regex.sub(rewrite_tag, html_content)
382
+
383
+
384
def is_docx_image_supported(image_path: str) -> bool:
    """Report whether python-docx can embed the image at *image_path*.

    The check inserts the picture into a throwaway document. An
    unrecognized image yields ``False`` silently; any other error is logged
    and also yields ``False``.
    """
    try:
        Document().add_picture(image_path)
    except UnrecognizedImageError:
        return False
    except Exception as exc:
        print_error(f"Failed to check image {image_path}: {exc}")
        return False
    return True
395
+
396
+
397
def extract_blockquotes(html_content: str) -> tuple[str, list[str]]:
    """Extract blockquotes from HTML and mark them with placeholders.

    Each ``<blockquote>...</blockquote>`` element is replaced by
    ``<p>__BLOCKQUOTE_PLACEHOLDER_<i>__</p>``, where ``<i>`` is the index of
    the quote's plain text in the returned list.

    Returns:
        Tuple of (modified HTML, list of blockquote texts). Texts have all
        tags removed, surrounding whitespace stripped and HTML entities
        decoded.
    """
    # Local import: the module's import block does not bring in ``html``.
    from html import unescape

    blockquotes: list[str] = []

    def save_blockquote(match):
        # Strip tags first, then decode entities, so an escaped "&lt;b&gt;"
        # in the quote survives as literal text.
        text = re.sub(r"<[^>]+>", "", match.group(0)).strip()
        # html.unescape handles every named/numeric entity, not just the
        # five most common ones.
        blockquotes.append(unescape(text))
        return f"<p>__BLOCKQUOTE_PLACEHOLDER_{len(blockquotes) - 1}__</p>"

    html_content = re.sub(
        r"<blockquote[^>]*>.*?</blockquote>",
        save_blockquote,
        html_content,
        flags=re.DOTALL | re.IGNORECASE,
    )

    return html_content, blockquotes
428
+
429
+
430
def replace_blockquote_placeholders(document, blockquotes: list[str], config: Config) -> None:
    """Replace blockquote placeholder paragraphs with styled quote paragraphs.

    A paragraph whose stripped text is exactly
    ``__BLOCKQUOTE_PLACEHOLDER_<i>__`` is cleared and refilled with
    ``blockquotes[i]``, formatted with the ``"blockquote"`` style from
    *config* and given a left border in the style's color.

    Args:
        document: Document whose ``paragraphs`` are scanned and modified in place.
        blockquotes: Quote texts; the list index is the placeholder number.
        config: Supplies the ``"blockquote"`` style via ``get_style``.
    """
    if not blockquotes:
        return

    style_config = config.get_style("blockquote")

    # One dict lookup per paragraph instead of comparing against every
    # placeholder in turn.
    quotes_by_placeholder = {
        f"__BLOCKQUOTE_PLACEHOLDER_{idx}__": quote_text
        for idx, quote_text in enumerate(blockquotes)
    }

    for paragraph in document.paragraphs:
        quote_text = quotes_by_placeholder.get(paragraph.text.strip())
        if quote_text is None:
            continue

        # Clear and rebuild paragraph
        paragraph.clear()
        run = paragraph.add_run(quote_text)

        # Apply blockquote style to run
        run.font.name = style_config.font_name
        run.font.size = Pt(style_config.font_size)
        run.font.italic = style_config.italic
        run.font.bold = style_config.bold

        # Set color
        r, g, b = hex_to_rgb(style_config.color)
        run.font.color.rgb = RGBColor(r, g, b)

        # Set East Asian font
        rPr = run._element.get_or_add_rPr()
        rFonts = rPr.get_or_add_rFonts()
        rFonts.set(qn("w:eastAsia"), style_config.font_name)

        # Apply paragraph formatting
        apply_style_to_paragraph(paragraph, style_config)

        # Add left border for blockquote visual effect
        pPr = paragraph._element.get_or_add_pPr()
        pBdr = OxmlElement("w:pBdr")
        left_border = OxmlElement("w:left")
        left_border.set(qn("w:val"), "single")
        left_border.set(qn("w:sz"), "24")  # Border width
        left_border.set(qn("w:space"), "4")  # Space between border and text
        left_border.set(qn("w:color"), style_config.color)
        pBdr.append(left_border)
        pPr.append(pBdr)

        print_info(f"Styled blockquote ({len(quote_text)} chars)")
477
+
478
+
479
def extract_code_blocks(html_content: str) -> tuple[str, list[dict], list[str]]:
    """Extract code blocks from HTML and replace them with placeholders.

    Block code (``<div class="codehilite"><pre>...`` or a bare ``<pre>``)
    is replaced by ``<p>__CODE_BLOCK_PLACEHOLDER_<i>__</p>``. Inline
    ``<code>text</code>`` is rewritten to ``⟦CODE⟧text⟦/CODE⟧`` so it can be
    located again in the generated document.

    Returns:
        Tuple of (modified HTML, list of code block info dicts, list of
        inline codes). Each dict has ``"code"`` (entity-decoded, stripped
        text) and ``"placeholder"``. The inline list holds entity-decoded
        text.
    """
    # Local import: the module's import block does not bring in ``html``.
    from html import unescape

    code_blocks: list[dict] = []
    inline_codes: list[str] = []

    def save_code_block(match):
        # Remove all span tags (syntax highlighting)
        clean_content = re.sub(r"<span[^>]*>", "", match.group(0))
        clean_content = re.sub(r"</span>", "", clean_content)

        # Prefer the <code> body; fall back to the text between <pre> tags.
        inner = re.search(r"<code[^>]*>(.*?)</code>", clean_content, flags=re.DOTALL | re.IGNORECASE)
        if inner is None:
            inner = re.search(r"<pre[^>]*>(.*?)</pre>", clean_content, flags=re.DOTALL | re.IGNORECASE)

        # Entities are decoded on both paths, so a bare <pre> block gets the
        # same treatment as <pre><code>.
        code_text = unescape(inner.group(1)) if inner else ""

        placeholder = f"__CODE_BLOCK_PLACEHOLDER_{len(code_blocks)}__"
        code_blocks.append({"code": code_text.strip(), "placeholder": placeholder})
        return f"<p>{placeholder}</p>"

    # Extract code blocks wrapped in codehilite div
    html_content = re.sub(
        r"<div[^>]*class=\"codehilite\"[^>]*>\s*<pre[^>]*>.*?</pre>\s*</div>",
        save_code_block,
        html_content,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Extract standalone pre blocks
    html_content = re.sub(
        r"<pre[^>]*>.*?</pre>",
        save_code_block,
        html_content,
        flags=re.DOTALL | re.IGNORECASE,
    )

    def mark_inline_code(match):
        escaped_text = match.group(1)
        inline_codes.append(unescape(escaped_text))
        # The result is still HTML, so the code stays entity-escaped between
        # the markers. Inserting decoded text would turn "&lt;div&gt;" into
        # a real <div> tag for whatever parses this HTML next.
        return f"⟦CODE⟧{escaped_text}⟦/CODE⟧"

    # Mark inline code with special markers
    html_content = re.sub(
        r"<code>([^<]*)</code>",
        mark_inline_code,
        html_content,
        flags=re.IGNORECASE,
    )

    return html_content, code_blocks, inline_codes
544
+
545
+
546
def add_code_block_to_document(paragraph, code_text: str, config: Config) -> None:
    """Rebuild *paragraph* as a shaded code block containing *code_text*.

    The paragraph is emptied, then one run is added per source line (joined
    by line breaks) in the ``"code"`` style's font and size. Spacing is set
    to 6pt before and after with single line spacing, and a paragraph
    shading in the style's background color (default ``f5f5f5``) is appended.
    """
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn

    style = config.get_style("code")
    mono_font = style.font_name
    mono_size = style.font_size
    fill_color = style.background_color or "f5f5f5"

    # Drop the placeholder text before writing the code.
    paragraph.clear()

    code_lines = code_text.split("\n")
    final_index = len(code_lines) - 1
    for index, content in enumerate(code_lines):
        line_run = paragraph.add_run(content)
        line_run.font.name = mono_font
        line_run.font.size = Pt(mono_size)
        # Same font for East Asian characters.
        line_run._element.rPr.rFonts.set(qn("w:eastAsia"), mono_font)

        # Every line but the last ends with a line break.
        if index != final_index:
            line_run.add_break()

    layout = paragraph.paragraph_format
    layout.space_before = Pt(6)
    layout.space_after = Pt(6)
    layout.line_spacing = 1.0

    # Background shading for the whole paragraph.
    shading = OxmlElement("w:shd")
    for attr_name, attr_value in (("w:val", "clear"), ("w:color", "auto"), ("w:fill", fill_color)):
        shading.set(qn(attr_name), attr_value)
    paragraph._element.get_or_add_pPr().append(shading)
585
+
586
+
587
def replace_code_block_placeholders(document, code_blocks: list[dict], config: Config) -> None:
    """Swap each code-block placeholder paragraph in *document* for formatted code.

    *code_blocks* holds dicts with ``"placeholder"`` and ``"code"`` keys. A
    paragraph whose stripped text equals a placeholder is rebuilt from the
    matching code.
    """
    if not code_blocks:
        return

    code_by_placeholder = {}
    for entry in code_blocks:
        code_by_placeholder[entry["placeholder"]] = entry["code"]

    for para in document.paragraphs:
        key = para.text.strip()
        if key not in code_by_placeholder:
            continue
        snippet = code_by_placeholder[key]
        add_code_block_to_document(para, snippet, config)
        print_info(f"Added code block ({len(snippet)} chars)")
599
+
600
+
601
def style_inline_code_in_document(document, config: Config) -> None:
    """Turn ``⟦CODE⟧...⟦/CODE⟧`` markers in paragraphs into styled code runs.

    Each paragraph containing at least one complete marker pair is cleared
    and rebuilt from its plain text. Text outside markers becomes runs in
    the ``"body"`` style's font and size; text inside becomes runs in the
    ``"code"`` style's font and size with background shading (default
    ``f5f5f5``).
    """
    code_style = config.get_style("code")
    body_style = config.get_style("body")
    mono_font = code_style.font_name
    mono_size = code_style.font_size
    fill_color = code_style.background_color or "f5f5f5"

    marker_regex = re.compile(r"⟦CODE⟧(.*?)⟦/CODE⟧")

    def add_body_run(target, text):
        # Plain-text segment in the body font; empty segments add nothing.
        if not text:
            return
        plain = target.add_run(text)
        plain.font.name = body_style.font_name
        plain.font.size = Pt(body_style.font_size)
        # East Asian font so Chinese text uses the body font too.
        plain_fonts = plain._element.get_or_add_rPr().get_or_add_rFonts()
        plain_fonts.set(qn("w:eastAsia"), body_style.font_name)

    for paragraph in document.paragraphs:
        original_text = paragraph.text
        if "⟦CODE⟧" not in original_text:
            continue

        spans = list(marker_regex.finditer(original_text))
        if not spans:
            continue

        # Rebuild the paragraph from scratch.
        paragraph.clear()

        cursor = 0
        for span in spans:
            add_body_run(paragraph, original_text[cursor:span.start()])

            code_run = paragraph.add_run(span.group(1))
            code_run.font.name = mono_font
            code_run.font.size = Pt(mono_size)
            code_props = code_run._element.get_or_add_rPr()
            code_props.get_or_add_rFonts().set(qn("w:eastAsia"), mono_font)

            # Run-level shading gives the inline code its background.
            shading = OxmlElement("w:shd")
            shading.set(qn("w:val"), "clear")
            shading.set(qn("w:color"), "auto")
            shading.set(qn("w:fill"), fill_color)
            code_props.append(shading)

            cursor = span.end()

        # Whatever follows the last marker pair.
        add_body_run(paragraph, original_text[cursor:])
669
+
670
+
671
def filter_unrecognized_images(html_content: str) -> str:
    """Replace ``<img>`` tags that cannot be embedded with their ``alt`` text.

    A tag is dropped when it has no ``src``, when ``src`` is still a remote
    or ``data:`` link, or when python-docx does not recognize the file. All
    other tags are left untouched.
    """

    def keep_or_drop(found):
        img_tag = found.group(0)
        source = _extract_img_attr(img_tag, "src")
        fallback_text = _extract_img_attr(img_tag, "alt") or ""

        if not source:
            return fallback_text

        still_remote = source.startswith(("http://", "https://", "data:"))
        if still_remote:
            print_info(f"Unprocessed image link, skipping: {source}")
            return fallback_text

        if is_docx_image_supported(source):
            return img_tag

        print_info(f"Image cannot be recognized, skipping: {source}")
        return fallback_text

    return re.sub(r"<img\b[^>]*>", keep_or_drop, html_content, flags=re.IGNORECASE)
694
+
695
+
696
def process_markdown_images(markdown_content: str, config: Config) -> str:
    """Download remote images referenced in markdown and point the links at local copies.

    Only ``![alt](target)`` links whose URL starts with ``http://`` or
    ``https://`` are touched. An optional title after the URL
    (``![alt](url "title")``) is kept in the rewritten link and is not sent
    as part of the download URL. A link whose download fails is replaced by
    its alt text.

    Args:
        markdown_content: Markdown source text.
        config: Passed through to ``download_image``.

    Returns:
        The markdown with remote image links rewritten.
    """
    image_pattern = r"!\[([^\]]*)\]\(([^)]+)\)"

    def replace_image(match):
        alt_text = match.group(1)

        # The link target is "url" or 'url "title"'; split off the title so
        # only the URL itself is requested.
        target_parts = match.group(2).split(None, 1)
        if not target_parts:
            return match.group(0)
        image_url = target_parts[0]
        title = target_parts[1].strip() if len(target_parts) > 1 else ""

        if not image_url.startswith(("http://", "https://")):
            return match.group(0)

        local_path = download_image(image_url, config)
        if local_path:
            title_suffix = f" {title}" if title else ""
            return f"![{alt_text}]({local_path}{title_suffix})"

        print_info(f"Image download failed, skipping: {image_url}")
        return alt_text or ""

    return re.sub(image_pattern, replace_image, markdown_content)
715
+
716
+
717
def resize_images_in_document(document, max_width_inches: float = 6.0) -> None:
    """Shrink inline pictures wider than *max_width_inches*, keeping aspect ratio.

    Shapes that are not pictures, or that already fit, are left alone. Any
    error while walking the shapes is logged and ends the pass.
    """
    try:
        for picture in document.inline_shapes:
            is_picture = hasattr(picture, "type") and "PICTURE" in str(picture.type)
            if not is_picture:
                continue

            width_in = picture.width.inches
            height_in = picture.height.inches
            if not (width_in > max_width_inches):
                continue

            # Scale the height by the same factor as the width.
            shrink = max_width_inches / width_in
            scaled_height_in = height_in * shrink

            picture.width = Inches(max_width_inches)
            picture.height = Inches(scaled_height_in)

            print_info(
                f"Resized image: {width_in:.2f}x{height_in:.2f} -> "
                f"{max_width_inches:.2f}x{scaled_height_in:.2f} inches"
            )
    except Exception as exc:
        print_error(f"Error resizing images: {exc}")
738
+
739
+
740
+ # Style application functions
741
def apply_style_to_run(run, style_config: StyleConfig) -> None:
    """Apply font name, size, weight, slant and color from *style_config* to *run*.

    Bold/italic already set on the run are kept; the style's values apply
    only where the run has none. The East Asian font is set as well when the
    run's properties contain an ``rFonts`` element.
    """
    font = run.font
    font.name = style_config.font_name
    font.size = Pt(style_config.font_size)

    # Keep emphasis produced by the HTML conversion.
    font.bold = font.bold or style_config.bold
    font.italic = font.italic or style_config.italic

    font.color.rgb = RGBColor(*hex_to_rgb(style_config.color))

    # Chinese text uses the East Asian font slot.
    run_props = run._element.rPr
    if run_props is None:
        return
    fonts_el = run_props.rFonts
    if fonts_el is not None:
        fonts_el.set(qn("w:eastAsia"), style_config.font_name)
757
+
758
+
759
def apply_style_to_paragraph(paragraph, style_config: StyleConfig) -> None:
    """Apply spacing, alignment, line spacing and indents from *style_config*.

    Unknown alignment names are ignored. Line spacing follows
    ``line_spacing_rule``; a rule that is unrecognized, or that lacks the
    value it needs, falls back to the plain ``line_spacing`` multiplier when
    that is positive.
    """
    fmt = paragraph.paragraph_format
    fmt.space_before = Pt(style_config.space_before)
    fmt.space_after = Pt(style_config.space_after)

    alignments = {
        "left": WD_ALIGN_PARAGRAPH.LEFT,
        "center": WD_ALIGN_PARAGRAPH.CENTER,
        "right": WD_ALIGN_PARAGRAPH.RIGHT,
        "justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
    }
    if style_config.alignment in alignments:
        fmt.alignment = alignments[style_config.alignment]

    rule = style_config.line_spacing_rule
    # Rules that need an explicit value in points.
    point_rules = {
        "exact": WD_LINE_SPACING.EXACTLY,
        "at_least": WD_LINE_SPACING.AT_LEAST,
    }
    # Rules that are complete on their own.
    preset_rules = {
        "single": WD_LINE_SPACING.SINGLE,
        "1.5": WD_LINE_SPACING.ONE_POINT_FIVE,
        "double": WD_LINE_SPACING.DOUBLE,
    }

    if rule in point_rules and style_config.line_spacing_value:
        fmt.line_spacing_rule = point_rules[rule]
        fmt.line_spacing = Pt(style_config.line_spacing_value)
    elif rule in preset_rules:
        fmt.line_spacing_rule = preset_rules[rule]
    elif rule == "multiple":
        fmt.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
        if style_config.line_spacing_value:
            fmt.line_spacing = style_config.line_spacing_value
        elif style_config.line_spacing > 0:
            fmt.line_spacing = style_config.line_spacing
    elif style_config.line_spacing > 0:
        fmt.line_spacing = style_config.line_spacing

    if style_config.left_indent > 0:
        fmt.left_indent = Inches(style_config.left_indent)

    # First-line indent is configured in characters: one character is taken
    # to be as wide as the font size.
    if style_config.first_line_indent > 0:
        fmt.first_line_indent = Pt(style_config.first_line_indent * style_config.font_size)
805
+
806
+
807
def get_heading_level(paragraph) -> int | None:
    """Return the heading level encoded in the paragraph's style name.

    Style names beginning with ``Heading`` and followed by an integer
    (``"Heading 2"``, ``"Heading2"``) give that integer. Anything else,
    including a paragraph without a style, gives ``None``.
    """
    style = paragraph.style
    name = style.name if style else ""
    if not name.startswith("Heading"):
        return None

    suffix = name.replace("Heading ", "").replace("Heading", "")
    try:
        level = int(suffix)
    except ValueError:
        return None
    return level
816
+
817
+
818
def is_code_block_paragraph(paragraph) -> bool:
    """Tell whether *paragraph* carries a non-white background shading.

    Code blocks are recognized by the ``w:shd`` fill on their paragraph
    properties. Fills of ``auto``, ``ffffff`` or ``none`` (any case) and
    missing fills do not count.
    """
    props = paragraph._element.pPr
    if props is None:
        return False

    shading = props.find(qn("w:shd"))
    if shading is None:
        return False

    fill = shading.get(qn("w:fill"))
    return bool(fill) and fill.lower() not in ("auto", "ffffff", "none")
829
+
830
+
831
def apply_styles_to_document(document, config: Config) -> None:
    """Style every paragraph and table of *document* according to *config*.

    Shaded code-block paragraphs are skipped. Headings use the
    ``heading_<level>`` style and, when that style defines a numbering
    format, get their number prepended to the first run. All other
    paragraphs use the ``body`` style. Tables are styled last.
    """
    counter = HeadingNumbering()

    for para in document.paragraphs:
        # Code blocks already carry their own formatting.
        if is_code_block_paragraph(para):
            continue

        level = get_heading_level(para)
        if level is None:
            chosen = config.get_style("body")
        else:
            chosen = config.get_style(f"heading_{level}")

            # Number the heading; this needs a run to put the number in.
            if chosen.numbering_format and para.runs:
                prefix = counter.get_number(level, chosen.numbering_format)
                if prefix:
                    lead = para.runs[0]
                    lead.text = prefix + lead.text

        apply_style_to_paragraph(para, chosen)
        for text_run in para.runs:
            apply_style_to_run(text_run, chosen)

    apply_table_styles(document, config)
863
+
864
+
865
def _table_get_or_add_child(parent, tag: str):
    """Return *parent*'s first child element named *tag*, appending a new one if absent."""
    child = parent.find(qn(tag))
    if child is None:
        child = OxmlElement(tag)
        parent.append(child)
    return child


def _table_set_width(table, width: str, width_type: str) -> None:
    """Set the ``w:tblW`` of *table*, reusing an existing element when present."""
    tbl = table._tbl
    tblPr = tbl.tblPr
    if tblPr is None:
        tblPr = OxmlElement("w:tblPr")
        tbl.insert(0, tblPr)
    tblW = _table_get_or_add_child(tblPr, "w:tblW")
    tblW.set(qn("w:w"), width)
    tblW.set(qn("w:type"), width_type)


def _table_set_cell_shading(tcPr, fill: str) -> None:
    """Give a cell a solid background *fill*, reusing an existing ``w:shd`` when present."""
    shd = _table_get_or_add_child(tcPr, "w:shd")
    shd.set(qn("w:val"), "clear")
    shd.set(qn("w:color"), "auto")
    shd.set(qn("w:fill"), fill)


def apply_table_styles(document, config: Config) -> None:
    """Apply table styling from configuration.

    For every table in *document* this sets the table width (per
    ``config.table.width_mode``), styles cell text with the
    ``table_header`` style for the first row and ``table_cell`` for the
    rest, and sets cell background, padding and borders.

    Existing ``w:tblW``, ``w:shd``, ``w:tcMar`` and ``w:tcBorders`` elements
    are updated in place rather than duplicated. Running the function twice,
    or visiting the same merged cell more than once, therefore leaves one
    element of each kind.
    """
    table_config = config.table

    # Border style mapping
    border_style_map = {
        "single": "single",
        "double": "double",
        "dotted": "dotted",
        "dashed": "dashed",
        "none": "nil",
    }
    border_val = border_style_map.get(table_config.border_style, "single")

    for table in document.tables:
        # Set table width
        if table_config.width_mode == "full":
            table.autofit = False
            table.allow_autofit = False
            # 5000 with type "pct" means 100% of the available width.
            _table_set_width(table, "5000", "pct")
        elif table_config.width_mode == "fixed" and table_config.width_inches:
            table.autofit = False
            # inches to twips
            _table_set_width(table, str(int(table_config.width_inches * 1440)), "dxa")

        for i, row in enumerate(table.rows):
            # The first row is treated as the header.
            style_config = config.get_style("table_header" if i == 0 else "table_cell")

            for cell in row.cells:
                # Apply text styles
                for paragraph in cell.paragraphs:
                    apply_style_to_paragraph(paragraph, style_config)
                    for run in paragraph.runs:
                        apply_style_to_run(run, style_config)

                # Get or create cell properties
                tcPr = cell._tc.get_or_add_tcPr()

                # Background: header color for row 0; for body rows the
                # alternating color on even rows, else the plain cell color.
                if i == 0:
                    fill = table_config.header_background_color
                elif table_config.alternating_row_color and i % 2 == 0:
                    fill = table_config.alternating_row_color
                else:
                    fill = table_config.cell_background_color
                if fill:
                    _table_set_cell_shading(tcPr, fill)

                # Apply cell margins/padding
                tcMar = _table_get_or_add_child(tcPr, "w:tcMar")
                for side, value in [
                    ("top", table_config.cell_padding_top),
                    ("bottom", table_config.cell_padding_bottom),
                    ("left", table_config.cell_padding_left),
                    ("right", table_config.cell_padding_right),
                ]:
                    margin = _table_get_or_add_child(tcMar, f"w:{side}")
                    margin.set(qn("w:w"), str(int(value * 20)))  # points to twips
                    margin.set(qn("w:type"), "dxa")

                # Apply cell borders
                if border_val != "nil":
                    tcBorders = _table_get_or_add_child(tcPr, "w:tcBorders")
                    for side in ["top", "left", "bottom", "right"]:
                        border = _table_get_or_add_child(tcBorders, f"w:{side}")
                        border.set(qn("w:val"), border_val)
                        border.set(qn("w:sz"), str(table_config.border_width))
                        border.set(qn("w:color"), table_config.border_color)
970
+
971
+
972
def add_toc(document, title: str = "目录", max_level: int = 3) -> None:
    """Insert a table of contents ahead of the document's first paragraph.

    Three paragraphs are inserted, in reading order: a title styled as
    ``Heading 1``, a paragraph holding a Word ``TOC`` complex field whose
    visible result is a grey italic placeholder, and a page break that
    separates the TOC from the existing content.

    Args:
        document: python-docx document to modify in place.
        title: Text of the TOC title paragraph.
        max_level: Deepest heading level listed by the TOC field
            (written into the ``\\o "1-N"`` switch).

    Raises:
        IndexError: If the document contains no paragraphs to anchor on.
    """
    anchor = document.paragraphs[0]

    # Every insertion targets the same anchor, so each new paragraph lands
    # directly above the original first paragraph and below the paragraphs
    # inserted earlier. That yields: title -> TOC field -> page break -> body.
    toc_title = anchor.insert_paragraph_before(title)
    toc_title.style = document.styles["Heading 1"]

    toc_paragraph = anchor.insert_paragraph_before("")

    # First run: field begin, instruction text and the separator.
    field_run = toc_paragraph.add_run()

    fld_char_begin = OxmlElement("w:fldChar")
    fld_char_begin.set(qn("w:fldCharType"), "begin")
    field_run._r.append(fld_char_begin)

    instr_text = OxmlElement("w:instrText")
    instr_text.set(qn("xml:space"), "preserve")
    instr_text.text = f' TOC \\o "1-{max_level}" \\h \\z \\u '
    field_run._r.append(instr_text)

    fld_char_separate = OxmlElement("w:fldChar")
    fld_char_separate.set(qn("w:fldCharType"), "separate")
    field_run._r.append(fld_char_separate)

    # Field result: shown until the user updates the field in Word.
    placeholder_run = toc_paragraph.add_run("Right-click here and select 'Update Field' to generate TOC")
    placeholder_run.italic = True
    placeholder_run.font.color.rgb = RGBColor(128, 128, 128)

    # The end marker lives in its own run AFTER the placeholder so that the
    # placeholder sits between "separate" and "end", i.e. inside the field.
    end_run = toc_paragraph.add_run()
    fld_char_end = OxmlElement("w:fldChar")
    fld_char_end.set(qn("w:fldCharType"), "end")
    end_run._r.append(fld_char_end)

    page_break_paragraph = anchor.insert_paragraph_before("")
    page_break_paragraph.add_run().add_break(WD_BREAK.PAGE)

    print_info(f"Added TOC (levels 1-{max_level})")
1006
+
1007
+
1008
def convert(
    markdown_content: str,
    output_path: str | Path,
    config: Config | None = None,
    toc: bool = False,
    toc_title: str = "目录",
    toc_max_level: int = 3,
) -> Path:
    """
    Render Markdown text into a Word document and write it to disk.

    Formulas, code blocks and blockquotes are pulled out of the content
    before the HTML is handed to html4docx and are put back into the
    document afterwards through their placeholders.

    Args:
        markdown_content: Markdown text content
        output_path: Output file path (parent directories are created)
        config: Configuration object (a default ``Config()`` is used if None)
        toc: Whether to add table of contents
        toc_title: TOC title
        toc_max_level: Maximum heading level for TOC

    Returns:
        Path to the output file
    """

    def render(html: str):
        """Build a brand-new document from *html* using a fresh parser."""
        fresh_document = Document()
        HtmlToDocx().add_html_to_document(html, fresh_document)
        return fresh_document

    settings = Config() if config is None else config
    destination = Path(output_path)

    # Pull LaTeX formulas out first so later steps do not touch them
    source_text, formulas = extract_latex_formulas(markdown_content)
    if formulas:
        print_info(f"Detected {len(formulas)} LaTeX formulas")

    # Markdown-level image handling, then Markdown -> HTML
    source_text = process_markdown_images(source_text, settings)
    markdown_extras = ["tables", "cuddled-lists", "fenced-code-blocks", "header-ids"]
    html = markdown2.markdown(source_text, extras=markdown_extras)

    # HTML-level image handling
    html = sanitize_html_images(html, settings)

    # Code blocks are extracted to bypass html4docx's broken handling
    html, code_blocks, inline_codes = extract_code_blocks(html)
    if code_blocks:
        print_info(f"Extracted {len(code_blocks)} code blocks")
    if inline_codes:
        print_info(f"Found {len(inline_codes)} inline code snippets")

    html, blockquotes = extract_blockquotes(html)
    if blockquotes:
        print_info(f"Extracted {len(blockquotes)} blockquotes")

    # Build the document, degrading step by step when images are rejected:
    # as-is -> unrecognized images filtered -> every <img> tag removed
    try:
        document = render(html)
    except UnrecognizedImageError as first_error:
        print_error(f"UnrecognizedImageError, retrying without problematic images: {first_error}")
        html_filtered = filter_unrecognized_images(html)
        try:
            document = render(html_filtered)
        except UnrecognizedImageError as second_error:
            print_error(f"Still failing, removing all images: {second_error}")
            html_without_images = re.sub(r"<img[^>]*>", "", html_filtered, flags=re.IGNORECASE)
            document = render(html_without_images)

    # Put the extracted pieces back in place of their placeholders
    if code_blocks:
        replace_code_block_placeholders(document, code_blocks, settings)
    if blockquotes:
        replace_blockquote_placeholders(document, blockquotes, settings)
    if formulas:
        replace_formula_placeholders(document, formulas)

    apply_styles_to_document(document, settings)

    # Inline code is styled after the global pass so it is not overwritten
    if inline_codes:
        style_inline_code_in_document(document, settings)

    resize_images_in_document(document, settings.max_image_width_inches)

    # The TOC is only added when there is at least one paragraph
    if toc and document.paragraphs:
        add_toc(document, title=toc_title, max_level=toc_max_level)

    destination.parent.mkdir(parents=True, exist_ok=True)
    document.save(str(destination))
    print_info(f"Document saved: {destination}")

    return destination
1116
+
1117
+
1118
def _strip_markdown_fence(text: str) -> str:
    """Return *text* without an enclosing ```markdown ... ``` wrapper, if any."""
    fence_open = "```markdown"
    fence_close = "```"

    # Ignore surrounding whitespace: files usually end with a newline after
    # the closing fence, which would otherwise hide the wrapper.
    body = text.strip()
    if not (body.startswith(fence_open) and body.endswith(fence_close)):
        return text

    opening_line, newline, remainder = body.partition("\n")
    # Only unwrap when the first line is exactly the fence marker; anything
    # else (no newline, or extra text on that line) is left untouched rather
    # than guessing how many characters to drop.
    if not newline or opening_line.strip() != fence_open:
        return text

    return remainder[: -len(fence_close)]


def convert_file(
    input_path: str | Path,
    output_path: str | Path | None = None,
    config: Config | str | Path | None = None,
    toc: bool = False,
    toc_title: str = "目录",
    toc_max_level: int = 3,
) -> Path:
    """
    Convert Markdown file to Word document.

    The file is read as UTF-8. If the whole content is wrapped in a
    ```markdown fenced block, the wrapper is removed before conversion.

    Args:
        input_path: Input Markdown file path
        output_path: Output file path (defaults to input with .docx extension)
        config: Configuration object or path to config file; a path is
            loaded with ``Config.from_file`` and None means ``Config()``
        toc: Whether to add table of contents
        toc_title: TOC title
        toc_max_level: Maximum heading level for TOC

    Returns:
        Path to the output file

    Raises:
        FileNotFoundError: If ``input_path`` does not exist
    """
    input_path = Path(input_path)
    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    if output_path is None:
        output_path = input_path.with_suffix(".docx")
    else:
        output_path = Path(output_path)

    # Load config
    if config is None:
        config = Config()
    elif isinstance(config, (str, Path)):
        config = Config.from_file(config)

    # Read markdown content and drop a whole-file code block wrapper if present
    markdown_content = _strip_markdown_fence(input_path.read_text(encoding="utf-8"))

    return convert(
        markdown_content,
        output_path,
        config,
        toc=toc,
        toc_title=toc_title,
        toc_max_level=toc_max_level,
    )