md2word 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- md2word/__init__.py +22 -0
- md2word/__main__.py +91 -0
- md2word/config.py +351 -0
- md2word/converter.py +1169 -0
- md2word/latex.py +237 -0
- md2word-0.1.0.dist-info/METADATA +272 -0
- md2word-0.1.0.dist-info/RECORD +10 -0
- md2word-0.1.0.dist-info/WHEEL +4 -0
- md2word-0.1.0.dist-info/entry_points.txt +2 -0
- md2word-0.1.0.dist-info/licenses/LICENSE +21 -0
md2word/converter.py
ADDED
@@ -0,0 +1,1169 @@
"""
Core converter module for md2word.
Converts Markdown content to Word documents.
"""

from __future__ import annotations

import base64
import re
import uuid
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING

import httpx
import markdown2
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_BREAK, WD_LINE_SPACING
from docx.image.exceptions import UnrecognizedImageError
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Inches, Pt, RGBColor
from html4docx import HtmlToDocx
from PIL import Image

from .config import Config, StyleConfig, TableConfig
from .latex import extract_latex_formulas, replace_formula_placeholders

if TYPE_CHECKING:
    pass


def print_info(message: str) -> None:
    """Print info message."""
    print(f"[INFO] {message}")


def print_error(message: str) -> None:
    """Print error message."""
    print(f"[ERROR] {message}")


def hex_to_rgb(hex_color: str) -> tuple[int, int, int]:
    """Convert hex color to RGB tuple."""
    hex_color = hex_color.lstrip("#")
    r = int(hex_color[0:2], 16)
    g = int(hex_color[2:4], 16)
    b = int(hex_color[4:6], 16)
    return (r, g, b)


# Chinese number mapping
CHINESE_NUMBERS = [
    "零",
    "一",
    "二",
    "三",
    "四",
    "五",
    "六",
    "七",
    "八",
    "九",
    "十",
    "十一",
    "十二",
    "十三",
    "十四",
    "十五",
    "十六",
    "十七",
    "十八",
    "十九",
    "二十",
]


def number_to_chinese(n: int) -> str:
    """Convert number to Chinese."""
    if n <= 20:
        return CHINESE_NUMBERS[n]
    return str(n)


class HeadingNumbering:
    """Heading numbering manager."""

    FORMATS = {
        "chapter": "第{n}章",
        "section": "第{n}节",
        "chinese": "{n}、",
        "chinese_paren": "({n})",
        "arabic": "{n}.",
        "arabic_paren": "({n})",
        "arabic_bracket": "[{n}]",
        "roman": "{n}.",
        "roman_lower": "{n}.",
        "letter": "{n}.",
        "letter_lower": "{n}.",
        "circle": "{n}",
        "none": "",
    }

    ROMAN_NUMERALS = [
        "",
        "I",
        "II",
        "III",
        "IV",
        "V",
        "VI",
        "VII",
        "VIII",
        "IX",
        "X",
        "XI",
        "XII",
        "XIII",
        "XIV",
        "XV",
        "XVI",
        "XVII",
        "XVIII",
        "XIX",
        "XX",
    ]

    CIRCLE_NUMBERS = [
        "⓪",
        "①",
        "②",
        "③",
        "④",
        "⑤",
        "⑥",
        "⑦",
        "⑧",
        "⑨",
        "⑩",
        "⑪",
        "⑫",
        "⑬",
        "⑭",
        "⑮",
        "⑯",
        "⑰",
        "⑱",
        "⑲",
        "⑳",
    ]

    def __init__(self):
        self.counters = {}

    def reset(self, level: int | None = None):
        """Reset counters."""
        if level is None:
            self.counters = {}
        else:
            for lvl in list(self.counters.keys()):
                if lvl >= level:
                    self.counters[lvl] = 0

    def get_number(self, level: int, format_name: str | None) -> str:
        """Get numbering for specified level."""
        if not format_name or format_name == "none":
            return ""

        if level not in self.counters:
            self.counters[level] = 0
        self.counters[level] += 1

        for lvl in list(self.counters.keys()):
            if lvl > level:
                self.counters[lvl] = 0

        n = self.counters[level]

        if format_name in ("chapter", "section"):
            chinese_n = number_to_chinese(n)
            return self.FORMATS[format_name].format(n=chinese_n)
        elif format_name in ("chinese", "chinese_paren"):
            chinese_n = number_to_chinese(n)
            return self.FORMATS[format_name].format(n=chinese_n)
        elif format_name in ("arabic", "arabic_paren", "arabic_bracket"):
            return self.FORMATS[format_name].format(n=n)
        elif format_name == "roman":
            roman = self.ROMAN_NUMERALS[n] if n <= 20 else str(n)
            return f"{roman}."
        elif format_name == "roman_lower":
            roman = self.ROMAN_NUMERALS[n].lower() if n <= 20 else str(n)
            return f"{roman}."
        elif format_name == "letter":
            letter = chr(ord("A") + n - 1) if n <= 26 else str(n)
            return f"{letter}."
        elif format_name == "letter_lower":
            letter = chr(ord("a") + n - 1) if n <= 26 else str(n)
            return f"{letter}."
        elif format_name == "circle":
            return self.CIRCLE_NUMBERS[n] if n <= 20 else f"({n})"
        else:
            try:
                return format_name.format(n=n, cn=number_to_chinese(n))
            except (KeyError, ValueError):
                return f"{n}. "


# Image processing functions
def process_image_content(image_content: bytes, url: str, local_dir: str = "./images") -> str:
    """Process image content, convert format and save, return local path."""
    Path(local_dir).mkdir(parents=True, exist_ok=True)

    image = Image.open(BytesIO(image_content))
    original_format = image.format.lower() if image.format else "png"

    supported_formats = ["png", "jpeg", "jpg"]
    if original_format not in supported_formats:
        if image.mode in ("RGBA", "LA") or "transparency" in image.info:
            target_format = "png"
        else:
            target_format = "jpeg"
    else:
        target_format = original_format

    url_filename = url.split("/")[-1].split("?")[0]
    name_without_ext = Path(url_filename).stem if url_filename else str(uuid.uuid4())
    local_filename = f"{name_without_ext}.{target_format}"
    local_path = Path(local_dir) / local_filename

    if original_format != target_format:
        if target_format == "jpeg" and image.mode in ("RGBA", "LA"):
            background = Image.new("RGB", image.size, (255, 255, 255))
            if image.mode == "RGBA":
                background.paste(image, mask=image.split()[-1])
            else:
                background.paste(image)
            image = background

        image.save(local_path, format=target_format.upper())
        print_info(f"Downloaded and converted image: {url} ({original_format} -> {target_format}) -> {local_path}")
    else:
        with open(local_path, "wb") as f:
            f.write(image_content)
        print_info(f"Downloaded image: {url} -> {local_path}")

    return str(local_path)

def download_image(url: str, config: Config) -> str | None:
    """Download image and return local file path."""
    local_dir = config.image_local_dir
    headers = {"User-Agent": config.image_user_agent}
    timeout = config.image_download_timeout

    try:
        Path(local_dir).mkdir(parents=True, exist_ok=True)

        with httpx.Client() as client:
            response = client.get(url, timeout=timeout, headers=headers, follow_redirects=True)
            response.raise_for_status()
            image_content = response.content

        return process_image_content(image_content, url, local_dir=local_dir)
    except Exception as e:
        print_error(f"Failed to download image {url}: {e}")
        return None

def ensure_local_image_compatible(image_path: str, local_dir: str = "./images") -> str | None:
    """Ensure local image is in docx-supported format."""
    path = Path(image_path)
    if not path.exists():
        print_error(f"Local image not found: {image_path}")
        return None

    try:
        image_content = path.read_bytes()
    except Exception as e:
        print_error(f"Failed to read local image {image_path}: {e}")
        return None

    try:
        image = Image.open(BytesIO(image_content))
        original_format = image.format.lower() if image.format else "png"
        image.verify()
    except Exception as e:
        print_error(f"Cannot recognize local image {image_path}: {e}")
        return None

    if original_format in ("png", "jpeg", "jpg"):
        return str(path)

    try:
        return process_image_content(image_content, path.name, local_dir=local_dir)
    except Exception as e:
        print_error(f"Failed to convert local image {image_path}: {e}")
        return None

def decode_data_uri_image(data_uri: str, local_dir: str = "./images") -> str | None:
    """Decode data URI and save as local image."""
    if not data_uri.startswith("data:"):
        return None
    if "base64," not in data_uri:
        return None
    try:
        _, b64_data = data_uri.split("base64,", 1)
        image_content = base64.b64decode(b64_data)
    except Exception as e:
        print_error(f"Failed to decode data URI: {e}")
        return None

    try:
        name_hint = f"inline_{uuid.uuid4().hex}"
        return process_image_content(image_content, name_hint, local_dir=local_dir)
    except Exception as e:
        print_error(f"Failed to process data URI image: {e}")
        return None

def _extract_img_attr(tag: str, attr: str) -> str | None:
    """Extract attribute from img tag."""
    match = re.search(rf'{attr}\s*=\s*(["\'])(.*?)\1', tag, flags=re.IGNORECASE)
    if match:
        return match.group(2)
    match = re.search(rf"{attr}\s*=\s*([^\s>]+)", tag, flags=re.IGNORECASE)
    if match:
        return match.group(1)
    return None


def _replace_img_src(tag: str, new_src: str) -> str:
    """Replace src attribute in img tag."""
    replacement = f'src="{new_src}"'
    updated = re.sub(r'\bsrc\s*=\s*([\'"])(.*?)\1', lambda m: replacement, tag, flags=re.IGNORECASE)
    if updated != tag:
        return updated
    updated = re.sub(r"\bsrc\s*=\s*([^\s>]+)", lambda m: replacement, tag, flags=re.IGNORECASE)
    if updated != tag:
        return updated
    alt = _extract_img_attr(tag, "alt")
    if alt:
        return f'<img src="{new_src}" alt="{alt}">'
    return f'<img src="{new_src}">'

def sanitize_html_images(html_content: str, config: Config) -> str:
    """Process images in HTML, ensure they are usable."""
    img_pattern = re.compile(r"<img\b[^>]*>", flags=re.IGNORECASE)
    local_dir = config.image_local_dir

    def replace_img(match):
        tag = match.group(0)
        src = _extract_img_attr(tag, "src")
        alt = _extract_img_attr(tag, "alt") or ""

        if not src:
            return alt

        if src.startswith(("http://", "https://")):
            local_path = download_image(src, config)
            if local_path:
                return _replace_img_src(tag, local_path)
            print_info(f"Image download failed, skipping: {src}")
            return alt

        if src.startswith("data:"):
            local_path = decode_data_uri_image(src, local_dir=local_dir)
            if local_path:
                return _replace_img_src(tag, local_path)
            print_info("Data URI image processing failed, skipping")
            return alt

        compatible_path = ensure_local_image_compatible(src, local_dir=local_dir)
        if compatible_path:
            return _replace_img_src(tag, compatible_path)

        print_info(f"Local image unavailable, skipping: {src}")
        return alt

    return img_pattern.sub(replace_img, html_content)


def is_docx_image_supported(image_path: str) -> bool:
    """Check if image can be recognized by docx."""
    try:
        test_doc = Document()
        test_doc.add_picture(image_path)
        return True
    except UnrecognizedImageError:
        return False
    except Exception as e:
        print_error(f"Failed to check image {image_path}: {e}")
        return False

def extract_blockquotes(html_content: str) -> tuple[str, list[str]]:
    """Extract blockquotes from HTML and mark with placeholders.

    Returns:
        Tuple of (modified HTML, list of blockquote texts)
    """
    blockquotes = []

    def save_blockquote(match):
        block_html = match.group(0)
        # Extract text content from blockquote
        # Remove HTML tags but keep the text
        text = re.sub(r"<[^>]+>", "", block_html)
        text = text.strip()
        # Decode HTML entities
        text = text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
        text = text.replace("&quot;", '"').replace("&#39;", "'")

        blockquotes.append(text)
        placeholder = f"__BLOCKQUOTE_PLACEHOLDER_{len(blockquotes) - 1}__"
        return f"<p>{placeholder}</p>"

    # Extract blockquotes
    html_content = re.sub(
        r"<blockquote[^>]*>.*?</blockquote>",
        save_blockquote,
        html_content,
        flags=re.DOTALL | re.IGNORECASE,
    )

    return html_content, blockquotes

def replace_blockquote_placeholders(document, blockquotes: list[str], config: Config) -> None:
    """Replace blockquote placeholders with styled paragraphs."""
    if not blockquotes:
        return

    style_config = config.get_style("blockquote")

    for i, paragraph in enumerate(document.paragraphs):
        text = paragraph.text.strip()
        for idx, quote_text in enumerate(blockquotes):
            placeholder = f"__BLOCKQUOTE_PLACEHOLDER_{idx}__"
            if text == placeholder:
                # Clear and rebuild paragraph
                paragraph.clear()
                run = paragraph.add_run(quote_text)

                # Apply blockquote style to run
                run.font.name = style_config.font_name
                run.font.size = Pt(style_config.font_size)
                run.font.italic = style_config.italic
                run.font.bold = style_config.bold

                # Set color
                r, g, b = hex_to_rgb(style_config.color)
                run.font.color.rgb = RGBColor(r, g, b)

                # Set East Asian font
                rPr = run._element.get_or_add_rPr()
                rFonts = rPr.get_or_add_rFonts()
                rFonts.set(qn("w:eastAsia"), style_config.font_name)

                # Apply paragraph formatting
                apply_style_to_paragraph(paragraph, style_config)

                # Add left border for blockquote visual effect
                pPr = paragraph._element.get_or_add_pPr()
                pBdr = OxmlElement("w:pBdr")
                left_border = OxmlElement("w:left")
                left_border.set(qn("w:val"), "single")
                left_border.set(qn("w:sz"), "24")  # Border width
                left_border.set(qn("w:space"), "4")  # Space between border and text
                left_border.set(qn("w:color"), style_config.color)
                pBdr.append(left_border)
                pPr.append(pBdr)

                print_info(f"Styled blockquote ({len(quote_text)} chars)")
                break

def extract_code_blocks(html_content: str) -> tuple[str, list[dict], list[str]]:
    """Extract code blocks from HTML and replace with placeholders.

    Returns:
        Tuple of (modified HTML, list of code block info dicts, list of inline codes)
    """
    code_blocks = []
    inline_codes = []

    def save_code_block(match):
        block_html = match.group(0)
        # Remove all span tags (syntax highlighting)
        clean_content = re.sub(r"<span[^>]*>", "", block_html)
        clean_content = re.sub(r"</span>", "", clean_content)
        # Extract the code content
        code_match = re.search(r"<code[^>]*>(.*?)</code>", clean_content, flags=re.DOTALL | re.IGNORECASE)
        if code_match:
            code_text = code_match.group(1)
            # Decode HTML entities
            code_text = code_text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
            code_text = code_text.replace("&quot;", '"').replace("&#39;", "'")
        else:
            # Fallback: extract text between pre tags
            pre_match = re.search(r"<pre[^>]*>(.*?)</pre>", clean_content, flags=re.DOTALL | re.IGNORECASE)
            code_text = pre_match.group(1) if pre_match else ""

        code_blocks.append({
            "code": code_text.strip(),
            "placeholder": f"__CODE_BLOCK_PLACEHOLDER_{len(code_blocks)}__"
        })
        return f'<p>{code_blocks[-1]["placeholder"]}</p>'

    # Extract code blocks wrapped in codehilite div
    html_content = re.sub(
        r"<div[^>]*class=\"codehilite\"[^>]*>\s*<pre[^>]*>.*?</pre>\s*</div>",
        save_code_block,
        html_content,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Extract standalone pre blocks
    html_content = re.sub(
        r"<pre[^>]*>.*?</pre>",
        save_code_block,
        html_content,
        flags=re.DOTALL | re.IGNORECASE,
    )

    # Mark inline code with special markers
    def mark_inline_code(match):
        code_text = match.group(1)
        # Decode HTML entities
        code_text = code_text.replace("&lt;", "<").replace("&gt;", ">").replace("&amp;", "&")
        code_text = code_text.replace("&quot;", '"').replace("&#39;", "'")
        inline_codes.append(code_text)
        return f"⟦CODE⟧{code_text}⟦/CODE⟧"

    html_content = re.sub(
        r"<code>([^<]*)</code>",
        mark_inline_code,
        html_content,
        flags=re.IGNORECASE,
    )

    return html_content, code_blocks, inline_codes

def add_code_block_to_document(paragraph, code_text: str, config: Config) -> None:
    """Replace a placeholder paragraph with properly formatted code block."""
    from docx.oxml.ns import qn
    from docx.oxml import OxmlElement

    code_style = config.get_style("code")
    font_name = code_style.font_name
    font_size = code_style.font_size
    bg_color = code_style.background_color or "f5f5f5"

    # Clear existing content
    paragraph.clear()

    # Add code lines
    lines = code_text.split("\n")
    for i, line in enumerate(lines):
        run = paragraph.add_run(line)
        run.font.name = font_name
        run.font.size = Pt(font_size)
        # Set East Asian font
        run._element.rPr.rFonts.set(qn("w:eastAsia"), font_name)

        # Add line break except for last line
        if i < len(lines) - 1:
            run.add_break()

    # Set paragraph formatting
    pf = paragraph.paragraph_format
    pf.space_before = Pt(6)
    pf.space_after = Pt(6)
    pf.line_spacing = 1.0

    # Add shading (background color)
    pPr = paragraph._element.get_or_add_pPr()
    shd = OxmlElement("w:shd")
    shd.set(qn("w:val"), "clear")
    shd.set(qn("w:color"), "auto")
    shd.set(qn("w:fill"), bg_color)
    pPr.append(shd)

def replace_code_block_placeholders(document, code_blocks: list[dict], config: Config) -> None:
    """Replace code block placeholders in document with formatted code."""
    if not code_blocks:
        return

    placeholder_map = {block["placeholder"]: block["code"] for block in code_blocks}

    for paragraph in document.paragraphs:
        text = paragraph.text.strip()
        if text in placeholder_map:
            add_code_block_to_document(paragraph, placeholder_map[text], config)
            print_info(f"Added code block ({len(placeholder_map[text])} chars)")

def style_inline_code_in_document(document, config: Config) -> None:
    """Find and style inline code marked with special markers."""
    code_style = config.get_style("code")
    body_style = config.get_style("body")
    code_font_name = code_style.font_name
    code_font_size = code_style.font_size
    bg_color = code_style.background_color or "f5f5f5"

    inline_code_pattern = re.compile(r"⟦CODE⟧(.*?)⟦/CODE⟧")

    for paragraph in document.paragraphs:
        # Check if paragraph contains inline code markers
        full_text = paragraph.text
        if "⟦CODE⟧" not in full_text:
            continue

        # We need to rebuild the paragraph with styled inline code
        matches = list(inline_code_pattern.finditer(full_text))
        if not matches:
            continue

        # Clear paragraph
        paragraph.clear()

        # Process text and add runs
        last_end = 0
        for match in matches:
            # Add text before the code (with body style)
            if match.start() > last_end:
                before_text = full_text[last_end:match.start()]
                if before_text:
                    run = paragraph.add_run(before_text)
                    run.font.name = body_style.font_name
                    run.font.size = Pt(body_style.font_size)
                    # Set East Asian font for Chinese
                    rPr = run._element.get_or_add_rPr()
                    rFonts = rPr.get_or_add_rFonts()
                    rFonts.set(qn("w:eastAsia"), body_style.font_name)

            # Add the code with special styling
            code_text = match.group(1)
            code_run = paragraph.add_run(code_text)
            code_run.font.name = code_font_name
            code_run.font.size = Pt(code_font_size)
            # Set East Asian font for code
            code_rPr = code_run._element.get_or_add_rPr()
            code_rFonts = code_rPr.get_or_add_rFonts()
            code_rFonts.set(qn("w:eastAsia"), code_font_name)
            # Add shading to run
            shd = OxmlElement("w:shd")
            shd.set(qn("w:val"), "clear")
            shd.set(qn("w:color"), "auto")
            shd.set(qn("w:fill"), bg_color)
            code_rPr.append(shd)

            last_end = match.end()

        # Add remaining text (with body style)
        if last_end < len(full_text):
            remaining_text = full_text[last_end:]
            if remaining_text:
                run = paragraph.add_run(remaining_text)
                run.font.name = body_style.font_name
                run.font.size = Pt(body_style.font_size)
                # Set East Asian font for Chinese
                rPr = run._element.get_or_add_rPr()
                rFonts = rPr.get_or_add_rFonts()
                rFonts.set(qn("w:eastAsia"), body_style.font_name)

def filter_unrecognized_images(html_content: str) -> str:
    """Remove image tags that docx cannot recognize."""
    img_pattern = re.compile(r"<img\b[^>]*>", flags=re.IGNORECASE)

    def replace_img(match):
        tag = match.group(0)
        src = _extract_img_attr(tag, "src")
        alt = _extract_img_attr(tag, "alt") or ""

        if not src:
            return alt

        if src.startswith(("http://", "https://", "data:")):
            print_info(f"Unprocessed image link, skipping: {src}")
            return alt

        if not is_docx_image_supported(src):
            print_info(f"Image cannot be recognized, skipping: {src}")
            return alt

        return tag

    return img_pattern.sub(replace_img, html_content)

def process_markdown_images(markdown_content: str, config: Config) -> str:
    """Process image links in markdown, download to local and replace paths."""
    image_pattern = r"!\[([^\]]*)\]\(([^)]+)\)"

    def replace_image(match):
        alt_text = match.group(1)
        image_url = match.group(2)

        if image_url.startswith(("http://", "https://")):
            local_path = download_image(image_url, config)
            if local_path:
                return f"![{alt_text}]({local_path})"
            else:
                print_info(f"Image download failed, skipping: {image_url}")
                return alt_text or ""
        else:
            return match.group(0)

    return re.sub(image_pattern, replace_image, markdown_content)

def resize_images_in_document(document, max_width_inches: float = 6.0) -> None:
    """Resize all images in document to fit page width."""
    try:
        for shape in document.inline_shapes:
            if hasattr(shape, "type") and "PICTURE" in str(shape.type):
                current_width_inches = shape.width.inches
                current_height_inches = shape.height.inches

                if current_width_inches > max_width_inches:
                    scale_ratio = max_width_inches / current_width_inches
                    new_height_inches = current_height_inches * scale_ratio

                    shape.width = Inches(max_width_inches)
                    shape.height = Inches(new_height_inches)

                    print_info(
                        f"Resized image: {current_width_inches:.2f}x{current_height_inches:.2f} -> "
                        f"{max_width_inches:.2f}x{new_height_inches:.2f} inches"
                    )
    except Exception as e:
        print_error(f"Error resizing images: {e}")


# Style application functions
def apply_style_to_run(run, style_config: StyleConfig) -> None:
    """Apply style configuration to run."""
    run.font.name = style_config.font_name
    run.font.size = Pt(style_config.font_size)
    # Preserve existing bold/italic formatting from HTML conversion
    run.font.bold = run.font.bold or style_config.bold
    run.font.italic = run.font.italic or style_config.italic

    r, g, b = hex_to_rgb(style_config.color)
    run.font.color.rgb = RGBColor(r, g, b)

    # Set Chinese font
    if run._element.rPr is not None:
        rFonts = run._element.rPr.rFonts
        if rFonts is not None:
            rFonts.set(qn("w:eastAsia"), style_config.font_name)

def apply_style_to_paragraph(paragraph, style_config: StyleConfig) -> None:
    """Apply style configuration to paragraph."""
    pf = paragraph.paragraph_format
    pf.space_before = Pt(style_config.space_before)
    pf.space_after = Pt(style_config.space_after)

    # Alignment
    alignment_map = {
        "left": WD_ALIGN_PARAGRAPH.LEFT,
        "center": WD_ALIGN_PARAGRAPH.CENTER,
        "right": WD_ALIGN_PARAGRAPH.RIGHT,
        "justify": WD_ALIGN_PARAGRAPH.JUSTIFY,
    }
    if style_config.alignment in alignment_map:
        pf.alignment = alignment_map[style_config.alignment]

    # Line spacing
    if style_config.line_spacing_rule == "exact" and style_config.line_spacing_value:
        pf.line_spacing_rule = WD_LINE_SPACING.EXACTLY
        pf.line_spacing = Pt(style_config.line_spacing_value)
    elif style_config.line_spacing_rule == "at_least" and style_config.line_spacing_value:
        pf.line_spacing_rule = WD_LINE_SPACING.AT_LEAST
        pf.line_spacing = Pt(style_config.line_spacing_value)
    elif style_config.line_spacing_rule == "single":
        pf.line_spacing_rule = WD_LINE_SPACING.SINGLE
    elif style_config.line_spacing_rule == "1.5":
        pf.line_spacing_rule = WD_LINE_SPACING.ONE_POINT_FIVE
    elif style_config.line_spacing_rule == "double":
        pf.line_spacing_rule = WD_LINE_SPACING.DOUBLE
    elif style_config.line_spacing_rule == "multiple":
        pf.line_spacing_rule = WD_LINE_SPACING.MULTIPLE
        if style_config.line_spacing_value:
            pf.line_spacing = style_config.line_spacing_value
        elif style_config.line_spacing > 0:
            pf.line_spacing = style_config.line_spacing
    elif style_config.line_spacing > 0:
        pf.line_spacing = style_config.line_spacing

    # Left indent
    if style_config.left_indent > 0:
        pf.left_indent = Inches(style_config.left_indent)

    # First line indent (in characters)
    if style_config.first_line_indent > 0:
        indent_pt = style_config.first_line_indent * style_config.font_size
        pf.first_line_indent = Pt(indent_pt)

def get_heading_level(paragraph) -> int | None:
    """Get heading level of paragraph, returns None if not a heading."""
    style_name = paragraph.style.name if paragraph.style else ""
    if style_name.startswith("Heading"):
        try:
            return int(style_name.replace("Heading ", "").replace("Heading", ""))
        except ValueError:
            return None
    return None

def is_code_block_paragraph(paragraph) -> bool:
    """Check if paragraph is a code block (has shading)."""
    pPr = paragraph._element.pPr
    if pPr is not None:
        shd = pPr.find(qn("w:shd"))
        if shd is not None:
            fill = shd.get(qn("w:fill"))
            # Check if it has a background fill (code blocks have gray background)
            if fill and fill.lower() not in ("auto", "ffffff", "none"):
                return True
    return False

def apply_styles_to_document(document, config: Config) -> None:
    """Apply style configuration to document."""
    numbering = HeadingNumbering()

    for paragraph in document.paragraphs:
        # Skip code block paragraphs (they already have their own styling)
        if is_code_block_paragraph(paragraph):
            continue

        heading_level = get_heading_level(paragraph)

        if heading_level is not None:
            style_name = f"heading_{heading_level}"
            style_config = config.get_style(style_name)

            # Add heading numbering
            if style_config.numbering_format and paragraph.runs:
                number_text = numbering.get_number(heading_level, style_config.numbering_format)
                if number_text:
                    first_run = paragraph.runs[0]
                    original_text = first_run.text
                    first_run.text = number_text + original_text
        else:
            style_config = config.get_style("body")

        apply_style_to_paragraph(paragraph, style_config)

        for run in paragraph.runs:
            apply_style_to_run(run, style_config)

    # Process tables
    apply_table_styles(document, config)

def apply_table_styles(document, config: Config) -> None:
    """Apply table styling from configuration."""
    from docx.shared import Twips

    table_config = config.table

    # Border style mapping
    border_style_map = {
        "single": "single",
        "double": "double",
        "dotted": "dotted",
        "dashed": "dashed",
        "none": "nil",
    }
    border_val = border_style_map.get(table_config.border_style, "single")

    for table in document.tables:
        # Set table width
        if table_config.width_mode == "full":
            table.autofit = False
            table.allow_autofit = False
            # Set table width to 100% of page width
            tbl = table._tbl
            tblPr = tbl.tblPr if tbl.tblPr is not None else OxmlElement("w:tblPr")
            tblW = OxmlElement("w:tblW")
            tblW.set(qn("w:w"), "5000")
            tblW.set(qn("w:type"), "pct")  # percentage
            tblPr.append(tblW)
            if tbl.tblPr is None:
                tbl.insert(0, tblPr)
        elif table_config.width_mode == "fixed" and table_config.width_inches:
            table.autofit = False
            tbl = table._tbl
            tblPr = tbl.tblPr if tbl.tblPr is not None else OxmlElement("w:tblPr")
            tblW = OxmlElement("w:tblW")
            tblW.set(qn("w:w"), str(int(table_config.width_inches * 1440)))  # inches to twips
            tblW.set(qn("w:type"), "dxa")
            tblPr.append(tblW)
            if tbl.tblPr is None:
                tbl.insert(0, tblPr)

        # Apply borders and cell styles
        for i, row in enumerate(table.rows):
            for j, cell in enumerate(row.cells):
                # Apply text styles
                for paragraph in cell.paragraphs:
                    if i == 0:
                        style_config = config.get_style("table_header")
                    else:
                        style_config = config.get_style("table_cell")

                    apply_style_to_paragraph(paragraph, style_config)
                    for run in paragraph.runs:
                        apply_style_to_run(run, style_config)

                # Get or create cell properties
                tc = cell._tc
                tcPr = tc.get_or_add_tcPr()

                # Apply cell background color
                if i == 0 and table_config.header_background_color:
                    shd = OxmlElement("w:shd")
                    shd.set(qn("w:val"), "clear")
                    shd.set(qn("w:color"), "auto")
                    shd.set(qn("w:fill"), table_config.header_background_color)
                    tcPr.append(shd)
                elif i > 0:
                    # Alternating row colors
                    if table_config.alternating_row_color and i % 2 == 0:
                        shd = OxmlElement("w:shd")
                        shd.set(qn("w:val"), "clear")
                        shd.set(qn("w:color"), "auto")
                        shd.set(qn("w:fill"), table_config.alternating_row_color)
                        tcPr.append(shd)
                    elif table_config.cell_background_color:
                        shd = OxmlElement("w:shd")
                        shd.set(qn("w:val"), "clear")
                        shd.set(qn("w:color"), "auto")
                        shd.set(qn("w:fill"), table_config.cell_background_color)
                        tcPr.append(shd)

                # Apply cell margins/padding
                tcMar = OxmlElement("w:tcMar")
                for side, value in [
                    ("top", table_config.cell_padding_top),
                    ("bottom", table_config.cell_padding_bottom),
                    ("left", table_config.cell_padding_left),
                    ("right", table_config.cell_padding_right),
                ]:
                    margin = OxmlElement(f"w:{side}")
                    margin.set(qn("w:w"), str(int(value * 20)))  # points to twips
                    margin.set(qn("w:type"), "dxa")
                    tcMar.append(margin)
                tcPr.append(tcMar)

                # Apply cell borders
                if border_val != "nil":
                    tcBorders = OxmlElement("w:tcBorders")
                    for side in ["top", "left", "bottom", "right"]:
                        border = OxmlElement(f"w:{side}")
                        border.set(qn("w:val"), border_val)
                        border.set(qn("w:sz"), str(table_config.border_width))
                        border.set(qn("w:color"), table_config.border_color)
                        tcBorders.append(border)
                    tcPr.append(tcBorders)

def add_toc(document, title: str = "目录", max_level: int = 3) -> None:
    """Add table of contents at the beginning of document."""
    toc_title = document.paragraphs[0].insert_paragraph_before(title)
    toc_title.style = document.styles["Heading 1"]

    toc_paragraph = toc_title.insert_paragraph_before("")
    run = toc_paragraph.add_run()

    fld_char_begin = OxmlElement("w:fldChar")
    fld_char_begin.set(qn("w:fldCharType"), "begin")
    run._r.append(fld_char_begin)

    instr_text = OxmlElement("w:instrText")
    instr_text.set(qn("xml:space"), "preserve")
    instr_text.text = f' TOC \\o "1-{max_level}" \\h \\z \\u '
    run._r.append(instr_text)

    fld_char_separate = OxmlElement("w:fldChar")
    fld_char_separate.set(qn("w:fldCharType"), "separate")
    run._r.append(fld_char_separate)

    placeholder_run = toc_paragraph.add_run("Right-click here and select 'Update Field' to generate TOC")
    placeholder_run.italic = True
    placeholder_run.font.color.rgb = RGBColor(128, 128, 128)

    fld_char_end = OxmlElement("w:fldChar")
    fld_char_end.set(qn("w:fldCharType"), "end")
    run._r.append(fld_char_end)

    page_break_paragraph = toc_title.insert_paragraph_before("")
    page_break_run = page_break_paragraph.add_run()
    page_break_run.add_break(WD_BREAK.PAGE)

    print_info(f"Added TOC (levels 1-{max_level})")

def convert(
    markdown_content: str,
    output_path: str | Path,
    config: Config | None = None,
    toc: bool = False,
    toc_title: str = "目录",
    toc_max_level: int = 3,
) -> Path:
    """
    Convert Markdown content to Word document.

    Args:
        markdown_content: Markdown text content
        output_path: Output file path
        config: Configuration object (uses defaults if None)
        toc: Whether to add table of contents
        toc_title: TOC title
        toc_max_level: Maximum heading level for TOC

    Returns:
        Path to the output file
    """
    if config is None:
        config = Config()

    output_path = Path(output_path)

    # Extract LaTeX formulas
    processed_content, formulas = extract_latex_formulas(markdown_content)
    if formulas:
        print_info(f"Detected {len(formulas)} LaTeX formulas")

    # Process markdown images
    processed_content = process_markdown_images(processed_content, config)

    # Convert to HTML
    html_content = markdown2.markdown(
        processed_content,
        extras=["tables", "cuddled-lists", "fenced-code-blocks", "header-ids"],
    )

    # Process HTML images
    html_content = sanitize_html_images(html_content, config)

    # Extract code blocks (to bypass html4docx's broken handling)
    html_content, code_blocks, inline_codes = extract_code_blocks(html_content)
    if code_blocks:
        print_info(f"Extracted {len(code_blocks)} code blocks")
    if inline_codes:
        print_info(f"Found {len(inline_codes)} inline code snippets")

    # Extract blockquotes
    html_content, blockquotes = extract_blockquotes(html_content)
    if blockquotes:
        print_info(f"Extracted {len(blockquotes)} blockquotes")

    # Create Word document
    document = Document()
    new_parser = HtmlToDocx()

    try:
        new_parser.add_html_to_document(html_content, document)
    except UnrecognizedImageError as e:
        print_error(f"UnrecognizedImageError, retrying without problematic images: {e}")
        html_filtered = filter_unrecognized_images(html_content)
        document = Document()
        new_parser = HtmlToDocx()
        try:
            new_parser.add_html_to_document(html_filtered, document)
        except UnrecognizedImageError as e2:
            print_error(f"Still failing, removing all images: {e2}")
            html_without_images = re.sub(r"<img[^>]*>", "", html_filtered, flags=re.IGNORECASE)
            document = Document()
            new_parser = HtmlToDocx()
            new_parser.add_html_to_document(html_without_images, document)

    # Replace code block placeholders
    if code_blocks:
        replace_code_block_placeholders(document, code_blocks, config)

    # Replace blockquote placeholders
    if blockquotes:
        replace_blockquote_placeholders(document, blockquotes, config)

    # Replace formula placeholders
    if formulas:
        replace_formula_placeholders(document, formulas)

    # Apply styles
    apply_styles_to_document(document, config)

    # Style inline code (must be after apply_styles_to_document to avoid being overwritten)
    if inline_codes:
        style_inline_code_in_document(document, config)

    # Resize images
    resize_images_in_document(document, config.max_image_width_inches)

    # Add TOC
    if toc and len(document.paragraphs) > 0:
        add_toc(document, title=toc_title, max_level=toc_max_level)

    # Save document
    output_path.parent.mkdir(parents=True, exist_ok=True)
    document.save(str(output_path))
    print_info(f"Document saved: {output_path}")

    return output_path

def convert_file(
    input_path: str | Path,
    output_path: str | Path | None = None,
    config: Config | str | Path | None = None,
    toc: bool = False,
    toc_title: str = "目录",
    toc_max_level: int = 3,
) -> Path:
    """
    Convert Markdown file to Word document.

    Args:
        input_path: Input Markdown file path
        output_path: Output file path (defaults to input with .docx extension)
        config: Configuration object or path to config file
        toc: Whether to add table of contents
        toc_title: TOC title
        toc_max_level: Maximum heading level for TOC

    Returns:
        Path to the output file
    """
    input_path = Path(input_path)
    if not input_path.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    if output_path is None:
        output_path = input_path.with_suffix(".docx")
    else:
        output_path = Path(output_path)

    # Load config
    if config is None:
        config = Config()
    elif isinstance(config, (str, Path)):
        config = Config.from_file(config)

    # Read markdown content
    markdown_content = input_path.read_text(encoding="utf-8")

    # Remove markdown code block wrapper if present
    if markdown_content.startswith("```markdown") and markdown_content.endswith("```"):
        markdown_content = markdown_content[12:-3]

    return convert(
        markdown_content,
        output_path,
        config,
        toc=toc,
        toc_title=toc_title,
        toc_max_level=toc_max_level,
    )
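For orientation, a minimal usage sketch of the public API added in this file (convert and convert_file, with Config coming from md2word/config.py as imported above). The file names and the Markdown snippet are illustrative placeholders, not files shipped with the package:

    from md2word.config import Config
    from md2word.converter import convert, convert_file

    # Convert a Markdown file; the output path defaults to the input name with a .docx suffix.
    # "notes.md" is an assumed example input.
    convert_file("notes.md", toc=True)

    # Or convert an in-memory Markdown string with the default style configuration.
    convert("# Title\n\nSome **bold** text.", "out/notes.docx", config=Config())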