regen.mde 0.2.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/README.md +409 -295
  2. package/bin/build-corpus-editor.js +5 -3
  3. package/bin/postinstall.js +259 -187
  4. package/bin/regen-mdeditor-install.js +1 -1
  5. package/bin/regen-mdeditor-uninstall.js +1 -1
  6. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
  7. package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
  8. package/desktop/BuildCorpusEditor/Program.cs +85 -81
  9. package/dist/release/regen-mde-0.3.0-win-x64-setup.exe +0 -0
  10. package/dist/release/{regen.mde-0.2.2-win-x64.zip → regen-mde-0.3.0-win-x64.zip} +0 -0
  11. package/dist/release/regen-mde-0.7.0-win-x64-setup.exe +0 -0
  12. package/dist/release/regen-mde-0.7.0-win-x64.zip +0 -0
  13. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  14. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  15. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  16. package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
  17. package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
  18. package/dist/windows-editor/wwwroot/index.html +3 -3
  19. package/editor-web/index.html +1 -1
  20. package/editor-web/src/main.jsx +1044 -399
  21. package/editor-web/src/styles.css +846 -602
  22. package/installer/install-regen-mde.ps1 +49 -10
  23. package/installer/regen-mde.nsi +16 -16
  24. package/package.json +90 -86
  25. package/pyproject.toml +35 -33
  26. package/requirements.txt +6 -4
  27. package/scripts/package-windows-editor.ps1 +8 -8
  28. package/scripts/release-dual.mjs +105 -0
  29. package/scripts/run-editor-implementation-plane.ps1 +29 -6
  30. package/src/build_corpus/docx_exporter.py +1055 -798
  31. package/src/build_corpus/equations.py +80 -0
  32. package/src/build_corpus/exporter.py +1488 -1195
  33. package/src/build_corpus/frontmatter.py +302 -0
  34. package/src/build_corpus/ppt_exporter.py +543 -532
  35. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  36. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
  37. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1
@@ -1,798 +1,1055 @@
1
- from __future__ import annotations
2
-
3
- import json
4
- import os
5
- import re
6
- import shutil
7
- import subprocess
8
- import tempfile
9
- from html.parser import HTMLParser
10
- from contextlib import ExitStack
11
- from dataclasses import dataclass, field
12
- from importlib.resources import as_file, files
13
- from pathlib import Path
14
- from zipfile import ZipFile
15
-
16
- from docx import Document
17
- from docx.enum.style import WD_STYLE_TYPE
18
- from docx.enum.text import WD_BREAK
19
- from docx.oxml import OxmlElement
20
- from docx.oxml.ns import qn
21
- from docx.shared import Inches, Pt, RGBColor
22
- from docx.image.exceptions import UnrecognizedImageError
23
-
24
-
25
- INLINE_TOKEN_RE = re.compile(
26
- r"(!\[[^\]]*\]\([^)]+\)|\[[^\]]+\]\([^)]+\)|`[^`]+`|\$\$[^$]+\$\$|\$[^$\n]+\$|\*\*\*.+?\*\*\*|\*\*.+?\*\*|\*.+?\*)"
27
- )
28
- BARE_URL_RE = re.compile(r"https?://[^\s<>()]+(?:\([^\s<>()]*\)[^\s<>()]*)*")
29
- LIST_ITEM_RE = re.compile(r"^(\s*)([-*+]|\d+\.)\s+(.*)$")
30
- TABLE_SEPARATOR_RE = re.compile(r"^\s*\|?(?:\s*:?-{3,}:?\s*\|)+\s*:?-{3,}:?\s*\|?\s*$")
31
- DEFAULT_TEMPLATE_FILENAME = "md-to-word-template.dotx"
32
-
33
-
34
- @dataclass
35
- class WordExportStats:
36
- paragraphs: int = 0
37
- headings: int = 0
38
- lists: int = 0
39
- tables: int = 0
40
- code_blocks: int = 0
41
- blockquotes: int = 0
42
- images: int = 0
43
- equations: int = 0
44
- warnings: list[str] = field(default_factory=list)
45
-
46
-
47
- def strip_fence(line: str) -> str:
48
- return line.strip()[3:].strip()
49
-
50
-
51
- def split_table_row(line: str) -> list[str]:
52
- text = line.strip()
53
- if text.startswith("|"):
54
- text = text[1:]
55
- if text.endswith("|"):
56
- text = text[:-1]
57
- cells: list[str] = []
58
- buffer: list[str] = []
59
- escape = False
60
- for char in text:
61
- if escape:
62
- buffer.append(char)
63
- escape = False
64
- continue
65
- if char == "\\":
66
- escape = True
67
- buffer.append(char)
68
- continue
69
- if char == "|":
70
- cells.append("".join(buffer).strip())
71
- buffer.clear()
72
- continue
73
- buffer.append(char)
74
- cells.append("".join(buffer).strip())
75
- return cells
76
-
77
-
78
- def markdown_link_parts(token: str) -> tuple[str, str]:
79
- match = re.fullmatch(r"!?\[([^\]]*)\]\(([^)]+)\)", token)
80
- if not match:
81
- return token, ""
82
- return match.group(1), match.group(2).strip()
83
-
84
-
85
- def unescape_markdown_text(text: str) -> str:
86
- return re.sub(r"\\([\\`*_{}\[\]()#+.!|$-])", r"\1", text)
87
-
88
-
89
- class HTMLTableParser(HTMLParser):
90
- def __init__(self) -> None:
91
- super().__init__()
92
- self.rows: list[list[str]] = []
93
- self.current_row: list[str] | None = None
94
- self.current_cell: list[str] | None = None
95
-
96
- def handle_starttag(self, tag: str, attrs) -> None: # type: ignore[override]
97
- normalized = tag.lower()
98
- if normalized == "tr":
99
- self.current_row = []
100
- elif normalized == "td":
101
- self.current_cell = []
102
- elif normalized == "br" and self.current_cell is not None:
103
- self.current_cell.append("\n")
104
-
105
- def handle_endtag(self, tag: str) -> None: # type: ignore[override]
106
- normalized = tag.lower()
107
- if normalized == "td" and self.current_row is not None and self.current_cell is not None:
108
- self.current_row.append("".join(self.current_cell).strip())
109
- self.current_cell = None
110
- elif normalized == "tr" and self.current_row is not None:
111
- self.rows.append(self.current_row)
112
- self.current_row = None
113
-
114
- def handle_data(self, data: str) -> None: # type: ignore[override]
115
- if self.current_cell is not None:
116
- self.current_cell.append(data)
117
-
118
-
119
- def parse_html_table(markup: str) -> list[list[str]]:
120
- parser = HTMLTableParser()
121
- parser.feed(markup)
122
- parser.close()
123
- return [row for row in parser.rows if any(cell.strip() for cell in row)]
124
-
125
-
126
- def packaged_template_resource():
127
- package_root = f"{__package__}.templates" if __package__ else "build_corpus.templates"
128
- return files(package_root).joinpath(DEFAULT_TEMPLATE_FILENAME)
129
-
130
-
131
- def resolve_default_template_path() -> Path | None:
132
- try:
133
- packaged_template_resource()
134
- except ModuleNotFoundError:
135
- return None
136
- return Path(f"bundled:{DEFAULT_TEMPLATE_FILENAME}")
137
-
138
-
139
- def set_cell_text(cell, text: str) -> None:
140
- lines = [segment.strip() for segment in text.replace("<br>", "\n").splitlines()]
141
- lines = [line for line in lines if line]
142
- if not lines:
143
- cell.text = ""
144
- return []
145
- return lines
146
-
147
-
148
- def append_text_with_breaks(paragraph, text: str) -> None:
149
- text = unescape_markdown_text(text)
150
- parts = text.split("\n")
151
- for index, part in enumerate(parts):
152
- if part:
153
- paragraph.add_run(part)
154
- if index < len(parts) - 1:
155
- paragraph.add_run().add_break(WD_BREAK.LINE)
156
-
157
-
158
- def split_trailing_url_punctuation(url: str) -> tuple[str, str]:
159
- trailing = []
160
- core = url
161
- while core and core[-1] in ".,;:!?":
162
- trailing.append(core[-1])
163
- core = core[:-1]
164
- return core, "".join(reversed(trailing))
165
-
166
-
167
- def convert_windows_metafile_to_png(source: Path) -> Path | None:
168
- if os.name != "nt":
169
- return None
170
- target_dir = Path(tempfile.gettempdir()) / "build-corpus-image-fallbacks"
171
- target_dir.mkdir(parents=True, exist_ok=True)
172
- target = target_dir / f"{source.stem}.png"
173
- source_literal = str(source).replace("'", "''")
174
- target_literal = str(target).replace("'", "''")
175
- command = (
176
- "Add-Type -AssemblyName System.Drawing; "
177
- f"$img = [System.Drawing.Image]::FromFile('{source_literal}'); "
178
- f"$bmp = New-Object System.Drawing.Bitmap $img.Width, $img.Height; "
179
- "$gfx = [System.Drawing.Graphics]::FromImage($bmp); "
180
- "$gfx.DrawImage($img, 0, 0, $img.Width, $img.Height); "
181
- f"$bmp.Save('{target_literal}', [System.Drawing.Imaging.ImageFormat]::Png); "
182
- "$gfx.Dispose(); $bmp.Dispose(); $img.Dispose()"
183
- )
184
- result = subprocess.run(
185
- ["powershell", "-NoProfile", "-Command", command],
186
- capture_output=True,
187
- text=True,
188
- )
189
- if result.returncode != 0 or not target.exists():
190
- return None
191
- return target
192
-
193
-
194
- def set_paragraph_shading(paragraph, fill: str) -> None:
195
- paragraph_pr = paragraph._p.get_or_add_pPr()
196
- shading = paragraph_pr.find(qn("w:shd"))
197
- if shading is None:
198
- shading = OxmlElement("w:shd")
199
- paragraph_pr.append(shading)
200
- shading.set(qn("w:fill"), fill)
201
-
202
-
203
- def set_paragraph_border(paragraph, color: str) -> None:
204
- paragraph_pr = paragraph._p.get_or_add_pPr()
205
- borders = paragraph_pr.find(qn("w:pBdr"))
206
- if borders is None:
207
- borders = OxmlElement("w:pBdr")
208
- paragraph_pr.append(borders)
209
- left = borders.find(qn("w:left"))
210
- if left is None:
211
- left = OxmlElement("w:left")
212
- borders.append(left)
213
- left.set(qn("w:val"), "single")
214
- left.set(qn("w:sz"), "10")
215
- left.set(qn("w:space"), "12")
216
- left.set(qn("w:color"), color)
217
-
218
-
219
- def add_hyperlink(paragraph, text: str, url: str):
220
- part = paragraph.part
221
- relationship_id = part.relate_to(
222
- url,
223
- "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
224
- is_external=True,
225
- )
226
- hyperlink = OxmlElement("w:hyperlink")
227
- hyperlink.set(qn("r:id"), relationship_id)
228
-
229
- run = OxmlElement("w:r")
230
- rpr = OxmlElement("w:rPr")
231
-
232
- color = OxmlElement("w:color")
233
- color.set(qn("w:val"), "2563EB")
234
- rpr.append(color)
235
-
236
- underline = OxmlElement("w:u")
237
- underline.set(qn("w:val"), "single")
238
- rpr.append(underline)
239
- run.append(rpr)
240
-
241
- text_node = OxmlElement("w:t")
242
- text_node.text = text
243
- run.append(text_node)
244
- hyperlink.append(run)
245
- paragraph._p.append(hyperlink)
246
- return hyperlink
247
-
248
-
249
- def append_hyperlink_run(hyperlink, text: str, bold: bool = False, italic: bool = False, code: bool = False) -> None:
250
- run = OxmlElement("w:r")
251
- rpr = OxmlElement("w:rPr")
252
-
253
- color = OxmlElement("w:color")
254
- color.set(qn("w:val"), "2563EB")
255
- rpr.append(color)
256
-
257
- underline = OxmlElement("w:u")
258
- underline.set(qn("w:val"), "single")
259
- rpr.append(underline)
260
-
261
- if bold:
262
- rpr.append(OxmlElement("w:b"))
263
- if italic:
264
- rpr.append(OxmlElement("w:i"))
265
-
266
- fonts_needed = code
267
- if fonts_needed:
268
- fonts = OxmlElement("w:rFonts")
269
- fonts.set(qn("w:ascii"), "Consolas")
270
- fonts.set(qn("w:hAnsi"), "Consolas")
271
- fonts.set(qn("w:cs"), "Consolas")
272
- rpr.append(fonts)
273
- size = OxmlElement("w:sz")
274
- size.set(qn("w:val"), "20")
275
- rpr.append(size)
276
-
277
- run.append(rpr)
278
- text_node = OxmlElement("w:t")
279
- text_node.text = text
280
- run.append(text_node)
281
- hyperlink.append(run)
282
-
283
-
284
- def set_picture_metadata(run, source_name: str) -> None:
285
- filename = Path(source_name).name
286
- try:
287
- doc_props = run._r.xpath(".//*[local-name()='docPr']")
288
- except Exception:
289
- doc_props = []
290
- for doc_prop in doc_props:
291
- doc_prop.set("name", filename)
292
- doc_prop.set("descr", filename)
293
- doc_prop.set("title", filename)
294
-
295
-
296
- class MarkdownToDocxExporter:
297
- def __init__(
298
- self,
299
- input_path: Path,
300
- output_dir: Path,
301
- output_docx: Path | None = None,
302
- report_path: Path | None = None,
303
- template_path: Path | None = None,
304
- ):
305
- self.input_path = input_path
306
- self.output_dir = output_dir
307
- self.output_docx = output_docx or (output_dir / f"{input_path.stem}.docx")
308
- self.report_path = report_path or (output_dir / "export-report.json")
309
- self._template_resource_stack = ExitStack()
310
- self.template_path = self.resolve_template_path(template_path)
311
- self.use_template_styles = self.template_path is not None
312
- self.stats = WordExportStats()
313
-
314
- def export(self) -> dict:
315
- try:
316
- self.output_dir.mkdir(parents=True, exist_ok=True)
317
- if self.template_path and not self.template_path.exists():
318
- raise FileNotFoundError(f"Word template not found: {self.template_path}")
319
-
320
- document = Document()
321
- if not self.template_path:
322
- self.apply_modern_styles(document)
323
- self.ensure_custom_styles(document)
324
-
325
- markdown = self.input_path.read_text(encoding="utf-8")
326
- self.render_markdown(document, markdown)
327
- document.save(self.output_docx)
328
- if self.template_path:
329
- self.apply_template_package(self.output_docx, self.template_path)
330
-
331
- report = {
332
- "input": str(self.input_path),
333
- "output": str(self.output_docx),
334
- "template": str(self.template_path) if self.template_path else None,
335
- "stats": self.stats.__dict__,
336
- }
337
- self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
338
- return report
339
- finally:
340
- self._template_resource_stack.close()
341
-
342
- def resolve_template_path(self, template_path: Path | None) -> Path | None:
343
- if template_path is not None:
344
- return Path(template_path)
345
-
346
- try:
347
- return self._template_resource_stack.enter_context(as_file(packaged_template_resource()))
348
- except (FileNotFoundError, ModuleNotFoundError):
349
- return None
350
-
351
- def apply_modern_styles(self, document: Document) -> None:
352
- section = document.sections[0]
353
- section.top_margin = Inches(0.8)
354
- section.bottom_margin = Inches(0.8)
355
- section.left_margin = Inches(0.9)
356
- section.right_margin = Inches(0.9)
357
-
358
- normal = document.styles["Normal"]
359
- normal.font.name = "Aptos"
360
- normal.font.size = Pt(11)
361
- normal.font.color.rgb = RGBColor(31, 41, 55)
362
- normal.paragraph_format.space_after = Pt(8)
363
- normal.paragraph_format.line_spacing = 1.15
364
-
365
- for level, size in ((1, 22), (2, 17), (3, 14), (4, 12)):
366
- style = document.styles[f"Heading {level}"]
367
- style.font.name = "Aptos Display"
368
- style.font.size = Pt(size)
369
- style.font.bold = True
370
- style.font.color.rgb = RGBColor(15, 23, 42)
371
- style.paragraph_format.space_before = Pt(12 if level == 1 else 10)
372
- style.paragraph_format.space_after = Pt(4)
373
-
374
- self.ensure_custom_styles(document)
375
-
376
- def ensure_custom_styles(self, document: Document) -> None:
377
- if self.use_template_styles:
378
- return
379
- if "BuildCorpus Code" not in document.styles:
380
- style = document.styles.add_style("BuildCorpus Code", WD_STYLE_TYPE.PARAGRAPH)
381
- style.base_style = document.styles["Normal"]
382
- style.font.name = "Consolas"
383
- style.font.size = Pt(10)
384
- style.paragraph_format.left_indent = Inches(0.2)
385
- style.paragraph_format.right_indent = Inches(0.2)
386
- style.paragraph_format.space_before = Pt(4)
387
- style.paragraph_format.space_after = Pt(6)
388
-
389
- if "BuildCorpus Quote" not in document.styles:
390
- style = document.styles.add_style("BuildCorpus Quote", WD_STYLE_TYPE.PARAGRAPH)
391
- style.base_style = document.styles["Normal"]
392
- style.font.italic = True
393
- style.font.color.rgb = RGBColor(71, 85, 105)
394
- style.paragraph_format.left_indent = Inches(0.35)
395
- style.paragraph_format.space_after = Pt(6)
396
-
397
- def render_markdown(self, document: Document, markdown: str) -> None:
398
- lines = markdown.splitlines()
399
- index = 0
400
- while index < len(lines):
401
- line = lines[index]
402
- stripped = line.strip()
403
-
404
- if not stripped:
405
- index += 1
406
- continue
407
-
408
- if stripped.startswith("```"):
409
- info = strip_fence(line)
410
- buffer: list[str] = []
411
- index += 1
412
- while index < len(lines) and not lines[index].strip().startswith("```"):
413
- buffer.append(lines[index])
414
- index += 1
415
- if index < len(lines):
416
- index += 1
417
- self.add_code_block(document, "\n".join(buffer), info)
418
- continue
419
-
420
- if stripped == "$$":
421
- buffer: list[str] = []
422
- index += 1
423
- while index < len(lines) and lines[index].strip() != "$$":
424
- buffer.append(lines[index])
425
- index += 1
426
- if index < len(lines):
427
- index += 1
428
- self.add_equation_block(document, "\n".join(buffer).strip())
429
- continue
430
-
431
- if stripped.startswith("$$") and stripped.endswith("$$") and len(stripped) > 4:
432
- self.add_equation_block(document, stripped[2:-2].strip())
433
- index += 1
434
- continue
435
-
436
- if stripped.startswith("#"):
437
- level = len(stripped) - len(stripped.lstrip("#"))
438
- text = stripped[level:].strip()
439
- self.add_heading(document, level, text)
440
- index += 1
441
- continue
442
-
443
- if TABLE_SEPARATOR_RE.match(lines[index + 1]) if index + 1 < len(lines) else False:
444
- table_lines = [line, lines[index + 1]]
445
- index += 2
446
- while index < len(lines) and "|" in lines[index]:
447
- if not lines[index].strip():
448
- break
449
- table_lines.append(lines[index])
450
- index += 1
451
- self.add_table(document, table_lines)
452
- continue
453
-
454
- if stripped.lower() == "<table>":
455
- table_lines = [line]
456
- index += 1
457
- while index < len(lines):
458
- table_lines.append(lines[index])
459
- if lines[index].strip().lower() == "</table>":
460
- index += 1
461
- break
462
- index += 1
463
- self.add_html_table(document, "\n".join(table_lines))
464
- continue
465
-
466
- list_match = LIST_ITEM_RE.match(line)
467
- if list_match:
468
- index = self.add_list(document, lines, index)
469
- continue
470
-
471
- if stripped.startswith(">"):
472
- quote_lines: list[str] = []
473
- while index < len(lines) and lines[index].strip().startswith(">"):
474
- quote_lines.append(lines[index].strip()[1:].strip())
475
- index += 1
476
- self.add_blockquote(document, " ".join(quote_lines))
477
- continue
478
-
479
- if re.fullmatch(r"[-*_]{3,}", stripped):
480
- document.add_paragraph("")
481
- index += 1
482
- continue
483
-
484
- paragraph_lines = [line.strip()]
485
- paragraph_lines = [line.rstrip()]
486
- paragraph_breaks = [line.endswith(" ") or line.endswith("\\")]
487
- index += 1
488
- while index < len(lines):
489
- candidate = lines[index]
490
- if not candidate.strip():
491
- break
492
- if candidate.strip().startswith(("```", "#", ">")):
493
- break
494
- if LIST_ITEM_RE.match(candidate):
495
- break
496
- if TABLE_SEPARATOR_RE.match(lines[index + 1]) if index + 1 < len(lines) else False:
497
- break
498
- paragraph_lines.append(candidate.rstrip())
499
- paragraph_breaks.append(candidate.endswith(" ") or candidate.endswith("\\"))
500
- index += 1
501
- self.add_paragraph(document, self.combine_paragraph_lines(paragraph_lines, paragraph_breaks))
502
-
503
- @staticmethod
504
- def combine_paragraph_lines(lines: list[str], breaks: list[bool]) -> str:
505
- if not lines:
506
- return ""
507
- combined = lines[0]
508
- for index in range(1, len(lines)):
509
- separator = "\n" if breaks[index - 1] else " "
510
- combined += separator + lines[index]
511
- return combined
512
-
513
- @staticmethod
514
- def apply_template_package(output_docx: Path, template_path: Path) -> None:
515
- transplant_parts = {
516
- "word/styles.xml",
517
- "word/stylesWithEffects.xml",
518
- "word/numbering.xml",
519
- "word/fontTable.xml",
520
- "word/settings.xml",
521
- "word/webSettings.xml",
522
- "word/theme/theme1.xml",
523
- }
524
- with tempfile.TemporaryDirectory(prefix="build-corpus-template-") as tmp:
525
- tmp_dir = Path(tmp)
526
- patched = tmp_dir / output_docx.name
527
- with ZipFile(output_docx) as out_zip, ZipFile(template_path) as template_zip, ZipFile(patched, "w") as patched_zip:
528
- template_names = set(template_zip.namelist())
529
- output_names = set(out_zip.namelist())
530
- for name in out_zip.namelist():
531
- if name in transplant_parts and name in template_names:
532
- patched_zip.writestr(name, template_zip.read(name))
533
- else:
534
- patched_zip.writestr(name, out_zip.read(name))
535
- for name in transplant_parts:
536
- if name in template_names and name not in output_names:
537
- patched_zip.writestr(name, template_zip.read(name))
538
- shutil.move(str(patched), output_docx)
539
-
540
- def add_heading(self, document: Document, level: int, text: str) -> None:
541
- paragraph = document.add_paragraph(style=f"Heading {min(level, 6)}")
542
- self.render_inline(paragraph, text)
543
- self.stats.headings += 1
544
-
545
- def add_paragraph(self, document: Document, text: str) -> None:
546
- paragraph = document.add_paragraph(style="Normal")
547
- self.render_inline(paragraph, text)
548
- self.stats.paragraphs += 1
549
-
550
- def add_code_block(self, document: Document, code: str, info: str) -> None:
551
- paragraph = document.add_paragraph(style="Normal" if self.use_template_styles else "BuildCorpus Code")
552
- if info:
553
- label = paragraph.add_run(f"{info}\n")
554
- label.bold = True
555
- label.font.color.rgb = RGBColor(37, 99, 235)
556
- run = paragraph.add_run(code)
557
- run.font.name = "Consolas"
558
- run.font.size = Pt(10)
559
- set_paragraph_shading(paragraph, "F8FAFC")
560
- self.stats.code_blocks += 1
561
-
562
- def add_equation_block(self, document: Document, equation: str) -> None:
563
- paragraph = document.add_paragraph(style="Normal")
564
- paragraph.paragraph_format.left_indent = Inches(0.3)
565
- paragraph.paragraph_format.right_indent = Inches(0.3)
566
- paragraph.paragraph_format.space_before = Pt(4)
567
- paragraph.paragraph_format.space_after = Pt(8)
568
- run = paragraph.add_run(equation)
569
- run.font.name = "Cambria Math"
570
- run.font.size = Pt(11)
571
- self.stats.equations += 1
572
-
573
- def add_blockquote(self, document: Document, text: str) -> None:
574
- paragraph = document.add_paragraph(style="Quote" if self.use_template_styles else "BuildCorpus Quote")
575
- self.render_inline(paragraph, text)
576
- set_paragraph_border(paragraph, "CBD5E1")
577
- self.stats.blockquotes += 1
578
-
579
- def add_list(self, document: Document, lines: list[str], start: int) -> int:
580
- index = start
581
- while index < len(lines):
582
- match = LIST_ITEM_RE.match(lines[index])
583
- if not match:
584
- break
585
- indent, marker, body = match.groups()
586
- ordered = marker.endswith(".")
587
- body_lines = [body]
588
- lookahead = index + 1
589
- while lookahead < len(lines):
590
- candidate = lines[lookahead]
591
- stripped = candidate.strip()
592
- if not stripped:
593
- break
594
- if candidate.strip().startswith(("```", "#", ">")):
595
- break
596
- if LIST_ITEM_RE.match(candidate):
597
- break
598
- if TABLE_SEPARATOR_RE.match(lines[lookahead + 1]) if lookahead + 1 < len(lines) else False:
599
- break
600
- if not candidate[:1].isspace():
601
- break
602
- body_lines.append(candidate.rstrip())
603
- lookahead += 1
604
- style_name = self.list_style_name(document, ordered, indent)
605
- paragraph = document.add_paragraph(style=style_name)
606
- if style_name in {"List Bullet", "List Number"}:
607
- paragraph.paragraph_format.left_indent = Inches(0.25 + (len(indent.replace("\t", " ")) // 2) * 0.18)
608
- self.render_inline(paragraph, "\n".join(body_lines))
609
- self.stats.lists += 1
610
- index = lookahead
611
- return index
612
-
613
- def list_style_name(self, document: Document, ordered: bool, indent: str) -> str:
614
- level = min(3, max(1, (len(indent.replace("\t", " ")) // 2) + 1))
615
- base = "List Number" if ordered else "List Bullet"
616
- candidate = base if level == 1 else f"{base} {level}"
617
- return candidate if candidate in document.styles else base
618
-
619
- def add_table(self, document: Document, table_lines: list[str]) -> None:
620
- rows = [split_table_row(line) for line in table_lines if not TABLE_SEPARATOR_RE.match(line)]
621
- self.add_table_rows(document, rows)
622
-
623
- def add_html_table(self, document: Document, table_markup: str) -> None:
624
- self.add_table_rows(document, parse_html_table(table_markup))
625
-
626
- def add_table_rows(self, document: Document, rows: list[list[str]]) -> None:
627
- if not rows:
628
- return
629
- width = max(len(row) for row in rows)
630
- table = document.add_table(rows=len(rows), cols=width)
631
- table.style = "Light List Accent 1" if "Light List Accent 1" in document.styles else "Table Grid"
632
- for row_index, row in enumerate(rows):
633
- for col_index in range(width):
634
- value = row[col_index] if col_index < len(row) else ""
635
- self.render_table_cell(table.cell(row_index, col_index), value)
636
- self.stats.tables += 1
637
-
638
- def render_table_cell(self, cell, text: str) -> None:
639
- lines = set_cell_text(cell, text)
640
- if not lines:
641
- cell.text = ""
642
- return
643
- first = cell.paragraphs[0]
644
- first.text = ""
645
- self.render_inline(first, lines[0])
646
- for line in lines[1:]:
647
- paragraph = cell.add_paragraph("")
648
- self.render_inline(paragraph, line)
649
-
650
- def render_inline(self, paragraph, text: str) -> None:
651
- cursor = 0
652
- for match in INLINE_TOKEN_RE.finditer(text):
653
- if match.start() > cursor:
654
- self.render_plain_text(paragraph, text[cursor:match.start()])
655
- token = match.group(0)
656
- self.render_inline_token(paragraph, token)
657
- cursor = match.end()
658
- if cursor < len(text):
659
- self.render_plain_text(paragraph, text[cursor:])
660
-
661
- def render_plain_text(self, paragraph, text: str) -> None:
662
- cursor = 0
663
- for match in BARE_URL_RE.finditer(text):
664
- if match.start() > cursor:
665
- append_text_with_breaks(paragraph, text[cursor:match.start()])
666
- url = match.group(0)
667
- normalized_url, trailing = split_trailing_url_punctuation(url)
668
- if normalized_url:
669
- add_hyperlink(paragraph, normalized_url, normalized_url)
670
- if trailing:
671
- append_text_with_breaks(paragraph, trailing)
672
- cursor = match.end()
673
- if cursor < len(text):
674
- append_text_with_breaks(paragraph, text[cursor:])
675
-
676
- def render_inline_token(self, paragraph, token: str) -> None:
677
- if token.startswith("!["):
678
- alt, target = markdown_link_parts(token)
679
- image_path = (self.input_path.parent / target).resolve()
680
- if target.startswith(("http://", "https://", "data:")):
681
- paragraph.add_run(f"[image: {alt or target}]")
682
- self.stats.warnings.append(f"Skipped non-local image target: {target}")
683
- return
684
- if image_path.exists():
685
- run = paragraph.add_run()
686
- try:
687
- run.add_picture(str(image_path), width=Inches(5.8))
688
- set_picture_metadata(run, target)
689
- self.stats.images += 1
690
- except UnrecognizedImageError:
691
- converted = convert_windows_metafile_to_png(image_path)
692
- if converted is not None:
693
- run.add_picture(str(converted), width=Inches(5.8))
694
- set_picture_metadata(run, target)
695
- self.stats.images += 1
696
- self.stats.warnings.append(f"Converted unsupported image to PNG: {target}")
697
- else:
698
- paragraph.add_run(f"[unsupported image: {target}]")
699
- self.stats.warnings.append(f"Unsupported image asset: {target}")
700
- else:
701
- paragraph.add_run(f"[missing image: {target}]")
702
- self.stats.warnings.append(f"Missing image asset: {target}")
703
- return
704
-
705
- if token.startswith("["):
706
- label, target = markdown_link_parts(token)
707
- hyperlink = add_hyperlink(paragraph, "", target)
708
- self.render_hyperlink_label(hyperlink, label)
709
- return
710
-
711
- if token.startswith("`"):
712
- run = paragraph.add_run(token[1:-1])
713
- run.font.name = "Consolas"
714
- run.font.size = Pt(10)
715
- return
716
-
717
- if token.startswith("$$") and token.endswith("$$"):
718
- run = paragraph.add_run(token[2:-2])
719
- run.font.name = "Cambria Math"
720
- run.font.size = Pt(11)
721
- self.stats.equations += 1
722
- return
723
-
724
- if token.startswith("$") and token.endswith("$"):
725
- run = paragraph.add_run(token[1:-1])
726
- run.font.name = "Cambria Math"
727
- run.font.size = Pt(11)
728
- self.stats.equations += 1
729
- return
730
-
731
- if token.startswith("***") and token.endswith("***"):
732
- run = paragraph.add_run(token[3:-3])
733
- run.bold = True
734
- run.italic = True
735
- return
736
-
737
- if token.startswith("**") and token.endswith("**"):
738
- run = paragraph.add_run(token[2:-2])
739
- run.bold = True
740
- return
741
-
742
- if token.startswith("*") and token.endswith("*"):
743
- run = paragraph.add_run(token[1:-1])
744
- run.italic = True
745
- return
746
-
747
- paragraph.add_run(token)
748
-
749
- def render_hyperlink_label(self, hyperlink, text: str) -> None:
750
- cursor = 0
751
- for match in INLINE_TOKEN_RE.finditer(text):
752
- if match.start() > cursor:
753
- append_hyperlink_run(hyperlink, unescape_markdown_text(text[cursor:match.start()]))
754
- token = match.group(0)
755
- self.render_hyperlink_token(hyperlink, token)
756
- cursor = match.end()
757
- if cursor < len(text):
758
- append_hyperlink_run(hyperlink, unescape_markdown_text(text[cursor:]))
759
-
760
- def render_hyperlink_token(self, hyperlink, token: str) -> None:
761
- if token.startswith("`") and token.endswith("`"):
762
- append_hyperlink_run(hyperlink, unescape_markdown_text(token[1:-1]), code=True)
763
- return
764
- if token.startswith("***") and token.endswith("***"):
765
- append_hyperlink_run(hyperlink, unescape_markdown_text(token[3:-3]), bold=True, italic=True)
766
- return
767
- if token.startswith("**") and token.endswith("**"):
768
- append_hyperlink_run(hyperlink, unescape_markdown_text(token[2:-2]), bold=True)
769
- return
770
- if token.startswith("*") and token.endswith("*"):
771
- append_hyperlink_run(hyperlink, unescape_markdown_text(token[1:-1]), italic=True)
772
- return
773
- append_hyperlink_run(hyperlink, unescape_markdown_text(token))
774
-
775
-
776
- def export_markdown_to_docx(
777
- input_path: Path,
778
- output_root: Path,
779
- out_same_dir: bool,
780
- template_path: Path | None = None,
781
- ) -> dict:
782
- if out_same_dir:
783
- output_dir = input_path.parent
784
- output_docx = input_path.with_suffix(".docx")
785
- report_path = input_path.with_name(f"{input_path.stem}.export-report.json")
786
- else:
787
- output_dir = output_root / input_path.stem
788
- output_docx = None
789
- report_path = None
790
-
791
- exporter = MarkdownToDocxExporter(
792
- input_path=input_path,
793
- output_dir=output_dir,
794
- output_docx=output_docx,
795
- report_path=report_path,
796
- template_path=template_path,
797
- )
798
- return exporter.export()
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import shutil
7
+ import subprocess
8
+ import tempfile
9
+ from html.parser import HTMLParser
10
+ from contextlib import ExitStack
11
+ from dataclasses import dataclass, field
12
+ from importlib.resources import as_file, files
13
+ from pathlib import Path
14
+ from zipfile import ZipFile
15
+
16
+ from docx import Document
17
+ from docx.enum.style import WD_STYLE_TYPE
18
+ from docx.enum.text import WD_BREAK
19
+ from docx.oxml import OxmlElement
20
+ from docx.oxml.ns import qn
21
+ from docx.shared import Inches, Pt, RGBColor
22
+ from docx.image.exceptions import UnrecognizedImageError
23
+
24
+ try:
25
+ from .equations import latex_to_omath, latex_to_omath_para
26
+ except ImportError: # pragma: no cover - script-style invocation
27
+ from build_corpus.equations import latex_to_omath, latex_to_omath_para
28
+
29
+ try:
30
+ from .frontmatter import inject_frontmatter_into_package, strip_mdk_frontmatter
31
+ except ImportError: # pragma: no cover - script-style invocation
32
+ from build_corpus.frontmatter import inject_frontmatter_into_package, strip_mdk_frontmatter
33
+
34
+
35
+ # Math delimiters use a (?<!\\) lookbehind so escaped dollars — e.g. currency
36
+ # like "\$252.3B" — are NOT mistaken for inline-math fences. Escaped dollars then
37
+ # flow through the plain-text path and are unescaped to "$" (counted as a fixup).
38
+ INLINE_TOKEN_RE = re.compile(
39
+ r"(!\[[^\]]*\]\([^)]+\)|\[[^\]]+\]\([^)]+\)|`[^`]+`|(?<!\\)\$\$[^$]+\$\$|(?<!\\)\$[^$\n]+\$|\*\*\*.+?\*\*\*|\*\*.+?\*\*|\*.+?\*)"
40
+ )
41
+
42
+ # Image targets python-docx can never embed as a picture (need an external
43
+ # render pipeline, e.g. headless-browser screenshot — NOT build-corpus's job).
44
+ UNRENDERABLE_IMAGE_EXTS = (".html", ".htm", ".jsx", ".tsx", ".js", ".svg", ".md")
45
+ # Vector/metafile formats embeddable only after a metafile->PNG conversion
46
+ # (available on Windows / where LibreOffice is present).
47
+ METAFILE_IMAGE_EXTS = (".emf", ".wmf")
48
+ STYLE_PACKAGE_COMMENT_RE = re.compile(r'<!--\s*build-corpus:word-style-package\b[^>]*\bpath="([^"]+)"[^>]*-->', re.IGNORECASE)
49
+
50
+
51
def count_input_elements(markdown: str) -> dict:
    """Count source-side Markdown elements for input/output reconciliation.

    The walk mirrors the block detection order used by the renderer so the
    counts can be compared with the emitted statistics:

    * single-line HTML comments are skipped (the renderer drops them, so any
      link or image written inside one is never emitted);
    * fenced code blocks and ``$$`` display blocks are consumed whole;
    * pipe tables are detected by the separator row on the following line and
      HTML tables by a line that is exactly ``<table>``.

    Returns a dict with the keys ``tables``, ``equations``, ``images``,
    ``code_blocks``, ``headings`` and ``links``.
    """
    lines = markdown.splitlines()
    counts = {"tables": 0, "equations": 0, "images": 0,
              "code_blocks": 0, "headings": 0, "links": 0}
    index = 0
    while index < len(lines):
        stripped = lines[index].strip()
        if stripped.startswith("<!--") and stripped.endswith("-->"):
            # Comment lines produce no output, so nothing inside them counts.
            index += 1
            continue
        if stripped.startswith("```"):
            counts["code_blocks"] += 1
            index += 1
            while index < len(lines) and not lines[index].strip().startswith("```"):
                index += 1
            index += 1  # step over the closing fence (or past the end)
            continue
        if stripped == "$$":
            counts["equations"] += 1
            index += 1
            while index < len(lines) and lines[index].strip() != "$$":
                index += 1
            index += 1
            continue
        if stripped.startswith("$$") and stripped.endswith("$$") and len(stripped) > 4:
            # standalone single-line display equation (counted once; matches render_markdown)
            counts["equations"] += 1
            index += 1
            continue
        if stripped.startswith("#"):
            counts["headings"] += 1
        elif index + 1 < len(lines) and TABLE_SEPARATOR_RE.match(lines[index + 1]):
            counts["tables"] += 1
        elif stripped.lower() == "<table>":
            # HTML tables are rendered through the same table path as pipe
            # tables and bump the same output counter.
            # NOTE(review): an HTML table with no non-empty rows is not
            # emitted by the renderer and would still be counted here.
            counts["tables"] += 1
        line_text = lines[index]
        # inline elements on this line
        counts["images"] += len(re.findall(r"!\[[^\]]*\]\([^)]+\)", line_text))
        # display $$...$$ embedded in a line with surrounding text (render tokenizes these too)
        counts["equations"] += len(re.findall(r"(?<!\\)\$\$[^$]+\$\$", line_text))
        # inline math: standalone $...$ not part of $$ and not escaped
        counts["equations"] += len(re.findall(r"(?<!\\)(?<!\$)\$[^$\n]+\$(?!\$)", line_text))
        # links that are not images
        counts["links"] += len(re.findall(r"(?<!!)\[[^\]]+\]\([^)]+\)", line_text))
        index += 1
    return counts
94
+
95
+
96
def count_text_fixups(markdown: str) -> dict:
    """Tally the backslash escapes that get unescaped in body prose.

    Fenced code blocks and inline code spans are removed first because their
    escapes are left untouched by the renderer. The result reports the total
    number of escapes and how many of them were escaped dollar signs.
    """
    prose = markdown
    # Drop fenced blocks first, then inline code spans.
    for pattern, flags in ((r"```.*?```", re.DOTALL), (r"`[^`]+`", 0)):
        prose = re.sub(pattern, "", prose, flags=flags)
    escaped_chars = re.findall(r"\\([\\`*_{}\[\]()#+.!|$-])", prose)
    dollar_total = sum(1 for char in escaped_chars if char == "$")
    return {"total": len(escaped_chars), "currency_unescaped": dollar_total}
108
+
109
+
110
def image_looks_like_svg(path: Path) -> bool:
    """Sniff a file's first bytes to detect SVG/XML content (even if mislabeled).

    Only the first 512 bytes are read, so large binary images are not loaded
    into memory just to be inspected. A leading UTF-8 byte-order mark and
    leading whitespace are ignored. Returns False when the file cannot be
    read.
    """
    try:
        with path.open("rb") as handle:
            head = handle.read(512)
    except OSError:
        return False
    if head.startswith(b"\xef\xbb\xbf"):
        # A BOM would otherwise hide the "<svg" / "<?xml" prefix.
        head = head[3:]
    lowered = head.lstrip().lower()
    if lowered[:256].startswith(b"<svg"):
        return True
    return lowered[:256].startswith(b"<?xml") and b"<svg" in lowered
118
+ BARE_URL_RE = re.compile(r"https?://[^\s<>()]+(?:\([^\s<>()]*\)[^\s<>()]*)*")
119
+ LIST_ITEM_RE = re.compile(r"^(\s*)([-*+]|\d+\.)\s+(.*)$")
120
+ TABLE_SEPARATOR_RE = re.compile(r"^\s*\|?(?:\s*:?-{3,}:?\s*\|)+\s*:?-{3,}:?\s*\|?\s*$")
121
+ DEFAULT_TEMPLATE_FILENAME = "md-to-word-template.dotx"
122
+
123
+
124
@dataclass
class WordExportStats:
    """Counters and diagnostics accumulated while one document is exported."""

    paragraphs: int = 0  # body paragraphs emitted by add_paragraph
    headings: int = 0  # headings emitted by add_heading
    lists: int = 0  # incremented once per list item by add_list
    tables: int = 0  # tables emitted by add_table_rows (pipe and HTML tables)
    code_blocks: int = 0  # fenced code blocks emitted by add_code_block
    blockquotes: int = 0  # blockquotes emitted by add_blockquote
    links: int = 0  # presumably hyperlinks emitted from inline tokens; updated outside this chunk
    images: int = 0  # presumably images embedded successfully; updated outside this chunk
    images_failed: int = 0  # images recorded by _record_image_failure
    equations: int = 0  # every block or inline equation, converted or not
    equations_omml: int = 0  # equations converted to OMML
    equations_fell_back: int = 0  # equations kept as plain text after a failed conversion
    warnings: list[str] = field(default_factory=list)  # human-readable warning messages
    issues: list[dict] = field(default_factory=list)  # structured issue records (type, line, reason, ...)
140
+
141
+
142
def strip_fence(line: str) -> str:
    """Return the info string that follows the three fence characters."""
    trimmed = line.strip()
    info = trimmed[3:]
    return info.strip()
144
+
145
+
146
def split_table_row(line: str) -> list[str]:
    """Split one pipe-table row into trimmed cell strings.

    One leading and one trailing pipe are removed. A backslash keeps the
    character after it inside the current cell (the backslash itself is kept
    too), so ``\\|`` does not end a cell.
    """
    body = line.strip()
    if body[:1] == "|":
        body = body[1:]
    if body[-1:] == "|":
        body = body[:-1]

    cells: list[str] = []
    current = ""
    position = 0
    length = len(body)
    while position < length:
        char = body[position]
        if char == "\\":
            # Copy the backslash together with whatever it escapes.
            current += body[position:position + 2]
            position += 2
        elif char == "|":
            cells.append(current.strip())
            current = ""
            position += 1
        else:
            current += char
            position += 1
    cells.append(current.strip())
    return cells
171
+
172
+
173
def markdown_link_parts(token: str) -> tuple[str, str]:
    """Split a ``[label](target)`` or ``![alt](target)`` token into its parts.

    Returns ``(label, target)`` with the target stripped of surrounding
    whitespace, or ``(token, "")`` when the token is not a link or image.
    """
    parsed = re.fullmatch(r"!?\[([^\]]*)\]\(([^)]+)\)", token)
    if parsed is None:
        return token, ""
    label, target = parsed.groups()
    return label, target.strip()
178
+
179
+
180
def unescape_markdown_text(text: str) -> str:
    """Remove the backslash in front of escapable Markdown punctuation."""
    def drop_backslash(found):
        return found.group(1)

    return re.sub(r"\\([\\`*_{}\[\]()#+.!|$-])", drop_backslash, text)
182
+
183
+
184
class HTMLTableParser(HTMLParser):
    """Collect the cell text of an HTML table as a list of rows.

    Both ``<td>`` and ``<th>`` cells are captured, so header rows are kept.
    ``<br>`` inside a cell becomes a newline. Cells and rows whose end tags
    are omitted (which HTML permits) are closed implicitly by the next cell,
    the next row, ``</table>`` or ``close()``.
    """

    _CELL_TAGS = ("td", "th")

    def __init__(self) -> None:
        super().__init__()
        self.rows: list[list[str]] = []
        self.current_row: list[str] | None = None
        self.current_cell: list[str] | None = None

    def _flush_cell(self) -> None:
        """Store the open cell (if any) in the current row."""
        if self.current_cell is None:
            return
        if self.current_row is not None:
            self.current_row.append("".join(self.current_cell).strip())
        self.current_cell = None

    def _flush_row(self) -> None:
        """Store the open row (if any), closing its last cell first."""
        self._flush_cell()
        if self.current_row is not None:
            self.rows.append(self.current_row)
            self.current_row = None

    def handle_starttag(self, tag: str, attrs) -> None:  # type: ignore[override]
        normalized = tag.lower()
        if normalized == "tr":
            # A new row implicitly ends a row left open by a missing </tr>.
            self._flush_row()
            self.current_row = []
        elif normalized in self._CELL_TAGS:
            self._flush_cell()
            if self.current_row is None:
                # Cell without an enclosing <tr>: start an implicit row.
                self.current_row = []
            self.current_cell = []
        elif normalized == "br" and self.current_cell is not None:
            self.current_cell.append("\n")

    def handle_endtag(self, tag: str) -> None:  # type: ignore[override]
        normalized = tag.lower()
        if normalized in self._CELL_TAGS:
            self._flush_cell()
        elif normalized in ("tr", "table"):
            self._flush_row()

    def handle_data(self, data: str) -> None:  # type: ignore[override]
        if self.current_cell is not None:
            self.current_cell.append(data)

    def close(self) -> None:
        """Finish parsing and keep any row that was never explicitly closed."""
        super().close()
        self._flush_row()
212
+
213
+
214
def parse_html_table(markup: str) -> list[list[str]]:
    """Parse HTML table markup into rows, dropping rows with no cell text."""
    table_parser = HTMLTableParser()
    table_parser.feed(markup)
    table_parser.close()

    populated: list[list[str]] = []
    for row in table_parser.rows:
        # Keep a row only when at least one of its cells has visible text.
        if any(cell.strip() for cell in row):
            populated.append(row)
    return populated
219
+
220
+
221
def packaged_template_resource():
    """Return the importlib resource handle of the bundled Word template.

    The templates sub-package is looked up under this module's own package
    when one is set, otherwise under ``build_corpus``.
    """
    if __package__:
        package_root = f"{__package__}.templates"
    else:
        package_root = "build_corpus.templates"
    template_dir = files(package_root)
    return template_dir.joinpath(DEFAULT_TEMPLATE_FILENAME)
224
+
225
+
226
def resolve_default_template_path() -> Path | None:
    """Return a ``bundled:<filename>`` placeholder path for the packaged template.

    Returns None when the templates package cannot be imported.
    """
    try:
        packaged_template_resource()
    except ModuleNotFoundError:
        return None
    else:
        placeholder = f"bundled:{DEFAULT_TEMPLATE_FILENAME}"
        return Path(placeholder)
232
+
233
+
234
def set_cell_text(cell, text: str) -> list[str]:
    """Split table-cell text into its non-empty lines.

    HTML line breaks (``<br>``, ``<br/>``, ``<br />``, in any letter case) and
    real newlines both separate lines; each line is stripped and blank lines
    are dropped. When nothing remains the cell is cleared and an empty list is
    returned; otherwise the cell is left untouched and the lines are returned
    for the caller to render.
    """
    normalized = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
    lines = [segment.strip() for segment in normalized.splitlines()]
    lines = [line for line in lines if line]
    if not lines:
        cell.text = ""
        return []
    return lines
241
+
242
+
243
def append_text_with_breaks(paragraph, text: str) -> None:
    """Append unescaped text to a paragraph, turning newlines into line breaks.

    Each non-empty segment becomes its own run; every newline between two
    segments becomes a separate run holding a line break.
    """
    segments = unescape_markdown_text(text).split("\n")
    first, remaining = segments[0], segments[1:]
    if first:
        paragraph.add_run(first)
    for segment in remaining:
        # One break per newline, emitted before the segment that follows it.
        paragraph.add_run().add_break(WD_BREAK.LINE)
        if segment:
            paragraph.add_run(segment)
251
+
252
+
253
def split_trailing_url_punctuation(url: str) -> tuple[str, str]:
    """Separate sentence punctuation that trails a URL.

    Returns ``(core, trailing)`` where ``trailing`` is the run of
    ``. , ; : ! ?`` characters at the end of ``url`` and ``core`` is the rest.
    """
    core = url.rstrip(".,;:!?")
    trailing = url[len(core):]
    return core, trailing
260
+
261
+
262
def convert_windows_metafile_to_png(source: Path) -> Path | None:
    """Rasterize a Windows metafile to PNG through PowerShell/System.Drawing.

    Only attempted on Windows (``os.name == "nt"``). The PNG is written to a
    ``build-corpus-image-fallbacks`` folder inside the system temp directory
    and named after the source file's stem.

    Returns the PNG path on success and None when the platform is not
    Windows, PowerShell cannot be started, or the conversion fails.
    """
    if os.name != "nt":
        return None
    target_dir = Path(tempfile.gettempdir()) / "build-corpus-image-fallbacks"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"{source.stem}.png"
    # NOTE(review): two sources that share a stem map to the same target file.
    try:
        # A PNG left over from an earlier run must not be mistaken for the
        # result of this conversion when the script fails part-way.
        target.unlink(missing_ok=True)
    except OSError:
        return None
    # Single quotes are doubled so the paths stay inside PowerShell literals.
    source_literal = str(source).replace("'", "''")
    target_literal = str(target).replace("'", "''")
    command = (
        "Add-Type -AssemblyName System.Drawing; "
        f"$img = [System.Drawing.Image]::FromFile('{source_literal}'); "
        f"$bmp = New-Object System.Drawing.Bitmap $img.Width, $img.Height; "
        "$gfx = [System.Drawing.Graphics]::FromImage($bmp); "
        "$gfx.DrawImage($img, 0, 0, $img.Width, $img.Height); "
        f"$bmp.Save('{target_literal}', [System.Drawing.Imaging.ImageFormat]::Png); "
        "$gfx.Dispose(); $bmp.Dispose(); $img.Dispose()"
    )
    try:
        result = subprocess.run(
            ["powershell", "-NoProfile", "-Command", command],
            capture_output=True,
            text=True,
        )
    except OSError:
        # PowerShell missing or not launchable: treat as a failed conversion.
        return None
    if result.returncode != 0 or not target.exists():
        return None
    return target
287
+
288
+
289
def set_paragraph_shading(paragraph, fill: str) -> None:
    """Set the background fill colour of a paragraph.

    Reuses the paragraph's existing ``w:shd`` element or creates one, then
    sets ``w:fill`` to ``fill`` (a hex colour such as ``"F8FAFC"``). A newly
    created or pattern-less element also gets ``w:val="clear"``, because the
    shading pattern attribute is required by the WordprocessingML schema.
    """
    paragraph_pr = paragraph._p.get_or_add_pPr()
    shading = paragraph_pr.find(qn("w:shd"))
    if shading is None:
        shading = OxmlElement("w:shd")
        paragraph_pr.append(shading)
    if shading.get(qn("w:val")) is None:
        # "clear" = no pattern, so only the fill colour is visible.
        shading.set(qn("w:val"), "clear")
    shading.set(qn("w:fill"), fill)
296
+
297
+
298
def set_paragraph_border(paragraph, color: str) -> None:
    """Give a paragraph a single-line left border in the given colour.

    Existing ``w:pBdr`` / ``w:left`` elements are reused; missing ones are
    created and appended.
    """
    def child_or_new(parent, tag):
        # Find the child element with this tag, creating it when absent.
        element = parent.find(qn(tag))
        if element is None:
            element = OxmlElement(tag)
            parent.append(element)
        return element

    properties = paragraph._p.get_or_add_pPr()
    border_group = child_or_new(properties, "w:pBdr")
    left_edge = child_or_new(border_group, "w:left")
    for attribute, value in (
        ("w:val", "single"),
        ("w:sz", "10"),
        ("w:space", "12"),
        ("w:color", color),
    ):
        left_edge.set(qn(attribute), value)
312
+
313
+
314
def add_hyperlink(paragraph, text: str, url: str):
    """Append an external hyperlink to a paragraph and return its element.

    Registers ``url`` as an external hyperlink relationship on the paragraph's
    part, builds a ``w:hyperlink`` holding one blue, underlined run with
    ``text``, and appends it to the paragraph. The returned element can
    receive further runs.
    """
    part = paragraph.part
    relationship_id = part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )
    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("r:id"), relationship_id)

    run = OxmlElement("w:r")
    rpr = OxmlElement("w:rPr")

    color = OxmlElement("w:color")
    color.set(qn("w:val"), "2563EB")
    rpr.append(color)

    underline = OxmlElement("w:u")
    underline.set(qn("w:val"), "single")
    rpr.append(underline)
    run.append(rpr)

    text_node = OxmlElement("w:t")
    # Without xml:space="preserve" leading/trailing spaces of the run text
    # are not kept when the document is opened.
    text_node.set(qn("xml:space"), "preserve")
    text_node.text = text
    run.append(text_node)
    hyperlink.append(run)
    paragraph._p.append(hyperlink)
    return hyperlink
342
+
343
+
344
def append_hyperlink_run(hyperlink, text: str, bold: bool = False, italic: bool = False, code: bool = False) -> None:
    """Append one styled run to an existing ``w:hyperlink`` element.

    The run is always blue and underlined. ``bold`` and ``italic`` add the
    matching run properties; ``code`` switches the run to Consolas with
    ``w:sz`` set to ``"20"``.
    """
    run = OxmlElement("w:r")
    rpr = OxmlElement("w:rPr")

    color = OxmlElement("w:color")
    color.set(qn("w:val"), "2563EB")
    rpr.append(color)

    underline = OxmlElement("w:u")
    underline.set(qn("w:val"), "single")
    rpr.append(underline)

    if bold:
        rpr.append(OxmlElement("w:b"))
    if italic:
        rpr.append(OxmlElement("w:i"))

    if code:
        fonts = OxmlElement("w:rFonts")
        for font_slot in ("w:ascii", "w:hAnsi", "w:cs"):
            fonts.set(qn(font_slot), "Consolas")
        rpr.append(fonts)
        size = OxmlElement("w:sz")
        size.set(qn("w:val"), "20")
        rpr.append(size)

    run.append(rpr)
    text_node = OxmlElement("w:t")
    # A link label split into several runs relies on the spaces at run edges;
    # xml:space="preserve" keeps them from being dropped.
    text_node.set(qn("xml:space"), "preserve")
    text_node.text = text
    run.append(text_node)
    hyperlink.append(run)
377
+
378
+
379
def set_picture_metadata(run, source_name: str) -> None:
    """Label every ``docPr`` element under a run with the image's file name.

    The bare file name of ``source_name`` is written to the ``name``,
    ``descr`` and ``title`` attributes. If the XPath lookup raises, nothing is
    changed.
    """
    label = Path(source_name).name
    try:
        targets = run._r.xpath(".//*[local-name()='docPr']")
    except Exception:
        targets = []
    for element in targets:
        for attribute in ("name", "descr", "title"):
            element.set(attribute, label)
389
+
390
+
391
+ class MarkdownToDocxExporter:
392
def __init__(
    self,
    input_path: Path,
    output_dir: Path,
    output_docx: Path | None = None,
    report_path: Path | None = None,
    template_path: Path | None = None,
):
    """Store the export locations and resolve which Word template to use.

    ``output_docx`` defaults to ``<output_dir>/<input stem>.docx`` and
    ``report_path`` to ``<output_dir>/export-report.json``. The template is
    resolved immediately; template styles are used whenever one was found.
    """
    self.input_path = input_path
    self.output_dir = output_dir

    default_docx = output_dir / f"{input_path.stem}.docx"
    default_report = output_dir / "export-report.json"
    self.output_docx = output_docx if output_docx else default_docx
    self.report_path = report_path if report_path else default_report

    # The stack must exist before the template is resolved: resolving the
    # packaged template registers a context on it.
    self._template_resource_stack = ExitStack()
    resolved_template = self.resolve_template_path(template_path)
    self.template_path = resolved_template
    self.use_template_styles = resolved_template is not None
    self.stats = WordExportStats()
408
+
409
def export(self) -> dict:
    """Render the input Markdown to a .docx file and write a fidelity report.

    Returns the report dict that is also written to ``self.report_path`` as
    JSON. Raises FileNotFoundError when a template path is set but the file
    does not exist. The template resource stack is always closed on exit.
    """
    try:
        self.output_dir.mkdir(parents=True, exist_ok=True)
        if self.template_path and not self.template_path.exists():
            raise FileNotFoundError(f"Word template not found: {self.template_path}")

        # The document is always built from python-docx's default template;
        # a configured template is applied to the saved package further down.
        document = Document()
        if not self.template_path:
            self.apply_modern_styles(document)
        self.ensure_custom_styles(document)

        raw_markdown = self.input_path.read_text(encoding="utf-8")
        # Front matter is split off so only the body is rendered.
        frontmatter, markdown = strip_mdk_frontmatter(raw_markdown)
        self.render_markdown(document, markdown)
        document.save(self.output_docx)
        if self.template_path:
            # Post-process the saved file: copy style-related parts from the template.
            self.apply_template_package(self.output_docx, self.template_path)
        if frontmatter:
            inject_frontmatter_into_package(self.output_docx, frontmatter)

        # Compare what the source contains with what was actually emitted.
        input_counts = count_input_elements(markdown)
        text_fixups = count_text_fixups(markdown)
        reconciliation = self.build_reconciliation(input_counts)
        fidelity_ok = all(row["ok"] for row in reconciliation.values())

        report = {
            "input": str(self.input_path),
            "output": str(self.output_docx),
            "template": str(self.template_path) if self.template_path else None,
            "fidelity_ok": fidelity_ok,
            "reconciliation": reconciliation,
            "text_fixups": text_fixups,
            "issues": self.stats.issues,
            "stats": self.stats.__dict__,
        }
        self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
        self._print_digest(reconciliation, text_fixups, fidelity_ok)
        return report
    finally:
        # Releases any context registered while the template path was resolved.
        self._template_resource_stack.close()
449
+
450
def build_reconciliation(self, input_counts: dict) -> dict:
    """Pair each source-side element count with the emitted count.

    Every entry carries an ``ok`` flag. Equations are ok only when all of
    them were converted to OMML with no fallbacks; images are ok only when
    none failed and the emitted count matches the source count.
    """
    emitted = self.stats

    def exact_match(kind: str, produced: int) -> dict:
        # Rows that only require input and output counts to be equal.
        expected = input_counts[kind]
        return {"in": expected, "out": produced, "ok": expected == produced}

    expected_equations = input_counts["equations"]
    expected_images = input_counts["images"]

    rows: dict = {}
    rows["tables"] = exact_match("tables", emitted.tables)
    rows["equations"] = {
        "in": expected_equations,
        "out_omml": emitted.equations_omml,
        "fell_back": emitted.equations_fell_back,
        "ok": emitted.equations_omml == expected_equations and emitted.equations_fell_back == 0,
    }
    rows["images"] = {
        "in": expected_images,
        "out": emitted.images,
        "failed": emitted.images_failed,
        "ok": emitted.images_failed == 0 and emitted.images == expected_images,
    }
    rows["code_blocks"] = exact_match("code_blocks", emitted.code_blocks)
    rows["headings"] = exact_match("headings", emitted.headings)
    rows["links"] = exact_match("links", emitted.links)
    return rows
481
+
482
+ def _print_digest(self, reconciliation: dict, text_fixups: dict, fidelity_ok: bool) -> None:
483
+ """Print a one-line human-readable fidelity summary to stdout."""
484
+ parts = []
485
+ for name, row in reconciliation.items():
486
+ out = row.get("out_omml", row.get("out", 0))
487
+ mark = "OK" if row["ok"] else "!!"
488
+ label = f"[{mark}] {name} {out}/{row['in']}"
489
+ if name == "equations" and row["fell_back"]:
490
+ label += f" ({row['fell_back']} fell back)"
491
+ if name == "images" and row.get("failed"):
492
+ label += f" ({row['failed']} failed)"
493
+ parts.append(label)
494
+ if text_fixups["total"]:
495
+ parts.append(f"text_fixups {text_fixups['total']} "
496
+ f"({text_fixups['currency_unescaped']} currency)")
497
+ bad_lines = sorted({i["line"] for i in self.stats.issues if i.get("line")})
498
+ if bad_lines:
499
+ parts.append("issue lines: " + ",".join(str(n) for n in bad_lines))
500
+ print(" ".join(parts) + f" -> fidelity_ok={str(fidelity_ok).lower()}")
501
+
502
def resolve_template_path(self, template_path: Path | None) -> Path | None:
    """Pick the Word template to use, in order of precedence.

    1. An explicitly supplied ``template_path``.
    2. A round-trip style package found next to the input file.
    3. The template bundled with the package, materialized through the
       instance's resource stack.

    Returns None when none of these is available.
    """
    if template_path is not None:
        return Path(template_path)

    sidecar = self.resolve_roundtrip_template_path()
    if sidecar is not None:
        return sidecar

    try:
        bundled = as_file(packaged_template_resource())
        return self._template_resource_stack.enter_context(bundled)
    except (FileNotFoundError, ModuleNotFoundError):
        return None
514
+
515
def resolve_roundtrip_template_path(self) -> Path | None:
    """Find a style package recorded alongside the input file, if any.

    Looks first for ``<input>.wordstyle/style-package.docx``. Failing that,
    the first 4096 characters of the input are searched for a
    ``build-corpus:word-style-package`` comment whose ``path`` attribute
    (relative paths resolve against the input's folder) names an existing
    file. Returns None when nothing is found or the input cannot be read.
    """
    sidecar_template = self.input_path.with_suffix(".wordstyle") / "style-package.docx"
    if sidecar_template.exists():
        return sidecar_template
    try:
        # Only the head is inspected, so only the head is read.
        with self.input_path.open(encoding="utf-8") as handle:
            head = handle.read(4096)
    except (OSError, UnicodeDecodeError):
        # Template discovery is best-effort; an unreadable input is reported
        # by the export step that actually needs its content.
        return None
    match = STYLE_PACKAGE_COMMENT_RE.search(head)
    if not match:
        return None
    candidate = Path(match.group(1))
    if not candidate.is_absolute():
        candidate = self.input_path.parent / candidate
    return candidate if candidate.exists() else None
530
+
531
def apply_modern_styles(self, document: Document) -> None:
    """Apply the built-in look used when no Word template is available.

    Sets the first section's margins, restyles ``Normal`` and
    ``Heading 1``-``Heading 4``, then makes sure the custom styles exist.
    """
    first_section = document.sections[0]
    first_section.top_margin = Inches(0.8)
    first_section.bottom_margin = Inches(0.8)
    first_section.left_margin = Inches(0.9)
    first_section.right_margin = Inches(0.9)

    body_style = document.styles["Normal"]
    body_font = body_style.font
    body_font.name = "Aptos"
    body_font.size = Pt(11)
    body_font.color.rgb = RGBColor(31, 41, 55)
    body_format = body_style.paragraph_format
    body_format.space_after = Pt(8)
    body_format.line_spacing = 1.15

    heading_sizes = {1: 22, 2: 17, 3: 14, 4: 12}
    for level, points in heading_sizes.items():
        heading_style = document.styles[f"Heading {level}"]
        heading_font = heading_style.font
        heading_font.name = "Aptos Display"
        heading_font.size = Pt(points)
        heading_font.bold = True
        heading_font.color.rgb = RGBColor(15, 23, 42)
        # Top-level headings get slightly more space above them.
        space_above = 12 if level == 1 else 10
        heading_style.paragraph_format.space_before = Pt(space_above)
        heading_style.paragraph_format.space_after = Pt(4)

    self.ensure_custom_styles(document)
555
+
556
def ensure_custom_styles(self, document: Document) -> None:
    """Create the ``BuildCorpus Code`` and ``BuildCorpus Quote`` styles if missing.

    Does nothing when template styles are in use.
    """
    if self.use_template_styles:
        return

    styles = document.styles
    if "BuildCorpus Code" not in styles:
        code_style = styles.add_style("BuildCorpus Code", WD_STYLE_TYPE.PARAGRAPH)
        code_style.base_style = styles["Normal"]
        code_style.font.name = "Consolas"
        code_style.font.size = Pt(10)
        code_format = code_style.paragraph_format
        code_format.left_indent = Inches(0.2)
        code_format.right_indent = Inches(0.2)
        code_format.space_before = Pt(4)
        code_format.space_after = Pt(6)

    if "BuildCorpus Quote" not in styles:
        quote_style = styles.add_style("BuildCorpus Quote", WD_STYLE_TYPE.PARAGRAPH)
        quote_style.base_style = styles["Normal"]
        quote_style.font.italic = True
        quote_style.font.color.rgb = RGBColor(71, 85, 105)
        quote_format = quote_style.paragraph_format
        quote_format.left_indent = Inches(0.35)
        quote_format.space_after = Pt(6)
576
+
577
def render_markdown(self, document: Document, markdown: str) -> None:
    """Render Markdown text into the document, one block at a time.

    Blocks are recognised in this order: blank line, single-line HTML comment
    (dropped), fenced code, ``$$`` display equation (multi-line, then
    single-line), heading, pipe table, HTML table, list, blockquote,
    horizontal rule, and finally a plain paragraph. ``self.current_line`` is
    set to the 1-based source line of the block being processed so recorded
    issues can point back at the source.
    """
    lines = markdown.splitlines()
    index = 0
    while index < len(lines):
        line = lines[index]
        self.current_line = index + 1
        stripped = line.strip()

        if not stripped:
            index += 1
            continue

        if stripped.startswith("<!--") and stripped.endswith("-->"):
            index += 1
            continue

        if stripped.startswith("```"):
            info = strip_fence(line)
            code_lines: list[str] = []
            index += 1
            while index < len(lines) and not lines[index].strip().startswith("```"):
                code_lines.append(lines[index])
                index += 1
            if index < len(lines):
                index += 1  # step over the closing fence
            self.add_code_block(document, "\n".join(code_lines), info)
            continue

        if stripped == "$$":
            equation_lines: list[str] = []
            index += 1
            while index < len(lines) and lines[index].strip() != "$$":
                equation_lines.append(lines[index])
                index += 1
            if index < len(lines):
                index += 1  # step over the closing $$
            self.add_equation_block(document, "\n".join(equation_lines).strip())
            continue

        if stripped.startswith("$$") and stripped.endswith("$$") and len(stripped) > 4:
            self.add_equation_block(document, stripped[2:-2].strip())
            index += 1
            continue

        if stripped.startswith("#"):
            level = len(stripped) - len(stripped.lstrip("#"))
            text = stripped[level:].strip()
            self.add_heading(document, level, text)
            index += 1
            continue

        # A pipe table is announced by a separator row on the following line.
        if index + 1 < len(lines) and TABLE_SEPARATOR_RE.match(lines[index + 1]):
            table_lines = [line, lines[index + 1]]
            index += 2
            while index < len(lines) and "|" in lines[index]:
                if not lines[index].strip():
                    break
                table_lines.append(lines[index])
                index += 1
            self.add_table(document, table_lines)
            continue

        if stripped.lower() == "<table>":
            table_lines = [line]
            index += 1
            while index < len(lines):
                table_lines.append(lines[index])
                if lines[index].strip().lower() == "</table>":
                    index += 1
                    break
                index += 1
            self.add_html_table(document, "\n".join(table_lines))
            continue

        if LIST_ITEM_RE.match(line):
            index = self.add_list(document, lines, index)
            continue

        if stripped.startswith(">"):
            quote_lines: list[str] = []
            while index < len(lines) and lines[index].strip().startswith(">"):
                quote_lines.append(lines[index].strip()[1:].strip())
                index += 1
            self.add_blockquote(document, " ".join(quote_lines))
            continue

        if re.fullmatch(r"[-*_]{3,}", stripped):
            document.add_paragraph("")
            index += 1
            continue

        # Plain paragraph: gather lines until a blank line or another block.
        paragraph_lines = [line.rstrip()]
        paragraph_breaks = [line.endswith(" ") or line.endswith("\\")]
        backslash_breaks = [line.endswith("\\")]
        index += 1
        while index < len(lines):
            candidate = lines[index]
            if not candidate.strip():
                break
            if candidate.strip().startswith(("```", "#", ">")):
                break
            if LIST_ITEM_RE.match(candidate):
                break
            if index + 1 < len(lines) and TABLE_SEPARATOR_RE.match(lines[index + 1]):
                break
            paragraph_lines.append(candidate.rstrip())
            paragraph_breaks.append(candidate.endswith(" ") or candidate.endswith("\\"))
            backslash_breaks.append(candidate.endswith("\\"))
            index += 1
        # A trailing backslash that forces a line break is markup, not text:
        # drop it so it does not show up in the output. The last line is left
        # alone because no break follows it.
        for position in range(len(paragraph_lines) - 1):
            if backslash_breaks[position]:
                paragraph_lines[position] = paragraph_lines[position][:-1]
        self.add_paragraph(document, self.combine_paragraph_lines(paragraph_lines, paragraph_breaks))
687
+
688
+ @staticmethod
689
+ def combine_paragraph_lines(lines: list[str], breaks: list[bool]) -> str:
690
+ if not lines:
691
+ return ""
692
+ combined = lines[0]
693
+ for index in range(1, len(lines)):
694
+ separator = "\n" if breaks[index - 1] else " "
695
+ combined += separator + lines[index]
696
+ return combined
697
+
698
+ @staticmethod
699
+ def apply_template_package(output_docx: Path, template_path: Path) -> None:
700
+ transplant_parts = {
701
+ "word/styles.xml",
702
+ "word/stylesWithEffects.xml",
703
+ "word/numbering.xml",
704
+ "word/fontTable.xml",
705
+ "word/settings.xml",
706
+ "word/webSettings.xml",
707
+ "word/theme/theme1.xml",
708
+ }
709
+ with tempfile.TemporaryDirectory(prefix="build-corpus-template-") as tmp:
710
+ tmp_dir = Path(tmp)
711
+ patched = tmp_dir / output_docx.name
712
+ with ZipFile(output_docx) as out_zip, ZipFile(template_path) as template_zip, ZipFile(patched, "w") as patched_zip:
713
+ template_names = set(template_zip.namelist())
714
+ output_names = set(out_zip.namelist())
715
+ for name in out_zip.namelist():
716
+ if name in transplant_parts and name in template_names:
717
+ patched_zip.writestr(name, template_zip.read(name))
718
+ else:
719
+ patched_zip.writestr(name, out_zip.read(name))
720
+ for name in transplant_parts:
721
+ if name in template_names and name not in output_names:
722
+ patched_zip.writestr(name, template_zip.read(name))
723
+ shutil.move(str(patched), output_docx)
724
+
725
def add_heading(self, document: Document, level: int, text: str) -> None:
    """Add a heading paragraph; levels deeper than 6 use ``Heading 6``."""
    capped_level = min(level, 6)
    heading = document.add_paragraph(style=f"Heading {capped_level}")
    self.render_inline(heading, text)
    self.stats.headings += 1
729
+
730
def add_paragraph(self, document: Document, text: str) -> None:
    """Add a ``Normal`` paragraph and render the inline Markdown into it."""
    body = document.add_paragraph(style="Normal")
    self.render_inline(body, text)
    self.stats.paragraphs = self.stats.paragraphs + 1
734
+
735
def add_code_block(self, document: Document, code: str, info: str) -> None:
    """Add a shaded paragraph holding a fenced code block.

    A non-empty ``info`` string is shown first as a bold, blue label on its
    own line. The code itself is set in 10pt Consolas.
    """
    if self.use_template_styles:
        style_name = "Normal"
    else:
        style_name = "BuildCorpus Code"
    block = document.add_paragraph(style=style_name)

    if info:
        info_run = block.add_run(f"{info}\n")
        info_run.bold = True
        info_run.font.color.rgb = RGBColor(37, 99, 235)

    code_run = block.add_run(code)
    code_font = code_run.font
    code_font.name = "Consolas"
    code_font.size = Pt(10)

    set_paragraph_shading(block, "F8FAFC")
    self.stats.code_blocks += 1
746
+
747
def add_equation_block(self, document: Document, equation: str) -> None:
    """Add a display equation as its own indented paragraph.

    When the LaTeX converts to OMML, the source is stored as hidden text in
    front of the math element. Otherwise the raw LaTeX is shown in Cambria
    Math and the fallback is recorded.
    """
    block = document.add_paragraph(style="Normal")
    layout = block.paragraph_format
    layout.left_indent = Inches(0.3)
    layout.right_indent = Inches(0.3)
    layout.space_before = Pt(4)
    layout.space_after = Pt(8)

    math_element = latex_to_omath_para(equation)
    if math_element is None:
        # Conversion failed: keep the LaTeX visible rather than dropping it.
        fallback_run = block.add_run(equation)
        fallback_run.font.name = "Cambria Math"
        fallback_run.font.size = Pt(11)
        self._record_equation_fallback(equation)
    else:
        self.add_hidden_math_source(block, equation)
        block._p.append(math_element)
        self.stats.equations_omml += 1
    self.stats.equations += 1
764
+
765
def _add_inline_equation(self, paragraph, latex: str) -> None:
    """Append an inline OMML equation, falling back to Cambria Math text."""
    math_element = latex_to_omath(latex)
    if math_element is None:
        # Conversion failed: show the LaTeX source and record the fallback.
        fallback_run = paragraph.add_run(latex)
        fallback_run.font.name = "Cambria Math"
        fallback_run.font.size = Pt(11)
        self._record_equation_fallback(latex)
    else:
        self.add_hidden_math_source(paragraph, latex)
        paragraph._p.append(math_element)
        self.stats.equations_omml += 1
    self.stats.equations += 1
778
+
779
@staticmethod
def add_hidden_math_source(paragraph, latex: str) -> None:
    """Append the LaTeX source as a hidden (``w:vanish``) Cambria Math run."""
    source_run = paragraph.add_run(latex)
    source_run.font.name = "Cambria Math"
    run_properties = source_run._r.get_or_add_rPr()
    hidden_flag = OxmlElement("w:vanish")
    run_properties.append(hidden_flag)
785
+
786
+ def _record_equation_fallback(self, latex: str) -> None:
787
+ """Record a LaTeX fragment that could not be converted to OMML."""
788
+ self.stats.equations_fell_back += 1
789
+ line = getattr(self, "current_line", 0)
790
+ message = f"Equation kept as text (LaTeX not parsed): {latex[:60]}"
791
+ self.stats.warnings.append(message)
792
+ self.stats.issues.append({
793
+ "type": "equation",
794
+ "line": line,
795
+ "source": latex[:120],
796
+ "reason": "latex-parse-failed",
797
+ })
798
+
799
+ def _record_image_failure(self, target: str, reason: str, message: str) -> None:
800
+ """Record an image that could not be embedded, with a specific reason."""
801
+ self.stats.images_failed += 1
802
+ self.stats.warnings.append(f"{message}: {target}")
803
+ self.stats.issues.append({
804
+ "type": "image",
805
+ "line": getattr(self, "current_line", 0),
806
+ "target": target,
807
+ "reason": reason,
808
+ })
809
+
810
def add_blockquote(self, document: Document, text: str) -> None:
    """Add a blockquote paragraph with a left border.

    Uses the ``Quote`` style when template styles are active and
    ``BuildCorpus Quote`` otherwise.
    """
    if self.use_template_styles:
        style_name = "Quote"
    else:
        style_name = "BuildCorpus Quote"
    quote = document.add_paragraph(style=style_name)
    self.render_inline(quote, text)
    set_paragraph_border(quote, "CBD5E1")
    self.stats.blockquotes += 1
815
+
816
def add_list(self, document: Document, lines: list[str], start: int) -> int:
    """Render consecutive list items starting at ``start``.

    Each item may be followed by indented continuation lines, which are
    joined to the item body with newlines. Returns the index of the first
    line that is not part of the list.
    """
    total = len(lines)
    position = start
    while position < total:
        item = LIST_ITEM_RE.match(lines[position])
        if not item:
            break
        indent, marker, body = item.groups()
        is_ordered = marker.endswith(".")

        # Collect continuation lines until something ends the item.
        item_lines = [body]
        cursor = position + 1
        while cursor < total:
            candidate = lines[cursor]
            trimmed = candidate.strip()
            ends_item = (
                not trimmed
                or trimmed.startswith(("```", "#", ">"))
                or LIST_ITEM_RE.match(candidate)
                or (cursor + 1 < total and TABLE_SEPARATOR_RE.match(lines[cursor + 1]))
                or not candidate[:1].isspace()
            )
            if ends_item:
                break
            item_lines.append(candidate.rstrip())
            cursor += 1

        style_name = self.list_style_name(document, is_ordered, indent)
        entry = document.add_paragraph(style=style_name)
        if style_name in {"List Bullet", "List Number"}:
            # Base styles carry no nesting, so the depth is shown by indent.
            depth = len(indent.replace("\t", " ")) // 2
            entry.paragraph_format.left_indent = Inches(0.25 + depth * 0.18)
        self.render_inline(entry, "\n".join(item_lines))
        self.stats.lists += 1
        position = cursor
    return position
849
+
850
def list_style_name(self, document: Document, ordered: bool, indent: str) -> str:
    """Choose the list paragraph style for an item's indentation.

    The nesting level is derived from the indent width and clamped to 1-3.
    Nested levels use ``"<base> <level>"`` when the document has that style
    and fall back to the base style otherwise.
    """
    indent_width = len(indent.replace("\t", " "))
    level = (indent_width // 2) + 1
    level = max(1, level)
    level = min(3, level)

    base = "List Number" if ordered else "List Bullet"
    if level == 1:
        wanted = base
    else:
        wanted = f"{base} {level}"
    if wanted in document.styles:
        return wanted
    return base
855
+
856
def add_table(self, document: Document, table_lines: list[str]) -> None:
    """Render a pipe table from its source lines, skipping separator rows."""
    rows = []
    for source_line in table_lines:
        if TABLE_SEPARATOR_RE.match(source_line):
            continue
        rows.append(split_table_row(source_line))
    self.add_table_rows(document, rows)
859
+
860
def add_html_table(self, document: Document, table_markup: str) -> None:
    """Render an HTML ``<table>`` block by parsing it into rows of cell text."""
    parsed_rows = parse_html_table(table_markup)
    self.add_table_rows(document, parsed_rows)
862
+
863
def add_table_rows(self, document: Document, rows: list[list[str]]) -> None:
    """Add a table for the given rows; does nothing when there are none.

    The table is as wide as the longest row and shorter rows are padded with
    empty cells. ``Light List Accent 1`` is used when the document has that
    style, ``Table Grid`` otherwise.
    """
    if not rows:
        return
    column_count = max(map(len, rows))
    table = document.add_table(rows=len(rows), cols=column_count)
    if "Light List Accent 1" in document.styles:
        table.style = "Light List Accent 1"
    else:
        table.style = "Table Grid"

    for row_number, row in enumerate(rows):
        padded = list(row) + [""] * (column_count - len(row))
        for column_number, value in enumerate(padded):
            self.render_table_cell(table.cell(row_number, column_number), value)
    self.stats.tables += 1
874
+
875
def render_table_cell(self, cell, text: str) -> None:
    """Render cell text with inline Markdown, one paragraph per line.

    The first line reuses the cell's existing first paragraph; each further
    line gets a new paragraph. A cell with no text is cleared.
    """
    cell_lines = set_cell_text(cell, text)
    if not cell_lines:
        cell.text = ""
        return

    head, tail = cell_lines[0], cell_lines[1:]
    opening = cell.paragraphs[0]
    opening.text = ""
    self.render_inline(opening, head)
    for extra_line in tail:
        self.render_inline(cell.add_paragraph(""), extra_line)
886
+
887
def render_inline(self, paragraph, text: str) -> None:
    """Render ``text`` into ``paragraph`` as plain spans and inline tokens.

    Every ``INLINE_TOKEN_RE`` match is passed to ``render_inline_token``; the
    text between matches, and after the last one, goes to
    ``render_plain_text``.
    """
    consumed = 0
    for hit in INLINE_TOKEN_RE.finditer(text):
        token_start, token_end = hit.span()
        if consumed < token_start:
            # Plain text sitting between the previous token and this one.
            self.render_plain_text(paragraph, text[consumed:token_start])
        self.render_inline_token(paragraph, hit.group(0))
        consumed = token_end
    if consumed < len(text):
        self.render_plain_text(paragraph, text[consumed:])
897
+
898
def render_plain_text(self, paragraph, text: str) -> None:
    """Append ``text`` to ``paragraph``, turning bare URLs into hyperlinks.

    Each ``BARE_URL_RE`` match is split by ``split_trailing_url_punctuation``
    into a link part and a trailing part. A non-empty link part is added as a
    hyperlink whose second and third arguments are both the URL; a non-empty
    trailing part, like all text outside matches, is appended with
    ``append_text_with_breaks``.
    """
    position = 0
    for found in BARE_URL_RE.finditer(text):
        url_start, url_end = found.span()
        if url_start > position:
            append_text_with_breaks(paragraph, text[position:url_start])
        link, suffix = split_trailing_url_punctuation(found.group(0))
        if link:
            add_hyperlink(paragraph, link, link)
        if suffix:
            append_text_with_breaks(paragraph, suffix)
        position = url_end
    remainder = text[position:]
    if remainder:
        append_text_with_breaks(paragraph, remainder)
912
+
913
def render_inline_token(self, paragraph, token: str) -> None:
    """Render one inline Markdown token into ``paragraph``.

    The token is dispatched on its delimiters; the first matching case wins:

    * ``![alt](target)`` -- embed the image 5.8 inches wide, or write a
      bracketed placeholder run and record an image failure when the target
      is remote, unsupported, missing, or not recognized.
    * ``[label](target)`` -- hyperlink whose label is rendered through
      ``render_hyperlink_label``; increments the link counter.
    * a token starting with a backtick -- Consolas 10pt run.
    * ``$$...$$`` / ``$...$`` -- inline equation.
    * ``***x***`` / ``**x**`` / ``*x*`` -- bold+italic / bold / italic run.
    * anything else -- run containing the token verbatim.

    Args:
        paragraph: Paragraph that receives the runs.
        token: Token text including its delimiters.
    """
    if token.startswith("!["):
        alt, target = markdown_link_parts(token)
        # Reject remote and data-URI targets before touching the filesystem:
        # joining such a target onto the input folder and resolving it is
        # wasted work and may raise where the name is not a valid path.
        if target.startswith(("http://", "https://", "data:")):
            paragraph.add_run(f"[image: {alt or target}]")
            self._record_image_failure(target, "skipped-remote",
                                       "Skipped non-local image target")
            return
        image_path = (self.input_path.parent / target).resolve()
        ext = image_path.suffix.lower()
        if ext == ".svg" or (ext in UNRENDERABLE_IMAGE_EXTS and image_looks_like_svg(image_path)):
            paragraph.add_run(f"[unsupported image: {target}]")
            self._record_image_failure(target, "svg-needs-rasterization",
                                       "SVG cannot be embedded by python-docx; rasterize to PNG via the render pipeline")
            return
        if ext in UNRENDERABLE_IMAGE_EXTS:
            # HTML/JSX can never be embedded as a picture — flag for the
            # external render pipeline. We deliberately do NOT rasterize here.
            paragraph.add_run(f"[unsupported image: {target}]")
            self._record_image_failure(target, "unsupported-format",
                                       f"Image format cannot be embedded ({ext}); route to render pipeline")
            return
        if not image_path.exists():
            paragraph.add_run(f"[missing image: {target}]")
            self._record_image_failure(target, "missing-file", "Missing image asset")
            return
        run = paragraph.add_run()
        try:
            run.add_picture(str(image_path), width=Inches(5.8))
            set_picture_metadata(run, target)
            self.stats.images += 1
        except UnrecognizedImageError:
            # Try a metafile -> PNG conversion first, then classify what is
            # left so the report names the most specific failure.
            converted = convert_windows_metafile_to_png(image_path)
            if converted is not None:
                run.add_picture(str(converted), width=Inches(5.8))
                set_picture_metadata(run, target)
                self.stats.images += 1
                self.stats.warnings.append(f"Converted unsupported image to PNG: {target}")
            elif image_looks_like_svg(image_path):
                # SVG content mislabeled with a raster extension (e.g. .png that
                # is really SVG+PNG-fallback the extractor flattened wrong).
                paragraph.add_run(f"[unsupported image: {target}]")
                self._record_image_failure(target, "mislabeled-svg",
                                           f"{target} is SVG content with a {ext or 'raster'} extension; rasterize to PNG and repoint")
            elif ext in METAFILE_IMAGE_EXTS:
                paragraph.add_run(f"[unsupported image: {target}]")
                self._record_image_failure(target, "unsupported-on-platform",
                                           f"{ext} needs metafile->PNG conversion (install LibreOffice / run on Windows)")
            else:
                paragraph.add_run(f"[unsupported image: {target}]")
                self._record_image_failure(target, "unsupported-format",
                                           "Image format not recognized by python-docx")
        return

    if token.startswith("["):
        label, target = markdown_link_parts(token)
        # The hyperlink is created with empty text; the label runs are added
        # afterwards so inline formatting inside the label is kept.
        hyperlink = add_hyperlink(paragraph, "", target)
        self.render_hyperlink_label(hyperlink, label)
        self.stats.links += 1
        return

    if token.startswith("`"):
        run = paragraph.add_run(token[1:-1])
        run.font.name = "Consolas"
        run.font.size = Pt(10)
        return

    # "$$" must be tested before "$", and "***" before "**" before "*",
    # because each shorter delimiter is a prefix of the longer one.
    if token.startswith("$$") and token.endswith("$$"):
        self._add_inline_equation(paragraph, token[2:-2])
        return

    if token.startswith("$") and token.endswith("$"):
        self._add_inline_equation(paragraph, token[1:-1])
        return

    if token.startswith("***") and token.endswith("***"):
        run = paragraph.add_run(token[3:-3])
        run.bold = True
        run.italic = True
        return

    if token.startswith("**") and token.endswith("**"):
        run = paragraph.add_run(token[2:-2])
        run.bold = True
        return

    if token.startswith("*") and token.endswith("*"):
        run = paragraph.add_run(token[1:-1])
        run.italic = True
        return

    # Unrecognized token: emit it unchanged.
    paragraph.add_run(token)
1005
+
1006
def render_hyperlink_label(self, hyperlink, text: str) -> None:
    """Render a link label into ``hyperlink``, keeping inline formatting.

    Each ``INLINE_TOKEN_RE`` match is passed to ``render_hyperlink_token``;
    the text between matches, and after the last one, is passed through
    ``unescape_markdown_text`` and appended as an unformatted hyperlink run.
    """
    consumed = 0
    for hit in INLINE_TOKEN_RE.finditer(text):
        token_start, token_end = hit.span()
        if consumed < token_start:
            plain = unescape_markdown_text(text[consumed:token_start])
            append_hyperlink_run(hyperlink, plain)
        self.render_hyperlink_token(hyperlink, hit.group(0))
        consumed = token_end
    if consumed < len(text):
        tail = unescape_markdown_text(text[consumed:])
        append_hyperlink_run(hyperlink, tail)
1016
+
1017
def render_hyperlink_token(self, hyperlink, token: str) -> None:
    """Append one inline token to ``hyperlink`` as a formatted run.

    A token wrapped in backticks, ``***``, ``**`` or ``*`` has the wrapper
    stripped and is appended as a code, bold+italic, bold or italic run
    respectively. Any other token is appended as is. The text always goes
    through ``unescape_markdown_text`` first.
    """
    # Longer asterisk fences come first so "***" is not taken for "**" or "*".
    wrappers = (
        ("`", {"code": True}),
        ("***", {"bold": True, "italic": True}),
        ("**", {"bold": True}),
        ("*", {"italic": True}),
    )
    for fence, formatting in wrappers:
        if token.startswith(fence) and token.endswith(fence):
            inner = token[len(fence):-len(fence)]
            append_hyperlink_run(hyperlink, unescape_markdown_text(inner), **formatting)
            return
    append_hyperlink_run(hyperlink, unescape_markdown_text(token))
1031
+
1032
+
1033
def export_markdown_to_docx(
    input_path: Path,
    output_root: Path,
    out_same_dir: bool,
    template_path: Path | None = None,
) -> dict:
    """Run ``MarkdownToDocxExporter`` for one input file and return its result.

    Args:
        input_path: Source file to export.
        output_root: Parent folder for per-document output; only used when
            ``out_same_dir`` is false.
        out_same_dir: When true, the output directory is the input's folder
            and the exporter is given explicit paths beside the input: the
            input path with a ``.docx`` suffix and
            ``<stem>.export-report.json``. When false, the output directory
            is ``output_root/<stem>`` and both paths are passed as ``None``.
        template_path: Optional template path forwarded to the exporter.

    Returns:
        Whatever ``MarkdownToDocxExporter.export()`` returns.
    """
    docx_target = input_path.with_suffix(".docx") if out_same_dir else None
    report_target = (
        input_path.with_name(f"{input_path.stem}.export-report.json")
        if out_same_dir
        else None
    )
    destination = input_path.parent if out_same_dir else output_root / input_path.stem
    return MarkdownToDocxExporter(
        input_path=input_path,
        output_dir=destination,
        output_docx=docx_target,
        report_path=report_target,
        template_path=template_path,
    ).export()