regen.mde 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/LICENSE +16 -0
  2. package/README.md +295 -0
  3. package/bin/build-corpus-editor.js +81 -0
  4. package/bin/build-corpus.js +41 -0
  5. package/bin/postinstall.js +187 -0
  6. package/bin/regen-mdeditor-install.js +27 -0
  7. package/bin/regen-mdeditor-uninstall.js +19 -0
  8. package/bin/validate-katex.js +93 -0
  9. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +270 -0
  10. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -0
  11. package/desktop/BuildCorpusEditor/EditorForm.cs +540 -0
  12. package/desktop/BuildCorpusEditor/Program.cs +81 -0
  13. package/desktop/BuildCorpusEditor/app.manifest +16 -0
  14. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  15. package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
  16. package/dist/windows-editor/BuildCorpusEditor.deps.json +83 -0
  17. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  18. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  19. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  20. package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +19 -0
  21. package/dist/windows-editor/Microsoft.Web.WebView2.Core.dll +0 -0
  22. package/dist/windows-editor/Microsoft.Web.WebView2.Core.xml +6817 -0
  23. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.dll +0 -0
  24. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.xml +510 -0
  25. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.dll +0 -0
  26. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.xml +1902 -0
  27. package/dist/windows-editor/WebView2Loader.dll +0 -0
  28. package/dist/windows-editor/runtimes/win-x64/native/WebView2Loader.dll +0 -0
  29. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +326 -0
  30. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +1 -0
  31. package/dist/windows-editor/wwwroot/index.html +22 -0
  32. package/editor-web/index.html +21 -0
  33. package/editor-web/src/main.jsx +399 -0
  34. package/editor-web/src/styles.css +602 -0
  35. package/editor-web/vite.config.js +13 -0
  36. package/examples/build-corpus.config.example.json +21 -0
  37. package/installer/install-regen-mde.ps1 +175 -0
  38. package/installer/regen-mde.nsi +81 -0
  39. package/package.json +86 -0
  40. package/pyproject.toml +33 -0
  41. package/requirements.txt +4 -0
  42. package/scripts/build-windows-editor.ps1 +47 -0
  43. package/scripts/package-windows-editor.ps1 +90 -0
  44. package/scripts/run-corpus.ps1 +28 -0
  45. package/scripts/run-editor-implementation-plane.ps1 +203 -0
  46. package/scripts/run-required-tests.ps1 +98 -0
  47. package/scripts/run-smoke.ps1 +28 -0
  48. package/src/build_corpus/__init__.py +3 -0
  49. package/src/build_corpus/docx_exporter.py +798 -0
  50. package/src/build_corpus/exporter.py +1195 -0
  51. package/src/build_corpus/ppt_exporter.py +532 -0
  52. package/src/build_corpus/templates/__init__.py +1 -0
  53. package/src/build_corpus/templates/md-to-word-template.dotx +0 -0
  54. package/src/build_corpus/validate_assets.py +46 -0
  55. package/tools/audit_corpus.py +203 -0
  56. package/tools/collect_microsoft_word_templates.py +228 -0
  57. package/tools/collect_online_docx_corpus.py +272 -0
  58. package/tools/collect_online_pptx_corpus.py +252 -0
  59. package/tools/compare_pptx_inputs_outputs.py +87 -0
  60. package/tools/roundtrip_docx_corpus.py +171 -0
@@ -0,0 +1,798 @@
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ import os
5
+ import re
6
+ import shutil
7
+ import subprocess
8
+ import tempfile
9
+ from html.parser import HTMLParser
10
+ from contextlib import ExitStack
11
+ from dataclasses import dataclass, field
12
+ from importlib.resources import as_file, files
13
+ from pathlib import Path
14
+ from zipfile import ZipFile
15
+
16
+ from docx import Document
17
+ from docx.enum.style import WD_STYLE_TYPE
18
+ from docx.enum.text import WD_BREAK
19
+ from docx.oxml import OxmlElement
20
+ from docx.oxml.ns import qn
21
+ from docx.shared import Inches, Pt, RGBColor
22
+ from docx.image.exceptions import UnrecognizedImageError
23
+
24
+
25
# Inline Markdown tokens recognised by the exporter. Alternatives are tried in
# the order written, so longer emphasis markers come before shorter ones.
INLINE_TOKEN_RE = re.compile(
    r"("
    r"!\[[^\]]*\]\([^)]+\)"  # image: ![alt](target)
    r"|\[[^\]]+\]\([^)]+\)"  # link: [label](target)
    r"|`[^`]+`"  # inline code
    r"|\$\$[^$]+\$\$"  # $$...$$ math
    r"|\$[^$\n]+\$"  # $...$ math, single line
    r"|\*\*\*.+?\*\*\*"  # bold + italic
    r"|\*\*.+?\*\*"  # bold
    r"|\*.+?\*"  # italic
    r")"
)

# Bare http(s) URLs found in plain text; allows balanced "(...)" groups inside the URL.
BARE_URL_RE = re.compile(r"https?://[^\s<>()]+(?:\([^\s<>()]*\)[^\s<>()]*)*")

# One list item: captures (leading indent, marker, body).
LIST_ITEM_RE = re.compile(r"^(\s*)([-*+]|\d+\.)\s+(.*)$")

# The |---|:---:| separator line that follows a pipe-table header row.
TABLE_SEPARATOR_RE = re.compile(r"^\s*\|?(?:\s*:?-{3,}:?\s*\|)+\s*:?-{3,}:?\s*\|?\s*$")

# File name of the Word template looked up in the package's "templates" subpackage.
DEFAULT_TEMPLATE_FILENAME = "md-to-word-template.dotx"
32
+
33
+
34
@dataclass
class WordExportStats:
    """Per-export tally of rendered Markdown constructs plus collected warnings.

    ``MarkdownToDocxExporter`` increments these counters while rendering and
    writes the instance's ``__dict__`` into the export report.
    """

    # Block-level constructs, counted once per rendered block
    # (``lists`` is incremented once per list item).
    paragraphs: int = 0
    headings: int = 0
    lists: int = 0
    tables: int = 0
    code_blocks: int = 0
    blockquotes: int = 0
    # Inline or block constructs counted wherever they are rendered.
    images: int = 0
    equations: int = 0
    # Human-readable notes about skipped, missing or converted assets.
    warnings: list[str] = field(default_factory=list)
45
+
46
+
47
def strip_fence(line: str) -> str:
    """Return the info string (e.g. the language name) of a code-fence line.

    All leading backticks are removed, not just the first three, so a fence
    opened with four or more backticks no longer leaks stray backticks into
    the info string.
    """
    return line.strip().lstrip("`").strip()
49
+
50
+
51
def split_table_row(line: str) -> list[str]:
    """Split one pipe-table row into stripped cell strings.

    The optional leading and trailing border pipes are removed. A pipe
    preceded by a backslash is treated as cell content; the backslash is kept
    in the cell text so later unescaping can remove it.

    A trailing pipe is only treated as the row border when it is not escaped,
    so a final cell ending in ``\\|`` keeps its pipe.
    """
    text = line.strip()
    if text.startswith("|"):
        text = text[1:]
    if text.endswith("|"):
        head = text[:-1]
        # An odd number of backslashes directly before the pipe escapes it.
        trailing_backslashes = len(head) - len(head.rstrip("\\"))
        if trailing_backslashes % 2 == 0:
            text = head

    cells: list[str] = []
    current: list[str] = []
    escaped = False
    for char in text:
        if escaped:
            current.append(char)
            escaped = False
        elif char == "\\":
            current.append(char)
            escaped = True
        elif char == "|":
            cells.append("".join(current).strip())
            current = []
        else:
            current.append(char)
    cells.append("".join(current).strip())
    return cells
76
+
77
+
78
def markdown_link_parts(token: str) -> tuple[str, str]:
    """Split a ``[label](target)`` or ``![alt](target)`` token into its parts.

    Returns ``(label, target)`` with the target stripped of surrounding
    whitespace. When the token does not have that shape, returns
    ``(token, "")``.
    """
    parsed = re.fullmatch(r"!?\[([^\]]*)\]\(([^)]+)\)", token)
    if parsed is None:
        return token, ""
    label, target = parsed.groups()
    return label, target.strip()
83
+
84
+
85
def unescape_markdown_text(text: str) -> str:
    """Remove the backslash from backslash-escaped Markdown punctuation.

    Only the characters listed in the pattern are unescaped; a backslash in
    front of any other character is left untouched.
    """
    return re.sub(
        r"\\([\\`*_{}\[\]()#+.!|$-])",
        lambda escaped: escaped.group(1),
        text,
    )
87
+
88
+
89
class HTMLTableParser(HTMLParser):
    """Collect the text of an HTML table as a list of rows of cell strings.

    Both ``<td>`` and ``<th>`` cells are collected, so header rows are kept.
    ``<br>`` inside a cell becomes a newline. Cells and rows whose end tags
    are omitted are closed implicitly when the next cell or row starts, or
    when ``</table>`` is reached.
    """

    _CELL_TAGS = ("td", "th")

    def __init__(self) -> None:
        super().__init__()
        # Completed rows, each a list of stripped cell texts.
        self.rows: list[list[str]] = []
        # Row and cell currently being collected; None when outside one.
        self.current_row: list[str] | None = None
        self.current_cell: list[str] | None = None

    def _finish_cell(self) -> None:
        """Append the open cell, if any, to the open row and close it."""
        if self.current_cell is not None and self.current_row is not None:
            self.current_row.append("".join(self.current_cell).strip())
        self.current_cell = None

    def _finish_row(self) -> None:
        """Close any open cell, then append the open row, if any, to ``rows``."""
        self._finish_cell()
        if self.current_row is not None:
            self.rows.append(self.current_row)
        self.current_row = None

    def handle_starttag(self, tag: str, attrs) -> None:  # type: ignore[override]
        normalized = tag.lower()
        if normalized == "tr":
            # A new row implicitly ends a row whose </tr> was omitted.
            self._finish_row()
            self.current_row = []
        elif normalized in self._CELL_TAGS:
            # A new cell implicitly ends a cell whose end tag was omitted.
            self._finish_cell()
            self.current_cell = []
        elif normalized == "br" and self.current_cell is not None:
            self.current_cell.append("\n")

    def handle_endtag(self, tag: str) -> None:  # type: ignore[override]
        normalized = tag.lower()
        if normalized in self._CELL_TAGS:
            self._finish_cell()
        elif normalized in ("tr", "table"):
            self._finish_row()

    def handle_data(self, data: str) -> None:  # type: ignore[override]
        if self.current_cell is not None:
            self.current_cell.append(data)
117
+
118
+
119
def parse_html_table(markup: str) -> list[list[str]]:
    """Parse HTML table markup into rows of cell text, dropping all-blank rows."""
    table_parser = HTMLTableParser()
    table_parser.feed(markup)
    table_parser.close()

    populated_rows = []
    for row in table_parser.rows:
        # Keep a row only if at least one cell has non-whitespace text.
        if any(cell.strip() for cell in row):
            populated_rows.append(row)
    return populated_rows
124
+
125
+
126
def packaged_template_resource():
    """Return the importlib resource handle for the bundled Word template.

    The template is looked up in the ``templates`` subpackage of this module's
    package, falling back to ``build_corpus.templates`` when ``__package__``
    is empty.
    """
    if __package__:
        package_root = f"{__package__}.templates"
    else:
        package_root = "build_corpus.templates"
    return files(package_root).joinpath(DEFAULT_TEMPLATE_FILENAME)
129
+
130
+
131
def resolve_default_template_path() -> Path | None:
    """Return a ``bundled:<filename>`` marker path when the template package is importable.

    The returned value is a label built from the template file name, not a
    filesystem location. Returns None when looking up the packaged resource
    raises ``ModuleNotFoundError``.
    """
    try:
        packaged_template_resource()
    except ModuleNotFoundError:
        return None
    else:
        return Path(f"bundled:{DEFAULT_TEMPLATE_FILENAME}")
137
+
138
+
139
def set_cell_text(cell, text: str) -> list[str]:
    """Split cell text into non-empty, stripped lines; blank the cell if none remain.

    HTML line breaks in any common spelling (``<br>``, ``<br/>``, ``<br />``,
    any letter case) are treated as newlines.

    Args:
        cell: Table cell object; its ``text`` attribute is set to ``""`` when
            the input contains no visible text.
        text: Raw cell content.

    Returns:
        The non-empty lines, in order. The cell itself is only modified in the
        empty case; the caller renders the returned lines.
    """
    normalized = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
    lines = [segment.strip() for segment in normalized.splitlines()]
    lines = [line for line in lines if line]
    if not lines:
        cell.text = ""
    return lines
146
+
147
+
148
def append_text_with_breaks(paragraph, text: str) -> None:
    """Append unescaped text to a paragraph, turning each newline into a line break.

    Empty segments produce no text run, but the line break between segments is
    always emitted.
    """
    segments = unescape_markdown_text(text).split("\n")
    last_position = len(segments) - 1
    for position, segment in enumerate(segments):
        if segment:
            paragraph.add_run(segment)
        if position != last_position:
            paragraph.add_run().add_break(WD_BREAK.LINE)
156
+
157
+
158
def split_trailing_url_punctuation(url: str) -> tuple[str, str]:
    """Separate sentence punctuation that trails a matched URL.

    Returns ``(core, trailing)`` where ``trailing`` is the run of
    ``. , ; : ! ?`` characters at the end of ``url`` in their original order,
    and ``core`` is everything before it.
    """
    core = url.rstrip(".,;:!?")
    return core, url[len(core):]
165
+
166
+
167
def convert_windows_metafile_to_png(source: Path) -> Path | None:
    """Convert an image to PNG via PowerShell/System.Drawing (Windows only).

    Args:
        source: Path of the image that could not be embedded directly.

    Returns:
        Path of the PNG written under the system temp directory, or None when
        not running on Windows, when PowerShell cannot be started, or when the
        conversion did not produce the file.
    """
    if os.name != "nt":
        return None
    target_dir = Path(tempfile.gettempdir()) / "build-corpus-image-fallbacks"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"{source.stem}.png"
    # The target name depends only on the stem, so a PNG left by an earlier
    # run (or by another image with the same stem) must not be mistaken for
    # the result of this conversion.
    try:
        target.unlink(missing_ok=True)
    except OSError:
        return None
    # Single quotes are doubled because the paths are embedded in
    # single-quoted PowerShell string literals.
    source_literal = str(source).replace("'", "''")
    target_literal = str(target).replace("'", "''")
    command = (
        "Add-Type -AssemblyName System.Drawing; "
        f"$img = [System.Drawing.Image]::FromFile('{source_literal}'); "
        f"$bmp = New-Object System.Drawing.Bitmap $img.Width, $img.Height; "
        "$gfx = [System.Drawing.Graphics]::FromImage($bmp); "
        "$gfx.DrawImage($img, 0, 0, $img.Width, $img.Height); "
        f"$bmp.Save('{target_literal}', [System.Drawing.Imaging.ImageFormat]::Png); "
        "$gfx.Dispose(); $bmp.Dispose(); $img.Dispose()"
    )
    try:
        result = subprocess.run(
            ["powershell", "-NoProfile", "-Command", command],
            capture_output=True,
            text=True,
        )
    except OSError:
        # PowerShell is missing or could not be launched: treat as "no conversion".
        return None
    if result.returncode != 0 or not target.exists():
        return None
    return target
192
+
193
+
194
def set_paragraph_shading(paragraph, fill: str) -> None:
    """Set the ``w:fill`` attribute of the paragraph's ``w:shd`` element.

    The ``w:shd`` element is reused when the paragraph properties already
    contain one; otherwise a new one is appended.
    """
    properties = paragraph._p.get_or_add_pPr()
    shd_element = properties.find(qn("w:shd"))
    if shd_element is None:
        shd_element = OxmlElement("w:shd")
        properties.append(shd_element)
    shd_element.set(qn("w:fill"), fill)
201
+
202
+
203
def set_paragraph_border(paragraph, color: str) -> None:
    """Give the paragraph a single-line left border in the given colour.

    Existing ``w:pBdr`` and ``w:left`` elements are reused; missing ones are
    appended. The border attributes are then overwritten.
    """
    properties = paragraph._p.get_or_add_pPr()

    border_group = properties.find(qn("w:pBdr"))
    if border_group is None:
        border_group = OxmlElement("w:pBdr")
        properties.append(border_group)

    left_edge = border_group.find(qn("w:left"))
    if left_edge is None:
        left_edge = OxmlElement("w:left")
        border_group.append(left_edge)

    for attribute, value in (
        ("w:val", "single"),
        ("w:sz", "10"),
        ("w:space", "12"),
        ("w:color", color),
    ):
        left_edge.set(qn(attribute), value)
217
+
218
+
219
def add_hyperlink(paragraph, text: str, url: str):
    """Append an external hyperlink element to the paragraph and return it.

    A hyperlink relationship to ``url`` is registered on the paragraph's part,
    and one run holding ``text`` (coloured and underlined) is placed inside
    the new ``w:hyperlink`` element. The element is returned so callers can
    append further runs to it.
    """
    relationship_id = paragraph.part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )
    link_element = OxmlElement("w:hyperlink")
    link_element.set(qn("r:id"), relationship_id)

    # Run properties: link colour first, then single underline.
    run_properties = OxmlElement("w:rPr")
    for tag, value in (("w:color", "2563EB"), ("w:u", "single")):
        property_element = OxmlElement(tag)
        property_element.set(qn("w:val"), value)
        run_properties.append(property_element)

    content = OxmlElement("w:t")
    content.text = text

    link_run = OxmlElement("w:r")
    link_run.append(run_properties)
    link_run.append(content)

    link_element.append(link_run)
    paragraph._p.append(link_element)
    return link_element
247
+
248
+
249
def append_hyperlink_run(hyperlink, text: str, bold: bool = False, italic: bool = False, code: bool = False) -> None:
    """Append one styled run to an existing ``w:hyperlink`` element.

    Args:
        hyperlink: The ``w:hyperlink`` element to extend.
        text: Run text.
        bold: Add ``w:b`` to the run properties.
        italic: Add ``w:i`` to the run properties.
        code: Render in Consolas at ``w:sz`` 20.

    Text with leading or trailing whitespace is marked
    ``xml:space="preserve"`` so the spaces between adjacent label fragments
    (for example ``see `` followed by a bold run) are not dropped.
    """
    run = OxmlElement("w:r")
    rpr = OxmlElement("w:rPr")

    color = OxmlElement("w:color")
    color.set(qn("w:val"), "2563EB")
    rpr.append(color)

    underline = OxmlElement("w:u")
    underline.set(qn("w:val"), "single")
    rpr.append(underline)

    if bold:
        rpr.append(OxmlElement("w:b"))
    if italic:
        rpr.append(OxmlElement("w:i"))

    if code:
        fonts = OxmlElement("w:rFonts")
        for font_slot in ("w:ascii", "w:hAnsi", "w:cs"):
            fonts.set(qn(font_slot), "Consolas")
        rpr.append(fonts)
        size = OxmlElement("w:sz")
        size.set(qn("w:val"), "20")
        rpr.append(size)

    run.append(rpr)
    text_node = OxmlElement("w:t")
    if text != text.strip():
        # Without this attribute, consumers may trim edge whitespace of w:t.
        text_node.set(qn("xml:space"), "preserve")
    text_node.text = text
    run.append(text_node)
    hyperlink.append(run)
282
+
283
+
284
def set_picture_metadata(run, source_name: str) -> None:
    """Label every ``docPr`` element under the run with the image's file name.

    The ``name``, ``descr`` and ``title`` attributes are all set to the final
    path component of ``source_name``. If the XPath lookup fails for any
    reason, nothing is changed.
    """
    label = Path(source_name).name
    try:
        picture_properties = run._r.xpath(".//*[local-name()='docPr']")
    except Exception:
        # Best effort: metadata is optional, so a failed lookup is ignored.
        picture_properties = []
    for element in picture_properties:
        for attribute in ("name", "descr", "title"):
            element.set(attribute, label)
294
+
295
+
296
class MarkdownToDocxExporter:
    """Render one Markdown file to a .docx document and write a JSON export report.

    The Markdown is parsed line by line into headings, fenced code, ``$$``
    equations, pipe and HTML tables, lists, blockquotes and paragraphs.
    When a Word template is available its style-related package parts are
    copied into the saved document afterwards; otherwise built-in styles are
    configured directly on the document.
    """

    def __init__(
        self,
        input_path: Path,
        output_dir: Path,
        output_docx: Path | None = None,
        report_path: Path | None = None,
        template_path: Path | None = None,
    ):
        """Store paths and resolve the template.

        Args:
            input_path: Markdown file to export.
            output_dir: Directory created on export; default location of outputs.
            output_docx: Target document; defaults to ``<output_dir>/<input stem>.docx``.
            report_path: Target report; defaults to ``<output_dir>/export-report.json``.
            template_path: Explicit Word template; when None the bundled template
                is used if it can be found.
        """
        self.input_path = input_path
        self.output_dir = output_dir
        self.output_docx = output_docx or (output_dir / f"{input_path.stem}.docx")
        self.report_path = report_path or (output_dir / "export-report.json")
        # Keeps a bundled template extracted by importlib alive until export() ends.
        self._template_resource_stack = ExitStack()
        self.template_path = self.resolve_template_path(template_path)
        self.use_template_styles = self.template_path is not None
        self.stats = WordExportStats()

    def export(self) -> dict:
        """Run the export and return the report that was written to disk.

        Returns:
            Dict with ``input``, ``output``, ``template`` and ``stats`` keys.

        Raises:
            FileNotFoundError: If a template path is set but does not exist.
        """
        try:
            self.output_dir.mkdir(parents=True, exist_ok=True)
            # Explicit output/report paths are not required to live in output_dir.
            Path(self.output_docx).parent.mkdir(parents=True, exist_ok=True)
            Path(self.report_path).parent.mkdir(parents=True, exist_ok=True)
            if self.template_path and not self.template_path.exists():
                raise FileNotFoundError(f"Word template not found: {self.template_path}")

            document = Document()
            if not self.template_path:
                self.apply_modern_styles(document)
            self.ensure_custom_styles(document)

            markdown = self.input_path.read_text(encoding="utf-8")
            self.render_markdown(document, markdown)
            document.save(self.output_docx)
            if self.template_path:
                self.apply_template_package(self.output_docx, self.template_path)

            report = {
                "input": str(self.input_path),
                "output": str(self.output_docx),
                "template": str(self.template_path) if self.template_path else None,
                "stats": self.stats.__dict__,
            }
            self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
            return report
        finally:
            self._template_resource_stack.close()

    def resolve_template_path(self, template_path: Path | None) -> Path | None:
        """Return the template to use: the explicit one, the bundled one, or None.

        An explicit path is returned as given (its existence is checked in
        ``export``). The bundled template is only returned when it resolves to
        an existing file, so a package shipped without the template falls back
        to the built-in styles instead of failing later.
        """
        if template_path is not None:
            return Path(template_path)

        try:
            bundled = self._template_resource_stack.enter_context(as_file(packaged_template_resource()))
        except (FileNotFoundError, ModuleNotFoundError):
            return None
        # as_file() can hand back a plain filesystem path without checking it.
        return bundled if bundled.is_file() else None

    def apply_modern_styles(self, document: Document) -> None:
        """Configure page margins, the Normal style and Heading 1-4, then custom styles."""
        section = document.sections[0]
        section.top_margin = Inches(0.8)
        section.bottom_margin = Inches(0.8)
        section.left_margin = Inches(0.9)
        section.right_margin = Inches(0.9)

        normal = document.styles["Normal"]
        normal.font.name = "Aptos"
        normal.font.size = Pt(11)
        normal.font.color.rgb = RGBColor(31, 41, 55)
        normal.paragraph_format.space_after = Pt(8)
        normal.paragraph_format.line_spacing = 1.15

        for level, size in ((1, 22), (2, 17), (3, 14), (4, 12)):
            style = document.styles[f"Heading {level}"]
            style.font.name = "Aptos Display"
            style.font.size = Pt(size)
            style.font.bold = True
            style.font.color.rgb = RGBColor(15, 23, 42)
            style.paragraph_format.space_before = Pt(12 if level == 1 else 10)
            style.paragraph_format.space_after = Pt(4)

        self.ensure_custom_styles(document)

    def ensure_custom_styles(self, document: Document) -> None:
        """Add the "BuildCorpus Code" and "BuildCorpus Quote" styles if missing.

        Does nothing when template styles are in use.
        """
        if self.use_template_styles:
            return
        if "BuildCorpus Code" not in document.styles:
            style = document.styles.add_style("BuildCorpus Code", WD_STYLE_TYPE.PARAGRAPH)
            style.base_style = document.styles["Normal"]
            style.font.name = "Consolas"
            style.font.size = Pt(10)
            style.paragraph_format.left_indent = Inches(0.2)
            style.paragraph_format.right_indent = Inches(0.2)
            style.paragraph_format.space_before = Pt(4)
            style.paragraph_format.space_after = Pt(6)

        if "BuildCorpus Quote" not in document.styles:
            style = document.styles.add_style("BuildCorpus Quote", WD_STYLE_TYPE.PARAGRAPH)
            style.base_style = document.styles["Normal"]
            style.font.italic = True
            style.font.color.rgb = RGBColor(71, 85, 105)
            style.paragraph_format.left_indent = Inches(0.35)
            style.paragraph_format.space_after = Pt(6)

    def render_markdown(self, document: Document, markdown: str) -> None:
        """Walk the Markdown line by line and emit the matching document blocks.

        Block types are tested in a fixed order: blank line, fenced code,
        ``$$`` equation (multi-line, then single-line), heading, pipe table,
        ``<table>`` HTML table, list, blockquote, horizontal rule; anything
        else starts a paragraph.
        """
        lines = markdown.splitlines()
        total = len(lines)

        def separator_follows(position: int) -> bool:
            """Tell whether the line after ``position`` is a pipe-table separator."""
            return position + 1 < total and bool(TABLE_SEPARATOR_RE.match(lines[position + 1]))

        index = 0
        while index < total:
            line = lines[index]
            stripped = line.strip()

            if not stripped:
                index += 1
                continue

            if stripped.startswith("```"):
                info = strip_fence(line)
                code_lines: list[str] = []
                index += 1
                while index < total and not lines[index].strip().startswith("```"):
                    code_lines.append(lines[index])
                    index += 1
                if index < total:
                    index += 1  # skip the closing fence
                self.add_code_block(document, "\n".join(code_lines), info)
                continue

            if stripped == "$$":
                equation_lines: list[str] = []
                index += 1
                while index < total and lines[index].strip() != "$$":
                    equation_lines.append(lines[index])
                    index += 1
                if index < total:
                    index += 1  # skip the closing $$
                self.add_equation_block(document, "\n".join(equation_lines).strip())
                continue

            if stripped.startswith("$$") and stripped.endswith("$$") and len(stripped) > 4:
                self.add_equation_block(document, stripped[2:-2].strip())
                index += 1
                continue

            if stripped.startswith("#"):
                level = len(stripped) - len(stripped.lstrip("#"))
                self.add_heading(document, level, stripped[level:].strip())
                index += 1
                continue

            if separator_follows(index):
                table_lines = [line, lines[index + 1]]
                index += 2
                # Body rows continue for as long as lines contain a pipe.
                while index < total and "|" in lines[index]:
                    table_lines.append(lines[index])
                    index += 1
                self.add_table(document, table_lines)
                continue

            if stripped.lower() == "<table>":
                table_lines = [line]
                index += 1
                while index < total:
                    table_lines.append(lines[index])
                    if lines[index].strip().lower() == "</table>":
                        index += 1
                        break
                    index += 1
                self.add_html_table(document, "\n".join(table_lines))
                continue

            if LIST_ITEM_RE.match(line):
                index = self.add_list(document, lines, index)
                continue

            if stripped.startswith(">"):
                quote_lines: list[str] = []
                while index < total and lines[index].strip().startswith(">"):
                    quote_lines.append(lines[index].strip()[1:].strip())
                    index += 1
                self.add_blockquote(document, " ".join(quote_lines))
                continue

            if re.fullmatch(r"[-*_]{3,}", stripped):
                document.add_paragraph("")
                index += 1
                continue

            # Paragraph: gather lines until a blank line or another block starts.
            # NOTE(review): a single trailing space is treated as a hard break
            # here; CommonMark requires two — confirm this is intended.
            paragraph_lines = [line.rstrip()]
            paragraph_breaks = [line.endswith((" ", "\\"))]
            index += 1
            while index < total:
                candidate = lines[index]
                if not candidate.strip():
                    break
                if candidate.strip().startswith(("```", "#", ">")):
                    break
                if LIST_ITEM_RE.match(candidate):
                    break
                if separator_follows(index):
                    break
                if lines[index - 1].endswith("\\"):
                    # The previous line's trailing backslash marks a hard break;
                    # drop it so it is not rendered as literal text.
                    paragraph_lines[-1] = paragraph_lines[-1][:-1].rstrip()
                paragraph_lines.append(candidate.rstrip())
                paragraph_breaks.append(candidate.endswith((" ", "\\")))
                index += 1
            self.add_paragraph(document, self.combine_paragraph_lines(paragraph_lines, paragraph_breaks))

    @staticmethod
    def combine_paragraph_lines(lines: list[str], breaks: list[bool]) -> str:
        """Join paragraph lines, using a newline after lines flagged as hard breaks.

        ``breaks[i]`` decides the separator placed after ``lines[i]``: newline
        when true, a single space otherwise. Returns ``""`` for no lines.
        """
        if not lines:
            return ""
        pieces = [lines[0]]
        for position in range(1, len(lines)):
            pieces.append("\n" if breaks[position - 1] else " ")
            pieces.append(lines[position])
        return "".join(pieces)

    @staticmethod
    def apply_template_package(output_docx: Path, template_path: Path) -> None:
        """Replace style-related parts of a saved docx with those of the template.

        Parts listed in ``transplant_parts`` are taken from the template when
        the template has them (whether or not the output already had them);
        every other part of the output is copied unchanged. The rewritten
        archive replaces ``output_docx``.
        """
        from zipfile import ZIP_DEFLATED

        transplant_parts = {
            "word/styles.xml",
            "word/stylesWithEffects.xml",
            "word/numbering.xml",
            "word/fontTable.xml",
            "word/settings.xml",
            "word/webSettings.xml",
            "word/theme/theme1.xml",
        }
        with tempfile.TemporaryDirectory(prefix="build-corpus-template-") as tmp:
            patched = Path(tmp) / output_docx.name
            # Compress explicitly: ZipFile's default for mode "w" stores entries uncompressed.
            with ZipFile(output_docx) as out_zip, ZipFile(template_path) as template_zip, ZipFile(
                patched, "w", compression=ZIP_DEFLATED
            ) as patched_zip:
                template_names = set(template_zip.namelist())
                output_names = set(out_zip.namelist())
                for name in out_zip.namelist():
                    if name in transplant_parts and name in template_names:
                        patched_zip.writestr(name, template_zip.read(name))
                    else:
                        patched_zip.writestr(name, out_zip.read(name))
                # Sorted so the archive layout does not depend on set ordering.
                for name in sorted(transplant_parts):
                    if name in template_names and name not in output_names:
                        patched_zip.writestr(name, template_zip.read(name))
            # Both archives are closed here, so the original can be replaced.
            shutil.move(str(patched), output_docx)

    def add_heading(self, document: Document, level: int, text: str) -> None:
        """Add a heading paragraph using style "Heading N", with N capped at 6."""
        paragraph = document.add_paragraph(style=f"Heading {min(level, 6)}")
        self.render_inline(paragraph, text)
        self.stats.headings += 1

    def add_paragraph(self, document: Document, text: str) -> None:
        """Add a "Normal" paragraph with inline formatting rendered."""
        paragraph = document.add_paragraph(style="Normal")
        self.render_inline(paragraph, text)
        self.stats.paragraphs += 1

    def add_code_block(self, document: Document, code: str, info: str) -> None:
        """Add a shaded Consolas paragraph for fenced code, with an optional bold info label."""
        paragraph = document.add_paragraph(style="Normal" if self.use_template_styles else "BuildCorpus Code")
        if info:
            label = paragraph.add_run(f"{info}\n")
            label.bold = True
            label.font.color.rgb = RGBColor(37, 99, 235)
        run = paragraph.add_run(code)
        run.font.name = "Consolas"
        run.font.size = Pt(10)
        set_paragraph_shading(paragraph, "F8FAFC")
        self.stats.code_blocks += 1

    def add_equation_block(self, document: Document, equation: str) -> None:
        """Add the equation source as an indented Cambria Math paragraph."""
        paragraph = document.add_paragraph(style="Normal")
        paragraph.paragraph_format.left_indent = Inches(0.3)
        paragraph.paragraph_format.right_indent = Inches(0.3)
        paragraph.paragraph_format.space_before = Pt(4)
        paragraph.paragraph_format.space_after = Pt(8)
        run = paragraph.add_run(equation)
        run.font.name = "Cambria Math"
        run.font.size = Pt(11)
        self.stats.equations += 1

    def add_blockquote(self, document: Document, text: str) -> None:
        """Add a quote-styled paragraph with a left border."""
        paragraph = document.add_paragraph(style="Quote" if self.use_template_styles else "BuildCorpus Quote")
        self.render_inline(paragraph, text)
        set_paragraph_border(paragraph, "CBD5E1")
        self.stats.blockquotes += 1

    def add_list(self, document: Document, lines: list[str], start: int) -> int:
        """Render consecutive list items beginning at ``start``.

        Each item absorbs following indented continuation lines, which are
        joined to the item body with newlines.

        Returns:
            Index of the first line that is not part of the list.
        """
        index = start
        while index < len(lines):
            match = LIST_ITEM_RE.match(lines[index])
            if not match:
                break
            indent, marker, body = match.groups()
            ordered = marker.endswith(".")
            body_lines = [body]
            lookahead = index + 1
            while lookahead < len(lines):
                candidate = lines[lookahead]
                if not candidate.strip():
                    break
                if candidate.strip().startswith(("```", "#", ">")):
                    break
                if LIST_ITEM_RE.match(candidate):
                    break
                if lookahead + 1 < len(lines) and TABLE_SEPARATOR_RE.match(lines[lookahead + 1]):
                    break
                # Only indented lines continue the current item.
                if not candidate[:1].isspace():
                    break
                body_lines.append(candidate.rstrip())
                lookahead += 1
            style_name = self.list_style_name(document, ordered, indent)
            paragraph = document.add_paragraph(style=style_name)
            if style_name in {"List Bullet", "List Number"}:
                # Base styles carry no nesting, so indent manually by depth.
                paragraph.paragraph_format.left_indent = Inches(0.25 + (len(indent.replace("\t", " ")) // 2) * 0.18)
            self.render_inline(paragraph, "\n".join(body_lines))
            self.stats.lists += 1
            index = lookahead
        return index

    def list_style_name(self, document: Document, ordered: bool, indent: str) -> str:
        """Pick "List Bullet"/"List Number" (optionally " 2"/" 3") for the indent depth.

        Falls back to the base style when the level-specific style is not
        present in the document.
        """
        level = min(3, max(1, (len(indent.replace("\t", " ")) // 2) + 1))
        base = "List Number" if ordered else "List Bullet"
        candidate = base if level == 1 else f"{base} {level}"
        return candidate if candidate in document.styles else base

    def add_table(self, document: Document, table_lines: list[str]) -> None:
        """Render a pipe table; separator lines are dropped before splitting rows."""
        rows = [split_table_row(line) for line in table_lines if not TABLE_SEPARATOR_RE.match(line)]
        self.add_table_rows(document, rows)

    def add_html_table(self, document: Document, table_markup: str) -> None:
        """Render an HTML ``<table>`` block."""
        self.add_table_rows(document, parse_html_table(table_markup))

    def add_table_rows(self, document: Document, rows: list[list[str]]) -> None:
        """Create a table sized to the widest row; short rows are padded with empty cells."""
        if not rows:
            return
        width = max(len(row) for row in rows)
        table = document.add_table(rows=len(rows), cols=width)
        table.style = "Light List Accent 1" if "Light List Accent 1" in document.styles else "Table Grid"
        for row_index, row in enumerate(rows):
            for col_index in range(width):
                value = row[col_index] if col_index < len(row) else ""
                self.render_table_cell(table.cell(row_index, col_index), value)
        self.stats.tables += 1

    def render_table_cell(self, cell, text: str) -> None:
        """Render cell text, one paragraph per non-empty line."""
        lines = set_cell_text(cell, text)
        if not lines:
            cell.text = ""
            return
        first = cell.paragraphs[0]
        first.text = ""
        self.render_inline(first, lines[0])
        for line in lines[1:]:
            paragraph = cell.add_paragraph("")
            self.render_inline(paragraph, line)

    def render_inline(self, paragraph, text: str) -> None:
        """Render text into the paragraph, dispatching inline tokens and plain spans."""
        cursor = 0
        for match in INLINE_TOKEN_RE.finditer(text):
            if match.start() > cursor:
                self.render_plain_text(paragraph, text[cursor:match.start()])
            self.render_inline_token(paragraph, match.group(0))
            cursor = match.end()
        if cursor < len(text):
            self.render_plain_text(paragraph, text[cursor:])

    def render_plain_text(self, paragraph, text: str) -> None:
        """Render plain text, turning bare URLs into hyperlinks.

        Sentence punctuation trailing a URL is rendered as text after the link.
        """
        cursor = 0
        for match in BARE_URL_RE.finditer(text):
            if match.start() > cursor:
                append_text_with_breaks(paragraph, text[cursor:match.start()])
            normalized_url, trailing = split_trailing_url_punctuation(match.group(0))
            if normalized_url:
                add_hyperlink(paragraph, normalized_url, normalized_url)
            if trailing:
                append_text_with_breaks(paragraph, trailing)
            cursor = match.end()
        if cursor < len(text):
            append_text_with_breaks(paragraph, text[cursor:])

    def _render_image(self, paragraph, token: str) -> None:
        """Embed a local image, or add a placeholder run and a warning."""
        alt, target = markdown_link_parts(token)
        # Check for remote/data targets before touching the filesystem: such
        # strings are not paths and resolving them can raise.
        if target.startswith(("http://", "https://", "data:")):
            paragraph.add_run(f"[image: {alt or target}]")
            self.stats.warnings.append(f"Skipped non-local image target: {target}")
            return

        image_path = (self.input_path.parent / target).resolve()
        # is_file() rather than exists(): a directory target cannot be embedded.
        if not image_path.is_file():
            paragraph.add_run(f"[missing image: {target}]")
            self.stats.warnings.append(f"Missing image asset: {target}")
            return

        run = paragraph.add_run()
        try:
            run.add_picture(str(image_path), width=Inches(5.8))
        except UnrecognizedImageError:
            converted = convert_windows_metafile_to_png(image_path)
            if converted is None:
                paragraph.add_run(f"[unsupported image: {target}]")
                self.stats.warnings.append(f"Unsupported image asset: {target}")
                return
            run.add_picture(str(converted), width=Inches(5.8))
            self.stats.warnings.append(f"Converted unsupported image to PNG: {target}")
        set_picture_metadata(run, target)
        self.stats.images += 1

    def render_inline_token(self, paragraph, token: str) -> None:
        """Render one inline token: image, link, code, math or emphasis.

        Emphasis text is unescaped the same way link labels are, so
        ``**a\\_b**`` renders without the backslash. Code and math content is
        emitted verbatim.
        """
        if token.startswith("!["):
            self._render_image(paragraph, token)
            return

        if token.startswith("["):
            label, target = markdown_link_parts(token)
            hyperlink = add_hyperlink(paragraph, "", target)
            self.render_hyperlink_label(hyperlink, label)
            return

        if token.startswith("`"):
            run = paragraph.add_run(token[1:-1])
            run.font.name = "Consolas"
            run.font.size = Pt(10)
            return

        if token.startswith("$$") and token.endswith("$$"):
            run = paragraph.add_run(token[2:-2])
            run.font.name = "Cambria Math"
            run.font.size = Pt(11)
            self.stats.equations += 1
            return

        if token.startswith("$") and token.endswith("$"):
            run = paragraph.add_run(token[1:-1])
            run.font.name = "Cambria Math"
            run.font.size = Pt(11)
            self.stats.equations += 1
            return

        if token.startswith("***") and token.endswith("***"):
            run = paragraph.add_run(unescape_markdown_text(token[3:-3]))
            run.bold = True
            run.italic = True
            return

        if token.startswith("**") and token.endswith("**"):
            run = paragraph.add_run(unescape_markdown_text(token[2:-2]))
            run.bold = True
            return

        if token.startswith("*") and token.endswith("*"):
            run = paragraph.add_run(unescape_markdown_text(token[1:-1]))
            run.italic = True
            return

        paragraph.add_run(token)

    def render_hyperlink_label(self, hyperlink, text: str) -> None:
        """Render a link label into the hyperlink, keeping inline emphasis and code."""
        cursor = 0
        for match in INLINE_TOKEN_RE.finditer(text):
            if match.start() > cursor:
                append_hyperlink_run(hyperlink, unescape_markdown_text(text[cursor:match.start()]))
            self.render_hyperlink_token(hyperlink, match.group(0))
            cursor = match.end()
        if cursor < len(text):
            append_hyperlink_run(hyperlink, unescape_markdown_text(text[cursor:]))

    def render_hyperlink_token(self, hyperlink, token: str) -> None:
        """Render one inline token inside a link label; unknown tokens become plain text."""
        if token.startswith("`") and token.endswith("`"):
            append_hyperlink_run(hyperlink, unescape_markdown_text(token[1:-1]), code=True)
            return
        if token.startswith("***") and token.endswith("***"):
            append_hyperlink_run(hyperlink, unescape_markdown_text(token[3:-3]), bold=True, italic=True)
            return
        if token.startswith("**") and token.endswith("**"):
            append_hyperlink_run(hyperlink, unescape_markdown_text(token[2:-2]), bold=True)
            return
        if token.startswith("*") and token.endswith("*"):
            append_hyperlink_run(hyperlink, unescape_markdown_text(token[1:-1]), italic=True)
            return
        append_hyperlink_run(hyperlink, unescape_markdown_text(token))
774
+
775
+
776
def export_markdown_to_docx(
    input_path: Path,
    output_root: Path,
    out_same_dir: bool,
    template_path: Path | None = None,
) -> dict:
    """Export one Markdown file to .docx and return the exporter's report.

    Args:
        input_path: Markdown file to convert.
        output_root: Root under which a per-input directory (named after the
            input stem) is used when ``out_same_dir`` is false.
        out_same_dir: When true, write ``<stem>.docx`` and
            ``<stem>.export-report.json`` next to the input file instead.
        template_path: Optional Word template forwarded to the exporter.
    """
    # None lets the exporter derive its default file names inside output_dir.
    destination = {"output_dir": output_root / input_path.stem, "output_docx": None, "report_path": None}
    if out_same_dir:
        destination = {
            "output_dir": input_path.parent,
            "output_docx": input_path.with_suffix(".docx"),
            "report_path": input_path.with_name(f"{input_path.stem}.export-report.json"),
        }

    return MarkdownToDocxExporter(
        input_path=input_path,
        template_path=template_path,
        **destination,
    ).export()