regen.mde 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +16 -0
- package/README.md +295 -0
- package/bin/build-corpus-editor.js +81 -0
- package/bin/build-corpus.js +41 -0
- package/bin/postinstall.js +187 -0
- package/bin/regen-mdeditor-install.js +27 -0
- package/bin/regen-mdeditor-uninstall.js +19 -0
- package/bin/validate-katex.js +93 -0
- package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +270 -0
- package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -0
- package/desktop/BuildCorpusEditor/EditorForm.cs +540 -0
- package/desktop/BuildCorpusEditor/Program.cs +81 -0
- package/desktop/BuildCorpusEditor/app.manifest +16 -0
- package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
- package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
- package/dist/windows-editor/BuildCorpusEditor.deps.json +83 -0
- package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
- package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
- package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
- package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +19 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Core.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Core.xml +6817 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.xml +510 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.xml +1902 -0
- package/dist/windows-editor/WebView2Loader.dll +0 -0
- package/dist/windows-editor/runtimes/win-x64/native/WebView2Loader.dll +0 -0
- package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +326 -0
- package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +1 -0
- package/dist/windows-editor/wwwroot/index.html +22 -0
- package/editor-web/index.html +21 -0
- package/editor-web/src/main.jsx +399 -0
- package/editor-web/src/styles.css +602 -0
- package/editor-web/vite.config.js +13 -0
- package/examples/build-corpus.config.example.json +21 -0
- package/installer/install-regen-mde.ps1 +175 -0
- package/installer/regen-mde.nsi +81 -0
- package/package.json +86 -0
- package/pyproject.toml +33 -0
- package/requirements.txt +4 -0
- package/scripts/build-windows-editor.ps1 +47 -0
- package/scripts/package-windows-editor.ps1 +90 -0
- package/scripts/run-corpus.ps1 +28 -0
- package/scripts/run-editor-implementation-plane.ps1 +203 -0
- package/scripts/run-required-tests.ps1 +98 -0
- package/scripts/run-smoke.ps1 +28 -0
- package/src/build_corpus/__init__.py +3 -0
- package/src/build_corpus/docx_exporter.py +798 -0
- package/src/build_corpus/exporter.py +1195 -0
- package/src/build_corpus/ppt_exporter.py +532 -0
- package/src/build_corpus/templates/__init__.py +1 -0
- package/src/build_corpus/templates/md-to-word-template.dotx +0 -0
- package/src/build_corpus/validate_assets.py +46 -0
- package/tools/audit_corpus.py +203 -0
- package/tools/collect_microsoft_word_templates.py +228 -0
- package/tools/collect_online_docx_corpus.py +272 -0
- package/tools/collect_online_pptx_corpus.py +252 -0
- package/tools/compare_pptx_inputs_outputs.py +87 -0
- package/tools/roundtrip_docx_corpus.py +171 -0
|
@@ -0,0 +1,798 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import tempfile
|
|
9
|
+
from html.parser import HTMLParser
|
|
10
|
+
from contextlib import ExitStack
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from importlib.resources import as_file, files
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from zipfile import ZipFile
|
|
15
|
+
|
|
16
|
+
from docx import Document
|
|
17
|
+
from docx.enum.style import WD_STYLE_TYPE
|
|
18
|
+
from docx.enum.text import WD_BREAK
|
|
19
|
+
from docx.oxml import OxmlElement
|
|
20
|
+
from docx.oxml.ns import qn
|
|
21
|
+
from docx.shared import Inches, Pt, RGBColor
|
|
22
|
+
from docx.image.exceptions import UnrecognizedImageError
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Inline Markdown tokens, tried left-to-right in this order.
INLINE_TOKEN_RE = re.compile(
    r"(!\[[^\]]*\]\([^)]+\)"  # image: ![alt](target)
    r"|\[[^\]]+\]\([^)]+\)"  # link: [label](target)
    r"|`[^`]+`"  # inline code
    r"|\$\$[^$]+\$\$"  # $$...$$ math
    r"|\$[^$\n]+\$"  # $...$ math (single line)
    r"|\*\*\*.+?\*\*\*"  # ***...***
    r"|\*\*.+?\*\*"  # **...**
    r"|\*.+?\*)"  # *...*
)

# http(s) URLs in running text; balanced "(...)" groups may be part of the URL.
BARE_URL_RE = re.compile(r"https?://[^\s<>()]+(?:\([^\s<>()]*\)[^\s<>()]*)*")

# List item: (indent, marker, body) where marker is -, *, + or "N.".
LIST_ITEM_RE = re.compile(r"^(\s*)([-*+]|\d+\.)\s+(.*)$")

# Pipe-table separator row such as |---|:---:|; needs at least two columns.
TABLE_SEPARATOR_RE = re.compile(r"^\s*\|?(?:\s*:?-{3,}:?\s*\|)+\s*:?-{3,}:?\s*\|?\s*$")

# File name of the Word template looked up in the package's templates folder.
DEFAULT_TEMPLATE_FILENAME = "md-to-word-template.dotx"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class WordExportStats:
    """Tally of the block kinds emitted during one export, plus warnings."""

    # Counters bumped by the exporter each time it emits that kind of block.
    paragraphs: int = 0
    headings: int = 0
    lists: int = 0
    tables: int = 0
    code_blocks: int = 0
    blockquotes: int = 0
    images: int = 0
    equations: int = 0
    # Free-form warning messages; every instance gets its own list.
    warnings: list[str] = field(default_factory=list)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def strip_fence(line: str) -> str:
    """Return the info string that follows a backtick code-fence opener.

    All leading backticks are removed, so a fence written with more than
    three backticks no longer leaks a stray backtick into the info string.
    """
    return line.strip().lstrip("`").strip()
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def split_table_row(line: str) -> list[str]:
    """Split one pipe-table row into stripped cell strings.

    One leading and one trailing ``|`` delimiter are dropped. A pipe preceded
    by a backslash is cell content, not a separator; the backslash is kept in
    the cell so later unescaping can remove it.
    """
    text = line.strip()
    if text.startswith("|"):
        text = text[1:]
    if text.endswith("|"):
        # An odd run of backslashes before the final pipe means the pipe is
        # escaped cell content, so it must not be removed as a delimiter.
        preceding_backslashes = len(text) - 1 - len(text[:-1].rstrip("\\"))
        if preceding_backslashes % 2 == 0:
            text = text[:-1]

    cells: list[str] = []
    buffer: list[str] = []
    escaped = False
    for char in text:
        if escaped:
            buffer.append(char)
            escaped = False
        elif char == "\\":
            buffer.append(char)
            escaped = True
        elif char == "|":
            cells.append("".join(buffer).strip())
            buffer.clear()
        else:
            buffer.append(char)
    cells.append("".join(buffer).strip())
    return cells
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def markdown_link_parts(token: str) -> tuple[str, str]:
    """Split ``[label](target)`` or ``![alt](target)`` into ``(label, target)``.

    The target is whitespace-stripped. A token that is not of that shape is
    returned unchanged, paired with an empty target.
    """
    parsed = re.fullmatch(r"!?\[([^\]]*)\]\(([^)]+)\)", token)
    if parsed is None:
        return token, ""
    label, target = parsed.groups()
    return label, target.strip()
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def unescape_markdown_text(text: str) -> str:
    """Remove the backslash from backslash-escaped Markdown punctuation."""
    escaped_punctuation = re.compile(r"\\([\\`*_{}\[\]()#+.!|$-])")
    return escaped_punctuation.sub(lambda hit: hit.group(1), text)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
class HTMLTableParser(HTMLParser):
    """Collect the text of an HTML table as rows of cell strings.

    After feeding markup, ``rows`` holds one list per closed ``<tr>``, each
    containing the stripped text of its cells. Header cells (``<th>``) are
    collected the same way as ``<td>`` cells so header rows are not lost.
    ``<br>`` inside a cell becomes a newline.
    """

    _CELL_TAGS = ("td", "th")

    def __init__(self) -> None:
        super().__init__()
        self.rows: list[list[str]] = []
        # None means "not currently inside a row / cell".
        self.current_row: list[str] | None = None
        self.current_cell: list[str] | None = None

    def handle_starttag(self, tag: str, attrs) -> None:  # type: ignore[override]
        normalized = tag.lower()
        if normalized == "tr":
            self.current_row = []
        elif normalized in self._CELL_TAGS:
            self.current_cell = []
        elif normalized == "br" and self.current_cell is not None:
            self.current_cell.append("\n")

    def handle_endtag(self, tag: str) -> None:  # type: ignore[override]
        normalized = tag.lower()
        if normalized in self._CELL_TAGS and self.current_row is not None and self.current_cell is not None:
            self.current_row.append("".join(self.current_cell).strip())
            self.current_cell = None
        elif normalized == "tr" and self.current_row is not None:
            self.rows.append(self.current_row)
            self.current_row = None

    def handle_data(self, data: str) -> None:  # type: ignore[override]
        # Text outside any cell is ignored.
        if self.current_cell is not None:
            self.current_cell.append(data)
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def parse_html_table(markup: str) -> list[list[str]]:
    """Parse HTML table markup into rows of cell text, skipping blank rows.

    A row is kept only if at least one of its cells has non-whitespace text.
    """
    table_parser = HTMLTableParser()
    table_parser.feed(markup)
    table_parser.close()

    kept_rows = []
    for cells in table_parser.rows:
        if any(text.strip() for text in cells):
            kept_rows.append(cells)
    return kept_rows
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def packaged_template_resource():
    """Return the ``importlib.resources`` handle of the bundled Word template."""
    if __package__:
        templates_package = f"{__package__}.templates"
    else:
        # No package context for this module: use the fixed package name.
        templates_package = "build_corpus.templates"
    return files(templates_package).joinpath(DEFAULT_TEMPLATE_FILENAME)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def resolve_default_template_path() -> Path | None:
    """Return a ``bundled:<filename>`` placeholder path, or None.

    None is returned when looking up the packaged template raises
    ModuleNotFoundError. The returned path is a label, not a real file path.
    """
    try:
        packaged_template_resource()
    except ModuleNotFoundError:
        return None
    else:
        return Path(f"bundled:{DEFAULT_TEMPLATE_FILENAME}")
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def set_cell_text(cell, text: str) -> list[str]:
    """Split cell text into its non-empty, stripped lines.

    ``<br>`` tags in any common spelling (``<br>``, ``<br/>``, ``<br />``,
    any letter case) count as line breaks, as do real newlines. When no
    content remains, ``cell.text`` is blanked and an empty list is returned;
    otherwise the cell is left untouched and the lines are returned for the
    caller to render.
    """
    normalized = re.sub(r"<br\s*/?>", "\n", text, flags=re.IGNORECASE)
    lines = [segment.strip() for segment in normalized.splitlines()]
    lines = [segment for segment in lines if segment]
    if not lines:
        cell.text = ""
    return lines
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def append_text_with_breaks(paragraph, text: str) -> None:
    """Append unescaped text to a paragraph, turning newlines into line breaks.

    Each non-empty newline-separated segment becomes its own run; a run
    holding a line break is inserted between consecutive segments.
    """
    segments = unescape_markdown_text(text).split("\n")
    final_position = len(segments) - 1
    for position, segment in enumerate(segments):
        if segment:
            paragraph.add_run(segment)
        if position != final_position:
            paragraph.add_run().add_break(WD_BREAK.LINE)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def split_trailing_url_punctuation(url: str) -> tuple[str, str]:
    """Separate sentence punctuation (``.,;:!?``) from the end of a URL.

    Returns ``(url_without_trailing_punctuation, removed_punctuation)``.
    """
    core = url.rstrip(".,;:!?")
    return core, url[len(core):]
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def convert_windows_metafile_to_png(source: Path) -> Path | None:
    """Best-effort conversion of an image to PNG through PowerShell/System.Drawing.

    Only attempted on Windows (``os.name == "nt"``). The PNG is written to a
    ``build-corpus-image-fallbacks`` folder under the system temp directory,
    named after the source file's stem.

    Returns the PNG path on success, or None when not on Windows, when
    PowerShell cannot be started, or when the conversion fails.
    """
    if os.name != "nt":
        return None
    target_dir = Path(tempfile.gettempdir()) / "build-corpus-image-fallbacks"
    target_dir.mkdir(parents=True, exist_ok=True)
    target = target_dir / f"{source.stem}.png"
    # Remove a leftover PNG from an earlier run (same stem) so that a failed
    # conversion can never be mistaken for success via a stale file.
    try:
        target.unlink(missing_ok=True)
    except OSError:
        return None
    # Single quotes are doubled because the paths are embedded in
    # single-quoted PowerShell string literals.
    source_literal = str(source).replace("'", "''")
    target_literal = str(target).replace("'", "''")
    command = (
        "Add-Type -AssemblyName System.Drawing; "
        f"$img = [System.Drawing.Image]::FromFile('{source_literal}'); "
        "$bmp = New-Object System.Drawing.Bitmap $img.Width, $img.Height; "
        "$gfx = [System.Drawing.Graphics]::FromImage($bmp); "
        "$gfx.DrawImage($img, 0, 0, $img.Width, $img.Height); "
        f"$bmp.Save('{target_literal}', [System.Drawing.Imaging.ImageFormat]::Png); "
        "$gfx.Dispose(); $bmp.Dispose(); $img.Dispose()"
    )
    try:
        result = subprocess.run(
            ["powershell", "-NoProfile", "-Command", command],
            capture_output=True,
            text=True,
        )
    except OSError:
        # PowerShell missing or not executable: treat as "cannot convert".
        return None
    if result.returncode != 0 or not target.exists():
        return None
    return target
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def set_paragraph_shading(paragraph, fill: str) -> None:
    """Set the background fill colour of a paragraph.

    ``fill`` is written verbatim to the ``w:fill`` attribute of the
    paragraph's ``w:shd`` element, creating the element if needed. Callers
    in this file pass hex RGB strings such as ``"F8FAFC"``.
    """
    paragraph_pr = paragraph._p.get_or_add_pPr()
    shading = paragraph_pr.find(qn("w:shd"))
    if shading is None:
        shading = OxmlElement("w:shd")
        paragraph_pr.append(shading)
    # w:val is a required attribute of w:shd; "clear" means no pattern, so
    # only the fill colour shows. An existing value is left as it is.
    if shading.get(qn("w:val")) is None:
        shading.set(qn("w:val"), "clear")
    shading.set(qn("w:fill"), fill)
|
|
201
|
+
|
|
202
|
+
|
|
203
|
+
def set_paragraph_border(paragraph, color: str) -> None:
    """Give a paragraph a single-line left border in the given colour.

    The ``w:pBdr`` and ``w:left`` elements are reused when present and
    created otherwise; their val/sz/space/color attributes are overwritten.
    """
    properties = paragraph._p.get_or_add_pPr()

    border_group = properties.find(qn("w:pBdr"))
    if border_group is None:
        border_group = OxmlElement("w:pBdr")
        properties.append(border_group)

    left_edge = border_group.find(qn("w:left"))
    if left_edge is None:
        left_edge = OxmlElement("w:left")
        border_group.append(left_edge)

    for attribute, value in (
        ("w:val", "single"),
        ("w:sz", "10"),
        ("w:space", "12"),
        ("w:color", color),
    ):
        left_edge.set(qn(attribute), value)
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def add_hyperlink(paragraph, text: str, url: str):
    """Append an external hyperlink to a paragraph and return its element.

    An external hyperlink relationship to ``url`` is registered on the
    paragraph's part, and a ``w:hyperlink`` element holding one blue,
    underlined run with ``text`` is appended to the paragraph. Callers can
    add further runs to the returned element.
    """
    relationship_id = paragraph.part.relate_to(
        url,
        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink",
        is_external=True,
    )
    hyperlink = OxmlElement("w:hyperlink")
    hyperlink.set(qn("r:id"), relationship_id)

    # The plain (non-bold, non-italic, non-code) run is what the sibling
    # helper builds with its default flags.
    append_hyperlink_run(hyperlink, text)

    paragraph._p.append(hyperlink)
    return hyperlink
|
|
247
|
+
|
|
248
|
+
|
|
249
|
+
def append_hyperlink_run(hyperlink, text: str, bold: bool = False, italic: bool = False, code: bool = False) -> None:
    """Append one styled text run to a ``w:hyperlink`` element.

    Every run is blue (2563EB) and single-underlined. ``bold`` / ``italic``
    add the matching toggles; ``code`` switches the run to Consolas with a
    ``w:sz`` value of 20.
    """
    run = OxmlElement("w:r")
    rpr = OxmlElement("w:rPr")

    color = OxmlElement("w:color")
    color.set(qn("w:val"), "2563EB")
    rpr.append(color)

    underline = OxmlElement("w:u")
    underline.set(qn("w:val"), "single")
    rpr.append(underline)

    if bold:
        rpr.append(OxmlElement("w:b"))
    if italic:
        rpr.append(OxmlElement("w:i"))

    if code:
        fonts = OxmlElement("w:rFonts")
        for attribute in ("w:ascii", "w:hAnsi", "w:cs"):
            fonts.set(qn(attribute), "Consolas")
        rpr.append(fonts)
        size = OxmlElement("w:sz")
        size.set(qn("w:val"), "20")
        rpr.append(size)

    run.append(rpr)
    text_node = OxmlElement("w:t")
    # Leading/trailing blanks in w:t are only kept when xml:space="preserve"
    # is set; a label split into several runs needs them between segments.
    if text != text.strip():
        text_node.set(qn("xml:space"), "preserve")
    text_node.text = text
    run.append(text_node)
    hyperlink.append(run)
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def set_picture_metadata(run, source_name: str) -> None:
    """Label the picture(s) in a run with the source file's base name.

    Every ``docPr`` element found under the run gets its ``name``, ``descr``
    and ``title`` attributes set to the file name. If the lookup raises,
    nothing is changed.
    """
    label = Path(source_name).name
    try:
        picture_props = run._r.xpath(".//*[local-name()='docPr']")
    except Exception:
        picture_props = []
    for element in picture_props:
        for attribute in ("name", "descr", "title"):
            element.set(attribute, label)
|
|
294
|
+
|
|
295
|
+
|
|
296
|
+
class MarkdownToDocxExporter:
|
|
297
|
+
def __init__(
    self,
    input_path: Path,
    output_dir: Path,
    output_docx: Path | None = None,
    report_path: Path | None = None,
    template_path: Path | None = None,
):
    """Configure an export of one Markdown file.

    ``output_docx`` defaults to ``<output_dir>/<input stem>.docx`` and
    ``report_path`` to ``<output_dir>/export-report.json``. When no
    ``template_path`` is given, the packaged template is used if it can be
    resolved.
    """
    self.input_path = input_path
    self.output_dir = output_dir

    if not output_docx:
        output_docx = output_dir / f"{input_path.stem}.docx"
    self.output_docx = output_docx

    if not report_path:
        report_path = output_dir / "export-report.json"
    self.report_path = report_path

    # Must exist before resolve_template_path(), which may register the
    # packaged template on it.
    self._template_resource_stack = ExitStack()
    self.template_path = self.resolve_template_path(template_path)
    self.use_template_styles = self.template_path is not None
    self.stats = WordExportStats()
|
|
313
|
+
|
|
314
|
+
def export(self) -> dict:
    """Render the Markdown input to a .docx file and write a JSON report.

    Returns:
        The report dict with the input, output and template paths (as
        strings, template may be None) and the stats counters.

    Raises:
        FileNotFoundError: a template path is set but does not exist.

    The template resource stack is closed on the way out, whether the
    export succeeds or fails.
    """
    try:
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Explicitly supplied output/report paths may live outside
        # output_dir; make sure their folders exist before writing.
        for destination in (self.output_docx, self.report_path):
            Path(destination).parent.mkdir(parents=True, exist_ok=True)
        if self.template_path and not self.template_path.exists():
            raise FileNotFoundError(f"Word template not found: {self.template_path}")

        document = Document()
        if not self.template_path:
            self.apply_modern_styles(document)
        # Does nothing when template styles are in use or the styles exist.
        self.ensure_custom_styles(document)

        markdown = self.input_path.read_text(encoding="utf-8")
        self.render_markdown(document, markdown)
        document.save(self.output_docx)
        if self.template_path:
            # Swap the template's style-related parts into the saved package.
            self.apply_template_package(self.output_docx, self.template_path)

        report = {
            "input": str(self.input_path),
            "output": str(self.output_docx),
            "template": str(self.template_path) if self.template_path else None,
            "stats": self.stats.__dict__,
        }
        self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
        return report
    finally:
        self._template_resource_stack.close()
|
|
341
|
+
|
|
342
|
+
def resolve_template_path(self, template_path: Path | None) -> Path | None:
    """Return the template file to use, or None when there is none.

    An explicit ``template_path`` is returned as a ``Path``. Otherwise the
    packaged template is entered on the instance's resource stack and its
    path returned; FileNotFoundError / ModuleNotFoundError yield None.
    """
    if template_path is None:
        try:
            bundled = as_file(packaged_template_resource())
            return self._template_resource_stack.enter_context(bundled)
        except (FileNotFoundError, ModuleNotFoundError):
            return None
    return Path(template_path)
|
|
350
|
+
|
|
351
|
+
def apply_modern_styles(self, document: Document) -> None:
    """Apply the built-in look: margins, body font and Heading 1-4 styles.

    Finishes by making sure the custom BuildCorpus styles exist.
    """
    page = document.sections[0]
    page.top_margin = Inches(0.8)
    page.bottom_margin = Inches(0.8)
    page.left_margin = Inches(0.9)
    page.right_margin = Inches(0.9)

    body = document.styles["Normal"]
    body.font.name = "Aptos"
    body.font.size = Pt(11)
    body.font.color.rgb = RGBColor(31, 41, 55)
    body.paragraph_format.space_after = Pt(8)
    body.paragraph_format.line_spacing = 1.15

    heading_sizes = {1: 22, 2: 17, 3: 14, 4: 12}
    for level, points in heading_sizes.items():
        heading = document.styles[f"Heading {level}"]
        heading.font.name = "Aptos Display"
        heading.font.size = Pt(points)
        heading.font.bold = True
        heading.font.color.rgb = RGBColor(15, 23, 42)
        # Only the top-level heading gets the larger gap above it.
        gap_above = 12 if level == 1 else 10
        heading.paragraph_format.space_before = Pt(gap_above)
        heading.paragraph_format.space_after = Pt(4)

    self.ensure_custom_styles(document)
|
|
375
|
+
|
|
376
|
+
def ensure_custom_styles(self, document: Document) -> None:
    """Create the "BuildCorpus Code" / "BuildCorpus Quote" paragraph styles.

    Does nothing when template styles are in use; each style is only added
    if the document does not already have it.
    """
    if self.use_template_styles:
        return

    styles = document.styles

    if "BuildCorpus Code" not in styles:
        code_style = styles.add_style("BuildCorpus Code", WD_STYLE_TYPE.PARAGRAPH)
        code_style.base_style = styles["Normal"]
        code_style.font.name = "Consolas"
        code_style.font.size = Pt(10)
        code_format = code_style.paragraph_format
        code_format.left_indent = Inches(0.2)
        code_format.right_indent = Inches(0.2)
        code_format.space_before = Pt(4)
        code_format.space_after = Pt(6)

    if "BuildCorpus Quote" not in styles:
        quote_style = styles.add_style("BuildCorpus Quote", WD_STYLE_TYPE.PARAGRAPH)
        quote_style.base_style = styles["Normal"]
        quote_style.font.italic = True
        quote_style.font.color.rgb = RGBColor(71, 85, 105)
        quote_format = quote_style.paragraph_format
        quote_format.left_indent = Inches(0.35)
        quote_format.space_after = Pt(6)
|
|
396
|
+
|
|
397
|
+
def render_markdown(self, document: Document, markdown: str) -> None:
    """Walk the Markdown text line by line and emit matching Word content.

    Block kinds are tested in this order: fenced code, ``$$`` equation
    (multi-line, then single-line), heading, pipe table, ``<table>`` HTML
    table, list, blockquote, horizontal rule, and finally a plain paragraph.
    Blank lines separate blocks and are otherwise skipped.
    """
    lines = markdown.splitlines()

    def starts_table(position: int) -> bool:
        # A pipe table is recognised by the separator row on the NEXT line.
        return position + 1 < len(lines) and bool(TABLE_SEPARATOR_RE.match(lines[position + 1]))

    def forces_break(raw_line: str) -> bool:
        # Hard line break: two trailing spaces or a trailing backslash.
        # A single trailing space is ordinary sloppy whitespace.
        return raw_line.endswith("  ") or raw_line.endswith("\\")

    index = 0
    while index < len(lines):
        line = lines[index]
        stripped = line.strip()

        if not stripped:
            index += 1
            continue

        if stripped.startswith("```"):
            info = strip_fence(line)
            buffer: list[str] = []
            index += 1
            while index < len(lines) and not lines[index].strip().startswith("```"):
                buffer.append(lines[index])
                index += 1
            if index < len(lines):
                index += 1  # step over the closing fence
            self.add_code_block(document, "\n".join(buffer), info)
            continue

        if stripped == "$$":
            buffer = []
            index += 1
            while index < len(lines) and lines[index].strip() != "$$":
                buffer.append(lines[index])
                index += 1
            if index < len(lines):
                index += 1  # step over the closing $$
            self.add_equation_block(document, "\n".join(buffer).strip())
            continue

        if stripped.startswith("$$") and stripped.endswith("$$") and len(stripped) > 4:
            self.add_equation_block(document, stripped[2:-2].strip())
            index += 1
            continue

        if stripped.startswith("#"):
            level = len(stripped) - len(stripped.lstrip("#"))
            self.add_heading(document, level, stripped[level:].strip())
            index += 1
            continue

        if starts_table(index):
            table_lines = [line, lines[index + 1]]
            index += 2
            # Body rows are every following line that contains a pipe.
            while index < len(lines) and "|" in lines[index]:
                table_lines.append(lines[index])
                index += 1
            self.add_table(document, table_lines)
            continue

        if stripped.lower() == "<table>":
            table_lines = [line]
            index += 1
            while index < len(lines):
                table_lines.append(lines[index])
                closes_table = lines[index].strip().lower() == "</table>"
                index += 1
                if closes_table:
                    break
            self.add_html_table(document, "\n".join(table_lines))
            continue

        if LIST_ITEM_RE.match(line):
            index = self.add_list(document, lines, index)
            continue

        if stripped.startswith(">"):
            quote_lines: list[str] = []
            while index < len(lines) and lines[index].strip().startswith(">"):
                quote_lines.append(lines[index].strip()[1:].strip())
                index += 1
            self.add_blockquote(document, " ".join(quote_lines))
            continue

        if re.fullmatch(r"[-*_]{3,}", stripped):
            # Horizontal rule: rendered as an empty paragraph.
            document.add_paragraph("")
            index += 1
            continue

        # Plain paragraph: gather lines until a blank line or another block.
        paragraph_lines = [line.rstrip()]
        paragraph_breaks = [forces_break(line)]
        index += 1
        while index < len(lines):
            candidate = lines[index]
            candidate_stripped = candidate.strip()
            if not candidate_stripped:
                break
            if candidate_stripped.startswith(("```", "#", ">")):
                break
            if LIST_ITEM_RE.match(candidate):
                break
            if starts_table(index):
                break
            paragraph_lines.append(candidate.rstrip())
            paragraph_breaks.append(forces_break(candidate))
            index += 1
        self.add_paragraph(document, self.combine_paragraph_lines(paragraph_lines, paragraph_breaks))
|
|
502
|
+
|
|
503
|
+
@staticmethod
def combine_paragraph_lines(lines: list[str], breaks: list[bool]) -> str:
    """Join paragraph lines into one string.

    ``breaks[i]`` says whether line ``i`` ends with a hard line break. Lines
    are joined with a newline after a hard break and a single space
    otherwise; the flag of the last line is not used. When a hard-break line
    ends in an unescaped backslash (an odd run of backslashes), that
    backslash is the break marker and is removed so it is not rendered as
    text.
    """
    if not lines:
        return ""
    last_position = len(lines) - 1
    pieces: list[str] = []
    for position, text in enumerate(lines):
        if position == last_position:
            pieces.append(text)
            break
        hard_break = breaks[position]
        if hard_break:
            trailing_backslashes = len(text) - len(text.rstrip("\\"))
            if trailing_backslashes % 2 == 1:
                text = text[:-1]
        pieces.append(text)
        pieces.append("\n" if hard_break else " ")
    return "".join(pieces)
|
|
512
|
+
|
|
513
|
+
@staticmethod
def apply_template_package(output_docx: Path, template_path: Path) -> None:
    """Replace style-related parts of a saved .docx with the template's.

    The output package is rewritten so that each part listed below is taken
    from the template when the template has it; all other parts are kept
    from the output. Listed parts that exist only in the template are
    added. The rewritten package then replaces ``output_docx``.
    """
    # Local import: the file's top-level imports only bring in ZipFile.
    from zipfile import ZIP_DEFLATED

    transplant_parts = {
        "word/styles.xml",
        "word/stylesWithEffects.xml",
        "word/numbering.xml",
        "word/fontTable.xml",
        "word/settings.xml",
        "word/webSettings.xml",
        "word/theme/theme1.xml",
    }
    with tempfile.TemporaryDirectory(prefix="build-corpus-template-") as tmp:
        patched = Path(tmp) / output_docx.name
        # ZipFile defaults to ZIP_STORED; ask for deflate so the rewritten
        # package is not left uncompressed.
        with ZipFile(output_docx) as out_zip, ZipFile(template_path) as template_zip, ZipFile(
            patched, "w", compression=ZIP_DEFLATED
        ) as patched_zip:
            template_names = set(template_zip.namelist())
            output_names = out_zip.namelist()
            for name in output_names:
                from_template = name in transplant_parts and name in template_names
                origin = template_zip if from_template else out_zip
                patched_zip.writestr(name, origin.read(name))
            # Sorted so the archive layout does not depend on set ordering.
            template_only = sorted((transplant_parts & template_names) - set(output_names))
            for name in template_only:
                patched_zip.writestr(name, template_zip.read(name))
        # All archives are closed here; move the result over the original
        # before the temporary directory is removed.
        shutil.move(str(patched), output_docx)
|
|
539
|
+
|
|
540
|
+
def add_heading(self, document: Document, level: int, text: str) -> None:
    """Add a heading paragraph ("Heading N", N capped at 6) and count it."""
    capped_level = level if level < 6 else 6
    heading = document.add_paragraph(style=f"Heading {capped_level}")
    self.render_inline(heading, text)
    self.stats.headings += 1
|
|
544
|
+
|
|
545
|
+
def add_paragraph(self, document: Document, text: str) -> None:
    """Add a "Normal" paragraph with inline formatting applied and count it."""
    body = document.add_paragraph(style="Normal")
    self.render_inline(body, text)
    self.stats.paragraphs += 1
|
|
549
|
+
|
|
550
|
+
def add_code_block(self, document: Document, code: str, info: str) -> None:
    """Add a shaded code paragraph, preceded by the fence info when present.

    Uses "Normal" with template styles, "BuildCorpus Code" otherwise. The
    info string becomes a bold blue label run; the code run is Consolas 10pt.
    """
    if self.use_template_styles:
        style_name = "Normal"
    else:
        style_name = "BuildCorpus Code"
    block = document.add_paragraph(style=style_name)

    if info:
        caption = block.add_run(f"{info}\n")
        caption.bold = True
        caption.font.color.rgb = RGBColor(37, 99, 235)

    listing = block.add_run(code)
    listing.font.name = "Consolas"
    listing.font.size = Pt(10)

    set_paragraph_shading(block, "F8FAFC")
    self.stats.code_blocks += 1
|
|
561
|
+
|
|
562
|
+
def add_equation_block(self, document: Document, equation: str) -> None:
    """Add an indented paragraph holding the equation source in Cambria Math.

    The equation text is inserted as-is in a single run; the equations
    counter is incremented.
    """
    block = document.add_paragraph(style="Normal")

    layout = block.paragraph_format
    layout.left_indent = Inches(0.3)
    layout.right_indent = Inches(0.3)
    layout.space_before = Pt(4)
    layout.space_after = Pt(8)

    formula = block.add_run(equation)
    formula.font.name = "Cambria Math"
    formula.font.size = Pt(11)

    self.stats.equations += 1
|
|
572
|
+
|
|
573
|
+
def add_blockquote(self, document: Document, text: str) -> None:
    """Add a quote paragraph with a left border and count it.

    Uses the "Quote" style with template styles, "BuildCorpus Quote" otherwise.
    """
    if self.use_template_styles:
        style_name = "Quote"
    else:
        style_name = "BuildCorpus Quote"
    quote = document.add_paragraph(style=style_name)
    self.render_inline(quote, text)
    set_paragraph_border(quote, "CBD5E1")
    self.stats.blockquotes += 1
|
|
578
|
+
|
|
579
|
+
def add_list(self, document: Document, lines: list[str], start: int) -> int:
    """Render consecutive list items beginning at ``lines[start]``.

    Each item may continue over following lines that start with whitespace
    and do not open another block. One paragraph is emitted per item and
    the lists counter is incremented per item. Returns the index of the
    first line that is not part of the list.
    """
    total = len(lines)
    position = start
    while position < total:
        item = LIST_ITEM_RE.match(lines[position])
        if item is None:
            break
        indent, marker, body = item.groups()

        # Collect continuation lines belonging to this item.
        body_lines = [body]
        cursor = position + 1
        while cursor < total:
            candidate = lines[cursor]
            candidate_text = candidate.strip()
            next_is_separator = cursor + 1 < total and TABLE_SEPARATOR_RE.match(lines[cursor + 1])
            if (
                not candidate_text
                or candidate_text.startswith(("```", "#", ">"))
                or LIST_ITEM_RE.match(candidate)
                or next_is_separator
                or not candidate[:1].isspace()
            ):
                break
            body_lines.append(candidate.rstrip())
            cursor += 1

        style_name = self.list_style_name(document, marker.endswith("."), indent)
        entry = document.add_paragraph(style=style_name)
        if style_name in {"List Bullet", "List Number"}:
            # Base styles carry no nesting, so indent manually by depth.
            entry.paragraph_format.left_indent = Inches(0.25 + (len(indent.replace("\t", " ")) // 2) * 0.18)
        self.render_inline(entry, "\n".join(body_lines))
        self.stats.lists += 1
        position = cursor
    return position
|
|
612
|
+
|
|
613
|
+
def list_style_name(self, document: Document, ordered: bool, indent: str) -> str:
    """Pick the list paragraph style for an item.

    The nesting level (1-3) is derived from the indent width; levels 2 and 3
    use "List Bullet N" / "List Number N" when the document has that style,
    and fall back to the unnumbered base style otherwise.
    """
    depth = len(indent.replace("\t", " ")) // 2 + 1
    depth = min(3, max(1, depth))
    base = "List Number" if ordered else "List Bullet"
    if depth == 1:
        return base
    leveled = f"{base} {depth}"
    if leveled in document.styles:
        return leveled
    return base
|
|
618
|
+
|
|
619
|
+
def add_table(self, document: Document, table_lines: list[str]) -> None:
    """Render a pipe table from its raw lines, ignoring separator rows."""
    data_rows = []
    for raw_line in table_lines:
        if TABLE_SEPARATOR_RE.match(raw_line):
            continue
        data_rows.append(split_table_row(raw_line))
    self.add_table_rows(document, data_rows)
|
|
622
|
+
|
|
623
|
+
def add_html_table(self, document: Document, table_markup: str) -> None:
    """Render an HTML ``<table>`` block from its markup."""
    parsed_rows = parse_html_table(table_markup)
    self.add_table_rows(document, parsed_rows)
|
|
625
|
+
|
|
626
|
+
def add_table_rows(self, document: Document, rows: list[list[str]]) -> None:
    """Add a Word table for the given rows; no-op when there are none.

    The table is as wide as the longest row; shorter rows are padded with
    empty cells. "Light List Accent 1" is used when the document has it,
    else "Table Grid". The tables counter is incremented once.
    """
    if not rows:
        return

    column_count = max(map(len, rows))
    table = document.add_table(rows=len(rows), cols=column_count)
    if "Light List Accent 1" in document.styles:
        table.style = "Light List Accent 1"
    else:
        table.style = "Table Grid"

    for row_index, cells in enumerate(rows):
        padded = list(cells) + [""] * (column_count - len(cells))
        for col_index, value in enumerate(padded):
            self.render_table_cell(table.cell(row_index, col_index), value)

    self.stats.tables += 1
|
|
637
|
+
|
|
638
|
+
def render_table_cell(self, cell, text: str) -> None:
    """Fill a table cell, one paragraph per non-empty line of ``text``.

    The first line reuses the cell's existing first paragraph; each further
    line gets a new paragraph. An empty cell is simply blanked.
    """
    segments = set_cell_text(cell, text)
    if not segments:
        cell.text = ""
        return

    head, *tail = segments
    first = cell.paragraphs[0]
    first.text = ""
    self.render_inline(first, head)
    for segment in tail:
        extra = cell.add_paragraph("")
        self.render_inline(extra, segment)
|
|
649
|
+
|
|
650
|
+
def render_inline(self, paragraph, text: str) -> None:
    """Render text into a paragraph, dispatching inline Markdown tokens.

    Text between tokens goes to ``render_plain_text``; each matched token
    goes to ``render_inline_token``.
    """
    consumed = 0
    for hit in INLINE_TOKEN_RE.finditer(text):
        begin, end = hit.span()
        if begin > consumed:
            self.render_plain_text(paragraph, text[consumed:begin])
        self.render_inline_token(paragraph, hit.group(0))
        consumed = end

    remainder = text[consumed:]
    if remainder:
        self.render_plain_text(paragraph, remainder)
|
|
660
|
+
|
|
661
|
+
def render_plain_text(self, paragraph, text: str) -> None:
    """Render token-free text, turning bare http(s) URLs into hyperlinks.

    Punctuation trailing a URL is emitted as plain text after the link.
    """
    consumed = 0
    for hit in BARE_URL_RE.finditer(text):
        begin, end = hit.span()
        if begin > consumed:
            append_text_with_breaks(paragraph, text[consumed:begin])

        link_target, punctuation = split_trailing_url_punctuation(hit.group(0))
        if link_target:
            add_hyperlink(paragraph, link_target, link_target)
        if punctuation:
            append_text_with_breaks(paragraph, punctuation)
        consumed = end

    if consumed < len(text):
        append_text_with_breaks(paragraph, text[consumed:])
|
|
675
|
+
|
|
676
|
+
def render_inline_token(self, paragraph, token: str) -> None:
    """Render one inline Markdown token into ``paragraph``.

    The token is classified by its delimiters, checked in this order:

    * ``![`` - image. ``http://``, ``https://`` and ``data:`` targets are
      not embedded: a ``[image: ...]`` placeholder run and a warning are
      emitted instead. Other targets are resolved relative to the input
      file's directory and embedded 5.8 inches wide. A file that is
      missing, or that cannot be embedded even after the metafile-to-PNG
      fallback, produces a placeholder run and a warning.
    * ``[`` - link. A hyperlink is created with ``add_hyperlink`` and its
      label is rendered by ``render_hyperlink_label``.
    * backtick - inline code, Consolas 10 pt.
    * ``$$...$$`` / ``$...$`` - math text, Cambria Math 11 pt, counted in
      ``self.stats.equations``.
    * ``***``, ``**``, ``*`` - bold+italic, bold, italic.
    * anything else is appended verbatim as a plain run.
    """
    if token.startswith("!["):
        alt, target = markdown_link_parts(token)
        # Classify the target before touching the filesystem: a URL or a
        # data URI is never a usable local path, so resolving it is wasted
        # work and needlessly feeds arbitrary text to path resolution.
        if target.startswith(("http://", "https://", "data:")):
            paragraph.add_run(f"[image: {alt or target}]")
            self.stats.warnings.append(f"Skipped non-local image target: {target}")
            return

        image_path = (self.input_path.parent / target).resolve()
        if not image_path.exists():
            paragraph.add_run(f"[missing image: {target}]")
            self.stats.warnings.append(f"Missing image asset: {target}")
            return

        run = paragraph.add_run()
        try:
            run.add_picture(str(image_path), width=Inches(5.8))
            set_picture_metadata(run, target)
            self.stats.images += 1
        except UnrecognizedImageError:
            # The image could not be embedded as-is; try the PNG
            # conversion helper before giving up on it.
            converted = convert_windows_metafile_to_png(image_path)
            if converted is not None:
                run.add_picture(str(converted), width=Inches(5.8))
                set_picture_metadata(run, target)
                self.stats.images += 1
                self.stats.warnings.append(f"Converted unsupported image to PNG: {target}")
            else:
                paragraph.add_run(f"[unsupported image: {target}]")
                self.stats.warnings.append(f"Unsupported image asset: {target}")
        return

    if token.startswith("["):
        label, target = markdown_link_parts(token)
        hyperlink = add_hyperlink(paragraph, "", target)
        self.render_hyperlink_label(hyperlink, label)
        return

    if token.startswith("`"):
        run = paragraph.add_run(token[1:-1])
        run.font.name = "Consolas"
        run.font.size = Pt(10)
        return

    # Longer delimiters are tested first so "$$x$$" is not read as "$...$".
    for delimiter in ("$$", "$"):
        if token.startswith(delimiter) and token.endswith(delimiter):
            width = len(delimiter)
            run = paragraph.add_run(token[width:-width])
            run.font.name = "Cambria Math"
            run.font.size = Pt(11)
            self.stats.equations += 1
            return

    # Same longest-first ordering for emphasis markers.
    for marker, bold, italic in (("***", True, True), ("**", True, False), ("*", False, True)):
        if token.startswith(marker) and token.endswith(marker):
            width = len(marker)
            run = paragraph.add_run(token[width:-width])
            # Only switch attributes on; the other one is left untouched
            # rather than being forced to False.
            if bold:
                run.bold = True
            if italic:
                run.italic = True
            return

    paragraph.add_run(token)
|
|
748
|
+
|
|
749
|
+
def render_hyperlink_label(self, hyperlink, text: str) -> None:
    """Render a link label into ``hyperlink``, honouring inline tokens.

    Spans matched by ``INLINE_TOKEN_RE`` are delegated to
    ``render_hyperlink_token``. All other text is passed through
    ``unescape_markdown_text`` and appended with ``append_hyperlink_run``.
    """
    consumed = 0
    for found in INLINE_TOKEN_RE.finditer(text):
        token_start, token_end = found.span()
        if token_start > consumed:
            literal = unescape_markdown_text(text[consumed:token_start])
            append_hyperlink_run(hyperlink, literal)
        self.render_hyperlink_token(hyperlink, found.group(0))
        consumed = token_end

    # Label text after the last token (or the whole label if none matched).
    tail = text[consumed:]
    if tail:
        append_hyperlink_run(hyperlink, unescape_markdown_text(tail))
|
|
759
|
+
|
|
760
|
+
def render_hyperlink_token(self, hyperlink, token: str) -> None:
    """Append one inline token to ``hyperlink`` with matching formatting.

    A token wrapped in backticks is appended with ``code=True``; one
    wrapped in ``***``, ``**`` or ``*`` is appended bold+italic, bold or
    italic respectively. In each case the delimiters are stripped and the
    inner text goes through ``unescape_markdown_text``. Any other token
    is unescaped and appended without formatting flags.
    """
    # Order matters: "***" must be tried before "**", and "**" before "*".
    wrappers = (
        ("`", {"code": True}),
        ("***", {"bold": True, "italic": True}),
        ("**", {"bold": True}),
        ("*", {"italic": True}),
    )
    for marker, formatting in wrappers:
        if token.startswith(marker) and token.endswith(marker):
            width = len(marker)
            inner = unescape_markdown_text(token[width:-width])
            append_hyperlink_run(hyperlink, inner, **formatting)
            return

    append_hyperlink_run(hyperlink, unescape_markdown_text(token))
|
|
774
|
+
|
|
775
|
+
|
|
776
|
+
def export_markdown_to_docx(
    input_path: Path,
    output_root: Path,
    out_same_dir: bool,
    template_path: Path | None = None,
) -> dict:
    """Export one Markdown file through ``MarkdownToDocxExporter``.

    Args:
        input_path: Markdown file to export.
        output_root: Base directory used when ``out_same_dir`` is false;
            the output directory is then ``output_root / input_path.stem``.
        out_same_dir: When true, output goes next to the input: the DOCX
            is ``input_path`` with a ``.docx`` suffix and the report is
            ``<stem>.export-report.json`` in the same directory. When
            false, both explicit paths are passed to the exporter as
            ``None``.
        template_path: Optional template path forwarded unchanged to the
            exporter.

    Returns:
        Whatever ``MarkdownToDocxExporter.export()`` returns.
    """
    # ``output_root`` is only touched when it is actually needed.
    output_dir = input_path.parent if out_same_dir else output_root / input_path.stem
    output_docx = input_path.with_suffix(".docx") if out_same_dir else None
    report_path = (
        input_path.with_name(f"{input_path.stem}.export-report.json")
        if out_same_dir
        else None
    )

    return MarkdownToDocxExporter(
        input_path=input_path,
        output_dir=output_dir,
        output_docx=output_docx,
        report_path=report_path,
        template_path=template_path,
    ).export()
|