regen.mde 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +16 -0
- package/README.md +295 -0
- package/bin/build-corpus-editor.js +81 -0
- package/bin/build-corpus.js +41 -0
- package/bin/postinstall.js +187 -0
- package/bin/regen-mdeditor-install.js +27 -0
- package/bin/regen-mdeditor-uninstall.js +19 -0
- package/bin/validate-katex.js +93 -0
- package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +270 -0
- package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -0
- package/desktop/BuildCorpusEditor/EditorForm.cs +540 -0
- package/desktop/BuildCorpusEditor/Program.cs +81 -0
- package/desktop/BuildCorpusEditor/app.manifest +16 -0
- package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
- package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
- package/dist/windows-editor/BuildCorpusEditor.deps.json +83 -0
- package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
- package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
- package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
- package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +19 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Core.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Core.xml +6817 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.xml +510 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.dll +0 -0
- package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.xml +1902 -0
- package/dist/windows-editor/WebView2Loader.dll +0 -0
- package/dist/windows-editor/runtimes/win-x64/native/WebView2Loader.dll +0 -0
- package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +326 -0
- package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +1 -0
- package/dist/windows-editor/wwwroot/index.html +22 -0
- package/editor-web/index.html +21 -0
- package/editor-web/src/main.jsx +399 -0
- package/editor-web/src/styles.css +602 -0
- package/editor-web/vite.config.js +13 -0
- package/examples/build-corpus.config.example.json +21 -0
- package/installer/install-regen-mde.ps1 +175 -0
- package/installer/regen-mde.nsi +81 -0
- package/package.json +86 -0
- package/pyproject.toml +33 -0
- package/requirements.txt +4 -0
- package/scripts/build-windows-editor.ps1 +47 -0
- package/scripts/package-windows-editor.ps1 +90 -0
- package/scripts/run-corpus.ps1 +28 -0
- package/scripts/run-editor-implementation-plane.ps1 +203 -0
- package/scripts/run-required-tests.ps1 +98 -0
- package/scripts/run-smoke.ps1 +28 -0
- package/src/build_corpus/__init__.py +3 -0
- package/src/build_corpus/docx_exporter.py +798 -0
- package/src/build_corpus/exporter.py +1195 -0
- package/src/build_corpus/ppt_exporter.py +532 -0
- package/src/build_corpus/templates/__init__.py +1 -0
- package/src/build_corpus/templates/md-to-word-template.dotx +0 -0
- package/src/build_corpus/validate_assets.py +46 -0
- package/tools/audit_corpus.py +203 -0
- package/tools/collect_microsoft_word_templates.py +228 -0
- package/tools/collect_online_docx_corpus.py +272 -0
- package/tools/collect_online_pptx_corpus.py +252 -0
- package/tools/compare_pptx_inputs_outputs.py +87 -0
- package/tools/roundtrip_docx_corpus.py +171 -0
|
@@ -0,0 +1,1195 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import base64
|
|
5
|
+
import contextlib
|
|
6
|
+
import hashlib
|
|
7
|
+
import html
|
|
8
|
+
import json
|
|
9
|
+
import mimetypes
|
|
10
|
+
import os
|
|
11
|
+
import re
|
|
12
|
+
import shutil
|
|
13
|
+
import subprocess
|
|
14
|
+
import tempfile
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from typing import Optional
|
|
18
|
+
from zipfile import ZipFile
|
|
19
|
+
from xml.etree import ElementTree as ET
|
|
20
|
+
|
|
21
|
+
from omml2latex import convert_omml
|
|
22
|
+
try:
|
|
23
|
+
from .docx_exporter import export_markdown_to_docx, resolve_default_template_path
|
|
24
|
+
except ImportError: # pragma: no cover - allows direct script execution
|
|
25
|
+
from build_corpus.docx_exporter import export_markdown_to_docx, resolve_default_template_path
|
|
26
|
+
try:
|
|
27
|
+
from .ppt_exporter import export_presentation
|
|
28
|
+
except ImportError: # pragma: no cover - allows direct script execution
|
|
29
|
+
from build_corpus.ppt_exporter import export_presentation
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# Namespace URIs of the OOXML vocabularies this module queries, keyed by the
# prefix used in ElementTree path expressions (e.g. "./w:pPr/w:pStyle").
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
}

# "{uri}" prefixes for building fully-qualified tag and attribute names.
W = "{" + NS["w"] + "}"
R = "{" + NS["r"] + "}"
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
@dataclass
class ExportStats:
    """Counters and warnings collected during one export run.

    Every counter starts at zero; ``warnings`` starts as a fresh empty list
    per instance.
    """

    # Block-level content counters.
    paragraphs: int = 0
    headings: int = 0
    code_blocks: int = 0

    # Table counters.
    tables: int = 0
    markdown_tables: int = 0
    html_tables: int = 0

    # Equation counters.
    equations: int = 0
    equation_images: int = 0
    skipped_empty_equations: int = 0
    equation_errors: int = 0

    # Remaining content counters.
    images: int = 0
    lists: int = 0

    # Human-readable notes about problems met during the export.
    warnings: list[str] = field(default_factory=list)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
@dataclass
class S3ImageConfig:
    """Settings for uploading images to an S3-compatible bucket."""

    # Required: destination bucket and the base URL used to build image links.
    bucket: str
    public_base_url: str

    # Optional key prefix placed in front of the generated object key.
    prefix: str = ""

    # Optional client settings; unset values are not passed to the client.
    endpoint_url: str | None = None
    region_name: str | None = None
    access_key_id: str | None = None
    secret_access_key: str | None = None

    # Object metadata sent with every upload.
    cache_control: str = "public, max-age=31536000, immutable"
    acl: str | None = None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class S3ImageUploader:
    """Uploads image bytes to a bucket under keys derived from their SHA-256."""

    def __init__(self, config: S3ImageConfig):
        """Store *config* and build the boto3 S3 client.

        Raises:
            RuntimeError: if ``boto3`` cannot be imported.
        """
        self.config = config
        try:
            import boto3
        except ImportError as exc:
            raise RuntimeError("S3/R2 image mode requires boto3. Install with: pip install boto3") from exc

        candidate_options = (
            ("service_name", "s3"),
            ("endpoint_url", config.endpoint_url),
            ("region_name", config.region_name),
            ("aws_access_key_id", config.access_key_id),
            ("aws_secret_access_key", config.secret_access_key),
        )
        # Falsy (unset) options are left out of the client call entirely.
        client_options = {option: value for option, value in candidate_options if value}
        self.client = boto3.client(**client_options)

    def upload(self, source_name: str, data: bytes, content_type: str) -> dict[str, str]:
        """Put *data* into the bucket and return a description of the upload.

        The object key is ``<prefix>/images/sha256/<digest><suffix>`` where the
        suffix is the lower-cased extension of *source_name*; an empty prefix
        is omitted. The returned dict holds only strings.
        """
        sha = hashlib.sha256(data).hexdigest()
        extension = Path(source_name).suffix.lower()
        segments = (self.config.prefix.strip("/"), "images", "sha256", sha + extension)
        object_key = "/".join(filter(None, segments))

        request = dict(
            Bucket=self.config.bucket,
            Key=object_key,
            Body=data,
            ContentType=content_type,
            CacheControl=self.config.cache_control,
        )
        # The ACL is only sent when one is configured.
        if self.config.acl:
            request["ACL"] = self.config.acl
        self.client.put_object(**request)

        base_url = self.config.public_base_url.rstrip("/")
        return {
            "source": source_name,
            "sha256": sha,
            "bucket": self.config.bucket,
            "key": object_key,
            "url": f"{base_url}/{object_key}",
            "content_type": content_type,
            "bytes": str(len(data)),
        }
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def local_name(tag: str) -> str:
    """Return *tag* without its leading ``{namespace}`` part, if it has one."""
    _, _, bare = tag.rpartition("}")
    return bare
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def attr(node: ET.Element, ns: str, name: str) -> str | None:
    """Return the attribute *name* in namespace prefix *ns*, or ``None`` if absent.

    *ns* must be a key of ``NS``.
    """
    qualified = "{" + NS[ns] + "}" + name
    return node.attrib.get(qualified)
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def clean_text(text: str) -> str:
    """Replace no-break spaces with plain spaces and drop U+200B and U+FEFF."""
    substitutions = {
        0x00A0: " ",   # no-break space -> regular space
        0x200B: None,  # zero-width space -> removed
        0xFEFF: None,  # zero-width no-break space / BOM -> removed
    }
    return text.translate(substitutions)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def escape_md_text(text: str) -> str:
    """Escape the characters in *text* that Markdown would read as markup.

    The text is first passed through ``clean_text``. Then:

    * ``*``, ``_`` and ``$`` are prefixed with a backslash;
    * a backslash already followed by an escapable punctuation character is
      kept together with that character, so existing escapes are not doubled;
    * any other backslash, including one at the very end of the text, is
      doubled so that it renders as a literal backslash.
    """
    text = clean_text(text)
    escapable = "\\`*_{}[]()#+.!|$-"
    out: list[str] = []
    position = 0
    length = len(text)
    while position < length:
        char = text[position]
        if char == "\\":
            following = text[position + 1 : position + 2]
            # `following` is "" for a trailing backslash. The emptiness check
            # matters because `"" in escapable` is always True, which would
            # otherwise leave that backslash unescaped and free to escape
            # whatever markup gets appended after this text.
            if following and following in escapable:
                out.append("\\" + following)
                position += 2
                continue
            out.append("\\\\")
        elif char in "*_$":
            out.append("\\" + char)
        else:
            out.append(char)
        position += 1
    return "".join(out)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def normalize_inline_markers(text: str) -> str:
    """Put a space between an inline image and text that touches it.

    A space is added after an image followed directly by a character other
    than whitespace, ``<``, ``>``, ``)``, ``]`` or ``.,;:!?``. A space is added
    before an image preceded directly by a character other than whitespace,
    ``<``, ``(``, ``[`` or ``.,;:!?``.
    """
    image = r"(\!\[[^\]]*\]\([^)]+\))"
    glued_after = re.compile(image + r"(?=[^\s<>)\].,;:!?])")
    glued_before = re.compile(r"(?<=[^\s<(\[.,;:!?])" + image)
    spaced = glued_after.sub(r"\1 ", text)
    return glued_before.sub(r" \1", spaced)
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def strip_trailing_markdown_breaks(text: str) -> str:
    """Return *text* without trailing hard-break spaces or other whitespace.

    Only whitespace is ever removed. The earlier loop dropped two characters
    whenever the text ended in a single space, which could delete the last
    visible character (``"ab "`` became ``"a"``).
    """
    return text.rstrip()
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Formatting flags of a text run, in the order (is_code, bold, italic).
InlineStyle = tuple[bool, bool, bool]


def apply_inline_style(text: str, style: InlineStyle) -> str:
    """Wrap *text* in the Markdown markers described by *style*.

    Empty or whitespace-only text is returned unchanged. Code takes precedence
    over emphasis: the text is wrapped in backticks and any backtick inside it
    is prefixed with a backslash. Otherwise bold adds ``**``, italic adds
    ``*``, and both together add ``***``.
    """
    is_code, bold, italic = style
    if not text.strip():
        # Covers both "" and whitespace-only text; neither gets markers.
        return text
    if is_code:
        # Computed outside the f-string: a backslash inside an f-string
        # expression is a SyntaxError on Python versions before 3.12.
        escaped = text.replace("`", "\\`")
        return f"`{escaped}`"
    marker = "*" * ((2 if bold else 0) + (1 if italic else 0))
    return f"{marker}{text}{marker}"
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def coalesce_inline_segments(segments: list[tuple[Optional[InlineStyle], str]]) -> str:
    """Join styled text segments into one Markdown string.

    Segments with empty text are ignored. Neighbouring segments that share the
    same style are concatenated before the style markers are applied once to
    the whole run. Segments whose style is ``None`` are emitted verbatim and
    end the current run. The joined result is passed through
    ``normalize_inline_markers``.
    """
    # Each entry is (style, text pieces belonging to that run).
    runs: list[tuple[Optional[InlineStyle], list[str]]] = []
    for style, text in segments:
        if not text:
            continue
        extends_previous = style is not None and bool(runs) and runs[-1][0] == style
        if extends_previous:
            runs[-1][1].append(text)
        else:
            runs.append((style, [text]))

    rendered: list[str] = []
    for style, pieces in runs:
        joined = "".join(pieces)
        if style is None:
            rendered.append(joined)
        else:
            rendered.append(apply_inline_style(joined, style or (False, False, False)))
    return normalize_inline_markers("".join(rendered))
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
def normalize_tex(tex: str, display: bool) -> str:
    """Clean up converted TeX and wrap it in Markdown math delimiters.

    Existing ``$$...$$`` or ``$...$`` delimiters are removed, a list of literal
    substitutions is applied, the text is run through the module's repair
    helpers, and whitespace runs are collapsed to single spaces. Display math
    is returned as ``$$``/newline/TeX/newline/``$$``; inline math as ``$TeX$``.
    """
    tex = clean_text(tex).strip()
    for delimiter in ("$$", "$"):
        if tex.startswith(delimiter) and tex.endswith(delimiter):
            tex = tex[len(delimiter) : -len(delimiter)].strip()
            break

    # Applied strictly in this order.
    # NOTE(review): the three r"\text{ }" entries are identical as written, so
    # the second and third can never match once the first has run. They
    # presumably targeted different space characters originally - confirm
    # against the upstream source.
    literal_fixes = (
        ("\u2011", "-"),
        ("$", r"\$"),
        (r"\text{ }", r"\,"),
        (r"\text{ }", r"\;"),
        (r"\text{ }", " "),
        (r"\mathrm{\}\text{*}}", r"\*"),
        (r"\text{-}", "-"),
        (r"\*", "*"),
    )
    for old, new in literal_fixes:
        tex = tex.replace(old, new)

    repair_steps = (
        replace_raw_unicode_math,
        strip_word_equation_field_codes,
        escape_text_macro_underscores,
        repair_underbrace_limits,
        balance_tex_braces,
    )
    for step in repair_steps:
        tex = step(tex)

    tex = re.sub(r"\s+", " ", tex).strip()
    if display:
        return f"$$\n{tex}\n$$"
    return f"${tex}$"
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# Raw Unicode symbols and the TeX text substituted for each of them.
UNICODE_MATH_REPLACEMENTS = {
    "∸": r"\dot{-}",
    "⨅": r"\sqcap",
    "⨃": r"\bigcup",
    "⋜": r"\lessgtr",
    "⋝": r"\gtrless",
    "∱": r"\oint",
    "∲": r"\oint",
    "∳": r"\oint",
    "ℇ": r"\varepsilon",
    "Ϝ": r"\digamma",
    "℩": r"\iota",
    "Å": r"\mathring{A}",
    "℮": "e",
}


def replace_raw_unicode_math(tex: str) -> str:
    """Substitute every symbol listed in ``UNICODE_MATH_REPLACEMENTS`` in *tex*."""
    for symbol in UNICODE_MATH_REPLACEMENTS:
        if symbol in tex:
            tex = tex.replace(symbol, UNICODE_MATH_REPLACEMENTS[symbol])
    return tex
|
|
264
|
+
|
|
265
|
+
|
|
266
|
+
def strip_word_equation_field_codes(tex: str) -> str:
    """Remove leaked ``# SEQ Equation * ARABIC <n>`` field text from *tex*.

    Three forms are removed, matched case-insensitively and tried in this
    order: wrapped in ``\\left( ... \\right)``, wrapped in plain parentheses,
    and bare.
    """
    seq_core = r"SEQ\s+Equation\s+\*\s+ARABIC\s+\d+"
    field_patterns = (
        r"#\s*\\left\(\s*" + seq_core + r"\s*\\right\)",
        r"#\s*\(\s*" + seq_core + r"\s*\)",
        r"#\s*" + seq_core,
    )
    for expression in field_patterns:
        tex = re.compile(expression, re.IGNORECASE).sub("", tex)
    return tex
|
|
276
|
+
|
|
277
|
+
|
|
278
|
+
def escape_text_macro_underscores(tex: str) -> str:
    """Escape TeX-special characters inside brace-free ``\\text{...}`` bodies.

    Within each ``\\text{...}`` whose body contains no braces, a backslash
    becomes ``\\textbackslash{}`` and ``_``, ``&``, ``%`` and ``#`` are each
    prefixed with a backslash. Text outside such macros is left alone.
    """
    specials = str.maketrans(
        {
            "\\": r"\textbackslash{}",
            "_": r"\_",
            "&": r"\&",
            "%": r"\%",
            "#": r"\#",
        }
    )

    def escape_body(found: re.Match[str]) -> str:
        return r"\text{" + found.group(1).translate(specials) + "}"

    return re.sub(r"\\text\{([^{}]*)\}", escape_body, tex)
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
def repair_underbrace_limits(tex: str) -> str:
    """Rewrite nested ``\\mathop``/``\\limits`` underbraces as ``\\underbrace{..}_{..}``.

    A match must be followed by ``+``, ``-``, ``\\cdot``, ``\\times``, ``=``,
    ``,``, ``;`` or the end of the string. Substitution is repeated until a
    pass finds nothing left to rewrite.
    """
    nested_underbrace = re.compile(
        r"\\mathop\{\\mathop\{(?P<base>.*?)\}\\limits_\{\s*\\underbrace\s*\}\}\\limits_\{(?P<label>.*?)\}"
        r"(?=(?:[+\-]|\\cdot|\\times|=|,|;|$))",
        re.DOTALL,
    )
    while True:
        tex, rewritten = nested_underbrace.subn(r"\\underbrace{\g<base>}_{\g<label>}", tex)
        if not rewritten:
            return tex
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def balance_tex_braces(tex: str) -> str:
    """Return *tex* with unescaped braces balanced.

    A character that directly follows a backslash is copied as-is and never
    counted. Unmatched closing braces are dropped (KaTeX rejects them) and one
    closing brace is appended for every opening brace still open at the end.
    """
    kept: list[str] = []
    open_count = 0
    after_backslash = False
    for symbol in tex:
        if after_backslash:
            kept.append(symbol)
            after_backslash = False
        elif symbol == "\\":
            kept.append(symbol)
            after_backslash = True
        elif symbol == "}" and open_count == 0:
            # Unmatched closing brace: skip it.
            continue
        else:
            if symbol == "{":
                open_count += 1
            elif symbol == "}":
                open_count -= 1
            kept.append(symbol)
    return "".join(kept) + "}" * open_count
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def paragraph_style(node: ET.Element) -> str | None:
    """Return the ``w:val`` of the paragraph's ``w:pStyle``, or ``None`` if it has none."""
    style_node = node.find("./w:pPr/w:pStyle", NS)
    if style_node is None:
        return None
    return attr(style_node, "w", "val")
|
|
335
|
+
|
|
336
|
+
|
|
337
|
+
def heading_level(style: str | None) -> int | None:
    """Return 1-6 for a style id of exactly ``Heading1``..``Heading6``, else ``None``."""
    found = re.fullmatch(r"Heading([1-6])", style) if style else None
    return int(found.group(1)) if found else None
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def is_code_style(style: str | None) -> bool:
    """Return True when *style* contains ``code`` in any letter case."""
    if not style:
        return False
    return "code" in style.lower()
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def is_quote_style(style: str | None) -> bool:
    """Return True when *style* names one of the recognised quote styles.

    Spaces and letter case are ignored in the comparison.
    """
    quote_styles = ("buildcorpusquote", "quote", "intensequote")
    return bool(style) and style.replace(" ", "").lower() in quote_styles
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def paragraph_num_info(node: ET.Element) -> tuple[int, bool] | None:
    """Return ``(level, False)`` for a paragraph that has ``w:numPr``, else ``None``.

    The level comes from ``w:ilvl`` and defaults to 0. The second item is
    always False: without resolving numbering.xml, bullets are used as the
    safer default.
    """
    numbering = node.find("./w:pPr/w:numPr", NS)
    if numbering is None:
        return None
    level_node = numbering.find("./w:ilvl", NS)
    raw_level = attr(level_node, "w", "val") if level_node is not None else None
    return int(raw_level or "0"), False
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def paragraph_list_style_info(style: str | None) -> tuple[int, bool] | None:
    """Derive ``(zero-based level, is_numbered)`` from a list style name.

    Names starting with "List Bullet" or "List Number" are recognised, with
    spaces and letter case ignored. An all-digit remainder gives the one-based
    level; any other remainder means level 1. Unrecognised or missing names
    give ``None``.
    """
    if not style:
        return None
    compact = style.replace(" ", "").lower()
    for stem, numbered in (("listbullet", False), ("listnumber", True)):
        if compact.startswith(stem):
            tail = compact[len(stem):]
            depth = int(tail) if tail.isdigit() else 1
            return max(depth - 1, 0), numbered
    return None
|
|
380
|
+
|
|
381
|
+
|
|
382
|
+
def run_is_math(run: ET.Element) -> bool:
    """Return True when the run's fonts name "Cambria Math".

    The ``ascii``, ``hAnsi`` and ``cs`` attributes of ``w:rFonts`` are
    compared case-insensitively.
    """
    props = run.find("./w:rPr", NS)
    fonts = props.find("./w:rFonts", NS) if props is not None else None
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "cambria math"
        for slot in ("ascii", "hAnsi", "cs")
    )
|
|
394
|
+
|
|
395
|
+
|
|
396
|
+
def run_is_code(run: ET.Element) -> bool:
    """Return True when the run looks like inline code.

    A run qualifies when its ``w:rStyle`` value contains "code" (any case), or
    when one of the ``ascii``/``hAnsi``/``cs`` fonts is "Consolas" (any case).
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False

    run_style = props.find("./w:rStyle", NS)
    if run_style is not None:
        style_name = attr(run_style, "w", "val") or ""
        if "code" in style_name.lower():
            return True

    fonts = props.find("./w:rFonts", NS)
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "consolas"
        for slot in ("ascii", "hAnsi", "cs")
    )
|
|
411
|
+
|
|
412
|
+
|
|
413
|
+
def run_is_bold(run: ET.Element) -> bool:
    """Return True when the run's properties switch bold on.

    ``w:b`` is an on/off property: the bare element or a true-like ``w:val``
    turns bold on, while ``w:val`` of "0", "false" or "off" explicitly turns
    it off. The latter case was previously reported as bold.
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False
    bold = props.find("./w:b", NS)
    if bold is None:
        return False
    value = attr(bold, "w", "val")
    # No w:val means "on".
    return value is None or value.strip().lower() not in {"0", "false", "off"}
|
|
416
|
+
|
|
417
|
+
|
|
418
|
+
def paragraph_is_code(node: ET.Element) -> bool:
    """Return True when the paragraph's visible runs are all code runs.

    Only direct ``w:r`` children with non-blank text are considered. If the
    first of them is bold it is not checked; every other considered run must
    satisfy ``run_is_code``, and at least one run must have been checked.
    """
    visible_runs = [
        run for run in node.findall("./w:r", NS) if extract_run_text(run).strip()
    ]
    if not visible_runs:
        return False
    # A bold first run is skipped rather than tested.
    checked = visible_runs[1:] if run_is_bold(visible_runs[0]) else visible_runs
    return bool(checked) and all(run_is_code(run) for run in checked)
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def extract_run_text(run: ET.Element) -> str:
    """Concatenate the text carried by a run's direct children.

    ``t`` children contribute their cleaned text, ``tab`` a tab character, and
    ``br``/``cr`` a newline. Other children are ignored.
    """
    literal_for = {"tab": "\t", "br": "\n", "cr": "\n"}
    pieces: list[str] = []
    for child in run:
        kind = local_name(child.tag)
        if kind == "t":
            pieces.append(clean_text(child.text or ""))
        elif kind in literal_for:
            pieces.append(literal_for[kind])
    return "".join(pieces)
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
def paragraph_is_math(node: ET.Element) -> bool:
    """Return True when every run with non-blank text is a math run.

    Runs whose ``w:t`` children hold only blank text are ignored; a paragraph
    with no remaining runs is not math.
    """
    verdicts = [
        run_is_math(run)
        for run in node.findall("./w:r", NS)
        if any((piece.text or "").strip() for piece in run.findall("./w:t", NS))
    ]
    return bool(verdicts) and all(verdicts)
|
|
469
|
+
|
|
470
|
+
|
|
471
|
+
def paragraph_has_display_math_layout(node: ET.Element) -> bool:
    """Return True when the paragraph has non-zero indentation or spacing.

    Checks ``left``/``right``/``firstLine``/``hanging`` on ``w:ind`` and
    ``before``/``after`` on ``w:spacing``; any present value other than "0"
    counts.
    """
    layout_checks = (
        ("./w:pPr/w:ind", ("left", "right", "firstLine", "hanging")),
        ("./w:pPr/w:spacing", ("before", "after")),
    )
    for path, keys in layout_checks:
        element = node.find(path, NS)
        if element is None:
            continue
        if any(attr(element, "w", key) not in {None, "0"} for key in keys):
            return True
    return False
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def relationship_map(zip_file: ZipFile, part: str = "word/document.xml") -> dict[str, str]:
    """Return ``{relationship Id: Target}`` for *part*.

    The relationships are read from ``<dir>/_rels/<name>.rels`` next to the
    part. A missing file gives an empty dict, entries without an ``Id`` are
    skipped, and a missing ``Target`` is stored as an empty string.
    """
    part_path = Path(part)
    rels_name = str(part_path.parent / "_rels" / (part_path.name + ".rels")).replace("\\", "/")
    if rels_name not in zip_file.namelist():
        return {}

    mapping: dict[str, str] = {}
    for rel in ET.fromstring(zip_file.read(rels_name)):
        rel_id = rel.attrib.get("Id")
        if rel_id is not None:
            mapping[rel_id] = rel.attrib.get("Target", "")
    return mapping
|
|
491
|
+
|
|
492
|
+
|
|
493
|
+
def resolve_image_target(target: str) -> str:
    """Turn a relationship target from word/document.xml into a zip member name.

    * An absolute target (leading ``/``) already names the member from the
      package root, so only the leading slash is removed. Previously such a
      target was prefixed again and came out as ``word//word/...``.
    * One leading ``../`` is removed from a relative target.
    * Any target that does not start with ``word/`` is prefixed with it.
    """
    if target.startswith("/"):
        return target.lstrip("/")
    if target.startswith("../"):
        target = target[3:]
    if not target.startswith("word/"):
        target = f"word/{target}"
    return target
|
|
499
|
+
|
|
500
|
+
|
|
501
|
+
def image_metadata_filename(node: ET.Element) -> str | None:
    """Return a file name recorded in the drawing's metadata, if there is one.

    Descendant ``docPr`` elements are examined first, then ``cNvPr`` elements.
    For each element the ``descr``, ``title`` and ``name`` attributes are
    tried in that order, and the first value that has a file extension is
    returned without its directory part.

    Elements are matched by local name. ``cNvPr`` is declared in the DrawingML
    picture namespace rather than the ``wp`` one, so a ``wp:``-qualified
    search can never find it.
    """
    for wanted in ("docPr", "cNvPr"):
        for entry in node.iterfind(".//*"):
            if entry.tag.rpartition("}")[2] != wanted:
                continue
            for key in ("descr", "title", "name"):
                value = entry.attrib.get(key)
                if value and Path(value).suffix:
                    return Path(value).name
    return None
|
|
509
|
+
|
|
510
|
+
|
|
511
|
+
def expand_env(value):
    """Expand environment variable references in every string inside *value*.

    Dicts and lists are rebuilt recursively (dict keys are kept as they are);
    strings go through ``os.path.expandvars``; any other value is returned
    unchanged.
    """
    if isinstance(value, dict):
        return {key: expand_env(item) for key, item in value.items()}
    if isinstance(value, list):
        return [expand_env(item) for item in value]
    return os.path.expandvars(value) if isinstance(value, str) else value
|
|
519
|
+
|
|
520
|
+
|
|
521
|
+
def load_config(path: Path | None) -> dict:
    """Load a JSON config file and expand environment variables in its strings.

    Returns an empty dict when *path* is ``None``.

    Raises:
        FileNotFoundError: if *path* does not exist.
        ValueError: if *path* does not have a ``.json`` suffix.
    """
    if path is None:
        return {}
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")
    if path.suffix.lower() != ".json":
        raise ValueError("Config currently supports JSON files only")
    raw_text = path.read_text(encoding="utf-8")
    parsed = json.loads(raw_text)
    return expand_env(parsed)
|
|
529
|
+
|
|
530
|
+
|
|
531
|
+
def config_get(config: dict, key: str, default=None):
    """Look up a dotted *key* such as ``"s3.bucket"`` in nested dicts.

    Returns *default* as soon as a path segment is missing or the current
    value is not a dict.
    """
    node = config
    for segment in key.split("."):
        if isinstance(node, dict) and segment in node:
            node = node[segment]
        else:
            return default
    return node
|
|
538
|
+
|
|
539
|
+
|
|
540
|
+
def build_s3_config(config: dict, args: argparse.Namespace) -> S3ImageConfig | None:
    """Merge command-line arguments and the config's ``s3`` section.

    Returns ``None`` unless ``args.images`` is ``"s3"``. Command-line values
    win over config values. For ``prefix`` and ``acl`` the config value is used
    only when the argument is ``None``; for all other settings any falsy
    argument falls back to the config value.

    Raises:
        ValueError: if no bucket or no public base URL is available.
    """
    if args.images != "s3":
        return None

    section = config_get(config, "s3", {}) or {}

    def pick(cli_value, key, fallback=None):
        # A falsy command-line value defers to the config file.
        return cli_value or section.get(key, fallback)

    bucket = pick(args.s3_bucket, "bucket")
    public_base_url = pick(args.s3_public_base_url, "public_base_url")
    if not (bucket and public_base_url):
        raise ValueError("S3/R2 image mode requires bucket and public_base_url")

    prefix = section.get("prefix", "") if args.s3_prefix is None else args.s3_prefix
    acl = section.get("acl") if args.s3_acl is None else args.s3_acl
    return S3ImageConfig(
        bucket=bucket,
        public_base_url=public_base_url,
        prefix=prefix,
        endpoint_url=pick(args.s3_endpoint_url, "endpoint_url"),
        region_name=pick(args.s3_region, "region_name"),
        access_key_id=pick(args.s3_access_key_id, "access_key_id"),
        secret_access_key=pick(args.s3_secret_access_key, "secret_access_key"),
        cache_control=pick(args.s3_cache_control, "cache_control", "public, max-age=31536000, immutable"),
        acl=acl,
    )
|
|
559
|
+
|
|
560
|
+
|
|
561
|
+
class BuildCorpusExporter:
|
|
562
|
+
def __init__(
    self,
    input_path: Path,
    output_dir: Path,
    equation_mode: str = "tex",
    output_md: Path | None = None,
    assets_dir: Path | None = None,
    report_path: Path | None = None,
    image_mode: str = "assets",
    s3_config: S3ImageConfig | None = None,
):
    """Record the export settings and reset all per-run state.

    Unset paths default to locations inside *output_dir*: ``<input stem>.md``,
    ``assets`` and ``export-report.json``. An ``S3ImageUploader`` is created
    only when *image_mode* is ``"s3"`` and *s3_config* is given.
    """
    # Source and destinations.
    self.input_path = input_path
    self.output_dir = output_dir
    default_markdown = output_dir / (input_path.stem + ".md")
    self.output_md = output_md or default_markdown
    self.assets_dir = assets_dir or (output_dir / "assets")
    self.report_path = report_path or (output_dir / "export-report.json")
    # Image links in the markdown are prefixed with the assets folder name.
    self.asset_ref_prefix = self.assets_dir.name

    # Conversion modes.
    self.equation_mode = equation_mode
    self.image_mode = image_mode
    self.s3_config = s3_config
    wants_uploader = image_mode == "s3" and s3_config
    self.s3_uploader = S3ImageUploader(s3_config) if wants_uploader else None

    # Per-run state filled in during export.
    self.stats = ExportStats()
    self.rels: dict[str, str] = {}
    self.media_map: dict[str, str] = {}
    self.image_uploads: list[dict[str, str]] = []
    self.equation_asset_map: dict[int, str] = {}
    self.empty_equation_indexes: set[int] = set()
    self.equation_index = 0
    self.equation_samples: list[dict[str, str]] = []
    self.table_depth = 0
|
|
592
|
+
|
|
593
|
+
def export(self) -> dict:
    """Convert the input document to Markdown and write a JSON report.

    Creates the output directory (and the assets directory when images are
    stored as assets or equations are rendered as images), collects the
    document's media, renders the body to Markdown, writes the Markdown file
    and the report, and returns the report dict.

    Raises:
        RuntimeError: if word/document.xml has no ``w:body`` element.
    """
    self.output_dir.mkdir(parents=True, exist_ok=True)
    if self.image_mode == "assets" or self.equation_mode == "image":
        self.assets_dir.mkdir(parents=True, exist_ok=True)

    # Only the reads that need the archive happen while it is open.
    with self.open_input_zip() as zf:
        self.rels = relationship_map(zf)
        self._copy_media(zf)
        document_xml = zf.read("word/document.xml")

    root = ET.fromstring(document_xml)
    body = root.find("w:body", NS)
    if body is None:
        raise RuntimeError("word/document.xml has no w:body")
    if self.equation_mode == "image":
        self._render_equation_assets(root)

    markdown = self.render_children(body, top_level=True).strip() + "\n"

    self.output_md.parent.mkdir(parents=True, exist_ok=True)
    self.output_md.write_text(markdown, encoding="utf-8")
    report = {
        "input": str(self.input_path),
        "output": str(self.output_md),
        "assets_dir": str(self.assets_dir) if self.assets_dir.exists() else None,
        "image_mode": self.image_mode,
        "image_uploads": self.image_uploads,
        "stats": self.stats.__dict__,
        "equation_samples": self.equation_samples[:50],
    }
    # A custom report path may point outside output_dir, so its folder is
    # created just like the one for the Markdown file.
    self.report_path.parent.mkdir(parents=True, exist_ok=True)
    self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
    return report
|
|
624
|
+
|
|
625
|
+
@contextlib.contextmanager
|
|
626
|
+
def open_input_zip(self):
|
|
627
|
+
try:
|
|
628
|
+
with ZipFile(self.input_path) as zf:
|
|
629
|
+
yield zf
|
|
630
|
+
return
|
|
631
|
+
except PermissionError:
|
|
632
|
+
pass
|
|
633
|
+
|
|
634
|
+
with tempfile.TemporaryDirectory(prefix="build-corpus-input-") as tmp:
|
|
635
|
+
temp_input = Path(tmp) / self.input_path.name
|
|
636
|
+
self.copy_locked_input(temp_input)
|
|
637
|
+
self.stats.warnings.append(
|
|
638
|
+
f"Input file was locked; converted from temporary copy: {temp_input}"
|
|
639
|
+
)
|
|
640
|
+
with ZipFile(temp_input) as zf:
|
|
641
|
+
yield zf
|
|
642
|
+
|
|
643
|
+
def copy_locked_input(self, temp_input: Path) -> None:
|
|
644
|
+
try:
|
|
645
|
+
shutil.copyfile(self.input_path, temp_input)
|
|
646
|
+
return
|
|
647
|
+
except PermissionError:
|
|
648
|
+
if os.name != "nt":
|
|
649
|
+
raise
|
|
650
|
+
|
|
651
|
+
source = str(self.input_path).replace("'", "''")
|
|
652
|
+
target = str(temp_input).replace("'", "''")
|
|
653
|
+
command = f"Copy-Item -LiteralPath '{source}' -Destination '{target}' -Force"
|
|
654
|
+
result = subprocess.run(
|
|
655
|
+
["powershell", "-NoProfile", "-Command", command],
|
|
656
|
+
capture_output=True,
|
|
657
|
+
text=True,
|
|
658
|
+
)
|
|
659
|
+
if result.returncode != 0:
|
|
660
|
+
message = result.stderr.strip() or result.stdout.strip() or "unknown error"
|
|
661
|
+
raise PermissionError(f"Could not copy locked input via PowerShell: {message}")
|
|
662
|
+
|
|
663
|
+
def _copy_media(self, zf: ZipFile) -> None:
|
|
664
|
+
for name in zf.namelist():
|
|
665
|
+
if not name.startswith("word/media/"):
|
|
666
|
+
continue
|
|
667
|
+
mime_type = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
|
|
668
|
+
if self.image_mode == "base64":
|
|
669
|
+
data = zf.read(name)
|
|
670
|
+
encoded = base64.b64encode(data).decode("ascii")
|
|
671
|
+
self.media_map[name] = f"data:{mime_type};base64,{encoded}"
|
|
672
|
+
elif self.image_mode == "s3":
|
|
673
|
+
if self.s3_uploader is None:
|
|
674
|
+
raise RuntimeError("S3/R2 image mode needs s3_config")
|
|
675
|
+
data = zf.read(name)
|
|
676
|
+
upload = self.s3_uploader.upload(name, data, mime_type)
|
|
677
|
+
self.image_uploads.append(upload)
|
|
678
|
+
self.media_map[name] = upload["url"]
|
|
679
|
+
else:
|
|
680
|
+
target = self.assets_dir / Path(name).name
|
|
681
|
+
with zf.open(name) as src, target.open("wb") as dst:
|
|
682
|
+
shutil.copyfileobj(src, dst)
|
|
683
|
+
self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"
|
|
684
|
+
|
|
685
|
+
def _render_equation_assets(self, document_root: ET.Element) -> None:
    """Render every non-empty ``m:oMath`` element to a PNG in the assets folder.

    Equations are numbered from 1 in document order. Empty ones are recorded
    in ``self.empty_equation_indexes``. Each remaining equation is placed in a
    temporary .docx, which Word (driven through COM) saves as HTML; the PNG
    files Word writes next to the HTML are copied to
    ``eq-<number>.png`` and recorded in ``self.equation_asset_map``.

    If python-docx or pywin32 cannot be imported, a warning is recorded and
    nothing is rendered.
    """
    equations = document_root.findall(".//m:oMath", NS)
    if not equations:
        return

    pending = []
    for position, equation in enumerate(equations, 1):
        if self.is_empty_equation(equation):
            self.empty_equation_indexes.add(position)
        else:
            pending.append((position, equation))

    try:
        from docx import Document
        from docx.oxml import parse_xml
        import win32com.client as win32
    except Exception as exc:
        self.stats.warnings.append(f"Equation image rendering unavailable: {exc!r}")
        return

    # One equation per temporary document.
    batch_size = 1
    word = win32.DispatchEx("Word.Application")
    word.Visible = False
    word.DisplayAlerts = 0
    try:
        for offset in range(0, len(pending), batch_size):
            batch = pending[offset : offset + batch_size]
            with tempfile.TemporaryDirectory(prefix="build-corpus-equations-") as scratch:
                scratch_dir = Path(scratch)
                docx_path = scratch_dir / "equations.docx"
                html_path = scratch_dir / "equations.html"

                carrier = Document()
                for number, equation in batch:
                    paragraph = carrier.add_paragraph(f"EQMARKER{number:06d} ")
                    paragraph._p.append(parse_xml(ET.tostring(equation, encoding="unicode")))
                carrier.save(docx_path)

                session = word.Documents.Open(str(docx_path), ReadOnly=True, AddToRecentFiles=False)
                # 10 is wdFormatFilteredHTML in Word's WdSaveFormat enumeration.
                session.SaveAs2(str(html_path), FileFormat=10)
                session.Close(False)

                images_dir = html_path.with_name(html_path.stem + "_files")
                images = sorted(images_dir.glob("image*.png"))
                if len(images) != len(batch):
                    self.stats.warnings.append(
                        f"Equation image count mismatch in render chunk {offset + 1}-{offset + len(batch)}: "
                        f"OMML={len(batch)} rendered={len(images)}"
                    )

                # Images are paired with equations by sorted file name order.
                for (number, _unused), image in zip(batch, images):
                    destination = self.assets_dir / f"eq-{number:06d}.png"
                    shutil.copyfile(image, destination)
                    self.equation_asset_map[number] = f"{self.asset_ref_prefix}/{destination.name}"
    finally:
        word.Quit()
    self.stats.equation_images = len(self.equation_asset_map)
    self.stats.skipped_empty_equations = len(self.empty_equation_indexes)
|
|
742
|
+
|
|
743
|
+
def render_children(self, node: ET.Element, top_level: bool = False) -> str:
    """Render each direct child of ``node`` as a block and join the results.

    Args:
        node: Element whose direct children are handed to ``render_block``.
        top_level: When true, each block is trimmed on the right only (its
            leading whitespace is kept) and blocks are separated by a blank
            line. Otherwise blocks are trimmed on both sides and separated
            by a single newline.

    Returns:
        The joined text. Children that render to nothing, or to nothing
        after trimming, are left out.
    """
    trim = str.rstrip if top_level else str.strip
    joiner = "\n\n" if top_level else "\n"
    blocks: list[str] = []
    for element in list(node):
        text = self.render_block(element)
        if text:
            blocks.append(trim(text))
    return joiner.join(block for block in blocks if block)
|
|
755
|
+
|
|
756
|
+
def render_block(self, node: ET.Element) -> str:
    """Dispatch one block-level element to the matching renderer.

    Paragraphs (``p``) and tables (``tbl``) go to their dedicated renderers.
    A structured-document tag (``sdt``) is rendered through its
    ``w:sdtContent`` child, or yields ``""`` when it has none. Bookmark,
    section, proofing and permission markers produce no output. Any other
    element is treated as a plain container and its children are rendered
    as nested (non-top-level) blocks.
    """
    tag = local_name(node.tag)

    # Pure bookkeeping elements never contribute text.
    if tag in {"bookmarkStart", "bookmarkEnd", "sectPr", "proofErr", "permStart", "permEnd"}:
        return ""

    if tag == "p":
        return self.render_paragraph(node)
    if tag == "tbl":
        return self.render_table(node)
    if tag == "sdt":
        wrapped = node.find("./w:sdtContent", NS)
        if wrapped is None:
            return ""
        return self.render_children(wrapped, top_level=True)

    return self.render_children(node, top_level=False)
|
|
768
|
+
|
|
769
|
+
def render_paragraph(self, p: ET.Element) -> str:
    """Render one ``w:p`` paragraph as a Markdown block.

    The inline content is rendered first. A paragraph whose content is
    blank yields ``""``. Otherwise the first matching case decides the
    output, in this order:

    1. code paragraph (code style, or code-looking text outside tables)
       -> fenced block from ``render_code_paragraph``;
    2. heading style -> ``#``-prefixed line with newlines flattened;
    3. quote style -> ``> `` prefixed text;
    4. numbered/bulleted paragraph -> ``-`` or ``1.`` list item, indented
       by nesting level;
    5. math paragraph -> the content as-is inside tables or without display
       layout, otherwise wrapped in a ``$$`` block;
    6. anything else -> the content unchanged.

    Side effects: increments the matching counter on ``self.stats``
    (``code_blocks``, ``headings``, ``paragraphs`` or ``lists``).

    Args:
        p: The paragraph element.

    Returns:
        The Markdown for the paragraph, or ``""`` when it is empty.
    """
    style = paragraph_style(p)
    content = self.render_inline_children(p)
    if not content.strip():
        return ""
    content = strip_trailing_markdown_breaks(content)

    if is_code_style(style) or (self.table_depth == 0 and paragraph_is_code(p)):
        self.stats.code_blocks += 1
        return self.render_code_paragraph(p)

    level = heading_level(style)
    if level:
        self.stats.headings += 1
        return f"{'#' * level} {self.strip_inline_markers(content)}"

    if is_quote_style(style):
        self.stats.paragraphs += 1
        return f"> {content}"

    # Explicit numbering properties win; the paragraph style is the fallback.
    num_info = paragraph_num_info(p)
    if not num_info:
        num_info = paragraph_list_style_info(style)
    if num_info:
        self.stats.lists += 1
        list_level, ordered = num_info
        # Two spaces per level: a child item must be indented at least to
        # the content column of its "- " parent to nest. A single space per
        # level leaves every item a sibling of its parent.
        indent = "  " * list_level
        bullet = "1." if ordered else "-"
        return f"{indent}{bullet} {content}"

    if paragraph_is_math(p):
        # Inside tables, or without display layout, keep the math inline.
        if self.table_depth > 0:
            return content
        if not paragraph_has_display_math_layout(p):
            return content
        inner = content.strip()
        # Drop one pair of inline "$" delimiters before wrapping in "$$".
        if inner.startswith("$") and inner.endswith("$") and len(inner) >= 2:
            inner = inner[1:-1]
        return f"$$\n{inner}\n$$"

    self.stats.paragraphs += 1
    return content
|
|
811
|
+
|
|
812
|
+
def render_code_paragraph(self, p: ET.Element) -> str:
    """Render a code paragraph as a fenced Markdown code block.

    Only the paragraph's direct ``w:r`` runs are read. If the first
    non-empty run is bold, its stripped text becomes the fence info string
    instead of code; every other non-empty run is appended to the code
    text. Leading and trailing newlines of the code are removed.

    The fence is at least three backticks and always longer than the
    longest backtick run inside the code, so code that itself contains
    backtick runs cannot close the block early.

    Args:
        p: The paragraph element holding the code runs.

    Returns:
        The opening fence (with optional info string), the code, and a
        closing fence of the same length, separated by newlines.
    """
    info = ""
    code_parts: list[str] = []
    first_nonempty_seen = False

    for run in p.findall("./w:r", NS):
        raw = extract_run_text(run)
        if not raw:
            continue
        if not first_nonempty_seen and run_is_bold(run):
            # A leading bold run is treated as the info string, not as code.
            info = raw.strip()
            first_nonempty_seen = True
            continue
        first_nonempty_seen = True
        code_parts.append(raw)

    code = "".join(code_parts).strip("\n")

    # Find the longest run of consecutive backticks in the code.
    longest_run = current_run = 0
    for char in code:
        current_run = current_run + 1 if char == "`" else 0
        if current_run > longest_run:
            longest_run = current_run

    ticks = "`" * max(3, longest_run + 1)
    fence = f"{ticks}{info}".rstrip()
    return f"{fence}\n{code}\n{ticks}"
|
|
831
|
+
|
|
832
|
+
def render_inline_children(self, node: ET.Element) -> str:
    """Render the inline children of ``node`` and merge them into one string.

    Each child contributes ``(style, text)`` segments:

    * ``r`` runs contribute whatever ``render_run_segments`` returns;
    * ``hyperlink`` becomes ``[label](url)``, where the URL is ``#anchor``
      when an anchor is present and otherwise the relationship target
      looked up in ``self.rels``; with no URL only the label is kept;
    * ``oMath`` / ``oMathPara`` go through ``render_math`` as inline /
      display math respectively;
    * ``drawing`` contributes its rendered image when that is non-empty;
    * ``pPr`` / ``rPr`` property elements are ignored;
    * any other element is rendered recursively.

    The segments are combined by ``coalesce_inline_segments``.
    """
    pieces: list[tuple[Optional[InlineStyle], str]] = []
    for element in list(node):
        tag = local_name(element.tag)

        if tag in {"pPr", "rPr"}:
            continue
        if tag == "r":
            pieces.extend(self.render_run_segments(element))
            continue
        if tag == "drawing":
            picture = self.render_image(element)
            if picture:
                pieces.append((None, picture))
            continue

        if tag == "hyperlink":
            label = self.render_inline_children(element).strip()
            anchor = attr(element, "w", "anchor")
            rel_id = attr(element, "r", "id")
            if anchor:
                href = f"#{anchor}"
            else:
                href = self.rels.get(rel_id or "", "")
            text = f"[{label}]({href})" if href else label
        elif tag in {"oMath", "oMathPara"}:
            text = self.render_math(element, display=(tag == "oMathPara"))
        else:
            text = self.render_inline_children(element)
        pieces.append((None, text))

    return coalesce_inline_segments(pieces)
|
|
857
|
+
|
|
858
|
+
def run_style(self, run: ET.Element) -> InlineStyle:
    """Return the ``(is_code, bold, italic)`` flags for a run.

    A run without ``w:rPr`` has all three flags false. A run counts as code
    when its ``w:rStyle`` value contains "code" (case-insensitive) or when
    ``run_is_code`` says so.

    Bold and italic are on/off toggle properties: the element may carry
    ``w:val="0"``, ``"false"`` or ``"off"`` to switch the property off
    explicitly (for example to override a bold style). Such elements are
    reported as not bold / not italic; a missing ``w:val`` or any other
    value means on.

    Args:
        run: The ``w:r`` element to inspect.

    Returns:
        A 3-tuple ``(is_code, bold, italic)``.
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False, False, False

    def toggle_is_on(path: str) -> bool:
        # Presence alone is not enough: honour an explicit "off" value.
        element = props.find(path, NS)
        if element is None:
            return False
        value = (attr(element, "w", "val") or "").strip().lower()
        return value not in {"0", "false", "off"}

    style_node = props.find("./w:rStyle", NS)
    is_code = (
        style_node is not None and "code" in (attr(style_node, "w", "val") or "").lower()
    ) or run_is_code(run)
    bold = toggle_is_on("./w:b")
    italic = toggle_is_on("./w:i")
    return is_code, bold, italic
|
|
869
|
+
|
|
870
|
+
def render_run_segments(self, run: ET.Element) -> list[tuple[Optional[InlineStyle], str]]:
    """Split one ``w:r`` run into ``(style, text)`` segments.

    Text-like children (``t``, hyphens, tabs, breaks) are buffered and
    flushed as one segment: wrapped in ``$...$`` with no style for math
    runs, otherwise tagged with the run's style. Drawings and unknown
    children flush the buffer first, so their output keeps its position,
    and are emitted as unstyled segments when non-empty.

    Line breaks (``br`` / ``cr``) are written as two spaces plus a newline,
    which is the Markdown hard line break; one trailing space would be
    rendered as an ordinary soft wrap.

    Args:
        run: The run element to convert.

    Returns:
        The segments in document order; a style of ``None`` marks text that
        must not receive inline formatting.
    """
    style = self.run_style(run)
    is_math = run_is_math(run)
    segments: list[tuple[Optional[InlineStyle], str]] = []
    run_parts: list[str] = []

    # Children that map to a fixed piece of text.
    literal_text = {
        "noBreakHyphen": "\u2011",
        "softHyphen": "\u00ad",
        "tab": "\t",
        "br": "  \n",
        "cr": "  \n",
    }

    def flush_text() -> None:
        # Emit the buffered text as a single segment and reset the buffer.
        if run_parts:
            text = "".join(run_parts)
            if is_math:
                segments.append((None, f"${text}$"))
            else:
                segments.append((style, text))
            run_parts.clear()

    for child in list(run):
        name = local_name(child.tag)
        if name == "t":
            # Math text is only cleaned; ordinary text is Markdown-escaped.
            run_parts.append(clean_text(child.text or "") if is_math else escape_md_text(child.text or ""))
        elif name in literal_text:
            run_parts.append(literal_text[name])
        elif name == "drawing":
            flush_text()
            img = self.render_image(child)
            if img:
                segments.append((None, img))
        elif name == "rPr":
            continue
        else:
            flush_text()
            nested = self.render_inline_children(child)
            if nested:
                segments.append((None, nested))

    flush_text()
    return segments
|
|
912
|
+
|
|
913
|
+
def render_math(self, node: ET.Element, display: bool) -> str:
    """Render one OMML equation as TeX or as a reference to its image.

    Every call advances ``self.equation_index`` and ``stats.equations``, so
    the index lines up with the keys of ``self.equation_asset_map``.

    * Empty equations (listed in ``self.empty_equation_indexes`` or without
      any math text) render as ``""``.
    * In ``"image"`` mode the equation becomes a Markdown image
      ``![equation N](asset)``. When no asset exists for the index, a
      warning is recorded and the TeX/text fallback is used instead.
    * Otherwise the OMML is converted to TeX. On failure the error is
      counted and recorded, and the raw math text is returned inside an
      inline code span.

    Display equations outside tables are surrounded by newlines.

    Args:
        node: The ``oMath`` / ``oMathPara`` element.
        display: True for display (block) equations.

    Returns:
        The Markdown fragment for the equation.
    """
    self.stats.equations += 1
    self.equation_index += 1
    if self.equation_index in self.empty_equation_indexes or self.is_empty_equation(node):
        self.stats.skipped_empty_equations = max(
            self.stats.skipped_empty_equations,
            len(self.empty_equation_indexes),
        )
        return ""

    if self.equation_mode == "image":
        asset = self.equation_asset_map.get(self.equation_index)
        if asset:
            alt = f"equation {self.equation_index}"
            # Emit the image reference; an empty string here would drop
            # every rendered equation from the output.
            rendered = f"![{alt}]({asset})"
            return f"\n{rendered}\n" if display and self.table_depth == 0 else rendered
        self.stats.warnings.append(f"Missing rendered equation asset for equation {self.equation_index}")
        return self.render_missing_equation_fallback(node)

    try:
        # Convert a re-parsed copy of the node rather than the node itself.
        tex = convert_omml(ET.fromstring(ET.tostring(node, encoding="unicode")))
        render_display = display and self.table_depth == 0
        rendered = normalize_tex(tex, display=render_display)
        # Keep a bounded sample of conversions on self.equation_samples.
        if len(self.equation_samples) < 50:
            self.equation_samples.append({
                "source": "".join(t.text or "" for t in node.findall(".//m:t", NS))[:220],
                "tex": rendered[:500],
            })
        return f"\n{rendered}\n" if render_display else rendered
    except Exception as exc:
        # Best effort: one bad equation must not abort the whole export.
        self.stats.equation_errors += 1
        self.stats.warnings.append(f"Equation conversion failed: {exc!r}")
        fallback = "".join(t.text or "" for t in node.findall(".//m:t", NS))
        return f"`[equation: {fallback}]`"
|
|
946
|
+
|
|
947
|
+
def render_missing_equation_fallback(self, node: ET.Element) -> str:
    """Render an equation as inline TeX when its image asset is missing.

    The node is serialised and re-parsed before conversion, and the result
    is normalised as non-display math. If any step fails, the equation's
    raw ``m:t`` text is returned inside an inline code span instead.
    """
    try:
        serialized = ET.tostring(node, encoding="unicode")
        detached = ET.fromstring(serialized)
        return normalize_tex(convert_omml(detached), display=False)
    except Exception:
        pieces = [t.text or "" for t in node.findall(".//m:t", NS)]
        return "`[equation: " + "".join(pieces) + "]`"
|
|
954
|
+
|
|
955
|
+
@staticmethod
def is_empty_equation(node: ET.Element) -> bool:
    """Return True when the equation has no visible math text.

    An equation is empty when every ``m:t`` descendant is missing its text
    or holds only whitespace (which includes having no ``m:t`` at all).
    """
    return all(not (t.text or "").strip() for t in node.findall(".//m:t", NS))
|
|
959
|
+
|
|
960
|
+
def render_image(self, node: ET.Element) -> str:
    """Render every picture referenced by a drawing as Markdown images.

    Each ``a:blip`` under ``node`` is resolved through ``self.rels`` and
    ``self.media_map`` to an exported asset reference. Blips whose media
    was not exported are skipped with a warning. In ``"assets"`` mode, when
    the drawing carries a preferred file name, the asset is copied to that
    name inside ``self.assets_dir`` (unless a file with that name already
    exists) and the reference points at the copy.

    Side effects: may copy a file, increments ``stats.images`` per emitted
    image and appends to ``stats.warnings`` for unresolved relationships.

    Args:
        node: The ``drawing`` element.

    Returns:
        The Markdown image references joined by single spaces, or ``""``
        when the drawing yields no image.
    """
    # presumably the file name recorded in the drawing's metadata — verify
    # against image_metadata_filename.
    preferred_name = image_metadata_filename(node)

    refs = []
    for blip in node.findall(".//a:blip", NS):
        rid = attr(blip, "r", "embed") or attr(blip, "r", "link")
        if rid:
            refs.append(rid)

    rendered = []
    for rid in refs:
        target = self.rels.get(rid, rid)
        source = resolve_image_target(target)
        asset = self.media_map.get(source)
        if not asset:
            self.stats.warnings.append(f"Image relationship not copied: {rid} -> {target}")
            continue
        if preferred_name and self.image_mode == "assets":
            current_path = self.output_dir / asset
            preferred_path = self.assets_dir / preferred_name
            if current_path.exists() and preferred_path != current_path and not preferred_path.exists():
                shutil.copyfile(current_path, preferred_path)
            # Only point at the preferred name when that file really
            # exists; otherwise keep the reference that is known to resolve.
            if preferred_path.exists():
                asset = f"{self.asset_ref_prefix}/{preferred_path.name}"
        self.stats.images += 1
        # Emit the image reference; an empty string here would drop every
        # picture from the output while still counting it.
        rendered.append(f"![image]({asset})")
    return " ".join(rendered)
|
|
984
|
+
|
|
985
|
+
def render_table(self, tbl: ET.Element) -> str:
    """Render a ``w:tbl`` element as a Markdown table or an HTML table.

    Cells are rendered while ``self.table_depth`` is raised by one; the
    depth is restored even if rendering fails. The HTML form is chosen
    when any cell contains a nested table or its text contains a blank
    line; otherwise a Markdown pipe table is produced. A table without
    rows renders as ``""``.

    Side effects: increments ``stats.tables`` and then either
    ``stats.html_tables`` or ``stats.markdown_tables``.
    """
    self.stats.tables += 1
    self.table_depth += 1
    grid = []
    needs_html = False
    try:
        for tr in tbl.findall("./w:tr", NS):
            line = []
            for tc in tr.findall("./w:tc", NS):
                has_nested_table = tc.find(".//w:tbl", NS) is not None
                text = self.render_cell(tc)
                if has_nested_table or "\n\n" in text:
                    needs_html = True
                line.append(text)
            grid.append(line)
    finally:
        self.table_depth -= 1

    if not grid:
        return ""
    if not needs_html:
        self.stats.markdown_tables += 1
        return self.render_markdown_table(grid)
    self.stats.html_tables += 1
    return self.render_html_table(grid)
|
|
1013
|
+
|
|
1014
|
+
def render_cell(self, cell: ET.Element) -> str:
    """Render the blocks of one table cell, joined with ``<br>``.

    The ``tcPr`` properties element is skipped. Every other child goes
    through ``render_block``; non-empty results are trimmed and joined with
    ``<br>``, and the joined text is trimmed once more.
    """
    pieces = []
    for element in list(cell):
        if local_name(element.tag) != "tcPr":
            block = self.render_block(element)
            if block:
                pieces.append(block.strip())
    joined = "<br>".join(pieces)
    return joined.strip()
|
|
1023
|
+
|
|
1024
|
+
def render_markdown_table(self, rows: list[list[str]]) -> str:
    """Render rows of cell text as a Markdown pipe table.

    The first row is the header. Short rows are padded with empty cells to
    the width of the widest row. Within a cell, newlines become ``<br>``
    and ``|`` is backslash-escaped so it cannot end the cell.

    Args:
        rows: Cell texts, one inner sequence per table row.

    Returns:
        The table text, or ``""`` when there are no rows or no row has any
        cell (a pipe table needs at least one column).
    """
    width = max((len(row) for row in rows), default=0)
    if width == 0:
        return ""
    padded = [list(row) + [""] * (width - len(row)) for row in rows]

    def clean_cell(value: str) -> str:
        return value.replace("\n", "<br>").replace("|", "\\|").strip()

    def format_row(cells: list[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    lines = [
        format_row([clean_cell(value) for value in padded[0]]),
        format_row(["---"] * width),
    ]
    lines.extend(format_row([clean_cell(value) for value in row]) for row in padded[1:])
    return "\n".join(lines)
|
|
1037
|
+
|
|
1038
|
+
def render_html_table(self, rows: list[list[str]]) -> str:
    """Render rows of cell text as a plain HTML ``<table>``.

    Cell text is HTML-escaped (quotes are left alone) so that characters
    such as ``<`` in inline math stay readable. The ``<br>`` separators
    that ``render_cell`` puts between a cell's blocks are restored after
    escaping, and remaining newlines are turned into ``<br>``; without the
    restore step those separators would show up as literal ``&lt;br&gt;``.

    Args:
        rows: Cell texts, one inner list per table row.

    Returns:
        The table markup, one tag per line.
    """
    html_rows = ["<table>"]
    for row in rows:
        html_rows.append(" <tr>")
        for cell in row:
            # Keep inline Markdown-ish math readable inside HTML fallback.
            escaped = html.escape(cell, quote=False)
            body = escaped.replace("&lt;br&gt;", "<br>").replace("\n", "<br>")
            html_rows.append(f" <td>{body}</td>")
        html_rows.append(" </tr>")
    html_rows.append("</table>")
    return "\n".join(html_rows)
|
|
1048
|
+
|
|
1049
|
+
@staticmethod
|
|
1050
|
+
def strip_inline_markers(text: str) -> str:
|
|
1051
|
+
return text.replace("\n", " ").strip()
|
|
1052
|
+
|
|
1053
|
+
|
|
1054
|
+
def export_one(
    input_path: Path,
    output_root: Path,
    equation_mode: str,
    out_same_dir: bool,
    image_mode: str,
    s3_config: S3ImageConfig | None = None,
) -> dict:
    """Export a single document through ``BuildCorpusExporter``.

    With ``out_same_dir`` the outputs are placed next to the source file:
    ``<stem>.md``, ``<stem>.assets`` and ``<stem>.export-report.json``.
    Otherwise the output directory is ``output_root/<stem>`` and the
    Markdown, assets and report locations are passed as ``None``.

    Returns:
        Whatever ``BuildCorpusExporter.export()`` returns.
    """
    output_md = assets_dir = report_path = None
    if out_same_dir:
        stem = input_path.stem
        output_dir = input_path.parent
        output_md = input_path.with_suffix(".md")
        assets_dir = input_path.with_name(f"{stem}.assets")
        report_path = input_path.with_name(f"{stem}.export-report.json")
    else:
        output_dir = output_root / input_path.stem

    return BuildCorpusExporter(
        input_path,
        output_dir,
        equation_mode=equation_mode,
        output_md=output_md,
        assets_dir=assets_dir,
        report_path=report_path,
        image_mode=image_mode,
        s3_config=s3_config,
    ).export()
|
|
1083
|
+
|
|
1084
|
+
|
|
1085
|
+
def collect_inputs(path: Path, target: str) -> list[Path]:
    """Return the files to convert for ``path``.

    A file path is returned as the only entry, whatever its extension. A
    directory is searched recursively for ``.md`` files when ``target`` is
    ``"word"`` and for ``.docx`` / ``.pptx`` / ``.ppt`` files otherwise.

    Extensions are compared case-insensitively, so ``REPORT.DOCX`` is found
    on case-sensitive file systems too; this matches the lower-cased suffix
    check used when the files are dispatched. Directories whose name
    happens to end in a wanted extension are ignored.

    Args:
        path: A file or directory.
        target: The conversion target; only ``"word"`` selects Markdown
            inputs.

    Returns:
        The matching files in sorted order.
    """
    if path.is_file():
        return [path]
    wanted = {".md"} if target == "word" else {".docx", ".pptx", ".ppt"}
    return sorted(
        candidate
        for candidate in path.rglob("*")
        if candidate.suffix.lower() in wanted and candidate.is_file()
    )
|
|
1093
|
+
|
|
1094
|
+
|
|
1095
|
+
def main() -> None:
    """Command-line entry point: convert the inputs and write a batch report.

    Steps:

    1. Parse the command line and stop with a usage error when the input
       path does not exist. Without this check a mistyped path converts
       nothing and still reports success.
    2. Load the optional config file. Options not given on the command
       line fall back to the config value and then to a built-in default.
    3. Resolve the ``auto`` target: a single ``.md`` file goes to Word,
       any other file or a directory goes to Markdown.
    4. Convert every collected input, skipping ``~$`` files, with the
       exporter that matches its type.
    5. Write ``build-corpus-batch-report.json`` and print a JSON summary.
       The report goes into the input directory when ``--out-same-dir`` is
       used with a directory input, otherwise into the output directory.
    """
    parser = argparse.ArgumentParser(
        description="Convert Markdown to DOCX or DOCX/PPTX/PPT to Markdown.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""examples:
build-corpus input.docx --out out
build-corpus input.md --to word --out out
build-corpus ./word-files --out ./markdown
build-corpus ./word-files --out-same-dir
build-corpus input.docx --images base64
build-corpus input.docx --images s3 --config build-corpus.config.json

image modes:
assets copy images into an assets folder and reference them from Markdown
base64 embed images directly as Markdown data URIs
s3 upload images to S3-compatible storage such as Cloudflare R2 or AWS S3

equation modes:
tex convert Word OMML equations to KaTeX-readable TeX
image render equations as images for visual debugging only
""",
    )
    parser.add_argument("input", type=Path, help="Markdown, DOCX, PPTX, or PPT file or directory")
    parser.add_argument("--config", type=Path, help="JSON config file with conversion, output, and S3/R2 defaults")
    parser.add_argument("--out", type=Path, help="Output directory for converted Markdown tree")
    parser.add_argument("--to", choices=["auto", "markdown", "word"], help="Output target; auto infers from a single-file input")
    parser.add_argument("--equations", choices=["tex", "image"], help="Equation output mode; default comes from config or tex")
    parser.add_argument("--images", choices=["assets", "base64", "s3"], help="Image output mode; default comes from config or assets")
    parser.add_argument("--out-same-dir", action="store_true", help="Write .md, .assets, and reports beside each source DOCX")
    parser.add_argument("--word-template", type=Path, help="Optional .docx or .dotx template used for Markdown to Word exports")
    parser.add_argument("--s3-bucket", help="S3/R2 bucket name for --images s3")
    parser.add_argument("--s3-public-base-url", help="Public URL base used in Markdown, e.g. https://assets.example.com")
    parser.add_argument("--s3-prefix", help="Object key prefix for uploaded images")
    parser.add_argument("--s3-endpoint-url", help="S3-compatible endpoint, required for Cloudflare R2")
    parser.add_argument("--s3-region", help="S3 region; use auto for Cloudflare R2")
    parser.add_argument("--s3-access-key-id", help="S3/R2 access key id; can also come from config/env expansion")
    parser.add_argument("--s3-secret-access-key", help="S3/R2 secret access key; can also come from config/env expansion")
    parser.add_argument("--s3-cache-control", help="Cache-Control header for uploaded images")
    parser.add_argument("--s3-acl", help="Optional ACL for AWS S3; usually omitted for Cloudflare R2")
    args = parser.parse_args()

    # Fail loudly on a missing input; parser.error prints usage and exits
    # with status 2.
    if not args.input.exists():
        parser.error(f"input path does not exist: {args.input}")

    config = load_config(args.config)

    # Precedence for every option: command line, then config, then default.
    args.out = args.out or Path(config_get(config, "output.out", ".codex/build-corpus/out"))
    args.to = args.to or config_get(config, "conversion.target", "auto")
    args.equations = args.equations or config_get(config, "conversion.equations", "tex")
    args.images = args.images or config_get(config, "conversion.images", "assets")
    args.out_same_dir = args.out_same_dir or bool(config_get(config, "output.out_same_dir", False))
    args.word_template = args.word_template or (
        Path(config_get(config, "word.template")) if config_get(config, "word.template") else None
    )
    s3_config = build_s3_config(config, args)

    input_target = args.to
    if args.input.is_file() and args.to == "auto":
        input_target = "word" if args.input.suffix.lower() == ".md" else "markdown"
    elif args.input.is_dir() and args.to == "auto":
        input_target = "markdown"

    reports = []
    for input_path in collect_inputs(args.input, input_target):
        # Skip files whose name starts with "~$".
        if input_path.name.startswith("~$"):
            continue
        suffix = input_path.suffix.lower()
        if input_target == "word" or suffix == ".md":
            reports.append(export_markdown_to_docx(
                input_path,
                args.out,
                args.out_same_dir,
                template_path=args.word_template,
            ))
        elif suffix in {".pptx", ".ppt"}:
            reports.append(export_presentation(
                input_path,
                args.out,
                args.out_same_dir,
                image_mode=args.images,
            ))
        else:
            reports.append(export_one(
                input_path,
                args.out,
                equation_mode=args.equations,
                out_same_dir=args.out_same_dir,
                image_mode=args.images,
                s3_config=s3_config,
            ))

    batch_report_root = args.input if args.out_same_dir and args.input.is_dir() else args.out
    batch_report_root.mkdir(parents=True, exist_ok=True)
    batch_report = batch_report_root / "build-corpus-batch-report.json"
    batch_report.write_text(json.dumps(reports, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({
        "converted": len(reports),
        "batch_report": str(batch_report),
        "outputs": [report["output"] for report in reports],
        "default_word_template": str(args.word_template or resolve_default_template_path() or "bundled:md-to-word-template.dotx"),
    }, indent=2))
|
|
1192
|
+
|
|
1193
|
+
|
|
1194
|
+
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
|