regen.mde 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/LICENSE +16 -0
  2. package/README.md +295 -0
  3. package/bin/build-corpus-editor.js +81 -0
  4. package/bin/build-corpus.js +41 -0
  5. package/bin/postinstall.js +187 -0
  6. package/bin/regen-mdeditor-install.js +27 -0
  7. package/bin/regen-mdeditor-uninstall.js +19 -0
  8. package/bin/validate-katex.js +93 -0
  9. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +270 -0
  10. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -0
  11. package/desktop/BuildCorpusEditor/EditorForm.cs +540 -0
  12. package/desktop/BuildCorpusEditor/Program.cs +81 -0
  13. package/desktop/BuildCorpusEditor/app.manifest +16 -0
  14. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  15. package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
  16. package/dist/windows-editor/BuildCorpusEditor.deps.json +83 -0
  17. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  18. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  19. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  20. package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +19 -0
  21. package/dist/windows-editor/Microsoft.Web.WebView2.Core.dll +0 -0
  22. package/dist/windows-editor/Microsoft.Web.WebView2.Core.xml +6817 -0
  23. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.dll +0 -0
  24. package/dist/windows-editor/Microsoft.Web.WebView2.WinForms.xml +510 -0
  25. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.dll +0 -0
  26. package/dist/windows-editor/Microsoft.Web.WebView2.Wpf.xml +1902 -0
  27. package/dist/windows-editor/WebView2Loader.dll +0 -0
  28. package/dist/windows-editor/runtimes/win-x64/native/WebView2Loader.dll +0 -0
  29. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +326 -0
  30. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +1 -0
  31. package/dist/windows-editor/wwwroot/index.html +22 -0
  32. package/editor-web/index.html +21 -0
  33. package/editor-web/src/main.jsx +399 -0
  34. package/editor-web/src/styles.css +602 -0
  35. package/editor-web/vite.config.js +13 -0
  36. package/examples/build-corpus.config.example.json +21 -0
  37. package/installer/install-regen-mde.ps1 +175 -0
  38. package/installer/regen-mde.nsi +81 -0
  39. package/package.json +86 -0
  40. package/pyproject.toml +33 -0
  41. package/requirements.txt +4 -0
  42. package/scripts/build-windows-editor.ps1 +47 -0
  43. package/scripts/package-windows-editor.ps1 +90 -0
  44. package/scripts/run-corpus.ps1 +28 -0
  45. package/scripts/run-editor-implementation-plane.ps1 +203 -0
  46. package/scripts/run-required-tests.ps1 +98 -0
  47. package/scripts/run-smoke.ps1 +28 -0
  48. package/src/build_corpus/__init__.py +3 -0
  49. package/src/build_corpus/docx_exporter.py +798 -0
  50. package/src/build_corpus/exporter.py +1195 -0
  51. package/src/build_corpus/ppt_exporter.py +532 -0
  52. package/src/build_corpus/templates/__init__.py +1 -0
  53. package/src/build_corpus/templates/md-to-word-template.dotx +0 -0
  54. package/src/build_corpus/validate_assets.py +46 -0
  55. package/tools/audit_corpus.py +203 -0
  56. package/tools/collect_microsoft_word_templates.py +228 -0
  57. package/tools/collect_online_docx_corpus.py +272 -0
  58. package/tools/collect_online_pptx_corpus.py +252 -0
  59. package/tools/compare_pptx_inputs_outputs.py +87 -0
  60. package/tools/roundtrip_docx_corpus.py +171 -0
@@ -0,0 +1,1195 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import base64
5
+ import contextlib
6
+ import hashlib
7
+ import html
8
+ import json
9
+ import mimetypes
10
+ import os
11
+ import re
12
+ import shutil
13
+ import subprocess
14
+ import tempfile
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ from typing import Optional
18
+ from zipfile import ZipFile
19
+ from xml.etree import ElementTree as ET
20
+
21
+ from omml2latex import convert_omml
22
+ try:
23
+ from .docx_exporter import export_markdown_to_docx, resolve_default_template_path
24
+ except ImportError: # pragma: no cover - allows direct script execution
25
+ from build_corpus.docx_exporter import export_markdown_to_docx, resolve_default_template_path
26
+ try:
27
+ from .ppt_exporter import export_presentation
28
+ except ImportError: # pragma: no cover - allows direct script execution
29
+ from build_corpus.ppt_exporter import export_presentation
30
+
31
+
32
# Prefix -> namespace URI table passed to ElementTree find()/findall() calls.
NS = dict(
    w="http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    m="http://schemas.openxmlformats.org/officeDocument/2006/math",
    a="http://schemas.openxmlformats.org/drawingml/2006/main",
    r="http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
)

# Clark-notation prefixes ("{uri}") for building qualified w:/r: names by hand.
W = "{" + NS["w"] + "}"
R = "{" + NS["r"] + "}"
42
+
43
+
44
@dataclass
class ExportStats:
    """Counters and warnings accumulated during one export run.

    Field order is significant: the instance ``__dict__`` is serialized as-is
    into the JSON export report.
    """

    # Block-level constructs emitted to Markdown.
    paragraphs: int = field(default=0)
    headings: int = field(default=0)
    code_blocks: int = field(default=0)
    tables: int = field(default=0)
    markdown_tables: int = field(default=0)
    html_tables: int = field(default=0)
    # Equation bookkeeping.
    equations: int = field(default=0)
    equation_images: int = field(default=0)
    skipped_empty_equations: int = field(default=0)
    equation_errors: int = field(default=0)
    # Remaining counters.
    images: int = field(default=0)
    lists: int = field(default=0)
    # Human-readable, non-fatal problems; a fresh list per instance.
    warnings: list[str] = field(default_factory=list)
59
+
60
+
61
@dataclass
class S3ImageConfig:
    """Settings for uploading extracted images to an S3-compatible bucket."""

    # Required: destination bucket and the base URL used to build public links.
    bucket: str
    public_base_url: str
    # Optional key prefix placed in front of the generated object key.
    prefix: str = ""
    # Optional client settings; unset values are not passed to the client.
    endpoint_url: str | None = None
    region_name: str | None = None
    access_key_id: str | None = None
    secret_access_key: str | None = None
    # Cache-Control header stored with every uploaded object.
    cache_control: str = "public, max-age=31536000, immutable"
    # Optional canned ACL; only sent with the upload when set.
    acl: str | None = None
72
+
73
+
74
class S3ImageUploader:
    """Uploads image bytes to an S3-compatible bucket under content-hash keys."""

    def __init__(self, config: S3ImageConfig):
        """Create the boto3 S3 client from ``config``.

        Raises:
            RuntimeError: if boto3 is not installed.
        """
        self.config = config
        try:
            import boto3
        except ImportError as exc:
            raise RuntimeError("S3/R2 image mode requires boto3. Install with: pip install boto3") from exc

        # Only forward settings that are actually set, so boto3 applies its
        # own defaults for everything left empty.
        client_options = {"service_name": "s3"}
        optional_settings = (
            ("endpoint_url", config.endpoint_url),
            ("region_name", config.region_name),
            ("aws_access_key_id", config.access_key_id),
            ("aws_secret_access_key", config.secret_access_key),
        )
        for option, setting in optional_settings:
            if setting:
                client_options[option] = setting
        self.client = boto3.client(**client_options)

    def upload(self, source_name: str, data: bytes, content_type: str) -> dict[str, str]:
        """Store ``data`` under a SHA-256 derived key and describe the upload.

        The key is ``[prefix/]images/sha256/<digest><lowercased suffix>``.
        Returns a dict of strings with source, sha256, bucket, key, url,
        content_type and bytes (the length, as a string).
        """
        cfg = self.config
        digest = hashlib.sha256(data).hexdigest()
        extension = Path(source_name).suffix.lower()
        segments = (cfg.prefix.strip("/"), "images", "sha256", digest + extension)
        key = "/".join(filter(None, segments))

        request = dict(
            Bucket=cfg.bucket,
            Key=key,
            Body=data,
            ContentType=content_type,
            CacheControl=cfg.cache_control,
        )
        if cfg.acl:
            request["ACL"] = cfg.acl
        self.client.put_object(**request)

        base_url = cfg.public_base_url.rstrip("/")
        return {
            "source": source_name,
            "sha256": digest,
            "bucket": cfg.bucket,
            "key": key,
            "url": f"{base_url}/{key}",
            "content_type": content_type,
            "bytes": str(len(data)),
        }
115
+
116
+
117
def local_name(tag: str) -> str:
    """Return ``tag`` without its ``{namespace}`` prefix, if it has one."""
    # rpartition yields ("", "", tag) when no "}" is present.
    _, _, bare = tag.rpartition("}")
    return bare
119
+
120
+
121
def attr(node: ET.Element, ns: str, name: str) -> str | None:
    """Read the attribute ``name`` qualified by namespace prefix ``ns``.

    ``ns`` must be a key of ``NS``. Returns ``None`` when the attribute is
    absent.
    """
    qualified = "{%s}%s" % (NS[ns], name)
    return node.attrib.get(qualified)
123
+
124
+
125
def clean_text(text: str) -> str:
    """Turn U+00A0 into a plain space and drop U+200B and U+FEFF."""
    return text.translate({0x00A0: " ", 0x200B: None, 0xFEFF: None})
131
+
132
+
133
def escape_md_text(text: str) -> str:
    """Escape Markdown-significant characters in plain run text.

    The text is first passed through ``clean_text``. Then:

    * a backslash followed by a Markdown-escapable character is treated as
      an existing escape and copied through unchanged;
    * any other backslash, including one at the very end of the text, is
      doubled;
    * ``*``, ``_`` and ``$`` are prefixed with a backslash.
    """
    text = clean_text(text)
    escapable = "\\`*_{}[]()#+.!|$-"
    escaped: list[str] = []
    index = 0
    length = len(text)
    while index < length:
        char = text[index]
        if char == "\\":
            next_char = text[index + 1] if index + 1 < length else ""
            # `"" in escapable` is always True, so an explicit non-empty check
            # is needed; otherwise a trailing backslash is taken for an
            # existing escape and emitted undoubled.
            if next_char and next_char in escapable:
                escaped.append("\\")
                escaped.append(next_char)
                index += 2
                continue
            escaped.append("\\\\")
        elif char in {"*", "_", "$"}:
            escaped.append("\\" + char)
        else:
            escaped.append(char)
        index += 1
    return "".join(escaped)
153
+
154
+
155
def normalize_inline_markers(text: str) -> str:
    """Put a space between an inline image and text glued directly to it.

    A space is added after an image when the next character is not
    whitespace, closing punctuation or a bracket, and before an image when
    the previous character is not whitespace, an opening bracket or
    punctuation.
    """
    glued_after = r"(\!\[[^\]]*\]\([^)]+\))(?=[^\s<>)\].,;:!?])"
    glued_before = r"(?<=[^\s<(\[.,;:!?])(\!\[[^\]]*\]\([^)]+\))"
    spaced_after = re.sub(glued_after, r"\1 ", text)
    return re.sub(glued_before, r" \1", spaced_after)
160
+
161
+
162
def strip_trailing_markdown_breaks(text: str) -> str:
    """Remove trailing hard-break spaces and any other trailing whitespace.

    The previous loop tested for a trailing space but sliced two characters
    off, so text ending in an odd number of spaces lost its last real
    character (``"abc "`` became ``"ab"``). Only trailing whitespace is ever
    meant to go, so a single ``rstrip()`` gives the intended result.
    """
    return text.rstrip()
166
+
167
+
168
# (is_code, bold, italic) flags describing how a run of text is styled.
InlineStyle = tuple[bool, bool, bool]


def apply_inline_style(text: str, style: InlineStyle) -> str:
    """Wrap ``text`` in the Markdown markers that ``style`` asks for.

    Empty text gives ``""`` and whitespace-only text is returned untouched,
    so no markers are emitted around nothing. Code styling wins over bold and
    italic; backticks inside code text are prefixed with a backslash.
    """
    is_code, bold, italic = style
    if not text:
        return ""
    if not text.strip():
        return text
    if is_code:
        # Do the replace outside the f-string: a backslash inside an f-string
        # replacement field is a SyntaxError before Python 3.12.
        escaped = text.replace("`", "\\`")
        return f"`{escaped}`"
    if bold and italic:
        return f"***{text}***"
    if bold:
        return f"**{text}**"
    if italic:
        return f"*{text}*"
    return text
186
+
187
+
188
def coalesce_inline_segments(segments: list[tuple[Optional[InlineStyle], str]]) -> str:
    """Merge adjacent same-style segments and render them to Markdown.

    Segments with empty text are ignored. A segment whose style is ``None``
    is already-rendered markup: it is emitted verbatim and separates the
    styled runs around it. Consecutive segments with an equal style are
    joined before the style markers are applied once around the whole run.
    """
    # Each entry is [style, [text, ...]]; only styled entries ever grow.
    runs: list[list] = []
    for style, text in segments:
        if not text:
            continue
        if style is not None and runs and runs[-1][0] == style:
            runs[-1][1].append(text)
        else:
            runs.append([style, [text]])

    pieces = [
        "".join(chunks) if style is None else apply_inline_style("".join(chunks), style)
        for style, chunks in runs
    ]
    return normalize_inline_markers("".join(pieces))
217
+
218
+
219
def normalize_tex(tex: str, display: bool) -> str:
    """Clean converter-produced TeX and wrap it in Markdown math delimiters.

    Steps, in order: strip one existing ``$``/``$$`` wrapper, replace U+2011
    with ``-``, escape literal dollar signs, rewrite a few ``\\text{...}``
    spacing artefacts, then run the Unicode, field-code, text-macro,
    underbrace and brace-balancing repairs and collapse whitespace.

    Returns ``$$\\n...\\n$$`` when ``display`` is true, otherwise ``$...$``.
    """
    tex = clean_text(tex).strip()
    if tex.startswith("$$") and tex.endswith("$$"):
        tex = tex[2:-2].strip()
    elif tex.startswith("$") and tex.endswith("$"):
        tex = tex[1:-1].strip()

    tex = tex.replace("\u2011", "-")
    # Escape only dollars that are not already escaped, i.e. those preceded by
    # an even number of backslashes. A blanket replace turned an existing
    # "\$" into "\\$" (a line break followed by a raw math delimiter).
    tex = re.sub(r"(?<!\\)((?:\\\\)*)\$", r"\1\\$", tex)
    tex = tex.replace(r"\text{ }", r"\,")
    tex = tex.replace(r"\text{  }", r"\;")
    # NOTE(review): as written this can never match, because the first
    # \text{ } replacement already consumed every occurrence - confirm
    # whether a different whitespace character was intended.
    tex = tex.replace(r"\text{ }", " ")
    tex = tex.replace(r"\mathrm{\}\text{*}}", r"\*")
    tex = tex.replace(r"\text{-}", "-")
    tex = tex.replace(r"\*", "*")
    tex = replace_raw_unicode_math(tex)
    tex = strip_word_equation_field_codes(tex)
    tex = escape_text_macro_underscores(tex)
    tex = repair_underbrace_limits(tex)
    tex = balance_tex_braces(tex)
    tex = re.sub(r"\s+", " ", tex).strip()
    return f"$$\n{tex}\n$$" if display else f"${tex}$"
241
+
242
+
243
# Raw Unicode math symbols mapped to the TeX written in their place.
UNICODE_MATH_REPLACEMENTS = {
    "∸": r"\dot{-}",
    "⨅": r"\sqcap",
    "⨃": r"\bigcup",
    "⋜": r"\lessgtr",
    "⋝": r"\gtrless",
    "∱": r"\oint",
    "∲": r"\oint",
    "∳": r"\oint",
    "ℇ": r"\varepsilon",
    "Ϝ": r"\digamma",
    "℩": r"\iota",
    "Å": r"\mathring{A}",
    "℮": "e",
}


def replace_raw_unicode_math(tex: str) -> str:
    """Substitute every symbol listed in ``UNICODE_MATH_REPLACEMENTS``."""
    result = tex
    for symbol in UNICODE_MATH_REPLACEMENTS:
        if symbol in result:
            result = result.replace(symbol, UNICODE_MATH_REPLACEMENTS[symbol])
    return result
264
+
265
+
266
def strip_word_equation_field_codes(tex: str) -> str:
    """Remove leaked Word ``SEQ Equation`` numbering fields from ``tex``.

    Three case-insensitive shapes are removed, tried from most to least
    wrapped: inside ``\\left( ... \\right)``, inside plain parentheses, and
    bare.
    """
    # Word SEQ fields can leak into OMML conversion as equation-number text.
    field_code_patterns = (
        r"#\s*\\left\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\\right\)",
        r"#\s*\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\)",
        r"#\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+",
    )
    cleaned = tex
    for expression in field_code_patterns:
        cleaned = re.compile(expression, re.IGNORECASE).sub("", cleaned)
    return cleaned
276
+
277
+
278
def escape_text_macro_underscores(tex: str) -> str:
    """Escape TeX-special characters inside brace-free ``\\text{...}`` bodies.

    Within each ``\\text{...}`` whose body contains no braces:

    * an existing escape (``\\_``, ``\\&``, ``\\%``, ``\\#``, ``\\$``) is
      kept as-is;
    * any other backslash becomes ``\\textbackslash{}``;
    * bare ``_``, ``&``, ``%`` and ``#`` get a leading backslash.

    Keeping existing escapes matters because the body may already hold
    escaped characters, such as the ``\\$`` written by ``normalize_tex``
    before it calls this function. Rewriting that backslash would leave a
    raw ``$`` inside the text macro.
    """

    def escape_token(match: re.Match[str]) -> str:
        token = match.group(0)
        if len(token) == 2:
            # Backslash plus special character: already escaped.
            return token
        if token == "\\":
            return r"\textbackslash{}"
        return "\\" + token

    def replace(match: re.Match[str]) -> str:
        body = re.sub(r"\\[_&%#$]|[\\_&%#]", escape_token, match.group(1))
        return r"\text{" + body + "}"

    return re.sub(r"\\text\{([^{}]*)\}", replace, tex)
289
+
290
+
291
def repair_underbrace_limits(tex: str) -> str:
    """Rewrite nested ``\\mathop ... \\limits_{\\underbrace}`` into ``\\underbrace``.

    ``\\mathop{\\mathop{BASE}\\limits_{\\underbrace}}\\limits_{LABEL}`` becomes
    ``\\underbrace{BASE}_{LABEL}`` when followed by an operator, a separator
    or the end of the string. The substitution repeats until nothing changes.
    """
    nested_limits = re.compile(
        r"\\mathop\{\\mathop\{(?P<base>.*?)\}\\limits_\{\s*\\underbrace\s*\}\}\\limits_\{(?P<label>.*?)\}"
        r"(?=(?:[+\-]|\\cdot|\\times|=|,|;|$))",
        re.DOTALL,
    )
    while True:
        rewritten = nested_limits.sub(r"\\underbrace{\g<base>}_{\g<label>}", tex)
        if rewritten == tex:
            return tex
        tex = rewritten
302
+
303
+
304
def balance_tex_braces(tex: str) -> str:
    """Make unescaped braces balance.

    Unmatched closing braces are dropped and missing closing braces are
    appended at the end. A character directly after a backslash is copied
    through without being counted.
    """
    kept: list[str] = []
    open_groups = 0
    after_backslash = False
    for symbol in tex:
        if after_backslash:
            after_backslash = False
        elif symbol == "\\":
            after_backslash = True
        elif symbol == "{":
            open_groups += 1
        elif symbol == "}":
            if open_groups == 0:
                # Drop unmatched closing braces; KaTeX rejects them.
                continue
            open_groups -= 1
        kept.append(symbol)
    return "".join(kept) + "}" * open_groups
330
+
331
+
332
def paragraph_style(node: ET.Element) -> str | None:
    """Return the ``w:val`` of the paragraph's ``w:pStyle``, or ``None``."""
    style_node = node.find("./w:pPr/w:pStyle", NS)
    if style_node is None:
        return None
    return attr(style_node, "w", "val")
335
+
336
+
337
def heading_level(style: str | None) -> int | None:
    """Return 1-6 for a style named exactly ``Heading1``..``Heading6``, else ``None``."""
    if style:
        found = re.fullmatch(r"Heading([1-6])", style)
        if found is not None:
            return int(found.group(1))
    return None
344
+
345
+
346
def is_code_style(style: str | None) -> bool:
    """Tell whether the style name contains ``code``, ignoring case."""
    if not style:
        return False
    return "code" in style.lower()
348
+
349
+
350
def is_quote_style(style: str | None) -> bool:
    """Tell whether the style is one of the recognized quote styles.

    Spaces and letter case are ignored when comparing.
    """
    quote_styles = ("buildcorpusquote", "quote", "intensequote")
    return bool(style) and style.replace(" ", "").lower() in quote_styles
355
+
356
+
357
def paragraph_num_info(node: ET.Element) -> tuple[int, bool] | None:
    """Return ``(list level, ordered)`` for a paragraph carrying ``w:numPr``.

    Returns ``None`` when the paragraph has no numbering properties. The
    level comes from ``w:ilvl`` (0 when absent); ``ordered`` is always
    ``False``.
    """
    numbering = node.find("./w:pPr/w:numPr", NS)
    if numbering is None:
        return None
    level_node = numbering.find("./w:ilvl", NS)
    if level_node is None:
        depth = 0
    else:
        depth = int(attr(level_node, "w", "val") or "0")
    # Without numbering.xml style resolution, use bullets as the safer default.
    return depth, False
365
+
366
+
367
def paragraph_list_style_info(style: str | None) -> tuple[int, bool] | None:
    """Derive ``(zero-based level, ordered)`` from a list style name.

    Names starting with ``List Bullet`` or ``List Number`` (spaces and case
    ignored) are recognized. A trailing number N maps to level N-1, and a
    missing or non-numeric suffix maps to level 0. Other styles give
    ``None``.
    """
    if not style:
        return None
    compact = style.replace(" ", "").lower()
    for prefix, ordered in (("listbullet", False), ("listnumber", True)):
        if compact.startswith(prefix):
            digits = compact[len(prefix):]
            ordinal = int(digits) if digits.isdigit() else 1
            return max(ordinal - 1, 0), ordered
    return None
380
+
381
+
382
def run_is_math(run: ET.Element) -> bool:
    """Tell whether the run uses the Cambria Math font.

    Checks the ``ascii``, ``hAnsi`` and ``cs`` attributes of ``w:rFonts``,
    ignoring case.
    """
    props = run.find("./w:rPr", NS)
    fonts = props.find("./w:rFonts", NS) if props is not None else None
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "cambria math"
        for slot in ("ascii", "hAnsi", "cs")
    )
394
+
395
+
396
def run_is_code(run: ET.Element) -> bool:
    """Tell whether the run looks like inline code.

    A run counts as code when its character style name contains ``code`` or
    one of its ``ascii``/``hAnsi``/``cs`` fonts is Consolas (case ignored).
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False

    char_style = props.find("./w:rStyle", NS)
    if char_style is not None:
        style_name = attr(char_style, "w", "val") or ""
        if "code" in style_name.lower():
            return True

    fonts = props.find("./w:rFonts", NS)
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "consolas"
        for slot in ("ascii", "hAnsi", "cs")
    )
411
+
412
+
413
def run_is_bold(run: ET.Element) -> bool:
    """Tell whether the run's own properties switch bold on.

    ``w:b`` is an on/off property: a bare ``<w:b/>`` means on, while
    ``w:val`` of ``0``, ``false`` or ``off`` explicitly switches bold off.
    Checking only for the element's presence reported those explicit-off
    runs as bold.
    """
    bold = run.find("./w:rPr/w:b", NS)
    if bold is None:
        return False
    value = attr(bold, "w", "val")
    return value is None or value.strip().lower() not in {"0", "false", "off"}
416
+
417
+
418
def paragraph_is_code(node: ET.Element) -> bool:
    """Heuristically decide whether a paragraph is a code block.

    Only direct ``w:r`` children with non-blank text are considered. The
    first such run may be bold, in which case it is skipped. Every other
    run must look like code, and at least one code-like run must exist.
    """
    runs = node.findall("./w:r", NS)
    if not runs:
        return False

    awaiting_first = True
    code_runs = 0
    for run in runs:
        if not extract_run_text(run).strip():
            continue
        was_first = awaiting_first
        awaiting_first = False
        if was_first and run_is_bold(run):
            continue
        if not run_is_code(run):
            return False
        code_runs += 1

    return code_runs > 0
442
+
443
+
444
def extract_run_text(run: ET.Element) -> str:
    """Concatenate the plain text of a run's direct children.

    ``t`` children contribute their cleaned text, ``tab`` a tab character,
    and ``br``/``cr`` a newline. Everything else is ignored.
    """
    control_chars = {"tab": "\t", "br": "\n", "cr": "\n"}
    pieces: list[str] = []
    for child in run:
        kind = local_name(child.tag)
        if kind == "t":
            pieces.append(clean_text(child.text or ""))
        elif kind in control_chars:
            pieces.append(control_chars[kind])
    return "".join(pieces)
455
+
456
+
457
def paragraph_is_math(node: ET.Element) -> bool:
    """Tell whether every text-bearing run of the paragraph is a math run.

    Runs whose ``w:t`` children are all blank are ignored. A paragraph with
    no text-bearing runs is not math.
    """
    text_runs = [
        run
        for run in node.findall("./w:r", NS)
        if any((t.text or "").strip() for t in run.findall("./w:t", NS))
    ]
    return bool(text_runs) and all(run_is_math(run) for run in text_runs)
469
+
470
+
471
def paragraph_has_display_math_layout(node: ET.Element) -> bool:
    """Tell whether the paragraph has a non-zero indent or spacing setting.

    Looks at ``left``/``right``/``firstLine``/``hanging`` on ``w:ind`` and
    ``before``/``after`` on ``w:spacing``. Any value other than ``"0"``
    counts.
    """
    layout_checks = (
        ("./w:pPr/w:ind", ("left", "right", "firstLine", "hanging")),
        ("./w:pPr/w:spacing", ("before", "after")),
    )
    for path, keys in layout_checks:
        element = node.find(path, NS)
        if element is None:
            continue
        for key in keys:
            if attr(element, "w", key) not in {None, "0"}:
                return True
    return False
479
+
480
+
481
def relationship_map(zip_file: ZipFile, part: str = "word/document.xml") -> dict[str, str]:
    """Map relationship ids to targets for ``part`` inside the package.

    Reads ``<dir>/_rels/<name>.rels`` next to ``part``. Returns an empty dict
    when that file is not in the archive. Entries without an ``Id`` are
    skipped, and a missing ``Target`` becomes ``""``.
    """
    part_path = Path(part)
    rels_file = part_path.parent / "_rels" / (part_path.name + ".rels")
    # Zip member names always use forward slashes, whatever the host OS.
    rels_name = str(rels_file).replace("\\", "/")
    try:
        payload = zip_file.read(rels_name)
    except KeyError:
        return {}

    mapping: dict[str, str] = {}
    for rel in ET.fromstring(payload):
        rel_id = rel.attrib.get("Id")
        if rel_id is not None:
            mapping[rel_id] = rel.attrib.get("Target", "")
    return mapping
491
+
492
+
493
def resolve_image_target(target: str) -> str:
    """Turn a relationship target into a zip member name.

    * An absolute target (leading ``/``) is already relative to the package
      root, so only the leading slash is removed. Previously such targets
      were prefixed again and became ``word//word/...``.
    * A leading ``../`` is dropped.
    * Any other target not under ``word/`` is prefixed with ``word/``.
    """
    if target.startswith("/"):
        return target.lstrip("/")
    if target.startswith("../"):
        target = target[3:]
    if not target.startswith("word/"):
        target = f"word/{target}"
    return target
499
+
500
+
501
def image_metadata_filename(node: ET.Element) -> str | None:
    """Find a file name recorded in a drawing's metadata attributes.

    Looks at ``docPr`` elements first, then ``cNvPr`` elements, checking
    ``descr``, ``title`` and ``name`` in that order. The first value with a
    file suffix is returned as a bare file name. Returns ``None`` when
    nothing qualifies.

    ``cNvPr`` belongs to the DrawingML picture namespace, not ``wp``, so it
    is searched there as well; the ``wp``-only lookup could never match it.
    """
    picture_ns = "http://schemas.openxmlformats.org/drawingml/2006/picture"
    queries = (
        ".//wp:docPr",
        ".//wp:cNvPr",  # kept for backward compatibility
        f".//{{{picture_ns}}}cNvPr",
    )
    for query in queries:
        for entry in node.findall(query, NS):
            for key in ("descr", "title", "name"):
                value = entry.attrib.get(key)
                if value and Path(value).suffix:
                    return Path(value).name
    return None
509
+
510
+
511
def expand_env(value):
    """Recursively expand environment variables in strings.

    Dicts and lists are rebuilt with their items expanded; strings go
    through ``os.path.expandvars``; any other value is returned unchanged.
    """
    if isinstance(value, dict):
        return {name: expand_env(entry) for name, entry in value.items()}
    if isinstance(value, list):
        return [expand_env(entry) for entry in value]
    if isinstance(value, str):
        return os.path.expandvars(value)
    return value
519
+
520
+
521
def load_config(path: Path | None) -> dict:
    """Load a JSON config file and expand environment variables in it.

    Returns ``{}`` when ``path`` is ``None``.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
        ValueError: if the file suffix is not ``.json``.
    """
    if path is None:
        return {}
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")
    is_json = path.suffix.lower() == ".json"
    if not is_json:
        raise ValueError("Config currently supports JSON files only")
    raw = path.read_text(encoding="utf-8")
    return expand_env(json.loads(raw))
529
+
530
+
531
def config_get(config: dict, key: str, default=None):
    """Look up a dotted ``key`` path in nested dicts.

    Returns ``default`` as soon as a path segment is missing or the current
    value is not a dict.
    """
    node = config
    for segment in key.split("."):
        if isinstance(node, dict) and segment in node:
            node = node[segment]
        else:
            return default
    return node
538
+
539
+
540
def build_s3_config(config: dict, args: argparse.Namespace) -> S3ImageConfig | None:
    """Build the S3 upload settings from CLI arguments and the config file.

    Returns ``None`` unless ``args.images`` is ``"s3"``. Command-line values
    take precedence over the ``s3`` section of ``config``.

    Raises:
        ValueError: if no bucket or no public base URL is configured.
    """
    if args.images != "s3":
        return None

    section = config_get(config, "s3", {}) or {}

    def prefer_cli(cli_value, key, fallback=None):
        # A falsy CLI value falls back to the config file.
        return cli_value or section.get(key, fallback)

    def cli_unless_none(cli_value, key, fallback=None):
        # An explicit empty CLI value still wins; only None falls back.
        return cli_value if cli_value is not None else section.get(key, fallback)

    bucket = prefer_cli(args.s3_bucket, "bucket")
    public_base_url = prefer_cli(args.s3_public_base_url, "public_base_url")
    if not bucket or not public_base_url:
        raise ValueError("S3/R2 image mode requires bucket and public_base_url")

    return S3ImageConfig(
        bucket=bucket,
        public_base_url=public_base_url,
        prefix=cli_unless_none(args.s3_prefix, "prefix", ""),
        endpoint_url=prefer_cli(args.s3_endpoint_url, "endpoint_url"),
        region_name=prefer_cli(args.s3_region, "region_name"),
        access_key_id=prefer_cli(args.s3_access_key_id, "access_key_id"),
        secret_access_key=prefer_cli(args.s3_secret_access_key, "secret_access_key"),
        cache_control=prefer_cli(args.s3_cache_control, "cache_control", "public, max-age=31536000, immutable"),
        acl=cli_unless_none(args.s3_acl, "acl"),
    )
559
+
560
+
561
+ class BuildCorpusExporter:
562
+ def __init__(
563
+ self,
564
+ input_path: Path,
565
+ output_dir: Path,
566
+ equation_mode: str = "tex",
567
+ output_md: Path | None = None,
568
+ assets_dir: Path | None = None,
569
+ report_path: Path | None = None,
570
+ image_mode: str = "assets",
571
+ s3_config: S3ImageConfig | None = None,
572
+ ):
573
+ self.input_path = input_path
574
+ self.output_dir = output_dir
575
+ self.output_md = output_md or (output_dir / (input_path.stem + ".md"))
576
+ self.assets_dir = assets_dir or (output_dir / "assets")
577
+ self.report_path = report_path or (output_dir / "export-report.json")
578
+ self.asset_ref_prefix = self.assets_dir.name
579
+ self.equation_mode = equation_mode
580
+ self.image_mode = image_mode
581
+ self.s3_config = s3_config
582
+ self.s3_uploader = S3ImageUploader(s3_config) if image_mode == "s3" and s3_config else None
583
+ self.stats = ExportStats()
584
+ self.rels: dict[str, str] = {}
585
+ self.media_map: dict[str, str] = {}
586
+ self.image_uploads: list[dict[str, str]] = []
587
+ self.equation_asset_map: dict[int, str] = {}
588
+ self.empty_equation_indexes: set[int] = set()
589
+ self.equation_index = 0
590
+ self.equation_samples: list[dict[str, str]] = []
591
+ self.table_depth = 0
592
+
593
+ def export(self) -> dict:
594
+ self.output_dir.mkdir(parents=True, exist_ok=True)
595
+ if self.image_mode == "assets" or self.equation_mode == "image":
596
+ self.assets_dir.mkdir(parents=True, exist_ok=True)
597
+
598
+ with self.open_input_zip() as zf:
599
+ self.rels = relationship_map(zf)
600
+ self._copy_media(zf)
601
+ document_xml = zf.read("word/document.xml")
602
+ root = ET.fromstring(document_xml)
603
+ body = root.find("w:body", NS)
604
+ if body is None:
605
+ raise RuntimeError("word/document.xml has no w:body")
606
+ if self.equation_mode == "image":
607
+ self._render_equation_assets(root)
608
+
609
+ markdown = self.render_children(body, top_level=True).strip() + "\n"
610
+
611
+ self.output_md.parent.mkdir(parents=True, exist_ok=True)
612
+ self.output_md.write_text(markdown, encoding="utf-8")
613
+ report = {
614
+ "input": str(self.input_path),
615
+ "output": str(self.output_md),
616
+ "assets_dir": str(self.assets_dir) if self.assets_dir.exists() else None,
617
+ "image_mode": self.image_mode,
618
+ "image_uploads": self.image_uploads,
619
+ "stats": self.stats.__dict__,
620
+ "equation_samples": self.equation_samples[:50],
621
+ }
622
+ self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
623
+ return report
624
+
625
+ @contextlib.contextmanager
626
+ def open_input_zip(self):
627
+ try:
628
+ with ZipFile(self.input_path) as zf:
629
+ yield zf
630
+ return
631
+ except PermissionError:
632
+ pass
633
+
634
+ with tempfile.TemporaryDirectory(prefix="build-corpus-input-") as tmp:
635
+ temp_input = Path(tmp) / self.input_path.name
636
+ self.copy_locked_input(temp_input)
637
+ self.stats.warnings.append(
638
+ f"Input file was locked; converted from temporary copy: {temp_input}"
639
+ )
640
+ with ZipFile(temp_input) as zf:
641
+ yield zf
642
+
643
+ def copy_locked_input(self, temp_input: Path) -> None:
644
+ try:
645
+ shutil.copyfile(self.input_path, temp_input)
646
+ return
647
+ except PermissionError:
648
+ if os.name != "nt":
649
+ raise
650
+
651
+ source = str(self.input_path).replace("'", "''")
652
+ target = str(temp_input).replace("'", "''")
653
+ command = f"Copy-Item -LiteralPath '{source}' -Destination '{target}' -Force"
654
+ result = subprocess.run(
655
+ ["powershell", "-NoProfile", "-Command", command],
656
+ capture_output=True,
657
+ text=True,
658
+ )
659
+ if result.returncode != 0:
660
+ message = result.stderr.strip() or result.stdout.strip() or "unknown error"
661
+ raise PermissionError(f"Could not copy locked input via PowerShell: {message}")
662
+
663
+ def _copy_media(self, zf: ZipFile) -> None:
664
+ for name in zf.namelist():
665
+ if not name.startswith("word/media/"):
666
+ continue
667
+ mime_type = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
668
+ if self.image_mode == "base64":
669
+ data = zf.read(name)
670
+ encoded = base64.b64encode(data).decode("ascii")
671
+ self.media_map[name] = f"data:{mime_type};base64,{encoded}"
672
+ elif self.image_mode == "s3":
673
+ if self.s3_uploader is None:
674
+ raise RuntimeError("S3/R2 image mode needs s3_config")
675
+ data = zf.read(name)
676
+ upload = self.s3_uploader.upload(name, data, mime_type)
677
+ self.image_uploads.append(upload)
678
+ self.media_map[name] = upload["url"]
679
+ else:
680
+ target = self.assets_dir / Path(name).name
681
+ with zf.open(name) as src, target.open("wb") as dst:
682
+ shutil.copyfileobj(src, dst)
683
+ self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"
684
+
685
    def _render_equation_assets(self, document_root: ET.Element) -> None:
        """Render each non-empty ``m:oMath`` node to a PNG in the assets folder.

        Equations are numbered from 1 in document order. Empty equations are
        recorded in ``empty_equation_indexes`` and not rendered. Rendering
        drives Microsoft Word through COM: each equation is written to a
        temporary .docx, Word saves it as HTML, and the PNG Word produces
        next to the HTML is copied to ``eq-<index>.png``.

        If python-docx or pywin32 cannot be imported, a warning is recorded
        and the method returns without rendering or updating the stats.
        """
        math_nodes = document_root.findall(".//m:oMath", NS)
        if not math_nodes:
            return
        render_jobs = []
        for index, math_node in enumerate(math_nodes, 1):
            if self.is_empty_equation(math_node):
                self.empty_equation_indexes.add(index)
                continue
            render_jobs.append((index, math_node))

        # Optional dependencies are imported lazily so tex mode works without them.
        try:
            from docx import Document
            from docx.oxml import parse_xml
            import win32com.client as win32
        except Exception as exc:
            self.stats.warnings.append(f"Equation image rendering unavailable: {exc!r}")
            return

        # One equation per temporary document, so each render chunk is
        # expected to produce exactly one image.
        chunk_size = 1
        word = win32.DispatchEx("Word.Application")
        word.Visible = False
        word.DisplayAlerts = 0
        try:
            for start in range(0, len(render_jobs), chunk_size):
                chunk = render_jobs[start : start + chunk_size]
                with tempfile.TemporaryDirectory(prefix="build-corpus-equations-") as tmp:
                    tmp_dir = Path(tmp)
                    temp_docx = tmp_dir / "equations.docx"
                    temp_html = tmp_dir / "equations.html"

                    doc = Document()
                    for absolute_index, math_node in chunk:
                        p = doc.add_paragraph(f"EQMARKER{absolute_index:06d} ")
                        p._p.append(parse_xml(ET.tostring(math_node, encoding="unicode")))
                    doc.save(temp_docx)

                    opened = word.Documents.Open(str(temp_docx), ReadOnly=True, AddToRecentFiles=False)
                    # NOTE(review): FileFormat=10 is presumably Word's
                    # filtered-HTML save format - confirm against WdSaveFormat.
                    opened.SaveAs2(str(temp_html), FileFormat=10)
                    opened.Close(False)

                    # Images are collected from the "<stem>_files" folder next to the HTML file.
                    html_assets = temp_html.with_name(temp_html.stem + "_files")
                    rendered = sorted(html_assets.glob("image*.png"))
                    if len(rendered) != len(chunk):
                        self.stats.warnings.append(
                            f"Equation image count mismatch in render chunk {start + 1}-{start + len(chunk)}: "
                            f"OMML={len(chunk)} rendered={len(rendered)}"
                        )

                    # zip() pairs jobs with images in order; extras on either side are ignored.
                    for (absolute_index, _math_node), source in zip(chunk, rendered):
                        target = self.assets_dir / f"eq-{absolute_index:06d}.png"
                        shutil.copyfile(source, target)
                        self.equation_asset_map[absolute_index] = f"{self.asset_ref_prefix}/{target.name}"
        finally:
            # Always shut the hidden Word instance down, even after a failure.
            word.Quit()
        self.stats.equation_images = len(self.equation_asset_map)
        self.stats.skipped_empty_equations = len(self.empty_equation_indexes)
742
+
743
+ def render_children(self, node: ET.Element, top_level: bool = False) -> str:
744
+ parts: list[str] = []
745
+ for child in list(node):
746
+ rendered = self.render_block(child)
747
+ if not rendered:
748
+ continue
749
+ if top_level:
750
+ parts.append(rendered.rstrip())
751
+ else:
752
+ parts.append(rendered.strip())
753
+ sep = "\n\n" if top_level else "\n"
754
+ return sep.join(part for part in parts if part)
755
+
756
+ def render_block(self, node: ET.Element) -> str:
757
+ name = local_name(node.tag)
758
+ if name == "p":
759
+ return self.render_paragraph(node)
760
+ if name == "tbl":
761
+ return self.render_table(node)
762
+ if name == "sdt":
763
+ content = node.find("./w:sdtContent", NS)
764
+ return self.render_children(content, top_level=True) if content is not None else ""
765
+ if name in {"bookmarkStart", "bookmarkEnd", "sectPr", "proofErr", "permStart", "permEnd"}:
766
+ return ""
767
+ return self.render_children(node, top_level=False)
768
+
769
+ def render_paragraph(self, p: ET.Element) -> str:
770
+ style = paragraph_style(p)
771
+ content = self.render_inline_children(p)
772
+ if not content.strip():
773
+ return ""
774
+ content = strip_trailing_markdown_breaks(content)
775
+
776
+ if is_code_style(style) or (self.table_depth == 0 and paragraph_is_code(p)):
777
+ self.stats.code_blocks += 1
778
+ return self.render_code_paragraph(p)
779
+
780
+ level = heading_level(style)
781
+ if level:
782
+ self.stats.headings += 1
783
+ return f"{'#' * level} {self.strip_inline_markers(content)}"
784
+
785
+ if is_quote_style(style):
786
+ self.stats.paragraphs += 1
787
+ return f"> {content}"
788
+
789
+ num_info = paragraph_num_info(p)
790
+ if not num_info:
791
+ num_info = paragraph_list_style_info(style)
792
+ if num_info:
793
+ self.stats.lists += 1
794
+ list_level, ordered = num_info
795
+ indent = " " * list_level
796
+ bullet = "1." if ordered else "-"
797
+ return f"{indent}{bullet} {content}"
798
+
799
+ if paragraph_is_math(p):
800
+ if self.table_depth > 0:
801
+ return content
802
+ if not paragraph_has_display_math_layout(p):
803
+ return content
804
+ inner = content.strip()
805
+ if inner.startswith("$") and inner.endswith("$") and len(inner) >= 2:
806
+ inner = inner[1:-1]
807
+ return f"$$\n{inner}\n$$"
808
+
809
+ self.stats.paragraphs += 1
810
+ return content
811
+
812
+ def render_code_paragraph(self, p: ET.Element) -> str:
813
+ info = ""
814
+ code_parts: list[str] = []
815
+ first_nonempty_seen = False
816
+
817
+ for run in p.findall("./w:r", NS):
818
+ raw = extract_run_text(run)
819
+ if not raw:
820
+ continue
821
+ if not first_nonempty_seen and run_is_bold(run):
822
+ info = raw.strip()
823
+ first_nonempty_seen = True
824
+ continue
825
+ first_nonempty_seen = True
826
+ code_parts.append(raw)
827
+
828
+ code = "".join(code_parts).strip("\n")
829
+ fence = f"```{info}".rstrip()
830
+ return f"{fence}\n{code}\n```"
831
+
832
def render_inline_children(self, node: ET.Element) -> str:
    """Render the inline children of ``node`` to one Markdown string.

    Each child is dispatched on its local tag name: runs, hyperlinks,
    inline/display math and drawings have dedicated handling, property
    elements (``pPr``/``rPr``) are ignored, and any other element is
    rendered recursively. The collected ``(style, text)`` segments are
    merged by ``coalesce_inline_segments``.
    """
    pieces: list[tuple[Optional[InlineStyle], str]] = []

    for element in tuple(node):
        kind = local_name(element.tag)

        if kind in ("pPr", "rPr"):
            # Formatting properties carry no text of their own.
            continue

        if kind == "r":
            pieces.extend(self.render_run_segments(element))
            continue

        if kind == "hyperlink":
            label = self.render_inline_children(element).strip()
            anchor = attr(element, "w", "anchor")
            rid = attr(element, "r", "id")
            if anchor:
                # In-document bookmark link.
                url = f"#{anchor}"
            else:
                url = self.rels.get(rid or "", "")
            # Without a resolvable target only the label text is kept.
            pieces.append((None, f"[{label}]({url})" if url else label))
            continue

        if kind == "oMath":
            pieces.append((None, self.render_math(element, display=False)))
            continue

        if kind == "oMathPara":
            pieces.append((None, self.render_math(element, display=True)))
            continue

        if kind == "drawing":
            picture = self.render_image(element)
            if picture:
                pieces.append((None, picture))
            continue

        # Unknown wrapper element: descend and keep whatever it renders to.
        pieces.append((None, self.render_inline_children(element)))

    return coalesce_inline_segments(pieces)
857
+
858
def run_style(self, run: ET.Element) -> InlineStyle:
    """Return the ``(is_code, bold, italic)`` inline style of a run.

    A run without a ``w:rPr`` child is reported as entirely unstyled. A run
    counts as code when its ``w:rStyle`` value contains "code"
    (case-insensitive) or when ``run_is_code`` says so.

    Bold and italic are read from ``w:b`` / ``w:i``. These are on/off
    elements: the bare element means "on", but an explicit ``w:val`` of
    ``0``, ``false`` or ``off`` switches the property off, so such runs are
    no longer reported as bold/italic.

    Args:
        run: The ``w:r`` element to inspect.

    Returns:
        A ``(is_code, bold, italic)`` tuple of booleans.
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False, False, False

    def toggle_enabled(path: str) -> bool:
        # Presence alone is not enough: <w:b w:val="0"/> explicitly disables.
        toggle = props.find(path, NS)
        if toggle is None:
            return False
        value = (attr(toggle, "w", "val") or "").strip().lower()
        return value not in {"0", "false", "off"}

    style_node = props.find("./w:rStyle", NS)
    is_code = (
        style_node is not None and "code" in (attr(style_node, "w", "val") or "").lower()
    ) or run_is_code(run)
    bold = toggle_enabled("./w:b")
    italic = toggle_enabled("./w:i")
    return is_code, bold, italic
869
+
870
def render_run_segments(self, run: ET.Element) -> list[tuple[Optional[InlineStyle], str]]:
    """Split one run into ``(style, text)`` segments.

    Text-like children are buffered and emitted as a single segment: wrapped
    in ``$...$`` with no style for math runs, otherwise tagged with the run's
    inline style. Drawings and any unrecognised child flush the buffer first
    and are emitted as unstyled segments of their own (skipped when they
    render to an empty string). ``rPr`` is ignored.
    """
    inline_style = self.run_style(run)
    math_run = run_is_math(run)
    out: list[tuple[Optional[InlineStyle], str]] = []
    pending: list[str] = []

    # Children that translate to a fixed piece of text.
    literal_for = {
        "noBreakHyphen": "\u2011",
        "softHyphen": "\u00ad",
        "tab": "\t",
        "br": " \n",
        "cr": " \n",
    }

    def emit_pending() -> None:
        if not pending:
            return
        joined = "".join(pending)
        out.append((None, f"${joined}$") if math_run else (inline_style, joined))
        pending.clear()

    for element in tuple(run):
        tag = local_name(element.tag)
        if tag == "rPr":
            continue
        if tag == "t":
            raw = element.text or ""
            # Math text is only cleaned; ordinary text is Markdown-escaped.
            pending.append(clean_text(raw) if math_run else escape_md_text(raw))
        elif tag in literal_for:
            pending.append(literal_for[tag])
        else:
            # Non-text content ends the current text segment.
            emit_pending()
            if tag == "drawing":
                produced = self.render_image(element)
            else:
                produced = self.render_inline_children(element)
            if produced:
                out.append((None, produced))

    emit_pending()
    return out
912
+
913
def render_math(self, node: ET.Element, display: bool) -> str:
    """Render one math element as TeX, an image reference, or a fallback.

    Every call bumps the equation counter and statistics first. Equations
    listed in ``empty_equation_indexes`` or without any ``m:t`` text render
    to an empty string. In ``"image"`` mode the pre-rendered asset for the
    current index is referenced; if it is missing a warning is recorded and
    the TeX fallback is used. Otherwise the OMML is converted to TeX, and a
    conversion failure is recorded and replaced by an inline-code
    placeholder holding the equation's plain text.

    Display output is padded with newlines only outside tables.
    """
    self.stats.equations += 1
    self.equation_index += 1
    index = self.equation_index

    def plain_text() -> str:
        return "".join(t.text or "" for t in node.findall(".//m:t", NS))

    if index in self.empty_equation_indexes or self.is_empty_equation(node):
        self.stats.skipped_empty_equations = max(
            self.stats.skipped_empty_equations,
            len(self.empty_equation_indexes),
        )
        return ""

    if self.equation_mode == "image":
        asset = self.equation_asset_map.get(index)
        if not asset:
            self.stats.warnings.append(f"Missing rendered equation asset for equation {index}")
            return self.render_missing_equation_fallback(node)
        picture = f"![equation {index}]({asset})"
        if display and self.table_depth == 0:
            return f"\n{picture}\n"
        return picture

    try:
        # The converter is handed a re-parsed copy of the element rather
        # than the element from the document tree itself.
        detached = ET.fromstring(ET.tostring(node, encoding="unicode"))
        tex = convert_omml(detached)
        as_block = display and self.table_depth == 0
        rendered = normalize_tex(tex, display=as_block)
        if len(self.equation_samples) < 50:
            # Keep a bounded number of truncated samples.
            sample = {"source": plain_text()[:220], "tex": rendered[:500]}
            self.equation_samples.append(sample)
        if as_block:
            return f"\n{rendered}\n"
        return rendered
    except Exception as exc:
        self.stats.equation_errors += 1
        self.stats.warnings.append(f"Equation conversion failed: {exc!r}")
        return f"`[equation: {plain_text()}]`"
946
+
947
def render_missing_equation_fallback(self, node: ET.Element) -> str:
    """Return non-display TeX for ``node``, or a plain-text placeholder.

    The element is converted from a re-parsed copy. If conversion or
    normalisation raises, the concatenated ``m:t`` text is returned inside
    an inline-code ``[equation: ...]`` marker instead.
    """
    try:
        clone = ET.fromstring(ET.tostring(node, encoding="unicode"))
        return normalize_tex(convert_omml(clone), display=False)
    except Exception:
        pieces = [t.text or "" for t in node.findall(".//m:t", NS)]
        plain = "".join(pieces)
        return f"`[equation: {plain}]`"
954
+
955
@staticmethod
def is_empty_equation(node: ET.Element) -> bool:
    """Return True when no ``m:t`` descendant holds non-whitespace text."""
    return not any((t.text or "").strip() for t in node.findall(".//m:t", NS))
959
+
960
def render_image(self, node: ET.Element) -> str:
    """Render every image referenced under ``node`` as Markdown image links.

    Relationship ids are collected from all ``a:blip`` descendants, resolved
    through ``self.rels`` and ``self.media_map``, and emitted as
    ``![image](...)`` links joined by single spaces. References without a
    copied asset are skipped with a warning. In ``"assets"`` image mode a
    file name taken from the drawing's metadata, when present, is used for
    the link and the asset is copied to that name if it does not exist yet.

    Returns:
        The joined Markdown links, or an empty string when nothing rendered.
    """
    preferred_name = image_metadata_filename(node)
    refs = []
    for blip in node.findall(".//a:blip", NS):
        # An embedded image id wins over a linked one.
        rid = attr(blip, "r", "embed") or attr(blip, "r", "link")
        if rid:
            refs.append(rid)
    rendered = []
    for rid in refs:
        # Fall back to the raw id when there is no relationship entry.
        target = self.rels.get(rid, rid)
        source = resolve_image_target(target)
        asset = self.media_map.get(source)
        if not asset:
            self.stats.warnings.append(f"Image relationship not copied: {rid} -> {target}")
            continue
        if preferred_name and self.image_mode == "assets":
            current_path = self.output_dir / asset
            preferred_path = self.assets_dir / preferred_name
            # Copy only when the source exists and the preferred name is free,
            # so an existing file is never overwritten.
            if current_path.exists() and preferred_path != current_path and not preferred_path.exists():
                shutil.copyfile(current_path, preferred_path)
            # NOTE(review): this listing lost its indentation; the assignment
            # is assumed to sit outside the copy guard so an already-copied
            # preferred file is still referenced - confirm against the
            # original file.
            asset = f"{self.asset_ref_prefix}/{preferred_path.name}"
        self.stats.images += 1
        rendered.append(f"![image]({asset})")
    return " ".join(rendered)
984
+
985
def render_table(self, tbl: ET.Element) -> str:
    """Render a table as a Markdown pipe table, or as HTML when complex.

    A table is treated as complex when any cell contains a nested table or
    renders to text containing a blank line. ``table_depth`` is raised while
    the cells are rendered and always restored afterwards. A table without
    rows renders to an empty string.
    """
    self.stats.tables += 1
    self.table_depth += 1
    needs_html = False
    grid = []
    try:
        for tr in tbl.findall("./w:tr", NS):
            line = []
            for tc in tr.findall("./w:tc", NS):
                if tc.find(".//w:tbl", NS) is not None:
                    needs_html = True
                text = self.render_cell(tc)
                if "\n\n" in text:
                    needs_html = True
                line.append(text)
            grid.append(line)
    finally:
        # Restore the nesting level even if a cell failed to render.
        self.table_depth -= 1

    if not grid:
        return ""

    if not needs_html:
        self.stats.markdown_tables += 1
        return self.render_markdown_table(grid)

    self.stats.html_tables += 1
    return self.render_html_table(grid)
1013
+
1014
def render_cell(self, cell: ET.Element) -> str:
    """Render a table cell's blocks, joined with ``<br>``.

    The ``tcPr`` properties child is skipped; blocks that render to an empty
    string are dropped, and each kept block is stripped before joining.
    """
    blocks = (
        self.render_block(child)
        for child in tuple(cell)
        if local_name(child.tag) != "tcPr"
    )
    return "<br>".join(text.strip() for text in blocks if text).strip()
1023
+
1024
def render_markdown_table(self, rows: list[list[str]]) -> str:
    """Format ``rows`` as a pipe table whose first row is the header.

    Short rows are padded with empty cells to the widest row. Within a cell,
    newlines become ``<br>``, pipes are backslash-escaped and surrounding
    whitespace is stripped.
    """
    column_count = max(map(len, rows))

    def as_line(cells: list[str]) -> str:
        return f"| {' | '.join(cells)} |"

    def sanitize(row: list[str]) -> list[str]:
        filled = row + [""] * (column_count - len(row))
        return [
            value.replace("\n", "<br>").replace("|", "\\|").strip()
            for value in filled
        ]

    header, *body = rows
    out = [as_line(sanitize(header)), as_line(["---"] * column_count)]
    out.extend(as_line(sanitize(row)) for row in body)
    return "\n".join(out)
1037
+
1038
def render_html_table(self, rows: list[list[str]]) -> str:
    """Format ``rows`` as an HTML ``<table>`` (fallback for complex tables).

    Cell text is HTML-escaped (quotes are left alone) so ``<``, ``>`` and
    ``&`` in content cannot break the markup. Line breaks are preserved as
    ``<br>``: both raw newlines and the ``<br>`` separators already present
    in the cell text, which would otherwise be escaped into the visible
    text ``&lt;br&gt;``.

    Args:
        rows: Table rows, each a list of already-rendered cell strings.

    Returns:
        The table markup, one element per line, without a trailing newline.
    """
    html_rows = ["<table>"]
    for row in rows:
        html_rows.append(" <tr>")
        for cell in row:
            # Keep inline Markdown-ish math readable inside HTML fallback.
            escaped = html.escape(cell, quote=False)
            # Undo the escaping of cell-internal <br> separators, then map
            # remaining newlines to <br> as well.
            escaped = escaped.replace("&lt;br&gt;", "<br>").replace("\n", "<br>")
            html_rows.append(f" <td>{escaped}</td>")
        html_rows.append(" </tr>")
    html_rows.append("</table>")
    return "\n".join(html_rows)
1048
+
1049
+ @staticmethod
1050
+ def strip_inline_markers(text: str) -> str:
1051
+ return text.replace("\n", " ").strip()
1052
+
1053
+
1054
def export_one(
    input_path: Path,
    output_root: Path,
    equation_mode: str,
    out_same_dir: bool,
    image_mode: str,
    s3_config: S3ImageConfig | None = None,
) -> dict:
    """Export a single document through ``BuildCorpusExporter``.

    With ``out_same_dir`` the Markdown file, the ``.assets`` folder and the
    ``.export-report.json`` file are placed beside the input and named after
    its stem. Otherwise the output directory is ``output_root/<stem>`` and
    the three explicit locations are passed as ``None``.

    Returns:
        Whatever ``BuildCorpusExporter.export()`` returns.
    """
    if out_same_dir:
        stem = input_path.stem
        destination = input_path.parent
        locations = {
            "output_md": input_path.with_suffix(".md"),
            "assets_dir": input_path.with_name(stem + ".assets"),
            "report_path": input_path.with_name(stem + ".export-report.json"),
        }
    else:
        destination = output_root / input_path.stem
        locations = {"output_md": None, "assets_dir": None, "report_path": None}

    return BuildCorpusExporter(
        input_path,
        destination,
        equation_mode=equation_mode,
        image_mode=image_mode,
        s3_config=s3_config,
        **locations,
    ).export()
1083
+
1084
+
1085
def collect_inputs(path: Path, target: str) -> list[Path]:
    """Return the input files to convert for ``path``.

    A file path is returned as the only entry, whatever its extension. A
    directory is searched recursively for ``.md`` files when ``target`` is
    ``"word"``, otherwise for ``.docx``, ``.pptx`` and ``.ppt`` files.

    Extensions are compared case-insensitively so that e.g. ``REPORT.DOCX``
    is found on case-sensitive file systems too, and only regular files are
    returned (a directory that happens to be named ``x.docx`` is ignored).

    Args:
        path: A single input file or a directory to scan.
        target: Conversion target; ``"word"`` selects Markdown inputs.

    Returns:
        The matching paths in sorted order; empty when nothing matches.
    """
    if path.is_file():
        return [path]
    suffixes = {".md"} if target == "word" else {".docx", ".pptx", ".ppt"}
    return sorted(
        candidate
        for candidate in path.rglob("*")
        if candidate.suffix.lower() in suffixes and candidate.is_file()
    )
1093
+
1094
+
1095
def main() -> None:
    """Command-line entry point.

    Parses the arguments, fills unset options from the JSON config (falling
    back to built-in defaults), converts every collected input, writes a
    ``build-corpus-batch-report.json`` file and prints a JSON summary.
    """
    parser = argparse.ArgumentParser(
        description="Convert Markdown to DOCX or DOCX/PPTX/PPT to Markdown.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""examples:
build-corpus input.docx --out out
build-corpus input.md --to word --out out
build-corpus ./word-files --out ./markdown
build-corpus ./word-files --out-same-dir
build-corpus input.docx --images base64
build-corpus input.docx --images s3 --config build-corpus.config.json

image modes:
assets copy images into an assets folder and reference them from Markdown
base64 embed images directly as Markdown data URIs
s3 upload images to S3-compatible storage such as Cloudflare R2 or AWS S3

equation modes:
tex convert Word OMML equations to KaTeX-readable TeX
image render equations as images for visual debugging only
""",
    )
    parser.add_argument("input", type=Path, help="Markdown, DOCX, PPTX, or PPT file or directory")
    parser.add_argument("--config", type=Path, help="JSON config file with conversion, output, and S3/R2 defaults")
    parser.add_argument("--out", type=Path, help="Output directory for converted Markdown tree")
    parser.add_argument("--to", choices=["auto", "markdown", "word"], help="Output target; auto infers from a single-file input")
    parser.add_argument("--equations", choices=["tex", "image"], help="Equation output mode; default comes from config or tex")
    parser.add_argument("--images", choices=["assets", "base64", "s3"], help="Image output mode; default comes from config or assets")
    parser.add_argument("--out-same-dir", action="store_true", help="Write .md, .assets, and reports beside each source DOCX")
    parser.add_argument("--word-template", type=Path, help="Optional .docx or .dotx template used for Markdown to Word exports")
    parser.add_argument("--s3-bucket", help="S3/R2 bucket name for --images s3")
    parser.add_argument("--s3-public-base-url", help="Public URL base used in Markdown, e.g. https://assets.example.com")
    parser.add_argument("--s3-prefix", help="Object key prefix for uploaded images")
    parser.add_argument("--s3-endpoint-url", help="S3-compatible endpoint, required for Cloudflare R2")
    parser.add_argument("--s3-region", help="S3 region; use auto for Cloudflare R2")
    parser.add_argument("--s3-access-key-id", help="S3/R2 access key id; can also come from config/env expansion")
    parser.add_argument("--s3-secret-access-key", help="S3/R2 secret access key; can also come from config/env expansion")
    parser.add_argument("--s3-cache-control", help="Cache-Control header for uploaded images")
    parser.add_argument("--s3-acl", help="Optional ACL for AWS S3; usually omitted for Cloudflare R2")
    args = parser.parse_args()
    config = load_config(args.config)

    # An explicit command-line value wins; otherwise the config is consulted.
    # The last argument to config_get is presumably the fallback default -
    # confirm against config_get.
    args.out = args.out or Path(config_get(config, "output.out", ".codex/build-corpus/out"))
    args.to = args.to or config_get(config, "conversion.target", "auto")
    args.equations = args.equations or config_get(config, "conversion.equations", "tex")
    args.images = args.images or config_get(config, "conversion.images", "assets")
    # The flag can only be switched on here: a truthy config value enables it
    # even when --out-same-dir was not passed.
    args.out_same_dir = args.out_same_dir or bool(config_get(config, "output.out_same_dir", False))
    args.word_template = args.word_template or (
        Path(config_get(config, "word.template")) if config_get(config, "word.template") else None
    )
    s3_config = build_s3_config(config, args)

    # Resolve "auto": a single .md file goes to Word, any other single file
    # and every directory goes to Markdown.
    input_target = args.to
    if args.input.is_file() and args.to == "auto":
        input_target = "word" if args.input.suffix.lower() == ".md" else "markdown"
    elif args.input.is_dir() and args.to == "auto":
        input_target = "markdown"

    reports = []
    for input_path in collect_inputs(args.input, input_target):
        # Skip files whose names start with "~$" (presumably Office lock/owner
        # files - confirm).
        if input_path.name.startswith("~$"):
            continue
        suffix = input_path.suffix.lower()
        # A .md input always takes the Markdown-to-Word path, whatever the
        # resolved target is.
        if input_target == "word" or suffix == ".md":
            reports.append(export_markdown_to_docx(
                input_path,
                args.out,
                args.out_same_dir,
                template_path=args.word_template,
            ))
        elif suffix in {".pptx", ".ppt"}:
            reports.append(export_presentation(
                input_path,
                args.out,
                args.out_same_dir,
                image_mode=args.images,
            ))
        else:
            reports.append(export_one(
                input_path,
                args.out,
                equation_mode=args.equations,
                out_same_dir=args.out_same_dir,
                image_mode=args.images,
                s3_config=s3_config,
            ))

    # The batch report goes into the input directory for same-dir directory
    # runs, otherwise into the output directory.
    batch_report_root = args.input if args.out_same_dir and args.input.is_dir() else args.out
    batch_report_root.mkdir(parents=True, exist_ok=True)
    batch_report = batch_report_root / "build-corpus-batch-report.json"
    batch_report.write_text(json.dumps(reports, indent=2, ensure_ascii=False), encoding="utf-8")
    # Every report is indexed by its "output" key for the printed summary.
    print(json.dumps({
        "converted": len(reports),
        "batch_report": str(batch_report),
        "outputs": [report["output"] for report in reports],
        "default_word_template": str(args.word_template or resolve_default_template_path() or "bundled:md-to-word-template.dotx"),
    }, indent=2))
1192
+
1193
+
1194
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()