regen.mde 0.2.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/LICENSE +16 -16
  2. package/README.md +409 -295
  3. package/bin/build-corpus-editor.js +83 -81
  4. package/bin/build-corpus.js +41 -41
  5. package/bin/postinstall.js +259 -187
  6. package/bin/regen-mdeditor-install.js +27 -27
  7. package/bin/regen-mdeditor-uninstall.js +19 -19
  8. package/bin/validate-katex.js +93 -93
  9. package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
  10. package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
  11. package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
  12. package/desktop/BuildCorpusEditor/Program.cs +85 -81
  13. package/desktop/BuildCorpusEditor/app.manifest +16 -16
  14. package/dist/release/regen-mde-0.8.0-win-x64.zip +0 -0
  15. package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
  16. package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
  17. package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
  18. package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +1 -1
  19. package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
  20. package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
  21. package/dist/windows-editor/wwwroot/index.html +22 -22
  22. package/editor-web/index.html +21 -21
  23. package/editor-web/src/main.jsx +1044 -399
  24. package/editor-web/src/styles.css +846 -602
  25. package/editor-web/vite.config.js +13 -13
  26. package/examples/build-corpus.config.example.json +21 -21
  27. package/installer/install-regen-mde.ps1 +214 -175
  28. package/installer/regen-mde.nsi +81 -81
  29. package/package.json +10 -6
  30. package/pyproject.toml +4 -3
  31. package/requirements.txt +5 -4
  32. package/scripts/build-windows-editor.ps1 +47 -47
  33. package/scripts/package-windows-editor.ps1 +90 -90
  34. package/scripts/release-dual.mjs +105 -0
  35. package/scripts/run-corpus.ps1 +28 -28
  36. package/scripts/run-editor-implementation-plane.ps1 +226 -203
  37. package/scripts/run-required-tests.ps1 +98 -98
  38. package/scripts/run-smoke.ps1 +28 -28
  39. package/src/build_corpus/__init__.py +1 -1
  40. package/src/build_corpus/docx_exporter.py +1055 -798
  41. package/src/build_corpus/equations.py +1345 -0
  42. package/src/build_corpus/exporter.py +1488 -1195
  43. package/src/build_corpus/frontmatter.py +302 -0
  44. package/src/build_corpus/ppt_exporter.py +543 -532
  45. package/src/build_corpus/templates/__init__.py +1 -1
  46. package/src/build_corpus/validate_assets.py +46 -46
  47. package/tools/audit_corpus.py +203 -203
  48. package/tools/collect_microsoft_word_templates.py +228 -228
  49. package/tools/collect_online_docx_corpus.py +272 -272
  50. package/tools/collect_online_pptx_corpus.py +252 -252
  51. package/tools/compare_pptx_inputs_outputs.py +87 -87
  52. package/tools/roundtrip_docx_corpus.py +171 -171
  53. package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
  54. package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
  55. package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
  56. package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1
@@ -1,1195 +1,1488 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import base64
5
- import contextlib
6
- import hashlib
7
- import html
8
- import json
9
- import mimetypes
10
- import os
11
- import re
12
- import shutil
13
- import subprocess
14
- import tempfile
15
- from dataclasses import dataclass, field
16
- from pathlib import Path
17
- from typing import Optional
18
- from zipfile import ZipFile
19
- from xml.etree import ElementTree as ET
20
-
21
- from omml2latex import convert_omml
22
- try:
23
- from .docx_exporter import export_markdown_to_docx, resolve_default_template_path
24
- except ImportError: # pragma: no cover - allows direct script execution
25
- from build_corpus.docx_exporter import export_markdown_to_docx, resolve_default_template_path
26
- try:
27
- from .ppt_exporter import export_presentation
28
- except ImportError: # pragma: no cover - allows direct script execution
29
- from build_corpus.ppt_exporter import export_presentation
30
-
31
-
32
# Prefix -> namespace URI map passed to every ElementTree find()/findall()
# call in this module and used by attr() to build qualified attribute names.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
}

# "{uri}" prefixes (ElementTree qualified-name form) for the "w" and "r" namespaces.
W = f"{{{NS['w']}}}"
R = f"{{{NS['r']}}}"
42
-
43
-
44
@dataclass
class ExportStats:
    # Counters and warnings accumulated during one export; export() writes
    # this object's __dict__ into the JSON report under "stats".
    paragraphs: int = 0
    headings: int = 0
    code_blocks: int = 0
    tables: int = 0
    markdown_tables: int = 0
    html_tables: int = 0
    equations: int = 0
    # Set from the number of rendered equation images.
    equation_images: int = 0
    # Set from the number of equations detected as empty before rendering.
    skipped_empty_equations: int = 0
    equation_errors: int = 0
    images: int = 0
    lists: int = 0
    # Human-readable warnings; default_factory gives each instance its own list.
    warnings: list[str] = field(default_factory=list)
59
-
60
-
61
@dataclass
class S3ImageConfig:
    # Settings consumed by S3ImageUploader.
    bucket: str
    # Base of the public URL; the object key is appended after a single "/".
    public_base_url: str
    # Optional key prefix; surrounding "/" characters are stripped on use.
    prefix: str = ""
    # The next four are forwarded to boto3.client() only when truthy.
    endpoint_url: str | None = None
    region_name: str | None = None
    access_key_id: str | None = None
    secret_access_key: str | None = None
    # Sent as CacheControl on every uploaded object.
    cache_control: str = "public, max-age=31536000, immutable"
    # Sent as ACL on upload only when truthy.
    acl: str | None = None
72
-
73
-
74
class S3ImageUploader:
    """Upload image bytes to an S3-compatible bucket under content-hash keys."""

    def __init__(self, config: S3ImageConfig):
        """Create the boto3 S3 client described by *config*.

        Raises:
            RuntimeError: If boto3 cannot be imported.
        """
        self.config = config
        try:
            import boto3
        except ImportError as exc:
            raise RuntimeError("S3/R2 image mode requires boto3. Install with: pip install boto3") from exc

        # Optional settings are forwarded only when truthy.
        client_options = {"service_name": "s3"}
        optional_settings = (
            ("endpoint_url", config.endpoint_url),
            ("region_name", config.region_name),
            ("aws_access_key_id", config.access_key_id),
            ("aws_secret_access_key", config.secret_access_key),
        )
        for option, setting in optional_settings:
            if setting:
                client_options[option] = setting
        self.client = boto3.client(**client_options)

    def upload(self, source_name: str, data: bytes, content_type: str) -> dict[str, str]:
        """Store *data* under a SHA-256 derived key and describe the upload.

        The key is ``[prefix/]images/sha256/<digest><lowercased suffix>``.

        Returns:
            A dict with the source name, digest, bucket, key, public URL,
            content type and byte count (as a string).
        """
        settings = self.config
        digest = hashlib.sha256(data).hexdigest()
        object_name = digest + Path(source_name).suffix.lower()
        # An empty prefix is dropped so the key never starts with "/".
        key = "/".join(
            filter(None, (settings.prefix.strip("/"), "images", "sha256", object_name))
        )

        request = {
            "Bucket": settings.bucket,
            "Key": key,
            "Body": data,
            "ContentType": content_type,
            "CacheControl": settings.cache_control,
        }
        if settings.acl:
            request["ACL"] = settings.acl
        self.client.put_object(**request)

        base_url = settings.public_base_url.rstrip("/")
        return {
            "source": source_name,
            "sha256": digest,
            "bucket": settings.bucket,
            "key": key,
            "url": f"{base_url}/{key}",
            "content_type": content_type,
            "bytes": str(len(data)),
        }
115
-
116
-
117
def local_name(tag: str) -> str:
    """Return *tag* without its ``{namespace}`` prefix, if it has one."""
    # rpartition yields the whole string as the last item when "}" is absent.
    return tag.rpartition("}")[2]
119
-
120
-
121
def attr(node: ET.Element, ns: str, name: str) -> str | None:
    """Return the namespaced attribute *name* of *node*, or None if absent.

    *ns* is a prefix key of the module-level ``NS`` map.
    """
    qualified_name = "{" + NS[ns] + "}" + name
    return node.attrib.get(qualified_name)
123
-
124
-
125
def clean_text(text: str) -> str:
    """Turn non-breaking spaces into spaces and drop U+200B / U+FEFF."""
    replacements = str.maketrans({"\u00a0": " ", "\u200b": None, "\ufeff": None})
    return text.translate(replacements)
131
-
132
-
133
def escape_md_text(text: str) -> str:
    """Escape ``*``, ``_``, ``$`` and stray backslashes for Markdown output.

    The text is first passed through ``clean_text``. A backslash followed by
    one of the characters in ``escapable`` is kept as the existing two-character
    escape; any other backslash is doubled.

    Fix: a backslash at the very end of the text used to be emitted as a
    single backslash, because the empty "next character" passed the
    substring test (``"" in "..."`` is always True). It is now doubled like
    any other stray backslash.
    """
    text = clean_text(text)
    escapable = "\\`*_{}[]()#+.!|$-"
    escaped: list[str] = []
    index = 0
    length = len(text)
    while index < length:
        char = text[index]
        if char == "\\":
            follower = text[index + 1] if index + 1 < length else ""
            # Require a real follower: "" is a substring of every string.
            if follower and follower in escapable:
                escaped.append("\\" + follower)
                index += 2
                continue
            escaped.append("\\\\")
        elif char in "*_$":
            escaped.append("\\" + char)
        else:
            escaped.append(char)
        index += 1
    return "".join(escaped)
153
-
154
-
155
def normalize_inline_markers(text: str) -> str:
    """Insert spaces so inline image links do not touch neighbouring text.

    A space is added after an image followed by a character outside
    ``\\s<>)].,;:!?`` and before an image preceded by a character outside
    ``\\s<([.,;:!?``.
    """
    needs_space_after = r"(\!\[[^\]]*\]\([^)]+\))(?=[^\s<>)\].,;:!?])"
    needs_space_before = r"(?<=[^\s<(\[.,;:!?])(\!\[[^\]]*\]\([^)]+\))"
    spaced = re.sub(needs_space_after, r"\1 ", text)
    return re.sub(needs_space_before, r" \1", spaced)
160
-
161
-
162
def strip_trailing_markdown_breaks(text: str) -> str:
    """Remove trailing whitespace, including Markdown hard-break padding.

    Fix: the previous loop tested for a trailing space but sliced off two
    characters per iteration, so text ending in one space lost its last real
    character. The loop was also redundant, because the final ``rstrip()``
    already removes every trailing space and newline.
    """
    return text.rstrip()
166
-
167
-
168
# (is_code, bold, italic) flags, in the order apply_inline_style() unpacks them.
InlineStyle = tuple[bool, bool, bool]
169
-
170
-
171
def apply_inline_style(text: str, style: InlineStyle) -> str:
    """Wrap *text* in the Markdown markers selected by *style*.

    *style* is ``(is_code, bold, italic)``. Code takes precedence over
    emphasis. Empty or whitespace-only text is returned unchanged so that no
    empty emphasis markers are produced.

    Fix: the backtick escaping is computed outside the f-string. A backslash
    inside an f-string replacement field is a SyntaxError before Python 3.12,
    which made the whole module unimportable on older interpreters.
    """
    is_code, bold, italic = style
    if not text.strip():
        # Covers both "" and whitespace-only input.
        return text
    if is_code:
        escaped = text.replace("`", "\\`")
        return f"`{escaped}`"
    if bold and italic:
        return f"***{text}***"
    if bold:
        return f"**{text}**"
    if italic:
        return f"*{text}*"
    return text
186
-
187
-
188
def coalesce_inline_segments(segments: list[tuple[Optional[InlineStyle], str]]) -> str:
    """Merge adjacent same-style segments, apply styling, and join them.

    Segments with empty text are ignored. A segment whose style is None is
    emitted verbatim and ends the current run of styled text. The joined
    result is passed through ``normalize_inline_markers``.
    """
    # Each group is (style, texts); unstyled segments always get their own group.
    groups: list[tuple[Optional[InlineStyle], list[str]]] = []
    for style, text in segments:
        if not text:
            continue
        if style is not None and groups and groups[-1][0] == style:
            groups[-1][1].append(text)
        else:
            groups.append((style, [text]))

    rendered: list[str] = []
    for style, texts in groups:
        joined = "".join(texts)
        if style is None:
            rendered.append(joined)
        else:
            rendered.append(apply_inline_style(joined, style or (False, False, False)))
    return normalize_inline_markers("".join(rendered))
217
-
218
-
219
def normalize_tex(tex: str, display: bool) -> str:
    """Clean a TeX fragment and wrap it in Markdown math delimiters.

    Args:
        tex: TeX source, optionally already wrapped in ``$...$`` or ``$$...$$``.
        display: When True the result is a ``$$`` block on its own lines;
            otherwise an inline ``$...$`` span.

    Returns:
        The normalized TeX with whitespace runs collapsed to single spaces.
    """
    tex = clean_text(tex).strip()
    # Strip one pair of surrounding delimiters so they are not doubled below.
    if tex.startswith("$$") and tex.endswith("$$"):
        tex = tex[2:-2].strip()
    elif tex.startswith("$") and tex.endswith("$"):
        tex = tex[1:-1].strip()

    # U+2011 (non-breaking hyphen) becomes a plain hyphen.
    tex = tex.replace("\u2011", "-")
    # Any remaining "$" is escaped so it cannot end the math span early.
    tex = tex.replace("$", r"\$")
    # NOTE(review): as displayed, the first and third patterns below are
    # identical, which makes the third unreachable. The originals may have
    # used distinct space characters that this view collapsed - TODO confirm
    # against the real file before relying on these three lines.
    tex = tex.replace(r"\text{ }", r"\,")
    tex = tex.replace(r"\text{  }", r"\;")
    tex = tex.replace(r"\text{ }", " ")
    tex = tex.replace(r"\mathrm{\}\text{*}}", r"\*")
    tex = tex.replace(r"\text{-}", "-")
    # Runs after the \mathrm rewrite above, so its "\*" output also becomes "*".
    tex = tex.replace(r"\*", "*")
    tex = replace_raw_unicode_math(tex)
    tex = strip_word_equation_field_codes(tex)
    tex = escape_text_macro_underscores(tex)
    tex = repair_underbrace_limits(tex)
    tex = balance_tex_braces(tex)
    tex = re.sub(r"\s+", " ", tex).strip()
    return f"$$\n{tex}\n$$" if display else f"${tex}$"
241
-
242
-
243
# Raw Unicode character -> TeX replacement, applied by replace_raw_unicode_math().
# NOTE(review): several keys appear as empty strings in this view; presumably
# they were private-use or otherwise unrenderable characters that were lost in
# transit - TODO confirm against the real file. As written, duplicate "" keys
# collapse to the last entry.
UNICODE_MATH_REPLACEMENTS = {
    "∸": r"\dot{-}",
    "⨅": r"\sqcap",
    "⨃": r"\bigcup",
    "": r"\lessgtr",
    "⋝": r"\gtrless",
    "∱": r"\oint",
    "∲": r"\oint",
    "": r"\oint",
    "": r"\varepsilon",
    "Ϝ": r"\digamma",
    "℩": r"\iota",
    "Å": r"\mathring{A}",
    "": "e",
}
258
-
259
-
260
def replace_raw_unicode_math(tex: str) -> str:
    """Replace each key of ``UNICODE_MATH_REPLACEMENTS`` found in *tex*.

    Fix: empty keys are skipped. ``str.replace("", x)`` inserts ``x`` before
    every character and at the end, so a single empty key would corrupt the
    whole equation.
    """
    for raw, replacement in UNICODE_MATH_REPLACEMENTS.items():
        if not raw:
            continue
        tex = tex.replace(raw, replacement)
    return tex
264
-
265
-
266
def strip_word_equation_field_codes(tex: str) -> str:
    """Delete leaked ``# SEQ Equation * ARABIC n`` field text from *tex*.

    Word SEQ fields can surface in converted equations as equation-number
    text. Three case-insensitive forms are removed, most specific first:
    wrapped in ``\\left( ... \\right)``, wrapped in plain parentheses, and bare.
    """
    field_patterns = (
        r"#\s*\\left\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\\right\)",
        r"#\s*\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\)",
        r"#\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+",
    )
    cleaned = tex
    for field_pattern in field_patterns:
        cleaned = re.compile(field_pattern, re.IGNORECASE).sub("", cleaned)
    return cleaned
276
-
277
-
278
def escape_text_macro_underscores(tex: str) -> str:
    """Escape TeX-special characters inside brace-free ``\\text{...}`` bodies.

    Only ``\\text{}`` groups whose body contains no braces are rewritten.
    Backslashes are handled first so the backslashes added by the later
    escapes are not themselves rewritten.
    """
    substitutions = (
        ("\\", r"\textbackslash{}"),
        ("_", r"\_"),
        ("&", r"\&"),
        ("%", r"\%"),
        ("#", r"\#"),
    )

    def escape_body(found: re.Match[str]) -> str:
        inner = found.group(1)
        for needle, escaped in substitutions:
            inner = inner.replace(needle, escaped)
        return r"\text{" + inner + "}"

    return re.sub(r"\\text\{([^{}]*)\}", escape_body, tex)
289
-
290
-
291
def repair_underbrace_limits(tex: str) -> str:
    """Rewrite nested ``\\mathop ... \\limits_{\\underbrace}`` into ``\\underbrace``.

    The pattern only matches when followed by ``+``, ``-``, ``\\cdot``,
    ``\\times``, ``=``, ``,``, ``;`` or the end of the string. The
    substitution repeats until the text stops changing.
    """
    nested_limits = re.compile(
        r"\\mathop\{\\mathop\{(?P<base>.*?)\}\\limits_\{\s*\\underbrace\s*\}\}\\limits_\{(?P<label>.*?)\}"
        r"(?=(?:[+\-]|\\cdot|\\times|=|,|;|$))",
        re.DOTALL,
    )
    while True:
        rewritten = nested_limits.sub(r"\\underbrace{\g<base>}_{\g<label>}", tex)
        if rewritten == tex:
            return tex
        tex = rewritten
302
-
303
-
304
def balance_tex_braces(tex: str) -> str:
    """Drop unmatched ``}`` and append a ``}`` for each unclosed ``{``.

    The character after a backslash is copied as-is and never counted, so
    ``\\{`` and ``\\}`` do not affect the balance.
    """
    kept: list[str] = []
    open_count = 0
    after_backslash = False
    for symbol in tex:
        if after_backslash:
            after_backslash = False
        elif symbol == "\\":
            after_backslash = True
        elif symbol == "{":
            open_count += 1
        elif symbol == "}":
            if open_count == 0:
                # Unmatched closing brace: KaTeX rejects it, so skip it.
                continue
            open_count -= 1
        kept.append(symbol)
    kept.append("}" * open_count)
    return "".join(kept)
330
-
331
-
332
def paragraph_style(node: ET.Element) -> str | None:
    """Return the ``w:val`` of the paragraph's ``w:pStyle``, or None."""
    style_node = node.find("./w:pPr/w:pStyle", NS)
    if style_node is None:
        return None
    return attr(style_node, "w", "val")
335
-
336
-
337
def heading_level(style: str | None) -> int | None:
    """Return 1-6 for a heading style, or None for anything else.

    The style is compared with spaces removed and case folded, the same
    normalization used for the quote and list styles in this module, so
    ``"Heading1"``, ``"Heading 1"`` and ``"heading 1"`` are all level 1.
    Previously only the exact ID ``HeadingN`` was recognized.
    """
    if not style:
        return None
    normalized = style.replace(" ", "").lower()
    match = re.fullmatch(r"heading([1-6])", normalized)
    return int(match.group(1)) if match else None
344
-
345
-
346
def is_code_style(style: str | None) -> bool:
    """Return True when the style name contains "code" (case-insensitive)."""
    if not style:
        return False
    return "code" in style.lower()
348
-
349
-
350
def is_quote_style(style: str | None) -> bool:
    """Return True for the recognized quote styles.

    Matching ignores spaces and case; the accepted names are
    ``buildcorpusquote``, ``quote`` and ``intensequote``.
    """
    quote_styles = {"buildcorpusquote", "quote", "intensequote"}
    return bool(style) and style.replace(" ", "").lower() in quote_styles
355
-
356
-
357
def paragraph_num_info(node: ET.Element) -> tuple[int, bool] | None:
    """Return ``(level, ordered)`` for a numbered paragraph, else None.

    The level comes from ``w:numPr/w:ilvl`` (0 when absent). ``ordered`` is
    always False: numbering.xml is not resolved here, so bullets are the
    safer default.
    """
    num_pr = node.find("./w:pPr/w:numPr", NS)
    if num_pr is None:
        return None
    level = 0
    ilvl = num_pr.find("./w:ilvl", NS)
    if ilvl is not None:
        level = int(attr(ilvl, "w", "val") or "0")
    return level, False
365
-
366
-
367
def paragraph_list_style_info(style: str | None) -> tuple[int, bool] | None:
    """Derive ``(zero_based_level, ordered)`` from a list style name.

    Spaces and case are ignored. ``ListBullet[N]`` is unordered and
    ``ListNumber[N]`` is ordered; a missing or non-numeric N counts as 1.
    Any other style yields None.
    """
    if not style:
        return None
    normalized = style.replace(" ", "").lower()
    for prefix, ordered in (("listbullet", False), ("listnumber", True)):
        if not normalized.startswith(prefix):
            continue
        suffix = normalized.removeprefix(prefix)
        level = int(suffix) if suffix.isdigit() else 1
        return max(level - 1, 0), ordered
    return None
380
-
381
-
382
def run_is_math(run: ET.Element) -> bool:
    """Return True when the run's font is Cambria Math.

    Checks the ``ascii``, ``hAnsi`` and ``cs`` attributes of
    ``w:rPr/w:rFonts``, case-insensitively.
    """
    props = run.find("./w:rPr", NS)
    fonts = props.find("./w:rFonts", NS) if props is not None else None
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "cambria math"
        for slot in ("ascii", "hAnsi", "cs")
    )
394
-
395
-
396
def run_is_code(run: ET.Element) -> bool:
    """Return True when the run looks like inline code.

    A run counts as code when its ``w:rStyle`` value contains "code"
    (case-insensitive) or any of its ``ascii`` / ``hAnsi`` / ``cs`` fonts is
    Consolas.
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False

    run_style_node = props.find("./w:rStyle", NS)
    if run_style_node is not None:
        style_name = attr(run_style_node, "w", "val") or ""
        if "code" in style_name.lower():
            return True

    fonts = props.find("./w:rFonts", NS)
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "consolas"
        for slot in ("ascii", "hAnsi", "cs")
    )
411
-
412
-
413
def run_is_bold(run: ET.Element) -> bool:
    """Return True when the run's ``w:b`` toggle is switched on.

    Fix: the mere presence of ``w:b`` used to count as bold.
    WordprocessingML lets the element carry ``w:val`` to switch the property
    off (``0``, ``false`` or ``off``), which Word writes to override a bold
    style; such runs are now reported as not bold. A ``w:b`` without
    ``w:val`` still means bold.
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False
    bold = props.find("./w:b", NS)
    if bold is None:
        return False
    value = attr(bold, "w", "val")
    return value is None or value.strip().lower() not in {"0", "false", "off"}
416
-
417
-
418
def paragraph_is_code(node: ET.Element) -> bool:
    """Return True when every meaningful run of the paragraph is code.

    Runs without non-whitespace text are ignored. The first meaningful run
    may be bold instead of code (``render_code_paragraph`` treats it as the
    fence info string). At least one code run is required.
    """
    code_runs = 0
    seen_text = False
    for run in node.findall("./w:r", NS):
        if not extract_run_text(run).strip():
            continue
        is_leading_run = not seen_text
        seen_text = True
        if is_leading_run and run_is_bold(run):
            continue
        if not run_is_code(run):
            return False
        code_runs += 1
    return code_runs > 0
442
-
443
-
444
def extract_run_text(run: ET.Element) -> str:
    """Return the plain text of a run.

    ``t`` children contribute their cleaned text, ``tab`` a tab character,
    and ``br`` / ``cr`` a newline. Other children are ignored.
    """
    literal_for = {"tab": "\t", "br": "\n", "cr": "\n"}
    pieces: list[str] = []
    for child in run:
        kind = local_name(child.tag)
        if kind == "t":
            pieces.append(clean_text(child.text or ""))
        elif kind in literal_for:
            pieces.append(literal_for[kind])
    return "".join(pieces)
455
-
456
-
457
def paragraph_is_math(node: ET.Element) -> bool:
    """Return True when all text-bearing runs use the math font.

    Runs whose ``w:t`` children hold only whitespace are ignored; a
    paragraph with no text-bearing run is not math.
    """
    saw_text_run = False
    for run in node.findall("./w:r", NS):
        has_text = any((t.text or "").strip() for t in run.findall("./w:t", NS))
        if not has_text:
            continue
        if not run_is_math(run):
            return False
        saw_text_run = True
    return saw_text_run
469
-
470
-
471
def paragraph_has_display_math_layout(node: ET.Element) -> bool:
    """Return True when the paragraph has non-zero indentation or spacing.

    Inspects ``w:ind`` (left, right, firstLine, hanging) and ``w:spacing``
    (before, after); any attribute that is present and not ``"0"`` counts.
    """
    layout_checks = (
        ("./w:pPr/w:ind", ("left", "right", "firstLine", "hanging")),
        ("./w:pPr/w:spacing", ("before", "after")),
    )
    for path, keys in layout_checks:
        element = node.find(path, NS)
        if element is None:
            continue
        for key in keys:
            if attr(element, "w", key) not in {None, "0"}:
                return True
    return False
479
-
480
-
481
def relationship_map(zip_file: ZipFile, part: str = "word/document.xml") -> dict[str, str]:
    """Map relationship Ids to Targets for *part*.

    Reads ``<dir>/_rels/<name>.rels`` next to *part*. Returns an empty dict
    when that file is absent; entries without an ``Id`` are skipped and a
    missing ``Target`` becomes ``""``.
    """
    part_path = Path(part)
    rels_file = part_path.parent / "_rels" / (part_path.name + ".rels")
    rels_path = str(rels_file).replace("\\", "/")
    try:
        payload = zip_file.read(rels_path)
    except KeyError:
        # ZipFile.read raises KeyError for a member that is not in the archive.
        return {}

    targets: dict[str, str] = {}
    for rel in ET.fromstring(payload):
        if "Id" in rel.attrib:
            targets[rel.attrib["Id"]] = rel.attrib.get("Target", "")
    return targets
491
-
492
-
493
def resolve_image_target(target: str) -> str:
    """Turn a relationship Target into an archive member name.

    Relative targets are resolved against ``word/``; one leading ``../`` is
    dropped first, as before.

    Fix: a target starting with ``/`` is already an absolute part name.
    Previously ``/word/media/x.png`` became ``word//word/media/x.png``, which
    matches no archive member; the leading slash is now stripped and the rest
    is returned unchanged.
    """
    if target.startswith("/"):
        return target.lstrip("/")
    if target.startswith("../"):
        target = target[3:]
    if not target.startswith("word/"):
        target = f"word/{target}"
    return target
499
-
500
-
501
def image_metadata_filename(node: ET.Element) -> str | None:
    """Return a file name found in the drawing's metadata, or None.

    Searches ``wp:docPr`` then ``wp:cNvPr`` descendants and, on each, the
    un-namespaced ``descr``, ``title`` and ``name`` attributes in that order.
    The first value with a file suffix wins; only its final path component
    is returned.
    """
    candidates = (
        entry.attrib.get(key)
        for tag_name in ("docPr", "cNvPr")
        for entry in node.findall(f".//wp:{tag_name}", NS)
        for key in ("descr", "title", "name")
    )
    for value in candidates:
        if not value:
            continue
        candidate = Path(value)
        if candidate.suffix:
            return candidate.name
    return None
509
-
510
-
511
def expand_env(value):
    """Recursively expand environment variables in strings.

    Dicts and lists are rebuilt with their items expanded; strings go
    through ``os.path.expandvars``; every other value is returned unchanged.
    """
    if isinstance(value, dict):
        return {key: expand_env(item) for key, item in value.items()}
    if isinstance(value, list):
        return [expand_env(item) for item in value]
    if isinstance(value, str):
        return os.path.expandvars(value)
    return value
519
-
520
-
521
def load_config(path: Path | None) -> dict:
    """Load a JSON config file with environment variables expanded.

    Returns an empty dict when *path* is None.

    Raises:
        FileNotFoundError: If *path* does not exist.
        ValueError: If *path* does not have a ``.json`` suffix.
    """
    if path is None:
        return {}
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")
    if path.suffix.lower() != ".json":
        raise ValueError("Config currently supports JSON files only")
    raw_text = path.read_text(encoding="utf-8")
    parsed = json.loads(raw_text)
    return expand_env(parsed)
529
-
530
-
531
def config_get(config: dict, key: str, default=None):
    """Look up a dotted *key* in nested dicts.

    Returns *default* as soon as a path segment is missing or the current
    value is not a dict.
    """
    node = config
    for segment in key.split("."):
        if isinstance(node, dict) and segment in node:
            node = node[segment]
        else:
            return default
    return node
538
-
539
-
540
def build_s3_config(config: dict, args: argparse.Namespace) -> S3ImageConfig | None:
    """Build the S3 settings from CLI arguments and the ``s3`` config section.

    Returns None unless ``args.images`` is ``"s3"``. A truthy CLI value wins
    over the config value; ``prefix`` and ``acl`` use the CLI value whenever
    it is not None, so an empty string can override the config.

    Raises:
        ValueError: If the bucket or public base URL is missing.
    """
    if args.images != "s3":
        return None

    section = config_get(config, "s3", {}) or {}

    def cli_or_config(cli_value, key, fallback=None):
        return cli_value or section.get(key, fallback)

    bucket = cli_or_config(args.s3_bucket, "bucket")
    public_base_url = cli_or_config(args.s3_public_base_url, "public_base_url")
    if not bucket or not public_base_url:
        raise ValueError("S3/R2 image mode requires bucket and public_base_url")

    if args.s3_prefix is not None:
        prefix = args.s3_prefix
    else:
        prefix = section.get("prefix", "")
    if args.s3_acl is not None:
        acl = args.s3_acl
    else:
        acl = section.get("acl")

    return S3ImageConfig(
        bucket=bucket,
        public_base_url=public_base_url,
        prefix=prefix,
        endpoint_url=cli_or_config(args.s3_endpoint_url, "endpoint_url"),
        region_name=cli_or_config(args.s3_region, "region_name"),
        access_key_id=cli_or_config(args.s3_access_key_id, "access_key_id"),
        secret_access_key=cli_or_config(args.s3_secret_access_key, "secret_access_key"),
        cache_control=cli_or_config(
            args.s3_cache_control, "cache_control", "public, max-age=31536000, immutable"
        ),
        acl=acl,
    )
559
-
560
-
561
- class BuildCorpusExporter:
562
def __init__(
    self,
    input_path: Path,
    output_dir: Path,
    equation_mode: str = "tex",
    output_md: Path | None = None,
    assets_dir: Path | None = None,
    report_path: Path | None = None,
    image_mode: str = "assets",
    s3_config: S3ImageConfig | None = None,
):
    """Configure one document export.

    Args:
        input_path: The archive to read (opened as a zip by export()).
        output_dir: Directory that receives the default outputs.
        equation_mode: "image" triggers equation image rendering in export().
        output_md: Markdown path; defaults to ``<output_dir>/<input stem>.md``.
        assets_dir: Asset directory; defaults to ``<output_dir>/assets``.
        report_path: JSON report path; defaults to
            ``<output_dir>/export-report.json``.
        image_mode: "assets", "base64" or "s3" (the branches of _copy_media()).
        s3_config: Settings for the uploader; used only when image_mode is "s3".
    """
    self.input_path = input_path
    self.output_dir = output_dir
    self.output_md = output_md or (output_dir / (input_path.stem + ".md"))
    self.assets_dir = assets_dir or (output_dir / "assets")
    self.report_path = report_path or (output_dir / "export-report.json")
    # Asset references in the Markdown use only the directory's final name.
    self.asset_ref_prefix = self.assets_dir.name
    self.equation_mode = equation_mode
    self.image_mode = image_mode
    self.s3_config = s3_config
    # The uploader (and its boto3 client) is created only for S3 mode with a config.
    self.s3_uploader = S3ImageUploader(s3_config) if image_mode == "s3" and s3_config else None
    self.stats = ExportStats()
    # Relationship Id -> Target, filled in by export().
    self.rels: dict[str, str] = {}
    # Archive member name -> reference (relative path, data URI or URL).
    self.media_map: dict[str, str] = {}
    self.image_uploads: list[dict[str, str]] = []
    # 1-based equation index -> rendered image reference.
    self.equation_asset_map: dict[int, str] = {}
    self.empty_equation_indexes: set[int] = set()
    self.equation_index = 0
    self.equation_samples: list[dict[str, str]] = []
    # render_paragraph() treats table_depth > 0 as "inside a table".
    self.table_depth = 0
592
-
593
def export(self) -> dict:
    """Convert the input document to Markdown and write a JSON report.

    Creates the output directory (and the assets directory when images are
    stored as assets or equations are rendered to images), copies media,
    renders ``word/document.xml`` to Markdown, then writes the Markdown file
    and the report.

    Fix: the report's parent directory is now created before writing, as was
    already done for the Markdown output. A custom ``report_path`` outside
    ``output_dir`` used to fail with FileNotFoundError after the conversion
    had finished.

    Returns:
        The report dict that was written to ``report_path``.

    Raises:
        RuntimeError: If ``word/document.xml`` has no ``w:body``.
    """
    self.output_dir.mkdir(parents=True, exist_ok=True)
    if self.image_mode == "assets" or self.equation_mode == "image":
        self.assets_dir.mkdir(parents=True, exist_ok=True)

    with self.open_input_zip() as zf:
        self.rels = relationship_map(zf)
        self._copy_media(zf)
        document_xml = zf.read("word/document.xml")
        root = ET.fromstring(document_xml)
        body = root.find("w:body", NS)
        if body is None:
            raise RuntimeError("word/document.xml has no w:body")
        if self.equation_mode == "image":
            self._render_equation_assets(root)

        markdown = self.render_children(body, top_level=True).strip() + "\n"

    self.output_md.parent.mkdir(parents=True, exist_ok=True)
    self.output_md.write_text(markdown, encoding="utf-8")
    report = {
        "input": str(self.input_path),
        "output": str(self.output_md),
        "assets_dir": str(self.assets_dir) if self.assets_dir.exists() else None,
        "image_mode": self.image_mode,
        "image_uploads": self.image_uploads,
        "stats": self.stats.__dict__,
        # Only the first 50 samples are kept in the report.
        "equation_samples": self.equation_samples[:50],
    }
    self.report_path.parent.mkdir(parents=True, exist_ok=True)
    self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
    return report
624
-
625
- @contextlib.contextmanager
626
- def open_input_zip(self):
627
- try:
628
- with ZipFile(self.input_path) as zf:
629
- yield zf
630
- return
631
- except PermissionError:
632
- pass
633
-
634
- with tempfile.TemporaryDirectory(prefix="build-corpus-input-") as tmp:
635
- temp_input = Path(tmp) / self.input_path.name
636
- self.copy_locked_input(temp_input)
637
- self.stats.warnings.append(
638
- f"Input file was locked; converted from temporary copy: {temp_input}"
639
- )
640
- with ZipFile(temp_input) as zf:
641
- yield zf
642
-
643
def copy_locked_input(self, temp_input: Path) -> None:
    """Copy the input file to *temp_input*, using PowerShell as a fallback.

    A plain copy is tried first. If it raises PermissionError on Windows
    (``os.name == "nt"``), ``Copy-Item`` is run through PowerShell; on other
    platforms the error is re-raised.

    Raises:
        PermissionError: If the plain copy fails off Windows, or the
            PowerShell copy exits with a non-zero status.
    """
    try:
        shutil.copyfile(self.input_path, temp_input)
    except PermissionError:
        if os.name != "nt":
            raise
    else:
        return

    # Single quotes are doubled, the escape used inside the quoted literals below.
    source, target = (
        str(location).replace("'", "''") for location in (self.input_path, temp_input)
    )
    command = f"Copy-Item -LiteralPath '{source}' -Destination '{target}' -Force"
    result = subprocess.run(
        ["powershell", "-NoProfile", "-Command", command],
        capture_output=True,
        text=True,
    )
    if result.returncode == 0:
        return
    message = result.stderr.strip() or result.stdout.strip() or "unknown error"
    raise PermissionError(f"Could not copy locked input via PowerShell: {message}")
662
-
663
- def _copy_media(self, zf: ZipFile) -> None:
664
- for name in zf.namelist():
665
- if not name.startswith("word/media/"):
666
- continue
667
- mime_type = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
668
- if self.image_mode == "base64":
669
- data = zf.read(name)
670
- encoded = base64.b64encode(data).decode("ascii")
671
- self.media_map[name] = f"data:{mime_type};base64,{encoded}"
672
- elif self.image_mode == "s3":
673
- if self.s3_uploader is None:
674
- raise RuntimeError("S3/R2 image mode needs s3_config")
675
- data = zf.read(name)
676
- upload = self.s3_uploader.upload(name, data, mime_type)
677
- self.image_uploads.append(upload)
678
- self.media_map[name] = upload["url"]
679
- else:
680
- target = self.assets_dir / Path(name).name
681
- with zf.open(name) as src, target.open("wb") as dst:
682
- shutil.copyfileobj(src, dst)
683
- self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"
684
-
685
def _render_equation_assets(self, document_root: ET.Element) -> None:
    """Render each non-empty ``m:oMath`` element to a PNG via Word automation.

    Each equation is placed in a temporary .docx, saved as HTML by Word, and
    the resulting image is copied to ``<assets_dir>/eq-<index>.png``. Results
    go into ``self.equation_asset_map`` keyed by the equation's 1-based
    position in the document. If python-docx or win32com cannot be imported,
    a warning is recorded and nothing is rendered.
    """
    math_nodes = document_root.findall(".//m:oMath", NS)
    if not math_nodes:
        return
    render_jobs = []
    for index, math_node in enumerate(math_nodes, 1):
        # Empty equations are remembered by index and never sent to Word.
        if self.is_empty_equation(math_node):
            self.empty_equation_indexes.add(index)
            continue
        render_jobs.append((index, math_node))

    try:
        from docx import Document
        from docx.oxml import parse_xml
        import win32com.client as win32
    except Exception as exc:
        # NOTE(review): this early return also skips the two stats
        # assignments at the end of the method.
        self.stats.warnings.append(f"Equation image rendering unavailable: {exc!r}")
        return

    # One equation per temporary document, so each export holds one image.
    chunk_size = 1
    word = win32.DispatchEx("Word.Application")
    word.Visible = False
    word.DisplayAlerts = 0
    try:
        for start in range(0, len(render_jobs), chunk_size):
            chunk = render_jobs[start : start + chunk_size]
            with tempfile.TemporaryDirectory(prefix="build-corpus-equations-") as tmp:
                tmp_dir = Path(tmp)
                temp_docx = tmp_dir / "equations.docx"
                temp_html = tmp_dir / "equations.html"

                doc = Document()
                for absolute_index, math_node in chunk:
                    p = doc.add_paragraph(f"EQMARKER{absolute_index:06d} ")
                    p._p.append(parse_xml(ET.tostring(math_node, encoding="unicode")))
                doc.save(temp_docx)

                opened = word.Documents.Open(str(temp_docx), ReadOnly=True, AddToRecentFiles=False)
                # NOTE(review): FileFormat=10 is presumably Word's
                # filtered-HTML save format - confirm against WdSaveFormat.
                opened.SaveAs2(str(temp_html), FileFormat=10)
                opened.Close(False)

                # The exported images are looked up in "<stem>_files" next to the HTML.
                html_assets = temp_html.with_name(temp_html.stem + "_files")
                rendered = sorted(html_assets.glob("image*.png"))
                if len(rendered) != len(chunk):
                    self.stats.warnings.append(
                        f"Equation image count mismatch in render chunk {start + 1}-{start + len(chunk)}: "
                        f"OMML={len(chunk)} rendered={len(rendered)}"
                    )

                # zip() stops at the shorter sequence, so a mismatch loses
                # images rather than raising.
                for (absolute_index, _math_node), source in zip(chunk, rendered):
                    target = self.assets_dir / f"eq-{absolute_index:06d}.png"
                    shutil.copyfile(source, target)
                    self.equation_asset_map[absolute_index] = f"{self.asset_ref_prefix}/{target.name}"
    finally:
        # Always shut the Word instance down, even when rendering fails.
        word.Quit()
    self.stats.equation_images = len(self.equation_asset_map)
    self.stats.skipped_empty_equations = len(self.empty_equation_indexes)
742
-
743
- def render_children(self, node: ET.Element, top_level: bool = False) -> str:
744
- parts: list[str] = []
745
- for child in list(node):
746
- rendered = self.render_block(child)
747
- if not rendered:
748
- continue
749
- if top_level:
750
- parts.append(rendered.rstrip())
751
- else:
752
- parts.append(rendered.strip())
753
- sep = "\n\n" if top_level else "\n"
754
- return sep.join(part for part in parts if part)
755
-
756
def render_block(self, node: ET.Element) -> str:
    """Render one block-level element to Markdown.

    Paragraphs and tables go to their own renderers, ``sdt`` wrappers render
    their ``w:sdtContent`` as top-level content, bookmark / section /
    proofing / permission markers render as nothing, and any other element
    is rendered through its children.
    """
    ignored = {"bookmarkStart", "bookmarkEnd", "sectPr", "proofErr", "permStart", "permEnd"}
    kind = local_name(node.tag)
    if kind in ignored:
        return ""
    if kind == "p":
        return self.render_paragraph(node)
    if kind == "tbl":
        return self.render_table(node)
    if kind == "sdt":
        content = node.find("./w:sdtContent", NS)
        if content is None:
            return ""
        return self.render_children(content, top_level=True)
    return self.render_children(node, top_level=False)
768
-
769
def render_paragraph(self, p: ET.Element) -> str:
    """Render one ``w:p`` element to Markdown.

    The first matching case wins, in this order: code block, heading, quote,
    list item, math paragraph, plain paragraph. Returns "" for a paragraph
    with no visible content.
    """
    style = paragraph_style(p)
    content = self.render_inline_children(p)
    if not content.strip():
        return ""
    content = strip_trailing_markdown_breaks(content)

    # Run-based code detection is disabled inside tables (table_depth > 0);
    # an explicit code style still applies there.
    if is_code_style(style) or (self.table_depth == 0 and paragraph_is_code(p)):
        self.stats.code_blocks += 1
        return self.render_code_paragraph(p)

    level = heading_level(style)
    if level:
        self.stats.headings += 1
        return f"{'#' * level} {self.strip_inline_markers(content)}"

    if is_quote_style(style):
        self.stats.paragraphs += 1
        return f"> {content}"

    # Explicit numbering properties take precedence over list style names.
    num_info = paragraph_num_info(p)
    if not num_info:
        num_info = paragraph_list_style_info(style)
    if num_info:
        self.stats.lists += 1
        list_level, ordered = num_info
        indent = " " * list_level
        # Every ordered item is emitted as "1.".
        bullet = "1." if ordered else "-"
        return f"{indent}{bullet} {content}"

    if paragraph_is_math(p):
        # Math stays inline inside tables and when the paragraph has no
        # display-style indentation or spacing. None of the math branches
        # increments a statistics counter.
        if self.table_depth > 0:
            return content
        if not paragraph_has_display_math_layout(p):
            return content
        inner = content.strip()
        # Remove one surrounding pair of "$" before re-wrapping as a display block.
        if inner.startswith("$") and inner.endswith("$") and len(inner) >= 2:
            inner = inner[1:-1]
        return f"$$\n{inner}\n$$"

    self.stats.paragraphs += 1
    return content
811
-
812
def render_code_paragraph(self, p: ET.Element) -> str:
    """Render a code paragraph as a fenced code block.

    If the first run with text is bold, its stripped text becomes the fence
    info string and is left out of the code body. Leading and trailing
    newlines of the body are removed.
    """
    info = ""
    body: list[str] = []
    awaiting_first_run = True

    for run in p.findall("./w:r", NS):
        raw = extract_run_text(run)
        if not raw:
            continue
        if awaiting_first_run and run_is_bold(run):
            info = raw.strip()
            awaiting_first_run = False
            continue
        awaiting_first_run = False
        body.append(raw)

    opening = f"```{info}".rstrip()
    code = "".join(body).strip("\n")
    return "\n".join((opening, code, "```"))
831
-
832
def render_inline_children(self, node: ET.Element) -> str:
    """Render the inline content of *node* to Markdown.

    Each child yields (style, text) segments; a style of None marks text
    that is already final Markdown and must not be re-styled. The segments
    are merged by ``coalesce_inline_segments``.
    """
    segments: list[tuple[Optional[InlineStyle], str]] = []
    for child in list(node):
        name = local_name(child.tag)
        if name == "r":
            segments.extend(self.render_run_segments(child))
        elif name == "hyperlink":
            label = self.render_inline_children(child).strip()
            # An internal anchor wins over the external relationship target.
            anchor = attr(child, "w", "anchor")
            rid = attr(child, "r", "id")
            url = f"#{anchor}" if anchor else self.rels.get(rid or "", "")
            # Without a resolvable target only the label is kept.
            segments.append((None, f"[{label}]({url})" if url else label))
        elif name == "oMath":
            segments.append((None, self.render_math(child, display=False)))
        elif name == "oMathPara":
            segments.append((None, self.render_math(child, display=True)))
        elif name == "drawing":
            img = self.render_image(child)
            if img:
                segments.append((None, img))
        elif name in {"pPr", "rPr"}:
            # Property containers carry no text.
            continue
        else:
            # Unknown wrappers are rendered through their own children.
            segments.append((None, self.render_inline_children(child)))
    return coalesce_inline_segments(segments)
857
-
858
def run_style(self, run: ET.Element) -> InlineStyle:
    """Return the ``(is_code, bold, italic)`` flags for a run.

    A run without ``w:rPr`` has no styling at all.
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False, False, False

    style_node = props.find("./w:rStyle", NS)
    style_name = (attr(style_node, "w", "val") or "") if style_node is not None else ""
    code = "code" in style_name.lower() or run_is_code(run)
    has_bold = props.find("./w:b", NS) is not None
    has_italic = props.find("./w:i", NS) is not None
    return code, has_bold, has_italic
869
-
870
def render_run_segments(self, run: ET.Element) -> list[tuple[Optional[InlineStyle], str]]:
    """Split one run into ``(style, text)`` segments.

    Text-like children are accumulated and emitted with the run's style
    (or wrapped in ``$...$`` with no style when the run is math). Drawings
    and unknown children flush the pending text first and are emitted as
    raw segments with style ``None``.
    """
    style = self.run_style(run)
    is_math = run_is_math(run)
    segments: list[tuple[Optional[InlineStyle], str]] = []
    run_parts: list[str] = []

    def flush_text() -> None:
        if run_parts:
            text = "".join(run_parts)
            if is_math:
                segments.append((None, f"${text}$"))
            else:
                segments.append((style, text))
            run_parts.clear()

    for child in list(run):
        name = local_name(child.tag)
        if name == "t":
            run_parts.append(clean_text(child.text or "") if is_math else escape_md_text(child.text or ""))
        elif name == "noBreakHyphen":
            run_parts.append("\u2011")
        elif name == "softHyphen":
            run_parts.append("\u00ad")
        elif name == "tab":
            run_parts.append("\t")
        elif name in {"br", "cr"}:
            # A Markdown hard line break needs at least two trailing spaces
            # before the newline; a single space is only a soft break.
            run_parts.append("  \n")
        elif name == "drawing":
            flush_text()
            img = self.render_image(child)
            if img:
                segments.append((None, img))
        elif name == "rPr":
            continue
        else:
            flush_text()
            nested = self.render_inline_children(child)
            if nested:
                segments.append((None, nested))

    flush_text()
    return segments
912
-
913
def render_math(self, node: ET.Element, display: bool) -> str:
    """Render one OMML equation as TeX or as an image reference.

    Counts the equation, skips it when it is known or detected to be empty,
    and otherwise renders it according to ``self.equation_mode``. Display
    equations are surrounded by newlines unless rendered inside a table.
    On conversion failure an inline-code placeholder is returned.
    """
    self.stats.equations += 1
    self.equation_index += 1
    index = self.equation_index

    if index in self.empty_equation_indexes or self.is_empty_equation(node):
        self.stats.skipped_empty_equations = max(
            self.stats.skipped_empty_equations,
            len(self.empty_equation_indexes),
        )
        return ""

    block_level = display and self.table_depth == 0

    if self.equation_mode == "image":
        asset = self.equation_asset_map.get(index)
        if not asset:
            self.stats.warnings.append(f"Missing rendered equation asset for equation {index}")
            return self.render_missing_equation_fallback(node)
        markup = f"![equation {index}]({asset})"
        return f"\n{markup}\n" if block_level else markup

    try:
        # Convert a detached copy of the node.
        tex = convert_omml(ET.fromstring(ET.tostring(node, encoding="unicode")))
        rendered = normalize_tex(tex, display=block_level)
        if len(self.equation_samples) < 50:
            source_text = "".join(t.text or "" for t in node.findall(".//m:t", NS))
            self.equation_samples.append({
                "source": source_text[:220],
                "tex": rendered[:500],
            })
        return f"\n{rendered}\n" if block_level else rendered
    except Exception as exc:
        self.stats.equation_errors += 1
        self.stats.warnings.append(f"Equation conversion failed: {exc!r}")
        fallback = "".join(t.text or "" for t in node.findall(".//m:t", NS))
        return f"`[equation: {fallback}]`"
946
-
947
def render_missing_equation_fallback(self, node: ET.Element) -> str:
    """Render an equation as inline TeX, or as a text placeholder if conversion fails."""
    try:
        detached = ET.fromstring(ET.tostring(node, encoding="unicode"))
        return normalize_tex(convert_omml(detached), display=False)
    except Exception:
        pieces = [t.text or "" for t in node.findall(".//m:t", NS)]
        return "`[equation: " + "".join(pieces) + "]`"
954
-
955
@staticmethod
def is_empty_equation(node: ET.Element) -> bool:
    """Return True when no ``m:t`` descendant of *node* holds non-whitespace text."""
    for text_node in node.findall(".//m:t", NS):
        if (text_node.text or "").strip():
            return False
    return True
959
-
960
def render_image(self, node: ET.Element) -> str:
    """Render every image referenced by a drawing as Markdown image syntax.

    Each ``a:blip`` relationship id (embedded or linked) is resolved through
    ``self.rels`` and ``self.media_map``. Images with no entry in the media
    map are reported in ``self.stats.warnings`` and skipped. Returns the
    rendered images joined by a single space ("" when there are none).
    """
    preferred_name = image_metadata_filename(node)
    refs = []
    for blip in node.findall(".//a:blip", NS):
        rid = attr(blip, "r", "embed") or attr(blip, "r", "link")
        if rid:
            refs.append(rid)
    rendered = []
    for rid in refs:
        # Fall back to the raw id when the relationship is unknown.
        target = self.rels.get(rid, rid)
        source = resolve_image_target(target)
        asset = self.media_map.get(source)
        if not asset:
            self.stats.warnings.append(f"Image relationship not copied: {rid} -> {target}")
            continue
        if preferred_name and self.image_mode == "assets":
            current_path = self.output_dir / asset
            preferred_path = self.assets_dir / preferred_name
            # Copy only when it would not overwrite an existing file.
            if current_path.exists() and preferred_path != current_path and not preferred_path.exists():
                shutil.copyfile(current_path, preferred_path)
            # NOTE(review): indentation was lost in this listing; this reassignment is
            # assumed to sit at the outer `if` level — confirm against the real file.
            asset = f"{self.asset_ref_prefix}/{preferred_path.name}"
        self.stats.images += 1
        rendered.append(f"![image]({asset})")
    return " ".join(rendered)
984
-
985
def render_table(self, tbl: ET.Element) -> str:
    """Render a Word table as a Markdown table, or as HTML when it is complex.

    A table is complex when any cell contains a nested table or its rendered
    text contains a blank line. ``self.table_depth`` is raised while cells
    are rendered and always restored afterwards.
    """
    self.stats.tables += 1
    self.table_depth += 1
    needs_html = False
    grid = []
    try:
        for row in tbl.findall("./w:tr", NS):
            line = []
            for cell in row.findall("./w:tc", NS):
                has_nested_table = cell.find(".//w:tbl", NS) is not None
                text = self.render_cell(cell)
                if has_nested_table or "\n\n" in text:
                    needs_html = True
                line.append(text)
            grid.append(line)
    finally:
        self.table_depth -= 1

    if not grid:
        return ""
    if not needs_html:
        self.stats.markdown_tables += 1
        return self.render_markdown_table(grid)
    self.stats.html_tables += 1
    return self.render_html_table(grid)
1013
-
1014
def render_cell(self, cell: ET.Element) -> str:
    """Render a table cell's blocks and join the non-empty results with ``<br>``."""
    pieces = []
    for child in list(cell):
        if local_name(child.tag) != "tcPr":
            block = self.render_block(child)
            if block:
                pieces.append(block.strip())
    return "<br>".join(pieces).strip()
1023
-
1024
def render_markdown_table(self, rows: list[list[str]]) -> str:
    """Format *rows* as a pipe table, treating the first row as the header.

    Short rows are padded with empty cells; newlines inside a cell become
    ``<br>`` and pipe characters are backslash-escaped.
    """
    column_count = max(map(len, rows))

    def format_row(cells: list[str]) -> str:
        padded = list(cells) + [""] * (column_count - len(cells))
        cleaned = (c.replace("\n", "<br>").replace("|", "\\|").strip() for c in padded)
        return "| " + " | ".join(cleaned) + " |"

    header, *body = rows
    divider = "| " + " | ".join(["---"] * column_count) + " |"
    return "\n".join([format_row(header), divider, *map(format_row, body)])
1037
-
1038
def render_html_table(self, rows: list[list[str]]) -> str:
    """Format *rows* as an HTML ``<table>`` with escaped cell text.

    Cell text is HTML-escaped (quotes left alone) and newlines become ``<br>``.
    """
    lines = ["<table>"]
    for row in rows:
        lines.append(" <tr>")
        for cell in row:
            # Keep inline Markdown-ish math readable inside HTML fallback.
            escaped = html.escape(cell, quote=False).replace("\n", "<br>")
            lines.append(" <td>" + escaped + "</td>")
        lines.append(" </tr>")
    lines.append("</table>")
    return "\n".join(lines)
1048
-
1049
- @staticmethod
1050
- def strip_inline_markers(text: str) -> str:
1051
- return text.replace("\n", " ").strip()
1052
-
1053
-
1054
def export_one(
    input_path: Path,
    output_root: Path,
    equation_mode: str,
    out_same_dir: bool,
    image_mode: str,
    s3_config: S3ImageConfig | None = None,
) -> dict:
    """Export one document to Markdown and return the exporter's report.

    With *out_same_dir* the Markdown file, assets folder and report are
    written next to the source file; otherwise the exporter receives
    ``output_root / <stem>`` and ``None`` for the three explicit paths.
    """
    stem = input_path.stem
    if out_same_dir:
        output_dir = input_path.parent
        locations = {
            "output_md": input_path.with_suffix(".md"),
            "assets_dir": input_path.with_name(stem + ".assets"),
            "report_path": input_path.with_name(stem + ".export-report.json"),
        }
    else:
        output_dir = output_root / stem
        locations = {"output_md": None, "assets_dir": None, "report_path": None}

    exporter = BuildCorpusExporter(
        input_path,
        output_dir,
        equation_mode=equation_mode,
        image_mode=image_mode,
        s3_config=s3_config,
        **locations,
    )
    return exporter.export()
1083
-
1084
-
1085
def collect_inputs(path: Path, target: str) -> list[Path]:
    """Return the files to convert for *path*.

    A file is returned as-is. A directory is searched recursively for
    ``*.md`` when *target* is ``"word"``, otherwise for Word and PowerPoint
    files; matches are returned sorted.
    """
    if path.is_file():
        return [path]
    globs = ("*.md",) if target == "word" else ("*.docx", "*.pptx", "*.ppt")
    return sorted(match for glob in globs for match in path.rglob(glob))
1093
-
1094
-
1095
def main() -> None:
    """Command-line entry point: parse options, convert every input, write a batch report.

    Options left unset on the command line fall back to the JSON config
    file and then to built-in defaults. One report per converted input is
    collected, written to ``build-corpus-batch-report.json`` and summarised
    on stdout as JSON.
    """
    # NOTE(review): whitespace inside the epilog was lost in this listing;
    # the column alignment of the original help text is not recoverable here.
    parser = argparse.ArgumentParser(
        description="Convert Markdown to DOCX or DOCX/PPTX/PPT to Markdown.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""examples:
build-corpus input.docx --out out
build-corpus input.md --to word --out out
build-corpus ./word-files --out ./markdown
build-corpus ./word-files --out-same-dir
build-corpus input.docx --images base64
build-corpus input.docx --images s3 --config build-corpus.config.json

image modes:
assets copy images into an assets folder and reference them from Markdown
base64 embed images directly as Markdown data URIs
s3 upload images to S3-compatible storage such as Cloudflare R2 or AWS S3

equation modes:
tex convert Word OMML equations to KaTeX-readable TeX
image render equations as images for visual debugging only
""",
    )
    parser.add_argument("input", type=Path, help="Markdown, DOCX, PPTX, or PPT file or directory")
    parser.add_argument("--config", type=Path, help="JSON config file with conversion, output, and S3/R2 defaults")
    parser.add_argument("--out", type=Path, help="Output directory for converted Markdown tree")
    parser.add_argument("--to", choices=["auto", "markdown", "word"], help="Output target; auto infers from a single-file input")
    parser.add_argument("--equations", choices=["tex", "image"], help="Equation output mode; default comes from config or tex")
    parser.add_argument("--images", choices=["assets", "base64", "s3"], help="Image output mode; default comes from config or assets")
    parser.add_argument("--out-same-dir", action="store_true", help="Write .md, .assets, and reports beside each source DOCX")
    parser.add_argument("--word-template", type=Path, help="Optional .docx or .dotx template used for Markdown to Word exports")
    parser.add_argument("--s3-bucket", help="S3/R2 bucket name for --images s3")
    parser.add_argument("--s3-public-base-url", help="Public URL base used in Markdown, e.g. https://assets.example.com")
    parser.add_argument("--s3-prefix", help="Object key prefix for uploaded images")
    parser.add_argument("--s3-endpoint-url", help="S3-compatible endpoint, required for Cloudflare R2")
    parser.add_argument("--s3-region", help="S3 region; use auto for Cloudflare R2")
    parser.add_argument("--s3-access-key-id", help="S3/R2 access key id; can also come from config/env expansion")
    parser.add_argument("--s3-secret-access-key", help="S3/R2 secret access key; can also come from config/env expansion")
    parser.add_argument("--s3-cache-control", help="Cache-Control header for uploaded images")
    parser.add_argument("--s3-acl", help="Optional ACL for AWS S3; usually omitted for Cloudflare R2")
    args = parser.parse_args()
    config = load_config(args.config)

    # Precedence for every option: command line, then config file, then default.
    args.out = args.out or Path(config_get(config, "output.out", ".codex/build-corpus/out"))
    args.to = args.to or config_get(config, "conversion.target", "auto")
    args.equations = args.equations or config_get(config, "conversion.equations", "tex")
    args.images = args.images or config_get(config, "conversion.images", "assets")
    args.out_same_dir = args.out_same_dir or bool(config_get(config, "output.out_same_dir", False))
    args.word_template = args.word_template or (
        Path(config_get(config, "word.template")) if config_get(config, "word.template") else None
    )
    s3_config = build_s3_config(config, args)

    # Resolve "auto": a single .md file goes to Word, everything else to Markdown.
    input_target = args.to
    if args.input.is_file() and args.to == "auto":
        input_target = "word" if args.input.suffix.lower() == ".md" else "markdown"
    elif args.input.is_dir() and args.to == "auto":
        input_target = "markdown"

    reports = []
    for input_path in collect_inputs(args.input, input_target):
        # Skip files whose names start with "~$" (presumably Office lock files — confirm).
        if input_path.name.startswith("~$"):
            continue
        suffix = input_path.suffix.lower()
        if input_target == "word" or suffix == ".md":
            reports.append(export_markdown_to_docx(
                input_path,
                args.out,
                args.out_same_dir,
                template_path=args.word_template,
            ))
        elif suffix in {".pptx", ".ppt"}:
            reports.append(export_presentation(
                input_path,
                args.out,
                args.out_same_dir,
                image_mode=args.images,
            ))
        else:
            reports.append(export_one(
                input_path,
                args.out,
                equation_mode=args.equations,
                out_same_dir=args.out_same_dir,
                image_mode=args.images,
                s3_config=s3_config,
            ))

    # With --out-same-dir on a directory input, the batch report goes in that directory.
    batch_report_root = args.input if args.out_same_dir and args.input.is_dir() else args.out
    batch_report_root.mkdir(parents=True, exist_ok=True)
    batch_report = batch_report_root / "build-corpus-batch-report.json"
    batch_report.write_text(json.dumps(reports, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({
        "converted": len(reports),
        "batch_report": str(batch_report),
        "outputs": [report["output"] for report in reports],
        "default_word_template": str(args.word_template or resolve_default_template_path() or "bundled:md-to-word-template.dotx"),
    }, indent=2))
1192
-
1193
-
1194
- if __name__ == "__main__":
1195
- main()
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import base64
5
+ import contextlib
6
+ import hashlib
7
+ import html
8
+ import json
9
+ import mimetypes
10
+ import os
11
+ import re
12
+ import shutil
13
+ import subprocess
14
+ import tempfile
15
+ import urllib.request
16
+ from dataclasses import dataclass, field
17
+ from pathlib import Path
18
+ from typing import Optional
19
+ from zipfile import ZipFile
20
+ from xml.etree import ElementTree as ET
21
+
22
+ from omml2latex import convert_omml
23
+ try:
24
+ from .frontmatter import add_mdk_frontmatter, read_frontmatter_from_zip
25
+ except ImportError: # pragma: no cover - allows direct script execution
26
+ from build_corpus.frontmatter import add_mdk_frontmatter, read_frontmatter_from_zip
27
+ try:
28
+ from .docx_exporter import export_markdown_to_docx, resolve_default_template_path
29
+ except ImportError: # pragma: no cover - allows direct script execution
30
+ from build_corpus.docx_exporter import export_markdown_to_docx, resolve_default_template_path
31
+ try:
32
+ from .ppt_exporter import export_presentation
33
+ except ImportError: # pragma: no cover - allows direct script execution
34
+ from build_corpus.ppt_exporter import export_presentation
35
+
36
+
37
# XML namespace prefixes used in the ElementTree path queries throughout this module.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
}

# Clark-notation ("{uri}") prefixes for the "w" and "r" namespaces.
W = f"{{{NS['w']}}}"
R = f"{{{NS['r']}}}"
47
+
48
+
49
@dataclass
class ExportStats:
    """Counters and warnings accumulated while exporting one document."""

    paragraphs: int = 0
    headings: int = 0
    code_blocks: int = 0
    tables: int = 0
    # presumably markdown_tables + html_tables == tables; verify against the table renderer
    markdown_tables: int = 0
    html_tables: int = 0
    equations: int = 0
    equation_images: int = 0
    skipped_empty_equations: int = 0
    equation_errors: int = 0
    images: int = 0
    lists: int = 0
    footnotes: int = 0
    # Human-readable messages; default_factory gives each instance its own list.
    warnings: list[str] = field(default_factory=list)
65
+
66
+
67
@dataclass
class S3ImageConfig:
    """Settings for uploading images to S3-compatible object storage."""

    bucket: str
    # Base URL joined with the object key to form the link returned for each upload.
    public_base_url: str
    # Optional key prefix; surrounding slashes are stripped when keys are built.
    prefix: str = ""
    endpoint_url: str | None = None
    region_name: str | None = None
    access_key_id: str | None = None
    secret_access_key: str | None = None
    # Sent as the CacheControl value of every uploaded object.
    cache_control: str = "public, max-age=31536000, immutable"
    # Only sent with the upload when set.
    acl: str | None = None
78
+
79
+
80
class S3ImageUploader:
    """Uploads image bytes to an S3-compatible bucket under content-addressed keys."""

    def __init__(self, config: S3ImageConfig):
        """Create the boto3 client; raise RuntimeError when boto3 is not installed."""
        self.config = config
        try:
            import boto3
        except ImportError as exc:
            raise RuntimeError("S3/R2 image mode requires boto3. Install with: pip install boto3") from exc

        candidates = (
            ("service_name", "s3"),
            ("endpoint_url", config.endpoint_url),
            ("region_name", config.region_name),
            ("aws_access_key_id", config.access_key_id),
            ("aws_secret_access_key", config.secret_access_key),
        )
        # Unset options are omitted rather than passed as None.
        self.client = boto3.client(**{name: value for name, value in candidates if value})

    def upload(self, source_name: str, data: bytes, content_type: str) -> dict[str, str]:
        """Upload *data* and return a record describing the stored object.

        The object key is ``<prefix>/images/sha256/<digest><suffix>``, where
        the suffix is the lower-cased extension of *source_name*.
        """
        cfg = self.config
        digest = hashlib.sha256(data).hexdigest()
        extension = Path(source_name).suffix.lower()
        segments = (cfg.prefix.strip("/"), "images", "sha256", digest + extension)
        key = "/".join(filter(None, segments))

        request = {
            "Bucket": cfg.bucket,
            "Key": key,
            "Body": data,
            "ContentType": content_type,
            "CacheControl": cfg.cache_control,
        }
        if cfg.acl:
            request["ACL"] = cfg.acl
        self.client.put_object(**request)

        return {
            "source": source_name,
            "sha256": digest,
            "bucket": cfg.bucket,
            "key": key,
            "url": cfg.public_base_url.rstrip("/") + "/" + key,
            "content_type": content_type,
            "bytes": str(len(data)),
        }
121
+
122
+
123
def local_name(tag: str) -> str:
    """Return *tag* without its ``{namespace}`` prefix, if it has one."""
    # rpartition yields the whole tag as the last item when "}" is absent.
    return tag.rpartition("}")[2]
125
+
126
+
127
def attr(node: ET.Element, ns: str, name: str) -> str | None:
    """Return the namespaced attribute *name* of *node*, or None when absent.

    *ns* is a prefix key of the module-level ``NS`` mapping.
    """
    qualified = "{%s}%s" % (NS[ns], name)
    return node.attrib.get(qualified)
129
+
130
+
131
def clean_text(text: str) -> str:
    """Turn no-break spaces into plain spaces and drop U+200B / U+FEFF characters."""
    replacements = {0x00A0: " ", 0x200B: None, 0xFEFF: None}
    return text.translate(replacements)
137
+
138
+
139
def escape_md_text(text: str) -> str:
    """Escape Markdown-significant characters in plain document text.

    ``*``, ``_`` and ``$`` are backslash-escaped. A backslash that already
    escapes a Markdown punctuation character is kept as written; any other
    backslash, including one at the very end of the text, is doubled so it
    stays a literal backslash.
    """
    escapable = "\\`*_{}[]()#+.!|$-"
    text = clean_text(text)
    escaped: list[str] = []
    index = 0
    while index < len(text):
        char = text[index]
        if char == "\\":
            next_char = text[index + 1] if index + 1 < len(text) else ""
            # The explicit truthiness check matters: "" is "in" every string,
            # which would otherwise leave a trailing backslash unescaped.
            if next_char and next_char in escapable:
                escaped.append("\\")
                escaped.append(next_char)
                index += 2
                continue
            escaped.append("\\\\")
        elif char in {"*", "_", "$"}:
            escaped.append("\\" + char)
        else:
            escaped.append(char)
        index += 1
    return "".join(escaped)
159
+
160
+
161
def normalize_inline_markers(text: str) -> str:
    """Insert a space between an inline image and directly adjacent text.

    A space is added after an image unless whitespace or closing punctuation
    follows, and before an image unless whitespace or an opening bracket or
    punctuation precedes it.
    """
    spacing_rules = (
        (r"(\!\[[^\]]*\]\([^)]+\))(?=[^\s<>)\].,;:!?])", r"\1 "),
        (r"(?<=[^\s<(\[.,;:!?])(\!\[[^\]]*\]\([^)]+\))", r" \1"),
    )
    for pattern, replacement in spacing_rules:
        text = re.sub(pattern, replacement, text)
    return text
166
+
167
+
168
def strip_trailing_markdown_breaks(text: str) -> str:
    """Remove trailing Markdown hard-break spaces and any other trailing whitespace.

    Only whitespace is ever removed, so text ending in a single space keeps
    the character before that space.
    """
    # Stripping two-space hard breaks and then the remaining whitespace is
    # the same as one rstrip(); doing it this way cannot touch real content.
    return text.rstrip()
172
+
173
+
174
+ InlineStyle = tuple[bool, bool, bool]
175
+
176
+
177
def apply_inline_style(text: str, style: InlineStyle) -> str:
    """Wrap *text* in the Markdown markers for *style* ``(is_code, bold, italic)``.

    Empty text gives ``""`` and whitespace-only text is returned unchanged.
    Code takes precedence over bold and italic; backticks inside code text
    are backslash-escaped.
    """
    is_code, bold, italic = style
    if not text:
        return ""
    if not text.strip():
        return text
    if is_code:
        # Escape outside the f-string: a backslash inside a replacement
        # field is a SyntaxError on Python versions before 3.12.
        escaped = text.replace("`", "\\`")
        return f"`{escaped}`"
    if bold and italic:
        return f"***{text}***"
    if bold:
        return f"**{text}**"
    if italic:
        return f"*{text}*"
    return text
192
+
193
+
194
def coalesce_inline_segments(segments: list[tuple[Optional[InlineStyle], str]]) -> str:
    """Merge adjacent same-style segments and render them as one Markdown string.

    Consecutive segments with an identical style share one set of markers.
    Segments whose style is ``None`` are emitted verbatim and break any run
    in progress. Empty segments are ignored.
    """
    plain = (False, False, False)
    rendered: list[str] = []
    pending: list[str] = []
    pending_style: Optional[InlineStyle] = None

    for style, text in segments:
        if not text:
            continue
        if style is None or style != pending_style:
            # The style changed (or raw text arrived): close the current run.
            if pending:
                rendered.append(apply_inline_style("".join(pending), pending_style or plain))
                pending = []
            pending_style = style
        if style is None:
            rendered.append(text)
        else:
            pending.append(text)

    if pending:
        rendered.append(apply_inline_style("".join(pending), pending_style or plain))
    return normalize_inline_markers("".join(rendered))
223
+
224
+
225
def normalize_tex(tex: str, display: bool) -> str:
    """Clean converter output and wrap it in Markdown math delimiters.

    Returns ``$$\\n<tex>\\n$$`` when *display* is true, otherwise ``$<tex>$``.
    The clean-up steps run in a fixed order; later steps see the output of
    earlier ones.
    """
    tex = clean_text(tex).strip()
    # Remove delimiters already present on the input; ours are added on return.
    if tex.startswith("$$") and tex.endswith("$$"):
        tex = tex[2:-2].strip()
    elif tex.startswith("$") and tex.endswith("$"):
        tex = tex[1:-1].strip()

    tex = tex.replace("\u2011", "-")
    # Any remaining dollar sign is literal content, so escape it.
    tex = tex.replace("$", r"\$")
    # NOTE(review): the whitespace inside the next three \text{...} literals may be
    # distinct Unicode space characters in the real file — confirm before editing.
    tex = tex.replace(r"\text{ }", r"\,")
    tex = tex.replace(r"\text{  }", r"\;")
    tex = tex.replace(r"\text{ }", " ")
    tex = tex.replace(r"\mathrm{\}\text{*}}", r"\*")
    tex = tex.replace(r"\text{-}", "-")
    tex = tex.replace(r"\*", "*")
    tex = replace_raw_unicode_math(tex)
    tex = strip_word_equation_field_codes(tex)
    tex = normalize_redundant_math_italics(tex, display=display)
    tex = escape_text_macro_underscores(tex)
    tex = repair_underbrace_limits(tex)
    tex = balance_tex_braces(tex)
    # Collapse all whitespace runs to single spaces.
    tex = re.sub(r"\s+", " ", tex).strip()
    return f"$$\n{tex}\n$$" if display else f"${tex}$"
248
+
249
+
250
def normalize_redundant_math_italics(tex: str, display: bool) -> str:
    """Simplify redundant wrappers in converted TeX.

    Unwraps single-letter ``\\mathit{}`` groups, braced single-letter bases
    with simple exponents and braced integral limits, spaces integral
    differentials, pads ``=`` with spaces in display mode, and finally
    removes redundant outer brace groups.
    """
    simplifications = (
        (r"\\mathit\{([A-Za-z])\}", r"\1"),
        (r"\{([A-Za-z])\}\^\{([^{}]+)\}", r"\1^\2"),
        (r"\\int_\{([^{}]+)\}\^\{([^{}]+)\}", r"\\int_\1^\2"),
    )
    for pattern, replacement in simplifications:
        tex = re.sub(pattern, replacement, tex)
    tex = normalize_integral_differentials(tex)
    if display:
        tex = re.sub(r"\s*=\s*", " = ", tex)
    return unwrap_redundant_tex_groups(tex)
258
+
259
+
260
def normalize_integral_differentials(tex: str) -> str:
    """Insert a thin space (``\\,``) before differentials in TeX containing ``\\int``.

    Only ``d<letter>`` followed by ``=``, an arithmetic operator or the end
    of the string is changed, and never one already preceded by ``\\,``.
    """
    has_integral = r"\int" in tex
    if has_integral:
        differential = r"(?<!\\,)d([A-Za-z])(?=\s*(?:=|$|[+\-*/]))"
        tex = re.sub(differential, r"\\,d\1", tex)
    return tex
264
+
265
+
266
def unwrap_redundant_tex_groups(tex: str) -> str:
    """Remove brace pairs that wrap the whole expression.

    When the expression starts with an ``\\int_a^b`` head followed by
    whitespace, the remainder is unwrapped separately and re-attached.
    """
    leading_integral = re.match(r"^(\\int_[^\s]+\^[^\s]+)\s+(.+)$", tex)
    if leading_integral is not None:
        integrand = leading_integral.group(2).strip()
        tex = leading_integral.group(1) + " " + unwrap_redundant_tex_groups(integrand)
    while is_wrapped_in_redundant_braces(tex):
        tex = tex[1:-1].strip()
    return tex
274
+
275
+
276
def is_wrapped_in_redundant_braces(tex: str) -> bool:
    """Return True when one matching brace pair encloses all of *tex*.

    Backslash-escaped characters are ignored when counting braces.
    """
    if tex[:1] != "{" or tex[-1:] != "}":
        return False
    last_index = len(tex) - 1
    open_groups = 0
    skip_next = False
    for position, symbol in enumerate(tex):
        if skip_next:
            skip_next = False
        elif symbol == "\\":
            skip_next = True
        elif symbol == "{":
            open_groups += 1
        elif symbol == "}":
            open_groups -= 1
            # Closing the first group before the end means the outer braces
            # do not belong to each other.
            if open_groups == 0 and position != last_index:
                return False
    return open_groups == 0
295
+
296
+
297
# Raw Unicode symbols mapped to the TeX written in their place.
# NOTE(review): presumably these are symbols KaTeX cannot render directly — confirm.
UNICODE_MATH_REPLACEMENTS = {
    "∸": r"\dot{-}",
    "⨅": r"\sqcap",
    "⨃": r"\bigcup",
    "⋜": r"\lessgtr",
    "⋝": r"\gtrless",
    "∱": r"\oint",
    "∲": r"\oint",
    "∳": r"\oint",
    "ℇ": r"\varepsilon",
    "Ϝ": r"\digamma",
    "℩": r"\iota",
    "Å": r"\mathring{A}",
    "℮": "e",
}
312
+
313
+
314
def replace_raw_unicode_math(tex: str) -> str:
    """Replace each symbol listed in ``UNICODE_MATH_REPLACEMENTS`` with its TeX form."""
    for symbol in UNICODE_MATH_REPLACEMENTS:
        if symbol in tex:
            tex = tex.replace(symbol, UNICODE_MATH_REPLACEMENTS[symbol])
    return tex
318
+
319
+
320
def strip_word_equation_field_codes(tex: str) -> str:
    """Remove ``# SEQ Equation * ARABIC n`` field residue from converted TeX.

    Three spellings are removed in turn (wrapped in ``\\left( ... \\right)``,
    in plain parentheses, or bare), matched case-insensitively.
    """
    # Word SEQ fields can leak into OMML conversion as equation-number text.
    field_patterns = (
        r"#\s*\\left\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\\right\)",
        r"#\s*\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\)",
        r"#\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+",
    )
    cleaned = tex
    for field_pattern in field_patterns:
        cleaned = re.compile(field_pattern, re.IGNORECASE).sub("", cleaned)
    return cleaned
330
+
331
+
332
def escape_text_macro_underscores(tex: str) -> str:
    r"""Escape TeX-special characters inside brace-free ``\text{...}`` bodies.

    Backslashes become ``\textbackslash{}`` and ``_``, ``&``, ``%``, ``#``
    gain a leading backslash. Bodies containing braces are left untouched.
    """
    # One pass per character gives the same result as chained replaces,
    # because no replacement text contains another mapped character.
    special = str.maketrans({
        "\\": r"\textbackslash{}",
        "_": r"\_",
        "&": r"\&",
        "%": r"\%",
        "#": r"\#",
    })

    def rebuild(found: re.Match[str]) -> str:
        return r"\text{" + found.group(1).translate(special) + "}"

    return re.sub(r"\\text\{([^{}]*)\}", rebuild, tex)
343
+
344
+
345
def repair_underbrace_limits(tex: str) -> str:
    r"""Rewrite nested ``\mathop{...}\limits`` underbrace constructs as ``\underbrace{..}_{..}``.

    The rewrite repeats until the text stops changing, so several
    occurrences are all converted.
    """
    nested_underbrace = re.compile(
        r"\\mathop\{\\mathop\{(?P<base>.*?)\}\\limits_\{\s*\\underbrace\s*\}\}\\limits_\{(?P<label>.*?)\}"
        r"(?=(?:[+\-]|\\cdot|\\times|=|,|;|$))",
        re.DOTALL,
    )
    while True:
        rewritten = nested_underbrace.sub(r"\\underbrace{\g<base>}_{\g<label>}", tex)
        if rewritten == tex:
            return tex
        tex = rewritten
356
+
357
+
358
def balance_tex_braces(tex: str) -> str:
    """Make unescaped braces balance.

    Unmatched closing braces are dropped and missing closing braces are
    appended at the end. Backslash-escaped characters are copied untouched.
    """
    kept: list[str] = []
    open_groups = 0
    literal_next = False
    for symbol in tex:
        if literal_next:
            literal_next = False
        elif symbol == "\\":
            literal_next = True
        elif symbol == "{":
            open_groups += 1
        elif symbol == "}":
            if open_groups == 0:
                # Drop unmatched closing braces; KaTeX rejects them.
                continue
            open_groups -= 1
        kept.append(symbol)
    return "".join(kept) + "}" * open_groups
384
+
385
+
386
def paragraph_style(node: ET.Element) -> str | None:
    """Return the paragraph's ``w:pStyle`` value, or None when it has none."""
    style_element = node.find("./w:pPr/w:pStyle", NS)
    if style_element is None:
        return None
    return attr(style_element, "w", "val")
389
+
390
+
391
def heading_level(style: str | None) -> int | None:
    """Return 1-6 for a style named exactly ``Heading1`` .. ``Heading6``, else None."""
    if not style:
        return None
    prefix, digit = style[:7], style[7:]
    if prefix == "Heading" and digit in {"1", "2", "3", "4", "5", "6"}:
        return int(digit)
    return None
398
+
399
+
400
def is_code_style(style: str | None) -> bool:
    """Return True when the style name contains "code" (case-insensitive)."""
    if not style:
        return False
    return "code" in style.lower()
402
+
403
+
404
def is_quote_style(style: str | None) -> bool:
    """Return True for the recognised quote style names, ignoring spaces and case."""
    quote_styles = ("buildcorpusquote", "quote", "intensequote")
    if style:
        return "".join(style.split(" ")).lower() in quote_styles
    return False
409
+
410
+
411
def paragraph_num_info(node: ET.Element) -> tuple[int, bool] | None:
    """Return ``(level, ordered)`` for a numbered paragraph, or None.

    The level comes from ``w:ilvl`` (0 when missing); ``ordered`` is always
    False because the numbering definitions are not consulted here.
    """
    numbering = node.find("./w:pPr/w:numPr", NS)
    if numbering is None:
        return None
    level_node = numbering.find("./w:ilvl", NS)
    depth = 0
    if level_node is not None:
        depth = int(attr(level_node, "w", "val") or "0")
    # Without numbering.xml style resolution, use bullets as the safer default.
    return depth, False
419
+
420
+
421
def paragraph_list_style_info(style: str | None) -> tuple[int, bool] | None:
    """Derive ``(level, ordered)`` from a ``List Bullet`` / ``List Number`` style name.

    A numeric suffix (``List Bullet 2``) is a one-based depth and is returned
    zero-based; a missing or non-numeric suffix means the top level. Any
    other style gives None.
    """
    if not style:
        return None
    compact = style.replace(" ", "").lower()
    for prefix, ordered in (("listbullet", False), ("listnumber", True)):
        if compact.startswith(prefix):
            tail = compact[len(prefix):]
            depth = int(tail) if tail.isdigit() else 1
            return max(depth - 1, 0), ordered
    return None
434
+
435
+
436
def run_is_math(run: ET.Element) -> bool:
    """Return True when any of the run's ascii/hAnsi/cs fonts is "Cambria Math"."""
    fonts = run.find("./w:rPr", NS)
    if fonts is not None:
        fonts = fonts.find("./w:rFonts", NS)
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "cambria math"
        for slot in ("ascii", "hAnsi", "cs")
    )
448
+
449
+
450
def run_is_code(run: ET.Element) -> bool:
    """Return True when the run looks like code.

    A run counts as code when its character style name contains "code" or
    one of its ascii/hAnsi/cs fonts is Consolas (both case-insensitive).
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False

    style_node = props.find("./w:rStyle", NS)
    if style_node is not None:
        style_name = attr(style_node, "w", "val") or ""
        if "code" in style_name.lower():
            return True

    fonts = props.find("./w:rFonts", NS)
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "consolas"
        for slot in ("ascii", "hAnsi", "cs")
    )
465
+
466
+
467
def run_is_bold(run: ET.Element) -> bool:
    """Return True when the run's properties switch bold on.

    ``w:b`` is an on/off property: a bare ``<w:b/>`` means on, while an
    explicit ``w:val`` of ``0``, ``false`` or ``off`` means off, so the
    element being present is not enough to call the run bold.
    """
    bold = run.find("./w:rPr/w:b", NS)
    if bold is None:
        return False
    value = attr(bold, "w", "val")
    return value is None or value.strip().lower() not in {"0", "false", "off"}
470
+
471
+
472
def paragraph_is_code(node: ET.Element) -> bool:
    """Return True when the paragraph's visible text consists of code runs.

    Runs with no non-whitespace text are ignored. The first remaining run
    may be a bold label; every other run must be code, and there must be at
    least one code run.
    """
    seen_text = False
    code_count = 0

    for run in node.findall("./w:r", NS):
        text = extract_run_text(run)
        if not (text and text.strip()):
            continue
        is_leading = not seen_text
        seen_text = True
        # A bold first run is tolerated as a label rather than code.
        if is_leading and run_is_bold(run):
            continue
        if not run_is_code(run):
            return False
        code_count += 1

    return code_count > 0
496
+
497
+
498
def extract_run_text(run: ET.Element) -> str:
    """Return the run's plain text: cleaned ``w:t`` text, tabs as ``\\t``, breaks as ``\\n``."""
    fixed_tokens = {"tab": "\t", "br": "\n", "cr": "\n"}
    pieces: list[str] = []
    for child in list(run):
        kind = local_name(child.tag)
        if kind == "t":
            pieces.append(clean_text(child.text or ""))
        elif kind in fixed_tokens:
            pieces.append(fixed_tokens[kind])
    return "".join(pieces)
509
+
510
+
511
def paragraph_is_math(node: ET.Element) -> bool:
    """Return True when every visible text run in the paragraph is a math run.

    Hidden runs and runs without non-whitespace ``w:t`` text are ignored;
    a paragraph with no remaining runs is not math.
    """
    found_text = False
    for run in node.findall("./w:r", NS):
        if run_is_hidden(run):
            continue
        has_text = any((t.text or "").strip() for t in run.findall("./w:t", NS))
        if not has_text:
            continue
        if not run_is_math(run):
            return False
        found_text = True
    return found_text
525
+
526
+
527
def paragraph_has_display_math_layout(node: ET.Element) -> bool:
    """Return True when the paragraph has a non-zero indent or before/after spacing."""
    layout_checks = (
        ("./w:pPr/w:ind", ("left", "right", "firstLine", "hanging")),
        ("./w:pPr/w:spacing", ("before", "after")),
    )
    for path, keys in layout_checks:
        element = node.find(path, NS)
        if element is None:
            continue
        for key in keys:
            # A missing attribute and an explicit "0" both mean "no offset".
            if attr(element, "w", key) not in {None, "0"}:
                return True
    return False
535
+
536
+
537
def run_is_hidden(run: ET.Element) -> bool:
    """Return True when the run's properties switch hidden text (``w:vanish``) on.

    ``w:vanish`` is an on/off property: a bare element means on, while an
    explicit ``w:val`` of ``0``, ``false`` or ``off`` means the text is
    visible.
    """
    vanish = run.find("./w:rPr/w:vanish", NS)
    if vanish is None:
        return False
    value = attr(vanish, "w", "val")
    return value is None or value.strip().lower() not in {"0", "false", "off"}
539
+
540
+
541
def relationship_map(zip_file: ZipFile, part: str = "word/document.xml") -> dict[str, str]:
    """Return ``{relationship id: target}`` for a package part.

    Reads ``<dir>/_rels/<name>.rels`` next to *part*. A missing file gives
    an empty mapping; entries without an ``Id`` are skipped and a missing
    ``Target`` becomes ``""``.
    """
    part_path = Path(part)
    rels_file = part_path.parent / "_rels" / (part_path.name + ".rels")
    # Zip member names always use forward slashes.
    rels_path = str(rels_file).replace("\\", "/")
    if rels_path not in zip_file.namelist():
        return {}

    mapping: dict[str, str] = {}
    for rel in ET.fromstring(zip_file.read(rels_path)):
        if "Id" in rel.attrib:
            mapping[rel.attrib["Id"]] = rel.attrib.get("Target", "")
    return mapping
551
+
552
+
553
def resolve_image_target(target: str) -> str:
    """Turn a relationship target into a package-internal member path.

    A target starting with ``/`` is already relative to the package root,
    so only the leading slash is removed. Otherwise a leading ``../`` is
    dropped and the result is placed under ``word/`` unless it already
    starts there.
    """
    if target.startswith("/"):
        # Package-absolute reference: prefixing "word/" would give an
        # invalid "word//..." path that never matches a zip member.
        return target.lstrip("/")
    if target.startswith("../"):
        target = target[3:]
    if not target.startswith("word/"):
        target = f"word/{target}"
    return target
559
+
560
+
561
def image_metadata_filename(node: ET.Element) -> str | None:
    """Return a file name found in the drawing's metadata, if any.

    Looks at the ``descr``, ``title`` and ``name`` attributes (in that order)
    of every ``wp:docPr`` and then every ``wp:cNvPr`` descendant. The first
    value that has a file suffix wins, reduced to its final path component.
    Returns None when no attribute looks like a file name.
    """
    # NOTE(review): cNvPr is looked up under the "wp" prefix here; confirm that
    # matches the namespace the element actually uses in the documents handled.
    candidates = (
        entry.attrib.get(key)
        for tag_name in ("docPr", "cNvPr")
        for entry in node.findall(f".//wp:{tag_name}", NS)
        for key in ("descr", "title", "name")
    )
    for value in candidates:
        if value and Path(value).suffix:
            return Path(value).name
    return None
569
+
570
+
571
def expand_env(value):
    """Recursively expand environment variables inside a config value.

    Strings go through ``os.path.expandvars``; dicts and lists are rebuilt
    with their values/items expanded (dict keys are left alone); anything
    else is returned unchanged.
    """
    if isinstance(value, dict):
        expanded = {}
        for name, entry in value.items():
            expanded[name] = expand_env(entry)
        return expanded
    if isinstance(value, list):
        return list(map(expand_env, value))
    return os.path.expandvars(value) if isinstance(value, str) else value
579
+
580
+
581
def load_config(path: Path | None) -> dict:
    """Load a JSON config file and expand environment variables in it.

    Returns an empty dict when *path* is None.

    Raises:
        FileNotFoundError: *path* does not exist.
        ValueError: *path* does not have a ``.json`` suffix.
    """
    if path is None:
        return {}
    if not path.exists():
        raise FileNotFoundError(f"Config file not found: {path}")
    if path.suffix.lower() != ".json":
        raise ValueError("Config currently supports JSON files only")
    raw_text = path.read_text(encoding="utf-8")
    parsed = json.loads(raw_text)
    return expand_env(parsed)
589
+
590
+
591
def config_get(config: dict, key: str, default=None):
    """Look up a dotted *key* (``"a.b.c"``) in nested dicts.

    Returns *default* as soon as a segment is missing or the current value is
    not a dict; otherwise returns the value found at the end of the path.
    """
    node = config
    for segment in key.split("."):
        if isinstance(node, dict) and segment in node:
            node = node[segment]
        else:
            return default
    return node
598
+
599
+
600
# Markdown image syntax: group 1 is the alt text, group 2 the reference
# (no spaces or closing parenthesis); an optional double-quoted title after
# the reference is matched but not captured.
IMAGE_MARKDOWN_RE = re.compile(r"!\[([^\]]*)\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)")

# Package parts copied into the Word style sidecar when present in the input.
STYLE_PACKAGE_PARTS = {
    "word/fontTable.xml",
    "word/numbering.xml",
    "word/settings.xml",
    "word/styles.xml",
    "word/stylesWithEffects.xml",
    "word/theme/theme1.xml",
    "word/webSettings.xml",
}

# Relationship and document-property parts copied alongside the style parts.
STYLE_PACKAGE_REL_PARTS = {
    "_rels/.rels",
    "docProps/app.xml",
    "docProps/core.xml",
    "docProps/custom.xml",
    "word/_rels/document.xml.rels",
}
617
+
618
+
619
def image_mime_from_ref(ref: str, data: bytes) -> str:
    """Guess an image MIME type from its reference, then from its bytes.

    The part of *ref* before any ``?`` is passed to ``mimetypes``; if that
    yields nothing, the leading bytes of *data* are compared against the
    PNG, JPEG, GIF and WebP signatures. Falls back to
    ``application/octet-stream``.
    """
    name_part = ref.split("?", 1)[0]
    by_name, _encoding = mimetypes.guess_type(name_part)
    if by_name:
        return by_name

    signatures = (
        (b"\x89PNG\r\n\x1a\n", "image/png"),
        (b"\xff\xd8", "image/jpeg"),
        (b"GIF87a", "image/gif"),
        (b"GIF89a", "image/gif"),
    )
    for magic, mime in signatures:
        if data.startswith(magic):
            return mime
    # WebP is a RIFF container whose form type sits at bytes 8..11.
    if data[:4] == b"RIFF" and data[8:12] == b"WEBP":
        return "image/webp"
    return "application/octet-stream"
632
+
633
+
634
def read_external_image(ref: str, base_dir: Path) -> tuple[bytes, str]:
    """Read an image referenced from Markdown and return ``(bytes, mime)``.

    * ``data:`` references are rejected with ``ValueError("already inline")``.
    * ``http://`` / ``https://`` references are downloaded (30 s timeout).
      The server's Content-Type is used when it is present and specific;
      otherwise the type is guessed from the URL and the payload.
    * Anything else is treated as a file path, resolved against *base_dir*
      when relative.

    Raises:
        ValueError: *ref* is already a data URI.
        OSError: the file or URL cannot be read (``urllib`` errors derive
            from it).
    """
    if ref.startswith("data:"):
        raise ValueError("already inline")
    if re.match(r"^https?://", ref, re.IGNORECASE):
        with urllib.request.urlopen(ref, timeout=30) as response:
            data = response.read()
            headers = response.headers
            # get_content_type() never returns an empty value: it reports
            # "text/plain" when the header is missing. Check for the header
            # explicitly so the sniffing fallback can actually run.
            declared = headers.get_content_type() if headers.get("Content-Type") else ""
        if not declared or declared == "application/octet-stream":
            return data, image_mime_from_ref(ref, data)
        return data, declared
    path = Path(ref)
    if not path.is_absolute():
        path = base_dir / path
    data = path.read_bytes()
    return data, image_mime_from_ref(str(path), data)
647
+
648
+
649
def inline_markdown_images(input_path: Path, output_path: Path | None = None) -> dict:
    """Rewrite a Markdown file so image references become base64 data URIs.

    Each image matched by ``IMAGE_MARKDOWN_RE`` is read with
    ``read_external_image`` (relative paths resolve against the input file's
    directory). Images that cannot be read are left untouched and recorded
    in the ``skipped`` list as ``"<ref>: <error>"``.

    The result is written to *output_path*, or to ``<stem>.inline.md`` beside
    the input when no output path is given.

    Returns:
        A summary dict with the keys ``input``, ``output``,
        ``images_inlined`` and ``skipped``.
    """
    source_text = input_path.read_text(encoding="utf-8")
    image_root = input_path.parent
    inlined_refs: list[str] = []
    skipped: list[str] = []

    def to_data_uri(match: re.Match[str]) -> str:
        alt, ref = match.group(1, 2)
        try:
            data, mime = read_external_image(ref, image_root)
        except Exception as exc:
            # Best effort: keep the original reference and report why.
            skipped.append(f"{ref}: {exc}")
            return match.group(0)
        inlined_refs.append(ref)
        payload = base64.b64encode(data).decode("ascii")
        return f"![{alt}](data:{mime};base64,{payload})"

    rewritten = IMAGE_MARKDOWN_RE.sub(to_data_uri, source_text)
    target = output_path or input_path.with_name(input_path.stem + ".inline.md")
    target.write_text(rewritten, encoding="utf-8")
    return {
        "input": str(input_path),
        "output": str(target),
        "images_inlined": len(inlined_refs),
        "skipped": skipped,
    }
676
+
677
+
678
def build_s3_config(config: dict, args: argparse.Namespace) -> S3ImageConfig | None:
    """Build the S3/R2 upload settings from CLI arguments and the config file.

    Returns None unless ``args.images`` is ``"s3"``. Command-line values take
    precedence over the ``s3`` section of *config*. For ``prefix`` and
    ``acl`` only a CLI value of None defers to the config; for every other
    field any falsy CLI value does.

    Raises:
        ValueError: no bucket or no public base URL could be determined.
    """
    if args.images != "s3":
        return None
    section = config_get(config, "s3", {}) or {}

    def first_truthy(cli_value, key, fallback=None):
        return cli_value or section.get(key, fallback)

    def unless_none(cli_value, key, fallback=None):
        return cli_value if cli_value is not None else section.get(key, fallback)

    bucket = first_truthy(args.s3_bucket, "bucket")
    public_base_url = first_truthy(args.s3_public_base_url, "public_base_url")
    if not (bucket and public_base_url):
        raise ValueError("S3/R2 image mode requires bucket and public_base_url")

    return S3ImageConfig(
        bucket=bucket,
        public_base_url=public_base_url,
        prefix=unless_none(args.s3_prefix, "prefix", ""),
        endpoint_url=first_truthy(args.s3_endpoint_url, "endpoint_url"),
        region_name=first_truthy(args.s3_region, "region_name"),
        access_key_id=first_truthy(args.s3_access_key_id, "access_key_id"),
        secret_access_key=first_truthy(args.s3_secret_access_key, "secret_access_key"),
        cache_control=first_truthy(
            args.s3_cache_control, "cache_control", "public, max-age=31536000, immutable"
        ),
        acl=unless_none(args.s3_acl, "acl"),
    )
697
+
698
+
699
class BuildCorpusExporter:
    """Convert one DOCX file into Markdown, assets, a style sidecar and a report.

    An instance is single-use: ``export()`` walks ``word/document.xml`` and
    accumulates state (statistics, footnote usage, equation counters) on the
    instance while rendering.

    NOTE(review): whitespace inside a few string literals below (list indent,
    line-break marker, HTML table indentation) is reproduced exactly as it
    appears in the reviewed text; confirm against the repository file that no
    runs of spaces were collapsed.
    """

    def __init__(
        self,
        input_path: Path,
        output_dir: Path,
        equation_mode: str = "tex",
        output_md: Path | None = None,
        assets_dir: Path | None = None,
        report_path: Path | None = None,
        image_mode: str = "assets",
        s3_config: S3ImageConfig | None = None,
        emit_frontmatter: bool = True,
    ):
        """Store the conversion settings and reset all per-export state.

        Args:
            input_path: DOCX file to convert.
            output_dir: Directory that receives the outputs.
            equation_mode: ``"tex"`` or ``"image"``.
            output_md: Markdown target; defaults to ``<output_dir>/<stem>.md``.
            assets_dir: Asset directory; defaults to ``<output_dir>/assets``.
            report_path: JSON report target; defaults to
                ``<output_dir>/export-report.json``.
            image_mode: ``"assets"``, ``"base64"`` or ``"s3"``.
            s3_config: Upload settings, needed for ``image_mode="s3"``.
            emit_frontmatter: Whether frontmatter is added to the Markdown.
        """
        self.input_path = input_path
        self.output_dir = output_dir
        self.emit_frontmatter = emit_frontmatter
        self.output_md = output_md or (output_dir / (input_path.stem + ".md"))
        self.assets_dir = assets_dir or (output_dir / "assets")
        self.report_path = report_path or (output_dir / "export-report.json")
        # Markdown references use only the asset directory's own name.
        self.asset_ref_prefix = self.assets_dir.name
        self.equation_mode = equation_mode
        self.image_mode = image_mode
        self.s3_config = s3_config
        self.s3_uploader = S3ImageUploader(s3_config) if image_mode == "s3" and s3_config else None
        self.stats = ExportStats()
        self.rels: dict[str, str] = {}
        self.media_map: dict[str, str] = {}
        self.image_uploads: list[dict[str, str]] = []
        self.equation_asset_map: dict[int, str] = {}
        self.empty_equation_indexes: set[int] = set()
        self.equation_index = 0
        self.equation_samples: list[dict[str, str]] = []
        self.table_depth = 0
        self.pending_math_source: str | None = None
        self.footnotes: dict[str, str] = {}
        self.used_footnotes: list[str] = []
        self.word_style_package_path: Path | None = None
        self.word_style_manifest_path: Path | None = None

    def export(self) -> dict:
        """Run the conversion and return the report dict.

        Writes the Markdown file, the Word style sidecar, any media or
        equation assets, and the JSON report.

        Raises:
            RuntimeError: ``word/document.xml`` has no ``w:body``.
        """
        self.output_dir.mkdir(parents=True, exist_ok=True)
        if self.image_mode == "assets" or self.equation_mode == "image":
            self.assets_dir.mkdir(parents=True, exist_ok=True)

        prior_frontmatter: str | None = None
        with self.open_input_zip() as zf:
            self.rels = relationship_map(zf)
            self.footnotes = self.load_footnotes(zf)
            if self.emit_frontmatter:
                prior_frontmatter = read_frontmatter_from_zip(zf)
            self.word_style_package_path, self.word_style_manifest_path = self.write_word_style_package(zf)
            self._copy_media(zf)
            document_xml = zf.read("word/document.xml")
            root = ET.fromstring(document_xml)
            body = root.find("w:body", NS)
            if body is None:
                raise RuntimeError("word/document.xml has no w:body")
            if self.equation_mode == "image":
                self._render_equation_assets(root)

            markdown = self.render_children(body, top_level=True).strip() + "\n"
            markdown = self.add_footnote_definitions(markdown)

        if self.emit_frontmatter:
            markdown = add_mdk_frontmatter(markdown, self.input_path, prior_frontmatter)

        self.output_md.parent.mkdir(parents=True, exist_ok=True)
        self.output_md.write_text(markdown, encoding="utf-8")
        report = {
            "input": str(self.input_path),
            "output": str(self.output_md),
            "assets_dir": str(self.assets_dir) if self.assets_dir.exists() else None,
            "image_mode": self.image_mode,
            "image_uploads": self.image_uploads,
            "stats": self.stats.__dict__,
            "equation_samples": self.equation_samples[:50],
            "word_style_package": str(self.word_style_package_path) if self.word_style_package_path else None,
            "word_style_manifest": str(self.word_style_manifest_path) if self.word_style_manifest_path else None,
        }
        # A caller-supplied report path may live outside output_dir.
        self.report_path.parent.mkdir(parents=True, exist_ok=True)
        self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
        return report

    def write_word_style_package(self, zf: ZipFile) -> tuple[Path | None, Path | None]:
        """Copy style-related package parts into a ``.wordstyle`` sidecar.

        The sidecar directory sits beside the Markdown output and receives
        ``style-package.docx`` (a zip of the copied parts), an unpacked
        ``package/`` tree, and ``manifest.json``.

        Returns:
            ``(style_package_path, manifest_path)``.
        """
        sidecar = self.output_md.with_suffix(".wordstyle")
        package_dir = sidecar / "package"
        package_docx = sidecar / "style-package.docx"
        manifest_path = sidecar / "manifest.json"
        sidecar.mkdir(parents=True, exist_ok=True)
        package_dir.mkdir(parents=True, exist_ok=True)
        names = set(zf.namelist())
        copied_parts: list[str] = []
        with ZipFile(package_docx, "w") as package_zip:
            for name in sorted((STYLE_PACKAGE_PARTS | STYLE_PACKAGE_REL_PARTS) & names):
                data = zf.read(name)
                package_zip.writestr(name, data)
                target = package_dir / Path(name)
                target.parent.mkdir(parents=True, exist_ok=True)
                target.write_bytes(data)
                copied_parts.append(name)
        manifest = {
            "source": str(self.input_path),
            "stylePackage": str(package_docx),
            "parts": copied_parts,
            "roundTrip": "word-to-md-to-word",
        }
        manifest_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
        return package_docx, manifest_path

    def add_roundtrip_metadata(self, markdown: str) -> str:
        """Prefix *markdown* with HTML comments pointing at the style sidecar.

        Returns *markdown* unchanged when no sidecar has been written. The
        sidecar paths are expressed relative to the Markdown file's directory.
        """
        if not self.word_style_package_path or not self.word_style_manifest_path:
            return markdown
        package_ref = self.word_style_package_path.relative_to(self.output_md.parent).as_posix()
        manifest_ref = self.word_style_manifest_path.relative_to(self.output_md.parent).as_posix()
        metadata = (
            f'<!-- build-corpus:word-style-package path="{package_ref}" manifest="{manifest_ref}" -->\n'
            "<!-- build-corpus:metadata hidden=\"true\" -->\n\n"
        )
        return metadata + markdown

    def add_footnote_definitions(self, markdown: str) -> str:
        """Append ``[^id]: text`` definitions for every referenced footnote.

        Footnotes whose text is empty are omitted; when nothing remains the
        input is returned unchanged.
        """
        if not self.used_footnotes:
            return markdown
        lines = ["", ""]
        for note_id in self.used_footnotes:
            text = self.footnotes.get(note_id, "").strip()
            if text:
                lines.append(f"[^{note_id}]: {text}")
        if len(lines) == 2:
            return markdown
        return markdown.rstrip() + "\n" + "\n".join(lines) + "\n"

    def load_footnotes(self, zf: ZipFile) -> dict[str, str]:
        """Render ``word/footnotes.xml`` into a ``{footnote id: text}`` map.

        Separator footnotes and footnotes without an id are skipped; rendered
        text is flattened onto one line.

        Footnotes are rendered with the same stateful renderer as the body,
        so the renderer state is isolated for the duration:

        * relationship ids are resolved against the footnotes part's own
          relationships, not the main document's;
        * the equation counter and any pending math source are restored
          afterwards, so body equations keep the indexes that
          ``_render_equation_assets`` assigns to them.
        """
        if "word/footnotes.xml" not in zf.namelist():
            return {}
        root = ET.fromstring(zf.read("word/footnotes.xml"))
        notes: dict[str, str] = {}

        saved_rels = self.rels
        saved_equation_index = self.equation_index
        saved_pending_source = self.pending_math_source
        self.rels = relationship_map(zf, "word/footnotes.xml")
        try:
            for note in root.findall("./w:footnote", NS):
                note_id = attr(note, "w", "id")
                note_type = attr(note, "w", "type")
                if not note_id or note_type in {"separator", "continuationSeparator"}:
                    continue
                rendered = self.render_children(note, top_level=True).strip()
                if rendered:
                    notes[note_id] = rendered.replace("\n", " ")
        finally:
            self.rels = saved_rels
            self.equation_index = saved_equation_index
            self.pending_math_source = saved_pending_source
        return notes

    @contextlib.contextmanager
    def open_input_zip(self):
        """Yield the input as an open ``ZipFile``.

        If opening the file raises ``PermissionError``, the input is copied
        to a temporary directory first and the copy is opened instead; a
        warning is recorded in the stats.

        Only the *open* is guarded. The ``yield`` sits outside the ``try`` so
        that a ``PermissionError`` raised by the caller's ``with`` body
        propagates normally instead of triggering the fallback (and a second
        ``yield``).
        """
        try:
            direct = ZipFile(self.input_path)
        except PermissionError:
            direct = None

        if direct is not None:
            with direct:
                yield direct
            return

        with tempfile.TemporaryDirectory(prefix="build-corpus-input-") as tmp:
            temp_input = Path(tmp) / self.input_path.name
            self.copy_locked_input(temp_input)
            self.stats.warnings.append(
                f"Input file was locked; converted from temporary copy: {temp_input}"
            )
            with ZipFile(temp_input) as zf:
                yield zf

    def copy_locked_input(self, temp_input: Path) -> None:
        """Copy the input file to *temp_input*.

        Tries a plain file copy first. On Windows (``os.name == "nt"``) a
        ``PermissionError`` falls back to PowerShell ``Copy-Item``; elsewhere
        the error is re-raised.

        Raises:
            PermissionError: the copy failed, including via PowerShell.
        """
        try:
            shutil.copyfile(self.input_path, temp_input)
            return
        except PermissionError:
            if os.name != "nt":
                raise

        # Single quotes are doubled because the paths are embedded in
        # single-quoted PowerShell string literals.
        source = str(self.input_path).replace("'", "''")
        target = str(temp_input).replace("'", "''")
        command = f"Copy-Item -LiteralPath '{source}' -Destination '{target}' -Force"
        result = subprocess.run(
            ["powershell", "-NoProfile", "-Command", command],
            capture_output=True,
            text=True,
        )
        if result.returncode != 0:
            message = result.stderr.strip() or result.stdout.strip() or "unknown error"
            raise PermissionError(f"Could not copy locked input via PowerShell: {message}")

    def _copy_media(self, zf: ZipFile) -> None:
        """Publish every ``word/media/`` member according to ``image_mode``.

        Fills ``media_map`` (zip member name -> Markdown reference) with a
        data URI (``base64``), an uploaded URL (``s3``) or a path into the
        assets directory (any other mode).

        Raises:
            RuntimeError: ``image_mode`` is ``"s3"`` but no uploader exists.
        """
        for name in zf.namelist():
            if not name.startswith("word/media/"):
                continue
            if name.endswith("/"):
                # Directory entry, not a media file.
                continue
            mime_type = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
            if self.image_mode == "base64":
                data = zf.read(name)
                encoded = base64.b64encode(data).decode("ascii")
                self.media_map[name] = f"data:{mime_type};base64,{encoded}"
            elif self.image_mode == "s3":
                if self.s3_uploader is None:
                    raise RuntimeError("S3/R2 image mode needs s3_config")
                data = zf.read(name)
                upload = self.s3_uploader.upload(name, data, mime_type)
                self.image_uploads.append(upload)
                self.media_map[name] = upload["url"]
            else:
                target = self.assets_dir / Path(name).name
                with zf.open(name) as src, target.open("wb") as dst:
                    shutil.copyfileobj(src, dst)
                self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"

    def _render_equation_assets(self, document_root: ET.Element) -> None:
        """Render each non-empty ``m:oMath`` node to a PNG via Word automation.

        Equations are numbered from 1 in document order. Empty ones are
        recorded in ``empty_equation_indexes``; the rest are saved one at a
        time through a temporary DOCX that Word exports as filtered HTML, and
        the resulting image is copied to ``eq-<index>.png`` in the assets
        directory.

        Rendering needs ``python-docx``, ``pywin32`` and a startable Word
        instance; when any is unavailable a warning is recorded and no images
        are produced.
        """
        math_nodes = document_root.findall(".//m:oMath", NS)
        if not math_nodes:
            return
        render_jobs = []
        for index, math_node in enumerate(math_nodes, 1):
            if self.is_empty_equation(math_node):
                self.empty_equation_indexes.add(index)
                continue
            render_jobs.append((index, math_node))
        self.stats.skipped_empty_equations = len(self.empty_equation_indexes)

        if not render_jobs:
            # Nothing to draw: do not start Word at all.
            self.stats.equation_images = len(self.equation_asset_map)
            return

        try:
            from docx import Document
            from docx.oxml import parse_xml
            import win32com.client as win32
        except Exception as exc:
            self.stats.warnings.append(f"Equation image rendering unavailable: {exc!r}")
            return

        try:
            word = win32.DispatchEx("Word.Application")
        except Exception as exc:
            # Word missing or not startable: same outcome as a missing module.
            self.stats.warnings.append(f"Equation image rendering unavailable: {exc!r}")
            return

        chunk_size = 1
        try:
            word.Visible = False
            word.DisplayAlerts = 0
            for start in range(0, len(render_jobs), chunk_size):
                chunk = render_jobs[start : start + chunk_size]
                with tempfile.TemporaryDirectory(prefix="build-corpus-equations-") as tmp:
                    tmp_dir = Path(tmp)
                    temp_docx = tmp_dir / "equations.docx"
                    temp_html = tmp_dir / "equations.html"

                    doc = Document()
                    for absolute_index, math_node in chunk:
                        p = doc.add_paragraph(f"EQMARKER{absolute_index:06d} ")
                        p._p.append(parse_xml(ET.tostring(math_node, encoding="unicode")))
                    doc.save(temp_docx)

                    opened = word.Documents.Open(str(temp_docx), ReadOnly=True, AddToRecentFiles=False)
                    opened.SaveAs2(str(temp_html), FileFormat=10)
                    opened.Close(False)

                    html_assets = temp_html.with_name(temp_html.stem + "_files")
                    rendered = sorted(html_assets.glob("image*.png"))
                    if len(rendered) != len(chunk):
                        self.stats.warnings.append(
                            f"Equation image count mismatch in render chunk {start + 1}-{start + len(chunk)}: "
                            f"OMML={len(chunk)} rendered={len(rendered)}"
                        )

                    for (absolute_index, _math_node), source in zip(chunk, rendered):
                        target = self.assets_dir / f"eq-{absolute_index:06d}.png"
                        shutil.copyfile(source, target)
                        self.equation_asset_map[absolute_index] = f"{self.asset_ref_prefix}/{target.name}"
        finally:
            word.Quit()
        self.stats.equation_images = len(self.equation_asset_map)
        self.stats.skipped_empty_equations = len(self.empty_equation_indexes)

    def render_children(self, node: ET.Element, top_level: bool = False) -> str:
        """Render every child block of *node* and join the non-empty results.

        Top-level blocks are separated by a blank line and keep their leading
        whitespace; nested blocks are fully stripped and separated by a
        single newline.
        """
        parts: list[str] = []
        for child in list(node):
            rendered = self.render_block(child)
            if not rendered:
                continue
            if top_level:
                parts.append(rendered.rstrip())
            else:
                parts.append(rendered.strip())
        sep = "\n\n" if top_level else "\n"
        return sep.join(part for part in parts if part)

    def render_block(self, node: ET.Element) -> str:
        """Render one block-level element by its local tag name.

        Paragraphs and tables go to their renderers, ``sdt`` wrappers render
        their ``w:sdtContent``, bookkeeping elements render as nothing, and
        any other element is rendered through its children.
        """
        name = local_name(node.tag)
        if name == "p":
            return self.render_paragraph(node)
        if name == "tbl":
            return self.render_table(node)
        if name == "sdt":
            content = node.find("./w:sdtContent", NS)
            return self.render_children(content, top_level=True) if content is not None else ""
        if name in {"bookmarkStart", "bookmarkEnd", "sectPr", "proofErr", "permStart", "permEnd"}:
            return ""
        return self.render_children(node, top_level=False)

    def render_paragraph(self, p: ET.Element) -> str:
        """Render a paragraph as Markdown.

        The first matching form wins: code block, heading, block quote, list
        item, display math, plain paragraph. Paragraphs without visible
        content render as an empty string.
        """
        style = paragraph_style(p)
        content = self.render_inline_children(p)
        if not content.strip():
            return ""
        content = strip_trailing_markdown_breaks(content)

        if is_code_style(style) or (self.table_depth == 0 and paragraph_is_code(p)):
            self.stats.code_blocks += 1
            return self.render_code_paragraph(p)

        level = heading_level(style)
        if level:
            self.stats.headings += 1
            return f"{'#' * level} {self.strip_inline_markers(content)}"

        if is_quote_style(style):
            self.stats.paragraphs += 1
            return f"> {content}"

        num_info = paragraph_num_info(p)
        if not num_info:
            num_info = paragraph_list_style_info(style)
        if num_info:
            self.stats.lists += 1
            list_level, ordered = num_info
            # NOTE(review): one space per nesting level as shown in the
            # reviewed text; confirm the indent width in the repository file.
            indent = " " * list_level
            bullet = "1." if ordered else "-"
            return f"{indent}{bullet} {content}"

        if paragraph_is_math(p):
            # Math stays inline inside tables and when the paragraph has no
            # display-style indent/spacing.
            if self.table_depth > 0:
                return content
            if not paragraph_has_display_math_layout(p):
                return content
            inner = content.strip()
            if inner.startswith("$") and inner.endswith("$") and len(inner) >= 2:
                inner = inner[1:-1]
            return f"$$\n{inner}\n$$"

        self.stats.paragraphs += 1
        return content

    def render_code_paragraph(self, p: ET.Element) -> str:
        """Render a paragraph as a fenced code block.

        If the first run that carries text is bold, its stripped text becomes
        the fence info string instead of code; every later run contributes
        its raw text.
        """
        info = ""
        code_parts: list[str] = []
        first_nonempty_seen = False

        for run in p.findall("./w:r", NS):
            raw = extract_run_text(run)
            if not raw:
                continue
            if not first_nonempty_seen and run_is_bold(run):
                info = raw.strip()
                first_nonempty_seen = True
                continue
            first_nonempty_seen = True
            code_parts.append(raw)

        code = "".join(code_parts).strip("\n")
        fence = f"```{info}".rstrip()
        return f"{fence}\n{code}\n```"

    def render_inline_children(self, node: ET.Element) -> str:
        """Render the inline content of *node* into one Markdown string.

        Hidden runs are not emitted; their text is remembered in
        ``pending_math_source`` and consumed by the next ``render_math``
        call. Hyperlinks become ``[label](url)``, using ``#anchor`` for
        internal anchors and the relationship target otherwise.
        """
        segments: list[tuple[Optional[InlineStyle], str]] = []
        for child in list(node):
            name = local_name(child.tag)
            if name == "r":
                if run_is_hidden(child):
                    source = extract_run_text(child).strip()
                    if source:
                        self.pending_math_source = source
                    continue
                segments.extend(self.render_run_segments(child))
            elif name == "hyperlink":
                label = self.render_inline_children(child).strip()
                anchor = attr(child, "w", "anchor")
                rid = attr(child, "r", "id")
                url = f"#{anchor}" if anchor else self.rels.get(rid or "", "")
                segments.append((None, f"[{label}]({url})" if url else label))
            elif name == "oMath":
                segments.append((None, self.render_math(child, display=False)))
            elif name == "oMathPara":
                segments.append((None, self.render_math(child, display=True)))
            elif name == "drawing":
                img = self.render_image(child)
                if img:
                    segments.append((None, img))
            elif name in {"pPr", "rPr"}:
                continue
            else:
                segments.append((None, self.render_inline_children(child)))
        return coalesce_inline_segments(segments)

    @staticmethod
    def _toggle_enabled(props: ET.Element, tag: str) -> bool:
        """Return True when the on/off property ``w:<tag>`` is switched on.

        The bare element means "on"; ``w:val`` of ``0``, ``false`` or ``off``
        means the formatting is explicitly switched off.
        """
        node = props.find(f"./w:{tag}", NS)
        if node is None:
            return False
        value = attr(node, "w", "val")
        return value is None or value.strip().lower() not in {"0", "false", "off"}

    def run_style(self, run: ET.Element) -> InlineStyle:
        """Return ``(is_code, bold, italic)`` for a run.

        A run is code when its ``w:rStyle`` value contains "code"
        (case-insensitive) or ``run_is_code`` says so. Bold and italic honour
        an explicit off value on ``w:b`` / ``w:i``.
        """
        props = run.find("./w:rPr", NS)
        if props is None:
            return False, False, False
        style_node = props.find("./w:rStyle", NS)
        is_code = (
            style_node is not None and "code" in (attr(style_node, "w", "val") or "").lower()
        ) or run_is_code(run)
        bold = self._toggle_enabled(props, "b")
        italic = self._toggle_enabled(props, "i")
        return is_code, bold, italic

    def render_run_segments(self, run: ET.Element) -> list[tuple[Optional[InlineStyle], str]]:
        """Split a run into ``(style, text)`` segments.

        Consecutive text-like children are buffered and emitted as one
        segment carrying the run's style (or as ``$...$`` with no style for
        math runs). Drawings, footnote references and unknown children flush
        the buffer and are emitted as unstyled segments. Hidden runs yield
        nothing.
        """
        if run_is_hidden(run):
            return []
        style = self.run_style(run)
        is_math = run_is_math(run)
        segments: list[tuple[Optional[InlineStyle], str]] = []
        run_parts: list[str] = []

        def flush_text() -> None:
            if run_parts:
                text = "".join(run_parts)
                if is_math:
                    segments.append((None, f"${text}$"))
                else:
                    segments.append((style, text))
                run_parts.clear()

        for child in list(run):
            name = local_name(child.tag)
            if name == "t":
                run_parts.append(clean_text(child.text or "") if is_math else escape_md_text(child.text or ""))
            elif name == "noBreakHyphen":
                run_parts.append("\u2011")
            elif name == "softHyphen":
                run_parts.append("\u00ad")
            elif name == "tab":
                run_parts.append("\t")
            elif name in {"br", "cr"}:
                # NOTE(review): reproduced as shown in the reviewed text;
                # confirm the number of spaces before the newline.
                run_parts.append(" \n")
            elif name == "drawing":
                flush_text()
                img = self.render_image(child)
                if img:
                    segments.append((None, img))
            elif name == "footnoteReference":
                flush_text()
                note_id = attr(child, "w", "id")
                if note_id and note_id in self.footnotes:
                    self.stats.footnotes += 1
                    if note_id not in self.used_footnotes:
                        self.used_footnotes.append(note_id)
                    segments.append((None, f"[^{note_id}]"))
            elif name == "rPr":
                continue
            else:
                flush_text()
                nested = self.render_inline_children(child)
                if nested:
                    segments.append((None, nested))

        flush_text()
        return segments

    def render_math(self, node: ET.Element, display: bool) -> str:
        """Render one equation node and advance the equation counter.

        Resolution order: TeX source remembered from a preceding hidden run;
        nothing for empty equations; a pre-rendered image in ``image`` mode
        (TeX fallback plus a warning when the asset is missing); otherwise
        OMML converted to TeX. Conversion failures are counted and rendered
        as a ``[equation: ...]`` code span.
        """
        self.stats.equations += 1
        self.equation_index += 1
        if self.pending_math_source:
            source = self.pending_math_source
            self.pending_math_source = None
            return normalize_tex(source, display=display)
        if self.equation_index in self.empty_equation_indexes or self.is_empty_equation(node):
            self.stats.skipped_empty_equations = max(
                self.stats.skipped_empty_equations,
                len(self.empty_equation_indexes),
            )
            return ""
        if self.equation_mode == "image":
            asset = self.equation_asset_map.get(self.equation_index)
            if asset:
                alt = f"equation {self.equation_index}"
                rendered = f"![{alt}]({asset})"
                return f"\n{rendered}\n" if display and self.table_depth == 0 else rendered
            self.stats.warnings.append(f"Missing rendered equation asset for equation {self.equation_index}")
            return self.render_missing_equation_fallback(node)

        try:
            tex = convert_omml(ET.fromstring(ET.tostring(node, encoding="unicode")))
            # Display layout is suppressed inside tables.
            render_display = display and self.table_depth == 0
            rendered = normalize_tex(tex, display=render_display)
            if len(self.equation_samples) < 50:
                self.equation_samples.append({
                    "source": "".join(t.text or "" for t in node.findall(".//m:t", NS))[:220],
                    "tex": rendered[:500],
                })
            return f"\n{rendered}\n" if render_display else rendered
        except Exception as exc:
            self.stats.equation_errors += 1
            self.stats.warnings.append(f"Equation conversion failed: {exc!r}")
            fallback = "".join(t.text or "" for t in node.findall(".//m:t", NS))
            return f"`[equation: {fallback}]`"

    def render_missing_equation_fallback(self, node: ET.Element) -> str:
        """Render inline TeX for an equation whose image is missing.

        Falls back to a ``[equation: ...]`` code span built from the raw
        ``m:t`` text when conversion fails.
        """
        try:
            tex = convert_omml(ET.fromstring(ET.tostring(node, encoding="unicode")))
            return normalize_tex(tex, display=False)
        except Exception:
            fallback = "".join(t.text or "" for t in node.findall(".//m:t", NS))
            return f"`[equation: {fallback}]`"

    @staticmethod
    def is_empty_equation(node: ET.Element) -> bool:
        """Return True when the equation has no non-whitespace ``m:t`` text."""
        text = "".join(t.text or "" for t in node.findall(".//m:t", NS)).strip()
        return not text

    def render_image(self, node: ET.Element) -> str:
        """Render a drawing as one ``![image](...)`` reference per ``a:blip``.

        In ``assets`` mode, when the drawing's metadata names a file, the
        copied media file is duplicated under that name (unless a file with
        that name already exists) and the reference points at the duplicate.
        Blips whose media was not copied produce a warning.
        """
        preferred_name = image_metadata_filename(node)
        refs = []
        for blip in node.findall(".//a:blip", NS):
            rid = attr(blip, "r", "embed") or attr(blip, "r", "link")
            if rid:
                refs.append(rid)
        rendered = []
        for rid in refs:
            target = self.rels.get(rid, rid)
            source = resolve_image_target(target)
            asset = self.media_map.get(source)
            if not asset:
                self.stats.warnings.append(f"Image relationship not copied: {rid} -> {target}")
                continue
            if preferred_name and self.image_mode == "assets":
                # _copy_media writes into assets_dir, which need not be a
                # direct child of output_dir, so locate the file from there.
                current_path = self.assets_dir / Path(asset).name
                preferred_path = self.assets_dir / preferred_name
                if current_path.exists() and preferred_path != current_path and not preferred_path.exists():
                    shutil.copyfile(current_path, preferred_path)
                    asset = f"{self.asset_ref_prefix}/{preferred_path.name}"
            self.stats.images += 1
            rendered.append(f"![image]({asset})")
        return " ".join(rendered)

    def render_table(self, tbl: ET.Element) -> str:
        """Render a table as a Markdown pipe table or an HTML table.

        The HTML form is used when any cell contains a nested table or a
        blank line; otherwise a pipe table is produced. Tables without rows
        render as an empty string.
        """
        self.stats.tables += 1
        self.table_depth += 1
        try:
            rows = tbl.findall("./w:tr", NS)
            rendered_rows = []
            complex_table = False
            for row in rows:
                cells = row.findall("./w:tc", NS)
                rendered_cells = []
                for cell in cells:
                    if cell.find(".//w:tbl", NS) is not None:
                        complex_table = True
                    cell_text = self.render_cell(cell)
                    if "\n\n" in cell_text:
                        complex_table = True
                    rendered_cells.append(cell_text)
                rendered_rows.append(rendered_cells)
        finally:
            self.table_depth -= 1

        if not rendered_rows:
            return ""
        if complex_table:
            self.stats.html_tables += 1
            return self.render_html_table(rendered_rows)
        self.stats.markdown_tables += 1
        return self.render_markdown_table(rendered_rows)

    def render_cell(self, cell: ET.Element) -> str:
        """Render a table cell, joining its non-empty blocks with ``<br>``."""
        parts = []
        for child in list(cell):
            if local_name(child.tag) == "tcPr":
                continue
            rendered = self.render_block(child)
            if rendered:
                parts.append(rendered.strip())
        return "<br>".join(parts).strip()

    def render_markdown_table(self, rows: list[list[str]]) -> str:
        """Render *rows* as a pipe table whose first row is the header.

        Short rows are padded with empty cells; newlines inside a cell become
        ``<br>`` and pipe characters are escaped. *rows* must not be empty.
        """
        width = max(len(row) for row in rows)
        padded = [row + [""] * (width - len(row)) for row in rows]

        def clean_cell(value: str) -> str:
            return value.replace("\n", "<br>").replace("|", "\\|").strip()

        lines = []
        lines.append("| " + " | ".join(clean_cell(v) for v in padded[0]) + " |")
        lines.append("| " + " | ".join("---" for _ in range(width)) + " |")
        for row in padded[1:]:
            lines.append("| " + " | ".join(clean_cell(v) for v in row) + " |")
        return "\n".join(lines)

    def render_html_table(self, rows: list[list[str]]) -> str:
        """Render *rows* as an HTML ``<table>``.

        Cell text is HTML-escaped with quotes left alone. The ``<br>``
        separators that ``render_cell`` puts between blocks are restored
        after escaping so they remain line breaks, and newlines become
        ``<br>`` as well.
        """
        html_rows = ["<table>"]
        for row in rows:
            html_rows.append(" <tr>")
            for cell in row:
                # Keep inline Markdown-ish math readable inside HTML fallback.
                escaped = html.escape(cell, quote=False)
                escaped = escaped.replace("&lt;br&gt;", "<br>").replace("\n", "<br>")
                html_rows.append(f" <td>{escaped}</td>")
            html_rows.append(" </tr>")
        html_rows.append("</table>")
        return "\n".join(html_rows)

    @staticmethod
    def strip_inline_markers(text: str) -> str:
        """Collapse newlines to spaces and strip surrounding whitespace."""
        return text.replace("\n", " ").strip()
1291
+
1292
+
1293
def export_one(
    input_path: Path,
    output_root: Path,
    equation_mode: str,
    out_same_dir: bool,
    image_mode: str,
    s3_config: S3ImageConfig | None = None,
    emit_frontmatter: bool = True,
) -> dict:
    """Export a single input file and return the exporter's report.

    With *out_same_dir* the Markdown, ``<stem>.assets`` directory and
    ``<stem>.export-report.json`` are placed beside the input. Otherwise the
    outputs go to ``<output_root>/<stem>/`` using the exporter's default
    file names.
    """
    if out_same_dir:
        output_dir = input_path.parent
        layout = {
            "output_md": input_path.with_suffix(".md"),
            "assets_dir": input_path.with_name(input_path.stem + ".assets"),
            "report_path": input_path.with_name(input_path.stem + ".export-report.json"),
        }
    else:
        output_dir = output_root / input_path.stem
        # None lets the exporter derive each location from output_dir.
        layout = {"output_md": None, "assets_dir": None, "report_path": None}

    return BuildCorpusExporter(
        input_path,
        output_dir,
        equation_mode=equation_mode,
        image_mode=image_mode,
        s3_config=s3_config,
        emit_frontmatter=emit_frontmatter,
        **layout,
    ).export()
1324
+
1325
+
1326
def collect_inputs(path: Path, target: str) -> list[Path]:
    """Return the input files denoted by *path*, sorted.

    A file path is returned as-is. A directory is searched recursively for
    ``*.md`` when *target* is ``"word"``, and for ``*.docx``, ``*.pptx`` and
    ``*.ppt`` otherwise.

    During the directory search, Office owner/lock files (names starting with
    ``~$``) and non-file entries are skipped: lock files are not real
    documents and would fail conversion. A file named explicitly is never
    filtered.
    """
    if path.is_file():
        return [path]
    patterns = ("*.md",) if target == "word" else ("*.docx", "*.pptx", "*.ppt")
    found = [
        candidate
        for pattern in patterns
        for candidate in path.rglob(pattern)
        if candidate.is_file() and not candidate.name.startswith("~$")
    ]
    return sorted(found)
1334
+
1335
+
1336
def collect_many_inputs(paths: list[Path], target: str) -> list[Path]:
    """Collect inputs for several paths, de-duplicated and sorted.

    Two entries count as the same file when their resolved paths are equal;
    the spelling encountered last is the one that is kept.
    """
    by_resolved: dict[str, Path] = {}
    for root in paths:
        for found in collect_inputs(root, target):
            by_resolved[str(found.resolve())] = found
    return sorted(by_resolved.values())
1344
+
1345
+
1346
def move_processed_source(input_path: Path) -> Path:
    """Move a processed source file into ``_originals`` beside it.

    The file keeps its name when that name is free. On a collision the name
    becomes ``<stem>-<hash><suffix>``, where the hash is the first 8 hex
    digits of the SHA-1 of the source path. Because that name is the same
    every time for a given path, a numeric suffix is added while the name is
    still taken, so an earlier backup is never overwritten.

    Returns:
        The path the file was moved to.
    """
    source_dir = input_path.parent / "_originals"
    source_dir.mkdir(exist_ok=True)
    target = source_dir / input_path.name
    if target.exists():
        digest = hashlib.sha1(str(input_path).encode("utf-8")).hexdigest()[:8]
        target = source_dir / f"{input_path.stem}-{digest}{input_path.suffix}"
        attempt = 1
        while target.exists():
            attempt += 1
            target = source_dir / f"{input_path.stem}-{digest}-{attempt}{input_path.suffix}"
    shutil.move(str(input_path), str(target))
    return target
1354
+
1355
+
1356
def _build_cli_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the build-corpus command line."""
    parser = argparse.ArgumentParser(
        description="Convert Markdown to DOCX or DOCX/PPTX/PPT to Markdown.",
        # Raw formatter keeps the hand-aligned epilog below exactly as written.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""examples:
  build-corpus input.docx --out out
  build-corpus input.md --to word --out out
  build-corpus ./word-files --out ./markdown
  build-corpus ./word-files --out-same-dir
  build-corpus input.docx --images base64
  build-corpus input.docx --images s3 --config build-corpus.config.json

image modes:
  assets  copy images into an assets folder and reference them from Markdown
  base64  embed images directly as Markdown data URIs
  s3      upload images to S3-compatible storage such as Cloudflare R2 or AWS S3

equation modes:
  tex     convert Word OMML equations to KaTeX-readable TeX
  image   render equations as images for visual debugging only
""",
    )
    parser.add_argument("input", type=Path, nargs="+", help="Markdown, DOCX, PPTX, or PPT file/directory; multiple paths are allowed")
    parser.add_argument("--config", type=Path, help="JSON config file with conversion, output, and S3/R2 defaults")
    parser.add_argument("--out", type=Path, help="Output directory for converted Markdown tree")
    parser.add_argument("--to", choices=["auto", "markdown", "word"], help="Output target; auto infers from a single-file input")
    parser.add_argument("--equations", choices=["tex", "image"], help="Equation output mode; default comes from config or tex")
    parser.add_argument("--images", choices=["assets", "base64", "s3"], help="Image output mode; default comes from config or assets")
    parser.add_argument("--out-same-dir", action="store_true", help="Write .md, .assets, and reports beside each source DOCX")
    parser.add_argument("--word-template", type=Path, help="Optional .docx or .dotx template used for Markdown to Word exports")
    parser.add_argument("--s3-bucket", help="S3/R2 bucket name for --images s3")
    parser.add_argument("--s3-public-base-url", help="Public URL base used in Markdown, e.g. https://assets.example.com")
    parser.add_argument("--s3-prefix", help="Object key prefix for uploaded images")
    parser.add_argument("--s3-endpoint-url", help="S3-compatible endpoint, required for Cloudflare R2")
    parser.add_argument("--s3-region", help="S3 region; use auto for Cloudflare R2")
    parser.add_argument("--s3-access-key-id", help="S3/R2 access key id; can also come from config/env expansion")
    parser.add_argument("--s3-secret-access-key", help="S3/R2 secret access key; can also come from config/env expansion")
    parser.add_argument("--s3-cache-control", help="Cache-Control header for uploaded images")
    parser.add_argument("--s3-acl", help="Optional ACL for AWS S3; usually omitted for Cloudflare R2")
    parser.add_argument("--move-sources", action="store_true", help="After successful DOCX/PPT conversion, move processed source files into an _originals folder beside each file")
    parser.add_argument("--inline-images", action="store_true", help="Create Markdown with local or HTTP image references embedded as data URIs")
    parser.add_argument("--no-frontmatter", action="store_true", help="Do not emit MDK YAML frontmatter on generated Markdown (frontmatter is emitted by default and round-trips through docProps/custom.xml)")
    return parser


def main() -> None:
    """Run the build-corpus command line.

    Parses ``sys.argv``, loads the optional JSON config, and fills every
    option the user did not pass on the command line from the config (or a
    built-in default). It then runs one of two modes:

    * ``--inline-images``: every collected Markdown file is passed to
      ``inline_markdown_images`` and the reports are written to
      ``build-corpus-inline-report.json``.
    * otherwise: each collected input is converted (Markdown to DOCX,
      presentation to Markdown, or DOCX to Markdown) and the reports are
      written to ``build-corpus-batch-report.json``.

    In both modes a JSON summary is printed to stdout.
    """
    args = _build_cli_parser().parse_args()
    config = load_config(args.config)

    # Command-line values win; config values and hard-coded defaults fill gaps.
    args.out = args.out or Path(config_get(config, "output.out", ".codex/build-corpus/out"))
    args.to = args.to or config_get(config, "conversion.target", "auto")
    args.equations = args.equations or config_get(config, "conversion.equations", "tex")
    args.images = args.images or config_get(config, "conversion.images", "assets")
    args.out_same_dir = args.out_same_dir or bool(config_get(config, "output.out_same_dir", False))
    if not args.word_template:
        configured_template = config_get(config, "word.template")
        args.word_template = Path(configured_template) if configured_template else None
    s3_config = build_s3_config(config, args)
    emit_frontmatter = not args.no_frontmatter

    # One shared definition of "Markdown source" so target detection, the
    # inline-images filter, and the per-file dispatch below all agree.
    markdown_suffixes = {".md", ".markdown"}

    single_input = len(args.input) == 1
    first_input = args.input[0]
    if args.inline_images:
        input_target = "inline-images"
    elif args.to != "auto":
        input_target = args.to
    elif single_input and first_input.is_file():
        # A lone Markdown file converts to Word; any other lone file to Markdown.
        input_target = "word" if first_input.suffix.lower() in markdown_suffixes else "markdown"
    else:
        # Directories and multi-path invocations default to Markdown output.
        input_target = "markdown"

    if input_target == "inline-images":
        reports = [
            inline_markdown_images(input_path)
            for input_path in collect_many_inputs(args.input, "word")
            if input_path.suffix.lower() in markdown_suffixes
        ]
        # A single file keeps its report beside it; anything else uses --out.
        batch_report_root = first_input.parent if single_input and first_input.is_file() else args.out
        batch_report_root.mkdir(parents=True, exist_ok=True)
        batch_report = batch_report_root / "build-corpus-inline-report.json"
        batch_report.write_text(json.dumps(reports, indent=2, ensure_ascii=False), encoding="utf-8")
        print(json.dumps({
            "converted": len(reports),
            "batch_report": str(batch_report),
            "outputs": [report["output"] for report in reports],
        }, indent=2))
        return

    reports = []
    for input_path in collect_many_inputs(args.input, input_target):
        # Skip "~$"-prefixed names (presumably Office lock/owner files).
        if input_path.name.startswith("~$"):
            continue
        suffix = input_path.suffix.lower()
        if input_target == "word" or suffix in markdown_suffixes:
            report = export_markdown_to_docx(
                input_path,
                args.out,
                args.out_same_dir,
                template_path=args.word_template,
            )
        elif suffix in {".pptx", ".ppt"}:
            report = export_presentation(
                input_path,
                args.out,
                args.out_same_dir,
                image_mode=args.images,
                emit_frontmatter=emit_frontmatter,
            )
        else:
            report = export_one(
                input_path,
                args.out,
                equation_mode=args.equations,
                out_same_dir=args.out_same_dir,
                image_mode=args.images,
                s3_config=s3_config,
                emit_frontmatter=emit_frontmatter,
            )
        # Sources are moved only after their conversion call has returned.
        if args.move_sources and suffix in {".docx", ".pptx", ".ppt"}:
            report["moved_source"] = str(move_processed_source(input_path))
        reports.append(report)

    # With --out-same-dir on a single directory the report lives in that
    # directory; every other combination writes it under --out.
    batch_report_root = first_input if args.out_same_dir and single_input and first_input.is_dir() else args.out
    batch_report_root.mkdir(parents=True, exist_ok=True)
    batch_report = batch_report_root / "build-corpus-batch-report.json"
    batch_report.write_text(json.dumps(reports, indent=2, ensure_ascii=False), encoding="utf-8")
    # Only an explicit False counts as a failure; reports without the key pass.
    fidelity_failures = [report["output"] for report in reports if report.get("fidelity_ok") is False]
    print(json.dumps({
        "converted": len(reports),
        "batch_report": str(batch_report),
        "outputs": [report["output"] for report in reports],
        "all_fidelity_ok": len(fidelity_failures) == 0,
        "fidelity_failures": fidelity_failures,
        "default_word_template": str(args.word_template or resolve_default_template_path() or "bundled:md-to-word-template.dotx"),
    }, indent=2))


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()