npm - regen.mde - Versions diffs - 0.2.2 → 0.8.0 - Mend

regen.mde 0.2.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (56) hide show

package/LICENSE +16 -16
package/README.md +409 -295
package/bin/build-corpus-editor.js +83 -81
package/bin/build-corpus.js +41 -41
package/bin/postinstall.js +259 -187
package/bin/regen-mdeditor-install.js +27 -27
package/bin/regen-mdeditor-uninstall.js +19 -19
package/bin/validate-katex.js +93 -93
package/desktop/BuildCorpusEditor/BuildCorpusBridge.cs +493 -270
package/desktop/BuildCorpusEditor/BuildCorpusEditor.csproj +22 -22
package/desktop/BuildCorpusEditor/EditorForm.cs +853 -540
package/desktop/BuildCorpusEditor/Program.cs +85 -81
package/desktop/BuildCorpusEditor/app.manifest +16 -16
package/dist/release/regen-mde-0.8.0-win-x64.zip +0 -0
package/dist/windows-editor/BuildCorpusEditor.dll +0 -0
package/dist/windows-editor/BuildCorpusEditor.exe +0 -0
package/dist/windows-editor/BuildCorpusEditor.pdb +0 -0
package/dist/windows-editor/BuildCorpusEditor.runtimeconfig.json +1 -1
package/dist/windows-editor/wwwroot/assets/index-C_VxJk4k.js +375 -0
package/dist/windows-editor/wwwroot/assets/index-Wt9zSjIw.css +1 -0
package/dist/windows-editor/wwwroot/index.html +22 -22
package/editor-web/index.html +21 -21
package/editor-web/src/main.jsx +1044 -399
package/editor-web/src/styles.css +846 -602
package/editor-web/vite.config.js +13 -13
package/examples/build-corpus.config.example.json +21 -21
package/installer/install-regen-mde.ps1 +214 -175
package/installer/regen-mde.nsi +81 -81
package/package.json +10 -6
package/pyproject.toml +4 -3
package/requirements.txt +5 -4
package/scripts/build-windows-editor.ps1 +47 -47
package/scripts/package-windows-editor.ps1 +90 -90
package/scripts/release-dual.mjs +105 -0
package/scripts/run-corpus.ps1 +28 -28
package/scripts/run-editor-implementation-plane.ps1 +226 -203
package/scripts/run-required-tests.ps1 +98 -98
package/scripts/run-smoke.ps1 +28 -28
package/src/build_corpus/__init__.py +1 -1
package/src/build_corpus/docx_exporter.py +1055 -798
package/src/build_corpus/equations.py +1345 -0
package/src/build_corpus/exporter.py +1488 -1195
package/src/build_corpus/frontmatter.py +302 -0
package/src/build_corpus/ppt_exporter.py +543 -532
package/src/build_corpus/templates/__init__.py +1 -1
package/src/build_corpus/validate_assets.py +46 -46
package/tools/audit_corpus.py +203 -203
package/tools/collect_microsoft_word_templates.py +228 -228
package/tools/collect_online_docx_corpus.py +272 -272
package/tools/collect_online_pptx_corpus.py +252 -252
package/tools/compare_pptx_inputs_outputs.py +87 -87
package/tools/roundtrip_docx_corpus.py +171 -171
package/dist/release/regen.mde-0.2.2-win-x64-setup.exe +0 -0
package/dist/release/regen.mde-0.2.2-win-x64.zip +0 -0
package/dist/windows-editor/wwwroot/assets/index-DjJ6xmhy.js +0 -326
package/dist/windows-editor/wwwroot/assets/index-_dwMNNsm.css +0 -1

package/src/build_corpus/exporter.py CHANGED Viewed

@@ -1,1195 +1,1488 @@
-from __future__ import annotations
-import argparse
-import base64
-import contextlib
-import hashlib
-import html
-import json
-import mimetypes
-import os
-import re
-import shutil
-import subprocess
-import tempfile
-from dataclasses import dataclass, field
-from pathlib import Path
-from typing import Optional
-from zipfile import ZipFile
-from xml.etree import ElementTree as ET
-from omml2latex import convert_omml
-try:
-    from .docx_exporter import export_markdown_to_docx, resolve_default_template_path
-except ImportError:  # pragma: no cover - allows direct script execution
-    from build_corpus.docx_exporter import export_markdown_to_docx, resolve_default_template_path
-try:
-    from .ppt_exporter import export_presentation
-except ImportError:  # pragma: no cover - allows direct script execution
-    from build_corpus.ppt_exporter import export_presentation
-NS = {
-    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
-    "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
-    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
-    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
-    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
-}
-W = f"{{{NS['w']}}}"
-R = f"{{{NS['r']}}}"
-@dataclass
-class ExportStats:
-    paragraphs: int = 0
-    headings: int = 0
-    code_blocks: int = 0
-    tables: int = 0
-    markdown_tables: int = 0
-    html_tables: int = 0
-    equations: int = 0
-    equation_images: int = 0
-    skipped_empty_equations: int = 0
-    equation_errors: int = 0
-    images: int = 0
-    lists: int = 0
-    warnings: list[str] = field(default_factory=list)
-@dataclass
-class S3ImageConfig:
-    bucket: str
-    public_base_url: str
-    prefix: str = ""
-    endpoint_url: str | None = None
-    region_name: str | None = None
-    access_key_id: str | None = None
-    secret_access_key: str | None = None
-    cache_control: str = "public, max-age=31536000, immutable"
-    acl: str | None = None
-class S3ImageUploader:
-    def __init__(self, config: S3ImageConfig):
-        self.config = config
-        try:
-            import boto3
-        except ImportError as exc:
-            raise RuntimeError("S3/R2 image mode requires boto3. Install with: pip install boto3") from exc
-        kwargs = {
-            "service_name": "s3",
-            "endpoint_url": config.endpoint_url,
-            "region_name": config.region_name,
-            "aws_access_key_id": config.access_key_id,
-            "aws_secret_access_key": config.secret_access_key,
-        }
-        self.client = boto3.client(**{key: value for key, value in kwargs.items() if value})
-    def upload(self, source_name: str, data: bytes, content_type: str) -> dict[str, str]:
-        digest = hashlib.sha256(data).hexdigest()
-        suffix = Path(source_name).suffix.lower()
-        key_parts = [self.config.prefix.strip("/"), "images", "sha256", f"{digest}{suffix}"]
-        key = "/".join(part for part in key_parts if part)
-        put_args = {
-            "Bucket": self.config.bucket,
-            "Key": key,
-            "Body": data,
-            "ContentType": content_type,
-            "CacheControl": self.config.cache_control,
-        }
-        if self.config.acl:
-            put_args["ACL"] = self.config.acl
-        self.client.put_object(**put_args)
-        return {
-            "source": source_name,
-            "sha256": digest,
-            "bucket": self.config.bucket,
-            "key": key,
-            "url": f"{self.config.public_base_url.rstrip('/')}/{key}",
-            "content_type": content_type,
-            "bytes": str(len(data)),
-        }
-def local_name(tag: str) -> str:
-    return tag.rsplit("}", 1)[-1] if "}" in tag else tag
-def attr(node: ET.Element, ns: str, name: str) -> str | None:
-    return node.attrib.get(f"{{{NS[ns]}}}{name}")
-def clean_text(text: str) -> str:
-    return (
-        text.replace("\u00a0", " ")
-        .replace("\u200b", "")
-        .replace("\ufeff", "")
-    )
-def escape_md_text(text: str) -> str:
-    text = clean_text(text)
-    escaped: list[str] = []
-    index = 0
-    while index < len(text):
-        char = text[index]
-        if char == "\\":
-            next_char = text[index + 1] if index + 1 < len(text) else ""
-            if next_char in "\\`*_{}[]()#+.!|$-":
-                escaped.append("\\")
-                escaped.append(next_char)
-                index += 2
-                continue
-            escaped.append("\\\\")
-        elif char in {"*", "_", "$"}:
-            escaped.append("\\" + char)
-        else:
-            escaped.append(char)
-        index += 1
-    return "".join(escaped)
-def normalize_inline_markers(text: str) -> str:
-    # Ensure inline images do not glue themselves to adjacent text.
-    text = re.sub(r"(\!\[[^\]]*\]\([^)]+\))(?=[^\s<>)\].,;:!?])", r"\1 ", text)
-    text = re.sub(r"(?<=[^\s<(\[.,;:!?])(\!\[[^\]]*\]\([^)]+\))", r" \1", text)
-    return text
-def strip_trailing_markdown_breaks(text: str) -> str:
-    while text.endswith("  "):
-        text = text[:-2]
-    return text.rstrip()
-InlineStyle = tuple[bool, bool, bool]
-def apply_inline_style(text: str, style: InlineStyle) -> str:
-    is_code, bold, italic = style
-    if not text:
-        return ""
-    if not text.strip():
-        return text
-    if is_code:
-        return f"`{text.replace('`', '\\`')}`"
-    if bold and italic:
-        return f"***{text}***"
-    if bold:
-        return f"**{text}**"
-    if italic:
-        return f"*{text}*"
-    return text
-def coalesce_inline_segments(segments: list[tuple[Optional[InlineStyle], str]]) -> str:
-    parts: list[str] = []
-    buffer: list[str] = []
-    buffer_style: Optional[InlineStyle] = None
-    def flush() -> None:
-        nonlocal buffer_style
-        if not buffer:
-            return
-        parts.append(apply_inline_style("".join(buffer), buffer_style or (False, False, False)))
-        buffer.clear()
-        buffer_style = None
-    for style, text in segments:
-        if not text:
-            continue
-        if style is None:
-            flush()
-            parts.append(text)
-            continue
-        if buffer_style == style:
-            buffer.append(text)
-            continue
-        flush()
-        buffer_style = style
-        buffer.append(text)
-    flush()
-    return normalize_inline_markers("".join(parts))
-def normalize_tex(tex: str, display: bool) -> str:
-    tex = clean_text(tex).strip()
-    if tex.startswith("$$") and tex.endswith("$$"):
-        tex = tex[2:-2].strip()
-    elif tex.startswith("$") and tex.endswith("$"):
-        tex = tex[1:-1].strip()
-    tex = tex.replace("\u2011", "-")
-    tex = tex.replace("$", r"\$")
-    tex = tex.replace(r"\text{ }", r"\,")
-    tex = tex.replace(r"\text{  }", r"\;")
-    tex = tex.replace(r"\text{  }", " ")
-    tex = tex.replace(r"\mathrm{\}\text{*}}", r"\*")
-    tex = tex.replace(r"\text{-}", "-")
-    tex = tex.replace(r"\*", "*")
-    tex = replace_raw_unicode_math(tex)
-    tex = strip_word_equation_field_codes(tex)
-    tex = escape_text_macro_underscores(tex)
-    tex = repair_underbrace_limits(tex)
-    tex = balance_tex_braces(tex)
-    tex = re.sub(r"\s+", " ", tex).strip()
-    return f"$$\n{tex}\n$$" if display else f"${tex}$"
-UNICODE_MATH_REPLACEMENTS = {
-    "∸": r"\dot{-}",
-    "⨅": r"\sqcap",
-    "⨃": r"\bigcup",
-    "⋜": r"\lessgtr",
-    "⋝": r"\gtrless",
-    "∱": r"\oint",
-    "∲": r"\oint",
-    "∳": r"\oint",
-    "ℇ": r"\varepsilon",
-    "Ϝ": r"\digamma",
-    "℩": r"\iota",
-    "Å": r"\mathring{A}",
-    "℮": "e",
-}
-def replace_raw_unicode_math(tex: str) -> str:
-    for raw, replacement in UNICODE_MATH_REPLACEMENTS.items():
-        tex = tex.replace(raw, replacement)
-    return tex
-def strip_word_equation_field_codes(tex: str) -> str:
-    # Word SEQ fields can leak into OMML conversion as equation-number text.
-    patterns = [
-        r"#\s*\\left\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\\right\)",
-        r"#\s*\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\)",
-        r"#\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+",
-    ]
-    for pattern in patterns:
-        tex = re.sub(pattern, "", tex, flags=re.IGNORECASE)
-    return tex
-def escape_text_macro_underscores(tex: str) -> str:
-    def replace(match: re.Match[str]) -> str:
-        body = match.group(1)
-        body = body.replace("\\", r"\textbackslash{}")
-        body = body.replace("_", r"\_")
-        body = body.replace("&", r"\&")
-        body = body.replace("%", r"\%")
-        body = body.replace("#", r"\#")
-        return r"\text{" + body + "}"
-    return re.sub(r"\\text\{([^{}]*)\}", replace, tex)
-def repair_underbrace_limits(tex: str) -> str:
-    pattern = re.compile(
-        r"\\mathop\{\\mathop\{(?P<base>.*?)\}\\limits_\{\s*\\underbrace\s*\}\}\\limits_\{(?P<label>.*?)\}"
-        r"(?=(?:[+\-]|\\cdot|\\times|=|,|;|$))",
-        re.DOTALL,
-    )
-    previous = None
-    while previous != tex:
-        previous = tex
-        tex = pattern.sub(r"\\underbrace{\g<base>}_{\g<label>}", tex)
-    return tex
-def balance_tex_braces(tex: str) -> str:
-    balanced: list[str] = []
-    depth = 0
-    escaped = False
-    for char in tex:
-        if escaped:
-            balanced.append(char)
-            escaped = False
-            continue
-        if char == "\\":
-            balanced.append(char)
-            escaped = True
-            continue
-        if char == "{":
-            depth += 1
-            balanced.append(char)
-        elif char == "}":
-            if depth > 0:
-                depth -= 1
-                balanced.append(char)
-            # Drop unmatched closing braces; KaTeX rejects them.
-        else:
-            balanced.append(char)
-    if depth > 0:
-        balanced.extend("}" for _ in range(depth))
-    return "".join(balanced)
-def paragraph_style(node: ET.Element) -> str | None:
-    style = node.find("./w:pPr/w:pStyle", NS)
-    return attr(style, "w", "val") if style is not None else None
-def heading_level(style: str | None) -> int | None:
-    if not style:
-        return None
-    match = re.fullmatch(r"Heading([1-6])", style)
-    if match:
-        return int(match.group(1))
-    return None
-def is_code_style(style: str | None) -> bool:
-    return bool(style and "code" in style.lower())
-def is_quote_style(style: str | None) -> bool:
-    if not style:
-        return False
-    normalized = style.replace(" ", "").lower()
-    return normalized in {"buildcorpusquote", "quote", "intensequote"}
-def paragraph_num_info(node: ET.Element) -> tuple[int, bool] | None:
-    num_pr = node.find("./w:pPr/w:numPr", NS)
-    if num_pr is None:
-        return None
-    ilvl = num_pr.find("./w:ilvl", NS)
-    level = int(attr(ilvl, "w", "val") or "0") if ilvl is not None else 0
-    # Without numbering.xml style resolution, use bullets as the safer default.
-    return level, False
-def paragraph_list_style_info(style: str | None) -> tuple[int, bool] | None:
-    if not style:
-        return None
-    normalized = style.replace(" ", "").lower()
-    if normalized.startswith("listbullet"):
-        suffix = normalized.removeprefix("listbullet")
-        level = int(suffix) if suffix.isdigit() else 1
-        return max(level - 1, 0), False
-    if normalized.startswith("listnumber"):
-        suffix = normalized.removeprefix("listnumber")
-        level = int(suffix) if suffix.isdigit() else 1
-        return max(level - 1, 0), True
-    return None
-def run_is_math(run: ET.Element) -> bool:
-    props = run.find("./w:rPr", NS)
-    if props is None:
-        return False
-    fonts = props.find("./w:rFonts", NS)
-    if fonts is None:
-        return False
-    for attr_name in ("ascii", "hAnsi", "cs"):
-        value = attr(fonts, "w", attr_name)
-        if value and value.lower() == "cambria math":
-            return True
-    return False
-def run_is_code(run: ET.Element) -> bool:
-    props = run.find("./w:rPr", NS)
-    if props is None:
-        return False
-    style_node = props.find("./w:rStyle", NS)
-    if style_node is not None and "code" in (attr(style_node, "w", "val") or "").lower():
-        return True
-    fonts = props.find("./w:rFonts", NS)
-    if fonts is None:
-        return False
-    for attr_name in ("ascii", "hAnsi", "cs"):
-        value = attr(fonts, "w", attr_name)
-        if value and value.lower() == "consolas":
-            return True
-    return False
-def run_is_bold(run: ET.Element) -> bool:
-    props = run.find("./w:rPr", NS)
-    return props is not None and props.find("./w:b", NS) is not None
-def paragraph_is_code(node: ET.Element) -> bool:
-    runs = node.findall("./w:r", NS)
-    if not runs:
-        return False
-    first_nonempty_seen = False
-    code_like_runs = 0
-    meaningful_runs = 0
-    for run in runs:
-        text = extract_run_text(run)
-        if not text or not text.strip():
-            continue
-        meaningful_runs += 1
-        if not first_nonempty_seen and run_is_bold(run):
-            first_nonempty_seen = True
-            continue
-        first_nonempty_seen = True
-        if run_is_code(run):
-            code_like_runs += 1
-            continue
-        return False
-    return meaningful_runs > 0 and code_like_runs > 0
-def extract_run_text(run: ET.Element) -> str:
-    parts: list[str] = []
-    for child in list(run):
-        name = local_name(child.tag)
-        if name == "t":
-            parts.append(clean_text(child.text or ""))
-        elif name == "tab":
-            parts.append("\t")
-        elif name in {"br", "cr"}:
-            parts.append("\n")
-    return "".join(parts)
-def paragraph_is_math(node: ET.Element) -> bool:
-    runs = node.findall("./w:r", NS)
-    math_runs = 0
-    text_runs = 0
-    for run in runs:
-        texts = [t.text or "" for t in run.findall("./w:t", NS)]
-        if not any(segment.strip() for segment in texts):
-            continue
-        text_runs += 1
-        if run_is_math(run):
-            math_runs += 1
-    return text_runs > 0 and text_runs == math_runs
-def paragraph_has_display_math_layout(node: ET.Element) -> bool:
-    indent = node.find("./w:pPr/w:ind", NS)
-    spacing = node.find("./w:pPr/w:spacing", NS)
-    if indent is not None and any(attr(indent, "w", key) not in {None, "0"} for key in ("left", "right", "firstLine", "hanging")):
-        return True
-    if spacing is not None and any(attr(spacing, "w", key) not in {None, "0"} for key in ("before", "after")):
-        return True
-    return False
-def relationship_map(zip_file: ZipFile, part: str = "word/document.xml") -> dict[str, str]:
-    rels_path = str(Path(part).parent / "_rels" / (Path(part).name + ".rels")).replace("\\", "/")
-    if rels_path not in zip_file.namelist():
-        return {}
-    root = ET.fromstring(zip_file.read(rels_path))
-    return {
-        rel.attrib["Id"]: rel.attrib.get("Target", "")
-        for rel in root
-        if "Id" in rel.attrib
-    }
-def resolve_image_target(target: str) -> str:
-    if target.startswith("../"):
-        target = target[3:]
-    if not target.startswith("word/"):
-        target = f"word/{target}"
-    return target
-def image_metadata_filename(node: ET.Element) -> str | None:
-    for tag_name in ("docPr", "cNvPr"):
-        for entry in node.findall(f".//wp:{tag_name}", NS):
-            for key in ("descr", "title", "name"):
-                value = entry.attrib.get(key)
-                if value and Path(value).suffix:
-                    return Path(value).name
-    return None
-def expand_env(value):
-    if isinstance(value, str):
-        return os.path.expandvars(value)
-    if isinstance(value, dict):
-        return {key: expand_env(item) for key, item in value.items()}
-    if isinstance(value, list):
-        return [expand_env(item) for item in value]
-    return value
-def load_config(path: Path | None) -> dict:
-    if path is None:
-        return {}
-    if not path.exists():
-        raise FileNotFoundError(f"Config file not found: {path}")
-    if path.suffix.lower() != ".json":
-        raise ValueError("Config currently supports JSON files only")
-    return expand_env(json.loads(path.read_text(encoding="utf-8")))
-def config_get(config: dict, key: str, default=None):
-    current = config
-    for part in key.split("."):
-        if not isinstance(current, dict) or part not in current:
-            return default
-        current = current[part]
-    return current
-def build_s3_config(config: dict, args: argparse.Namespace) -> S3ImageConfig | None:
-    if args.images != "s3":
-        return None
-    s3 = config_get(config, "s3", {}) or {}
-    bucket = args.s3_bucket or s3.get("bucket")
-    public_base_url = args.s3_public_base_url or s3.get("public_base_url")
-    if not bucket or not public_base_url:
-        raise ValueError("S3/R2 image mode requires bucket and public_base_url")
-    return S3ImageConfig(
-        bucket=bucket,
-        public_base_url=public_base_url,
-        prefix=args.s3_prefix if args.s3_prefix is not None else s3.get("prefix", ""),
-        endpoint_url=args.s3_endpoint_url or s3.get("endpoint_url"),
-        region_name=args.s3_region or s3.get("region_name"),
-        access_key_id=args.s3_access_key_id or s3.get("access_key_id"),
-        secret_access_key=args.s3_secret_access_key or s3.get("secret_access_key"),
-        cache_control=args.s3_cache_control or s3.get("cache_control", "public, max-age=31536000, immutable"),
-        acl=args.s3_acl if args.s3_acl is not None else s3.get("acl"),
-    )
-class BuildCorpusExporter:
-    def __init__(
-        self,
-        input_path: Path,
-        output_dir: Path,
-        equation_mode: str = "tex",
-        output_md: Path | None = None,
-        assets_dir: Path | None = None,
-        report_path: Path | None = None,
-        image_mode: str = "assets",
-        s3_config: S3ImageConfig | None = None,
-    ):
-        self.input_path = input_path
-        self.output_dir = output_dir
-        self.output_md = output_md or (output_dir / (input_path.stem + ".md"))
-        self.assets_dir = assets_dir or (output_dir / "assets")
-        self.report_path = report_path or (output_dir / "export-report.json")
-        self.asset_ref_prefix = self.assets_dir.name
-        self.equation_mode = equation_mode
-        self.image_mode = image_mode
-        self.s3_config = s3_config
-        self.s3_uploader = S3ImageUploader(s3_config) if image_mode == "s3" and s3_config else None
-        self.stats = ExportStats()
-        self.rels: dict[str, str] = {}
-        self.media_map: dict[str, str] = {}
-        self.image_uploads: list[dict[str, str]] = []
-        self.equation_asset_map: dict[int, str] = {}
-        self.empty_equation_indexes: set[int] = set()
-        self.equation_index = 0
-        self.equation_samples: list[dict[str, str]] = []
-        self.table_depth = 0
-    def export(self) -> dict:
-        self.output_dir.mkdir(parents=True, exist_ok=True)
-        if self.image_mode == "assets" or self.equation_mode == "image":
-            self.assets_dir.mkdir(parents=True, exist_ok=True)
-        with self.open_input_zip() as zf:
-            self.rels = relationship_map(zf)
-            self._copy_media(zf)
-            document_xml = zf.read("word/document.xml")
-            root = ET.fromstring(document_xml)
-            body = root.find("w:body", NS)
-            if body is None:
-                raise RuntimeError("word/document.xml has no w:body")
-            if self.equation_mode == "image":
-                self._render_equation_assets(root)
-            markdown = self.render_children(body, top_level=True).strip() + "\n"
-        self.output_md.parent.mkdir(parents=True, exist_ok=True)
-        self.output_md.write_text(markdown, encoding="utf-8")
-        report = {
-            "input": str(self.input_path),
-            "output": str(self.output_md),
-            "assets_dir": str(self.assets_dir) if self.assets_dir.exists() else None,
-            "image_mode": self.image_mode,
-            "image_uploads": self.image_uploads,
-            "stats": self.stats.__dict__,
-            "equation_samples": self.equation_samples[:50],
-        }
-        self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
-        return report
-    @contextlib.contextmanager
-    def open_input_zip(self):
-        try:
-            with ZipFile(self.input_path) as zf:
-                yield zf
-                return
-        except PermissionError:
-            pass
-        with tempfile.TemporaryDirectory(prefix="build-corpus-input-") as tmp:
-            temp_input = Path(tmp) / self.input_path.name
-            self.copy_locked_input(temp_input)
-            self.stats.warnings.append(
-                f"Input file was locked; converted from temporary copy: {temp_input}"
-            )
-            with ZipFile(temp_input) as zf:
-                yield zf
-    def copy_locked_input(self, temp_input: Path) -> None:
-        try:
-            shutil.copyfile(self.input_path, temp_input)
-            return
-        except PermissionError:
-            if os.name != "nt":
-                raise
-        source = str(self.input_path).replace("'", "''")
-        target = str(temp_input).replace("'", "''")
-        command = f"Copy-Item -LiteralPath '{source}' -Destination '{target}' -Force"
-        result = subprocess.run(
-            ["powershell", "-NoProfile", "-Command", command],
-            capture_output=True,
-            text=True,
-        )
-        if result.returncode != 0:
-            message = result.stderr.strip() or result.stdout.strip() or "unknown error"
-            raise PermissionError(f"Could not copy locked input via PowerShell: {message}")
-    def _copy_media(self, zf: ZipFile) -> None:
-        for name in zf.namelist():
-            if not name.startswith("word/media/"):
-                continue
-            mime_type = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
-            if self.image_mode == "base64":
-                data = zf.read(name)
-                encoded = base64.b64encode(data).decode("ascii")
-                self.media_map[name] = f"data:{mime_type};base64,{encoded}"
-            elif self.image_mode == "s3":
-                if self.s3_uploader is None:
-                    raise RuntimeError("S3/R2 image mode needs s3_config")
-                data = zf.read(name)
-                upload = self.s3_uploader.upload(name, data, mime_type)
-                self.image_uploads.append(upload)
-                self.media_map[name] = upload["url"]
-            else:
-                target = self.assets_dir / Path(name).name
-                with zf.open(name) as src, target.open("wb") as dst:
-                    shutil.copyfileobj(src, dst)
-                self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"
-    def _render_equation_assets(self, document_root: ET.Element) -> None:
-        math_nodes = document_root.findall(".//m:oMath", NS)
-        if not math_nodes:
-            return
-        render_jobs = []
-        for index, math_node in enumerate(math_nodes, 1):
-            if self.is_empty_equation(math_node):
-                self.empty_equation_indexes.add(index)
-                continue
-            render_jobs.append((index, math_node))
-        try:
-            from docx import Document
-            from docx.oxml import parse_xml
-            import win32com.client as win32
-        except Exception as exc:
-            self.stats.warnings.append(f"Equation image rendering unavailable: {exc!r}")
-            return
-        chunk_size = 1
-        word = win32.DispatchEx("Word.Application")
-        word.Visible = False
-        word.DisplayAlerts = 0
-        try:
-            for start in range(0, len(render_jobs), chunk_size):
-                chunk = render_jobs[start : start + chunk_size]
-                with tempfile.TemporaryDirectory(prefix="build-corpus-equations-") as tmp:
-                    tmp_dir = Path(tmp)
-                    temp_docx = tmp_dir / "equations.docx"
-                    temp_html = tmp_dir / "equations.html"
-                    doc = Document()
-                    for absolute_index, math_node in chunk:
-                        p = doc.add_paragraph(f"EQMARKER{absolute_index:06d} ")
-                        p._p.append(parse_xml(ET.tostring(math_node, encoding="unicode")))
-                    doc.save(temp_docx)
-                    opened = word.Documents.Open(str(temp_docx), ReadOnly=True, AddToRecentFiles=False)
-                    opened.SaveAs2(str(temp_html), FileFormat=10)
-                    opened.Close(False)
-                    html_assets = temp_html.with_name(temp_html.stem + "_files")
-                    rendered = sorted(html_assets.glob("image*.png"))
-                    if len(rendered) != len(chunk):
-                        self.stats.warnings.append(
-                            f"Equation image count mismatch in render chunk {start + 1}-{start + len(chunk)}: "
-                            f"OMML={len(chunk)} rendered={len(rendered)}"
-                        )
-                    for (absolute_index, _math_node), source in zip(chunk, rendered):
-                        target = self.assets_dir / f"eq-{absolute_index:06d}.png"
-                        shutil.copyfile(source, target)
-                        self.equation_asset_map[absolute_index] = f"{self.asset_ref_prefix}/{target.name}"
-        finally:
-            word.Quit()
-        self.stats.equation_images = len(self.equation_asset_map)
-        self.stats.skipped_empty_equations = len(self.empty_equation_indexes)
-    def render_children(self, node: ET.Element, top_level: bool = False) -> str:
-        parts: list[str] = []
-        for child in list(node):
-            rendered = self.render_block(child)
-            if not rendered:
-                continue
-            if top_level:
-                parts.append(rendered.rstrip())
-            else:
-                parts.append(rendered.strip())
-        sep = "\n\n" if top_level else "\n"
-        return sep.join(part for part in parts if part)
-    def render_block(self, node: ET.Element) -> str:
-        name = local_name(node.tag)
-        if name == "p":
-            return self.render_paragraph(node)
-        if name == "tbl":
-            return self.render_table(node)
-        if name == "sdt":
-            content = node.find("./w:sdtContent", NS)
-            return self.render_children(content, top_level=True) if content is not None else ""
-        if name in {"bookmarkStart", "bookmarkEnd", "sectPr", "proofErr", "permStart", "permEnd"}:
-            return ""
-        return self.render_children(node, top_level=False)
-    def render_paragraph(self, p: ET.Element) -> str:
-        style = paragraph_style(p)
-        content = self.render_inline_children(p)
-        if not content.strip():
-            return ""
-        content = strip_trailing_markdown_breaks(content)
-        if is_code_style(style) or (self.table_depth == 0 and paragraph_is_code(p)):
-            self.stats.code_blocks += 1
-            return self.render_code_paragraph(p)
-        level = heading_level(style)
-        if level:
-            self.stats.headings += 1
-            return f"{'#' * level} {self.strip_inline_markers(content)}"
-        if is_quote_style(style):
-            self.stats.paragraphs += 1
-            return f"> {content}"
-        num_info = paragraph_num_info(p)
-        if not num_info:
-            num_info = paragraph_list_style_info(style)
-        if num_info:
-            self.stats.lists += 1
-            list_level, ordered = num_info
-            indent = "  " * list_level
-            bullet = "1." if ordered else "-"
-            return f"{indent}{bullet} {content}"
-        if paragraph_is_math(p):
-            if self.table_depth > 0:
-                return content
-            if not paragraph_has_display_math_layout(p):
-                return content
-            inner = content.strip()
-            if inner.startswith("$") and inner.endswith("$") and len(inner) >= 2:
-                inner = inner[1:-1]
-            return f"$$\n{inner}\n$$"
-        self.stats.paragraphs += 1
-        return content
-    def render_code_paragraph(self, p: ET.Element) -> str:
-        info = ""
-        code_parts: list[str] = []
-        first_nonempty_seen = False
-        for run in p.findall("./w:r", NS):
-            raw = extract_run_text(run)
-            if not raw:
-                continue
-            if not first_nonempty_seen and run_is_bold(run):
-                info = raw.strip()
-                first_nonempty_seen = True
-                continue
-            first_nonempty_seen = True
-            code_parts.append(raw)
-        code = "".join(code_parts).strip("\n")
-        fence = f"```{info}".rstrip()
-        return f"{fence}\n{code}\n```"
-    def render_inline_children(self, node: ET.Element) -> str:
-        segments: list[tuple[Optional[InlineStyle], str]] = []
-        for child in list(node):
-            name = local_name(child.tag)
-            if name == "r":
-                segments.extend(self.render_run_segments(child))
-            elif name == "hyperlink":
-                label = self.render_inline_children(child).strip()
-                anchor = attr(child, "w", "anchor")
-                rid = attr(child, "r", "id")
-                url = f"#{anchor}" if anchor else self.rels.get(rid or "", "")
-                segments.append((None, f"[{label}]({url})" if url else label))
-            elif name == "oMath":
-                segments.append((None, self.render_math(child, display=False)))
-            elif name == "oMathPara":
-                segments.append((None, self.render_math(child, display=True)))
-            elif name == "drawing":
-                img = self.render_image(child)
-                if img:
-                    segments.append((None, img))
-            elif name in {"pPr", "rPr"}:
-                continue
-            else:
-                segments.append((None, self.render_inline_children(child)))
-        return coalesce_inline_segments(segments)
-    def run_style(self, run: ET.Element) -> InlineStyle:
-        props = run.find("./w:rPr", NS)
-        if props is None:
-            return False, False, False
-        style_node = props.find("./w:rStyle", NS)
-        is_code = (
-            style_node is not None and "code" in (attr(style_node, "w", "val") or "").lower()
-        ) or run_is_code(run)
-        bold = props.find("./w:b", NS) is not None
-        italic = props.find("./w:i", NS) is not None
-        return is_code, bold, italic
-    def render_run_segments(self, run: ET.Element) -> list[tuple[Optional[InlineStyle], str]]:
-        style = self.run_style(run)
-        is_math = run_is_math(run)
-        segments: list[tuple[Optional[InlineStyle], str]] = []
-        run_parts: list[str] = []
-        def flush_text() -> None:
-            if run_parts:
-                text = "".join(run_parts)
-                if is_math:
-                    segments.append((None, f"${text}$"))
-                else:
-                    segments.append((style, text))
-                run_parts.clear()
-        for child in list(run):
-            name = local_name(child.tag)
-            if name == "t":
-                run_parts.append(clean_text(child.text or "") if is_math else escape_md_text(child.text or ""))
-            elif name == "noBreakHyphen":
-                run_parts.append("\u2011")
-            elif name == "softHyphen":
-                run_parts.append("\u00ad")
-            elif name == "tab":
-                run_parts.append("\t")
-            elif name in {"br", "cr"}:
-                run_parts.append("  \n")
-            elif name == "drawing":
-                flush_text()
-                img = self.render_image(child)
-                if img:
-                    segments.append((None, img))
-            elif name == "rPr":
-                continue
-            else:
-                flush_text()
-                nested = self.render_inline_children(child)
-                if nested:
-                    segments.append((None, nested))
-        flush_text()
-        return segments
-    def render_math(self, node: ET.Element, display: bool) -> str:
-        self.stats.equations += 1
-        self.equation_index += 1
-        if self.equation_index in self.empty_equation_indexes or self.is_empty_equation(node):
-            self.stats.skipped_empty_equations = max(
-                self.stats.skipped_empty_equations,
-                len(self.empty_equation_indexes),
-            )
-            return ""
-        if self.equation_mode == "image":
-            asset = self.equation_asset_map.get(self.equation_index)
-            if asset:
-                alt = f"equation {self.equation_index}"
-                rendered = f"![{alt}]({asset})"
-                return f"\n{rendered}\n" if display and self.table_depth == 0 else rendered
-            self.stats.warnings.append(f"Missing rendered equation asset for equation {self.equation_index}")
-            return self.render_missing_equation_fallback(node)
-        try:
-            tex = convert_omml(ET.fromstring(ET.tostring(node, encoding="unicode")))
-            render_display = display and self.table_depth == 0
-            rendered = normalize_tex(tex, display=render_display)
-            if len(self.equation_samples) < 50:
-                self.equation_samples.append({
-                    "source": "".join(t.text or "" for t in node.findall(".//m:t", NS))[:220],
-                    "tex": rendered[:500],
-                })
-            return f"\n{rendered}\n" if render_display else rendered
-        except Exception as exc:
-            self.stats.equation_errors += 1
-            self.stats.warnings.append(f"Equation conversion failed: {exc!r}")
-            fallback = "".join(t.text or "" for t in node.findall(".//m:t", NS))
-            return f"`[equation: {fallback}]`"
-    def render_missing_equation_fallback(self, node: ET.Element) -> str:
-        try:
-            tex = convert_omml(ET.fromstring(ET.tostring(node, encoding="unicode")))
-            return normalize_tex(tex, display=False)
-        except Exception:
-            fallback = "".join(t.text or "" for t in node.findall(".//m:t", NS))
-            return f"`[equation: {fallback}]`"
-    @staticmethod
-    def is_empty_equation(node: ET.Element) -> bool:
-        text = "".join(t.text or "" for t in node.findall(".//m:t", NS)).strip()
-        return not text
-    def render_image(self, node: ET.Element) -> str:
-        preferred_name = image_metadata_filename(node)
-        refs = []
-        for blip in node.findall(".//a:blip", NS):
-            rid = attr(blip, "r", "embed") or attr(blip, "r", "link")
-            if rid:
-                refs.append(rid)
-        rendered = []
-        for rid in refs:
-            target = self.rels.get(rid, rid)
-            source = resolve_image_target(target)
-            asset = self.media_map.get(source)
-            if not asset:
-                self.stats.warnings.append(f"Image relationship not copied: {rid} -> {target}")
-                continue
-            if preferred_name and self.image_mode == "assets":
-                current_path = self.output_dir / asset
-                preferred_path = self.assets_dir / preferred_name
-                if current_path.exists() and preferred_path != current_path and not preferred_path.exists():
-                    shutil.copyfile(current_path, preferred_path)
-                asset = f"{self.asset_ref_prefix}/{preferred_path.name}"
-            self.stats.images += 1
-            rendered.append(f"![image]({asset})")
-        return " ".join(rendered)
-    def render_table(self, tbl: ET.Element) -> str:
-        self.stats.tables += 1
-        self.table_depth += 1
-        try:
-            rows = tbl.findall("./w:tr", NS)
-            rendered_rows = []
-            complex_table = False
-            for row in rows:
-                cells = row.findall("./w:tc", NS)
-                rendered_cells = []
-                for cell in cells:
-                    if cell.find(".//w:tbl", NS) is not None:
-                        complex_table = True
-                    cell_text = self.render_cell(cell)
-                    if "\n\n" in cell_text:
-                        complex_table = True
-                    rendered_cells.append(cell_text)
-                rendered_rows.append(rendered_cells)
-        finally:
-            self.table_depth -= 1
-        if not rendered_rows:
-            return ""
-        if complex_table:
-            self.stats.html_tables += 1
-            return self.render_html_table(rendered_rows)
-        self.stats.markdown_tables += 1
-        return self.render_markdown_table(rendered_rows)
-    def render_cell(self, cell: ET.Element) -> str:
-        parts = []
-        for child in list(cell):
-            if local_name(child.tag) == "tcPr":
-                continue
-            rendered = self.render_block(child)
-            if rendered:
-                parts.append(rendered.strip())
-        return "<br>".join(parts).strip()
-    def render_markdown_table(self, rows: list[list[str]]) -> str:
-        width = max(len(row) for row in rows)
-        padded = [row + [""] * (width - len(row)) for row in rows]
-        def clean_cell(value: str) -> str:
-            return value.replace("\n", "<br>").replace("|", "\\|").strip()
-        lines = []
-        lines.append("| " + " | ".join(clean_cell(v) for v in padded[0]) + " |")
-        lines.append("| " + " | ".join("---" for _ in range(width)) + " |")
-        for row in padded[1:]:
-            lines.append("| " + " | ".join(clean_cell(v) for v in row) + " |")
-        return "\n".join(lines)
-    def render_html_table(self, rows: list[list[str]]) -> str:
-        html_rows = ["<table>"]
-        for row in rows:
-            html_rows.append("  <tr>")
-            for cell in row:
-                # Keep inline Markdown-ish math readable inside HTML fallback.
-                html_rows.append(f"    <td>{html.escape(cell, quote=False).replace(chr(10), '<br>')}</td>")
-            html_rows.append("  </tr>")
-        html_rows.append("</table>")
-        return "\n".join(html_rows)
-    @staticmethod
-    def strip_inline_markers(text: str) -> str:
-        return text.replace("\n", " ").strip()
def export_one(
    input_path: Path,
    output_root: Path,
    equation_mode: str,
    out_same_dir: bool,
    image_mode: str,
    s3_config: S3ImageConfig | None = None,
) -> dict:
    """Export one document to Markdown and return the exporter's report.

    With ``out_same_dir`` the Markdown file, assets folder and report are
    placed beside the source. Otherwise output goes to
    ``output_root/<stem>`` and the exporter is given no explicit paths.
    """
    if out_same_dir:
        stem = input_path.stem
        output_dir = input_path.parent
        layout = {
            "output_md": input_path.with_suffix(".md"),
            "assets_dir": input_path.with_name(stem + ".assets"),
            "report_path": input_path.with_name(stem + ".export-report.json"),
        }
    else:
        output_dir = output_root / input_path.stem
        layout = {"output_md": None, "assets_dir": None, "report_path": None}

    exporter = BuildCorpusExporter(
        input_path,
        output_dir,
        equation_mode=equation_mode,
        image_mode=image_mode,
        s3_config=s3_config,
        **layout,
    )
    return exporter.export()
def collect_inputs(path: Path, target: str) -> list[Path]:
    """Return the input files to convert, sorted.

    A file path is returned as-is. For a directory the tree is searched
    recursively for ``.md`` files when ``target`` is ``"word"``, otherwise
    for ``.docx``/``.pptx``/``.ppt`` files.

    Suffixes are compared case-insensitively, matching how ``main`` inspects
    ``suffix.lower()``, so ``REPORT.DOCX`` is found on case-sensitive
    filesystems too. Directories whose names happen to end in a matching
    suffix are excluded.
    """
    if path.is_file():
        return [path]
    suffixes = {".md"} if target == "word" else {".docx", ".pptx", ".ppt"}
    return sorted(
        candidate
        for candidate in path.rglob("*")
        if candidate.suffix.lower() in suffixes and candidate.is_file()
    )
def main() -> None:
    """Command-line entry point.

    Parses arguments, fills unset options from the JSON config, converts
    every collected input, writes ``build-corpus-batch-report.json`` and
    prints a JSON summary to stdout.

    A nonexistent input path is rejected with an argparse error instead of
    silently producing an empty batch report.
    """
    parser = argparse.ArgumentParser(
        description="Convert Markdown to DOCX or DOCX/PPTX/PPT to Markdown.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""examples:
  build-corpus input.docx --out out
  build-corpus input.md --to word --out out
  build-corpus ./word-files --out ./markdown
  build-corpus ./word-files --out-same-dir
  build-corpus input.docx --images base64
  build-corpus input.docx --images s3 --config build-corpus.config.json
image modes:
  assets   copy images into an assets folder and reference them from Markdown
  base64   embed images directly as Markdown data URIs
  s3       upload images to S3-compatible storage such as Cloudflare R2 or AWS S3
equation modes:
  tex      convert Word OMML equations to KaTeX-readable TeX
  image    render equations as images for visual debugging only
""",
    )
    parser.add_argument("input", type=Path, help="Markdown, DOCX, PPTX, or PPT file or directory")
    parser.add_argument("--config", type=Path, help="JSON config file with conversion, output, and S3/R2 defaults")
    parser.add_argument("--out", type=Path, help="Output directory for converted Markdown tree")
    parser.add_argument("--to", choices=["auto", "markdown", "word"], help="Output target; auto infers from a single-file input")
    parser.add_argument("--equations", choices=["tex", "image"], help="Equation output mode; default comes from config or tex")
    parser.add_argument("--images", choices=["assets", "base64", "s3"], help="Image output mode; default comes from config or assets")
    parser.add_argument("--out-same-dir", action="store_true", help="Write .md, .assets, and reports beside each source DOCX")
    parser.add_argument("--word-template", type=Path, help="Optional .docx or .dotx template used for Markdown to Word exports")
    parser.add_argument("--s3-bucket", help="S3/R2 bucket name for --images s3")
    parser.add_argument("--s3-public-base-url", help="Public URL base used in Markdown, e.g. https://assets.example.com")
    parser.add_argument("--s3-prefix", help="Object key prefix for uploaded images")
    parser.add_argument("--s3-endpoint-url", help="S3-compatible endpoint, required for Cloudflare R2")
    parser.add_argument("--s3-region", help="S3 region; use auto for Cloudflare R2")
    parser.add_argument("--s3-access-key-id", help="S3/R2 access key id; can also come from config/env expansion")
    parser.add_argument("--s3-secret-access-key", help="S3/R2 secret access key; can also come from config/env expansion")
    parser.add_argument("--s3-cache-control", help="Cache-Control header for uploaded images")
    parser.add_argument("--s3-acl", help="Optional ACL for AWS S3; usually omitted for Cloudflare R2")
    args = parser.parse_args()
    if not args.input.exists():
        parser.error(f"input path does not exist: {args.input}")

    # Command-line values win; anything left unset falls back to the config, then to a default.
    config = load_config(args.config)
    args.out = args.out or Path(config_get(config, "output.out", ".codex/build-corpus/out"))
    args.to = args.to or config_get(config, "conversion.target", "auto")
    args.equations = args.equations or config_get(config, "conversion.equations", "tex")
    args.images = args.images or config_get(config, "conversion.images", "assets")
    args.out_same_dir = args.out_same_dir or bool(config_get(config, "output.out_same_dir", False))
    args.word_template = args.word_template or (
        Path(config_get(config, "word.template")) if config_get(config, "word.template") else None
    )
    s3_config = build_s3_config(config, args)

    # "auto" resolves from the file suffix for a single file and to Markdown for a directory.
    input_target = args.to
    if args.input.is_file() and args.to == "auto":
        input_target = "word" if args.input.suffix.lower() == ".md" else "markdown"
    elif args.input.is_dir() and args.to == "auto":
        input_target = "markdown"

    reports = []
    for input_path in collect_inputs(args.input, input_target):
        # Files whose names start with "~$" are skipped.
        if input_path.name.startswith("~$"):
            continue
        suffix = input_path.suffix.lower()
        if input_target == "word" or suffix == ".md":
            reports.append(export_markdown_to_docx(
                input_path,
                args.out,
                args.out_same_dir,
                template_path=args.word_template,
            ))
        elif suffix in {".pptx", ".ppt"}:
            reports.append(export_presentation(
                input_path,
                args.out,
                args.out_same_dir,
                image_mode=args.images,
            ))
        else:
            reports.append(export_one(
                input_path,
                args.out,
                equation_mode=args.equations,
                out_same_dir=args.out_same_dir,
                image_mode=args.images,
                s3_config=s3_config,
            ))

    # The batch report sits in the input directory for in-place directory runs, else in --out.
    batch_report_root = args.input if args.out_same_dir and args.input.is_dir() else args.out
    batch_report_root.mkdir(parents=True, exist_ok=True)
    batch_report = batch_report_root / "build-corpus-batch-report.json"
    batch_report.write_text(json.dumps(reports, indent=2, ensure_ascii=False), encoding="utf-8")
    print(json.dumps({
        "converted": len(reports),
        "batch_report": str(batch_report),
        "outputs": [report["output"] for report in reports],
        "default_word_template": str(args.word_template or resolve_default_template_path() or "bundled:md-to-word-template.dotx"),
    }, indent=2))
-if __name__ == "__main__":
-    main()
+from __future__ import annotations
+import argparse
+import base64
+import contextlib
+import hashlib
+import html
+import json
+import mimetypes
+import os
+import re
+import shutil
+import subprocess
+import tempfile
+import urllib.request
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Optional
+from zipfile import ZipFile
+from xml.etree import ElementTree as ET
+from omml2latex import convert_omml
+try:
+    from .frontmatter import add_mdk_frontmatter, read_frontmatter_from_zip
+except ImportError:  # pragma: no cover - allows direct script execution
+    from build_corpus.frontmatter import add_mdk_frontmatter, read_frontmatter_from_zip
+try:
+    from .docx_exporter import export_markdown_to_docx, resolve_default_template_path
+except ImportError:  # pragma: no cover - allows direct script execution
+    from build_corpus.docx_exporter import export_markdown_to_docx, resolve_default_template_path
+try:
+    from .ppt_exporter import export_presentation
+except ImportError:  # pragma: no cover - allows direct script execution
+    from build_corpus.ppt_exporter import export_presentation
# Namespace prefix -> URI map passed to ElementTree find()/findall() and used by attr().
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "m": "http://schemas.openxmlformats.org/officeDocument/2006/math",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
}
# Clark-notation prefixes ("{uri}") for the "w" and "r" namespaces.
W = f"{{{NS['w']}}}"
R = f"{{{NS['r']}}}"
@dataclass
class ExportStats:
    """Counters and warnings accumulated while exporting one document."""
    # Plain and quote paragraphs emitted by render_paragraph.
    paragraphs: int = 0
    # Paragraphs rendered as Markdown headings.
    headings: int = 0
    # Paragraphs rendered as fenced code blocks.
    code_blocks: int = 0
    # Every table seen by render_table, then split by output form below.
    tables: int = 0
    markdown_tables: int = 0
    html_tables: int = 0
    # Every equation passed to render_math, including skipped and failed ones.
    equations: int = 0
    # NOTE(review): not updated in the visible code; presumably counts rendered equation images — confirm.
    equation_images: int = 0
    # Raised by render_math to the number of known-empty equation indexes.
    skipped_empty_equations: int = 0
    # Equations whose TeX conversion raised an exception.
    equation_errors: int = 0
    # Images successfully emitted by render_image.
    images: int = 0
    # Paragraphs rendered as list items.
    lists: int = 0
    # NOTE(review): not updated in the visible code — confirm where footnotes are counted.
    footnotes: int = 0
    # Human-readable warnings (missing assets, failed conversions, ...).
    warnings: list[str] = field(default_factory=list)
@dataclass
class S3ImageConfig:
    """Settings S3ImageUploader uses to upload images to S3-compatible storage."""
    # Bucket passed to put_object.
    bucket: str
    # Base of the URL returned for each uploaded object; a trailing "/" is stripped.
    public_base_url: str
    # Optional key prefix; surrounding "/" characters are stripped.
    prefix: str = ""
    # The next four values are passed to boto3.client only when truthy.
    endpoint_url: str | None = None
    region_name: str | None = None
    access_key_id: str | None = None
    secret_access_key: str | None = None
    # Sent as the CacheControl of every uploaded object.
    cache_control: str = "public, max-age=31536000, immutable"
    # Sent as the object ACL only when set.
    acl: str | None = None
class S3ImageUploader:
    """Uploads image bytes to S3-compatible storage under content-addressed keys."""

    def __init__(self, config: S3ImageConfig):
        """Create the boto3 client; raises RuntimeError when boto3 is not installed."""
        self.config = config
        try:
            import boto3
        except ImportError as exc:
            raise RuntimeError("S3/R2 image mode requires boto3. Install with: pip install boto3") from exc
        candidates = (
            ("service_name", "s3"),
            ("endpoint_url", config.endpoint_url),
            ("region_name", config.region_name),
            ("aws_access_key_id", config.access_key_id),
            ("aws_secret_access_key", config.secret_access_key),
        )
        # Unset options are omitted from the client call.
        client_options = {name: value for name, value in candidates if value}
        self.client = boto3.client(**client_options)

    def upload(self, source_name: str, data: bytes, content_type: str) -> dict[str, str]:
        """Upload ``data`` and return a record describing the stored object.

        The key is ``<prefix>/images/sha256/<digest><suffix>``; the suffix is
        the lower-cased extension of ``source_name``.
        """
        cfg = self.config
        digest = hashlib.sha256(data).hexdigest()
        extension = Path(source_name).suffix.lower()
        segments = (cfg.prefix.strip("/"), "images", "sha256", digest + extension)
        key = "/".join(filter(None, segments))

        request = {
            "Bucket": cfg.bucket,
            "Key": key,
            "Body": data,
            "ContentType": content_type,
            "CacheControl": cfg.cache_control,
        }
        if cfg.acl:
            request["ACL"] = cfg.acl
        self.client.put_object(**request)

        public_base = cfg.public_base_url.rstrip("/")
        return {
            "source": source_name,
            "sha256": digest,
            "bucket": cfg.bucket,
            "key": key,
            "url": f"{public_base}/{key}",
            "content_type": content_type,
            "bytes": str(len(data)),
        }
def local_name(tag: str) -> str:
    """Return ``tag`` without its ``{namespace}`` prefix, if it has one."""
    # rpartition yields the whole tag as its last item when no "}" is present.
    return tag.rpartition("}")[2]
def attr(node: ET.Element, ns: str, name: str) -> str | None:
    """Return attribute ``name`` in namespace prefix ``ns``, or None if absent."""
    qualified = "{" + NS[ns] + "}" + name
    return node.attrib.get(qualified)
def clean_text(text: str) -> str:
    """Replace non-breaking spaces with spaces and drop U+200B and U+FEFF."""
    return (
        text.replace("\u00a0", " ")
        .replace("\u200b", "")
        .replace("\ufeff", "")
    )
def escape_md_text(text: str) -> str:
    """Clean ``text`` and backslash-escape ``*``, ``_`` and ``$``.

    A backslash followed by a Markdown-escapable character is kept as an
    existing escape pair. Any other backslash is doubled, including one at
    the very end of the text: because ``"" in some_string`` is always true,
    a bare membership test would treat a trailing backslash as an escape
    pair and emit it unescaped, letting it escape whatever is rendered next.
    """
    text = clean_text(text)
    escaped: list[str] = []
    index = 0
    while index < len(text):
        char = text[index]
        if char == "\\":
            next_char = text[index + 1] if index + 1 < len(text) else ""
            if next_char and next_char in "\\`*_{}[]()#+.!|$-":
                escaped.append("\\")
                escaped.append(next_char)
                index += 2
                continue
            escaped.append("\\\\")
        elif char in {"*", "_", "$"}:
            escaped.append("\\" + char)
        else:
            escaped.append(char)
        index += 1
    return "".join(escaped)
def normalize_inline_markers(text: str) -> str:
    """Insert spaces so inline images do not touch adjacent text."""
    spacing_rules = (
        # Image immediately followed by text: add a space after it.
        (r"(\!\[[^\]]*\]\([^)]+\))(?=[^\s<>)\].,;:!?])", r"\1 "),
        # Text immediately followed by an image: add a space before it.
        (r"(?<=[^\s<(\[.,;:!?])(\!\[[^\]]*\]\([^)]+\))", r" \1"),
    )
    for pattern, replacement in spacing_rules:
        text = re.sub(pattern, replacement, text)
    return text
def strip_trailing_markdown_breaks(text: str) -> str:
    """Remove trailing whitespace, including trailing hard-break spaces."""
    # Trailing "  " pairs are whitespace, so one rstrip() removes them too.
    return text.rstrip()
# Inline run style as (is_code, bold, italic).
InlineStyle = tuple[bool, bool, bool]
def apply_inline_style(text: str, style: InlineStyle) -> str:
    """Wrap ``text`` in the Markdown markers for ``style``.

    ``style`` is ``(is_code, bold, italic)``. Empty text yields ``""`` and
    whitespace-only text is returned unchanged. Code spans take precedence
    over emphasis.

    Emphasis markers are placed around the non-whitespace core of the text,
    with leading and trailing whitespace kept outside them: ``**bold **`` is
    not parsed as strong emphasis, whereas ``**bold** `` is.
    """
    is_code, bold, italic = style
    if not text:
        return ""
    if not text.strip():
        return text
    if is_code:
        # Built outside the f-string: a backslash inside an f-string
        # expression is a syntax error before Python 3.12.
        escaped = text.replace("`", "\\`")
        return f"`{escaped}`"
    if bold and italic:
        marker = "***"
    elif bold:
        marker = "**"
    elif italic:
        marker = "*"
    else:
        return text
    core = text.strip()
    leading = text[: len(text) - len(text.lstrip())]
    trailing = text[len(text.rstrip()):]
    return f"{leading}{marker}{core}{marker}{trailing}"
def coalesce_inline_segments(segments: list[tuple[Optional[InlineStyle], str]]) -> str:
    """Merge adjacent same-style segments and render them as Markdown.

    Segments with empty text are ignored. Segments whose style is ``None``
    are emitted verbatim and never merged. Each run of consecutive segments
    sharing a style is styled once as a whole.
    """
    groups: list[tuple[Optional[InlineStyle], list[str]]] = []
    for style, text in segments:
        if not text:
            continue
        if style is not None and groups and groups[-1][0] == style:
            groups[-1][1].append(text)
        else:
            groups.append((style, [text]))

    rendered = [
        "".join(chunks) if style is None else apply_inline_style("".join(chunks), style)
        for style, chunks in groups
    ]
    return normalize_inline_markers("".join(rendered))
def normalize_tex(tex: str, display: bool) -> str:
    """Clean converter output and wrap it in ``$...$`` or ``$$...$$``.

    Existing outer math delimiters are removed first. A fixed, order-dependent
    series of textual repairs follows, and the result is re-wrapped as
    display math when ``display`` is true, otherwise as inline math.
    """
    tex = clean_text(tex).strip()
    # Drop delimiters the converter may already have added.
    if tex.startswith("$$") and tex.endswith("$$"):
        tex = tex[2:-2].strip()
    elif tex.startswith("$") and tex.endswith("$"):
        tex = tex[1:-1].strip()
    tex = tex.replace("\u2011", "-")
    # Any remaining dollar sign is escaped so it cannot end the math span.
    tex = tex.replace("$", r"\$")
    tex = tex.replace(r"\text{ }", r"\,")
    tex = tex.replace(r"\text{  }", r"\;")
    # NOTE(review): as shown, this repeats the literal replaced on the previous line, so it cannot match — confirm the intended pattern.
    tex = tex.replace(r"\text{  }", " ")
    tex = tex.replace(r"\mathrm{\}\text{*}}", r"\*")
    tex = tex.replace(r"\text{-}", "-")
    tex = tex.replace(r"\*", "*")
    tex = replace_raw_unicode_math(tex)
    tex = strip_word_equation_field_codes(tex)
    tex = normalize_redundant_math_italics(tex, display=display)
    tex = escape_text_macro_underscores(tex)
    tex = repair_underbrace_limits(tex)
    tex = balance_tex_braces(tex)
    # Collapse every whitespace run to a single space.
    tex = re.sub(r"\s+", " ", tex).strip()
    return f"$$\n{tex}\n$$" if display else f"${tex}$"
def _script_arg(body: str) -> str:
    """Return ``body`` bare when it is one token, otherwise wrapped in braces."""
    # One character or one control word (e.g. \infty) is a complete script
    # argument; anything longer needs its braces to stay a single argument.
    if re.fullmatch(r"[^\\]|\\[A-Za-z]+", body):
        return body
    return "{" + body + "}"
def normalize_redundant_math_italics(tex: str, display: bool) -> str:
    """Simplify redundant grouping the converter emits around simple atoms.

    ``\\mathit{x}`` becomes ``x``; ``{x}^{2}`` becomes ``x^2``; integral
    limits lose their braces. Braces are removed only from single-token
    scripts: ``{x}^{2n}`` becomes ``x^{2n}``, because ``x^2n`` would raise
    only the ``2``. In display mode, spacing around ``=`` is normalised.
    """
    tex = re.sub(r"\\mathit\{([A-Za-z])\}", r"\1", tex)
    tex = re.sub(
        r"\{([A-Za-z])\}\^\{([^{}]+)\}",
        lambda m: m.group(1) + "^" + _script_arg(m.group(2)),
        tex,
    )
    tex = re.sub(
        r"\\int_\{([^{}]+)\}\^\{([^{}]+)\}",
        lambda m: "\\int_" + _script_arg(m.group(1)) + "^" + _script_arg(m.group(2)),
        tex,
    )
    tex = normalize_integral_differentials(tex)
    if display:
        tex = re.sub(r"\s*=\s*", " = ", tex)
    return unwrap_redundant_tex_groups(tex)
def normalize_integral_differentials(tex: str) -> str:
    """Insert a thin space before differentials such as ``dx`` in integrals.

    Only applied when the expression contains ``\\int``. A differential is a
    ``d`` plus one letter that is followed by ``=``, an arithmetic operator
    or the end of the string, and is not already preceded by ``\\,``.
    """
    if r"\int" not in tex:
        return tex
    return re.sub(r"(?<!\\,)d([A-Za-z])(?=\s*(?:=|$|[+\-*/]))", r"\\,d\1", tex)
def unwrap_redundant_tex_groups(tex: str) -> str:
    """Strip brace pairs that enclose the whole expression.

    When the expression starts with an integral head (``\\int_a^b``), the
    remainder after the head is unwrapped separately.
    """
    integral_match = re.match(r"^(\\int_[^\s]+\^[^\s]+)\s+(.+)$", tex)
    if integral_match:
        head, tail = integral_match.groups()
        tex = head + " " + unwrap_redundant_tex_groups(tail.strip())
    while is_wrapped_in_redundant_braces(tex):
        tex = tex[1:-1].strip()
    return tex
def is_wrapped_in_redundant_braces(tex: str) -> bool:
    """Return True when the first ``{`` is closed by the very last ``}``.

    The character after a backslash is skipped, so ``\\{`` and ``\\}`` do not
    affect the nesting depth.
    """
    if not (tex.startswith("{") and tex.endswith("}")):
        return False
    depth = 0
    escaped = False
    for index, char in enumerate(tex):
        if escaped:
            escaped = False
            continue
        if char == "\\":
            escaped = True
            continue
        if char == "{":
            depth += 1
        elif char == "}":
            depth -= 1
            # Closing early means the outer braces are two separate groups.
            if depth == 0 and index != len(tex) - 1:
                return False
    return depth == 0
+UNICODE_MATH_REPLACEMENTS = {
+    "∸": r"\dot{-}",
+    "⨅": r"\sqcap",
+    "⨃": r"\bigcup",
+    "⋜": r"\lessgtr",
+    "⋝": r"\gtrless",
+    "∱": r"\oint",
+    "∲": r"\oint",
+    "∳": r"\oint",
+    "ℇ": r"\varepsilon",
+    "Ϝ": r"\digamma",
+    "℩": r"\iota",
+    "Å": r"\mathring{A}",
+    "℮": "e",
+}
def replace_raw_unicode_math(tex: str) -> str:
    """Replace each raw Unicode symbol in the table with its TeX command."""
    for symbol in UNICODE_MATH_REPLACEMENTS:
        tex = tex.replace(symbol, UNICODE_MATH_REPLACEMENTS[symbol])
    return tex
def strip_word_equation_field_codes(tex: str) -> str:
    """Remove leaked Word ``SEQ Equation`` field text from converted TeX."""
    # Word SEQ fields can leak into OMML conversion as equation-number text.
    # The most specific form comes first so its wrapper is removed with it.
    field_patterns = (
        r"#\s*\\left\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\\right\)",
        r"#\s*\(\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+\s*\)",
        r"#\s*SEQ\s+Equation\s+\*\s+ARABIC\s+\d+",
    )
    cleaned = tex
    for field_pattern in field_patterns:
        cleaned = re.compile(field_pattern, re.IGNORECASE).sub("", cleaned)
    return cleaned
def escape_text_macro_underscores(tex: str) -> str:
    r"""Escape TeX-special characters inside brace-free ``\text{...}`` bodies.

    Bare ``_``, ``&``, ``%`` and ``#`` are backslash-escaped and a lone
    backslash becomes ``\textbackslash{}``. Sequences that are already
    escaped (``\_``, ``\&``, ``\%``, ``\#``, ``\$``) are kept as they are, so
    the function is idempotent on them instead of producing output such as
    ``\textbackslash{}\_``.
    """
    # Alternatives, in order: an existing escape, a lone backslash, a bare special.
    token = re.compile(r"\\([_&%#$])|\\|([_&%#])")

    def escape_token(match: re.Match[str]) -> str:
        if match.group(1):
            return match.group(0)
        if match.group(2):
            return "\\" + match.group(2)
        return r"\textbackslash{}"

    def replace(match: re.Match[str]) -> str:
        return r"\text{" + token.sub(escape_token, match.group(1)) + "}"

    return re.sub(r"\\text\{([^{}]*)\}", replace, tex)
def repair_underbrace_limits(tex: str) -> str:
    r"""Rewrite nested ``\mathop``/``\limits`` underbraces as ``\underbrace{..}_{..}``.

    The substitution is repeated until the text stops changing.
    """
    nested_underbrace = re.compile(
        r"\\mathop\{\\mathop\{(?P<base>.*?)\}\\limits_\{\s*\\underbrace\s*\}\}\\limits_\{(?P<label>.*?)\}"
        r"(?=(?:[+\-]|\\cdot|\\times|=|,|;|$))",
        re.DOTALL,
    )
    while True:
        rewritten = nested_underbrace.sub(r"\\underbrace{\g<base>}_{\g<label>}", tex)
        if rewritten == tex:
            return tex
        tex = rewritten
def balance_tex_braces(tex: str) -> str:
    """Return ``tex`` with unmatched braces repaired.

    Unmatched closing braces are dropped and missing closing braces are
    appended at the end. The character after a backslash is copied verbatim
    and never counted.
    """
    output: list[str] = []
    open_groups = 0
    copy_next = False
    for ch in tex:
        if copy_next:
            copy_next = False
            output.append(ch)
        elif ch == "\\":
            copy_next = True
            output.append(ch)
        elif ch == "}":
            # Drop unmatched closing braces; KaTeX rejects them.
            if open_groups:
                open_groups -= 1
                output.append(ch)
        else:
            if ch == "{":
                open_groups += 1
            output.append(ch)
    return "".join(output) + "}" * open_groups
def paragraph_style(node: ET.Element) -> str | None:
    """Return the paragraph's ``w:pStyle`` value, or None when it has none."""
    style_node = node.find("./w:pPr/w:pStyle", NS)
    if style_node is None:
        return None
    return attr(style_node, "w", "val")
def heading_level(style: str | None) -> int | None:
    """Return the heading level (1-6) for a heading style, else None.

    The style is compared with spaces removed and case ignored, the same
    normalisation the quote and list style helpers apply, so ``Heading1``,
    ``heading1`` and ``Heading 1`` are all recognised.
    """
    if not style:
        return None
    match = re.fullmatch(r"heading([1-6])", style.replace(" ", "").lower())
    return int(match.group(1)) if match else None
def is_code_style(style: str | None) -> bool:
    """Return True when the style name contains ``code`` (case-insensitive)."""
    if not style:
        return False
    return "code" in style.lower()
def is_quote_style(style: str | None) -> bool:
    """Return True for the recognised block-quote paragraph styles.

    Spaces and letter case are ignored when matching.
    """
    quote_styles = ("buildcorpusquote", "quote", "intensequote")
    return bool(style) and style.replace(" ", "").lower() in quote_styles
def paragraph_num_info(node: ET.Element) -> tuple[int, bool] | None:
    """Return ``(level, ordered)`` for a numbered paragraph, else None.

    ``level`` comes from ``w:ilvl`` and defaults to 0. ``ordered`` is always
    False here.
    """
    num_pr = node.find("./w:pPr/w:numPr", NS)
    if num_pr is None:
        return None
    level_node = num_pr.find("./w:ilvl", NS)
    raw_level = attr(level_node, "w", "val") if level_node is not None else None
    # Without numbering.xml style resolution, use bullets as the safer default.
    return int(raw_level or "0"), False
def paragraph_list_style_info(style: str | None) -> tuple[int, bool] | None:
    """Derive ``(level, ordered)`` from a ``List Bullet``/``List Number`` style.

    A trailing number in the style name is the 1-based nesting level
    (default 1); the returned level is 0-based and never negative. Returns
    None for any other style.
    """
    if not style:
        return None
    normalized = style.replace(" ", "").lower()
    for prefix, ordered in (("listbullet", False), ("listnumber", True)):
        if not normalized.startswith(prefix):
            continue
        tail = normalized[len(prefix):]
        one_based = int(tail) if tail.isdigit() else 1
        return max(one_based - 1, 0), ordered
    return None
def run_is_math(run: ET.Element) -> bool:
    """Return True when the run's font is Cambria Math.

    The ``ascii``, ``hAnsi`` and ``cs`` attributes of ``w:rFonts`` are
    checked, ignoring case.
    """
    props = run.find("./w:rPr", NS)
    fonts = props.find("./w:rFonts", NS) if props is not None else None
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "cambria math"
        for slot in ("ascii", "hAnsi", "cs")
    )
def run_is_code(run: ET.Element) -> bool:
    """Return True when the run looks like inline code.

    A run qualifies when its character style name contains ``code`` or when
    any of its ``ascii``/``hAnsi``/``cs`` fonts is Consolas (case-insensitive).
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False

    char_style = props.find("./w:rStyle", NS)
    if char_style is not None and "code" in (attr(char_style, "w", "val") or "").lower():
        return True

    fonts = props.find("./w:rFonts", NS)
    if fonts is None:
        return False
    return any(
        (attr(fonts, "w", slot) or "").lower() == "consolas"
        for slot in ("ascii", "hAnsi", "cs")
    )
def run_is_bold(run: ET.Element) -> bool:
    """Return True when the run's properties turn bold on.

    ``w:b`` is an on/off property: present without ``w:val`` means on, while
    ``w:val`` of ``0``, ``false`` or ``off`` explicitly turns bold off and is
    therefore not reported as bold.
    """
    props = run.find("./w:rPr", NS)
    if props is None:
        return False
    bold = props.find("./w:b", NS)
    if bold is None:
        return False
    return (attr(bold, "w", "val") or "true").lower() not in {"0", "false", "off"}
def paragraph_is_code(node: ET.Element) -> bool:
    """Return True when every meaningful run of the paragraph is code.

    Whitespace-only runs are ignored. A bold first meaningful run is
    tolerated without being code. At least one code run is required.
    """
    seen_content = False
    code_runs = 0
    for run in node.findall("./w:r", NS):
        text = extract_run_text(run)
        if not text or not text.strip():
            continue
        is_leading = not seen_content
        seen_content = True
        if is_leading and run_is_bold(run):
            continue
        if not run_is_code(run):
            return False
        code_runs += 1
    return code_runs > 0
+def extract_run_text(run: ET.Element) -> str:
+    parts: list[str] = []
+    for child in list(run):
+        name = local_name(child.tag)
+        if name == "t":
+            parts.append(clean_text(child.text or ""))
+        elif name == "tab":
+            parts.append("\t")
+        elif name in {"br", "cr"}:
+            parts.append("\n")
+    return "".join(parts)
+def paragraph_is_math(node: ET.Element) -> bool:
+    runs = node.findall("./w:r", NS)
+    math_runs = 0
+    text_runs = 0
+    for run in runs:
+        if run_is_hidden(run):
+            continue
+        texts = [t.text or "" for t in run.findall("./w:t", NS)]
+        if not any(segment.strip() for segment in texts):
+            continue
+        text_runs += 1
+        if run_is_math(run):
+            math_runs += 1
+    return text_runs > 0 and text_runs == math_runs
+def paragraph_has_display_math_layout(node: ET.Element) -> bool:
+    indent = node.find("./w:pPr/w:ind", NS)
+    spacing = node.find("./w:pPr/w:spacing", NS)
+    if indent is not None and any(attr(indent, "w", key) not in {None, "0"} for key in ("left", "right", "firstLine", "hanging")):
+        return True
+    if spacing is not None and any(attr(spacing, "w", key) not in {None, "0"} for key in ("before", "after")):
+        return True
+    return False
+def run_is_hidden(run: ET.Element) -> bool:
+    return run.find("./w:rPr/w:vanish", NS) is not None
+def relationship_map(zip_file: ZipFile, part: str = "word/document.xml") -> dict[str, str]:
+    rels_path = str(Path(part).parent / "_rels" / (Path(part).name + ".rels")).replace("\\", "/")
+    if rels_path not in zip_file.namelist():
+        return {}
+    root = ET.fromstring(zip_file.read(rels_path))
+    return {
+        rel.attrib["Id"]: rel.attrib.get("Target", "")
+        for rel in root
+        if "Id" in rel.attrib
+    }
+def resolve_image_target(target: str) -> str:
+    if target.startswith("../"):
+        target = target[3:]
+    if not target.startswith("word/"):
+        target = f"word/{target}"
+    return target
+def image_metadata_filename(node: ET.Element) -> str | None:
+    for tag_name in ("docPr", "cNvPr"):
+        for entry in node.findall(f".//wp:{tag_name}", NS):
+            for key in ("descr", "title", "name"):
+                value = entry.attrib.get(key)
+                if value and Path(value).suffix:
+                    return Path(value).name
+    return None
+def expand_env(value):
+    if isinstance(value, str):
+        return os.path.expandvars(value)
+    if isinstance(value, dict):
+        return {key: expand_env(item) for key, item in value.items()}
+    if isinstance(value, list):
+        return [expand_env(item) for item in value]
+    return value
+def load_config(path: Path | None) -> dict:
+    if path is None:
+        return {}
+    if not path.exists():
+        raise FileNotFoundError(f"Config file not found: {path}")
+    if path.suffix.lower() != ".json":
+        raise ValueError("Config currently supports JSON files only")
+    return expand_env(json.loads(path.read_text(encoding="utf-8")))
+def config_get(config: dict, key: str, default=None):
+    current = config
+    for part in key.split("."):
+        if not isinstance(current, dict) or part not in current:
+            return default
+        current = current[part]
+    return current
+# Matches a Markdown image `![alt](ref "optional title")`.
+# Group 1 is the alt text, group 2 the reference (no whitespace or ")" allowed).
+IMAGE_MARKDOWN_RE = re.compile(r"!\[([^\]]*)\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)")
+# Package parts copied into the Word style sidecar by
+# BuildCorpusExporter.write_word_style_package when the input contains them.
+STYLE_PACKAGE_PARTS = {
+    "word/styles.xml",
+    "word/stylesWithEffects.xml",
+    "word/numbering.xml",
+    "word/fontTable.xml",
+    "word/settings.xml",
+    "word/webSettings.xml",
+    "word/theme/theme1.xml",
+}
+# Relationship and document-property parts copied together with STYLE_PACKAGE_PARTS.
+STYLE_PACKAGE_REL_PARTS = {
+    "_rels/.rels",
+    "word/_rels/document.xml.rels",
+    "docProps/core.xml",
+    "docProps/app.xml",
+    "docProps/custom.xml",
+}
+def image_mime_from_ref(ref: str, data: bytes) -> str:
+    guessed = mimetypes.guess_type(ref.split("?", 1)[0])[0]
+    if guessed:
+        return guessed
+    if data.startswith(b"\x89PNG\r\n\x1a\n"):
+        return "image/png"
+    if data.startswith(b"\xff\xd8"):
+        return "image/jpeg"
+    if data.startswith(b"GIF87a") or data.startswith(b"GIF89a"):
+        return "image/gif"
+    if data.startswith(b"RIFF") and data[8:12] == b"WEBP":
+        return "image/webp"
+    return "application/octet-stream"
+def read_external_image(ref: str, base_dir: Path) -> tuple[bytes, str]:
+    if ref.startswith("data:"):
+        raise ValueError("already inline")
+    if re.match(r"^https?://", ref, re.IGNORECASE):
+        with urllib.request.urlopen(ref, timeout=30) as response:
+            data = response.read()
+            content_type = response.headers.get_content_type() or image_mime_from_ref(ref, data)
+            return data, content_type
+    path = Path(ref)
+    if not path.is_absolute():
+        path = base_dir / path
+    data = path.read_bytes()
+    return data, image_mime_from_ref(str(path), data)
+def inline_markdown_images(input_path: Path, output_path: Path | None = None) -> dict:
+    markdown = input_path.read_text(encoding="utf-8")
+    base_dir = input_path.parent
+    converted = 0
+    skipped: list[str] = []
+    def replace(match: re.Match[str]) -> str:
+        nonlocal converted
+        alt, ref = match.group(1), match.group(2)
+        try:
+            data, mime = read_external_image(ref, base_dir)
+        except Exception as exc:
+            skipped.append(f"{ref}: {exc}")
+            return match.group(0)
+        converted += 1
+        encoded = base64.b64encode(data).decode("ascii")
+        return f"![{alt}](data:{mime};base64,{encoded})"
+    output = IMAGE_MARKDOWN_RE.sub(replace, markdown)
+    target = output_path or input_path.with_name(input_path.stem + ".inline.md")
+    target.write_text(output, encoding="utf-8")
+    return {
+        "input": str(input_path),
+        "output": str(target),
+        "images_inlined": converted,
+        "skipped": skipped,
+    }
+def build_s3_config(config: dict, args: argparse.Namespace) -> S3ImageConfig | None:
+    if args.images != "s3":
+        return None
+    s3 = config_get(config, "s3", {}) or {}
+    bucket = args.s3_bucket or s3.get("bucket")
+    public_base_url = args.s3_public_base_url or s3.get("public_base_url")
+    if not bucket or not public_base_url:
+        raise ValueError("S3/R2 image mode requires bucket and public_base_url")
+    return S3ImageConfig(
+        bucket=bucket,
+        public_base_url=public_base_url,
+        prefix=args.s3_prefix if args.s3_prefix is not None else s3.get("prefix", ""),
+        endpoint_url=args.s3_endpoint_url or s3.get("endpoint_url"),
+        region_name=args.s3_region or s3.get("region_name"),
+        access_key_id=args.s3_access_key_id or s3.get("access_key_id"),
+        secret_access_key=args.s3_secret_access_key or s3.get("secret_access_key"),
+        cache_control=args.s3_cache_control or s3.get("cache_control", "public, max-age=31536000, immutable"),
+        acl=args.s3_acl if args.s3_acl is not None else s3.get("acl"),
+    )
+class BuildCorpusExporter:
+    def __init__(
+        self,
+        input_path: Path,
+        output_dir: Path,
+        equation_mode: str = "tex",
+        output_md: Path | None = None,
+        assets_dir: Path | None = None,
+        report_path: Path | None = None,
+        image_mode: str = "assets",
+        s3_config: S3ImageConfig | None = None,
+        emit_frontmatter: bool = True,
+    ):
+        self.input_path = input_path
+        self.output_dir = output_dir
+        self.emit_frontmatter = emit_frontmatter
+        self.output_md = output_md or (output_dir / (input_path.stem + ".md"))
+        self.assets_dir = assets_dir or (output_dir / "assets")
+        self.report_path = report_path or (output_dir / "export-report.json")
+        self.asset_ref_prefix = self.assets_dir.name
+        self.equation_mode = equation_mode
+        self.image_mode = image_mode
+        self.s3_config = s3_config
+        self.s3_uploader = S3ImageUploader(s3_config) if image_mode == "s3" and s3_config else None
+        self.stats = ExportStats()
+        self.rels: dict[str, str] = {}
+        self.media_map: dict[str, str] = {}
+        self.image_uploads: list[dict[str, str]] = []
+        self.equation_asset_map: dict[int, str] = {}
+        self.empty_equation_indexes: set[int] = set()
+        self.equation_index = 0
+        self.equation_samples: list[dict[str, str]] = []
+        self.table_depth = 0
+        self.pending_math_source: str | None = None
+        self.footnotes: dict[str, str] = {}
+        self.used_footnotes: list[str] = []
+        self.word_style_package_path: Path | None = None
+        self.word_style_manifest_path: Path | None = None
+    def export(self) -> dict:
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        if self.image_mode == "assets" or self.equation_mode == "image":
+            self.assets_dir.mkdir(parents=True, exist_ok=True)
+        prior_frontmatter: str | None = None
+        with self.open_input_zip() as zf:
+            self.rels = relationship_map(zf)
+            self.footnotes = self.load_footnotes(zf)
+            if self.emit_frontmatter:
+                prior_frontmatter = read_frontmatter_from_zip(zf)
+            self.word_style_package_path, self.word_style_manifest_path = self.write_word_style_package(zf)
+            self._copy_media(zf)
+            document_xml = zf.read("word/document.xml")
+            root = ET.fromstring(document_xml)
+            body = root.find("w:body", NS)
+            if body is None:
+                raise RuntimeError("word/document.xml has no w:body")
+            if self.equation_mode == "image":
+                self._render_equation_assets(root)
+            markdown = self.render_children(body, top_level=True).strip() + "\n"
+            markdown = self.add_footnote_definitions(markdown)
+        if self.emit_frontmatter:
+            markdown = add_mdk_frontmatter(markdown, self.input_path, prior_frontmatter)
+        self.output_md.parent.mkdir(parents=True, exist_ok=True)
+        self.output_md.write_text(markdown, encoding="utf-8")
+        report = {
+            "input": str(self.input_path),
+            "output": str(self.output_md),
+            "assets_dir": str(self.assets_dir) if self.assets_dir.exists() else None,
+            "image_mode": self.image_mode,
+            "image_uploads": self.image_uploads,
+            "stats": self.stats.__dict__,
+            "equation_samples": self.equation_samples[:50],
+            "word_style_package": str(self.word_style_package_path) if self.word_style_package_path else None,
+            "word_style_manifest": str(self.word_style_manifest_path) if self.word_style_manifest_path else None,
+        }
+        self.report_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
+        return report
+    def write_word_style_package(self, zf: ZipFile) -> tuple[Path | None, Path | None]:
+        sidecar = self.output_md.with_suffix(".wordstyle")
+        package_dir = sidecar / "package"
+        package_docx = sidecar / "style-package.docx"
+        manifest_path = sidecar / "manifest.json"
+        sidecar.mkdir(parents=True, exist_ok=True)
+        package_dir.mkdir(parents=True, exist_ok=True)
+        names = set(zf.namelist())
+        copied_parts: list[str] = []
+        with ZipFile(package_docx, "w") as package_zip:
+            for name in sorted((STYLE_PACKAGE_PARTS | STYLE_PACKAGE_REL_PARTS) & names):
+                data = zf.read(name)
+                package_zip.writestr(name, data)
+                target = package_dir / Path(name)
+                target.parent.mkdir(parents=True, exist_ok=True)
+                target.write_bytes(data)
+                copied_parts.append(name)
+        manifest = {
+            "source": str(self.input_path),
+            "stylePackage": str(package_docx),
+            "parts": copied_parts,
+            "roundTrip": "word-to-md-to-word",
+        }
+        manifest_path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
+        return package_docx, manifest_path
+    def add_roundtrip_metadata(self, markdown: str) -> str:
+        if not self.word_style_package_path or not self.word_style_manifest_path:
+            return markdown
+        package_ref = self.word_style_package_path.relative_to(self.output_md.parent).as_posix()
+        manifest_ref = self.word_style_manifest_path.relative_to(self.output_md.parent).as_posix()
+        metadata = (
+            f'<!-- build-corpus:word-style-package path="{package_ref}" manifest="{manifest_ref}" -->\n'
+            "<!-- build-corpus:metadata hidden=\"true\" -->\n\n"
+        )
+        return metadata + markdown
+    def add_footnote_definitions(self, markdown: str) -> str:
+        if not self.used_footnotes:
+            return markdown
+        lines = ["", ""]
+        for note_id in self.used_footnotes:
+            text = self.footnotes.get(note_id, "").strip()
+            if text:
+                lines.append(f"[^{note_id}]: {text}")
+        if len(lines) == 2:
+            return markdown
+        return markdown.rstrip() + "\n" + "\n".join(lines) + "\n"
+    def load_footnotes(self, zf: ZipFile) -> dict[str, str]:
+        if "word/footnotes.xml" not in zf.namelist():
+            return {}
+        root = ET.fromstring(zf.read("word/footnotes.xml"))
+        notes: dict[str, str] = {}
+        for note in root.findall("./w:footnote", NS):
+            note_id = attr(note, "w", "id")
+            note_type = attr(note, "w", "type")
+            if not note_id or note_type in {"separator", "continuationSeparator"}:
+                continue
+            rendered = self.render_children(note, top_level=True).strip()
+            if rendered:
+                notes[note_id] = rendered.replace("\n", " ")
+        return notes
+    @contextlib.contextmanager
+    def open_input_zip(self):
+        try:
+            with ZipFile(self.input_path) as zf:
+                yield zf
+                return
+        except PermissionError:
+            pass
+        with tempfile.TemporaryDirectory(prefix="build-corpus-input-") as tmp:
+            temp_input = Path(tmp) / self.input_path.name
+            self.copy_locked_input(temp_input)
+            self.stats.warnings.append(
+                f"Input file was locked; converted from temporary copy: {temp_input}"
+            )
+            with ZipFile(temp_input) as zf:
+                yield zf
+    def copy_locked_input(self, temp_input: Path) -> None:
+        try:
+            shutil.copyfile(self.input_path, temp_input)
+            return
+        except PermissionError:
+            if os.name != "nt":
+                raise
+        source = str(self.input_path).replace("'", "''")
+        target = str(temp_input).replace("'", "''")
+        command = f"Copy-Item -LiteralPath '{source}' -Destination '{target}' -Force"
+        result = subprocess.run(
+            ["powershell", "-NoProfile", "-Command", command],
+            capture_output=True,
+            text=True,
+        )
+        if result.returncode != 0:
+            message = result.stderr.strip() or result.stdout.strip() or "unknown error"
+            raise PermissionError(f"Could not copy locked input via PowerShell: {message}")
+    def _copy_media(self, zf: ZipFile) -> None:
+        for name in zf.namelist():
+            if not name.startswith("word/media/"):
+                continue
+            mime_type = mimetypes.guess_type(Path(name).name)[0] or "application/octet-stream"
+            if self.image_mode == "base64":
+                data = zf.read(name)
+                encoded = base64.b64encode(data).decode("ascii")
+                self.media_map[name] = f"data:{mime_type};base64,{encoded}"
+            elif self.image_mode == "s3":
+                if self.s3_uploader is None:
+                    raise RuntimeError("S3/R2 image mode needs s3_config")
+                data = zf.read(name)
+                upload = self.s3_uploader.upload(name, data, mime_type)
+                self.image_uploads.append(upload)
+                self.media_map[name] = upload["url"]
+            else:
+                target = self.assets_dir / Path(name).name
+                with zf.open(name) as src, target.open("wb") as dst:
+                    shutil.copyfileobj(src, dst)
+                self.media_map[name] = f"{self.asset_ref_prefix}/{target.name}"
+    def _render_equation_assets(self, document_root: ET.Element) -> None:
+        """Render every non-empty ``m:oMath`` equation to a PNG in the assets directory.
+        Each equation is written to a temporary .docx, opened in Word through
+        COM automation and saved as HTML; the PNG that Word writes next to the
+        HTML is copied to ``eq-<index>.png``. ``equation_asset_map`` maps the
+        1-based equation index to the asset reference. When python-docx or
+        pywin32 cannot be imported, a warning is recorded and nothing is rendered.
+        """
+        math_nodes = document_root.findall(".//m:oMath", NS)
+        if not math_nodes:
+            return
+        render_jobs = []
+        # Indexes are 1-based and count every oMath node, including empty ones.
+        for index, math_node in enumerate(math_nodes, 1):
+            if self.is_empty_equation(math_node):
+                self.empty_equation_indexes.add(index)
+                continue
+            render_jobs.append((index, math_node))
+        # Imported lazily so the optional dependencies are only needed in image mode.
+        try:
+            from docx import Document
+            from docx.oxml import parse_xml
+            import win32com.client as win32
+        except Exception as exc:
+            self.stats.warnings.append(f"Equation image rendering unavailable: {exc!r}")
+            return
+        # One equation per temporary document.
+        chunk_size = 1
+        word = win32.DispatchEx("Word.Application")
+        word.Visible = False
+        word.DisplayAlerts = 0
+        try:
+            for start in range(0, len(render_jobs), chunk_size):
+                chunk = render_jobs[start : start + chunk_size]
+                with tempfile.TemporaryDirectory(prefix="build-corpus-equations-") as tmp:
+                    tmp_dir = Path(tmp)
+                    temp_docx = tmp_dir / "equations.docx"
+                    temp_html = tmp_dir / "equations.html"
+                    doc = Document()
+                    for absolute_index, math_node in chunk:
+                        p = doc.add_paragraph(f"EQMARKER{absolute_index:06d} ")
+                        p._p.append(parse_xml(ET.tostring(math_node, encoding="unicode")))
+                    doc.save(temp_docx)
+                    opened = word.Documents.Open(str(temp_docx), ReadOnly=True, AddToRecentFiles=False)
+                    # NOTE(review): FileFormat=10 is presumably Word's filtered-HTML format — confirm.
+                    opened.SaveAs2(str(temp_html), FileFormat=10)
+                    opened.Close(False)
+                    # The rendered images are looked up in the "<stem>_files" folder next to the HTML.
+                    html_assets = temp_html.with_name(temp_html.stem + "_files")
+                    rendered = sorted(html_assets.glob("image*.png"))
+                    if len(rendered) != len(chunk):
+                        self.stats.warnings.append(
+                            f"Equation image count mismatch in render chunk {start + 1}-{start + len(chunk)}: "
+                            f"OMML={len(chunk)} rendered={len(rendered)}"
+                        )
+                    # zip() stops at the shorter sequence, so surplus equations or images are dropped.
+                    for (absolute_index, _math_node), source in zip(chunk, rendered):
+                        target = self.assets_dir / f"eq-{absolute_index:06d}.png"
+                        shutil.copyfile(source, target)
+                        self.equation_asset_map[absolute_index] = f"{self.asset_ref_prefix}/{target.name}"
+        finally:
+            # Always shut down the Word instance, even when rendering fails.
+            word.Quit()
+        self.stats.equation_images = len(self.equation_asset_map)
+        self.stats.skipped_empty_equations = len(self.empty_equation_indexes)
+    def render_children(self, node: ET.Element, top_level: bool = False) -> str:
+        parts: list[str] = []
+        for child in list(node):
+            rendered = self.render_block(child)
+            if not rendered:
+                continue
+            if top_level:
+                parts.append(rendered.rstrip())
+            else:
+                parts.append(rendered.strip())
+        sep = "\n\n" if top_level else "\n"
+        return sep.join(part for part in parts if part)
+    def render_block(self, node: ET.Element) -> str:
+        name = local_name(node.tag)
+        if name == "p":
+            return self.render_paragraph(node)
+        if name == "tbl":
+            return self.render_table(node)
+        if name == "sdt":
+            content = node.find("./w:sdtContent", NS)
+            return self.render_children(content, top_level=True) if content is not None else ""
+        if name in {"bookmarkStart", "bookmarkEnd", "sectPr", "proofErr", "permStart", "permEnd"}:
+            return ""
+        return self.render_children(node, top_level=False)
+    def render_paragraph(self, p: ET.Element) -> str:
+        """Render one paragraph as a Markdown block.
+        The first matching case wins, in this order: code block, heading,
+        block quote, list item, math paragraph, plain paragraph. Paragraphs
+        without visible inline content render as an empty string.
+        """
+        style = paragraph_style(p)
+        # Inline content is rendered up front, also for paragraphs that end up as code blocks.
+        content = self.render_inline_children(p)
+        if not content.strip():
+            return ""
+        content = strip_trailing_markdown_breaks(content)
+        # Outside tables a paragraph also counts as code when its runs are code-formatted.
+        if is_code_style(style) or (self.table_depth == 0 and paragraph_is_code(p)):
+            self.stats.code_blocks += 1
+            return self.render_code_paragraph(p)
+        level = heading_level(style)
+        if level:
+            self.stats.headings += 1
+            return f"{'#' * level} {self.strip_inline_markers(content)}"
+        if is_quote_style(style):
+            self.stats.paragraphs += 1
+            return f"> {content}"
+        # Numbering properties take precedence; the style name is the fallback.
+        num_info = paragraph_num_info(p)
+        if not num_info:
+            num_info = paragraph_list_style_info(style)
+        if num_info:
+            self.stats.lists += 1
+            list_level, ordered = num_info
+            indent = "  " * list_level
+            # Ordered items always use "1."; the numbering itself is not computed here.
+            bullet = "1." if ordered else "-"
+            return f"{indent}{bullet} {content}"
+        if paragraph_is_math(p):
+            # Math stays inline inside tables and when the paragraph has no display layout.
+            if self.table_depth > 0:
+                return content
+            if not paragraph_has_display_math_layout(p):
+                return content
+            inner = content.strip()
+            # Drop one pair of surrounding "$" before wrapping the content in "$$".
+            if inner.startswith("$") and inner.endswith("$") and len(inner) >= 2:
+                inner = inner[1:-1]
+            return f"$$\n{inner}\n$$"
+        self.stats.paragraphs += 1
+        return content
+    def render_code_paragraph(self, p: ET.Element) -> str:
+        info = ""
+        code_parts: list[str] = []
+        first_nonempty_seen = False
+        for run in p.findall("./w:r", NS):
+            raw = extract_run_text(run)
+            if not raw:
+                continue
+            if not first_nonempty_seen and run_is_bold(run):
+                info = raw.strip()
+                first_nonempty_seen = True
+                continue
+            first_nonempty_seen = True
+            code_parts.append(raw)
+        code = "".join(code_parts).strip("\n")
+        fence = f"```{info}".rstrip()
+        return f"{fence}\n{code}\n```"
+    def render_inline_children(self, node: ET.Element) -> str:
+        """Render the inline content of ``node`` (runs, hyperlinks, math, drawings).
+        Every child is turned into ``(style, text)`` segments which are then
+        merged by ``coalesce_inline_segments``.
+        """
+        segments: list[tuple[Optional[InlineStyle], str]] = []
+        for child in list(node):
+            name = local_name(child.tag)
+            if name == "r":
+                if run_is_hidden(child):
+                    # Hidden run text is kept as the source for the next equation (see render_math).
+                    source = extract_run_text(child).strip()
+                    if source:
+                        self.pending_math_source = source
+                    continue
+                segments.extend(self.render_run_segments(child))
+            elif name == "hyperlink":
+                label = self.render_inline_children(child).strip()
+                anchor = attr(child, "w", "anchor")
+                rid = attr(child, "r", "id")
+                # In-document anchors win over relationship targets.
+                url = f"#{anchor}" if anchor else self.rels.get(rid or "", "")
+                segments.append((None, f"[{label}]({url})" if url else label))
+            elif name == "oMath":
+                segments.append((None, self.render_math(child, display=False)))
+            elif name == "oMathPara":
+                segments.append((None, self.render_math(child, display=True)))
+            elif name == "drawing":
+                img = self.render_image(child)
+                if img:
+                    segments.append((None, img))
+            elif name in {"pPr", "rPr"}:
+                continue
+            else:
+                # Unknown wrappers are rendered through their own inline children.
+                segments.append((None, self.render_inline_children(child)))
+        return coalesce_inline_segments(segments)
+    def run_style(self, run: ET.Element) -> InlineStyle:
+        props = run.find("./w:rPr", NS)
+        if props is None:
+            return False, False, False
+        style_node = props.find("./w:rStyle", NS)
+        is_code = (
+            style_node is not None and "code" in (attr(style_node, "w", "val") or "").lower()
+        ) or run_is_code(run)
+        bold = props.find("./w:b", NS) is not None
+        italic = props.find("./w:i", NS) is not None
+        return is_code, bold, italic
+    def render_run_segments(self, run: ET.Element) -> list[tuple[Optional[InlineStyle], str]]:
+        """Convert one run into ``(style, text)`` segments.
+        Consecutive text-like children are buffered and emitted as a single
+        segment, wrapped in ``$...$`` for math runs. Drawings, footnote
+        references and unknown children flush the buffer first and are emitted
+        as unstyled segments. Hidden runs produce no segments.
+        """
+        if run_is_hidden(run):
+            return []
+        style = self.run_style(run)
+        is_math = run_is_math(run)
+        segments: list[tuple[Optional[InlineStyle], str]] = []
+        run_parts: list[str] = []
+        def flush_text() -> None:
+            # Emit the buffered text as one segment and reset the buffer.
+            if run_parts:
+                text = "".join(run_parts)
+                if is_math:
+                    segments.append((None, f"${text}$"))
+                else:
+                    segments.append((style, text))
+                run_parts.clear()
+        for child in list(run):
+            name = local_name(child.tag)
+            if name == "t":
+                # Math text is not Markdown-escaped.
+                run_parts.append(clean_text(child.text or "") if is_math else escape_md_text(child.text or ""))
+            elif name == "noBreakHyphen":
+                run_parts.append("\u2011")
+            elif name == "softHyphen":
+                run_parts.append("\u00ad")
+            elif name == "tab":
+                run_parts.append("\t")
+            elif name in {"br", "cr"}:
+                # Two trailing spaces plus a newline form a Markdown hard line break.
+                run_parts.append("  \n")
+            elif name == "drawing":
+                flush_text()
+                img = self.render_image(child)
+                if img:
+                    segments.append((None, img))
+            elif name == "footnoteReference":
+                flush_text()
+                note_id = attr(child, "w", "id")
+                # References to footnotes that were not loaded are dropped.
+                if note_id and note_id in self.footnotes:
+                    self.stats.footnotes += 1
+                    if note_id not in self.used_footnotes:
+                        self.used_footnotes.append(note_id)
+                    segments.append((None, f"[^{note_id}]"))
+            elif name == "rPr":
+                continue
+            else:
+                flush_text()
+                nested = self.render_inline_children(child)
+                if nested:
+                    segments.append((None, nested))
+        flush_text()
+        return segments
+    def render_math(self, node: ET.Element, display: bool) -> str:
+        """Render one equation node as TeX, an image reference or a fallback.
+        A pending hidden math source takes precedence over converting the
+        node. Empty equations render as an empty string. In image mode the
+        pre-rendered asset is referenced; in every other mode the OMML is
+        converted to TeX, and a failed conversion yields an inline
+        ``[equation: ...]`` code span.
+        """
+        self.stats.equations += 1
+        # The running index is matched against the indexes assigned in _render_equation_assets.
+        self.equation_index += 1
+        if self.pending_math_source:
+            # Consume the source captured from a preceding hidden run.
+            source = self.pending_math_source
+            self.pending_math_source = None
+            return normalize_tex(source, display=display)
+        if self.equation_index in self.empty_equation_indexes or self.is_empty_equation(node):
+            self.stats.skipped_empty_equations = max(
+                self.stats.skipped_empty_equations,
+                len(self.empty_equation_indexes),
+            )
+            return ""
+        if self.equation_mode == "image":
+            asset = self.equation_asset_map.get(self.equation_index)
+            if asset:
+                alt = f"equation {self.equation_index}"
+                rendered = f"![{alt}]({asset})"
+                # Display equations get their own lines, except inside tables.
+                return f"\n{rendered}\n" if display and self.table_depth == 0 else rendered
+            self.stats.warnings.append(f"Missing rendered equation asset for equation {self.equation_index}")
+            return self.render_missing_equation_fallback(node)
+        try:
+            # The node is re-parsed from its serialized form before conversion.
+            tex = convert_omml(ET.fromstring(ET.tostring(node, encoding="unicode")))
+            render_display = display and self.table_depth == 0
+            rendered = normalize_tex(tex, display=render_display)
+            # Keep at most 50 truncated samples for the export report.
+            if len(self.equation_samples) < 50:
+                self.equation_samples.append({
+                    "source": "".join(t.text or "" for t in node.findall(".//m:t", NS))[:220],
+                    "tex": rendered[:500],
+                })
+            return f"\n{rendered}\n" if render_display else rendered
+        except Exception as exc:
+            # Best effort: record the failure and emit the equation's raw text.
+            self.stats.equation_errors += 1
+            self.stats.warnings.append(f"Equation conversion failed: {exc!r}")
+            fallback = "".join(t.text or "" for t in node.findall(".//m:t", NS))
+            return f"`[equation: {fallback}]`"
+    def render_missing_equation_fallback(self, node: ET.Element) -> str:
+        try:
+            tex = convert_omml(ET.fromstring(ET.tostring(node, encoding="unicode")))
+            return normalize_tex(tex, display=False)
+        except Exception:
+            fallback = "".join(t.text or "" for t in node.findall(".//m:t", NS))
+            return f"`[equation: {fallback}]`"
+    @staticmethod
+    def is_empty_equation(node: ET.Element) -> bool:
+        text = "".join(t.text or "" for t in node.findall(".//m:t", NS)).strip()
+        return not text
+    def render_image(self, node: ET.Element) -> str:
+        """Render the images referenced by a drawing as Markdown image links.
+        Every ``a:blip`` relationship is resolved to a media reference from
+        ``media_map``; unresolved references add a warning and are skipped.
+        Multiple images are joined with single spaces.
+        """
+        preferred_name = image_metadata_filename(node)
+        refs = []
+        for blip in node.findall(".//a:blip", NS):
+            # Embedded images take precedence over linked ones.
+            rid = attr(blip, "r", "embed") or attr(blip, "r", "link")
+            if rid:
+                refs.append(rid)
+        rendered = []
+        for rid in refs:
+            target = self.rels.get(rid, rid)
+            source = resolve_image_target(target)
+            asset = self.media_map.get(source)
+            if not asset:
+                self.stats.warnings.append(f"Image relationship not copied: {rid} -> {target}")
+                continue
+            if preferred_name and self.image_mode == "assets":
+                # Expose the asset under the file name found in the drawing's metadata.
+                current_path = self.output_dir / asset
+                preferred_path = self.assets_dir / preferred_name
+                if current_path.exists() and preferred_path != current_path and not preferred_path.exists():
+                    shutil.copyfile(current_path, preferred_path)
+                # NOTE(review): the reference switches to the preferred name even when no
+                # copy was made above — confirm this is intended when that file already exists.
+                asset = f"{self.asset_ref_prefix}/{preferred_path.name}"
+            self.stats.images += 1
+            rendered.append(f"![image]({asset})")
+        return " ".join(rendered)
+    def render_table(self, tbl: ET.Element) -> str:
+        self.stats.tables += 1
+        self.table_depth += 1
+        try:
+            rows = tbl.findall("./w:tr", NS)
+            rendered_rows = []
+            complex_table = False
+            for row in rows:
+                cells = row.findall("./w:tc", NS)
+                rendered_cells = []
+                for cell in cells:
+                    if cell.find(".//w:tbl", NS) is not None:
+                        complex_table = True
+                    cell_text = self.render_cell(cell)
+                    if "\n\n" in cell_text:
+                        complex_table = True
+                    rendered_cells.append(cell_text)
+                rendered_rows.append(rendered_cells)
+        finally:
+            self.table_depth -= 1
+        if not rendered_rows:
+            return ""
+        if complex_table:
+            self.stats.html_tables += 1
+            return self.render_html_table(rendered_rows)
+        self.stats.markdown_tables += 1
+        return self.render_markdown_table(rendered_rows)
+    def render_cell(self, cell: ET.Element) -> str:
+        parts = []
+        for child in list(cell):
+            if local_name(child.tag) == "tcPr":
+                continue
+            rendered = self.render_block(child)
+            if rendered:
+                parts.append(rendered.strip())
+        return "<br>".join(parts).strip()
+    def render_markdown_table(self, rows: list[list[str]]) -> str:
+        width = max(len(row) for row in rows)
+        padded = [row + [""] * (width - len(row)) for row in rows]
+        def clean_cell(value: str) -> str:
+            return value.replace("\n", "<br>").replace("|", "\\|").strip()
+        lines = []
+        lines.append("| " + " | ".join(clean_cell(v) for v in padded[0]) + " |")
+        lines.append("| " + " | ".join("---" for _ in range(width)) + " |")
+        for row in padded[1:]:
+            lines.append("| " + " | ".join(clean_cell(v) for v in row) + " |")
+        return "\n".join(lines)
+    def render_html_table(self, rows: list[list[str]]) -> str:
+        html_rows = ["<table>"]
+        for row in rows:
+            html_rows.append("  <tr>")
+            for cell in row:
+                # Keep inline Markdown-ish math readable inside HTML fallback.
+                html_rows.append(f"    <td>{html.escape(cell, quote=False).replace(chr(10), '<br>')}</td>")
+            html_rows.append("  </tr>")
+        html_rows.append("</table>")
+        return "\n".join(html_rows)
+    @staticmethod
+    def strip_inline_markers(text: str) -> str:
+        return text.replace("\n", " ").strip()
+def export_one(
+    input_path: Path,
+    output_root: Path,
+    equation_mode: str,
+    out_same_dir: bool,
+    image_mode: str,
+    s3_config: S3ImageConfig | None = None,
+    emit_frontmatter: bool = True,
+) -> dict:
+    if out_same_dir:
+        output_dir = input_path.parent
+        output_md = input_path.with_suffix(".md")
+        assets_dir = input_path.with_name(input_path.stem + ".assets")
+        report_path = input_path.with_name(input_path.stem + ".export-report.json")
+    else:
+        output_dir = output_root / input_path.stem
+        output_md = None
+        assets_dir = None
+        report_path = None
+    exporter = BuildCorpusExporter(
+        input_path,
+        output_dir,
+        equation_mode=equation_mode,
+        output_md=output_md,
+        assets_dir=assets_dir,
+        report_path=report_path,
+        image_mode=image_mode,
+        s3_config=s3_config,
+        emit_frontmatter=emit_frontmatter,
+    )
+    return exporter.export()
+def collect_inputs(path: Path, target: str) -> list[Path]:
+    if path.is_file():
+        return [path]
+    inputs: list[Path] = []
+    patterns = ("*.md",) if target == "word" else ("*.docx", "*.pptx", "*.ppt")
+    for ext in patterns:
+        inputs.extend(path.rglob(ext))
+    return sorted(inputs)
+def collect_many_inputs(paths: list[Path], target: str) -> list[Path]:
+    inputs: list[Path] = []
+    for path in paths:
+        inputs.extend(collect_inputs(path, target))
+    unique: dict[str, Path] = {}
+    for path in inputs:
+        unique[str(path.resolve())] = path
+    return sorted(unique.values())
+def move_processed_source(input_path: Path) -> Path:
+    source_dir = input_path.parent / "_originals"
+    source_dir.mkdir(exist_ok=True)
+    target = source_dir / input_path.name
+    if target.exists():
+        target = source_dir / f"{input_path.stem}-{hashlib.sha1(str(input_path).encode('utf-8')).hexdigest()[:8]}{input_path.suffix}"
+    shutil.move(str(input_path), str(target))
+    return target
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description="Convert Markdown to DOCX or DOCX/PPTX/PPT to Markdown.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""examples:
+  build-corpus input.docx --out out
+  build-corpus input.md --to word --out out
+  build-corpus ./word-files --out ./markdown
+  build-corpus ./word-files --out-same-dir
+  build-corpus input.docx --images base64
+  build-corpus input.docx --images s3 --config build-corpus.config.json
+image modes:
+  assets   copy images into an assets folder and reference them from Markdown
+  base64   embed images directly as Markdown data URIs
+  s3       upload images to S3-compatible storage such as Cloudflare R2 or AWS S3
+equation modes:
+  tex      convert Word OMML equations to KaTeX-readable TeX
+  image    render equations as images for visual debugging only
+""",
+    )
+    parser.add_argument("input", type=Path, nargs="+", help="Markdown, DOCX, PPTX, or PPT file/directory; multiple paths are allowed")
+    parser.add_argument("--config", type=Path, help="JSON config file with conversion, output, and S3/R2 defaults")
+    parser.add_argument("--out", type=Path, help="Output directory for converted Markdown tree")
+    parser.add_argument("--to", choices=["auto", "markdown", "word"], help="Output target; auto infers from a single-file input")
+    parser.add_argument("--equations", choices=["tex", "image"], help="Equation output mode; default comes from config or tex")
+    parser.add_argument("--images", choices=["assets", "base64", "s3"], help="Image output mode; default comes from config or assets")
+    parser.add_argument("--out-same-dir", action="store_true", help="Write .md, .assets, and reports beside each source DOCX")
+    parser.add_argument("--word-template", type=Path, help="Optional .docx or .dotx template used for Markdown to Word exports")
+    parser.add_argument("--s3-bucket", help="S3/R2 bucket name for --images s3")
+    parser.add_argument("--s3-public-base-url", help="Public URL base used in Markdown, e.g. https://assets.example.com")
+    parser.add_argument("--s3-prefix", help="Object key prefix for uploaded images")
+    parser.add_argument("--s3-endpoint-url", help="S3-compatible endpoint, required for Cloudflare R2")
+    parser.add_argument("--s3-region", help="S3 region; use auto for Cloudflare R2")
+    parser.add_argument("--s3-access-key-id", help="S3/R2 access key id; can also come from config/env expansion")
+    parser.add_argument("--s3-secret-access-key", help="S3/R2 secret access key; can also come from config/env expansion")
+    parser.add_argument("--s3-cache-control", help="Cache-Control header for uploaded images")
+    parser.add_argument("--s3-acl", help="Optional ACL for AWS S3; usually omitted for Cloudflare R2")
+    parser.add_argument("--move-sources", action="store_true", help="After successful DOCX/PPT conversion, move processed source files into an _originals folder beside each file")
+    parser.add_argument("--inline-images", action="store_true", help="Create Markdown with local or HTTP image references embedded as data URIs")
+    parser.add_argument("--no-frontmatter", action="store_true", help="Do not emit MDK YAML frontmatter on generated Markdown (frontmatter is emitted by default and round-trips through docProps/custom.xml)")
+    args = parser.parse_args()
+    config = load_config(args.config)
+    args.out = args.out or Path(config_get(config, "output.out", ".codex/build-corpus/out"))
+    args.to = args.to or config_get(config, "conversion.target", "auto")
+    args.equations = args.equations or config_get(config, "conversion.equations", "tex")
+    args.images = args.images or config_get(config, "conversion.images", "assets")
+    args.out_same_dir = args.out_same_dir or bool(config_get(config, "output.out_same_dir", False))
+    args.word_template = args.word_template or (
+        Path(config_get(config, "word.template")) if config_get(config, "word.template") else None
+    )
+    s3_config = build_s3_config(config, args)
+    emit_frontmatter = not args.no_frontmatter
+    input_target = args.to
+    single_input = len(args.input) == 1
+    first_input = args.input[0]
+    if args.inline_images:
+        input_target = "inline-images"
+    elif single_input and first_input.is_file() and args.to == "auto":
+        input_target = "word" if first_input.suffix.lower() == ".md" else "markdown"
+    elif args.to == "auto":
+        input_target = "markdown"
+    reports = []
+    if input_target == "inline-images":
+        for input_path in collect_many_inputs(args.input, "word"):
+            if input_path.suffix.lower() in {".md", ".markdown"}:
+                reports.append(inline_markdown_images(input_path))
+        batch_report_root = first_input.parent if single_input and first_input.is_file() else args.out
+        batch_report_root.mkdir(parents=True, exist_ok=True)
+        batch_report = batch_report_root / "build-corpus-inline-report.json"
+        batch_report.write_text(json.dumps(reports, indent=2, ensure_ascii=False), encoding="utf-8")
+        print(json.dumps({
+            "converted": len(reports),
+            "batch_report": str(batch_report),
+            "outputs": [report["output"] for report in reports],
+        }, indent=2))
+        return
+    for input_path in collect_many_inputs(args.input, input_target):
+        if input_path.name.startswith("~$"):
+            continue
+        suffix = input_path.suffix.lower()
+        report = None
+        if input_target == "word" or suffix == ".md":
+            report = export_markdown_to_docx(
+                input_path,
+                args.out,
+                args.out_same_dir,
+                template_path=args.word_template,
+            )
+        elif suffix in {".pptx", ".ppt"}:
+            report = export_presentation(
+                input_path,
+                args.out,
+                args.out_same_dir,
+                image_mode=args.images,
+                emit_frontmatter=emit_frontmatter,
+            )
+        else:
+            report = export_one(
+                input_path,
+                args.out,
+                equation_mode=args.equations,
+                out_same_dir=args.out_same_dir,
+                image_mode=args.images,
+                s3_config=s3_config,
+                emit_frontmatter=emit_frontmatter,
+            )
+        if args.move_sources and suffix in {".docx", ".pptx", ".ppt"}:
+            report["moved_source"] = str(move_processed_source(input_path))
+        reports.append(report)
+    batch_report_root = first_input if args.out_same_dir and single_input and first_input.is_dir() else args.out
+    batch_report_root.mkdir(parents=True, exist_ok=True)
+    batch_report = batch_report_root / "build-corpus-batch-report.json"
+    batch_report.write_text(json.dumps(reports, indent=2, ensure_ascii=False), encoding="utf-8")
+    fidelity_failures = [report["output"] for report in reports if report.get("fidelity_ok") is False]
+    print(json.dumps({
+        "converted": len(reports),
+        "batch_report": str(batch_report),
+        "outputs": [report["output"] for report in reports],
+        "all_fidelity_ok": len(fidelity_failures) == 0,
+        "fidelity_failures": fidelity_failures,
+        "default_word_template": str(args.word_template or resolve_default_template_path() or "bundled:md-to-word-template.dotx"),
+    }, indent=2))
+# Run the command-line interface only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()