PyPI - python-hwpx - Versions diffs - 2.10.1__py3-none-any.whl → 2.10.2__py3-none-any.whl - Mend

python-hwpx 2.10.1py3-none-any.whl → 2.10.2py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

hwpx/document.py CHANGED Viewed

@@ -1472,6 +1472,14 @@ class HwpxDocument:
         from .tools.exporter import export_markdown
         return export_markdown(self, **kwargs)  # type: ignore[arg-type]
+    def export_rich_markdown(self, **kwargs: object) -> str:
+        """Render this document as rich Markdown.
+        Delegates to :func:`~hwpx.tools.markdown_export.export_markdown`, passing
+        this document as the source and forwarding every keyword argument unchanged.
+        """
+        from .tools import markdown_export as _rich_module
+        return _rich_module.export_markdown(self, **kwargs)  # type: ignore[arg-type]
     # ------------------------------------------------------------------
     # Validation
     # ------------------------------------------------------------------

hwpx/oxml/document.py CHANGED Viewed

@@ -1872,6 +1872,68 @@ class HwpxOxmlNote:
         t.text = _sanitize_text(value)
         self.paragraph.section.mark_dirty()
+    @property
+    def body_paragraph(self) -> "HwpxOxmlParagraph":
+        """Wrap the note's first descendant ``<hp:p>`` as :class:`HwpxOxmlParagraph`.
+        This is the paragraph holding the note body, not :attr:`paragraph`
+        (the hosting paragraph the note is attached to). The wrapper shares
+        the hosting paragraph's section.
+        >>> note = para.add_footnote("base ")
+        >>> note.add_run("blue", char_pr_id_ref=5)
+        Raises :class:`ValueError` when the note element contains no ``<hp:p>``.
+        """
+        body = self.element.find(f".//{_HP}p")
+        if body is not None:
+            return HwpxOxmlParagraph(body, self.paragraph.section)
+        raise ValueError("note has no body paragraph element")
+    def add_run(
+        self,
+        text: str = "",
+        *,
+        char_pr_id_ref: str | int | None = None,
+        bold: bool = False,
+        italic: bool = False,
+        underline: bool = False,
+        color: str | None = None,
+        font: str | None = None,
+        size: int | float | None = None,
+        highlight: str | None = None,
+        strike: bool | None = None,
+        attributes: dict[str, str] | None = None,
+    ) -> "HwpxOxmlRun":
+        """Append a run to the note body by calling ``body_paragraph.add_run``.
+        Every argument is forwarded unchanged; the run created by the body
+        paragraph is returned.
+        """
+        formatting = {
+            "char_pr_id_ref": char_pr_id_ref,
+            "bold": bold,
+            "italic": italic,
+            "underline": underline,
+            "color": color,
+            "font": font,
+            "size": size,
+            "highlight": highlight,
+            "strike": strike,
+            "attributes": attributes,
+        }
+        target = self.body_paragraph
+        return target.add_run(text, **formatting)
+    def add_hyperlink(
+        self,
+        url: str,
+        display_text: str,
+        *,
+        char_pr_id_ref: str | int | None = None,
+    ) -> "HwpxOxmlInlineObject":
+        """Append a hyperlink to the note body.
+        Thin wrapper that forwards ``url``, ``display_text`` and
+        ``char_pr_id_ref`` to ``body_paragraph.add_hyperlink`` and returns its result.
+        """
+        target = self.body_paragraph
+        return target.add_hyperlink(url, display_text, char_pr_id_ref=char_pr_id_ref)
 def _default_sublist_attributes() -> dict[str, str]:
     """Return standard attributes for a ``<hp:subList>`` element.
@@ -2425,6 +2487,9 @@ class HwpxOxmlTableCell:
     @property
     def text(self) -> str:
+        paragraphs = self.paragraphs
+        if paragraphs:
+            return "\n".join(paragraph.text or "" for paragraph in paragraphs)
         parts: list[str] = []
         for t_elem in self.element.findall(f".//{_HP}t"):
             if t_elem.text:
@@ -2433,8 +2498,79 @@ class HwpxOxmlTableCell:
     @text.setter
     def text(self, value: str) -> None:
+        """Replace the cell text; same as calling ``set_text(value)`` with its default options."""
+        self.set_text(value)
+    def _first_run_char_pr_id_ref(self) -> str:
+        """Return the first non-``None`` run ``char_pr_id_ref`` in the cell as a string, or ``"0"``."""
+        for paragraph in self.paragraphs:
+            found = next(
+                (run for run in paragraph.runs if run.char_pr_id_ref is not None),
+                None,
+            )
+            if found is not None:
+                return str(found.char_pr_id_ref)
+        return "0"
+    def _paragraph_format_attrs(self, paragraph: "HwpxOxmlParagraph | None" = None) -> dict[str, str]:
+        """Build the attribute dict for a new cell paragraph.
+        Starts from ``_default_cell_paragraph_attributes()``, copies the
+        formatting-related attributes that are present on ``paragraph`` (if one
+        is given), and always assigns a fresh ``id`` from ``_paragraph_id()``.
+        """
+        # The whole union is quoted: ``"X" | None`` is a ``str | None`` expression
+        # and raises TypeError whenever the annotation is actually evaluated.
+        source = paragraph.element if paragraph is not None else None
+        attrs = dict(_default_cell_paragraph_attributes())
+        if source is not None:
+            for key in ("paraPrIDRef", "styleIDRef", "pageBreak", "columnBreak", "merged"):
+                value = source.get(key)
+                if value is not None:
+                    attrs[key] = value
+        # Never reuse the source paragraph's id.
+        attrs["id"] = _paragraph_id()
+        return attrs
+    def _run_char_pr_for_line(self, paragraphs: Sequence["HwpxOxmlParagraph"], index: int) -> str:
+        """Pick the ``charPrIDRef`` for the line at ``index``.
+        Uses the first non-``None`` run reference of ``paragraphs[index]`` when
+        that paragraph exists; otherwise falls back to
+        ``_first_run_char_pr_id_ref()``.
+        """
+        candidate_runs = paragraphs[index].runs if index < len(paragraphs) else ()
+        for run in candidate_runs:
+            ref = run.char_pr_id_ref
+            if ref is not None:
+                return str(ref)
+        return self._first_run_char_pr_id_ref()
+    def _set_split_paragraph_text(self, value: str) -> None:
+        """Rebuild the cell so that each line of ``value`` becomes its own paragraph.
+        ``\r\n`` and ``\r`` are normalised to ``\n`` before splitting. All
+        direct ``<hp:p>`` children of the sublist are removed and one new
+        paragraph with a single run is appended per line. Paragraph attributes
+        and the run ``charPrIDRef`` are taken from the previous paragraph at the
+        same index when there is one.
+        """
+        sublist = self._ensure_sublist()
+        # Captured before the old paragraphs are removed so their formatting
+        # can still be read below.
+        # NOTE(review): assumes ``self.paragraphs`` returns a snapshot — confirm.
+        existing = self.paragraphs
+        # str.split always returns at least one item, so a falsy value gives [""].
+        lines = (value or "").replace("\r\n", "\n").replace("\r", "\n").split("\n")
+        for paragraph in list(sublist.findall(f"{_HP}p")):
+            sublist.remove(paragraph)
+        for index, line in enumerate(lines):
+            if index < len(existing):
+                source = existing[index]
+            elif existing:
+                # More lines than old paragraphs: reuse the first one's formatting.
+                source = existing[0]
+            else:
+                source = None
+            paragraph = _append_child(sublist, f"{_HP}p", self._paragraph_format_attrs(source))
+            run = _append_child(
+                paragraph,
+                f"{_HP}run",
+                {"charPrIDRef": self._run_char_pr_for_line(existing, index)},
+            )
+            _append_text_with_tabs(run, line)
+    def set_text(
+        self,
+        value: str,
+        *,
+        preserve_format: bool = True,
+        split_paragraphs: bool = False,
+    ) -> None:
+        """Set the cell text and mark the cell and its table dirty.
+        With ``split_paragraphs=True`` the cell is rebuilt with one paragraph
+        per line via ``_set_split_paragraph_text`` and ``preserve_format`` is
+        not consulted. Otherwise the sanitized text is written to the element
+        returned by ``_ensure_text_element()`` and every other ``<hp:t>`` in
+        the cell is emptied. With ``preserve_format=False`` the enclosing run's
+        ``charPrIDRef`` is reset to ``"0"``.
+        """
+        if split_paragraphs:
+            self._set_split_paragraph_text(value)
+            self.element.set("dirty", "1")
+            self.table.mark_dirty()
+            return
         text_element = self._ensure_text_element()
         text_element.text = _sanitize_text(value)
+        # Clear leftover text in all other <hp:t> nodes so the cell shows only ``value``.
+        for node in self.element.findall(f".//{_HP}t"):
+            if node is text_element:
+                continue
+            if node.text:
+                node.text = ""
+        if not preserve_format:
+            # Walk up from the text element to its enclosing run.
+            # NOTE(review): elements without ``getparent`` end the walk at once,
+            # so the reset is silently skipped for them — confirm this is intended.
+            run = text_element
+            while run is not None and _element_local_name(run) != "run":
+                run = run.getparent() if hasattr(run, "getparent") else None
+            if run is not None:
+                run.set("charPrIDRef", "0")
         self.element.set("dirty", "1")
         self.table.mark_dirty()
@@ -2898,6 +3034,8 @@ class HwpxOxmlTable:
         *,
         logical: bool = False,
         split_merged: bool = False,
+        preserve_format: bool = True,
+        split_paragraphs: bool = False,
     ) -> None:
         if logical:
             entry = self._grid_entry(row_index, col_index)
@@ -2907,7 +3045,11 @@ class HwpxOxmlTable:
                 cell = entry.cell
         else:
             cell = self.cell(row_index, col_index)
-        cell.text = text
+        cell.set_text(
+            text,
+            preserve_format=preserve_format,
+            split_paragraphs=split_paragraphs,
+        )
     def split_merged_cell(
         self, row_index: int, col_index: int
@@ -3797,7 +3939,10 @@ class HwpxOxmlParagraph:
         sublist = _append_child(note_element, f"{_HP}subList", _default_sublist_attributes())
         p_attrs = {"id": _paragraph_id(), **_DEFAULT_PARAGRAPH_ATTRS}
         paragraph = _append_child(sublist, f"{_HP}p", p_attrs)
-        note_run = _append_child(paragraph, f"{_HP}run", {"charPrIDRef": "0"})
+        # 본문 run의 charPrIDRef도 인자를 따라가도록 적용 (host run과 동일 스타일).
+        # None이면 "0"(default).
+        body_cpr = "0" if char_pr_id_ref is None else str(char_pr_id_ref)
+        note_run = _append_child(paragraph, f"{_HP}run", {"charPrIDRef": body_cpr})
         t = _append_child(note_run, f"{_HP}t", {})
         t.text = _sanitize_text(text)
         self.section.mark_dirty()

hwpx/tools/markdown_export.py ADDED Viewed

@@ -0,0 +1,488 @@
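+"""Rich HWPX → Markdown converter.
+Preserves:
+- Inline formatting (bold/italic/color/shade) via run charPrIDRef diff
+- Merged table cells (colspan/rowspan) via HTML
+- Nested tables, rendered recursively as HTML
+- Paragraphs inside shapes (rect/ellipse/polygon)
+- Images (BinData → ![image](path))
+- Headings (Ⅰ. / 1. patterns)
+- Footnotes/endnotes at their exact position, with sequential fn1/en1 numbering and inline formatting in the note body
+- Hyperlinks as [text](url) (tracked via fieldBegin/fieldEnd)
+"""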
+from __future__ import annotations
+import re
+from html import escape as html_escape
+from pathlib import Path
+from typing import Union
+from ..document import HwpxDocument
+from ..oxml.namespaces import tag_local_name
+# Only rect/ellipse/polygon shapes are traversed. drawText/container are
+# children of these, so traversing them separately would process the same
+# paragraph twice.
+SHAPE_TAGS = ("rect", "ellipse", "polygon")
+# Heading candidates: a Roman-numeral character followed by "." ...
+ROMAN_HEAD = re.compile(r"^\s*[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ]\.\s*.+")
+# ... or digits, ".", whitespace, then a Hangul syllable or ASCII letter.
+ARABIC_HEAD = re.compile(r"^\s*\d+\.\s+[가-힣A-Za-z].+")
+# ──────────────────────────────────────────────────────────────────
+# 인라인 서식
+# ──────────────────────────────────────────────────────────────────
+def _local_name(element) -> str:
+    """Return ``tag_local_name`` applied to the element's tag, coerced to ``str`` first."""
+    raw_tag = str(element.tag)
+    return tag_local_name(raw_tag)
+def _direct_children(element, local_name: str):
+    """Return the immediate children of ``element`` whose local tag name equals ``local_name``."""
+    matches = []
+    for child in list(element):
+        if _local_name(child) == local_name:
+            matches.append(child)
+    return matches
+def _descendants(element, local_name: str):
+    """List every descendant (excluding ``element`` itself) named ``local_name``, in ``iter()`` order."""
+    found = []
+    for node in element.iter():
+        if node is element:
+            continue
+        if _local_name(node) == local_name:
+            found.append(node)
+    return found
+def _first_descendant(element, local_name: str):
+    """Return the first descendant named ``local_name`` in ``iter()`` order, or ``None``."""
+    return next(
+        (
+            node
+            for node in element.iter()
+            if node is not element and _local_name(node) == local_name
+        ),
+        None,
+    )
+def _has_descendant(element, local_name: str) -> bool:
+    """Tell whether ``element`` has at least one descendant named ``local_name``."""
+    match = _first_descendant(element, local_name)
+    return match is not None
+def _escape_markdown_text(text: str) -> str:
+    """HTML-escape ``text`` (quotes left alone), then backslash-escape Markdown specials.
+    The escaped characters are backslash, backtick, ``*``, ``[``, ``]`` and ``|``.
+    """
+    # One translate pass gives the same result as chained replaces that
+    # handle the backslash first: every source character is escaped once.
+    specials = str.maketrans({char: "\\" + char for char in "\\`*[]|"})
+    return html_escape(text, quote=False).translate(specials)
+def _diff_style(cp, base_cp) -> dict:
+    """Describe how character property ``cp`` differs visually from ``base_cp``.
+    Returns ``{}`` when ``cp`` is ``None``. Otherwise returns a dict with
+    boolean ``bold``/``italic``/``underline``/``strike`` flags (true only when
+    set on ``cp`` and not on the base) and ``color``/``shade`` values
+    (``None`` when not considered changed). A missing ``base_cp`` is treated
+    as having no attributes.
+    """
+    if cp is None:
+        return {}
+    child_attrs, attrs = cp.child_attributes, cp.attributes
+    if base_cp is not None:
+        base_child_attrs, base_attrs = base_cp.child_attributes, base_cp.attributes
+    else:
+        base_child_attrs, base_attrs = {}, {}
+    def _switched_on(tag: str, attr: str) -> bool:
+        # Active on cp (value other than "NONE") while inactive on the base.
+        return (
+            child_attrs.get(tag, {}).get(attr, "NONE") != "NONE"
+            and base_child_attrs.get(tag, {}).get(attr, "NONE") == "NONE"
+        )
+    text_color = attrs.get("textColor", "#000000")
+    # Black and white never count as a colour change; white is assumed to be
+    # a design effect on a dark background with no semantic meaning.
+    color_changed = (
+        text_color != base_attrs.get("textColor", "#000000")
+        and text_color.upper() not in ("#000000", "#FFFFFF")
+    )
+    shade_color = attrs.get("shadeColor", "none")
+    base_shade_color = base_attrs.get("shadeColor", "none")
+    shade_changed = shade_color.lower() not in ("none", "", base_shade_color.lower())
+    return {
+        "bold": "bold" in child_attrs and "bold" not in base_child_attrs,
+        "italic": "italic" in child_attrs and "italic" not in base_child_attrs,
+        "underline": _switched_on("underline", "type"),
+        "strike": _switched_on("strikeout", "shape"),
+        "color": text_color if color_changed else None,
+        "shade": shade_color if shade_changed else None,
+    }
+def _wrap(text: str, style: dict) -> str:
+    """Wrap already-escaped ``text`` in the markup described by ``style``.
+    Wrappers are applied innermost to outermost: shade (``<mark>``), color
+    (``<span>``), underline (``<u>``), strike (``~~``), italic (``*``), bold
+    (``**``). Empty text yields an empty string so no empty wrappers appear.
+    """
+    if not text:
+        return ""
+    out = text
+    # Color and shade values come from the document, so escape them before
+    # placing them inside a quoted HTML attribute.
+    shade = style.get("shade")
+    if shade:
+        out = f'<mark style="background-color:{html_escape(str(shade), quote=True)}">{out}</mark>'
+    color = style.get("color")
+    if color:
+        out = f'<span style="color:{html_escape(str(color), quote=True)}">{out}</span>'
+    if style.get("underline"):
+        out = f"<u>{out}</u>"
+    if style.get("strike"):
+        out = f"~~{out}~~"
+    if style.get("italic"):
+        out = f"*{out}*"
+    if style.get("bold"):
+        out = f"**{out}**"
+    return out
+def _style_key(style: dict) -> tuple:
+    """Return a hashable, order-independent key made of the truthy ``(name, value)`` pairs."""
+    active = [(name, value) for name, value in style.items() if value]
+    active.sort()
+    return tuple(active)
+def _render_runs(items, base_cp, chars) -> str:
+    """Render a sequence of ``(cpr_id, text)`` pairs as Markdown.
+    Empty texts are skipped. Each text is escaped, adjacent pieces with the
+    same style key are merged, and each merged group is wrapped once.
+    """
+    keys: list[tuple] = []
+    texts: list[str] = []
+    for cpr, text in items:
+        if not text:
+            continue
+        char_pr = chars.get(str(cpr), base_cp)
+        key = _style_key(_diff_style(char_pr, base_cp))
+        escaped = _escape_markdown_text(text)
+        if keys and keys[-1] == key:
+            # Same formatting as the previous piece: extend that group.
+            texts[-1] += escaped
+        else:
+            keys.append(key)
+            texts.append(escaped)
+    return "".join(_wrap(merged, dict(key)) for key, merged in zip(keys, texts))
+# ──────────────────────────────────────────────────────────────────
+# 이미지 매핑
+# ──────────────────────────────────────────────────────────────────
+def _build_image_map(
+    doc: HwpxDocument,
+    image_dir: Path | None,
+    image_ref_prefix: str | None,
+) -> dict[str, str]:
+    """Extract ``BinData/*`` package entries to ``image_dir`` and map them to Markdown paths.
+    Returns ``{file stem → relative path}``. When ``image_dir`` is ``None``
+    nothing is written and an empty dict is returned, so no image markers
+    are emitted. ``image_ref_prefix`` defaults to the directory's basename;
+    a prefix that already ends in ``/`` is joined without a second slash.
+    """
+    if image_dir is None:
+        return {}
+    image_dir = Path(image_dir)
+    image_dir.mkdir(parents=True, exist_ok=True)
+    prefix = image_ref_prefix if image_ref_prefix is not None else image_dir.name
+    # Avoid "images//x.png" when the caller passes a prefix such as "images/".
+    if prefix and not prefix.endswith("/"):
+        prefix += "/"
+    mapping: dict[str, str] = {}
+    pkg = doc._package
+    for name in pkg.files():
+        if not name.startswith("BinData/"):
+            continue
+        # Only the final path component is used as the output file name.
+        fname = Path(name).name
+        (image_dir / fname).write_bytes(pkg.read(name))
+        mapping[Path(name).stem] = f"{prefix}{fname}"
+    return mapping
+def _paragraph_images(p_el, mapping: dict[str, str]) -> list[str]:
+    """Turn every ``<hp:pic>`` under the paragraph element into a Markdown image line.
+    Pictures without an ``img`` descendant or without ``binaryItemIDRef`` are
+    skipped. With an empty ``mapping`` no lines are produced. References
+    missing from ``mapping`` fall back to ``BinData/<ref>``.
+    """
+    if not mapping:
+        return []
+    image_lines = []
+    for pic in _descendants(p_el, "pic"):
+        img = _first_descendant(pic, "img")
+        ref = img.get("binaryItemIDRef") if img is not None else None
+        if ref:
+            target = mapping.get(ref, f"BinData/{ref}")
+            image_lines.append(f"![image]({target})")
+    return image_lines
+# ──────────────────────────────────────────────────────────────────
+# Paragraph element → markdown (재귀 진입점)
+# ──────────────────────────────────────────────────────────────────
+def _p_element_to_md(p_el, doc, notes_out: list | None = None) -> str:
+    """Convert one paragraph element to inline Markdown (recursive entry point).
+    Walks the direct ``run`` children of ``p_el`` and handles three child kinds:
+    ``t`` (text), ``ctrl`` (hyperlink field begin/end) and ``footNote``/``endNote``.
+    Text is buffered and rendered with formatting relative to char property ``"0"``.
+    Notes emit a ``[^fn<instId>]`` / ``[^en<instId>]`` marker; when ``notes_out`` is
+    given, ``(kind, inst_id, body_markdown)`` is appended to it for each note.
+    """
+    chars = doc._root.char_properties
+    base_cp = chars.get("0")
+    output: list[str] = []
+    # Pending (charPrIDRef, text) pairs outside any hyperlink.
+    items: list[tuple] = []
+    # ``None`` means "not inside a hyperlink"; "" means a hyperlink without a URL.
+    link_url: str | None = None
+    # Pending (charPrIDRef, text) pairs inside the current hyperlink.
+    link_items: list[tuple] = []
+    def flush_items():
+        # Render and emit the buffered plain text, if any.
+        nonlocal items
+        if items:
+            output.append(_render_runs(items, base_cp, chars))
+            items = []
+    def flush_link():
+        # Close the current hyperlink; a link without a URL is emitted as plain text.
+        nonlocal link_url, link_items
+        if link_url is None:
+            return
+        text = _render_runs(link_items, base_cp, chars)
+        if text:
+            output.append(f"[{text}]({link_url})" if link_url else text)
+        link_url = None
+        link_items = []
+    def push_text(cpr, text):
+        # Route text to the hyperlink buffer while a link is open.
+        if link_url is not None:
+            link_items.append((cpr, text))
+        else:
+            items.append((cpr, text))
+    for run in _direct_children(p_el, "run"):
+        cpr = run.get("charPrIDRef", "0")
+        for child in run:
+            tag = _local_name(child)
+            if tag == "t":
+                if child.text:
+                    push_text(cpr, child.text)
+            elif tag == "ctrl":
+                for gc in child:
+                    gctag = _local_name(gc)
+                    if gctag == "fieldBegin" and gc.get("type") == "HYPERLINK":
+                        flush_items()
+                        link_url = gc.get("name", "")
+                    elif gctag == "fieldEnd":
+                        # Any fieldEnd closes the open link; no-op when none is open.
+                        flush_link()
+            elif tag in ("footNote", "endNote"):
+                inst_id = child.get("instId", "")
+                kind = "fn" if tag == "footNote" else "en"
+                marker = f"[^{kind}{inst_id}]"
+                # Emit pending text first so the marker lands at its exact position.
+                if link_url is not None:
+                    flush_link()
+                else:
+                    flush_items()
+                output.append(marker)
+                if notes_out is not None:
+                    body_parts = []
+                    for fp in _descendants(child, "p"):
+                        # Nested notes are not collected (notes_out=None in the recursion).
+                        sub_md = _p_element_to_md(fp, doc, None).strip()
+                        if sub_md:
+                            body_parts.append(sub_md)
+                    notes_out.append((kind, inst_id, " ".join(body_parts)))
+    flush_items()
+    flush_link()
+    return "".join(output)
+# ──────────────────────────────────────────────────────────────────
+# 도형 / 셀 / 표
+# ──────────────────────────────────────────────────────────────────
+def _shape_text_lines(scope_el, doc, notes_out: list | None = None) -> list[str]:
+    """Collect Markdown for paragraphs found inside shapes below ``scope_el``.
+    Shapes are visited per tag in ``SHAPE_TAGS`` order; each paragraph element
+    is rendered at most once and blank results are dropped.
+    """
+    collected: list[str] = []
+    visited = set()
+    for shape_tag in SHAPE_TAGS:
+        for shape in _descendants(scope_el, shape_tag):
+            for inner_p in _descendants(shape, "p"):
+                marker = id(inner_p)
+                if marker not in visited:
+                    visited.add(marker)
+                    rendered = _p_element_to_md(inner_p, doc, notes_out).strip()
+                    if rendered:
+                        collected.append(rendered)
+    return collected
+def _cell_to_md(cell, doc, mapping, depth: int = 0, notes_out: list | None = None) -> str:
+    """Render a table cell as one line of Markdown/HTML joined with ``<br>``.
+    For each cell paragraph the pieces are, in order: its inline text, shape
+    text, images, then nested tables (rendered at ``depth + 1``).
+    """
+    parts: list[str] = []
+    for cell_paragraph in cell.paragraphs:
+        p_el = cell_paragraph.element
+        text_md = _p_element_to_md(p_el, doc, notes_out).strip()
+        image_lines = _paragraph_images(p_el, mapping)
+        shape_lines = _shape_text_lines(p_el, doc, notes_out)
+        if text_md:
+            parts.append(text_md)
+        parts += shape_lines
+        parts += image_lines
+        parts += [
+            _table_to_md(nested, doc, mapping, depth + 1, notes_out)
+            for nested in cell_paragraph.tables
+        ]
+    return "<br>".join(filter(None, parts)).strip()
+def _table_to_md(tbl, doc, mapping, depth: int = 0, notes_out: list | None = None) -> str:
+    """Render a table as a GFM pipe table, or as HTML when it has merges or is nested.
+    HTML is used when any grid position is a non-anchor (part of a merged cell)
+    or when ``depth > 0``. In HTML output the first row uses ``<th>``, the rest
+    ``<td>``. In GFM output the first row is followed by a ``---`` separator row.
+    """
+    grid = tbl.get_cell_map()
+    rows, cols = tbl.row_count, tbl.column_count
+    has_merge = any(not pos.is_anchor for row in grid for pos in row)
+    if has_merge or depth > 0:
+        # Merged cells or a nested table — emit HTML.
+        out = ["<table>"]
+        for r in range(rows):
+            out.append("<tr>")
+            for c in range(cols):
+                pos = grid[r][c]
+                # Only the anchor position of a cell produces output.
+                if not pos.is_anchor:
+                    continue
+                # Extend right while the neighbour is a non-anchor of the same cell.
+                col_end = c
+                while (
+                    col_end + 1 < cols
+                    and not grid[r][col_end + 1].is_anchor
+                    and grid[r][col_end + 1].cell is pos.cell
+                ):
+                    col_end += 1
+                # Extend down in the same way.
+                row_end = r
+                while (
+                    row_end + 1 < rows
+                    and not grid[row_end + 1][c].is_anchor
+                    and grid[row_end + 1][c].cell is pos.cell
+                ):
+                    row_end += 1
+                colspan = col_end - c + 1
+                rowspan = row_end - r + 1
+                attrs = []
+                if colspan > 1:
+                    attrs.append(f'colspan="{colspan}"')
+                if rowspan > 1:
+                    attrs.append(f'rowspan="{rowspan}"')
+                attr_s = (" " + " ".join(attrs)) if attrs else ""
+                content = _cell_to_md(pos.cell, doc, mapping, depth + 1, notes_out)
+                tag = "th" if r == 0 else "td"
+                out.append(f"<{tag}{attr_s}>{content}</{tag}>")
+            out.append("</tr>")
+        out.append("</table>")
+        return "\n".join(out)
+    # Simple table — emit a GFM pipe table.
+    lines = []
+    for r in range(rows):
+        cells = [
+            _cell_to_md(grid[r][c].cell, doc, mapping, depth + 1, notes_out)
+            for c in range(cols)
+        ]
+        lines.append("| " + " | ".join(cells) + " |")
+        if r == 0:
+            lines.append("| " + " | ".join(["---"] * cols) + " |")
+    return "\n".join(lines)
+# ──────────────────────────────────────────────────────────────────
+# 헤딩 감지
+# ──────────────────────────────────────────────────────────────────
+def _detect_heading(text: str) -> str | None:
+    """Return a ``#``/``##`` heading line if ``text`` looks like a heading, else ``None``.
+    Generated inline wrappers (``~~``, ``**``, ``*``, HTML tags) are removed
+    first. A match on ``ROMAN_HEAD`` gives a level-1 heading; a match on
+    ``ARABIC_HEAD`` shorter than 40 characters gives a level-2 heading.
+    """
+    # Keep backslash-escaped pairs intact while stripping wrappers. Removing
+    # every "*" would turn an escaped literal "\*" into a dangling backslash.
+    plain = re.sub(
+        r"(\\.)|~~|\*\*|\*|<[^>]+>",
+        lambda match: match.group(1) or "",
+        text.strip(),
+    )
+    plain = plain.replace("\\[", "[").replace("\\]", "]").replace("\\|", "|")
+    if ROMAN_HEAD.match(plain):
+        return f"# {plain}"
+    if ARABIC_HEAD.match(plain) and len(plain) < 40:
+        return f"## {plain}"
+    return None
+# ──────────────────────────────────────────────────────────────────
+# Public API
+# ──────────────────────────────────────────────────────────────────
+def _renumber_note_markers(body: str, seq_map: dict[str, dict[str, int]]) -> str:
+    """Rewrite ``[^fn<instId>]`` / ``[^en<instId>]`` markers to sequence numbers in one pass."""
+    def _swap(match: re.Match) -> str:
+        kind, inst_id = match.group(1), match.group(2)
+        seq = seq_map[kind].get(inst_id)
+        # Markers that do not belong to a collected note are left untouched.
+        return match.group(0) if seq is None else f"[^{kind}{seq}]"
+    # A single substitution pass never re-reads its own output, so an instId
+    # that equals an already-assigned sequence number cannot be renumbered twice.
+    return re.sub(r"\[\^(fn|en)([^\[\]]*)\]", _swap, body)
+def export_markdown(
+    source: Union[HwpxDocument, str, Path, bytes],
+    *,
+    image_dir: Union[str, Path, None] = None,
+    image_ref_prefix: str | None = None,
+    detect_headings: bool = True,
+    notes_section_separator: str = "\n\n---\n",
+) -> str:
+    """Convert an HWPX document to rich Markdown.
+    Parameters
+    ----------
+    source : HwpxDocument | path | bytes
+        An open ``HwpxDocument``, a file path, or the raw file bytes.
+    image_dir : path | None
+        Directory that ``BinData/*`` entries are extracted to. When ``None``
+        no image markers are generated.
+    image_ref_prefix : str | None
+        Prefix for image paths in the Markdown. ``None`` uses the basename
+        of ``image_dir``.
+    detect_headings : bool
+        Whether lines matching the ``Ⅰ.`` / ``1.`` patterns are promoted to
+        ``#`` / ``##`` headings.
+    notes_section_separator : str
+        Text inserted before the footnote/endnote definition appendix.
+    Returns
+    -------
+    str
+        Blocks joined by blank lines, followed by the note definitions when
+        the document contains footnotes or endnotes.
+    """
+    if isinstance(source, HwpxDocument):
+        doc = source
+    elif isinstance(source, (bytes, bytearray)):
+        import io
+        doc = HwpxDocument.open(io.BytesIO(source))
+    else:
+        doc = HwpxDocument.open(str(source))
+    mapping = _build_image_map(doc, Path(image_dir) if image_dir else None, image_ref_prefix)
+    notes: list[tuple] = []
+    lines: list[str] = []
+    for section in doc.sections:
+        for p in section.paragraphs:
+            md = _p_element_to_md(p.element, doc, notes).strip()
+            imgs = _paragraph_images(p.element, mapping)
+            tables = [_table_to_md(t, doc, mapping, 0, notes) for t in p.tables]
+            # Duplicate guard 1: if the paragraph text also appears inside the
+            # table cells, the table is the canonical rendering.
+            if md and p.tables:
+                plain = (p.text or "").strip()
+                all_cell_text = "".join(
+                    (cell.text or "")
+                    for tbl in p.tables
+                    for row in tbl.rows
+                    for cell in row.cells
+                )
+                if plain and plain in all_cell_text:
+                    md = ""
+            # Duplicate guard 2: when the paragraph holds a shape, its own
+            # text is treated as shape text leaking out and is dropped.
+            if md and any(_has_descendant(p.element, tag) for tag in SHAPE_TAGS):
+                md = ""
+            # Extract paragraphs inside shapes. Shapes inside tables are
+            # handled by _cell_to_md, so paragraphs under tables are pre-marked.
+            shape_lines: list[str] = []
+            seen_p = set()
+            for sub in p.tables:
+                for nested_p in _descendants(sub.element, "p"):
+                    seen_p.add(id(nested_p))
+            for tag in SHAPE_TAGS:
+                for shape in _descendants(p.element, tag):
+                    for sub_p in _descendants(shape, "p"):
+                        pid = id(sub_p)
+                        if pid in seen_p:
+                            continue
+                        seen_p.add(pid)
+                        sub_md = _p_element_to_md(sub_p, doc, notes).strip()
+                        if sub_md:
+                            shape_lines.append(sub_md)
+            # Heading detection, including text that sits in a 1x1 table cell.
+            promoted = None
+            if detect_headings:
+                if md:
+                    promoted = _detect_heading(md)
+                elif p.tables and len(p.tables) == 1:
+                    t = p.tables[0]
+                    if t.row_count == 1 and t.column_count == 1:
+                        cell_text = _cell_to_md(
+                            t.rows[0].cells[0], doc, mapping, 0, notes
+                        )
+                        promoted = _detect_heading(cell_text)
+                        if promoted:
+                            # The heading replaces the whole 1x1 table block.
+                            lines.append(promoted)
+                            continue
+            if promoted:
+                lines.append(promoted)
+            elif md:
+                lines.append(md)
+            lines.extend(shape_lines)
+            lines.extend(imgs)
+            lines.extend(tables)
+    body = "\n\n".join(lines)
+    # Map note instIds to sequential fn1/en1 numbers, then append definitions.
+    if notes:
+        seq_map: dict[str, dict[str, int]] = {"fn": {}, "en": {}}
+        for kind, inst_id, _ in notes:
+            if inst_id not in seq_map[kind]:
+                seq_map[kind][inst_id] = len(seq_map[kind]) + 1
+        body = _renumber_note_markers(body, seq_map)
+        body += notes_section_separator
+        seen = set()
+        for kind, inst_id, text in notes:
+            key = (kind, inst_id)
+            # A note can be collected more than once; define it only once.
+            if key in seen:
+                continue
+            seen.add(key)
+            seq = seq_map[kind][inst_id]
+            body += f"\n[^{kind}{seq}]: {text}\n"
+    return body

hwpx/tools/table_navigation.py CHANGED Viewed

@@ -41,10 +41,14 @@ class TableMapEntry(TypedDict):
     table_index: int
     paragraph_index: int
+    location: dict[str, object]
     rows: int
     cols: int
+    caption_text: str
+    preceding_paragraph_text: str
     header_text: str
     first_row_preview: list[str]
+    cells: list[dict[str, object]]
     is_empty: bool
@@ -107,6 +111,8 @@ class TableFillResult(TypedDict):
 class _AnchoredTable:
     table: HwpxOxmlTable
     paragraph_index: int
+    caption_text: str
+    preceding_paragraph_text: str
     header_text: str
@@ -115,6 +121,8 @@ class _IndexedTable:
     table_index: int
     table: HwpxOxmlTable
     paragraph_index: int
+    caption_text: str
+    preceding_paragraph_text: str
     header_text: str
@@ -193,6 +201,8 @@ def _collect_tables_from_paragraph(
                 _AnchoredTable(
                     table=table,
                     paragraph_index=anchor_paragraph_index,
+                    caption_text=paragraph_prefix_text,
+                    preceding_paragraph_text=last_header_text,
                     header_text=header_text,
                 )
             )
@@ -227,6 +237,8 @@ def _collect_document_tables(document: HwpxDocument) -> list[_IndexedTable]:
             table_index=table_index,
             table=item.table,
             paragraph_index=item.paragraph_index,
+            caption_text=item.caption_text,
+            preceding_paragraph_text=item.preceding_paragraph_text,
             header_text=item.header_text,
         )
         for table_index, item in enumerate(anchored_tables)
@@ -234,7 +246,11 @@ def _collect_document_tables(document: HwpxDocument) -> list[_IndexedTable]:
 def _cell_text(table: HwpxOxmlTable, row_index: int, col_index: int) -> str:
+    """Return the cell text, joining its paragraphs with newlines; falls back to ``cell.text`` when it has none."""
-    return table.cell(row_index, col_index).text
+    cell = table.cell(row_index, col_index)
+    # getattr keeps this usable with cell objects that lack ``paragraphs``.
+    paragraphs = list(getattr(cell, "paragraphs", []) or [])
+    if paragraphs:
+        return "\n".join(paragraph.text or "" for paragraph in paragraphs)
+    return cell.text
 def _table_is_empty(table: HwpxOxmlTable) -> bool:
@@ -251,6 +267,62 @@ def _first_row_preview(table: HwpxOxmlTable) -> list[str]:
     return [_cell_text(table, 0, col_index) for col_index in range(table.column_count)]
+def _body_paragraph_location(paragraph_index: int) -> dict[str, object]:
+    """Build the location payload for a body paragraph at ``paragraph_index``."""
+    location: dict[str, object] = {"kind": "body_paragraph"}
+    location["paragraph_index"] = paragraph_index
+    return location
+def _table_cell_paragraph_location(
+    table_index: int,
+    row_index: int,
+    col_index: int,
+    cell_paragraph_index: int,
+) -> dict[str, object]:
+    """Build the location payload for one paragraph inside a table cell."""
+    return dict(
+        kind="table_cell_paragraph",
+        table_index=table_index,
+        row=row_index,
+        col=col_index,
+        cell_paragraph_index=cell_paragraph_index,
+    )
+def _table_cells(table_ref: _IndexedTable) -> list[dict[str, object]]:
+    """Describe every cell of the table in row-major order.
+    Each entry carries the cell's ``row``/``col``, its text, one payload per
+    cell paragraph (index, text, location) and a ``table_cell`` location.
+    """
+    table = table_ref.table
+    table_index = table_ref.table_index
+    described: list[dict[str, object]] = []
+    for row_index in range(table.row_count):
+        for col_index in range(table.column_count):
+            cell = table.cell(row_index, col_index)
+            cell_paragraphs = list(getattr(cell, "paragraphs", []) or [])
+            paragraph_payloads: list[dict[str, object]] = [
+                {
+                    "cell_paragraph_index": cell_paragraph_index,
+                    "text": paragraph.text or "",
+                    "location": _table_cell_paragraph_location(
+                        table_index,
+                        row_index,
+                        col_index,
+                        cell_paragraph_index,
+                    ),
+                }
+                for cell_paragraph_index, paragraph in enumerate(cell_paragraphs)
+            ]
+            cell_location = {
+                "kind": "table_cell",
+                "table_index": table_index,
+                "row": row_index,
+                "col": col_index,
+            }
+            described.append(
+                {
+                    "row": row_index,
+                    "col": col_index,
+                    "text": _cell_text(table, row_index, col_index),
+                    "paragraphs": paragraph_payloads,
+                    "location": cell_location,
+                }
+            )
+    return described
 def _direction_delta(direction: PathDirection) -> tuple[int, int]:
     if direction == "right":
         return (0, 1)
@@ -337,10 +409,14 @@ def get_table_map(document: HwpxDocument) -> TableMapResult:
             {
                 "table_index": table_ref.table_index,
                 "paragraph_index": table_ref.paragraph_index,
+                "location": _body_paragraph_location(table_ref.paragraph_index),
                 "rows": table_ref.table.row_count,
                 "cols": table_ref.table.column_count,
+                "caption_text": table_ref.caption_text,
+                "preceding_paragraph_text": table_ref.preceding_paragraph_text,
                 "header_text": table_ref.header_text,
                 "first_row_preview": _first_row_preview(table_ref.table),
+                "cells": _table_cells(table_ref),
                 "is_empty": _table_is_empty(table_ref.table),
             }
         )

{python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: python-hwpx
-Version: 2.10.1
+Version: 2.10.2
 Summary: 한글 없이 HWPX 문서를 열고, 편집하고, 생성하고, 검증하는 Python 자동화 라이브러리
 Author: python-hwpx Maintainers
 License-Expression: Apache-2.0
@@ -115,6 +115,47 @@ hwpx-validate-package 보고서.hwpx
 hwpx-analyze-template 보고서.hwpx
 ```
+### 4. 풍부한 Markdown 변환 (서식·표·각주·이미지 보존)
+`export_markdown()`는 단순 평문 추출이고, `export_rich_markdown()`는 인라인 서식(`**굵게**`, `*기울임*`, `~~취소선~~`),
+표(중첩 포함, colspan/rowspan 안전), 도형 텍스트, 이미지, 각주/미주, 하이퍼링크, 제목(`#`/`##`) 자동 감지까지 보존한다.
+```python
+from hwpx import HwpxDocument
+doc = HwpxDocument.open("보고서.hwpx")
+md = doc.export_rich_markdown(
+    image_dir="out/images",          # BinData 이미지를 디스크에 추출
+    image_ref_prefix="images/",      # 마크다운 내 ![](images/...) 경로 접두
+    detect_headings=True,            # Ⅰ./1. 패턴 기반 제목(#/##) 자동 감지
+)
+print(md)
+```
+문자열·경로·바이트도 그대로 받는다:
+```python
+from hwpx.tools.markdown_export import export_markdown
+md = export_markdown("보고서.hwpx")          # 경로
+md = export_markdown(open("a.hwpx", "rb").read())  # bytes
+```
+### 5. 각주 본문에 혼합 서식 / 하이퍼링크 추가
+`HwpxOxmlNote`에 `body_paragraph`, `add_run`, `add_hyperlink` helper가 있어 각주 본문을
+직접 paragraph로 다루지 않고도 인라인 서식·링크를 손쉽게 채울 수 있다.
+```python
+para = section.paragraphs[0]
+note = para.add_footnote("")  # 빈 각주 생성 후 본문 구성
+note.add_run("자세한 내용은 ")
+note.add_run("정부 공식 사이트", bold=True)
+note.add_run("를 참고하라: ")
+note.add_hyperlink("https://www.kasa.go.kr", "우주항공청")
+```
 처음에는 `open/new -> edit/extract -> save_to_path` 흐름만 잡으면 된다. 패키지 구조, XML 파트, 템플릿 회귀 점검은 필요할 때만 확장하면 된다.
 ## 어디부터 읽으면 되나
@@ -244,6 +285,7 @@ doc.set_footer_text("1 / 10", page_type="BOTH")
 # 표 셀 병합·분할
 table.merge_cells(0, 0, 1, 1)   # (0,0)~(1,1) 병합
 table.set_cell_text(0, 0, "병합된 셀", logical=True, split_merged=True)
+table.set_cell_text(0, 0, "line 1\nline 2", split_paragraphs=True)
 # 양식형 표 자동 채우기
 form = doc.add_table(2, 2)
@@ -257,6 +299,12 @@ doc.fill_by_path({
 })
 ```
+`doc.paragraphs`의 인덱스는 본문 직속 문단 0-based 기준입니다. 표 안 문단은
+본문 `paragraph_index`에 섞지 않고 `get_table_map()`의 cell `location`
+(`table_index`, `row`, `col`, `cell_paragraph_index`)으로 다룹니다.
+`get_table_map()`은 `caption_text`와 `preceding_paragraph_text`를 분리해
+반환하고, 셀 미리보기의 여러 문단은 `\n`으로 유지합니다.
 ### 🔍 텍스트 추출 & 검색
 ```python

{python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,6 @@
 hwpx/__init__.py,sha256=ouwTSF8JrUPVgwWxB1hudQwVdhAA981uHeX_wXxxQHo,2205
 hwpx/authoring.py,sha256=caZfPFe99ilaJMDJEDRsWKCb-QKAp18M0vRlPdM0PR0,96068
-hwpx/document.py,sha256=Q8uHzYryMFUXn6fc7Uhi1cEbmhaqiQ8uqb8bT-gAYjU,54798
+hwpx/document.py,sha256=1kb0n6C5cEiex7Bs58MlLhFXI8mknQFqErTkCYaFuQE,55204
 hwpx/form_fill.py,sha256=VUIU53Qa9Ho2aP72biDvJwnDW7ngdAzu3PSd5A7d1JM,9908
 hwpx/package.py,sha256=0rKjGCJbPQvrVBIy07Jpjsu3fI7HhbqFCGWTiTDsJpo,1141
 hwpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -16,7 +16,7 @@ hwpx/opc/xml_utils.py,sha256=O_eZtp1-8vWimoi9Xdy0uzmtk8bnkfjf-QLjD_uWSFQ,3483
 hwpx/oxml/__init__.py,sha256=tUoiHQw3oJpHvSES6f5AuhpfXvlby0Df-3L0t-CMhxM,5000
 hwpx/oxml/body.py,sha256=VOwlyDRgoLMlDePFbCjU3qlBAefO9DIoSGsEI2Fr8DA,24888
 hwpx/oxml/common.py,sha256=TJkafzg7x4T3J29tZchRZk57ZTsrM9PEiqGT3rX3w5o,1044
-hwpx/oxml/document.py,sha256=QA37yh53PDNk7-TNKhSViXDlcz5vkLlosZMcHSSyKKc,209419
+hwpx/oxml/document.py,sha256=WIKmJ-nW0Wqh2vEf-06-5gx6EpQc-TvbJr-nze3VYtQ,214823
 hwpx/oxml/header.py,sha256=_KgKsCN6UWB8r59z2iqe0rLC8EdEZyJD7GfQ0Xd2WXM,43080
 hwpx/oxml/header_part.py,sha256=U3tXD1LWruAdQV-w9cIBv8iXPpQ1oUm0CXlxAAonZ6I,231
 hwpx/oxml/memo.py,sha256=WSJSTYOSLKG836eF_UsrD99hMqJhWwzRZ8pJbHq-nsA,228
@@ -34,6 +34,7 @@ hwpx/tools/archive_cli.py,sha256=rlgE6KBeJORa8Z6RhGOVmOl7gGIKdgA9GY106EFouVo,122
 hwpx/tools/exporter.py,sha256=hx7th-LAL1a5G0ICyVcyJPJaUY5jEgDJUZ7UYg_YAmI,6578
 hwpx/tools/generic_inventory.py,sha256=pHVP8-htX_vO02ARdQR37XFxm7fUPK68VtMeeOJ1NZY,4835
 hwpx/tools/id_integrity.py,sha256=_Ra981ZPX1WXH_bK-2KNhCnwPVYErfdX2wW4SosX0Ls,9256
+hwpx/tools/markdown_export.py,sha256=FejutCpQHbycO185uljcSwfZuwXMTbGEgXtf5e-a4_k,19139
 hwpx/tools/object_finder.py,sha256=7i6XI1-r7-ar_IzSZQ82hfOcxVzJFK2XjMDB8oxcmMA,13478
 hwpx/tools/package_validator.py,sha256=87uv7uVh6wqqY8-woX9kAGnwuWK3uYL4BHfGf7NNgcs,14521
 hwpx/tools/page_guard.py,sha256=nDAVPcvrnuyDxVTA_j22wiYD7CXAD6XlzsMzaz3h_q8,9701
@@ -43,17 +44,17 @@ hwpx/tools/report_parser.py,sha256=3Daqn2hqIcj5pG1qUxeYbvWr7CvdhwzatWvxCCcnSZg,4
 hwpx/tools/report_utils.py,sha256=6HYEeQc3ZxTpxbwF11s47uZ-KmV4tsHPE1MV4491KDE,4434
 hwpx/tools/roundtrip_diff.py,sha256=ao0AdpDJkq89u5hwcrsxTijvSsia9Jaw1OOnh4WAco4,1365
 hwpx/tools/table_cleanup.py,sha256=0_f6NnvNp3QD4owKd_bRX6FZbeUmoQC7a4_VGzF2SCE,1796
-hwpx/tools/table_navigation.py,sha256=oGfJE0cM3WIvE8_avtbST8R_nITnoMwDA4t-4IEW9dg,13520
+hwpx/tools/table_navigation.py,sha256=rtbrWFKpJhqC3LD0ZXImyHgjmDR2hjHCFy3_S-qNBwA,16479
 hwpx/tools/template_analyzer.py,sha256=qZMIyB-r4YXZqU54v6uwt_CQiOAQR0mVgmo_Bt4biWM,8497
 hwpx/tools/text_extract_cli.py,sha256=BmsDAwNXpDPhEayb9ez2ORtGNzPd_Xxduy4_cLXhnUw,2188
 hwpx/tools/text_extractor.py,sha256=dqGzOnJVRUEfrxiTt04GkDrfY4yfZXRIhPtEwTM77Mw,25289
 hwpx/tools/validator.py,sha256=LMo8gIMoptP9RRDbYKV4WwrM59rclC5h3HP-ZJRUxO0,6856
 hwpx/tools/_schemas/header.xsd,sha256=mJXuFMuHGT1JnFFaluUpYUglwjMCNlfbFCRVM26eHXE,664
 hwpx/tools/_schemas/section.xsd,sha256=MgvavVHG05RDfUnVPxVU10H4FQOja5ON04_m9Uk_m7E,522
-python_hwpx-2.10.1.dist-info/licenses/LICENSE,sha256=_ubz4wv-BkkT3l3gu-QuH7JGeVjuRYGZoZK95eNsCHU,9688
-python_hwpx-2.10.1.dist-info/licenses/NOTICE,sha256=k48h6EaGQE8Y1c0dS9sIOOcz4YqkbcImWClF7pBOgsg,2473
-python_hwpx-2.10.1.dist-info/METADATA,sha256=43VEoLZ0bnRSIhdTzq10F49GYCdSXIwHqyNcZygODVk,16077
-python_hwpx-2.10.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
-python_hwpx-2.10.1.dist-info/entry_points.txt,sha256=JUKRxbly9UaeHV7YzOea23y8IiqSTcrhUlooP3fS_Zc,405
-python_hwpx-2.10.1.dist-info/top_level.txt,sha256=R1iToqDh80Nf2oQhRjTN0rbN2X6kyDUizIocZjkhuxc,5
-python_hwpx-2.10.1.dist-info/RECORD,,
+python_hwpx-2.10.2.dist-info/licenses/LICENSE,sha256=_ubz4wv-BkkT3l3gu-QuH7JGeVjuRYGZoZK95eNsCHU,9688
+python_hwpx-2.10.2.dist-info/licenses/NOTICE,sha256=k48h6EaGQE8Y1c0dS9sIOOcz4YqkbcImWClF7pBOgsg,2473
+python_hwpx-2.10.2.dist-info/METADATA,sha256=S3vl8kgL0d7BcCafoPk8AuV7otQmjutlivMFAvUNROA,18099
+python_hwpx-2.10.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
+python_hwpx-2.10.2.dist-info/entry_points.txt,sha256=JUKRxbly9UaeHV7YzOea23y8IiqSTcrhUlooP3fS_Zc,405
+python_hwpx-2.10.2.dist-info/top_level.txt,sha256=R1iToqDh80Nf2oQhRjTN0rbN2X6kyDUizIocZjkhuxc,5
+python_hwpx-2.10.2.dist-info/RECORD,,

{python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/WHEEL RENAMED Viewed

File without changes

{python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/entry_points.txt RENAMED Viewed

File without changes

{python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/licenses/NOTICE RENAMED Viewed

File without changes

{python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/top_level.txt RENAMED Viewed

File without changes

python-hwpx 2.10.1__py3-none-any.whl → 2.10.2__py3-none-any.whl

python-hwpx 2.10.1py3-none-any.whl → 2.10.2py3-none-any.whl