python-hwpx 2.10.1__py3-none-any.whl → 2.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hwpx/document.py CHANGED
@@ -1472,6 +1472,14 @@ class HwpxDocument:
1472
1472
  from .tools.exporter import export_markdown
1473
1473
  return export_markdown(self, **kwargs) # type: ignore[arg-type]
1474
1474
 
1475
def export_rich_markdown(self, **kwargs: object) -> str:
    """Render this document as rich Markdown.

    Thin wrapper around :func:`hwpx.tools.markdown_export.export_markdown`:
    the document itself is passed as the source and every keyword argument
    is forwarded unchanged.

    Returns:
        The Markdown text produced by the rich exporter.
    """
    # Imported at call time: the exporter module imports this module at
    # load time, so a top-level import here would be circular.
    from .tools.markdown_export import export_markdown as rich_exporter

    markdown = rich_exporter(self, **kwargs)  # type: ignore[arg-type]
    return markdown
1482
+
1475
1483
  # ------------------------------------------------------------------
1476
1484
  # Validation
1477
1485
  # ------------------------------------------------------------------
hwpx/oxml/document.py CHANGED
@@ -1872,6 +1872,68 @@ class HwpxOxmlNote:
1872
1872
  t.text = _sanitize_text(value)
1873
1873
  self.paragraph.section.mark_dirty()
1874
1874
 
1875
@property
def body_paragraph(self) -> "HwpxOxmlParagraph":
    """Wrap the note's own ``<hp:p>`` element as a :class:`HwpxOxmlParagraph`.

    This is the paragraph holding the note's text, not :attr:`paragraph`
    (the hosting paragraph in which the note marker sits). It is the entry
    point for building a note body out of several differently formatted
    runs, for example::

        note = para.add_footnote("plain ")
        note.add_run("blue", char_pr_id_ref=5)

    Returns:
        The first ``<hp:p>`` descendant of the note element, bound to the
        hosting paragraph's section.

    Raises:
        ValueError: If the note element contains no ``<hp:p>`` descendant.
    """
    body_element = self.element.find(f".//{_HP}p")
    if body_element is not None:
        return HwpxOxmlParagraph(body_element, self.paragraph.section)
    raise ValueError("note has no body paragraph element")
1891
+
1892
def add_run(
    self,
    text: str = "",
    *,
    char_pr_id_ref: str | int | None = None,
    bold: bool = False,
    italic: bool = False,
    underline: bool = False,
    color: str | None = None,
    font: str | None = None,
    size: int | float | None = None,
    highlight: str | None = None,
    strike: bool | None = None,
    attributes: dict[str, str] | None = None,
) -> "HwpxOxmlRun":
    """Append a run to the note's body paragraph.

    Every argument is handed unchanged to ``body_paragraph.add_run``; this
    method adds no behaviour of its own.

    Returns:
        Whatever ``body_paragraph.add_run`` returns for the new run.
    """
    formatting = {
        "char_pr_id_ref": char_pr_id_ref,
        "bold": bold,
        "italic": italic,
        "underline": underline,
        "color": color,
        "font": font,
        "size": size,
        "highlight": highlight,
        "strike": strike,
        "attributes": attributes,
    }
    return self.body_paragraph.add_run(text, **formatting)
1921
+
1922
def add_hyperlink(
    self,
    url: str,
    display_text: str,
    *,
    char_pr_id_ref: str | int | None = None,
) -> "HwpxOxmlInlineObject":
    """Append a hyperlink to the note's body paragraph.

    Pure delegation to ``body_paragraph.add_hyperlink`` with the same
    arguments.

    Returns:
        Whatever ``body_paragraph.add_hyperlink`` returns.
    """
    body = self.body_paragraph
    return body.add_hyperlink(url, display_text, char_pr_id_ref=char_pr_id_ref)
1936
+
1875
1937
 
1876
1938
  def _default_sublist_attributes() -> dict[str, str]:
1877
1939
  """Return standard attributes for a ``<hp:subList>`` element.
@@ -2425,6 +2487,9 @@ class HwpxOxmlTableCell:
2425
2487
 
2426
2488
  @property
2427
2489
  def text(self) -> str:
2490
+ paragraphs = self.paragraphs
2491
+ if paragraphs:
2492
+ return "\n".join(paragraph.text or "" for paragraph in paragraphs)
2428
2493
  parts: list[str] = []
2429
2494
  for t_elem in self.element.findall(f".//{_HP}t"):
2430
2495
  if t_elem.text:
@@ -2433,8 +2498,79 @@ class HwpxOxmlTableCell:
2433
2498
 
2434
2499
@text.setter
def text(self, value: str) -> None:
    """Replace the cell text by calling :meth:`set_text` with its default options."""
    self.set_text(value)
2502
+
2503
+ def _first_run_char_pr_id_ref(self) -> str:
2504
+ for paragraph in self.paragraphs:
2505
+ for run in paragraph.runs:
2506
+ if run.char_pr_id_ref is not None:
2507
+ return str(run.char_pr_id_ref)
2508
+ return "0"
2509
+
2510
def _paragraph_format_attrs(self, paragraph: "HwpxOxmlParagraph | None" = None) -> dict[str, str]:
    """Build the attribute dict for a new cell paragraph.

    Starts from ``_default_cell_paragraph_attributes()``. When *paragraph*
    is given, the formatting-related attributes present on its element
    override the defaults. The ``id`` is always a fresh value from
    ``_paragraph_id()``, so the new paragraph never reuses the source id.

    Args:
        paragraph: Optional paragraph whose element attributes are copied.

    Returns:
        A new attribute mapping.
    """
    # The whole annotation is one string: ``"HwpxOxmlParagraph" | None`` is a
    # ``str | None`` expression and raises TypeError whenever it is evaluated
    # (for example by ``typing.get_type_hints``).
    source = paragraph.element if paragraph is not None else None
    attrs = dict(_default_cell_paragraph_attributes())
    if source is not None:
        for key in ("paraPrIDRef", "styleIDRef", "pageBreak", "columnBreak", "merged"):
            value = source.get(key)
            if value is not None:
                attrs[key] = value
    attrs["id"] = _paragraph_id()
    return attrs
2520
+
2521
+ def _run_char_pr_for_line(self, paragraphs: Sequence["HwpxOxmlParagraph"], index: int) -> str:
2522
+ if index < len(paragraphs):
2523
+ for run in paragraphs[index].runs:
2524
+ if run.char_pr_id_ref is not None:
2525
+ return str(run.char_pr_id_ref)
2526
+ return self._first_run_char_pr_id_ref()
2527
+
2528
def _set_split_paragraph_text(self, value: str) -> None:
    """Rebuild the cell so that each line of *value* becomes its own ``<hp:p>``.

    All direct ``<hp:p>`` children of the cell's sub-list are removed and one
    new paragraph with a single run is appended per line. Paragraph attributes
    and the run's ``charPrIDRef`` are taken from the previous paragraph at the
    same position where one exists. This method does not mark the cell or the
    table dirty; ``set_text`` does that after calling it.

    Args:
        value: New text. ``None``/empty is treated as ``""``; ``\\r\\n`` and
            ``\\r`` are treated as ``\\n``.
    """
    sublist = self._ensure_sublist()
    # Read before the old paragraphs are detached so they can serve as
    # formatting templates below.
    # NOTE(review): assumes ``self.paragraphs`` returns a fresh list that is
    # unaffected by the removal — confirm against the property.
    existing = self.paragraphs
    # Normalise line endings to LF, then split into one entry per line.
    lines = (value or "").replace("\r\n", "\n").replace("\r", "\n").split("\n")
    # Defensive only: ``str.split("\n")`` always yields at least one element.
    if not lines:
        lines = [""]

    # Iterate over a copy because the sub-list is mutated inside the loop.
    for paragraph in list(sublist.findall(f"{_HP}p")):
        sublist.remove(paragraph)

    for index, line in enumerate(lines):
        # Template: the old paragraph at this position, else the first old
        # paragraph, else none (defaults are used).
        source = existing[index] if index < len(existing) else existing[0] if existing else None
        paragraph = _append_child(sublist, f"{_HP}p", self._paragraph_format_attrs(source))
        run = _append_child(
            paragraph,
            f"{_HP}run",
            {"charPrIDRef": self._run_char_pr_for_line(existing, index)},
        )
        _append_text_with_tabs(run, line)
2547
+
2548
def set_text(
    self,
    value: str,
    *,
    preserve_format: bool = True,
    split_paragraphs: bool = False,
) -> None:
    """Replace the text of this cell and mark the cell and its table dirty.

    Args:
        value: New text for the cell.
        preserve_format: When ``False``, the ``charPrIDRef`` of the run(s)
            that receive the text is reset to ``"0"``.
        split_paragraphs: When ``True``, the cell's paragraphs are rebuilt
            with one paragraph per line of *value*. Otherwise the text is
            written into a single ``<hp:t>`` element and every other
            ``<hp:t>`` in the cell is emptied.
    """
    if split_paragraphs:
        self._set_split_paragraph_text(value)
        if not preserve_format:
            # Honour preserve_format=False on this path too; previously the
            # flag was ignored here. Only the runs directly under the
            # rebuilt paragraphs are touched.
            sublist = self._ensure_sublist()
            for paragraph in sublist.findall(f"{_HP}p"):
                for run in paragraph.findall(f"{_HP}run"):
                    run.set("charPrIDRef", "0")
        self.element.set("dirty", "1")
        self.table.mark_dirty()
        return

    text_element = self._ensure_text_element()
    text_element.text = _sanitize_text(value)
    # Blank every other text node so the cell shows only the new value.
    for node in self.element.findall(f".//{_HP}t"):
        if node is text_element:
            continue
        if node.text:
            node.text = ""
    if not preserve_format:
        # Walk up from the text node to its enclosing run. Elements without
        # ``getparent`` cannot be walked, in which case nothing is reset.
        run = text_element
        while run is not None and _element_local_name(run) != "run":
            run = run.getparent() if hasattr(run, "getparent") else None
        if run is not None:
            run.set("charPrIDRef", "0")
    self.element.set("dirty", "1")
    self.table.mark_dirty()
2440
2576
 
@@ -2898,6 +3034,8 @@ class HwpxOxmlTable:
2898
3034
  *,
2899
3035
  logical: bool = False,
2900
3036
  split_merged: bool = False,
3037
+ preserve_format: bool = True,
3038
+ split_paragraphs: bool = False,
2901
3039
  ) -> None:
2902
3040
  if logical:
2903
3041
  entry = self._grid_entry(row_index, col_index)
@@ -2907,7 +3045,11 @@ class HwpxOxmlTable:
2907
3045
  cell = entry.cell
2908
3046
  else:
2909
3047
  cell = self.cell(row_index, col_index)
2910
- cell.text = text
3048
+ cell.set_text(
3049
+ text,
3050
+ preserve_format=preserve_format,
3051
+ split_paragraphs=split_paragraphs,
3052
+ )
2911
3053
 
2912
3054
  def split_merged_cell(
2913
3055
  self, row_index: int, col_index: int
@@ -3797,7 +3939,10 @@ class HwpxOxmlParagraph:
3797
3939
  sublist = _append_child(note_element, f"{_HP}subList", _default_sublist_attributes())
3798
3940
  p_attrs = {"id": _paragraph_id(), **_DEFAULT_PARAGRAPH_ATTRS}
3799
3941
  paragraph = _append_child(sublist, f"{_HP}p", p_attrs)
3800
- note_run = _append_child(paragraph, f"{_HP}run", {"charPrIDRef": "0"})
3942
+ # 본문 run의 charPrIDRef도 인자를 따라가도록 적용 (host run 동일 스타일).
3943
+ # None이면 "0"(default).
3944
+ body_cpr = "0" if char_pr_id_ref is None else str(char_pr_id_ref)
3945
+ note_run = _append_child(paragraph, f"{_HP}run", {"charPrIDRef": body_cpr})
3801
3946
  t = _append_child(note_run, f"{_HP}t", {})
3802
3947
  t.text = _sanitize_text(text)
3803
3948
  self.section.mark_dirty()
@@ -0,0 +1,488 @@
1
+ """Rich HWPX → Markdown converter.
2
+
3
+ Preserves:
4
+ - 인라인 서식 (bold/italic/color/shade) via run charPrIDRef diff
5
+ - 표 병합 셀 (colspan/rowspan) via HTML
6
+ - 중첩 표 재귀 HTML
7
+ - 도형(rect/ellipse/polygon) 내부 paragraph
8
+ - 이미지 (BinData → ![image](path))
9
+ - 헤딩 (Ⅰ. / 1. 패턴)
10
+ - 각주/미주 정확 위치 + fn1/en1 일련번호 + 본문 인라인 서식
11
+ - 하이퍼링크 [text](url) (fieldBegin/End 추적)
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import re
17
+ from html import escape as html_escape
18
+ from pathlib import Path
19
+ from typing import Union
20
+
21
+ from ..document import HwpxDocument
22
+ from ..oxml.namespaces import tag_local_name
23
+
24
# Only rect/ellipse/polygon shapes are traversed. drawText/container are
# children of those shapes, so walking them separately would process the
# same paragraph twice.
SHAPE_TAGS = ("rect", "ellipse", "polygon")

# Heading heuristics used by _detect_heading:
# - ROMAN_HEAD: optional whitespace, one Roman-numeral glyph (Ⅰ–Ⅹ), a dot,
#   then at least one more character.
# - ARABIC_HEAD: optional whitespace, digits, a dot, whitespace, then a
#   Hangul syllable or ASCII letter followed by at least one more character.
ROMAN_HEAD = re.compile(r"^\s*[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ]\.\s*.+")
ARABIC_HEAD = re.compile(r"^\s*\d+\.\s+[가-힣A-Za-z].+")
30
+
31
+
32
+ # ──────────────────────────────────────────────────────────────────
33
+ # 인라인 서식
34
+ # ──────────────────────────────────────────────────────────────────
35
def _local_name(element) -> str:
    """Return ``tag_local_name`` applied to the string form of *element*'s tag."""
    raw_tag = str(element.tag)
    return tag_local_name(raw_tag)
37
+
38
+
39
def _direct_children(element, local_name: str):
    """Return the immediate children of *element* whose local tag name is *local_name*."""
    matches = []
    for node in element:
        if _local_name(node) == local_name:
            matches.append(node)
    return matches
41
+
42
+
43
def _descendants(element, local_name: str):
    """Return every descendant of *element* (itself excluded) named *local_name*.

    Results are in the order produced by ``element.iter()``.
    """
    found = []
    for node in element.iter():
        if node is element:
            continue
        if _local_name(node) == local_name:
            found.append(node)
    return found
49
+
50
+
51
def _first_descendant(element, local_name: str):
    """Return the first descendant of *element* named *local_name*, or ``None``.

    *element* itself is never returned.
    """
    return next(
        (
            node
            for node in element.iter()
            if node is not element and _local_name(node) == local_name
        ),
        None,
    )
56
+
57
+
58
def _has_descendant(element, local_name: str) -> bool:
    """Tell whether *element* has at least one descendant named *local_name*."""
    match = _first_descendant(element, local_name)
    if match is None:
        return False
    return True
60
+
61
+
62
+ def _escape_markdown_text(text: str) -> str:
63
+ """Escape source text before applying generated Markdown/HTML wrappers."""
64
+ escaped = html_escape(text, quote=False)
65
+ for char in ("\\", "`", "*", "[", "]", "|"):
66
+ escaped = escaped.replace(char, "\\" + char)
67
+ return escaped
68
+
69
+
70
+ def _diff_style(cp, base_cp) -> dict:
71
+ if cp is None:
72
+ return {}
73
+ ca, a = cp.child_attributes, cp.attributes
74
+ base_ca = base_cp.child_attributes if base_cp is not None else {}
75
+ base_a = base_cp.attributes if base_cp is not None else {}
76
+
77
+ bold = "bold" in ca and "bold" not in base_ca
78
+ italic = "italic" in ca and "italic" not in base_ca
79
+ underline = (
80
+ ca.get("underline", {}).get("type", "NONE") != "NONE"
81
+ and base_ca.get("underline", {}).get("type", "NONE") == "NONE"
82
+ )
83
+ strike = (
84
+ ca.get("strikeout", {}).get("shape", "NONE") != "NONE"
85
+ and base_ca.get("strikeout", {}).get("shape", "NONE") == "NONE"
86
+ )
87
+ color = a.get("textColor", "#000000")
88
+ base_color = base_a.get("textColor", "#000000")
89
+ # 흰색은 어두운 배경 위 디자인 효과로 가정 → 시각 의미 없음
90
+ color_changed = (
91
+ color != base_color and color.upper() not in ("#000000", "#FFFFFF")
92
+ )
93
+ shade = a.get("shadeColor", "none")
94
+ base_shade = base_a.get("shadeColor", "none")
95
+ shade_changed = shade.lower() not in ("none", "", base_shade.lower())
96
+
97
+ return {
98
+ "bold": bold,
99
+ "italic": italic,
100
+ "underline": underline,
101
+ "strike": strike,
102
+ "color": color if color_changed else None,
103
+ "shade": shade if shade_changed else None,
104
+ }
105
+
106
+
107
+ def _wrap(text: str, style: dict) -> str:
108
+ if not text:
109
+ return ""
110
+ out = text
111
+ if style.get("shade"):
112
+ out = f'<mark style="background-color:{style["shade"]}">{out}</mark>'
113
+ if style.get("color"):
114
+ out = f'<span style="color:{style["color"]}">{out}</span>'
115
+ if style.get("underline"):
116
+ out = f"<u>{out}</u>"
117
+ if style.get("strike"):
118
+ out = f"~~{out}~~"
119
+ if style.get("italic"):
120
+ out = f"*{out}*"
121
+ if style.get("bold"):
122
+ out = f"**{out}**"
123
+ return out
124
+
125
+
126
+ def _style_key(style: dict) -> tuple:
127
+ return tuple(sorted((k, v) for k, v in style.items() if v))
128
+
129
+
130
def _render_runs(items, base_cp, chars) -> str:
    """Render a sequence of ``(char_pr_id, text)`` pairs as Markdown.

    Empty texts are skipped. Each remaining text is escaped, and neighbouring
    pieces whose style (relative to *base_cp*) is the same are concatenated
    before the style wrappers are applied once per group. A char property id
    missing from *chars* falls back to *base_cp*.
    """
    group_keys: list[tuple] = []
    group_texts: list[str] = []
    for cpr, text in items:
        if not text:
            continue
        char_pr = chars.get(str(cpr), base_cp)
        key = _style_key(_diff_style(char_pr, base_cp))
        piece = _escape_markdown_text(text)
        if group_keys and group_keys[-1] == key:
            group_texts[-1] += piece
        else:
            group_keys.append(key)
            group_texts.append(piece)

    rendered = [
        _wrap(group_text, dict(group_key))
        for group_key, group_text in zip(group_keys, group_texts)
    ]
    return "".join(rendered)
145
+
146
+
147
+ # ──────────────────────────────────────────────────────────────────
148
+ # 이미지 매핑
149
+ # ──────────────────────────────────────────────────────────────────
150
+ def _build_image_map(
151
+ doc: HwpxDocument,
152
+ image_dir: Path | None,
153
+ image_ref_prefix: str | None,
154
+ ) -> dict[str, str]:
155
+ """doc._package의 BinData/* 를 image_dir에 추출하고 {ref_stem → rel_path} 반환.
156
+ image_dir이 None이면 추출 없이 빈 dict (마크다운에 ![image]() 안 들어감).
157
+ """
158
+ if image_dir is None:
159
+ return {}
160
+ image_dir = Path(image_dir)
161
+ image_dir.mkdir(parents=True, exist_ok=True)
162
+ prefix = image_ref_prefix if image_ref_prefix is not None else image_dir.name
163
+ mapping: dict[str, str] = {}
164
+ pkg = doc._package
165
+ for name in pkg.files():
166
+ if not name.startswith("BinData/"):
167
+ continue
168
+ data = pkg.read(name)
169
+ fname = Path(name).name
170
+ (image_dir / fname).write_bytes(data)
171
+ mapping[Path(name).stem] = f"{prefix}/{fname}" if prefix else fname
172
+ return mapping
173
+
174
+
175
def _paragraph_images(p_el, mapping: dict[str, str]) -> list[str]:
    """Return one Markdown image line per ``pic`` descendant of *p_el*.

    Nothing is produced when *mapping* is empty. A picture is skipped when it
    has no ``img`` descendant or the ``img`` has no ``binaryItemIDRef``. A
    reference missing from *mapping* falls back to ``BinData/<ref>``.
    """
    if not mapping:
        return []

    image_lines = []
    for picture in _descendants(p_el, "pic"):
        image = _first_descendant(picture, "img")
        ref = image.get("binaryItemIDRef") if image is not None else None
        if ref:
            target = mapping.get(ref, f"BinData/{ref}")
            image_lines.append(f"![image]({target})")
    return image_lines
188
+
189
+
190
+ # ──────────────────────────────────────────────────────────────────
191
+ # Paragraph element → markdown (재귀 진입점)
192
+ # ──────────────────────────────────────────────────────────────────
193
def _p_element_to_md(p_el, doc, notes_out: list | None = None) -> str:
    """Convert one paragraph element to inline Markdown.

    Only the direct ``run`` children of *p_el* are walked. Text is styled
    relative to the character property with id ``"0"``. Hyperlink fields
    become ``[text](url)``, and footnotes/endnotes become ``[^fn<instId>]`` /
    ``[^en<instId>]`` markers at the position where they occur.

    Args:
        p_el: Paragraph element.
        doc: Document; ``doc._root.char_properties`` is used for style lookup.
        notes_out: When given, a ``(kind, inst_id, body_markdown)`` tuple is
            appended for every note encountered.

    Returns:
        The Markdown for the paragraph, without surrounding whitespace
        handling (callers strip it).
    """
    chars = doc._root.char_properties
    base_cp = chars.get("0")

    output: list[str] = []
    # Pending (charPrIDRef, text) pairs outside / inside the current hyperlink.
    items: list[tuple] = []
    # None means "not inside a hyperlink"; "" is a hyperlink without a URL.
    link_url: str | None = None
    link_items: list[tuple] = []

    def flush_items():
        # Render and emit the pending non-link text.
        nonlocal items
        if items:
            output.append(_render_runs(items, base_cp, chars))
            items = []

    def flush_link():
        # Close the current hyperlink, if any; with an empty URL the text is
        # emitted without link syntax.
        nonlocal link_url, link_items
        if link_url is None:
            return
        text = _render_runs(link_items, base_cp, chars)
        if text:
            output.append(f"[{text}]({link_url})" if link_url else text)
        link_url = None
        link_items = []

    def push_text(cpr, text):
        # Route text to the link buffer while a hyperlink is open.
        if link_url is not None:
            link_items.append((cpr, text))
        else:
            items.append((cpr, text))

    for run in _direct_children(p_el, "run"):
        cpr = run.get("charPrIDRef", "0")
        for child in run:
            tag = _local_name(child)
            if tag == "t":
                # Only the element's own .text is read.
                if child.text:
                    push_text(cpr, child.text)
            elif tag == "ctrl":
                for gc in child:
                    gctag = _local_name(gc)
                    if gctag == "fieldBegin" and gc.get("type") == "HYPERLINK":
                        flush_items()
                        # The link target is read from the "name" attribute.
                        link_url = gc.get("name", "")
                    elif gctag == "fieldEnd":
                        flush_link()
            elif tag in ("footNote", "endNote"):
                inst_id = child.get("instId", "")
                kind = "fn" if tag == "footNote" else "en"
                marker = f"[^{kind}{inst_id}]"
                # Emit preceding text first so the marker lands in place; an
                # open hyperlink is closed before the marker.
                if link_url is not None:
                    flush_link()
                else:
                    flush_items()
                output.append(marker)
                if notes_out is not None:
                    body_parts = []
                    for fp in _descendants(child, "p"):
                        # Notes found inside a note body are not collected
                        # (notes_out is passed as None).
                        sub_md = _p_element_to_md(fp, doc, None).strip()
                        if sub_md:
                            body_parts.append(sub_md)
                    notes_out.append((kind, inst_id, " ".join(body_parts)))

    # Emit whatever is still pending, including an unterminated hyperlink.
    flush_items()
    flush_link()
    return "".join(output)
259
+
260
+
261
+ # ──────────────────────────────────────────────────────────────────
262
+ # 도형 / 셀 / 표
263
+ # ──────────────────────────────────────────────────────────────────
264
def _shape_text_lines(scope_el, doc, notes_out: list | None = None) -> list[str]:
    """Return the Markdown of every paragraph inside shapes under *scope_el*.

    Shapes are the tags listed in ``SHAPE_TAGS``. A paragraph reachable
    through more than one shape (nested shapes) is rendered once. Paragraphs
    that render to empty text are omitted.

    Args:
        scope_el: Element whose descendants are searched for shapes.
        doc: Document, passed through to the paragraph renderer.
        notes_out: Optional note collector, passed through.
    """
    lines: list[str] = []
    seen_ids: set[int] = set()
    # Keep a reference to every paragraph whose id() is recorded. An id is
    # only unique while its object is alive; element objects that are
    # created on demand and then dropped (as lxml proxies are) could
    # otherwise hand a recorded id to a different paragraph.
    keep_alive: list = []
    for tag in SHAPE_TAGS:
        for shape in _descendants(scope_el, tag):
            for sub_p in _descendants(shape, "p"):
                pid = id(sub_p)
                if pid in seen_ids:
                    continue
                seen_ids.add(pid)
                keep_alive.append(sub_p)
                md = _p_element_to_md(sub_p, doc, notes_out).strip()
                if md:
                    lines.append(md)
    return lines
278
+
279
+
280
def _cell_to_md(cell, doc, mapping, depth: int = 0, notes_out: list | None = None) -> str:
    """Render a table cell as a single ``<br>``-joined Markdown fragment.

    For each paragraph of the cell, in order: its inline text, the text of
    shapes inside it, its images, then any tables nested in it (rendered one
    level deeper). Empty pieces are dropped.
    """
    pieces: list[str] = []
    for paragraph in cell.paragraphs:
        element = paragraph.element
        text_md = _p_element_to_md(element, doc, notes_out).strip()
        image_lines = _paragraph_images(element, mapping)
        shape_lines = _shape_text_lines(element, doc, notes_out)
        if text_md:
            pieces.append(text_md)
        pieces += shape_lines
        pieces += image_lines
        pieces += [
            _table_to_md(nested, doc, mapping, depth + 1, notes_out)
            for nested in paragraph.tables
        ]
    non_empty = [piece for piece in pieces if piece]
    return "<br>".join(non_empty).strip()
293
+
294
+
295
def _table_to_md(tbl, doc, mapping, depth: int = 0, notes_out: list | None = None) -> str:
    """Render a table as a GFM pipe table or, when necessary, as HTML.

    HTML is used when the table has merged cells or is nested (``depth > 0``);
    spans are emitted as ``colspan``/``rowspan`` and the first row uses
    ``<th>``. Otherwise a pipe table is produced with the first row as header.

    Args:
        tbl: Table exposing ``get_cell_map()``, ``row_count``, ``column_count``.
        doc: Document, passed through to the cell renderer.
        mapping: Image map, passed through.
        depth: Nesting depth; ``0`` for a top-level table.
        notes_out: Optional note collector, passed through.
    """
    grid = tbl.get_cell_map()
    rows, cols = tbl.row_count, tbl.column_count
    has_merge = any(not pos.is_anchor for row in grid for pos in row)

    if has_merge or depth > 0:
        # Merged cells or a nested table: HTML.
        out = ["<table>"]
        for r in range(rows):
            out.append("<tr>")
            for c in range(cols):
                pos = grid[r][c]
                if not pos.is_anchor:
                    continue
                # Extend right/down while the neighbouring position is a
                # non-anchor belonging to the same cell.
                col_end = c
                while (
                    col_end + 1 < cols
                    and not grid[r][col_end + 1].is_anchor
                    and grid[r][col_end + 1].cell is pos.cell
                ):
                    col_end += 1
                row_end = r
                while (
                    row_end + 1 < rows
                    and not grid[row_end + 1][c].is_anchor
                    and grid[row_end + 1][c].cell is pos.cell
                ):
                    row_end += 1
                colspan = col_end - c + 1
                rowspan = row_end - r + 1
                attrs = []
                if colspan > 1:
                    attrs.append(f'colspan="{colspan}"')
                if rowspan > 1:
                    attrs.append(f'rowspan="{rowspan}"')
                attr_s = (" " + " ".join(attrs)) if attrs else ""
                content = _cell_to_md(pos.cell, doc, mapping, depth + 1, notes_out)
                tag = "th" if r == 0 else "td"
                out.append(f"<{tag}{attr_s}>{content}</{tag}>")
            out.append("</tr>")
        out.append("</table>")
        return "\n".join(out)

    # Simple table: GFM.
    lines = []
    for r in range(rows):
        # A pipe-table row must stay on one line. Cell content can contain
        # newlines (a nested table is rendered as multi-line HTML), so they
        # are flattened to spaces here.
        cells = [
            _cell_to_md(grid[r][c].cell, doc, mapping, depth + 1, notes_out).replace("\n", " ")
            for c in range(cols)
        ]
        lines.append("| " + " | ".join(cells) + " |")
        if r == 0:
            lines.append("| " + " | ".join(["---"] * cols) + " |")
    return "\n".join(lines)
349
+
350
+
351
+ # ──────────────────────────────────────────────────────────────────
352
+ # 헤딩 감지
353
+ # ──────────────────────────────────────────────────────────────────
354
def _detect_heading(text: str) -> str | None:
    """Return a Markdown heading for *text* when it looks like a section title.

    Inline markers (``~~``, ``**``, ``*``, HTML tags) are removed and the
    backslash escapes of ``[``, ``]`` and ``|`` are undone before matching.
    A ``ROMAN_HEAD`` match gives a ``#`` heading; an ``ARABIC_HEAD`` match
    shorter than 40 characters gives a ``##`` heading; otherwise ``None``.
    """
    candidate = re.sub(r"~~|\*\*|<[^>]+>|\*", "", text.strip())
    for escaped, literal in (("\\[", "["), ("\\]", "]"), ("\\|", "|")):
        candidate = candidate.replace(escaped, literal)

    if ROMAN_HEAD.match(candidate):
        return f"# {candidate}"
    is_short_arabic = bool(ARABIC_HEAD.match(candidate)) and len(candidate) < 40
    return f"## {candidate}" if is_short_arabic else None
362
+
363
+
364
+ # ──────────────────────────────────────────────────────────────────
365
+ # Public API
366
+ # ──────────────────────────────────────────────────────────────────
367
def export_markdown(
    source: Union[HwpxDocument, str, Path, bytes],
    *,
    image_dir: Union[str, Path, None] = None,
    image_ref_prefix: str | None = None,
    detect_headings: bool = True,
    notes_section_separator: str = "\n\n---\n",
) -> str:
    """Convert an HWPX document to rich Markdown.

    Parameters
    ----------
    source : HwpxDocument | path | bytes
        An open document, a file path, or the raw bytes of a file.
    image_dir : path | None
        Directory that ``BinData/*`` entries are extracted to. When ``None``,
        no image markers are generated.
    image_ref_prefix : str | None
        Prefix of image paths in the Markdown. ``None`` means the base name
        of ``image_dir``.
    detect_headings : bool
        Whether paragraphs matching the ``Ⅰ.`` / ``1.`` patterns are promoted
        to ``#`` / ``##`` headings.
    notes_section_separator : str
        Text inserted before the footnote/endnote definitions.

    Returns
    -------
    str
        The Markdown body, followed by note definitions when the document
        contains notes. Notes are numbered ``fn1``, ``fn2``, … / ``en1``, …
        in order of first appearance.
    """
    if isinstance(source, HwpxDocument):
        doc = source
    elif isinstance(source, (bytes, bytearray)):
        import io

        doc = HwpxDocument.open(io.BytesIO(source))
    else:
        doc = HwpxDocument.open(str(source))

    mapping = _build_image_map(doc, Path(image_dir) if image_dir else None, image_ref_prefix)
    notes: list[tuple] = []
    lines: list[str] = []

    for section in doc.sections:
        for p in section.paragraphs:
            md = _p_element_to_md(p.element, doc, notes).strip()
            imgs = _paragraph_images(p.element, mapping)
            tables = [_table_to_md(t, doc, mapping, 0, notes) for t in p.tables]

            # Duplicate guard 1: if the paragraph text also appears inside
            # its table cells, the table is the authoritative rendering.
            if md and p.tables:
                plain = (p.text or "").strip()
                all_cell_text = "".join(
                    (cell.text or "")
                    for tbl in p.tables
                    for row in tbl.rows
                    for cell in row.cells
                )
                if plain and plain in all_cell_text:
                    md = ""

            # Duplicate guard 2: when the paragraph holds a shape, its own
            # text is treated as spill-over of the shape text.
            if md and any(_has_descendant(p.element, tag) for tag in SHAPE_TAGS):
                md = ""

            # Paragraphs inside shapes (shapes inside tables are handled by
            # the cell renderer, so table paragraphs are pre-marked as seen).
            shape_lines: list[str] = []
            seen_p = set()
            # Keep a reference to every element whose id() is recorded. An
            # id is only unique while its object is alive; element objects
            # created on demand and then dropped (as lxml proxies are) could
            # otherwise hand a recorded id to a different paragraph.
            keep_alive: list = []
            for sub in p.tables:
                nested_paragraphs = _descendants(sub.element, "p")
                keep_alive.extend(nested_paragraphs)
                seen_p.update(id(nested_p) for nested_p in nested_paragraphs)
            for tag in SHAPE_TAGS:
                for shape in _descendants(p.element, tag):
                    for sub_p in _descendants(shape, "p"):
                        pid = id(sub_p)
                        if pid in seen_p:
                            continue
                        seen_p.add(pid)
                        keep_alive.append(sub_p)
                        sub_md = _p_element_to_md(sub_p, doc, notes).strip()
                        if sub_md:
                            shape_lines.append(sub_md)

            # Heading detection (including a heading held in a 1x1 table).
            promoted = None
            if detect_headings:
                if md:
                    promoted = _detect_heading(md)
                elif p.tables and len(p.tables) == 1:
                    t = p.tables[0]
                    if t.row_count == 1 and t.column_count == 1:
                        cell_text = _cell_to_md(
                            t.rows[0].cells[0], doc, mapping, 0, notes
                        )
                        promoted = _detect_heading(cell_text)
                        if promoted:
                            # The heading replaces the whole 1x1 table.
                            lines.append(promoted)
                            continue

            if promoted:
                lines.append(promoted)
            elif md:
                lines.append(md)
            lines.extend(shape_lines)
            lines.extend(imgs)
            lines.extend(tables)

    body = "\n\n".join(lines)

    # Map note instIds to fn1/en1 sequence numbers and append definitions.
    if notes:
        seq_map: dict[str, dict[str, int]] = {"fn": {}, "en": {}}
        for kind, inst_id, _ in notes:
            if inst_id not in seq_map[kind]:
                seq_map[kind][inst_id] = len(seq_map[kind]) + 1

        # Rewrite all markers in one pass. Replacing them one after another
        # lets an already rewritten marker (e.g. instId "2" -> "[^fn1]") be
        # rewritten again when a later note has instId "1".
        marker_map = {
            f"[^{kind}{inst_id}]": f"[^{kind}{seq}]"
            for kind, m in seq_map.items()
            for inst_id, seq in m.items()
        }
        marker_re = re.compile("|".join(re.escape(marker) for marker in marker_map))
        body = marker_re.sub(lambda match: marker_map[match.group(0)], body)

        body += notes_section_separator
        seen = set()
        for kind, inst_id, text in notes:
            key = (kind, inst_id)
            if key in seen:
                continue
            seen.add(key)
            seq = seq_map[kind][inst_id]
            body += f"\n[^{kind}{seq}]: {text}\n"

    return body
@@ -41,10 +41,14 @@ class TableMapEntry(TypedDict):
41
41
 
42
42
  table_index: int
43
43
  paragraph_index: int
44
+ location: dict[str, object]
44
45
  rows: int
45
46
  cols: int
47
+ caption_text: str
48
+ preceding_paragraph_text: str
46
49
  header_text: str
47
50
  first_row_preview: list[str]
51
+ cells: list[dict[str, object]]
48
52
  is_empty: bool
49
53
 
50
54
 
@@ -107,6 +111,8 @@ class TableFillResult(TypedDict):
107
111
  class _AnchoredTable:
108
112
  table: HwpxOxmlTable
109
113
  paragraph_index: int
114
+ caption_text: str
115
+ preceding_paragraph_text: str
110
116
  header_text: str
111
117
 
112
118
 
@@ -115,6 +121,8 @@ class _IndexedTable:
115
121
  table_index: int
116
122
  table: HwpxOxmlTable
117
123
  paragraph_index: int
124
+ caption_text: str
125
+ preceding_paragraph_text: str
118
126
  header_text: str
119
127
 
120
128
 
@@ -193,6 +201,8 @@ def _collect_tables_from_paragraph(
193
201
  _AnchoredTable(
194
202
  table=table,
195
203
  paragraph_index=anchor_paragraph_index,
204
+ caption_text=paragraph_prefix_text,
205
+ preceding_paragraph_text=last_header_text,
196
206
  header_text=header_text,
197
207
  )
198
208
  )
@@ -227,6 +237,8 @@ def _collect_document_tables(document: HwpxDocument) -> list[_IndexedTable]:
227
237
  table_index=table_index,
228
238
  table=item.table,
229
239
  paragraph_index=item.paragraph_index,
240
+ caption_text=item.caption_text,
241
+ preceding_paragraph_text=item.preceding_paragraph_text,
230
242
  header_text=item.header_text,
231
243
  )
232
244
  for table_index, item in enumerate(anchored_tables)
@@ -234,7 +246,11 @@ def _collect_document_tables(document: HwpxDocument) -> list[_IndexedTable]:
234
246
 
235
247
 
236
248
  def _cell_text(table: HwpxOxmlTable, row_index: int, col_index: int) -> str:
237
- return table.cell(row_index, col_index).text
249
+ cell = table.cell(row_index, col_index)
250
+ paragraphs = list(getattr(cell, "paragraphs", []) or [])
251
+ if paragraphs:
252
+ return "\n".join(paragraph.text or "" for paragraph in paragraphs)
253
+ return cell.text
238
254
 
239
255
 
240
256
  def _table_is_empty(table: HwpxOxmlTable) -> bool:
@@ -251,6 +267,62 @@ def _first_row_preview(table: HwpxOxmlTable) -> list[str]:
251
267
  return [_cell_text(table, 0, col_index) for col_index in range(table.column_count)]
252
268
 
253
269
 
270
+ def _body_paragraph_location(paragraph_index: int) -> dict[str, object]:
271
+ return {"kind": "body_paragraph", "paragraph_index": paragraph_index}
272
+
273
+
274
+ def _table_cell_paragraph_location(
275
+ table_index: int,
276
+ row_index: int,
277
+ col_index: int,
278
+ cell_paragraph_index: int,
279
+ ) -> dict[str, object]:
280
+ return {
281
+ "kind": "table_cell_paragraph",
282
+ "table_index": table_index,
283
+ "row": row_index,
284
+ "col": col_index,
285
+ "cell_paragraph_index": cell_paragraph_index,
286
+ }
287
+
288
+
289
def _table_cells(table_ref: _IndexedTable) -> list[dict[str, object]]:
    """Describe every grid position of the table in row-major order.

    Each entry holds the position (``row``, ``col``), the cell text, a
    ``table_cell`` location record, and one payload per cell paragraph with
    its index, text and ``table_cell_paragraph`` location.
    """
    table = table_ref.table
    table_index = table_ref.table_index
    described: list[dict[str, object]] = []
    for row_index in range(table.row_count):
        for col_index in range(table.column_count):
            cell = table.cell(row_index, col_index)
            # Cells without a ``paragraphs`` attribute are treated as
            # having no paragraphs.
            cell_paragraphs = list(getattr(cell, "paragraphs", []) or [])
            paragraph_payloads: list[dict[str, object]] = [
                {
                    "cell_paragraph_index": position,
                    "text": paragraph.text or "",
                    "location": _table_cell_paragraph_location(
                        table_index, row_index, col_index, position
                    ),
                }
                for position, paragraph in enumerate(cell_paragraphs)
            ]
            cell_location = {
                "kind": "table_cell",
                "table_index": table_index,
                "row": row_index,
                "col": col_index,
            }
            described.append(
                {
                    "row": row_index,
                    "col": col_index,
                    "text": _cell_text(table, row_index, col_index),
                    "paragraphs": paragraph_payloads,
                    "location": cell_location,
                }
            )
    return described
324
+
325
+
254
326
  def _direction_delta(direction: PathDirection) -> tuple[int, int]:
255
327
  if direction == "right":
256
328
  return (0, 1)
@@ -337,10 +409,14 @@ def get_table_map(document: HwpxDocument) -> TableMapResult:
337
409
  {
338
410
  "table_index": table_ref.table_index,
339
411
  "paragraph_index": table_ref.paragraph_index,
412
+ "location": _body_paragraph_location(table_ref.paragraph_index),
340
413
  "rows": table_ref.table.row_count,
341
414
  "cols": table_ref.table.column_count,
415
+ "caption_text": table_ref.caption_text,
416
+ "preceding_paragraph_text": table_ref.preceding_paragraph_text,
342
417
  "header_text": table_ref.header_text,
343
418
  "first_row_preview": _first_row_preview(table_ref.table),
419
+ "cells": _table_cells(table_ref),
344
420
  "is_empty": _table_is_empty(table_ref.table),
345
421
  }
346
422
  )
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-hwpx
3
- Version: 2.10.1
3
+ Version: 2.10.2
4
4
  Summary: 한글 없이 HWPX 문서를 열고, 편집하고, 생성하고, 검증하는 Python 자동화 라이브러리
5
5
  Author: python-hwpx Maintainers
6
6
  License-Expression: Apache-2.0
@@ -115,6 +115,47 @@ hwpx-validate-package 보고서.hwpx
115
115
  hwpx-analyze-template 보고서.hwpx
116
116
  ```
117
117
 
118
+ ### 4. 풍부한 Markdown 변환 (서식·표·각주·이미지 보존)
119
+
120
+ `export_markdown()`는 단순 평문 추출이고, `export_rich_markdown()`는 인라인 서식(`**굵게**`, `*기울임*`, `~~취소선~~`),
121
+ 표(중첩 포함, colspan/rowspan 안전), 도형 텍스트, 이미지, 각주/미주, 하이퍼링크, 제목(`#`/`##`) 자동 감지까지 보존한다.
122
+
123
+ ```python
124
+ from hwpx import HwpxDocument
125
+
126
+ doc = HwpxDocument.open("보고서.hwpx")
127
+
128
+ md = doc.export_rich_markdown(
129
+ image_dir="out/images", # BinData 이미지를 디스크에 추출
130
+ image_ref_prefix="images/", # 마크다운 내 ![](images/...) 경로 접두
131
+ detect_headings=True, # Ⅰ./1. 패턴 기반 #/## 자동
132
+ )
133
+ print(md)
134
+ ```
135
+
136
+ 문자열·경로·바이트도 그대로 받는다:
137
+
138
+ ```python
139
+ from hwpx.tools.markdown_export import export_markdown
140
+
141
+ md = export_markdown("보고서.hwpx") # 경로
142
+ md = export_markdown(open("a.hwpx", "rb").read()) # bytes
143
+ ```
144
+
145
+ ### 5. 각주 본문에 혼합 서식 / 하이퍼링크 추가
146
+
147
+ `HwpxOxmlNote`에 `body_paragraph`, `add_run`, `add_hyperlink` helper가 있어 각주 본문을
148
+ 직접 paragraph로 다루지 않고도 인라인 서식·링크를 손쉽게 채울 수 있다.
149
+
150
+ ```python
151
+ para = section.paragraphs[0]
152
+ note = para.add_footnote("") # 빈 각주 생성 후 본문 구성
153
+ note.add_run("자세한 내용은 ")
154
+ note.add_run("정부 공식 사이트", bold=True)
155
+ note.add_run("를 참고하라: ")
156
+ note.add_hyperlink("https://www.kasa.go.kr", "우주항공청")
157
+ ```
158
+
118
159
  처음에는 `open/new -> edit/extract -> save_to_path` 흐름만 잡으면 된다. 패키지 구조, XML 파트, 템플릿 회귀 점검은 필요할 때만 확장하면 된다.
119
160
 
120
161
  ## 어디부터 읽으면 되나
@@ -244,6 +285,7 @@ doc.set_footer_text("1 / 10", page_type="BOTH")
244
285
  # 표 셀 병합·분할
245
286
  table.merge_cells(0, 0, 1, 1) # (0,0)~(1,1) 병합
246
287
  table.set_cell_text(0, 0, "병합된 셀", logical=True, split_merged=True)
288
+ table.set_cell_text(0, 0, "line 1\nline 2", split_paragraphs=True)
247
289
 
248
290
  # 양식형 표 자동 채우기
249
291
  form = doc.add_table(2, 2)
@@ -257,6 +299,12 @@ doc.fill_by_path({
257
299
  })
258
300
  ```
259
301
 
302
+ `doc.paragraphs`의 인덱스는 본문 직속 문단 0-based 기준입니다. 표 안 문단은
303
+ 본문 `paragraph_index`에 섞지 않고 `get_table_map()`의 cell `location`
304
+ (`table_index`, `row`, `col`, `cell_paragraph_index`)으로 다룹니다.
305
+ `get_table_map()`은 `caption_text`와 `preceding_paragraph_text`를 분리해
306
+ 반환하고, 셀 미리보기의 여러 문단은 `\n`으로 유지합니다.
307
+
260
308
  ### 🔍 텍스트 추출 & 검색
261
309
 
262
310
  ```python
@@ -1,6 +1,6 @@
1
1
  hwpx/__init__.py,sha256=ouwTSF8JrUPVgwWxB1hudQwVdhAA981uHeX_wXxxQHo,2205
2
2
  hwpx/authoring.py,sha256=caZfPFe99ilaJMDJEDRsWKCb-QKAp18M0vRlPdM0PR0,96068
3
- hwpx/document.py,sha256=Q8uHzYryMFUXn6fc7Uhi1cEbmhaqiQ8uqb8bT-gAYjU,54798
3
+ hwpx/document.py,sha256=1kb0n6C5cEiex7Bs58MlLhFXI8mknQFqErTkCYaFuQE,55204
4
4
  hwpx/form_fill.py,sha256=VUIU53Qa9Ho2aP72biDvJwnDW7ngdAzu3PSd5A7d1JM,9908
5
5
  hwpx/package.py,sha256=0rKjGCJbPQvrVBIy07Jpjsu3fI7HhbqFCGWTiTDsJpo,1141
6
6
  hwpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -16,7 +16,7 @@ hwpx/opc/xml_utils.py,sha256=O_eZtp1-8vWimoi9Xdy0uzmtk8bnkfjf-QLjD_uWSFQ,3483
16
16
  hwpx/oxml/__init__.py,sha256=tUoiHQw3oJpHvSES6f5AuhpfXvlby0Df-3L0t-CMhxM,5000
17
17
  hwpx/oxml/body.py,sha256=VOwlyDRgoLMlDePFbCjU3qlBAefO9DIoSGsEI2Fr8DA,24888
18
18
  hwpx/oxml/common.py,sha256=TJkafzg7x4T3J29tZchRZk57ZTsrM9PEiqGT3rX3w5o,1044
19
- hwpx/oxml/document.py,sha256=QA37yh53PDNk7-TNKhSViXDlcz5vkLlosZMcHSSyKKc,209419
19
+ hwpx/oxml/document.py,sha256=WIKmJ-nW0Wqh2vEf-06-5gx6EpQc-TvbJr-nze3VYtQ,214823
20
20
  hwpx/oxml/header.py,sha256=_KgKsCN6UWB8r59z2iqe0rLC8EdEZyJD7GfQ0Xd2WXM,43080
21
21
  hwpx/oxml/header_part.py,sha256=U3tXD1LWruAdQV-w9cIBv8iXPpQ1oUm0CXlxAAonZ6I,231
22
22
  hwpx/oxml/memo.py,sha256=WSJSTYOSLKG836eF_UsrD99hMqJhWwzRZ8pJbHq-nsA,228
@@ -34,6 +34,7 @@ hwpx/tools/archive_cli.py,sha256=rlgE6KBeJORa8Z6RhGOVmOl7gGIKdgA9GY106EFouVo,122
34
34
  hwpx/tools/exporter.py,sha256=hx7th-LAL1a5G0ICyVcyJPJaUY5jEgDJUZ7UYg_YAmI,6578
35
35
  hwpx/tools/generic_inventory.py,sha256=pHVP8-htX_vO02ARdQR37XFxm7fUPK68VtMeeOJ1NZY,4835
36
36
  hwpx/tools/id_integrity.py,sha256=_Ra981ZPX1WXH_bK-2KNhCnwPVYErfdX2wW4SosX0Ls,9256
37
+ hwpx/tools/markdown_export.py,sha256=FejutCpQHbycO185uljcSwfZuwXMTbGEgXtf5e-a4_k,19139
37
38
  hwpx/tools/object_finder.py,sha256=7i6XI1-r7-ar_IzSZQ82hfOcxVzJFK2XjMDB8oxcmMA,13478
38
39
  hwpx/tools/package_validator.py,sha256=87uv7uVh6wqqY8-woX9kAGnwuWK3uYL4BHfGf7NNgcs,14521
39
40
  hwpx/tools/page_guard.py,sha256=nDAVPcvrnuyDxVTA_j22wiYD7CXAD6XlzsMzaz3h_q8,9701
@@ -43,17 +44,17 @@ hwpx/tools/report_parser.py,sha256=3Daqn2hqIcj5pG1qUxeYbvWr7CvdhwzatWvxCCcnSZg,4
43
44
  hwpx/tools/report_utils.py,sha256=6HYEeQc3ZxTpxbwF11s47uZ-KmV4tsHPE1MV4491KDE,4434
44
45
  hwpx/tools/roundtrip_diff.py,sha256=ao0AdpDJkq89u5hwcrsxTijvSsia9Jaw1OOnh4WAco4,1365
45
46
  hwpx/tools/table_cleanup.py,sha256=0_f6NnvNp3QD4owKd_bRX6FZbeUmoQC7a4_VGzF2SCE,1796
46
- hwpx/tools/table_navigation.py,sha256=oGfJE0cM3WIvE8_avtbST8R_nITnoMwDA4t-4IEW9dg,13520
47
+ hwpx/tools/table_navigation.py,sha256=rtbrWFKpJhqC3LD0ZXImyHgjmDR2hjHCFy3_S-qNBwA,16479
47
48
  hwpx/tools/template_analyzer.py,sha256=qZMIyB-r4YXZqU54v6uwt_CQiOAQR0mVgmo_Bt4biWM,8497
48
49
  hwpx/tools/text_extract_cli.py,sha256=BmsDAwNXpDPhEayb9ez2ORtGNzPd_Xxduy4_cLXhnUw,2188
49
50
  hwpx/tools/text_extractor.py,sha256=dqGzOnJVRUEfrxiTt04GkDrfY4yfZXRIhPtEwTM77Mw,25289
50
51
  hwpx/tools/validator.py,sha256=LMo8gIMoptP9RRDbYKV4WwrM59rclC5h3HP-ZJRUxO0,6856
51
52
  hwpx/tools/_schemas/header.xsd,sha256=mJXuFMuHGT1JnFFaluUpYUglwjMCNlfbFCRVM26eHXE,664
52
53
  hwpx/tools/_schemas/section.xsd,sha256=MgvavVHG05RDfUnVPxVU10H4FQOja5ON04_m9Uk_m7E,522
53
- python_hwpx-2.10.1.dist-info/licenses/LICENSE,sha256=_ubz4wv-BkkT3l3gu-QuH7JGeVjuRYGZoZK95eNsCHU,9688
54
- python_hwpx-2.10.1.dist-info/licenses/NOTICE,sha256=k48h6EaGQE8Y1c0dS9sIOOcz4YqkbcImWClF7pBOgsg,2473
55
- python_hwpx-2.10.1.dist-info/METADATA,sha256=43VEoLZ0bnRSIhdTzq10F49GYCdSXIwHqyNcZygODVk,16077
56
- python_hwpx-2.10.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
57
- python_hwpx-2.10.1.dist-info/entry_points.txt,sha256=JUKRxbly9UaeHV7YzOea23y8IiqSTcrhUlooP3fS_Zc,405
58
- python_hwpx-2.10.1.dist-info/top_level.txt,sha256=R1iToqDh80Nf2oQhRjTN0rbN2X6kyDUizIocZjkhuxc,5
59
- python_hwpx-2.10.1.dist-info/RECORD,,
54
+ python_hwpx-2.10.2.dist-info/licenses/LICENSE,sha256=_ubz4wv-BkkT3l3gu-QuH7JGeVjuRYGZoZK95eNsCHU,9688
55
+ python_hwpx-2.10.2.dist-info/licenses/NOTICE,sha256=k48h6EaGQE8Y1c0dS9sIOOcz4YqkbcImWClF7pBOgsg,2473
56
+ python_hwpx-2.10.2.dist-info/METADATA,sha256=S3vl8kgL0d7BcCafoPk8AuV7otQmjutlivMFAvUNROA,18099
57
+ python_hwpx-2.10.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
58
+ python_hwpx-2.10.2.dist-info/entry_points.txt,sha256=JUKRxbly9UaeHV7YzOea23y8IiqSTcrhUlooP3fS_Zc,405
59
+ python_hwpx-2.10.2.dist-info/top_level.txt,sha256=R1iToqDh80Nf2oQhRjTN0rbN2X6kyDUizIocZjkhuxc,5
60
+ python_hwpx-2.10.2.dist-info/RECORD,,