python-hwpx 2.10.1__py3-none-any.whl → 2.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hwpx/document.py +8 -0
- hwpx/oxml/document.py +147 -2
- hwpx/tools/markdown_export.py +488 -0
- hwpx/tools/table_navigation.py +77 -1
- {python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/METADATA +49 -1
- {python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/RECORD +11 -10
- {python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/WHEEL +0 -0
- {python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/entry_points.txt +0 -0
- {python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/licenses/LICENSE +0 -0
- {python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/licenses/NOTICE +0 -0
- {python_hwpx-2.10.1.dist-info → python_hwpx-2.10.2.dist-info}/top_level.txt +0 -0
hwpx/document.py
CHANGED
|
@@ -1472,6 +1472,14 @@ class HwpxDocument:
|
|
|
1472
1472
|
from .tools.exporter import export_markdown
|
|
1473
1473
|
return export_markdown(self, **kwargs) # type: ignore[arg-type]
|
|
1474
1474
|
|
|
1475
|
+
def export_rich_markdown(self, **kwargs: object) -> str:
    """Render this document as rich Markdown.

    The work is delegated to
    :func:`~hwpx.tools.markdown_export.export_markdown`, which is imported
    lazily at call time; every keyword argument is forwarded unchanged.

    Returns:
        The Markdown text produced by the rich exporter.
    """
    from .tools.markdown_export import export_markdown as rich_exporter

    markdown = rich_exporter(self, **kwargs)  # type: ignore[arg-type]
    return markdown
|
|
1482
|
+
|
|
1475
1483
|
# ------------------------------------------------------------------
|
|
1476
1484
|
# Validation
|
|
1477
1485
|
# ------------------------------------------------------------------
|
hwpx/oxml/document.py
CHANGED
|
@@ -1872,6 +1872,68 @@ class HwpxOxmlNote:
|
|
|
1872
1872
|
t.text = _sanitize_text(value)
|
|
1873
1873
|
self.paragraph.section.mark_dirty()
|
|
1874
1874
|
|
|
1875
|
+
@property
def body_paragraph(self) -> "HwpxOxmlParagraph":
    """Wrap the note's own ``<hp:p>`` body as a :class:`HwpxOxmlParagraph`.

    This is the first ``<hp:p>`` found below the note element, not
    :attr:`paragraph` (the hosting paragraph that carries the note marker).
    Use it to append runs with mixed formatting to the note body:

    >>> note = para.add_footnote("base ")
    >>> note.add_run("blue", char_pr_id_ref=5)

    Raises:
        ValueError: if the note element contains no ``<hp:p>``.
    """
    body_element = self.element.find(f".//{_HP}p")
    if body_element is not None:
        return HwpxOxmlParagraph(body_element, self.paragraph.section)
    raise ValueError("note has no body paragraph element")
|
|
1891
|
+
|
|
1892
|
+
def add_run(
    self,
    text: str = "",
    *,
    char_pr_id_ref: str | int | None = None,
    bold: bool = False,
    italic: bool = False,
    underline: bool = False,
    color: str | None = None,
    font: str | None = None,
    size: int | float | None = None,
    highlight: str | None = None,
    strike: bool | None = None,
    attributes: dict[str, str] | None = None,
) -> "HwpxOxmlRun":
    """Append a run to the note body.

    Thin convenience wrapper: every argument is handed to
    ``body_paragraph.add_run`` unchanged and its result is returned.
    """
    formatting = {
        "char_pr_id_ref": char_pr_id_ref,
        "bold": bold,
        "italic": italic,
        "underline": underline,
        "color": color,
        "font": font,
        "size": size,
        "highlight": highlight,
        "strike": strike,
        "attributes": attributes,
    }
    target = self.body_paragraph
    return target.add_run(text, **formatting)
|
|
1921
|
+
|
|
1922
|
+
def add_hyperlink(
    self,
    url: str,
    display_text: str,
    *,
    char_pr_id_ref: str | int | None = None,
) -> "HwpxOxmlInlineObject":
    """Append a hyperlink to the note body.

    Forwards to ``body_paragraph.add_hyperlink`` with the same arguments
    and returns whatever that call returns.
    """
    target = self.body_paragraph
    return target.add_hyperlink(url, display_text, char_pr_id_ref=char_pr_id_ref)
|
|
1936
|
+
|
|
1875
1937
|
|
|
1876
1938
|
def _default_sublist_attributes() -> dict[str, str]:
|
|
1877
1939
|
"""Return standard attributes for a ``<hp:subList>`` element.
|
|
@@ -2425,6 +2487,9 @@ class HwpxOxmlTableCell:
|
|
|
2425
2487
|
|
|
2426
2488
|
@property
|
|
2427
2489
|
def text(self) -> str:
|
|
2490
|
+
paragraphs = self.paragraphs
|
|
2491
|
+
if paragraphs:
|
|
2492
|
+
return "\n".join(paragraph.text or "" for paragraph in paragraphs)
|
|
2428
2493
|
parts: list[str] = []
|
|
2429
2494
|
for t_elem in self.element.findall(f".//{_HP}t"):
|
|
2430
2495
|
if t_elem.text:
|
|
@@ -2433,8 +2498,79 @@ class HwpxOxmlTableCell:
|
|
|
2433
2498
|
|
|
2434
2499
|
@text.setter
|
|
2435
2500
|
def text(self, value: str) -> None:
|
|
2501
|
+
self.set_text(value)
|
|
2502
|
+
|
|
2503
|
+
def _first_run_char_pr_id_ref(self) -> str:
|
|
2504
|
+
for paragraph in self.paragraphs:
|
|
2505
|
+
for run in paragraph.runs:
|
|
2506
|
+
if run.char_pr_id_ref is not None:
|
|
2507
|
+
return str(run.char_pr_id_ref)
|
|
2508
|
+
return "0"
|
|
2509
|
+
|
|
2510
|
+
def _paragraph_format_attrs(self, paragraph: "HwpxOxmlParagraph | None" = None) -> dict[str, str]:
    """Build the attribute dict for a newly created cell paragraph.

    Starts from ``_default_cell_paragraph_attributes()``, copies the
    formatting-related attributes that are present on *paragraph*'s
    element (if a paragraph is given), and always assigns a fresh ``id``
    from ``_paragraph_id()``.

    Args:
        paragraph: Optional existing paragraph whose formatting attributes
            should be carried over.

    Returns:
        A new attribute dictionary; the source paragraph is not modified.
    """
    # The whole annotation is one string: `"X" | None` would be evaluated as
    # `str | None` and raise TypeError wherever annotations are evaluated.
    source = paragraph.element if paragraph is not None else None
    attrs = dict(_default_cell_paragraph_attributes())
    if source is not None:
        for key in ("paraPrIDRef", "styleIDRef", "pageBreak", "columnBreak", "merged"):
            value = source.get(key)
            if value is not None:
                attrs[key] = value
    # The id is assigned last so a copied attribute can never override it.
    attrs["id"] = _paragraph_id()
    return attrs
|
|
2520
|
+
|
|
2521
|
+
def _run_char_pr_for_line(self, paragraphs: Sequence["HwpxOxmlParagraph"], index: int) -> str:
|
|
2522
|
+
if index < len(paragraphs):
|
|
2523
|
+
for run in paragraphs[index].runs:
|
|
2524
|
+
if run.char_pr_id_ref is not None:
|
|
2525
|
+
return str(run.char_pr_id_ref)
|
|
2526
|
+
return self._first_run_char_pr_id_ref()
|
|
2527
|
+
|
|
2528
|
+
def _set_split_paragraph_text(self, value: str) -> None:
    """Rewrite the cell so that every line of *value* is its own paragraph.

    All direct ``<hp:p>`` children of the cell's sub-list are removed and
    one paragraph with a single run is created per line. ``\\r\\n`` and
    ``\\r`` are treated as line breaks. Paragraph attributes are taken from
    the previously existing paragraph at the same index, or from the first
    previous paragraph when there are more lines than old paragraphs. The
    run's ``charPrIDRef`` comes from ``_run_char_pr_for_line``.

    Args:
        value: New text; ``None`` or ``""`` produces one empty paragraph.
    """
    sublist = self._ensure_sublist()
    # Snapshot the old paragraphs before removing them: they are still
    # needed as formatting sources below.
    existing = self.paragraphs
    # str.split always returns at least one item, so no empty-list fallback
    # is needed.
    lines = (value or "").replace("\r\n", "\n").replace("\r", "\n").split("\n")

    for paragraph in list(sublist.findall(f"{_HP}p")):
        sublist.remove(paragraph)

    for index, line in enumerate(lines):
        if index < len(existing):
            source = existing[index]
        elif existing:
            source = existing[0]
        else:
            source = None
        paragraph = _append_child(sublist, f"{_HP}p", self._paragraph_format_attrs(source))
        run = _append_child(
            paragraph,
            f"{_HP}run",
            {"charPrIDRef": self._run_char_pr_for_line(existing, index)},
        )
        _append_text_with_tabs(run, line)
|
|
2547
|
+
|
|
2548
|
+
def set_text(
    self,
    value: str,
    *,
    preserve_format: bool = True,
    split_paragraphs: bool = False,
) -> None:
    """Replace the text of this cell and mark the cell and table dirty.

    Args:
        value: The new text.
        preserve_format: When ``False``, the run that holds the written
            text element gets ``charPrIDRef="0"``; when ``True`` the run's
            reference is left untouched.
        split_paragraphs: When ``True``, delegate to
            ``_set_split_paragraph_text`` so each line becomes its own
            paragraph; *preserve_format* is not consulted on that path.
    """
    if split_paragraphs:
        self._set_split_paragraph_text(value)
        self.element.set("dirty", "1")
        self.table.mark_dirty()
        return

    text_element = self._ensure_text_element()
    text_element.text = _sanitize_text(value)
    # Blank every other text node so the cell shows only the new value.
    for node in self.element.findall(f".//{_HP}t"):
        if node is not text_element and node.text:
            node.text = ""
    if not preserve_format:
        # Locate the enclosing run by searching inside the cell instead of
        # walking parent pointers: xml.etree elements have no getparent(),
        # and an upward walk could leave the cell. Ancestors precede their
        # descendants in document order, so the last match is the nearest.
        run = None
        for candidate in self.element.iter():
            if not isinstance(candidate.tag, str):
                continue  # comments / processing instructions
            if _element_local_name(candidate) != "run":
                continue
            if any(node is text_element for node in candidate.iter()):
                run = candidate
        if run is not None:
            run.set("charPrIDRef", "0")
    self.element.set("dirty", "1")
    self.table.mark_dirty()
|
|
2440
2576
|
|
|
@@ -2898,6 +3034,8 @@ class HwpxOxmlTable:
|
|
|
2898
3034
|
*,
|
|
2899
3035
|
logical: bool = False,
|
|
2900
3036
|
split_merged: bool = False,
|
|
3037
|
+
preserve_format: bool = True,
|
|
3038
|
+
split_paragraphs: bool = False,
|
|
2901
3039
|
) -> None:
|
|
2902
3040
|
if logical:
|
|
2903
3041
|
entry = self._grid_entry(row_index, col_index)
|
|
@@ -2907,7 +3045,11 @@ class HwpxOxmlTable:
|
|
|
2907
3045
|
cell = entry.cell
|
|
2908
3046
|
else:
|
|
2909
3047
|
cell = self.cell(row_index, col_index)
|
|
2910
|
-
cell.
|
|
3048
|
+
cell.set_text(
|
|
3049
|
+
text,
|
|
3050
|
+
preserve_format=preserve_format,
|
|
3051
|
+
split_paragraphs=split_paragraphs,
|
|
3052
|
+
)
|
|
2911
3053
|
|
|
2912
3054
|
def split_merged_cell(
|
|
2913
3055
|
self, row_index: int, col_index: int
|
|
@@ -3797,7 +3939,10 @@ class HwpxOxmlParagraph:
|
|
|
3797
3939
|
sublist = _append_child(note_element, f"{_HP}subList", _default_sublist_attributes())
|
|
3798
3940
|
p_attrs = {"id": _paragraph_id(), **_DEFAULT_PARAGRAPH_ATTRS}
|
|
3799
3941
|
paragraph = _append_child(sublist, f"{_HP}p", p_attrs)
|
|
3800
|
-
|
|
3942
|
+
# 본문 run의 charPrIDRef도 인자를 따라가도록 적용 (host run과 동일 스타일).
|
|
3943
|
+
# None이면 "0"(default).
|
|
3944
|
+
body_cpr = "0" if char_pr_id_ref is None else str(char_pr_id_ref)
|
|
3945
|
+
note_run = _append_child(paragraph, f"{_HP}run", {"charPrIDRef": body_cpr})
|
|
3801
3946
|
t = _append_child(note_run, f"{_HP}t", {})
|
|
3802
3947
|
t.text = _sanitize_text(text)
|
|
3803
3948
|
self.section.mark_dirty()
|
|
@@ -0,0 +1,488 @@
|
|
|
1
|
+
"""Rich HWPX → Markdown converter.
|
|
2
|
+
|
|
3
|
+
Preserves:
|
|
4
|
+
- 인라인 서식 (bold/italic/color/shade) via run charPrIDRef diff
|
|
5
|
+
- 표 병합 셀 (colspan/rowspan) via HTML
|
|
6
|
+
- 중첩 표 재귀 HTML
|
|
7
|
+
- 도형(rect/ellipse/polygon) 내부 paragraph
|
|
8
|
+
- 이미지 (BinData → ![image](path))
|
|
9
|
+
- 헤딩 (Ⅰ. / 1. 패턴)
|
|
10
|
+
- 각주/미주 정확 위치 + fn1/en1 일련번호 + 본문 인라인 서식
|
|
11
|
+
- 하이퍼링크 [text](url) (fieldBegin/End 추적)
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
import re
|
|
17
|
+
from html import escape as html_escape
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Union
|
|
20
|
+
|
|
21
|
+
from ..document import HwpxDocument
|
|
22
|
+
from ..oxml.namespaces import tag_local_name
|
|
23
|
+
|
|
24
|
+
# 도형은 rect/ellipse/polygon만 순회. drawText/container는 이들의 자식이라
|
|
25
|
+
# 별도 순회하면 같은 paragraph가 중복 처리됨.
|
|
26
|
+
SHAPE_TAGS = ("rect", "ellipse", "polygon")
|
|
27
|
+
|
|
28
|
+
ROMAN_HEAD = re.compile(r"^\s*[ⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ]\.\s*.+")
|
|
29
|
+
ARABIC_HEAD = re.compile(r"^\s*\d+\.\s+[가-힣A-Za-z].+")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
# ──────────────────────────────────────────────────────────────────
|
|
33
|
+
# 인라인 서식
|
|
34
|
+
# ──────────────────────────────────────────────────────────────────
|
|
35
|
+
def _local_name(element) -> str:
    """Return ``tag_local_name`` applied to the element's tag as a string."""
    qualified_tag = str(element.tag)
    return tag_local_name(qualified_tag)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _direct_children(element, local_name: str):
    """List the immediate children of *element* whose local tag name matches."""
    matches = []
    for node in list(element):
        if _local_name(node) == local_name:
            matches.append(node)
    return matches
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _descendants(element, local_name: str):
    """List every descendant of *element* (itself excluded) with the given local tag name.

    Elements are returned in the order produced by ``element.iter()``.
    """
    found = []
    for node in element.iter():
        if node is element:
            continue
        if _local_name(node) == local_name:
            found.append(node)
    return found
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _first_descendant(element, local_name: str):
    """Return the first descendant with the given local tag name, or ``None``.

    *element* itself is never returned, even if its own tag matches.
    """
    candidates = (
        node
        for node in element.iter()
        if node is not element and _local_name(node) == local_name
    )
    return next(candidates, None)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _has_descendant(element, local_name: str) -> bool:
    """Tell whether *element* has at least one descendant with the given local tag name."""
    match = _first_descendant(element, local_name)
    return match is not None
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _escape_markdown_text(text: str) -> str:
|
|
63
|
+
"""Escape source text before applying generated Markdown/HTML wrappers."""
|
|
64
|
+
escaped = html_escape(text, quote=False)
|
|
65
|
+
for char in ("\\", "`", "*", "[", "]", "|"):
|
|
66
|
+
escaped = escaped.replace(char, "\\" + char)
|
|
67
|
+
return escaped
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _diff_style(cp, base_cp) -> dict:
|
|
71
|
+
if cp is None:
|
|
72
|
+
return {}
|
|
73
|
+
ca, a = cp.child_attributes, cp.attributes
|
|
74
|
+
base_ca = base_cp.child_attributes if base_cp is not None else {}
|
|
75
|
+
base_a = base_cp.attributes if base_cp is not None else {}
|
|
76
|
+
|
|
77
|
+
bold = "bold" in ca and "bold" not in base_ca
|
|
78
|
+
italic = "italic" in ca and "italic" not in base_ca
|
|
79
|
+
underline = (
|
|
80
|
+
ca.get("underline", {}).get("type", "NONE") != "NONE"
|
|
81
|
+
and base_ca.get("underline", {}).get("type", "NONE") == "NONE"
|
|
82
|
+
)
|
|
83
|
+
strike = (
|
|
84
|
+
ca.get("strikeout", {}).get("shape", "NONE") != "NONE"
|
|
85
|
+
and base_ca.get("strikeout", {}).get("shape", "NONE") == "NONE"
|
|
86
|
+
)
|
|
87
|
+
color = a.get("textColor", "#000000")
|
|
88
|
+
base_color = base_a.get("textColor", "#000000")
|
|
89
|
+
# 흰색은 어두운 배경 위 디자인 효과로 가정 → 시각 의미 없음
|
|
90
|
+
color_changed = (
|
|
91
|
+
color != base_color and color.upper() not in ("#000000", "#FFFFFF")
|
|
92
|
+
)
|
|
93
|
+
shade = a.get("shadeColor", "none")
|
|
94
|
+
base_shade = base_a.get("shadeColor", "none")
|
|
95
|
+
shade_changed = shade.lower() not in ("none", "", base_shade.lower())
|
|
96
|
+
|
|
97
|
+
return {
|
|
98
|
+
"bold": bold,
|
|
99
|
+
"italic": italic,
|
|
100
|
+
"underline": underline,
|
|
101
|
+
"strike": strike,
|
|
102
|
+
"color": color if color_changed else None,
|
|
103
|
+
"shade": shade if shade_changed else None,
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def _wrap(text: str, style: dict) -> str:
|
|
108
|
+
if not text:
|
|
109
|
+
return ""
|
|
110
|
+
out = text
|
|
111
|
+
if style.get("shade"):
|
|
112
|
+
out = f'<mark style="background-color:{style["shade"]}">{out}</mark>'
|
|
113
|
+
if style.get("color"):
|
|
114
|
+
out = f'<span style="color:{style["color"]}">{out}</span>'
|
|
115
|
+
if style.get("underline"):
|
|
116
|
+
out = f"<u>{out}</u>"
|
|
117
|
+
if style.get("strike"):
|
|
118
|
+
out = f"~~{out}~~"
|
|
119
|
+
if style.get("italic"):
|
|
120
|
+
out = f"*{out}*"
|
|
121
|
+
if style.get("bold"):
|
|
122
|
+
out = f"**{out}**"
|
|
123
|
+
return out
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def _style_key(style: dict) -> tuple:
|
|
127
|
+
return tuple(sorted((k, v) for k, v in style.items() if v))
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _render_runs(items, base_cp, chars) -> str:
    """Render a sequence of ``(char_pr_id, text)`` pairs as Markdown.

    Empty texts are skipped. Each text is escaped, and neighbouring pieces
    whose style (relative to *base_cp*) is identical are merged before the
    style wrappers are applied, so one formatted span is emitted per group.
    A char-property id missing from *chars* falls back to *base_cp*.
    """
    group_keys: list[tuple] = []
    group_texts: list[str] = []
    for cpr, text in items:
        if not text:
            continue
        char_pr = chars.get(str(cpr), base_cp)
        key = _style_key(_diff_style(char_pr, base_cp))
        escaped = _escape_markdown_text(text)
        if group_keys and group_keys[-1] == key:
            group_texts[-1] += escaped
        else:
            group_keys.append(key)
            group_texts.append(escaped)
    rendered = [
        _wrap(group_text, dict(group_key))
        for group_key, group_text in zip(group_keys, group_texts)
    ]
    return "".join(rendered)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ──────────────────────────────────────────────────────────────────
|
|
148
|
+
# 이미지 매핑
|
|
149
|
+
# ──────────────────────────────────────────────────────────────────
|
|
150
|
+
def _build_image_map(
|
|
151
|
+
doc: HwpxDocument,
|
|
152
|
+
image_dir: Path | None,
|
|
153
|
+
image_ref_prefix: str | None,
|
|
154
|
+
) -> dict[str, str]:
|
|
155
|
+
"""doc._package의 BinData/* 를 image_dir에 추출하고 {ref_stem → rel_path} 반환.
|
|
156
|
+
image_dir이 None이면 추출 없이 빈 dict (마크다운에 ![image]() 안 들어감).
|
|
157
|
+
"""
|
|
158
|
+
if image_dir is None:
|
|
159
|
+
return {}
|
|
160
|
+
image_dir = Path(image_dir)
|
|
161
|
+
image_dir.mkdir(parents=True, exist_ok=True)
|
|
162
|
+
prefix = image_ref_prefix if image_ref_prefix is not None else image_dir.name
|
|
163
|
+
mapping: dict[str, str] = {}
|
|
164
|
+
pkg = doc._package
|
|
165
|
+
for name in pkg.files():
|
|
166
|
+
if not name.startswith("BinData/"):
|
|
167
|
+
continue
|
|
168
|
+
data = pkg.read(name)
|
|
169
|
+
fname = Path(name).name
|
|
170
|
+
(image_dir / fname).write_bytes(data)
|
|
171
|
+
mapping[Path(name).stem] = f"{prefix}/{fname}" if prefix else fname
|
|
172
|
+
return mapping
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def _paragraph_images(p_el, mapping: dict[str, str]) -> list[str]:
    """Return one Markdown image line per ``<hp:pic>`` found below *p_el*.

    A picture is skipped when it has no ``img`` descendant or the ``img``
    has no ``binaryItemIDRef``. With an empty *mapping* (no image
    extraction requested) no lines are produced at all. A reference that
    is missing from *mapping* falls back to ``BinData/<ref>``.

    Args:
        p_el: Paragraph element to scan.
        mapping: ``{binary item stem -> relative path}``, as returned by
            ``_build_image_map``.
    """
    if not mapping:
        return []
    out = []
    for pic in _descendants(p_el, "pic"):
        img = _first_descendant(pic, "img")
        if img is None:
            continue
        ref = img.get("binaryItemIDRef")
        if not ref:
            continue
        rel = mapping.get(ref, f"BinData/{ref}")
        # Emit the actual image reference rather than an empty string.
        out.append(f"![image]({rel})")
    return out
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
# ──────────────────────────────────────────────────────────────────
|
|
191
|
+
# Paragraph element → markdown (재귀 진입점)
|
|
192
|
+
# ──────────────────────────────────────────────────────────────────
|
|
193
|
+
def _p_element_to_md(p_el, doc, notes_out: list | None = None) -> str:
    """Convert one paragraph element to inline Markdown.

    Only the paragraph's direct ``run`` children are walked. Text nodes
    are rendered with their run's character style. A ``fieldBegin`` of
    type ``HYPERLINK`` opens a link whose target is the field's ``name``
    attribute, and ``fieldEnd`` closes it as ``[text](url)``. Footnotes and
    endnotes are emitted as ``[^fn<instId>]`` / ``[^en<instId>]`` markers.

    Args:
        p_el: Paragraph element.
        doc: Document providing ``_root.char_properties``.
        notes_out: When given, receives a ``(kind, inst_id, body_markdown)``
            tuple for every note encountered. Note bodies are rendered
            without collecting nested notes.
    """
    chars = doc._root.char_properties
    base_cp = chars.get("0")

    pieces: list[str] = []
    plain_runs: list[tuple] = []
    link_runs: list[tuple] = []
    link_url: str | None = None

    def emit_plain() -> None:
        # Render and reset the text gathered outside any hyperlink.
        if plain_runs:
            pieces.append(_render_runs(plain_runs, base_cp, chars))
            plain_runs.clear()

    def emit_link() -> None:
        # Close the currently open hyperlink, if any.
        nonlocal link_url
        if link_url is None:
            return
        label = _render_runs(link_runs, base_cp, chars)
        if label:
            pieces.append(f"[{label}]({link_url})" if link_url else label)
        link_url = None
        link_runs.clear()

    for run in _direct_children(p_el, "run"):
        cpr = run.get("charPrIDRef", "0")
        for child in run:
            tag = _local_name(child)

            if tag == "t":
                if child.text:
                    bucket = plain_runs if link_url is None else link_runs
                    bucket.append((cpr, child.text))
                continue

            if tag == "ctrl":
                for ctrl_child in child:
                    ctrl_tag = _local_name(ctrl_child)
                    if ctrl_tag == "fieldBegin" and ctrl_child.get("type") == "HYPERLINK":
                        emit_plain()
                        link_url = ctrl_child.get("name", "")
                    elif ctrl_tag == "fieldEnd":
                        emit_link()
                continue

            if tag not in ("footNote", "endNote"):
                continue

            inst_id = child.get("instId", "")
            kind = "fn" if tag == "footNote" else "en"
            # Flush pending text first so the marker lands at its exact position.
            if link_url is None:
                emit_plain()
            else:
                emit_link()
            pieces.append(f"[^{kind}{inst_id}]")
            if notes_out is None:
                continue
            body_parts = []
            for note_p in _descendants(child, "p"):
                note_md = _p_element_to_md(note_p, doc, None).strip()
                if note_md:
                    body_parts.append(note_md)
            notes_out.append((kind, inst_id, " ".join(body_parts)))

    emit_plain()
    emit_link()
    return "".join(pieces)
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
# ──────────────────────────────────────────────────────────────────
|
|
262
|
+
# 도형 / 셀 / 표
|
|
263
|
+
# ──────────────────────────────────────────────────────────────────
|
|
264
|
+
def _shape_text_lines(scope_el, doc, notes_out: list | None = None) -> list[str]:
    """Collect the Markdown of every paragraph nested inside shapes below *scope_el*.

    Shapes are the tags listed in ``SHAPE_TAGS``. Each paragraph is
    rendered at most once even when it is reachable through several nested
    shapes; paragraphs that render to nothing are dropped.

    Args:
        scope_el: Element whose descendant shapes are scanned.
        doc: Document passed through to ``_p_element_to_md``.
        notes_out: Optional note collector passed through unchanged.
    """
    lines: list[str] = []
    # Map id -> element rather than storing bare ids: holding a reference
    # keeps each seen object alive, so its id() cannot be handed to a
    # different element object later in the scan.
    seen: dict[int, object] = {}
    for tag in SHAPE_TAGS:
        for shape in _descendants(scope_el, tag):
            for sub_p in _descendants(shape, "p"):
                pid = id(sub_p)
                if pid in seen:
                    continue
                seen[pid] = sub_p
                md = _p_element_to_md(sub_p, doc, notes_out).strip()
                if md:
                    lines.append(md)
    return lines
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
def _cell_to_md(cell, doc, mapping, depth: int = 0, notes_out: list | None = None) -> str:
    """Render the content of one table cell as a single ``<br>``-joined string.

    For every paragraph of the cell the output contains, in this order:
    the paragraph's inline Markdown, the text of shapes inside it, its
    image lines, and any nested tables (rendered one level deeper). Empty
    pieces are left out.
    """
    parts: list[str] = []
    for paragraph in cell.paragraphs:
        text_md = _p_element_to_md(paragraph.element, doc, notes_out).strip()
        image_lines = _paragraph_images(paragraph.element, mapping)
        shape_lines = _shape_text_lines(paragraph.element, doc, notes_out)
        if text_md:
            parts.append(text_md)
        parts += shape_lines
        parts += image_lines
        parts += [
            _table_to_md(nested, doc, mapping, depth + 1, notes_out)
            for nested in paragraph.tables
        ]
    return "<br>".join(filter(None, parts)).strip()
|
|
293
|
+
|
|
294
|
+
|
|
295
|
+
def _table_to_md(tbl, doc, mapping, depth: int = 0, notes_out: list | None = None) -> str:
    """Render a table as HTML or as a GFM pipe table.

    HTML is used when the table contains merged cells or is nested
    (``depth > 0``); spans are derived from the cell map and the first row
    is emitted as ``<th>``. Otherwise a GFM table is produced with the
    first row as header.

    Args:
        tbl: Table wrapper exposing ``get_cell_map()``, ``row_count`` and
            ``column_count``.
        doc: Document passed through to the cell renderer.
        mapping: Image mapping passed through to the cell renderer.
        depth: Nesting level; ``0`` for a top-level table.
        notes_out: Optional note collector passed through unchanged.
    """
    grid = tbl.get_cell_map()
    rows, cols = tbl.row_count, tbl.column_count
    has_merge = any(not pos.is_anchor for row in grid for pos in row)

    if has_merge or depth > 0:
        # Merged cells or a nested table: pipe tables cannot express these.
        out = ["<table>"]
        for r in range(rows):
            out.append("<tr>")
            for c in range(cols):
                pos = grid[r][c]
                if not pos.is_anchor:
                    continue
                # Extend right/down over the non-anchor slots that belong
                # to the same cell to obtain the span sizes.
                col_end = c
                while (
                    col_end + 1 < cols
                    and not grid[r][col_end + 1].is_anchor
                    and grid[r][col_end + 1].cell is pos.cell
                ):
                    col_end += 1
                row_end = r
                while (
                    row_end + 1 < rows
                    and not grid[row_end + 1][c].is_anchor
                    and grid[row_end + 1][c].cell is pos.cell
                ):
                    row_end += 1
                colspan = col_end - c + 1
                rowspan = row_end - r + 1
                attrs = []
                if colspan > 1:
                    attrs.append(f'colspan="{colspan}"')
                if rowspan > 1:
                    attrs.append(f'rowspan="{rowspan}"')
                attr_s = (" " + " ".join(attrs)) if attrs else ""
                content = _cell_to_md(pos.cell, doc, mapping, depth + 1, notes_out)
                tag = "th" if r == 0 else "td"
                out.append(f"<{tag}{attr_s}>{content}</{tag}>")
            out.append("</tr>")
        out.append("</table>")
        return "\n".join(out)

    # Simple table: GFM.
    lines = []
    for r in range(rows):
        cells = [
            # A pipe-table row must stay on one line; a cell that holds a
            # nested (HTML) table contains newlines, so flatten them.
            _cell_to_md(grid[r][c].cell, doc, mapping, depth + 1, notes_out).replace("\n", " ")
            for c in range(cols)
        ]
        lines.append("| " + " | ".join(cells) + " |")
        if r == 0:
            lines.append("| " + " | ".join(["---"] * cols) + " |")
    return "\n".join(lines)
|
|
349
|
+
|
|
350
|
+
|
|
351
|
+
# ──────────────────────────────────────────────────────────────────
|
|
352
|
+
# 헤딩 감지
|
|
353
|
+
# ──────────────────────────────────────────────────────────────────
|
|
354
|
+
def _detect_heading(text: str) -> str | None:
    """Promote a rendered paragraph to a Markdown heading when it looks like one.

    The inline markup added by the exporter (``~~``, ``**``, ``*`` and HTML
    tags) is removed and backslash escapes are resolved to obtain the
    plain text. A match of ``ROMAN_HEAD`` yields ``"# <plain>"``; a match
    of ``ARABIC_HEAD`` shorter than 40 characters yields ``"## <plain>"``.

    Returns:
        The heading line, or ``None`` when the text is not a heading.
    """

    def _plain_piece(match: "re.Match[str]") -> str:
        # Group 1 is the character behind a backslash escape: keep it.
        # Every other alternative is markup and is dropped.
        escaped_char = match.group(1)
        return escaped_char if escaped_char is not None else ""

    # Escapes are resolved in the same pass as marker removal so that an
    # escaped "\*" becomes "*" instead of leaving a dangling backslash.
    plain = re.sub(r"\\(.)|~~|\*\*|<[^>]+>|\*", _plain_piece, text.strip())
    if ROMAN_HEAD.match(plain):
        return f"# {plain}"
    if ARABIC_HEAD.match(plain) and len(plain) < 40:
        return f"## {plain}"
    return None
|
|
362
|
+
|
|
363
|
+
|
|
364
|
+
# ──────────────────────────────────────────────────────────────────
|
|
365
|
+
# Public API
|
|
366
|
+
# ──────────────────────────────────────────────────────────────────
|
|
367
|
+
def export_markdown(
    source: Union[HwpxDocument, str, Path, bytes],
    *,
    image_dir: Union[str, Path, None] = None,
    image_ref_prefix: str | None = None,
    detect_headings: bool = True,
    notes_section_separator: str = "\n\n---\n",
) -> str:
    """Convert an HWPX document to rich Markdown.

    Parameters
    ----------
    source : HwpxDocument | path | bytes
        An ``HwpxDocument`` instance, or a file path / raw bytes that are
        opened with ``HwpxDocument.open``.
    image_dir : path | None
        Directory into which ``BinData/*`` entries are extracted. When it
        is ``None`` (or empty) no image lines are generated.
    image_ref_prefix : str | None
        Prefix of the image paths written into the Markdown. ``None``
        means the base name of ``image_dir``.
    detect_headings : bool
        Whether paragraphs matching the heading patterns are promoted to
        ``#`` / ``##`` headings.
    notes_section_separator : str
        Text inserted before the appended footnote/endnote definitions.

    Returns
    -------
    str
        Paragraph blocks joined by blank lines, followed by the note
        definitions when the document contains notes.
    """
    if isinstance(source, HwpxDocument):
        doc = source
    elif isinstance(source, (bytes, bytearray)):
        import io

        doc = HwpxDocument.open(io.BytesIO(source))
    else:
        doc = HwpxDocument.open(str(source))

    mapping = _build_image_map(doc, Path(image_dir) if image_dir else None, image_ref_prefix)
    notes: list[tuple] = []
    lines: list[str] = []

    for section in doc.sections:
        for p in section.paragraphs:
            md = _p_element_to_md(p.element, doc, notes).strip()
            imgs = _paragraph_images(p.element, mapping)
            tables = [_table_to_md(t, doc, mapping, 0, notes) for t in p.tables]

            # Duplicate guard 1: when the paragraph text also appears inside
            # the table cells, the table is the canonical rendering.
            if md and p.tables:
                plain = (p.text or "").strip()
                all_cell_text = "".join(
                    (cell.text or "")
                    for tbl in p.tables
                    for row in tbl.rows
                    for cell in row.cells
                )
                if plain and plain in all_cell_text:
                    md = ""

            # Duplicate guard 2: with shapes present, the paragraph text is
            # treated as overflow of the shape text.
            if md and any(_has_descendant(p.element, tag) for tag in SHAPE_TAGS):
                md = ""

            # Paragraphs inside shapes (shapes inside tables are already
            # handled by the cell renderer). Map id -> element rather than
            # storing bare ids: holding a reference keeps each seen object
            # alive, so its id() stays unique for the whole scan.
            shape_lines: list[str] = []
            seen_p: dict[int, object] = {}
            for sub in p.tables:
                for nested_p in _descendants(sub.element, "p"):
                    seen_p[id(nested_p)] = nested_p
            for tag in SHAPE_TAGS:
                for shape in _descendants(p.element, tag):
                    for sub_p in _descendants(shape, "p"):
                        pid = id(sub_p)
                        if pid in seen_p:
                            continue
                        seen_p[pid] = sub_p
                        sub_md = _p_element_to_md(sub_p, doc, notes).strip()
                        if sub_md:
                            shape_lines.append(sub_md)

            # Heading detection (including text held in a 1x1 table cell).
            promoted = None
            if detect_headings:
                if md:
                    promoted = _detect_heading(md)
                elif p.tables and len(p.tables) == 1:
                    t = p.tables[0]
                    if t.row_count == 1 and t.column_count == 1:
                        cell_text = _cell_to_md(
                            t.rows[0].cells[0], doc, mapping, 0, notes
                        )
                        promoted = _detect_heading(cell_text)
                        if promoted:
                            # The heading replaces the 1x1 table entirely.
                            lines.append(promoted)
                            continue

            if promoted:
                lines.append(promoted)
            elif md:
                lines.append(md)
            lines.extend(shape_lines)
            lines.extend(imgs)
            lines.extend(tables)

    body = "\n\n".join(lines)
    if notes:
        body = _append_note_definitions(body, notes, notes_section_separator)
    return body


def _append_note_definitions(body: str, notes: list, separator: str) -> str:
    """Renumber the note markers in *body* and append the note definitions.

    Each distinct instId gets a running number per kind (``fn1``, ``fn2``,
    ... / ``en1``, ...) in order of first appearance in *notes*. Only the
    first collected body of a repeated ``(kind, instId)`` is written.
    """
    seq_map: dict[str, dict[str, int]] = {"fn": {}, "en": {}}
    for kind, inst_id, _ in notes:
        if inst_id not in seq_map[kind]:
            seq_map[kind][inst_id] = len(seq_map[kind]) + 1

    def _renumber(match: "re.Match[str]") -> str:
        kind, inst_id = match.group(1), match.group(2)
        seq = seq_map[kind].get(inst_id)
        return match.group(0) if seq is None else f"[^{kind}{seq}]"

    # Substitute every marker in one pass. Chained str.replace calls would
    # rewrite a marker twice whenever an instId equals a sequence number
    # that was already assigned to another note.
    body = re.sub(r"\[\^(fn|en)([^\[\]]*)\]", _renumber, body)

    body += separator
    seen = set()
    for kind, inst_id, text in notes:
        key = (kind, inst_id)
        if key in seen:
            continue
        seen.add(key)
        seq = seq_map[kind][inst_id]
        body += f"\n[^{kind}{seq}]: {text}\n"
    return body
|
hwpx/tools/table_navigation.py
CHANGED
|
@@ -41,10 +41,14 @@ class TableMapEntry(TypedDict):
|
|
|
41
41
|
|
|
42
42
|
table_index: int
|
|
43
43
|
paragraph_index: int
|
|
44
|
+
location: dict[str, object]
|
|
44
45
|
rows: int
|
|
45
46
|
cols: int
|
|
47
|
+
caption_text: str
|
|
48
|
+
preceding_paragraph_text: str
|
|
46
49
|
header_text: str
|
|
47
50
|
first_row_preview: list[str]
|
|
51
|
+
cells: list[dict[str, object]]
|
|
48
52
|
is_empty: bool
|
|
49
53
|
|
|
50
54
|
|
|
@@ -107,6 +111,8 @@ class TableFillResult(TypedDict):
|
|
|
107
111
|
class _AnchoredTable:
|
|
108
112
|
table: HwpxOxmlTable
|
|
109
113
|
paragraph_index: int
|
|
114
|
+
caption_text: str
|
|
115
|
+
preceding_paragraph_text: str
|
|
110
116
|
header_text: str
|
|
111
117
|
|
|
112
118
|
|
|
@@ -115,6 +121,8 @@ class _IndexedTable:
|
|
|
115
121
|
table_index: int
|
|
116
122
|
table: HwpxOxmlTable
|
|
117
123
|
paragraph_index: int
|
|
124
|
+
caption_text: str
|
|
125
|
+
preceding_paragraph_text: str
|
|
118
126
|
header_text: str
|
|
119
127
|
|
|
120
128
|
|
|
@@ -193,6 +201,8 @@ def _collect_tables_from_paragraph(
|
|
|
193
201
|
_AnchoredTable(
|
|
194
202
|
table=table,
|
|
195
203
|
paragraph_index=anchor_paragraph_index,
|
|
204
|
+
caption_text=paragraph_prefix_text,
|
|
205
|
+
preceding_paragraph_text=last_header_text,
|
|
196
206
|
header_text=header_text,
|
|
197
207
|
)
|
|
198
208
|
)
|
|
@@ -227,6 +237,8 @@ def _collect_document_tables(document: HwpxDocument) -> list[_IndexedTable]:
|
|
|
227
237
|
table_index=table_index,
|
|
228
238
|
table=item.table,
|
|
229
239
|
paragraph_index=item.paragraph_index,
|
|
240
|
+
caption_text=item.caption_text,
|
|
241
|
+
preceding_paragraph_text=item.preceding_paragraph_text,
|
|
230
242
|
header_text=item.header_text,
|
|
231
243
|
)
|
|
232
244
|
for table_index, item in enumerate(anchored_tables)
|
|
@@ -234,7 +246,11 @@ def _collect_document_tables(document: HwpxDocument) -> list[_IndexedTable]:
|
|
|
234
246
|
|
|
235
247
|
|
|
236
248
|
def _cell_text(table: HwpxOxmlTable, row_index: int, col_index: int) -> str:
|
|
237
|
-
|
|
249
|
+
cell = table.cell(row_index, col_index)
|
|
250
|
+
paragraphs = list(getattr(cell, "paragraphs", []) or [])
|
|
251
|
+
if paragraphs:
|
|
252
|
+
return "\n".join(paragraph.text or "" for paragraph in paragraphs)
|
|
253
|
+
return cell.text
|
|
238
254
|
|
|
239
255
|
|
|
240
256
|
def _table_is_empty(table: HwpxOxmlTable) -> bool:
|
|
@@ -251,6 +267,62 @@ def _first_row_preview(table: HwpxOxmlTable) -> list[str]:
|
|
|
251
267
|
return [_cell_text(table, 0, col_index) for col_index in range(table.column_count)]
|
|
252
268
|
|
|
253
269
|
|
|
270
|
+
def _body_paragraph_location(paragraph_index: int) -> dict[str, object]:
|
|
271
|
+
return {"kind": "body_paragraph", "paragraph_index": paragraph_index}
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
def _table_cell_paragraph_location(
|
|
275
|
+
table_index: int,
|
|
276
|
+
row_index: int,
|
|
277
|
+
col_index: int,
|
|
278
|
+
cell_paragraph_index: int,
|
|
279
|
+
) -> dict[str, object]:
|
|
280
|
+
return {
|
|
281
|
+
"kind": "table_cell_paragraph",
|
|
282
|
+
"table_index": table_index,
|
|
283
|
+
"row": row_index,
|
|
284
|
+
"col": col_index,
|
|
285
|
+
"cell_paragraph_index": cell_paragraph_index,
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def _table_cells(table_ref: _IndexedTable) -> list[dict[str, object]]:
    """Describe every cell of an indexed table as a plain dict.

    Cells are visited row by row, then column by column. Each entry holds the
    cell's ``row``/``col``, its text as produced by ``_cell_text``, one payload
    per cell paragraph (index, text, and a ``table_cell_paragraph`` location),
    and a ``table_cell`` location for the cell itself.
    """
    table = table_ref.table
    owner_index = table_ref.table_index
    described: list[dict[str, object]] = []
    for row_index in range(table.row_count):
        for col_index in range(table.column_count):
            cell = table.cell(row_index, col_index)
            # A missing or falsy ``paragraphs`` attribute yields no payloads.
            cell_paragraphs = list(getattr(cell, "paragraphs", []) or [])
            paragraph_payloads: list[dict[str, object]] = [
                {
                    "cell_paragraph_index": position,
                    "text": paragraph.text or "",
                    "location": _table_cell_paragraph_location(
                        owner_index,
                        row_index,
                        col_index,
                        position,
                    ),
                }
                for position, paragraph in enumerate(cell_paragraphs)
            ]
            cell_location: dict[str, object] = {
                "kind": "table_cell",
                "table_index": owner_index,
                "row": row_index,
                "col": col_index,
            }
            described.append(
                {
                    "row": row_index,
                    "col": col_index,
                    "text": _cell_text(table, row_index, col_index),
                    "paragraphs": paragraph_payloads,
                    "location": cell_location,
                }
            )
    return described
|
|
324
|
+
|
|
325
|
+
|
|
254
326
|
def _direction_delta(direction: PathDirection) -> tuple[int, int]:
|
|
255
327
|
if direction == "right":
|
|
256
328
|
return (0, 1)
|
|
@@ -337,10 +409,14 @@ def get_table_map(document: HwpxDocument) -> TableMapResult:
|
|
|
337
409
|
{
|
|
338
410
|
"table_index": table_ref.table_index,
|
|
339
411
|
"paragraph_index": table_ref.paragraph_index,
|
|
412
|
+
"location": _body_paragraph_location(table_ref.paragraph_index),
|
|
340
413
|
"rows": table_ref.table.row_count,
|
|
341
414
|
"cols": table_ref.table.column_count,
|
|
415
|
+
"caption_text": table_ref.caption_text,
|
|
416
|
+
"preceding_paragraph_text": table_ref.preceding_paragraph_text,
|
|
342
417
|
"header_text": table_ref.header_text,
|
|
343
418
|
"first_row_preview": _first_row_preview(table_ref.table),
|
|
419
|
+
"cells": _table_cells(table_ref),
|
|
344
420
|
"is_empty": _table_is_empty(table_ref.table),
|
|
345
421
|
}
|
|
346
422
|
)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: python-hwpx
|
|
3
|
-
Version: 2.10.1
|
|
3
|
+
Version: 2.10.2
|
|
4
4
|
Summary: 한글 없이 HWPX 문서를 열고, 편집하고, 생성하고, 검증하는 Python 자동화 라이브러리
|
|
5
5
|
Author: python-hwpx Maintainers
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -115,6 +115,47 @@ hwpx-validate-package 보고서.hwpx
|
|
|
115
115
|
hwpx-analyze-template 보고서.hwpx
|
|
116
116
|
```
|
|
117
117
|
|
|
118
|
+
### 4. 풍부한 Markdown 변환 (서식·표·각주·이미지 보존)
|
|
119
|
+
|
|
120
|
+
`export_markdown()`는 단순 평문 추출이고, `export_rich_markdown()`는 인라인 서식(`**굵게**`, `*기울임*`, `~~취소선~~`),
|
|
121
|
+
표(중첩 포함, colspan/rowspan 안전), 도형 텍스트, 이미지, 각주/미주, 하이퍼링크, 제목(`#`/`##`) 자동 감지까지 보존한다.
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from hwpx import HwpxDocument
|
|
125
|
+
|
|
126
|
+
doc = HwpxDocument.open("보고서.hwpx")
|
|
127
|
+
|
|
128
|
+
md = doc.export_rich_markdown(
|
|
129
|
+
image_dir="out/images", # BinData 이미지를 디스크에 추출
|
|
130
|
+
image_ref_prefix="images/", # 마크다운 내 이미지 경로 접두
|
|
131
|
+
detect_headings=True, # Ⅰ./1. 패턴 기반 #/## 자동
|
|
132
|
+
)
|
|
133
|
+
print(md)
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
문자열·경로·바이트도 그대로 받는다:
|
|
137
|
+
|
|
138
|
+
```python
|
|
139
|
+
from hwpx.tools.markdown_export import export_markdown
|
|
140
|
+
|
|
141
|
+
md = export_markdown("보고서.hwpx") # 경로
|
|
142
|
+
md = export_markdown(open("a.hwpx", "rb").read()) # bytes
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### 5. 각주 본문에 혼합 서식 / 하이퍼링크 추가
|
|
146
|
+
|
|
147
|
+
`HwpxOxmlNote`에 `body_paragraph`, `add_run`, `add_hyperlink` helper가 있어 각주 본문을
|
|
148
|
+
직접 paragraph로 다루지 않고도 인라인 서식·링크를 손쉽게 채울 수 있다.
|
|
149
|
+
|
|
150
|
+
```python
|
|
151
|
+
para = section.paragraphs[0]
|
|
152
|
+
note = para.add_footnote("") # 빈 각주 생성 후 본문 구성
|
|
153
|
+
note.add_run("자세한 내용은 ", )
|
|
154
|
+
note.add_run("정부 공식 사이트", bold=True)
|
|
155
|
+
note.add_run("를 참고하라: ")
|
|
156
|
+
note.add_hyperlink("https://www.kasa.go.kr", "우주항공청")
|
|
157
|
+
```
|
|
158
|
+
|
|
118
159
|
처음에는 `open/new -> edit/extract -> save_to_path` 흐름만 잡으면 된다. 패키지 구조, XML 파트, 템플릿 회귀 점검은 필요할 때만 확장하면 된다.
|
|
119
160
|
|
|
120
161
|
## 어디부터 읽으면 되나
|
|
@@ -244,6 +285,7 @@ doc.set_footer_text("1 / 10", page_type="BOTH")
|
|
|
244
285
|
# 표 셀 병합·분할
|
|
245
286
|
table.merge_cells(0, 0, 1, 1) # (0,0)~(1,1) 병합
|
|
246
287
|
table.set_cell_text(0, 0, "병합된 셀", logical=True, split_merged=True)
|
|
288
|
+
table.set_cell_text(0, 0, "line 1\nline 2", split_paragraphs=True)
|
|
247
289
|
|
|
248
290
|
# 양식형 표 자동 채우기
|
|
249
291
|
form = doc.add_table(2, 2)
|
|
@@ -257,6 +299,12 @@ doc.fill_by_path({
|
|
|
257
299
|
})
|
|
258
300
|
```
|
|
259
301
|
|
|
302
|
+
`doc.paragraphs`의 인덱스는 본문 직속 문단 0-based 기준입니다. 표 안 문단은
|
|
303
|
+
본문 `paragraph_index`에 섞지 않고 `get_table_map()`의 cell `location`
|
|
304
|
+
(`table_index`, `row`, `col`, `cell_paragraph_index`)으로 다룹니다.
|
|
305
|
+
`get_table_map()`은 `caption_text`와 `preceding_paragraph_text`를 분리해
|
|
306
|
+
반환하고, 셀 미리보기의 여러 문단은 `\n`으로 유지합니다.
|
|
307
|
+
|
|
260
308
|
### 🔍 텍스트 추출 & 검색
|
|
261
309
|
|
|
262
310
|
```python
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
hwpx/__init__.py,sha256=ouwTSF8JrUPVgwWxB1hudQwVdhAA981uHeX_wXxxQHo,2205
|
|
2
2
|
hwpx/authoring.py,sha256=caZfPFe99ilaJMDJEDRsWKCb-QKAp18M0vRlPdM0PR0,96068
|
|
3
|
-
hwpx/document.py,sha256=
|
|
3
|
+
hwpx/document.py,sha256=1kb0n6C5cEiex7Bs58MlLhFXI8mknQFqErTkCYaFuQE,55204
|
|
4
4
|
hwpx/form_fill.py,sha256=VUIU53Qa9Ho2aP72biDvJwnDW7ngdAzu3PSd5A7d1JM,9908
|
|
5
5
|
hwpx/package.py,sha256=0rKjGCJbPQvrVBIy07Jpjsu3fI7HhbqFCGWTiTDsJpo,1141
|
|
6
6
|
hwpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -16,7 +16,7 @@ hwpx/opc/xml_utils.py,sha256=O_eZtp1-8vWimoi9Xdy0uzmtk8bnkfjf-QLjD_uWSFQ,3483
|
|
|
16
16
|
hwpx/oxml/__init__.py,sha256=tUoiHQw3oJpHvSES6f5AuhpfXvlby0Df-3L0t-CMhxM,5000
|
|
17
17
|
hwpx/oxml/body.py,sha256=VOwlyDRgoLMlDePFbCjU3qlBAefO9DIoSGsEI2Fr8DA,24888
|
|
18
18
|
hwpx/oxml/common.py,sha256=TJkafzg7x4T3J29tZchRZk57ZTsrM9PEiqGT3rX3w5o,1044
|
|
19
|
-
hwpx/oxml/document.py,sha256=
|
|
19
|
+
hwpx/oxml/document.py,sha256=WIKmJ-nW0Wqh2vEf-06-5gx6EpQc-TvbJr-nze3VYtQ,214823
|
|
20
20
|
hwpx/oxml/header.py,sha256=_KgKsCN6UWB8r59z2iqe0rLC8EdEZyJD7GfQ0Xd2WXM,43080
|
|
21
21
|
hwpx/oxml/header_part.py,sha256=U3tXD1LWruAdQV-w9cIBv8iXPpQ1oUm0CXlxAAonZ6I,231
|
|
22
22
|
hwpx/oxml/memo.py,sha256=WSJSTYOSLKG836eF_UsrD99hMqJhWwzRZ8pJbHq-nsA,228
|
|
@@ -34,6 +34,7 @@ hwpx/tools/archive_cli.py,sha256=rlgE6KBeJORa8Z6RhGOVmOl7gGIKdgA9GY106EFouVo,122
|
|
|
34
34
|
hwpx/tools/exporter.py,sha256=hx7th-LAL1a5G0ICyVcyJPJaUY5jEgDJUZ7UYg_YAmI,6578
|
|
35
35
|
hwpx/tools/generic_inventory.py,sha256=pHVP8-htX_vO02ARdQR37XFxm7fUPK68VtMeeOJ1NZY,4835
|
|
36
36
|
hwpx/tools/id_integrity.py,sha256=_Ra981ZPX1WXH_bK-2KNhCnwPVYErfdX2wW4SosX0Ls,9256
|
|
37
|
+
hwpx/tools/markdown_export.py,sha256=FejutCpQHbycO185uljcSwfZuwXMTbGEgXtf5e-a4_k,19139
|
|
37
38
|
hwpx/tools/object_finder.py,sha256=7i6XI1-r7-ar_IzSZQ82hfOcxVzJFK2XjMDB8oxcmMA,13478
|
|
38
39
|
hwpx/tools/package_validator.py,sha256=87uv7uVh6wqqY8-woX9kAGnwuWK3uYL4BHfGf7NNgcs,14521
|
|
39
40
|
hwpx/tools/page_guard.py,sha256=nDAVPcvrnuyDxVTA_j22wiYD7CXAD6XlzsMzaz3h_q8,9701
|
|
@@ -43,17 +44,17 @@ hwpx/tools/report_parser.py,sha256=3Daqn2hqIcj5pG1qUxeYbvWr7CvdhwzatWvxCCcnSZg,4
|
|
|
43
44
|
hwpx/tools/report_utils.py,sha256=6HYEeQc3ZxTpxbwF11s47uZ-KmV4tsHPE1MV4491KDE,4434
|
|
44
45
|
hwpx/tools/roundtrip_diff.py,sha256=ao0AdpDJkq89u5hwcrsxTijvSsia9Jaw1OOnh4WAco4,1365
|
|
45
46
|
hwpx/tools/table_cleanup.py,sha256=0_f6NnvNp3QD4owKd_bRX6FZbeUmoQC7a4_VGzF2SCE,1796
|
|
46
|
-
hwpx/tools/table_navigation.py,sha256=
|
|
47
|
+
hwpx/tools/table_navigation.py,sha256=rtbrWFKpJhqC3LD0ZXImyHgjmDR2hjHCFy3_S-qNBwA,16479
|
|
47
48
|
hwpx/tools/template_analyzer.py,sha256=qZMIyB-r4YXZqU54v6uwt_CQiOAQR0mVgmo_Bt4biWM,8497
|
|
48
49
|
hwpx/tools/text_extract_cli.py,sha256=BmsDAwNXpDPhEayb9ez2ORtGNzPd_Xxduy4_cLXhnUw,2188
|
|
49
50
|
hwpx/tools/text_extractor.py,sha256=dqGzOnJVRUEfrxiTt04GkDrfY4yfZXRIhPtEwTM77Mw,25289
|
|
50
51
|
hwpx/tools/validator.py,sha256=LMo8gIMoptP9RRDbYKV4WwrM59rclC5h3HP-ZJRUxO0,6856
|
|
51
52
|
hwpx/tools/_schemas/header.xsd,sha256=mJXuFMuHGT1JnFFaluUpYUglwjMCNlfbFCRVM26eHXE,664
|
|
52
53
|
hwpx/tools/_schemas/section.xsd,sha256=MgvavVHG05RDfUnVPxVU10H4FQOja5ON04_m9Uk_m7E,522
|
|
53
|
-
python_hwpx-2.10.
|
|
54
|
-
python_hwpx-2.10.
|
|
55
|
-
python_hwpx-2.10.
|
|
56
|
-
python_hwpx-2.10.
|
|
57
|
-
python_hwpx-2.10.
|
|
58
|
-
python_hwpx-2.10.
|
|
59
|
-
python_hwpx-2.10.
|
|
54
|
+
python_hwpx-2.10.2.dist-info/licenses/LICENSE,sha256=_ubz4wv-BkkT3l3gu-QuH7JGeVjuRYGZoZK95eNsCHU,9688
|
|
55
|
+
python_hwpx-2.10.2.dist-info/licenses/NOTICE,sha256=k48h6EaGQE8Y1c0dS9sIOOcz4YqkbcImWClF7pBOgsg,2473
|
|
56
|
+
python_hwpx-2.10.2.dist-info/METADATA,sha256=S3vl8kgL0d7BcCafoPk8AuV7otQmjutlivMFAvUNROA,18099
|
|
57
|
+
python_hwpx-2.10.2.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
58
|
+
python_hwpx-2.10.2.dist-info/entry_points.txt,sha256=JUKRxbly9UaeHV7YzOea23y8IiqSTcrhUlooP3fS_Zc,405
|
|
59
|
+
python_hwpx-2.10.2.dist-info/top_level.txt,sha256=R1iToqDh80Nf2oQhRjTN0rbN2X6kyDUizIocZjkhuxc,5
|
|
60
|
+
python_hwpx-2.10.2.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|