python-hwpx 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hwpx/__init__.py +23 -0
- hwpx/document.py +518 -0
- hwpx/opc/package.py +274 -0
- hwpx/oxml/__init__.py +138 -0
- hwpx/oxml/body.py +151 -0
- hwpx/oxml/common.py +31 -0
- hwpx/oxml/document.py +1932 -0
- hwpx/oxml/header.py +543 -0
- hwpx/oxml/parser.py +62 -0
- hwpx/oxml/schema.py +41 -0
- hwpx/oxml/utils.py +82 -0
- hwpx/package.py +202 -0
- hwpx/tools/__init__.py +36 -0
- hwpx/tools/_schemas/header.xsd +14 -0
- hwpx/tools/_schemas/section.xsd +12 -0
- hwpx/tools/object_finder.py +347 -0
- hwpx/tools/text_extractor.py +726 -0
- hwpx/tools/validator.py +184 -0
- python_hwpx-1.0.dist-info/LICENSE +32 -0
- python_hwpx-1.0.dist-info/METADATA +199 -0
- python_hwpx-1.0.dist-info/RECORD +24 -0
- python_hwpx-1.0.dist-info/WHEEL +5 -0
- python_hwpx-1.0.dist-info/entry_points.txt +2 -0
- python_hwpx-1.0.dist-info/top_level.txt +1 -0
hwpx/opc/package.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
1
|
+
"""Utilities for reading and writing HWPX OPC packages."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from io import BytesIO
|
|
7
|
+
from typing import BinaryIO, Iterable, Iterator, Mapping, MutableMapping
|
|
8
|
+
from xml.etree import ElementTree as ET
|
|
9
|
+
from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile, ZipInfo
|
|
10
|
+
|
|
11
|
+
__all__ = ["HwpxPackage", "HwpxPackageError", "HwpxStructureError", "RootFile", "VersionInfo"]
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class HwpxPackageError(Exception):
    """Root of the exception hierarchy used by :class:`HwpxPackage`."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class HwpxStructureError(HwpxPackageError):
    """Signals that an HWPX package does not have the structure it is required to have."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(frozen=True)
class RootFile:
    """A single ``rootfile`` declaration read from ``META-INF/container.xml``."""

    # Package-relative path of the declared root document.
    full_path: str
    # Declared media type, or ``None`` when the entry carries none.
    media_type: str | None = None

    def ensure_exists(self, files: Mapping[str, bytes]) -> None:
        """Check that ``full_path`` is one of the keys of ``files``.

        Raises:
            HwpxStructureError: if ``files`` has no entry for ``full_path``.
        """

        if self.full_path in files:
            return
        message = f"Root content '{self.full_path}' declared in container.xml is missing."
        raise HwpxStructureError(message)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class VersionInfo:
    """Model for the ``version.xml`` document.

    Wraps the parsed root element together with the namespace prefixes and
    the XML declaration found in the original bytes, so that the document
    can be written back with the same prefixes and declaration.  A ``dirty``
    flag records whether :meth:`set` has been called since the last
    :meth:`mark_clean`.
    """

    def __init__(
        self,
        element: ET.Element,
        namespaces: Mapping[str, str],
        xml_declaration: bytes | None,
    ) -> None:
        """Store the parsed root *element* and its serialisation metadata.

        Args:
            element: Root element of ``version.xml``.
            namespaces: Prefix -> URI mapping (``""`` is the default namespace).
            xml_declaration: The original ``<?xml ...?>`` bytes, or ``None``.
        """
        self._element = element
        # Copy so later changes to the caller's mapping do not leak in.
        self._namespaces = dict(namespaces)
        self._xml_declaration = xml_declaration
        self._dirty = False

    @classmethod
    def from_bytes(cls, data: bytes) -> VersionInfo:
        """Build a :class:`VersionInfo` from the raw bytes of ``version.xml``."""
        element = ET.fromstring(data)
        namespaces = cls._collect_namespaces(data)
        declaration = cls._extract_declaration(data)
        return cls(element, namespaces, declaration)

    @staticmethod
    def _collect_namespaces(data: bytes) -> Mapping[str, str]:
        """Return every prefix -> URI declaration found in *data*.

        The default namespace (reported with an empty/``None`` prefix) is
        stored under the key ``""``.  A prefix declared more than once keeps
        its last URI.
        """
        namespaces: dict[str, str] = {}
        for _event, (prefix, uri) in ET.iterparse(BytesIO(data), events=("start-ns",)):
            namespaces[prefix or ""] = uri
        return namespaces

    @staticmethod
    def _extract_declaration(data: bytes) -> bytes | None:
        """Return the leading ``<?xml ...?>`` declaration of *data*, if any.

        Leading whitespace is ignored.  ``None`` is returned when the data
        does not start with ``<?xml`` or the declaration is never closed.
        """
        data = data.lstrip()
        if not data.startswith(b"<?xml"):
            return None
        end = data.find(b"?>")
        if end == -1:
            return None
        # +2 keeps the closing "?>" itself.
        return data[: end + 2]

    @property
    def attributes(self) -> Mapping[str, str]:
        """A copy of the root element's attributes."""
        return dict(self._element.attrib)

    def get(self, key: str, default: str | None = None) -> str | None:
        """Return root attribute *key*, or *default* when it is absent."""
        return self._element.attrib.get(key, default)

    def set(self, key: str, value: str) -> None:
        """Set root attribute *key* to *value* and mark the model dirty."""
        self._element.attrib[key] = value
        self._dirty = True

    @property
    def tag(self) -> str:
        """Tag of the root element, as reported by ElementTree."""
        return self._element.tag

    def to_bytes(self) -> bytes:
        """Serialise the document to UTF-8 bytes.

        The prefixes collected at parse time are registered with ElementTree
        (a process-wide registry) so they are reused in the output.  The
        original XML declaration, when there was one, is prepended verbatim.
        """
        for prefix, uri in self._namespaces.items():
            try:
                ET.register_namespace(prefix, uri)
            except ValueError:
                # ElementTree reserves prefixes of the form ``ns<digits>`` and
                # refuses to register them.  Skipping is safe: the serialiser
                # then assigns its own generated prefix to the URI instead of
                # aborting the whole write.
                continue
        stream = BytesIO()
        tree = ET.ElementTree(self._element)
        # The declaration is handled manually below to preserve the original.
        tree.write(stream, encoding="utf-8", xml_declaration=False)
        xml_body = stream.getvalue()
        if self._xml_declaration:
            return self._xml_declaration + xml_body
        return xml_body

    @property
    def dirty(self) -> bool:
        """``True`` when :meth:`set` was called since the last :meth:`mark_clean`."""
        return self._dirty

    def mark_clean(self) -> None:
        """Reset the dirty flag."""
        self._dirty = False
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
class HwpxPackage:
    """Represents an HWPX package backed by an Open Packaging Convention container.

    The package keeps every archive member in memory as ``path -> bytes``
    together with the parsed ``container.xml`` root files, the parsed
    ``version.xml`` and the ``mimetype`` string.  :meth:`write` and
    :meth:`delete` validate the resulting structure *before* committing, so
    a rejected change leaves the package exactly as it was.
    """

    CONTAINER_PATH = "META-INF/container.xml"
    VERSION_PATH = "version.xml"
    MIMETYPE_PATH = "mimetype"
    DEFAULT_MIMETYPE = "application/hwp+zip"

    def __init__(
        self,
        files: MutableMapping[str, bytes],
        rootfiles: Iterable[RootFile],
        version_info: VersionInfo,
        mimetype: str,
    ) -> None:
        """Create a package from already-parsed parts.

        Args:
            files: Archive members keyed by path; stored by reference.
            rootfiles: Root file entries declared in ``container.xml``.
            version_info: Parsed ``version.xml``.
            mimetype: Content of the ``mimetype`` member.

        Raises:
            HwpxStructureError: if the parts do not form a valid package.
        """
        self._files = files
        self._rootfiles = list(rootfiles)
        self._version = version_info
        self._mimetype = mimetype
        self._validate_structure()

    @classmethod
    def open(cls, pkg_file: str | BinaryIO) -> HwpxPackage:
        """Read every member of the zip archive *pkg_file* into a new package.

        Raises:
            HwpxStructureError: if ``mimetype``, ``container.xml`` or
                ``version.xml`` is missing, or the structure is invalid.
        """
        with ZipFile(pkg_file, "r") as zf:
            files = {info.filename: zf.read(info) for info in zf.infolist()}
        if cls.MIMETYPE_PATH not in files:
            raise HwpxStructureError("HWPX package is missing the mandatory 'mimetype' file.")
        mimetype = files[cls.MIMETYPE_PATH].decode("utf-8")
        rootfiles = cls._parse_container(files.get(cls.CONTAINER_PATH))
        version_info = cls._parse_version(files.get(cls.VERSION_PATH))
        return cls(files, rootfiles, version_info, mimetype)

    @staticmethod
    def _parse_container(data: bytes | None) -> list[RootFile]:
        """Parse ``container.xml`` bytes into a non-empty list of root files.

        Raises:
            HwpxStructureError: if *data* is ``None``, a ``rootfile`` element
                has no path attribute, or no ``rootfile`` element exists.
        """
        if data is None:
            raise HwpxStructureError(
                "HWPX package is missing 'META-INF/container.xml'."
            )
        root = ET.fromstring(data)
        rootfiles = []
        # "{*}" matches the element in any namespace (or none).
        for elem in root.findall(".//{*}rootfile"):
            # Several spellings of the attribute names are accepted; the
            # first non-empty one wins.
            full_path = (
                elem.get("full-path")
                or elem.get("fullPath")
                or elem.get("full_path")
            )
            if not full_path:
                raise HwpxStructureError("container.xml contains a rootfile without 'full-path'.")
            media_type = (
                elem.get("media-type")
                or elem.get("mediaType")
                or elem.get("media_type")
            )
            rootfiles.append(RootFile(full_path, media_type))
        if not rootfiles:
            raise HwpxStructureError("container.xml does not declare any rootfiles.")
        return rootfiles

    @staticmethod
    def _parse_version(data: bytes | None) -> VersionInfo:
        """Parse ``version.xml`` bytes.

        Raises:
            HwpxStructureError: if *data* is ``None``.
        """
        if data is None:
            raise HwpxStructureError("HWPX package is missing 'version.xml'.")
        return VersionInfo.from_bytes(data)

    @staticmethod
    def _check_structure(files: Mapping[str, bytes], rootfiles: Iterable[RootFile]) -> None:
        """Validate a (possibly prospective) package state without touching ``self``.

        Every root file must exist in *files*, and at least one member path
        must start with ``Contents/`` or ``Content/``.

        Raises:
            HwpxStructureError: if either requirement is not met.
        """
        for rootfile in rootfiles:
            rootfile.ensure_exists(files)
        if not any(path.startswith(("Contents/", "Content/")) for path in files):
            raise HwpxStructureError(
                "HWPX package does not contain a 'Contents' directory."
            )

    def _validate_structure(self) -> None:
        """Validate the current package state (see :meth:`_check_structure`)."""
        self._check_structure(self._files, self._rootfiles)

    @property
    def mimetype(self) -> str:
        """Content of the ``mimetype`` member as text."""
        return self._mimetype

    @property
    def rootfiles(self) -> tuple[RootFile, ...]:
        """Root files declared in ``container.xml``, as an immutable snapshot."""
        return tuple(self._rootfiles)

    def iter_rootfiles(self) -> Iterator[RootFile]:
        """Iterate over the declared root files in declaration order."""
        yield from self._rootfiles

    @property
    def main_content(self) -> RootFile:
        """The root file with media type ``application/hwpml-package+xml``.

        Falls back to the first declared root file when none has that type.
        """
        for rootfile in self._rootfiles:
            if rootfile.media_type == "application/hwpml-package+xml":
                return rootfile
        return self._rootfiles[0]

    @property
    def version_info(self) -> VersionInfo:
        """The parsed ``version.xml`` model."""
        return self._version

    def read(self, path: str) -> bytes:
        """Return the bytes stored under *path*.

        Raises:
            HwpxPackageError: if the package has no such member.
        """
        norm_path = self._normalize_path(path)
        try:
            return self._files[norm_path]
        except KeyError as exc:
            raise HwpxPackageError(f"File '{norm_path}' is not present in the package.") from exc

    def write(self, path: str, data: bytes | str) -> None:
        """Add or replace the member at *path*.

        ``str`` data is encoded as UTF-8.  Writing ``mimetype``,
        ``container.xml`` or ``version.xml`` also refreshes the corresponding
        parsed state.  The change is applied only if the resulting package
        passes structure validation; otherwise nothing is modified.

        Raises:
            HwpxStructureError: if the new content would make the package
                structurally invalid.
        """
        norm_path = self._normalize_path(path)
        if isinstance(data, str):
            data = data.encode("utf-8")

        # Work out the prospective state first; nothing on ``self`` changes
        # until parsing and validation have both succeeded.
        mimetype = self._mimetype
        rootfiles = self._rootfiles
        version = self._version
        if norm_path == self.MIMETYPE_PATH:
            mimetype = data.decode("utf-8")
        elif norm_path == self.CONTAINER_PATH:
            rootfiles = self._parse_container(data)
        elif norm_path == self.VERSION_PATH:
            version = self._parse_version(data)

        # Shallow copy: only the key set matters for validation.
        candidate = dict(self._files)
        candidate[norm_path] = data
        self._check_structure(candidate, rootfiles)

        self._files[norm_path] = data
        self._mimetype = mimetype
        self._rootfiles = rootfiles
        self._version = version

    def delete(self, path: str) -> None:
        """Remove the member at *path*.

        The member is removed only if the package stays structurally valid
        without it; otherwise nothing is modified.

        Raises:
            HwpxPackageError: if the package has no such member.
            HwpxStructureError: if *path* is a mandatory file or removing it
                would make the package structurally invalid.
        """
        norm_path = self._normalize_path(path)
        if norm_path not in self._files:
            raise HwpxPackageError(f"File '{norm_path}' is not present in the package.")
        if norm_path in {self.MIMETYPE_PATH, self.CONTAINER_PATH, self.VERSION_PATH}:
            raise HwpxStructureError(
                "Cannot remove mandatory files ('mimetype', 'container.xml', 'version.xml')."
            )
        # Validate the package as it would look after the removal.
        remaining = {name: blob for name, blob in self._files.items() if name != norm_path}
        self._check_structure(remaining, self._rootfiles)
        del self._files[norm_path]

    @staticmethod
    def _normalize_path(path: str) -> str:
        """Return *path* with backslashes replaced by forward slashes."""
        return path.replace("\\", "/")

    def files(self) -> list[str]:
        """Return the sorted list of member paths."""
        return sorted(self._files)

    def save(self, pkg_file: str | BinaryIO) -> None:
        """Write the package to *pkg_file* as a zip archive.

        The stored ``mimetype`` member is refreshed from :attr:`mimetype`,
        and ``version.xml`` is re-serialised when its model is dirty.  The
        ``mimetype`` member is written first and uncompressed; all other
        members follow in sorted order, deflated.

        Raises:
            HwpxStructureError: if the package is structurally invalid.
        """
        self._files[self.MIMETYPE_PATH] = self._mimetype.encode("utf-8")
        if self._version.dirty:
            self._files[self.VERSION_PATH] = self._version.to_bytes()
            self._version.mark_clean()
        self._validate_structure()
        with ZipFile(pkg_file, "w") as zf:
            self._write_mimetype(zf)
            for name in sorted(self._files):
                if name == self.MIMETYPE_PATH:
                    # Already written as the first member.
                    continue
                data = self._files[name]
                info = ZipInfo(name)
                info.compress_type = ZIP_DEFLATED
                zf.writestr(info, data)

    def _write_mimetype(self, zf: ZipFile) -> None:
        """Write the ``mimetype`` member to *zf* without compression."""
        info = ZipInfo(self.MIMETYPE_PATH)
        info.compress_type = ZIP_STORED
        zf.writestr(info, self._files[self.MIMETYPE_PATH])
|
|
274
|
+
|
hwpx/oxml/__init__.py
ADDED
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
|
|
2
|
+
"""Open XML helpers for the HWPX document format."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from .body import (
|
|
7
|
+
Paragraph,
|
|
8
|
+
Run,
|
|
9
|
+
Section,
|
|
10
|
+
TextSpan,
|
|
11
|
+
parse_paragraph_element,
|
|
12
|
+
parse_run_element,
|
|
13
|
+
parse_section_element,
|
|
14
|
+
parse_text_span,
|
|
15
|
+
)
|
|
16
|
+
from .common import GenericElement, parse_generic_element
|
|
17
|
+
|
|
18
|
+
from .document import (
|
|
19
|
+
DocumentNumbering,
|
|
20
|
+
HwpxOxmlDocument,
|
|
21
|
+
HwpxOxmlHeader,
|
|
22
|
+
HwpxOxmlInlineObject,
|
|
23
|
+
HwpxOxmlMemo,
|
|
24
|
+
HwpxOxmlMemoGroup,
|
|
25
|
+
HwpxOxmlParagraph,
|
|
26
|
+
HwpxOxmlRun,
|
|
27
|
+
HwpxOxmlSection,
|
|
28
|
+
HwpxOxmlSectionHeaderFooter,
|
|
29
|
+
HwpxOxmlSectionProperties,
|
|
30
|
+
HwpxOxmlTable,
|
|
31
|
+
HwpxOxmlTableCell,
|
|
32
|
+
HwpxOxmlTableRow,
|
|
33
|
+
PageMargins,
|
|
34
|
+
PageSize,
|
|
35
|
+
RunStyle,
|
|
36
|
+
SectionStartNumbering,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
from .header import (
|
|
40
|
+
BeginNum,
|
|
41
|
+
BorderFillList,
|
|
42
|
+
CharProperty,
|
|
43
|
+
CharPropertyList,
|
|
44
|
+
DocOption,
|
|
45
|
+
Font,
|
|
46
|
+
FontFace,
|
|
47
|
+
FontFaceList,
|
|
48
|
+
FontSubstitution,
|
|
49
|
+
FontTypeInfo,
|
|
50
|
+
ForbiddenWordList,
|
|
51
|
+
Header,
|
|
52
|
+
KeyDerivation,
|
|
53
|
+
KeyEncryption,
|
|
54
|
+
LinkInfo,
|
|
55
|
+
LicenseMark,
|
|
56
|
+
MemoProperties,
|
|
57
|
+
MemoShape,
|
|
58
|
+
NumberingList,
|
|
59
|
+
RefList,
|
|
60
|
+
TabProperties,
|
|
61
|
+
TrackChangeConfig,
|
|
62
|
+
memo_shape_from_attributes,
|
|
63
|
+
parse_begin_num,
|
|
64
|
+
parse_doc_option,
|
|
65
|
+
parse_header_element,
|
|
66
|
+
parse_memo_properties,
|
|
67
|
+
parse_memo_shape,
|
|
68
|
+
parse_ref_list,
|
|
69
|
+
)
|
|
70
|
+
from .parser import element_to_model, parse_header_xml, parse_section_xml
|
|
71
|
+
from .schema import load_schema
|
|
72
|
+
from .utils import XmlSource
|
|
73
|
+
|
|
74
|
+
__all__ = [
|
|
75
|
+
"BeginNum",
|
|
76
|
+
"BorderFillList",
|
|
77
|
+
"CharProperty",
|
|
78
|
+
"CharPropertyList",
|
|
79
|
+
"DocOption",
|
|
80
|
+
"Font",
|
|
81
|
+
"FontFace",
|
|
82
|
+
"FontFaceList",
|
|
83
|
+
"FontSubstitution",
|
|
84
|
+
"FontTypeInfo",
|
|
85
|
+
"ForbiddenWordList",
|
|
86
|
+
"GenericElement",
|
|
87
|
+
"Header",
|
|
88
|
+
"DocumentNumbering",
|
|
89
|
+
"HwpxOxmlDocument",
|
|
90
|
+
"HwpxOxmlHeader",
|
|
91
|
+
"HwpxOxmlInlineObject",
|
|
92
|
+
"HwpxOxmlMemo",
|
|
93
|
+
"HwpxOxmlMemoGroup",
|
|
94
|
+
"HwpxOxmlParagraph",
|
|
95
|
+
"HwpxOxmlRun",
|
|
96
|
+
"HwpxOxmlSection",
|
|
97
|
+
"HwpxOxmlSectionHeaderFooter",
|
|
98
|
+
"HwpxOxmlSectionProperties",
|
|
99
|
+
"HwpxOxmlTable",
|
|
100
|
+
"HwpxOxmlTableCell",
|
|
101
|
+
"HwpxOxmlTableRow",
|
|
102
|
+
"KeyDerivation",
|
|
103
|
+
"KeyEncryption",
|
|
104
|
+
"LinkInfo",
|
|
105
|
+
"LicenseMark",
|
|
106
|
+
"MemoProperties",
|
|
107
|
+
"MemoShape",
|
|
108
|
+
"NumberingList",
|
|
109
|
+
"Paragraph",
|
|
110
|
+
"PageMargins",
|
|
111
|
+
"PageSize",
|
|
112
|
+
"RunStyle",
|
|
113
|
+
"memo_shape_from_attributes",
|
|
114
|
+
"RefList",
|
|
115
|
+
"Run",
|
|
116
|
+
"Section",
|
|
117
|
+
"SectionStartNumbering",
|
|
118
|
+
"TabProperties",
|
|
119
|
+
"TextSpan",
|
|
120
|
+
"TrackChangeConfig",
|
|
121
|
+
"XmlSource",
|
|
122
|
+
"element_to_model",
|
|
123
|
+
"load_schema",
|
|
124
|
+
"parse_begin_num",
|
|
125
|
+
"parse_doc_option",
|
|
126
|
+
"parse_generic_element",
|
|
127
|
+
"parse_header_element",
|
|
128
|
+
"parse_memo_properties",
|
|
129
|
+
"parse_memo_shape",
|
|
130
|
+
"parse_header_xml",
|
|
131
|
+
"parse_paragraph_element",
|
|
132
|
+
"parse_ref_list",
|
|
133
|
+
"parse_run_element",
|
|
134
|
+
"parse_section_element",
|
|
135
|
+
"parse_section_xml",
|
|
136
|
+
"parse_text_span",
|
|
137
|
+
]
|
|
138
|
+
|
hwpx/oxml/body.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from lxml import etree
|
|
7
|
+
|
|
8
|
+
from .common import GenericElement, parse_generic_element
|
|
9
|
+
from .utils import local_name, parse_bool, parse_int
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Child element names that parse_run_element() collects into
# ``Run.inline_objects``.
INLINE_OBJECT_NAMES = {
    "line", "rect", "ellipse", "arc",
    "polyline", "polygon", "curve", "picture",
    "tbl", "shape", "drawingObject", "equation",
    "ole", "chart", "video", "audio",
}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(slots=True)
class TextSpan:
    """Text content of a ``t`` element as produced by :func:`parse_text_span`."""

    # The element's own text followed by the tail text of each child, joined.
    text: str
    # One generic model per child element, in document order.
    marks: List[GenericElement] = field(default_factory=list)
    # Copy of the element's XML attributes.
    attributes: Dict[str, str] = field(default_factory=dict)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@dataclass(slots=True)
class Run:
    """Model of a ``run`` element as produced by :func:`parse_run_element`."""

    # Value of the ``charPrIDRef`` attribute after parse_int(); the attribute
    # is removed from ``attributes``.
    char_pr_id_ref: Optional[int]
    # Children named ``secPr``.
    section_properties: List[GenericElement] = field(default_factory=list)
    # Children named ``ctrl``.
    controls: List[GenericElement] = field(default_factory=list)
    # Children whose name is in INLINE_OBJECT_NAMES.
    inline_objects: List[GenericElement] = field(default_factory=list)
    # Children named ``t``.
    text_spans: List[TextSpan] = field(default_factory=list)
    # Every child that fits none of the categories above.
    other_children: List[GenericElement] = field(default_factory=list)
    # Remaining XML attributes of the element.
    attributes: Dict[str, str] = field(default_factory=dict)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass(slots=True)
class Paragraph:
    """Model of a ``p`` element as produced by :func:`parse_paragraph_element`.

    The six leading fields are taken out of the element's attributes
    (``id``, ``paraPrIDRef``, ``styleIDRef``, ``pageBreak``, ``columnBreak``,
    ``merged``) and converted with parse_int()/parse_bool().
    """

    id: Optional[int]
    para_pr_id_ref: Optional[int]
    style_id_ref: Optional[int]
    page_break: Optional[bool]
    column_break: Optional[bool]
    merged: Optional[bool]
    # Children named ``run``.
    runs: List[Run] = field(default_factory=list)
    # XML attributes left over after the typed fields were extracted.
    attributes: Dict[str, str] = field(default_factory=dict)
    # Every child that is not a ``run``.
    other_children: List[GenericElement] = field(default_factory=list)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@dataclass(slots=True)
class Section:
    """Model of a section element as produced by :func:`parse_section_element`."""

    # Copy of the element's XML attributes.
    attributes: Dict[str, str]
    # Children named ``p``.
    paragraphs: List[Paragraph] = field(default_factory=list)
    # Every child that is not a ``p``.
    other_children: List[GenericElement] = field(default_factory=list)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def parse_text_span(node: etree._Element) -> TextSpan:
    """Build a :class:`TextSpan` from *node*.

    The span text is the node's own text followed by the tail of each child;
    every child is additionally recorded as a generic mark.  Empty or missing
    text fragments are skipped.
    """
    fragments: List[str] = [node.text] if node.text else []
    collected_marks: List[GenericElement] = []

    for mark_node in node:
        collected_marks.append(parse_generic_element(mark_node))
        trailing = mark_node.tail
        if trailing:
            fragments.append(trailing)

    return TextSpan(
        text="".join(fragments),
        marks=collected_marks,
        attributes=dict(node.attrib.items()),
    )
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def parse_run_element(node: etree._Element) -> Run:
    """Build a :class:`Run` from *node*.

    ``charPrIDRef`` is removed from the attributes and converted with
    parse_int().  Each child is filed by name: ``secPr``, ``ctrl``, ``t``,
    a name in INLINE_OBJECT_NAMES, or anything else.
    """
    attrs = dict(node.attrib.items())
    result = Run(
        char_pr_id_ref=parse_int(attrs.pop("charPrIDRef", None)),
        attributes=attrs,
    )

    for element in node:
        tag = local_name(element)
        if tag == "t":
            # Text children get the dedicated text-span model.
            result.text_spans.append(parse_text_span(element))
            continue

        # Every other child is stored as a generic element; only the
        # destination list differs.
        if tag == "secPr":
            bucket = result.section_properties
        elif tag == "ctrl":
            bucket = result.controls
        elif tag in INLINE_OBJECT_NAMES:
            bucket = result.inline_objects
        else:
            bucket = result.other_children
        bucket.append(parse_generic_element(element))

    return result
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def parse_paragraph_element(node: etree._Element) -> Paragraph:
    """Build a :class:`Paragraph` from *node*.

    The typed attributes are removed from the attribute copy and converted;
    whatever is left stays in ``Paragraph.attributes``.  ``run`` children
    become :class:`Run` models, all other children generic elements.
    """
    attrs = dict(node.attrib.items())

    # Extract the typed attributes one by one, in a fixed order.
    ident = parse_int(attrs.pop("id", None))
    para_ref = parse_int(attrs.pop("paraPrIDRef", None))
    style_ref = parse_int(attrs.pop("styleIDRef", None))
    has_page_break = parse_bool(attrs.pop("pageBreak", None))
    has_column_break = parse_bool(attrs.pop("columnBreak", None))
    is_merged = parse_bool(attrs.pop("merged", None))

    result = Paragraph(
        id=ident,
        para_pr_id_ref=para_ref,
        style_id_ref=style_ref,
        page_break=has_page_break,
        column_break=has_column_break,
        merged=is_merged,
        attributes=attrs,
    )

    for element in node:
        if local_name(element) != "run":
            result.other_children.append(parse_generic_element(element))
            continue
        result.runs.append(parse_run_element(element))

    return result
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def parse_section_element(node: etree._Element) -> Section:
    """Build a :class:`Section` from *node*.

    ``p`` children become :class:`Paragraph` models; every other child is
    kept as a generic element.
    """
    result = Section(attributes=dict(node.attrib.items()))

    for element in node:
        if local_name(element) != "p":
            result.other_children.append(parse_generic_element(element))
            continue
        result.paragraphs.append(parse_paragraph_element(element))

    return result
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
__all__ = [
|
|
143
|
+
"Paragraph",
|
|
144
|
+
"Run",
|
|
145
|
+
"Section",
|
|
146
|
+
"TextSpan",
|
|
147
|
+
"parse_paragraph_element",
|
|
148
|
+
"parse_run_element",
|
|
149
|
+
"parse_section_element",
|
|
150
|
+
"parse_text_span",
|
|
151
|
+
]
|
hwpx/oxml/common.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Dict, List, Optional
|
|
5
|
+
|
|
6
|
+
from lxml import etree
|
|
7
|
+
|
|
8
|
+
from .utils import local_name
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
@dataclass(slots=True)
class GenericElement:
    """Fallback representation for XML elements without a specialised model."""

    # Element name as returned by local_name().
    name: str
    # Copy of the element's XML attributes.
    attributes: Dict[str, str] = field(default_factory=dict)
    # Child elements, converted recursively.
    children: List["GenericElement"] = field(default_factory=list)
    # The element's ``text`` value; ``None`` when the element has none.
    text: Optional[str] = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def parse_generic_element(node: etree._Element) -> GenericElement:
    """Recursively turn *node* and its descendants into :class:`GenericElement` objects."""

    # Children are converted first, then the node's own data is read.
    converted_children = []
    for child in node:
        converted_children.append(parse_generic_element(child))
    own_text = node.text
    return GenericElement(
        name=local_name(node),
        attributes=dict(node.attrib.items()),
        children=converted_children,
        text=own_text,
    )
|