python-hwpx 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hwpx/opc/package.py ADDED
@@ -0,0 +1,274 @@
1
+ """Utilities for reading and writing HWPX OPC packages."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from io import BytesIO
7
+ from typing import BinaryIO, Iterable, Iterator, Mapping, MutableMapping
8
+ from xml.etree import ElementTree as ET
9
+ from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile, ZipInfo
10
+
11
+ __all__ = ["HwpxPackage", "HwpxPackageError", "HwpxStructureError", "RootFile", "VersionInfo"]
12
+
13
+
14
class HwpxPackageError(Exception):
    """Base error raised for issues related to :class:`HwpxPackage`.

    Raised directly when a requested package member is not present; more
    specific failures derive from this class.
    """

    pass
16
+
17
+
18
class HwpxStructureError(HwpxPackageError):
    """Raised when the underlying HWPX package violates the required structure.

    Being a :class:`HwpxPackageError` subclass, it is also caught by handlers
    for the base error.
    """

    pass
20
+
21
+
22
@dataclass(frozen=True)
class RootFile:
    """Represents a ``rootfile`` entry from ``META-INF/container.xml``.

    Instances are immutable (frozen dataclass).
    """

    # Path of the referenced root document, used as a key into the package files.
    full_path: str
    # Media type declared on the entry; ``None`` when none was given.
    media_type: str | None = None

    def ensure_exists(self, files: Mapping[str, bytes]) -> None:
        """Ensure that the referenced root file actually exists in ``files``.

        Raises:
            HwpxStructureError: if ``full_path`` is not a key of ``files``.
        """
        if self.full_path in files:
            return
        raise HwpxStructureError(
            f"Root content '{self.full_path}' declared in container.xml is missing."
        )
36
+
37
+
38
class VersionInfo:
    """Model for the ``version.xml`` document.

    Keeps the parsed root element together with the namespace prefixes and
    the XML declaration found in the original bytes, so the document can be
    serialised again after its root attributes have been edited.
    """

    _UTF8_BOM = b"\xef\xbb\xbf"

    def __init__(
        self,
        element: ET.Element,
        namespaces: Mapping[str, str],
        xml_declaration: bytes | None,
    ) -> None:
        """Wrap an already parsed root *element*.

        Args:
            element: Root element of the version document.
            namespaces: Prefix -> URI mapping; copied, so later changes to
                the caller's mapping are not observed.
            xml_declaration: Raw ``<?xml ...?>`` bytes to emit in front of
                the serialised body, or ``None`` to emit none.
        """
        self._element = element
        self._namespaces = dict(namespaces)
        self._xml_declaration = xml_declaration
        self._dirty = False

    @classmethod
    def from_bytes(cls, data: bytes) -> VersionInfo:
        """Parse *data* and capture its namespaces and XML declaration."""
        element = ET.fromstring(data)
        namespaces = cls._collect_namespaces(data)
        declaration = cls._extract_declaration(data)
        return cls(element, namespaces, declaration)

    @staticmethod
    def _collect_namespaces(data: bytes) -> Mapping[str, str]:
        """Return the prefix -> URI declarations found in *data*.

        The default namespace is stored under the empty-string prefix.  When
        a prefix is declared more than once the last declaration wins.
        """
        namespaces: dict[str, str] = {}
        for _event, (prefix, uri) in ET.iterparse(BytesIO(data), events=("start-ns",)):
            namespaces[prefix or ""] = uri
        return namespaces

    @classmethod
    def _extract_declaration(cls, data: bytes) -> bytes | None:
        """Return the leading ``<?xml ...?>`` declaration of *data*, if any.

        Leading whitespace and a UTF-8 byte-order mark are skipped before
        looking for the declaration; the BOM is not part of the result.
        """
        data = data.lstrip()
        if data.startswith(cls._UTF8_BOM):
            # A BOM in front of the declaration must not hide it.
            data = data[len(cls._UTF8_BOM):].lstrip()
        if not data.startswith(b"<?xml"):
            return None
        end = data.find(b"?>")
        if end == -1:
            return None
        return data[: end + 2]

    @property
    def attributes(self) -> Mapping[str, str]:
        """A copy of the root element's attributes."""
        return dict(self._element.attrib)

    def get(self, key: str, default: str | None = None) -> str | None:
        """Return root attribute *key*, or *default* when it is absent."""
        return self._element.attrib.get(key, default)

    def set(self, key: str, value: str) -> None:
        """Set root attribute *key* and flag the document as modified."""
        self._element.attrib[key] = value
        self._dirty = True

    @property
    def tag(self) -> str:
        """Tag of the root element as reported by ElementTree."""
        return self._element.tag

    def to_bytes(self) -> bytes:
        """Serialise the document as UTF-8, prefixed by the stored declaration.

        The captured prefixes are registered through
        ``ET.register_namespace``, which updates ElementTree's process-wide
        prefix registry.
        """
        for prefix, uri in self._namespaces.items():
            try:
                ET.register_namespace(prefix, uri)
            except ValueError:
                # ElementTree rejects prefixes of the form "ns<digits>"
                # (reserved for its auto-generated ones).  Skip them so the
                # document still serialises; the writer assigns a prefix.
                continue
        stream = BytesIO()
        tree = ET.ElementTree(self._element)
        tree.write(stream, encoding="utf-8", xml_declaration=False)
        xml_body = stream.getvalue()
        if self._xml_declaration:
            return self._xml_declaration + xml_body
        return xml_body

    @property
    def dirty(self) -> bool:
        """``True`` once :meth:`set` has been called since the last clean mark."""
        return self._dirty

    def mark_clean(self) -> None:
        """Reset the modification flag."""
        self._dirty = False
109
+
110
+
111
class HwpxPackage:
    """Represents an HWPX package backed by an Open Packaging Convention container.

    Every archive member is held in memory as ``path -> bytes``.  The
    mimetype, the rootfile list and the version document are additionally
    kept in parsed form and refreshed whenever the corresponding member is
    rewritten through :meth:`write`.
    """

    CONTAINER_PATH = "META-INF/container.xml"
    VERSION_PATH = "version.xml"
    MIMETYPE_PATH = "mimetype"
    DEFAULT_MIMETYPE = "application/hwp+zip"

    def __init__(
        self,
        files: MutableMapping[str, bytes],
        rootfiles: Iterable[RootFile],
        version_info: VersionInfo,
        mimetype: str,
    ) -> None:
        """Build a package from already parsed parts and validate it.

        Args:
            files: Archive members; stored by reference, not copied.
            rootfiles: Entries declared in ``container.xml``.
            version_info: Parsed ``version.xml`` document.
            mimetype: Content of the ``mimetype`` member.

        Raises:
            HwpxStructureError: if the structure check fails.
        """
        self._files = files
        self._rootfiles = list(rootfiles)
        self._version = version_info
        self._mimetype = mimetype
        self._validate_structure()

    @classmethod
    def open(cls, pkg_file: str | BinaryIO) -> HwpxPackage:
        """Read every member of the ZIP archive *pkg_file* into a package.

        Raises:
            HwpxStructureError: if ``mimetype``, ``META-INF/container.xml``
                or ``version.xml`` is missing, or the structure is invalid.
        """
        with ZipFile(pkg_file, "r") as zf:
            files = {info.filename: zf.read(info) for info in zf.infolist()}
        if cls.MIMETYPE_PATH not in files:
            raise HwpxStructureError("HWPX package is missing the mandatory 'mimetype' file.")
        mimetype = files[cls.MIMETYPE_PATH].decode("utf-8")
        rootfiles = cls._parse_container(files.get(cls.CONTAINER_PATH))
        version_info = cls._parse_version(files.get(cls.VERSION_PATH))
        return cls(files, rootfiles, version_info, mimetype)

    @staticmethod
    def _parse_container(data: bytes | None) -> list[RootFile]:
        """Extract the ``rootfile`` entries from ``container.xml`` bytes.

        Elements named ``rootfile`` are matched in any namespace.  The path
        and media type are looked up under hyphenated, camelCase and
        snake_case attribute spellings, in that order.

        Raises:
            HwpxStructureError: if *data* is ``None``, an entry has no path,
                or no entry is declared at all.
        """
        if data is None:
            raise HwpxStructureError(
                "HWPX package is missing 'META-INF/container.xml'."
            )
        root = ET.fromstring(data)
        rootfiles = []
        for elem in root.findall(".//{*}rootfile"):
            full_path = (
                elem.get("full-path")
                or elem.get("fullPath")
                or elem.get("full_path")
            )
            if not full_path:
                raise HwpxStructureError("container.xml contains a rootfile without 'full-path'.")
            media_type = (
                elem.get("media-type")
                or elem.get("mediaType")
                or elem.get("media_type")
            )
            rootfiles.append(RootFile(full_path, media_type))
        if not rootfiles:
            raise HwpxStructureError("container.xml does not declare any rootfiles.")
        return rootfiles

    @staticmethod
    def _parse_version(data: bytes | None) -> VersionInfo:
        """Parse ``version.xml`` bytes; a missing document is a structure error."""
        if data is None:
            raise HwpxStructureError("HWPX package is missing 'version.xml'.")
        return VersionInfo.from_bytes(data)

    def _validate_structure(self) -> None:
        """Check that every rootfile exists and a contents directory is present.

        Raises:
            HwpxStructureError: if a declared rootfile is absent or no member
                path starts with ``Contents/`` or ``Content/``.
        """
        for rootfile in self._rootfiles:
            rootfile.ensure_exists(self._files)
        if not any(path.startswith(("Contents/", "Content/")) for path in self._files):
            raise HwpxStructureError(
                "HWPX package does not contain a 'Contents' directory."
            )

    @property
    def mimetype(self) -> str:
        """Current content of the ``mimetype`` member as text."""
        return self._mimetype

    @property
    def rootfiles(self) -> tuple[RootFile, ...]:
        """Snapshot of the declared rootfiles."""
        return tuple(self._rootfiles)

    def iter_rootfiles(self) -> Iterator[RootFile]:
        """Iterate over the declared rootfiles in declaration order."""
        yield from self._rootfiles

    @property
    def main_content(self) -> RootFile:
        """The rootfile typed ``application/hwpml-package+xml``, else the first one."""
        for rootfile in self._rootfiles:
            if rootfile.media_type == "application/hwpml-package+xml":
                return rootfile
        return self._rootfiles[0]

    @property
    def version_info(self) -> VersionInfo:
        """The parsed ``version.xml`` document."""
        return self._version

    def read(self, path: str) -> bytes:
        """Return the bytes stored under *path* (backslashes are accepted).

        Raises:
            HwpxPackageError: if no such member exists.
        """
        norm_path = self._normalize_path(path)
        try:
            return self._files[norm_path]
        except KeyError as exc:
            raise HwpxPackageError(f"File '{norm_path}' is not present in the package.") from exc

    def write(self, path: str, data: bytes | str) -> None:
        """Store *data* under *path*, refreshing the parsed state it affects.

        ``str`` data is encoded as UTF-8.  Writing ``mimetype``,
        ``container.xml`` or ``version.xml`` also replaces the matching
        parsed value.  The change is applied atomically: when the resulting
        package fails validation, the previous content and parsed state are
        restored before the error propagates.

        Raises:
            HwpxStructureError: if the new content is structurally invalid.
        """
        norm_path = self._normalize_path(path)
        if isinstance(data, str):
            data = data.encode("utf-8")

        # Parse first so that malformed input is rejected before any state changes.
        new_mimetype = self._mimetype
        new_rootfiles = self._rootfiles
        new_version = self._version
        if norm_path == self.MIMETYPE_PATH:
            new_mimetype = data.decode("utf-8")
        elif norm_path == self.CONTAINER_PATH:
            new_rootfiles = self._parse_container(data)
        elif norm_path == self.VERSION_PATH:
            new_version = self._parse_version(data)

        existed = norm_path in self._files
        old_data = self._files[norm_path] if existed else None
        old_state = (self._mimetype, self._rootfiles, self._version)

        self._files[norm_path] = data
        self._mimetype = new_mimetype
        self._rootfiles = new_rootfiles
        self._version = new_version
        try:
            self._validate_structure()
        except Exception:
            # Undo the write so a rejected change cannot corrupt the package.
            if existed:
                self._files[norm_path] = old_data
            else:
                del self._files[norm_path]
            self._mimetype, self._rootfiles, self._version = old_state
            raise

    def delete(self, path: str) -> None:
        """Remove the member stored under *path*.

        The member is put back when its removal would leave the package
        structurally invalid.

        Raises:
            HwpxPackageError: if no such member exists.
            HwpxStructureError: if the member is mandatory or its removal
                breaks the package structure.
        """
        norm_path = self._normalize_path(path)
        if norm_path not in self._files:
            raise HwpxPackageError(f"File '{norm_path}' is not present in the package.")
        if norm_path in {self.MIMETYPE_PATH, self.CONTAINER_PATH, self.VERSION_PATH}:
            raise HwpxStructureError(
                "Cannot remove mandatory files ('mimetype', 'container.xml', 'version.xml')."
            )
        removed = self._files[norm_path]
        del self._files[norm_path]
        try:
            self._validate_structure()
        except Exception:
            # Restore the member so a rejected delete has no effect.
            self._files[norm_path] = removed
            raise

    @staticmethod
    def _normalize_path(path: str) -> str:
        """Convert backslash separators to forward slashes."""
        return path.replace("\\", "/")

    def files(self) -> list[str]:
        """Return the member paths in sorted order."""
        return sorted(self._files)

    def save(self, pkg_file: str | BinaryIO) -> None:
        """Write the package to *pkg_file* as a ZIP archive.

        The ``mimetype`` member is refreshed from the current mimetype and a
        modified version document is re-serialised before validation.
        ``mimetype`` is written first and uncompressed; all other members
        follow in sorted order using deflate.

        Raises:
            HwpxStructureError: if the package is structurally invalid.
        """
        self._files[self.MIMETYPE_PATH] = self._mimetype.encode("utf-8")
        if self._version.dirty:
            self._files[self.VERSION_PATH] = self._version.to_bytes()
            self._version.mark_clean()
        self._validate_structure()
        with ZipFile(pkg_file, "w") as zf:
            self._write_mimetype(zf)
            for name in sorted(self._files):
                if name == self.MIMETYPE_PATH:
                    continue
                data = self._files[name]
                info = ZipInfo(name)
                info.compress_type = ZIP_DEFLATED
                zf.writestr(info, data)

    def _write_mimetype(self, zf: ZipFile) -> None:
        """Write the ``mimetype`` member to *zf* without compression."""
        info = ZipInfo(self.MIMETYPE_PATH)
        info.compress_type = ZIP_STORED
        zf.writestr(info, self._files[self.MIMETYPE_PATH])
274
+
hwpx/oxml/__init__.py ADDED
@@ -0,0 +1,138 @@
1
+
2
+ """Open XML helpers for the HWPX document format."""
3
+
4
+ from __future__ import annotations
5
+
6
+ from .body import (
7
+ Paragraph,
8
+ Run,
9
+ Section,
10
+ TextSpan,
11
+ parse_paragraph_element,
12
+ parse_run_element,
13
+ parse_section_element,
14
+ parse_text_span,
15
+ )
16
+ from .common import GenericElement, parse_generic_element
17
+
18
+ from .document import (
19
+ DocumentNumbering,
20
+ HwpxOxmlDocument,
21
+ HwpxOxmlHeader,
22
+ HwpxOxmlInlineObject,
23
+ HwpxOxmlMemo,
24
+ HwpxOxmlMemoGroup,
25
+ HwpxOxmlParagraph,
26
+ HwpxOxmlRun,
27
+ HwpxOxmlSection,
28
+ HwpxOxmlSectionHeaderFooter,
29
+ HwpxOxmlSectionProperties,
30
+ HwpxOxmlTable,
31
+ HwpxOxmlTableCell,
32
+ HwpxOxmlTableRow,
33
+ PageMargins,
34
+ PageSize,
35
+ RunStyle,
36
+ SectionStartNumbering,
37
+ )
38
+
39
+ from .header import (
40
+ BeginNum,
41
+ BorderFillList,
42
+ CharProperty,
43
+ CharPropertyList,
44
+ DocOption,
45
+ Font,
46
+ FontFace,
47
+ FontFaceList,
48
+ FontSubstitution,
49
+ FontTypeInfo,
50
+ ForbiddenWordList,
51
+ Header,
52
+ KeyDerivation,
53
+ KeyEncryption,
54
+ LinkInfo,
55
+ LicenseMark,
56
+ MemoProperties,
57
+ MemoShape,
58
+ NumberingList,
59
+ RefList,
60
+ TabProperties,
61
+ TrackChangeConfig,
62
+ memo_shape_from_attributes,
63
+ parse_begin_num,
64
+ parse_doc_option,
65
+ parse_header_element,
66
+ parse_memo_properties,
67
+ parse_memo_shape,
68
+ parse_ref_list,
69
+ )
70
+ from .parser import element_to_model, parse_header_xml, parse_section_xml
71
+ from .schema import load_schema
72
+ from .utils import XmlSource
73
+
74
+ __all__ = [
75
+ "BeginNum",
76
+ "BorderFillList",
77
+ "CharProperty",
78
+ "CharPropertyList",
79
+ "DocOption",
80
+ "Font",
81
+ "FontFace",
82
+ "FontFaceList",
83
+ "FontSubstitution",
84
+ "FontTypeInfo",
85
+ "ForbiddenWordList",
86
+ "GenericElement",
87
+ "Header",
88
+ "DocumentNumbering",
89
+ "HwpxOxmlDocument",
90
+ "HwpxOxmlHeader",
91
+ "HwpxOxmlInlineObject",
92
+ "HwpxOxmlMemo",
93
+ "HwpxOxmlMemoGroup",
94
+ "HwpxOxmlParagraph",
95
+ "HwpxOxmlRun",
96
+ "HwpxOxmlSection",
97
+ "HwpxOxmlSectionHeaderFooter",
98
+ "HwpxOxmlSectionProperties",
99
+ "HwpxOxmlTable",
100
+ "HwpxOxmlTableCell",
101
+ "HwpxOxmlTableRow",
102
+ "KeyDerivation",
103
+ "KeyEncryption",
104
+ "LinkInfo",
105
+ "LicenseMark",
106
+ "MemoProperties",
107
+ "MemoShape",
108
+ "NumberingList",
109
+ "Paragraph",
110
+ "PageMargins",
111
+ "PageSize",
112
+ "RunStyle",
113
+ "memo_shape_from_attributes",
114
+ "RefList",
115
+ "Run",
116
+ "Section",
117
+ "SectionStartNumbering",
118
+ "TabProperties",
119
+ "TextSpan",
120
+ "TrackChangeConfig",
121
+ "XmlSource",
122
+ "element_to_model",
123
+ "load_schema",
124
+ "parse_begin_num",
125
+ "parse_doc_option",
126
+ "parse_generic_element",
127
+ "parse_header_element",
128
+ "parse_memo_properties",
129
+ "parse_memo_shape",
130
+ "parse_header_xml",
131
+ "parse_paragraph_element",
132
+ "parse_ref_list",
133
+ "parse_run_element",
134
+ "parse_section_element",
135
+ "parse_section_xml",
136
+ "parse_text_span",
137
+ ]
138
+
hwpx/oxml/body.py ADDED
@@ -0,0 +1,151 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Dict, List, Optional
5
+
6
+ from lxml import etree
7
+
8
+ from .common import GenericElement, parse_generic_element
9
+ from .utils import local_name, parse_bool, parse_int
10
+
11
+
12
# Local element names that parse_run_element collects into Run.inline_objects.
INLINE_OBJECT_NAMES = {
    "line", "rect", "ellipse", "arc",
    "polyline", "polygon", "curve", "picture",
    "tbl", "shape", "drawingObject", "equation",
    "ole", "chart", "video", "audio",
}
30
+
31
+
32
+ @dataclass(slots=True)
33
+ class TextSpan:
34
+ text: str
35
+ marks: List[GenericElement] = field(default_factory=list)
36
+ attributes: Dict[str, str] = field(default_factory=dict)
37
+
38
+
39
+ @dataclass(slots=True)
40
+ class Run:
41
+ char_pr_id_ref: Optional[int]
42
+ section_properties: List[GenericElement] = field(default_factory=list)
43
+ controls: List[GenericElement] = field(default_factory=list)
44
+ inline_objects: List[GenericElement] = field(default_factory=list)
45
+ text_spans: List[TextSpan] = field(default_factory=list)
46
+ other_children: List[GenericElement] = field(default_factory=list)
47
+ attributes: Dict[str, str] = field(default_factory=dict)
48
+
49
+
50
+ @dataclass(slots=True)
51
+ class Paragraph:
52
+ id: Optional[int]
53
+ para_pr_id_ref: Optional[int]
54
+ style_id_ref: Optional[int]
55
+ page_break: Optional[bool]
56
+ column_break: Optional[bool]
57
+ merged: Optional[bool]
58
+ runs: List[Run] = field(default_factory=list)
59
+ attributes: Dict[str, str] = field(default_factory=dict)
60
+ other_children: List[GenericElement] = field(default_factory=list)
61
+
62
+
63
+ @dataclass(slots=True)
64
+ class Section:
65
+ attributes: Dict[str, str]
66
+ paragraphs: List[Paragraph] = field(default_factory=list)
67
+ other_children: List[GenericElement] = field(default_factory=list)
68
+
69
+
70
def parse_text_span(node: etree._Element) -> TextSpan:
    """Build a :class:`TextSpan` from *node*.

    The span text is the node's own text followed by the tail text of each
    child, in document order; every child is also kept as a generic mark.
    """
    pieces: List[str] = [node.text] if node.text else []
    child_models: List[GenericElement] = []

    for child in node:
        child_models.append(parse_generic_element(child))
        tail = child.tail
        if tail:
            pieces.append(tail)

    return TextSpan(
        text="".join(pieces),
        marks=child_models,
        attributes=dict(node.attrib.items()),
    )
84
+
85
+
86
def parse_run_element(node: etree._Element) -> Run:
    """Build a :class:`Run` from *node*, sorting its children by local name.

    ``charPrIDRef`` is removed from the attribute mapping and parsed with
    ``parse_int``; the remaining attributes are kept verbatim.
    """
    attrs = dict(node.attrib.items())
    run = Run(char_pr_id_ref=parse_int(attrs.pop("charPrIDRef", None)), attributes=attrs)

    # Names that map straight to one generic bucket on the run.
    generic_buckets = {
        "secPr": run.section_properties,
        "ctrl": run.controls,
    }

    for child in node:
        tag = local_name(child)
        if tag == "t":
            # Text children get the dedicated span model.
            run.text_spans.append(parse_text_span(child))
            continue
        if tag in generic_buckets:
            bucket = generic_buckets[tag]
        elif tag in INLINE_OBJECT_NAMES:
            bucket = run.inline_objects
        else:
            bucket = run.other_children
        bucket.append(parse_generic_element(child))

    return run
106
+
107
+
108
def parse_paragraph_element(node: etree._Element) -> Paragraph:
    """Build a :class:`Paragraph` from *node*.

    The typed attributes are popped from the attribute mapping and parsed
    with ``parse_int`` / ``parse_bool``; the remaining attributes are kept
    as-is.  ``run`` children become :class:`Run` models, all other children
    are kept as generic elements.
    """
    remaining = dict(node.attrib.items())

    paragraph_id = parse_int(remaining.pop("id", None))
    para_pr = parse_int(remaining.pop("paraPrIDRef", None))
    style = parse_int(remaining.pop("styleIDRef", None))
    page_break = parse_bool(remaining.pop("pageBreak", None))
    column_break = parse_bool(remaining.pop("columnBreak", None))
    merged = parse_bool(remaining.pop("merged", None))

    paragraph = Paragraph(
        id=paragraph_id,
        para_pr_id_ref=para_pr,
        style_id_ref=style,
        page_break=page_break,
        column_break=column_break,
        merged=merged,
        attributes=remaining,
    )

    for child in node:
        if local_name(child) != "run":
            paragraph.other_children.append(parse_generic_element(child))
            continue
        paragraph.runs.append(parse_run_element(child))

    return paragraph
128
+
129
+
130
def parse_section_element(node: etree._Element) -> Section:
    """Build a :class:`Section` from *node*.

    ``p`` children become :class:`Paragraph` models; every other child is
    kept as a generic element.
    """
    section = Section(attributes=dict(node.attrib.items()))

    for child in node:
        if local_name(child) != "p":
            section.other_children.append(parse_generic_element(child))
            continue
        section.paragraphs.append(parse_paragraph_element(child))

    return section
140
+
141
+
142
+ __all__ = [
143
+ "Paragraph",
144
+ "Run",
145
+ "Section",
146
+ "TextSpan",
147
+ "parse_paragraph_element",
148
+ "parse_run_element",
149
+ "parse_section_element",
150
+ "parse_text_span",
151
+ ]
hwpx/oxml/common.py ADDED
@@ -0,0 +1,31 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass, field
4
+ from typing import Dict, List, Optional
5
+
6
+ from lxml import etree
7
+
8
+ from .utils import local_name
9
+
10
+
11
+ @dataclass(slots=True)
12
+ class GenericElement:
13
+ """Fallback representation for XML elements without a specialised model."""
14
+
15
+ name: str
16
+ attributes: Dict[str, str] = field(default_factory=dict)
17
+ children: List["GenericElement"] = field(default_factory=list)
18
+ text: Optional[str] = None
19
+
20
+
21
def parse_generic_element(node: etree._Element) -> GenericElement:
    """Convert *node* into a :class:`GenericElement`.

    Children are converted recursively before the element itself is built;
    the element's text is carried over unchanged (``None`` stays ``None``).
    """
    converted_children = [parse_generic_element(child) for child in node]
    return GenericElement(
        name=local_name(node),
        attributes=dict(node.attrib.items()),
        children=converted_children,
        text=node.text,
    )