python-hwpx 2.7__tar.gz → 2.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {python_hwpx-2.7/src/python_hwpx.egg-info → python_hwpx-2.8}/PKG-INFO +10 -3
- {python_hwpx-2.7 → python_hwpx-2.8}/README.md +9 -2
- {python_hwpx-2.7 → python_hwpx-2.8}/pyproject.toml +26 -4
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/opc/package.py +62 -97
- python_hwpx-2.8/src/hwpx/opc/relationships.py +227 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/document.py +5 -2
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/archive_cli.py +35 -11
- python_hwpx-2.8/src/hwpx/tools/package_validator.py +352 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/page_guard.py +12 -40
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/template_analyzer.py +35 -19
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/text_extractor.py +44 -27
- {python_hwpx-2.7 → python_hwpx-2.8/src/python_hwpx.egg-info}/PKG-INFO +10 -3
- {python_hwpx-2.7 → python_hwpx-2.8}/src/python_hwpx.egg-info/SOURCES.txt +1 -0
- python_hwpx-2.8/tests/test_gap_closure_tools.py +548 -0
- python_hwpx-2.7/src/hwpx/tools/package_validator.py +0 -219
- python_hwpx-2.7/tests/test_gap_closure_tools.py +0 -221
- {python_hwpx-2.7 → python_hwpx-2.8}/LICENSE +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/setup.cfg +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/__init__.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/data/Skeleton.hwpx +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/document.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/opc/xml_utils.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/__init__.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/body.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/common.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/header.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/header_part.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/memo.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/namespaces.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/paragraph.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/parser.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/schema.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/section.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/table.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/oxml/utils.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/package.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/py.typed +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/templates.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/__init__.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/_schemas/header.xsd +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/_schemas/section.xsd +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/exporter.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/object_finder.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/text_extract_cli.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/hwpx/tools/validator.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/python_hwpx.egg-info/dependency_links.txt +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/python_hwpx.egg-info/entry_points.txt +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/python_hwpx.egg-info/requires.txt +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/src/python_hwpx.egg-info/top_level.txt +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_coverage_targets.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_document_context_manager.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_document_formatting.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_document_save_api.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_inline_models.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_integration_hwpx_compatibility.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_integration_roundtrip.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_memo_and_style_editing.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_new_features.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_opc_package.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_oxml_parsing.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_packaging_py_typed.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_paragraph_section_management.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_repr_snapshots.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_section_headers.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_split_merged_cell.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_tables_default_border.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_text_extractor_annotations.py +0 -0
- {python_hwpx-2.7 → python_hwpx-2.8}/tests/test_version_metadata.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: python-hwpx
|
|
3
|
-
Version: 2.7
|
|
3
|
+
Version: 2.8
|
|
4
4
|
Summary: Hancom HWPX 패키지를 로드하고 편집하기 위한 Python 유틸리티 모음
|
|
5
5
|
Author: python-hwpx Maintainers
|
|
6
6
|
License: Non-Commercial License
|
|
@@ -284,12 +284,15 @@ hwpx-validate 문서.hwpx
|
|
|
284
284
|
# ZIP/OPC/HWPX 패키지 구조 검사
|
|
285
285
|
hwpx-validate-package 문서.hwpx
|
|
286
286
|
|
|
287
|
-
# HWPX 풀기 / 다시 묶기
|
|
287
|
+
# HWPX 풀기 / 다시 묶기 (기본값: XML/HWPF 바이트 보존)
|
|
288
288
|
hwpx-unpack 문서.hwpx ./unpacked
|
|
289
|
+
hwpx-unpack 문서.hwpx ./pretty-unpacked --pretty-xml
|
|
289
290
|
hwpx-pack ./unpacked ./repacked.hwpx
|
|
290
291
|
|
|
291
|
-
# 레퍼런스 템플릿 분석과
|
|
292
|
+
# 레퍼런스 템플릿 분석과 pack-ready 추출
|
|
292
293
|
hwpx-analyze-template 문서.hwpx --extract-dir ./template-parts --json
|
|
294
|
+
hwpx-pack ./template-parts ./template-roundtrip.hwpx
|
|
295
|
+
hwpx-validate-package ./template-roundtrip.hwpx
|
|
293
296
|
|
|
294
297
|
# plain / markdown 텍스트 추출
|
|
295
298
|
hwpx-text-extract 문서.hwpx --format markdown --output 문서.md
|
|
@@ -300,6 +303,10 @@ hwpx-page-guard --reference 원본.hwpx --output 결과.hwpx
|
|
|
300
303
|
|
|
301
304
|
`hwpx-page-guard`는 렌더된 실제 쪽수를 계산하지 않습니다. 대신 단락 수, 표 수, shape/control 수, 명시적 page/column break, 텍스트 길이 통계를 비교해 레이아웃 드리프트 위험을 탐지하는 프록시 도구입니다.
|
|
302
305
|
|
|
306
|
+
`hwpx-validate-package`는 `Contents/content.hpf` 같은 고정 경로를 강제하지 않고, `META-INF/container.xml`과 선택된 rootfile/manifest 관계를 따라가며 검사합니다. 엔진이 fallback으로 열 수 있는 비표준 패키지는 가능한 경우 경고로 구분합니다.
|
|
307
|
+
|
|
308
|
+
`hwpx-analyze-template --extract-dir`는 covered fixture 기준으로 `hwpx-pack`과 `hwpx-validate-package`, 그리고 엔진 open 경로에 다시 투입할 수 있는 pack-ready 작업 디렉터리를 만듭니다. 이건 재패킹 가능성을 목표로 한 것이지, 렌더링 fidelity를 보장한다는 뜻은 아닙니다.
|
|
309
|
+
|
|
303
310
|
## 문서
|
|
304
311
|
|
|
305
312
|
| | |
|
|
@@ -217,12 +217,15 @@ hwpx-validate 문서.hwpx
|
|
|
217
217
|
# ZIP/OPC/HWPX 패키지 구조 검사
|
|
218
218
|
hwpx-validate-package 문서.hwpx
|
|
219
219
|
|
|
220
|
-
# HWPX 풀기 / 다시 묶기
|
|
220
|
+
# HWPX 풀기 / 다시 묶기 (기본값: XML/HWPF 바이트 보존)
|
|
221
221
|
hwpx-unpack 문서.hwpx ./unpacked
|
|
222
|
+
hwpx-unpack 문서.hwpx ./pretty-unpacked --pretty-xml
|
|
222
223
|
hwpx-pack ./unpacked ./repacked.hwpx
|
|
223
224
|
|
|
224
|
-
# 레퍼런스 템플릿 분석과
|
|
225
|
+
# 레퍼런스 템플릿 분석과 pack-ready 추출
|
|
225
226
|
hwpx-analyze-template 문서.hwpx --extract-dir ./template-parts --json
|
|
227
|
+
hwpx-pack ./template-parts ./template-roundtrip.hwpx
|
|
228
|
+
hwpx-validate-package ./template-roundtrip.hwpx
|
|
226
229
|
|
|
227
230
|
# plain / markdown 텍스트 추출
|
|
228
231
|
hwpx-text-extract 문서.hwpx --format markdown --output 문서.md
|
|
@@ -233,6 +236,10 @@ hwpx-page-guard --reference 원본.hwpx --output 결과.hwpx
|
|
|
233
236
|
|
|
234
237
|
`hwpx-page-guard`는 렌더된 실제 쪽수를 계산하지 않습니다. 대신 단락 수, 표 수, shape/control 수, 명시적 page/column break, 텍스트 길이 통계를 비교해 레이아웃 드리프트 위험을 탐지하는 프록시 도구입니다.
|
|
235
238
|
|
|
239
|
+
`hwpx-validate-package`는 `Contents/content.hpf` 같은 고정 경로를 강제하지 않고, `META-INF/container.xml`과 선택된 rootfile/manifest 관계를 따라가며 검사합니다. 엔진이 fallback으로 열 수 있는 비표준 패키지는 가능한 경우 경고로 구분합니다.
|
|
240
|
+
|
|
241
|
+
`hwpx-analyze-template --extract-dir`는 covered fixture 기준으로 `hwpx-pack`과 `hwpx-validate-package`, 그리고 엔진 open 경로에 다시 투입할 수 있는 pack-ready 작업 디렉터리를 만듭니다. 이건 재패킹 가능성을 목표로 한 것이지, 렌더링 fidelity를 보장한다는 뜻은 아닙니다.
|
|
242
|
+
|
|
236
243
|
## 문서
|
|
237
244
|
|
|
238
245
|
| | |
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "python-hwpx"
|
|
7
|
-
version = "2.7"
|
|
7
|
+
version = "2.8"
|
|
8
8
|
description = "Hancom HWPX 패키지를 로드하고 편집하기 위한 Python 유틸리티 모음"
|
|
9
9
|
readme = { file = "README.md", content-type = "text/markdown" }
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -78,7 +78,18 @@ testpaths = ["tests"]
|
|
|
78
78
|
|
|
79
79
|
[tool.mypy]
|
|
80
80
|
python_version = "3.10"
|
|
81
|
-
files = [
|
|
81
|
+
files = [
|
|
82
|
+
"src/hwpx/document.py",
|
|
83
|
+
"src/hwpx/oxml/document.py",
|
|
84
|
+
"src/hwpx/opc/package.py",
|
|
85
|
+
"src/hwpx/opc/relationships.py",
|
|
86
|
+
"src/hwpx/tools/archive_cli.py",
|
|
87
|
+
"src/hwpx/tools/package_validator.py",
|
|
88
|
+
"src/hwpx/tools/page_guard.py",
|
|
89
|
+
"src/hwpx/tools/template_analyzer.py",
|
|
90
|
+
"src/hwpx/tools/text_extract_cli.py",
|
|
91
|
+
"src/hwpx/tools/text_extractor.py",
|
|
92
|
+
]
|
|
82
93
|
ignore_missing_imports = true
|
|
83
94
|
|
|
84
95
|
[[tool.mypy.overrides]]
|
|
@@ -86,7 +97,18 @@ module = ["hwpx.document", "hwpx.oxml.document"]
|
|
|
86
97
|
ignore_errors = true
|
|
87
98
|
|
|
88
99
|
[tool.pyright]
|
|
89
|
-
|
|
100
|
+
# Keep basic pyright focused on OPC/tooling modules that currently pass
|
|
101
|
+
# without suppressing their real diagnostics.
|
|
102
|
+
include = [
|
|
103
|
+
"src/hwpx/opc/package.py",
|
|
104
|
+
"src/hwpx/opc/relationships.py",
|
|
105
|
+
"src/hwpx/tools/archive_cli.py",
|
|
106
|
+
"src/hwpx/tools/package_validator.py",
|
|
107
|
+
"src/hwpx/tools/page_guard.py",
|
|
108
|
+
"src/hwpx/tools/template_analyzer.py",
|
|
109
|
+
"src/hwpx/tools/text_extract_cli.py",
|
|
110
|
+
"src/hwpx/tools/text_extractor.py",
|
|
111
|
+
]
|
|
90
112
|
pythonVersion = "3.10"
|
|
91
|
-
typeCheckingMode = "
|
|
113
|
+
typeCheckingMode = "basic"
|
|
92
114
|
reportMissingTypeStubs = false
|
|
@@ -7,12 +7,21 @@ import io
|
|
|
7
7
|
import os
|
|
8
8
|
import tempfile
|
|
9
9
|
from dataclasses import dataclass
|
|
10
|
-
from pathlib import Path
|
|
10
|
+
from pathlib import Path, PurePosixPath
|
|
11
11
|
from typing import BinaryIO, Iterable, Iterator, Mapping, MutableMapping
|
|
12
12
|
from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile, ZipInfo
|
|
13
13
|
|
|
14
|
-
from lxml import etree
|
|
14
|
+
from lxml import etree # type: ignore[reportAttributeAccessIssue]
|
|
15
15
|
|
|
16
|
+
from .relationships import (
|
|
17
|
+
MAIN_ROOTFILE_MEDIA_TYPE,
|
|
18
|
+
OPF_NS,
|
|
19
|
+
is_header_part_name,
|
|
20
|
+
is_section_part_name,
|
|
21
|
+
normalize_part_name,
|
|
22
|
+
parse_container_rootfiles,
|
|
23
|
+
parse_manifest_relationships,
|
|
24
|
+
)
|
|
16
25
|
from .xml_utils import (
|
|
17
26
|
extract_xml_declaration,
|
|
18
27
|
iter_declared_namespaces,
|
|
@@ -24,8 +33,6 @@ __all__ = ["HwpxPackage", "HwpxPackageError", "HwpxStructureError", "RootFile",
|
|
|
24
33
|
|
|
25
34
|
logger = logging.getLogger(__name__)
|
|
26
35
|
|
|
27
|
-
_OPF_NS = "http://www.idpf.org/2007/opf/"
|
|
28
|
-
|
|
29
36
|
|
|
30
37
|
class HwpxPackageError(Exception):
|
|
31
38
|
"""Base error raised for issues related to :class:`HwpxPackage`."""
|
|
@@ -169,25 +176,10 @@ class HwpxPackage:
|
|
|
169
176
|
except Exception:
|
|
170
177
|
logger.exception("container.xml 파싱에 실패했습니다.")
|
|
171
178
|
raise
|
|
172
|
-
rootfiles = [
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
if full_path and not full_path_attr:
|
|
177
|
-
logger.warning(
|
|
178
|
-
"container.xml rootfile이 비표준 경로 속성명을 사용했습니다: %s",
|
|
179
|
-
elem.attrib,
|
|
180
|
-
)
|
|
181
|
-
if not full_path:
|
|
182
|
-
raise HwpxStructureError("container.xml contains a rootfile without 'full-path'.")
|
|
183
|
-
media_type_attr = elem.get("media-type")
|
|
184
|
-
media_type = media_type_attr or elem.get("mediaType") or elem.get("media_type")
|
|
185
|
-
if media_type and not media_type_attr:
|
|
186
|
-
logger.warning(
|
|
187
|
-
"container.xml rootfile이 비표준 media-type 속성명을 사용했습니다: %s",
|
|
188
|
-
elem.attrib,
|
|
189
|
-
)
|
|
190
|
-
rootfiles.append(RootFile(full_path, media_type))
|
|
179
|
+
rootfiles = [
|
|
180
|
+
RootFile(ref.full_path, ref.media_type)
|
|
181
|
+
for ref in parse_container_rootfiles(root)
|
|
182
|
+
]
|
|
191
183
|
if not rootfiles:
|
|
192
184
|
raise HwpxStructureError("container.xml does not declare any rootfiles.")
|
|
193
185
|
return rootfiles
|
|
@@ -201,10 +193,6 @@ class HwpxPackage:
|
|
|
201
193
|
def _validate_structure(self) -> None:
|
|
202
194
|
for rootfile in self._rootfiles:
|
|
203
195
|
rootfile.ensure_exists(self._files)
|
|
204
|
-
if not any(path.startswith(("Contents/", "Content/")) for path in self._files):
|
|
205
|
-
raise HwpxStructureError(
|
|
206
|
-
"HWPX package does not contain a 'Contents' directory."
|
|
207
|
-
)
|
|
208
196
|
|
|
209
197
|
@property
|
|
210
198
|
def mimetype(self) -> str:
|
|
@@ -220,7 +208,7 @@ class HwpxPackage:
|
|
|
220
208
|
@property
|
|
221
209
|
def main_content(self) -> RootFile:
|
|
222
210
|
for rootfile in self._rootfiles:
|
|
223
|
-
if rootfile.media_type == "application/hwpml-package+xml":
|
|
211
|
+
if rootfile.media_type == MAIN_ROOTFILE_MEDIA_TYPE:
|
|
224
212
|
return rootfile
|
|
225
213
|
selected = self._rootfiles[0]
|
|
226
214
|
logger.warning(
|
|
@@ -254,7 +242,6 @@ class HwpxPackage:
|
|
|
254
242
|
elif norm_path == self.VERSION_PATH:
|
|
255
243
|
pending_version = self._parse_version(data)
|
|
256
244
|
self._files[norm_path] = data
|
|
257
|
-
self._invalidate_caches(norm_path)
|
|
258
245
|
if norm_path == self.MIMETYPE_PATH:
|
|
259
246
|
self._mimetype = mimetype
|
|
260
247
|
elif norm_path == self.CONTAINER_PATH:
|
|
@@ -263,6 +250,7 @@ class HwpxPackage:
|
|
|
263
250
|
elif norm_path == self.VERSION_PATH:
|
|
264
251
|
assert pending_version is not None
|
|
265
252
|
self._version = pending_version
|
|
253
|
+
self._invalidate_caches(norm_path)
|
|
266
254
|
self._validate_structure()
|
|
267
255
|
|
|
268
256
|
def delete(self, path: str) -> None:
|
|
@@ -274,11 +262,12 @@ class HwpxPackage:
|
|
|
274
262
|
"Cannot remove mandatory files ('mimetype', 'container.xml', 'version.xml')."
|
|
275
263
|
)
|
|
276
264
|
del self._files[norm_path]
|
|
265
|
+
self._invalidate_caches(norm_path)
|
|
277
266
|
self._validate_structure()
|
|
278
267
|
|
|
279
268
|
@staticmethod
|
|
280
269
|
def _normalize_path(path: str) -> str:
|
|
281
|
-
return path
|
|
270
|
+
return normalize_part_name(path)
|
|
282
271
|
|
|
283
272
|
def files(self) -> list[str]:
|
|
284
273
|
return sorted(self._files)
|
|
@@ -314,13 +303,12 @@ class HwpxPackage:
|
|
|
314
303
|
|
|
315
304
|
def manifest_tree(self) -> etree._Element:
|
|
316
305
|
if self._manifest_tree is None:
|
|
317
|
-
self._manifest_tree = self.get_xml(self.
|
|
306
|
+
self._manifest_tree = self.get_xml(self.main_content.full_path)
|
|
318
307
|
return self._manifest_tree
|
|
319
308
|
|
|
320
309
|
def _manifest_items(self) -> list[etree._Element]:
|
|
321
310
|
manifest = self.manifest_tree()
|
|
322
|
-
|
|
323
|
-
return list(manifest.findall("./opf:manifest/opf:item", ns))
|
|
311
|
+
return list(manifest.findall("./opf:manifest/opf:item", OPF_NS))
|
|
324
312
|
|
|
325
313
|
@staticmethod
|
|
326
314
|
def _normalized_manifest_value(element: etree._Element) -> str:
|
|
@@ -339,52 +327,37 @@ class HwpxPackage:
|
|
|
339
327
|
|
|
340
328
|
def _resolve_spine_paths(self) -> list[str]:
|
|
341
329
|
if self._spine_cache is None:
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
if item_id and href:
|
|
349
|
-
manifest_items[item_id] = href
|
|
350
|
-
spine_paths: list[str] = []
|
|
351
|
-
for itemref in manifest.findall("./opf:spine/opf:itemref", ns):
|
|
352
|
-
idref = itemref.attrib.get("idref")
|
|
353
|
-
if not idref:
|
|
354
|
-
continue
|
|
355
|
-
href = manifest_items.get(idref)
|
|
356
|
-
if href:
|
|
357
|
-
spine_paths.append(href)
|
|
358
|
-
self._spine_cache = spine_paths
|
|
330
|
+
relationships = parse_manifest_relationships(
|
|
331
|
+
self.manifest_tree(),
|
|
332
|
+
self.main_content.full_path,
|
|
333
|
+
known_parts=self._files.keys(),
|
|
334
|
+
)
|
|
335
|
+
self._spine_cache = list(relationships.spine_paths)
|
|
359
336
|
return self._spine_cache
|
|
360
337
|
|
|
361
338
|
def section_paths(self) -> list[str]:
|
|
362
339
|
if self._section_paths_cache is None:
|
|
363
|
-
from pathlib import PurePosixPath
|
|
364
|
-
|
|
365
340
|
paths = [
|
|
366
341
|
path
|
|
367
342
|
for path in self._resolve_spine_paths()
|
|
368
|
-
if path and
|
|
343
|
+
if path and is_section_part_name(path)
|
|
369
344
|
]
|
|
370
345
|
if not paths:
|
|
371
346
|
logger.warning("manifest spine에서 section 경로를 찾지 못해 파일명 기반 fallback을 사용합니다.")
|
|
372
347
|
paths = [
|
|
373
348
|
name
|
|
374
349
|
for name in self._files.keys()
|
|
375
|
-
if
|
|
350
|
+
if is_section_part_name(name)
|
|
376
351
|
]
|
|
377
352
|
self._section_paths_cache = paths
|
|
378
353
|
return list(self._section_paths_cache)
|
|
379
354
|
|
|
380
355
|
def header_paths(self) -> list[str]:
|
|
381
356
|
if self._header_paths_cache is None:
|
|
382
|
-
from pathlib import PurePosixPath
|
|
383
|
-
|
|
384
357
|
paths = [
|
|
385
358
|
path
|
|
386
359
|
for path in self._resolve_spine_paths()
|
|
387
|
-
if path and
|
|
360
|
+
if path and is_header_part_name(path)
|
|
388
361
|
]
|
|
389
362
|
if not paths and self.has_part(self.HEADER_PATH):
|
|
390
363
|
logger.warning(
|
|
@@ -397,14 +370,13 @@ class HwpxPackage:
|
|
|
397
370
|
|
|
398
371
|
def master_page_paths(self) -> list[str]:
|
|
399
372
|
if self._master_page_paths_cache is None:
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
]
|
|
373
|
+
paths = list(
|
|
374
|
+
parse_manifest_relationships(
|
|
375
|
+
self.manifest_tree(),
|
|
376
|
+
self.main_content.full_path,
|
|
377
|
+
known_parts=self._files.keys(),
|
|
378
|
+
).master_page_paths
|
|
379
|
+
)
|
|
408
380
|
if not paths:
|
|
409
381
|
logger.warning("manifest에서 masterPage를 찾지 못해 파일명 탐색 fallback을 사용합니다.")
|
|
410
382
|
paths = [
|
|
@@ -418,13 +390,13 @@ class HwpxPackage:
|
|
|
418
390
|
|
|
419
391
|
def history_paths(self) -> list[str]:
|
|
420
392
|
if self._history_paths_cache is None:
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
393
|
+
paths = list(
|
|
394
|
+
parse_manifest_relationships(
|
|
395
|
+
self.manifest_tree(),
|
|
396
|
+
self.main_content.full_path,
|
|
397
|
+
known_parts=self._files.keys(),
|
|
398
|
+
).history_paths
|
|
399
|
+
)
|
|
428
400
|
if not paths:
|
|
429
401
|
logger.warning("manifest에서 history를 찾지 못해 파일명 탐색 fallback을 사용합니다.")
|
|
430
402
|
paths = [
|
|
@@ -437,13 +409,11 @@ class HwpxPackage:
|
|
|
437
409
|
|
|
438
410
|
def version_path(self) -> str | None:
|
|
439
411
|
if not self._version_path_cache_resolved:
|
|
440
|
-
path
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
path = href
|
|
446
|
-
break
|
|
412
|
+
path = parse_manifest_relationships(
|
|
413
|
+
self.manifest_tree(),
|
|
414
|
+
self.main_content.full_path,
|
|
415
|
+
known_parts=self._files.keys(),
|
|
416
|
+
).version_path
|
|
447
417
|
if path is None and self.has_part(self.VERSION_PATH):
|
|
448
418
|
logger.warning(
|
|
449
419
|
"manifest에서 version 파트를 찾지 못해 기본 경로 fallback을 사용합니다: %s",
|
|
@@ -461,8 +431,7 @@ class HwpxPackage:
|
|
|
461
431
|
def _manifest_element(self) -> etree._Element | None:
|
|
462
432
|
"""Return the ``<opf:manifest>`` element."""
|
|
463
433
|
manifest = self.manifest_tree()
|
|
464
|
-
|
|
465
|
-
return manifest.find("opf:manifest", ns)
|
|
434
|
+
return manifest.find("opf:manifest", OPF_NS)
|
|
466
435
|
|
|
467
436
|
def add_manifest_item(
|
|
468
437
|
self,
|
|
@@ -475,13 +444,12 @@ class HwpxPackage:
|
|
|
475
444
|
if manifest_el is None:
|
|
476
445
|
raise HwpxStructureError("Manifest does not contain an <opf:manifest> element.")
|
|
477
446
|
|
|
478
|
-
|
|
479
|
-
for existing in manifest_el.findall("opf:item", ns):
|
|
447
|
+
for existing in manifest_el.findall("opf:item", OPF_NS):
|
|
480
448
|
if existing.get("id") == item_id:
|
|
481
449
|
return # already present
|
|
482
450
|
|
|
483
451
|
new_item = manifest_el.makeelement(
|
|
484
|
-
f"{{{_OPF_NS}}}item",
|
|
452
|
+
f"{{{OPF_NS['opf']}}}item",
|
|
485
453
|
{"id": item_id, "href": href, "media-type": media_type},
|
|
486
454
|
)
|
|
487
455
|
manifest_el.append(new_item)
|
|
@@ -493,8 +461,7 @@ class HwpxPackage:
|
|
|
493
461
|
if manifest_el is None:
|
|
494
462
|
return False
|
|
495
463
|
|
|
496
|
-
|
|
497
|
-
for existing in manifest_el.findall("opf:item", ns):
|
|
464
|
+
for existing in manifest_el.findall("opf:item", OPF_NS):
|
|
498
465
|
if existing.get("id") == item_id:
|
|
499
466
|
manifest_el.remove(existing)
|
|
500
467
|
self._persist_manifest()
|
|
@@ -505,20 +472,18 @@ class HwpxPackage:
|
|
|
505
472
|
"""Write the in-memory manifest tree back to the package."""
|
|
506
473
|
tree = self._manifest_tree
|
|
507
474
|
if tree is not None:
|
|
508
|
-
self.set_part(self.
|
|
475
|
+
self.set_part(self.main_content.full_path, tree)
|
|
509
476
|
|
|
510
477
|
def _invalidate_caches(self, changed_path: str) -> None:
|
|
511
|
-
if changed_path
|
|
478
|
+
if changed_path in {self.CONTAINER_PATH, self.main_content.full_path}:
|
|
512
479
|
self._manifest_tree = None
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
elif changed_path == self.VERSION_PATH:
|
|
521
|
-
self._version_path_cache_resolved = False
|
|
480
|
+
self._spine_cache = None
|
|
481
|
+
self._section_paths_cache = None
|
|
482
|
+
self._header_paths_cache = None
|
|
483
|
+
self._master_page_paths_cache = None
|
|
484
|
+
self._history_paths_cache = None
|
|
485
|
+
self._version_path_cache = None
|
|
486
|
+
self._version_path_cache_resolved = False
|
|
522
487
|
|
|
523
488
|
def save(
|
|
524
489
|
self,
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
"""Helpers for resolving HWPX container and manifest relationships."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import PurePosixPath
|
|
7
|
+
from typing import Any, Collection, Iterable
|
|
8
|
+
|
|
9
|
+
CONTAINER_NAMESPACES = (
|
|
10
|
+
"urn:oasis:names:tc:opendocument:xmlns:container",
|
|
11
|
+
)
|
|
12
|
+
MAIN_ROOTFILE_MEDIA_TYPE = "application/hwpml-package+xml"
|
|
13
|
+
OPF_NS = {"opf": "http://www.idpf.org/2007/opf/"}
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"CONTAINER_NAMESPACES",
|
|
17
|
+
"MAIN_ROOTFILE_MEDIA_TYPE",
|
|
18
|
+
"OPF_NS",
|
|
19
|
+
"ManifestItemRef",
|
|
20
|
+
"ManifestRelationships",
|
|
21
|
+
"RootFileRef",
|
|
22
|
+
"is_header_part_name",
|
|
23
|
+
"is_section_part_name",
|
|
24
|
+
"normalize_part_name",
|
|
25
|
+
"parse_container_rootfiles",
|
|
26
|
+
"parse_manifest_relationships",
|
|
27
|
+
"resolve_part_name",
|
|
28
|
+
"select_main_rootfile",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass(frozen=True)
|
|
33
|
+
class RootFileRef:
|
|
34
|
+
full_path: str
|
|
35
|
+
media_type: str | None = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass(frozen=True)
|
|
39
|
+
class ManifestItemRef:
|
|
40
|
+
item_id: str | None
|
|
41
|
+
href: str
|
|
42
|
+
resolved_path: str
|
|
43
|
+
media_type: str | None = None
|
|
44
|
+
properties: str | None = None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass(frozen=True)
|
|
48
|
+
class ManifestRelationships:
|
|
49
|
+
manifest_path: str
|
|
50
|
+
items: tuple[ManifestItemRef, ...]
|
|
51
|
+
spine_paths: tuple[str, ...]
|
|
52
|
+
dangling_idrefs: tuple[str, ...]
|
|
53
|
+
header_paths: tuple[str, ...]
|
|
54
|
+
master_page_paths: tuple[str, ...]
|
|
55
|
+
history_paths: tuple[str, ...]
|
|
56
|
+
version_path: str | None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def normalize_part_name(path: str) -> str:
|
|
60
|
+
raw = path.replace("\\", "/").strip()
|
|
61
|
+
parts: list[str] = []
|
|
62
|
+
for part in PurePosixPath(raw).parts:
|
|
63
|
+
if part in {"", ".", "/"}:
|
|
64
|
+
continue
|
|
65
|
+
if part == "..":
|
|
66
|
+
if parts:
|
|
67
|
+
parts.pop()
|
|
68
|
+
continue
|
|
69
|
+
parts.append(part)
|
|
70
|
+
return "/".join(parts)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def resolve_part_name(
|
|
74
|
+
base_part: str,
|
|
75
|
+
href: str,
|
|
76
|
+
*,
|
|
77
|
+
known_parts: Collection[str] | None = None,
|
|
78
|
+
) -> str:
|
|
79
|
+
raw_href = href.replace("\\", "/").strip()
|
|
80
|
+
if not raw_href:
|
|
81
|
+
return ""
|
|
82
|
+
if raw_href.startswith("/"):
|
|
83
|
+
return normalize_part_name(raw_href)
|
|
84
|
+
base_dir = PurePosixPath(normalize_part_name(base_part)).parent
|
|
85
|
+
normalized_href = normalize_part_name(raw_href)
|
|
86
|
+
if known_parts is not None:
|
|
87
|
+
normalized_parts = {normalize_part_name(part) for part in known_parts}
|
|
88
|
+
if normalized_href in normalized_parts:
|
|
89
|
+
return normalized_href
|
|
90
|
+
relative_candidate = normalize_part_name(str(base_dir / raw_href))
|
|
91
|
+
if relative_candidate in normalized_parts:
|
|
92
|
+
return relative_candidate
|
|
93
|
+
base_dir_name = normalize_part_name(str(base_dir))
|
|
94
|
+
if base_dir_name and normalized_href.startswith(f"{base_dir_name}/"):
|
|
95
|
+
return normalized_href
|
|
96
|
+
return normalize_part_name(str(base_dir / raw_href))
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def parse_container_rootfiles(container_root: Any) -> tuple[RootFileRef, ...]:
|
|
100
|
+
rootfiles: list[RootFileRef] = []
|
|
101
|
+
seen: set[tuple[str, str | None]] = set()
|
|
102
|
+
candidates = list(container_root.findall(".//rootfile"))
|
|
103
|
+
for namespace in CONTAINER_NAMESPACES:
|
|
104
|
+
candidates.extend(container_root.findall(f".//{{{namespace}}}rootfile"))
|
|
105
|
+
|
|
106
|
+
for elem in candidates:
|
|
107
|
+
full_path = (
|
|
108
|
+
elem.get("full-path")
|
|
109
|
+
or elem.get("fullPath")
|
|
110
|
+
or elem.get("full_path")
|
|
111
|
+
)
|
|
112
|
+
if not full_path:
|
|
113
|
+
continue
|
|
114
|
+
media_type = (
|
|
115
|
+
elem.get("media-type")
|
|
116
|
+
or elem.get("mediaType")
|
|
117
|
+
or elem.get("media_type")
|
|
118
|
+
)
|
|
119
|
+
root = RootFileRef(
|
|
120
|
+
full_path=normalize_part_name(full_path),
|
|
121
|
+
media_type=media_type,
|
|
122
|
+
)
|
|
123
|
+
key = (root.full_path, root.media_type)
|
|
124
|
+
if key in seen:
|
|
125
|
+
continue
|
|
126
|
+
seen.add(key)
|
|
127
|
+
rootfiles.append(root)
|
|
128
|
+
return tuple(rootfiles)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def select_main_rootfile(rootfiles: Iterable[RootFileRef]) -> tuple[RootFileRef | None, bool]:
|
|
132
|
+
ordered = list(rootfiles)
|
|
133
|
+
if not ordered:
|
|
134
|
+
return None, False
|
|
135
|
+
for rootfile in ordered:
|
|
136
|
+
if rootfile.media_type == MAIN_ROOTFILE_MEDIA_TYPE:
|
|
137
|
+
return rootfile, False
|
|
138
|
+
return ordered[0], True
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def is_section_part_name(path: str) -> bool:
|
|
142
|
+
name = PurePosixPath(path).name.lower()
|
|
143
|
+
return name.startswith("section") and name.endswith(".xml")
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def is_header_part_name(path: str) -> bool:
|
|
147
|
+
name = PurePosixPath(path).name.lower()
|
|
148
|
+
return name.startswith("header") and name.endswith(".xml")
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _manifest_matches(item: ManifestItemRef, *candidates: str) -> bool:
|
|
152
|
+
haystack = " ".join(
|
|
153
|
+
part.lower()
|
|
154
|
+
for part in (
|
|
155
|
+
item.item_id or "",
|
|
156
|
+
item.href,
|
|
157
|
+
item.media_type or "",
|
|
158
|
+
item.properties or "",
|
|
159
|
+
)
|
|
160
|
+
if part
|
|
161
|
+
)
|
|
162
|
+
return any(candidate in haystack for candidate in candidates if candidate)
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def parse_manifest_relationships(
|
|
166
|
+
manifest_root: Any,
|
|
167
|
+
manifest_path: str,
|
|
168
|
+
*,
|
|
169
|
+
known_parts: Collection[str] | None = None,
|
|
170
|
+
) -> ManifestRelationships:
|
|
171
|
+
items: list[ManifestItemRef] = []
|
|
172
|
+
id_to_path: dict[str, str] = {}
|
|
173
|
+
|
|
174
|
+
for item in manifest_root.findall(".//opf:item", OPF_NS):
|
|
175
|
+
href = (item.get("href") or "").strip()
|
|
176
|
+
if not href:
|
|
177
|
+
continue
|
|
178
|
+
resolved_path = resolve_part_name(manifest_path, href, known_parts=known_parts)
|
|
179
|
+
item_ref = ManifestItemRef(
|
|
180
|
+
item_id=item.get("id"),
|
|
181
|
+
href=href,
|
|
182
|
+
resolved_path=resolved_path,
|
|
183
|
+
media_type=item.get("media-type"),
|
|
184
|
+
properties=item.get("properties"),
|
|
185
|
+
)
|
|
186
|
+
items.append(item_ref)
|
|
187
|
+
if item_ref.item_id:
|
|
188
|
+
id_to_path[item_ref.item_id] = resolved_path
|
|
189
|
+
|
|
190
|
+
spine_paths: list[str] = []
|
|
191
|
+
dangling_idrefs: list[str] = []
|
|
192
|
+
for itemref in manifest_root.findall(".//opf:itemref", OPF_NS):
|
|
193
|
+
idref = (itemref.get("idref") or "").strip()
|
|
194
|
+
if not idref:
|
|
195
|
+
continue
|
|
196
|
+
spine_path = id_to_path.get(idref)
|
|
197
|
+
if spine_path:
|
|
198
|
+
spine_paths.append(spine_path)
|
|
199
|
+
else:
|
|
200
|
+
dangling_idrefs.append(idref)
|
|
201
|
+
|
|
202
|
+
header_paths = tuple(path for path in spine_paths if is_header_part_name(path))
|
|
203
|
+
master_page_paths = tuple(
|
|
204
|
+
item.resolved_path
|
|
205
|
+
for item in items
|
|
206
|
+
if _manifest_matches(item, "masterpage", "master-page")
|
|
207
|
+
)
|
|
208
|
+
history_paths = tuple(
|
|
209
|
+
item.resolved_path
|
|
210
|
+
for item in items
|
|
211
|
+
if _manifest_matches(item, "history")
|
|
212
|
+
)
|
|
213
|
+
version_path = next(
|
|
214
|
+
(item.resolved_path for item in items if _manifest_matches(item, "version")),
|
|
215
|
+
None,
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
return ManifestRelationships(
|
|
219
|
+
manifest_path=normalize_part_name(manifest_path),
|
|
220
|
+
items=tuple(items),
|
|
221
|
+
spine_paths=tuple(spine_paths),
|
|
222
|
+
dangling_idrefs=tuple(dangling_idrefs),
|
|
223
|
+
header_paths=header_paths,
|
|
224
|
+
master_page_paths=master_page_paths,
|
|
225
|
+
history_paths=history_paths,
|
|
226
|
+
version_path=version_path,
|
|
227
|
+
)
|