python-hwpx 2.6__tar.gz → 2.7.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. {python_hwpx-2.6 → python_hwpx-2.7.1}/PKG-INFO +27 -3
  2. {python_hwpx-2.6 → python_hwpx-2.7.1}/README.md +26 -2
  3. {python_hwpx-2.6 → python_hwpx-2.7.1}/pyproject.toml +4 -1
  4. python_hwpx-2.7.1/src/hwpx/tools/archive_cli.py +337 -0
  5. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/package_validator.py +9 -7
  6. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/page_guard.py +64 -1
  7. python_hwpx-2.7.1/src/hwpx/tools/template_analyzer.py +218 -0
  8. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/python_hwpx.egg-info/PKG-INFO +27 -3
  9. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/python_hwpx.egg-info/SOURCES.txt +2 -0
  10. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/python_hwpx.egg-info/entry_points.txt +3 -0
  11. python_hwpx-2.7.1/tests/test_gap_closure_tools.py +221 -0
  12. python_hwpx-2.6/tests/test_gap_closure_tools.py +0 -160
  13. {python_hwpx-2.6 → python_hwpx-2.7.1}/LICENSE +0 -0
  14. {python_hwpx-2.6 → python_hwpx-2.7.1}/setup.cfg +0 -0
  15. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/__init__.py +0 -0
  16. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/data/Skeleton.hwpx +0 -0
  17. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/document.py +0 -0
  18. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/opc/package.py +0 -0
  19. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/opc/xml_utils.py +0 -0
  20. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/__init__.py +0 -0
  21. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/body.py +0 -0
  22. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/common.py +0 -0
  23. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/document.py +0 -0
  24. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/header.py +0 -0
  25. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/header_part.py +0 -0
  26. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/memo.py +0 -0
  27. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/namespaces.py +0 -0
  28. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/paragraph.py +0 -0
  29. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/parser.py +0 -0
  30. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/schema.py +0 -0
  31. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/section.py +0 -0
  32. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/table.py +0 -0
  33. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/oxml/utils.py +0 -0
  34. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/package.py +0 -0
  35. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/py.typed +0 -0
  36. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/templates.py +0 -0
  37. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/__init__.py +0 -0
  38. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/_schemas/header.xsd +0 -0
  39. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/_schemas/section.xsd +0 -0
  40. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/exporter.py +0 -0
  41. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/object_finder.py +0 -0
  42. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/text_extract_cli.py +0 -0
  43. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/text_extractor.py +0 -0
  44. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/hwpx/tools/validator.py +0 -0
  45. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/python_hwpx.egg-info/dependency_links.txt +0 -0
  46. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/python_hwpx.egg-info/requires.txt +0 -0
  47. {python_hwpx-2.6 → python_hwpx-2.7.1}/src/python_hwpx.egg-info/top_level.txt +0 -0
  48. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_coverage_targets.py +0 -0
  49. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_document_context_manager.py +0 -0
  50. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_document_formatting.py +0 -0
  51. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_document_save_api.py +0 -0
  52. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_inline_models.py +0 -0
  53. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_integration_hwpx_compatibility.py +0 -0
  54. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_integration_roundtrip.py +0 -0
  55. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_memo_and_style_editing.py +0 -0
  56. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_new_features.py +0 -0
  57. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_opc_package.py +0 -0
  58. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_oxml_parsing.py +0 -0
  59. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_packaging_py_typed.py +0 -0
  60. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_paragraph_section_management.py +0 -0
  61. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_repr_snapshots.py +0 -0
  62. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_section_headers.py +0 -0
  63. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_split_merged_cell.py +0 -0
  64. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_tables_default_border.py +0 -0
  65. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_text_extractor_annotations.py +0 -0
  66. {python_hwpx-2.6 → python_hwpx-2.7.1}/tests/test_version_metadata.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-hwpx
3
- Version: 2.6
3
+ Version: 2.7.1
4
4
  Summary: Hancom HWPX 패키지를 로드하고 편집하기 위한 Python 유틸리티 모음
5
5
  Author: python-hwpx Maintainers
6
6
  License: Non-Commercial License
@@ -165,7 +165,8 @@ doc.save_to_path("결과물.hwpx")
165
165
  | 🔎 **객체 검색** | 태그/속성/XPath | 특정 요소 탐색, 주석 이터레이터 |
166
166
  | 🎨 **스타일 치환** | 서식 기반 필터 | 색상/밑줄/charPrIDRef 기반 Run 검색 및 교체 |
167
167
  | 📤 **내보내기** | 텍스트/HTML/Markdown | 문서 변환 출력 |
168
- | ✅ **유효성 검사** | XSD 스키마 | CLI(`hwpx-validate`) 및 API |
168
+ | ✅ **유효성 검사** | XSD + 패키지 구조 | CLI(`hwpx-validate`, `hwpx-validate-package`) 및 API |
169
+ | 🧰 **워크플로 도구** | unpack/pack/template analyze/page guard | 템플릿 보존형 XML-first 작업 보조 |
169
170
  | 🏗️ **저수준 XML** | 데이터클래스 매핑 | OWPML 스키마 ↔ Python 객체 직접 조작 |
170
171
  | 🔄 **네임스페이스 호환** | 자동 정규화 | HWPML 2016 → 2011 자동 변환 |
171
172
 
@@ -262,10 +263,15 @@ python-hwpx
262
263
  │ ├── body.py # 타입이 지정된 본문 모델
263
264
  │ └── common.py # 범용 XML ↔ 데이터클래스
264
265
  ├── hwpx.tools
266
+ │ ├── archive_cli # unpack/pack CLI 및 재패킹 메타데이터
265
267
  │ ├── text_extractor # 텍스트 추출 파이프라인
268
+ │ ├── text_extract_cli # 텍스트 추출 CLI
266
269
  │ ├── object_finder # 객체 탐색 유틸리티
267
270
  │ ├── exporter # 텍스트/HTML/Markdown 내보내기
268
- └── validator # 스키마 유효성 검사 (hwpx-validate CLI)
271
+ │ ├── validator # 스키마 유효성 검사 (hwpx-validate CLI)
272
+ │ ├── package_validator# ZIP/OPC/HWPX 구조 검사
273
+ │ ├── page_guard # layout-drift proxy
274
+ │ └── template_analyzer# 레퍼런스 문서 분석/추출
269
275
  └── hwpx.templates # 내장 빈 문서 템플릿
270
276
  ```
271
277
 
@@ -274,8 +280,26 @@ python-hwpx
274
280
  ```bash
275
281
  # HWPX 문서 스키마 유효성 검사
276
282
  hwpx-validate 문서.hwpx
283
+
284
+ # ZIP/OPC/HWPX 패키지 구조 검사
285
+ hwpx-validate-package 문서.hwpx
286
+
287
+ # HWPX 풀기 / 다시 묶기
288
+ hwpx-unpack 문서.hwpx ./unpacked
289
+ hwpx-pack ./unpacked ./repacked.hwpx
290
+
291
+ # 레퍼런스 템플릿 분석과 파트 추출
292
+ hwpx-analyze-template 문서.hwpx --extract-dir ./template-parts --json
293
+
294
+ # plain / markdown 텍스트 추출
295
+ hwpx-text-extract 문서.hwpx --format markdown --output 문서.md
296
+
297
+ # 레이아웃 드리프트 프록시 비교
298
+ hwpx-page-guard --reference 원본.hwpx --output 결과.hwpx
277
299
  ```
278
300
 
301
+ `hwpx-page-guard`는 렌더된 실제 쪽수를 계산하지 않습니다. 대신 단락 수, 표 수, shape/control 수, 명시적 page/column break, 텍스트 길이 통계를 비교해 레이아웃 드리프트 위험을 탐지하는 프록시 도구입니다.
302
+
279
303
  ## 문서
280
304
 
281
305
  | | |
@@ -98,7 +98,8 @@ doc.save_to_path("결과물.hwpx")
98
98
  | 🔎 **객체 검색** | 태그/속성/XPath | 특정 요소 탐색, 주석 이터레이터 |
99
99
  | 🎨 **스타일 치환** | 서식 기반 필터 | 색상/밑줄/charPrIDRef 기반 Run 검색 및 교체 |
100
100
  | 📤 **내보내기** | 텍스트/HTML/Markdown | 문서 변환 출력 |
101
- | ✅ **유효성 검사** | XSD 스키마 | CLI(`hwpx-validate`) 및 API |
101
+ | ✅ **유효성 검사** | XSD + 패키지 구조 | CLI(`hwpx-validate`, `hwpx-validate-package`) 및 API |
102
+ | 🧰 **워크플로 도구** | unpack/pack/template analyze/page guard | 템플릿 보존형 XML-first 작업 보조 |
102
103
  | 🏗️ **저수준 XML** | 데이터클래스 매핑 | OWPML 스키마 ↔ Python 객체 직접 조작 |
103
104
  | 🔄 **네임스페이스 호환** | 자동 정규화 | HWPML 2016 → 2011 자동 변환 |
104
105
 
@@ -195,10 +196,15 @@ python-hwpx
195
196
  │ ├── body.py # 타입이 지정된 본문 모델
196
197
  │ └── common.py # 범용 XML ↔ 데이터클래스
197
198
  ├── hwpx.tools
199
+ │ ├── archive_cli # unpack/pack CLI 및 재패킹 메타데이터
198
200
  │ ├── text_extractor # 텍스트 추출 파이프라인
201
+ │ ├── text_extract_cli # 텍스트 추출 CLI
199
202
  │ ├── object_finder # 객체 탐색 유틸리티
200
203
  │ ├── exporter # 텍스트/HTML/Markdown 내보내기
201
- └── validator # 스키마 유효성 검사 (hwpx-validate CLI)
204
+ │ ├── validator # 스키마 유효성 검사 (hwpx-validate CLI)
205
+ │ ├── package_validator# ZIP/OPC/HWPX 구조 검사
206
+ │ ├── page_guard # layout-drift proxy
207
+ │ └── template_analyzer# 레퍼런스 문서 분석/추출
202
208
  └── hwpx.templates # 내장 빈 문서 템플릿
203
209
  ```
204
210
 
@@ -207,8 +213,26 @@ python-hwpx
207
213
  ```bash
208
214
  # HWPX 문서 스키마 유효성 검사
209
215
  hwpx-validate 문서.hwpx
216
+
217
+ # ZIP/OPC/HWPX 패키지 구조 검사
218
+ hwpx-validate-package 문서.hwpx
219
+
220
+ # HWPX 풀기 / 다시 묶기
221
+ hwpx-unpack 문서.hwpx ./unpacked
222
+ hwpx-pack ./unpacked ./repacked.hwpx
223
+
224
+ # 레퍼런스 템플릿 분석과 파트 추출
225
+ hwpx-analyze-template 문서.hwpx --extract-dir ./template-parts --json
226
+
227
+ # plain / markdown 텍스트 추출
228
+ hwpx-text-extract 문서.hwpx --format markdown --output 문서.md
229
+
230
+ # 레이아웃 드리프트 프록시 비교
231
+ hwpx-page-guard --reference 원본.hwpx --output 결과.hwpx
210
232
  ```
211
233
 
234
+ `hwpx-page-guard`는 렌더된 실제 쪽수를 계산하지 않습니다. 대신 단락 수, 표 수, shape/control 수, 명시적 page/column break, 텍스트 길이 통계를 비교해 레이아웃 드리프트 위험을 탐지하는 프록시 도구입니다.
235
+
212
236
  ## 문서
213
237
 
214
238
  | | |
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "python-hwpx"
7
- version = "2.6"
7
+ version = "2.7.1"
8
8
  description = "Hancom HWPX 패키지를 로드하고 편집하기 위한 Python 유틸리티 모음"
9
9
  readme = { file = "README.md", content-type = "text/markdown" }
10
10
  license = { file = "LICENSE" }
@@ -49,9 +49,12 @@ Documentation = "https://github.com/airmang/python-hwpx/tree/main/docs"
49
49
  Issues = "https://github.com/airmang/python-hwpx/issues"
50
50
 
51
51
  [project.scripts]
52
+ hwpx-unpack = "hwpx.tools.archive_cli:unpack_main"
53
+ hwpx-pack = "hwpx.tools.archive_cli:pack_main"
52
54
  hwpx-validate = "hwpx.tools.validator:main"
53
55
  hwpx-validate-package = "hwpx.tools.package_validator:main"
54
56
  hwpx-page-guard = "hwpx.tools.page_guard:main"
57
+ hwpx-analyze-template = "hwpx.tools.template_analyzer:main"
55
58
  hwpx-text-extract = "hwpx.tools.text_extract_cli:main"
56
59
 
57
60
  [tool.setuptools]
@@ -0,0 +1,337 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import json
5
+ import os
6
+ import shutil
7
+ import tempfile
8
+ from dataclasses import asdict, dataclass
9
+ from pathlib import Path
10
+ from typing import Sequence
11
+ from zipfile import ZIP_DEFLATED, ZIP_STORED, ZipFile
12
+
13
+ from lxml import etree
14
+
15
+ from .package_validator import validate_package
16
+
17
# File-name suffixes whose payloads are pretty-printed as XML while unpacking.
_XML_SUFFIXES = (".xml", ".hpf")
# Name of the JSON sidecar, stored in the unpacked root, that records the
# archive's entry order and compression types for a later re-pack.
_PACK_METADATA_NAME = ".hwpx-pack-metadata.json"

# Names exported by ``from hwpx.tools.archive_cli import *``.
__all__ = [
    "ArchiveEntryInfo",
    "UnpackResult",
    "PackResult",
    "pack_hwpx",
    "unpack_hwpx",
    "pack_main",
    "unpack_main",
    "main",
]
30
+
31
+
32
@dataclass(frozen=True)
class ArchiveEntryInfo:
    """Describe one file entry of an HWPX ZIP archive.

    Instances are created from ``ZipInfo`` records while unpacking and are
    serialized into the pack-metadata JSON file.
    """

    # Entry name inside the archive (taken from ``ZipInfo.filename``).
    path: str
    # ZIP compression method constant (taken from ``ZipInfo.compress_type``).
    compress_type: int
36
+
37
+
38
@dataclass(frozen=True)
class UnpackResult:
    """Outcome of :func:`unpack_hwpx`."""

    # Directory the archive entries were written into.
    output_dir: Path
    # Location of the metadata JSON file written next to the extracted files.
    metadata_path: Path
    # File entries found in the source archive, in archive order.
    entries: tuple[ArchiveEntryInfo, ...]
43
+
44
+
45
@dataclass(frozen=True)
class PackResult:
    """Outcome of :func:`pack_hwpx`."""

    # Path of the archive that was written.
    output_path: Path
    # Archive-relative paths in the order they were resolved for writing.
    entries: tuple[str, ...]
49
+
50
+
51
def _guard_destructive_target(path: Path) -> None:
    """Refuse recursive deletion of directories that must never be removed.

    Args:
        path: Directory that is about to be deleted recursively.

    Raises:
        ValueError: If *path* resolves to a filesystem root, to the current
            working directory, or to an ancestor of the current working
            directory.
    """
    resolved = path.resolve()
    if resolved == Path(resolved.anchor):
        raise ValueError(f"refusing to overwrite filesystem root: {resolved}")
    cwd = Path.cwd().resolve()
    if resolved == cwd:
        raise ValueError(f"refusing to overwrite current working directory: {resolved}")
    # Deleting an ancestor removes the working directory together with it,
    # so it is just as destructive as deleting the working directory itself.
    if resolved in cwd.parents:
        raise ValueError(
            "refusing to overwrite an ancestor of the current working directory: "
            f"{resolved}"
        )
57
+
58
+
59
def _prepare_output_dir(output_dir: Path, *, overwrite: bool) -> None:
    """Leave *output_dir* as an existing, empty directory.

    Args:
        output_dir: Directory that will receive unpacked files.
        overwrite: Whether a non-empty existing directory may be deleted.

    Raises:
        NotADirectoryError: If *output_dir* exists but is not a directory.
        FileExistsError: If *output_dir* is non-empty and *overwrite* is false.
    """
    if output_dir.exists():
        if not output_dir.is_dir():
            raise NotADirectoryError(f"output exists and is not a directory: {output_dir}")
        has_content = next(output_dir.iterdir(), None) is not None
        if has_content and not overwrite:
            raise FileExistsError(f"output directory is not empty: {output_dir}")
        if has_content:
            # Only a recursive delete needs the safety guard.
            _guard_destructive_target(output_dir)
            shutil.rmtree(output_dir)
        else:
            output_dir.rmdir()
    output_dir.mkdir(parents=True, exist_ok=True)
71
+
72
+
73
def _prepare_output_path(output_path: Path, *, overwrite: bool) -> None:
    """Create the parent directory of *output_path* and check for clobbering.

    Args:
        output_path: File that is about to be written.
        overwrite: Whether an existing file may be replaced.

    Raises:
        FileExistsError: If *output_path* exists and *overwrite* is false.
    """
    parent_dir = output_path.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    if overwrite:
        return
    if output_path.exists():
        raise FileExistsError(f"output file already exists: {output_path}")
77
+
78
+
79
def _format_xml_bytes(payload: bytes) -> bytes:
    """Return *payload* re-serialized as indented UTF-8 XML.

    The root element is parsed, re-indented and serialized with an XML
    declaration. Payloads that fail to parse are returned unchanged.
    """
    try:
        root = etree.fromstring(payload)
    except etree.XMLSyntaxError:
        # Not well-formed XML: hand the original bytes back untouched.
        return payload
    etree.indent(root, space=" ")
    serialize_options = {
        "pretty_print": True,
        "xml_declaration": True,
        "encoding": "UTF-8",
    }
    return etree.tostring(root, **serialize_options)
91
+
92
+
93
def _iter_file_entries(archive: ZipFile) -> tuple[ArchiveEntryInfo, ...]:
    """Return an entry record for every non-directory member of *archive*.

    Members are reported in the order given by ``ZipFile.infolist()``.
    """
    return tuple(
        ArchiveEntryInfo(path=member.filename, compress_type=member.compress_type)
        for member in archive.infolist()
        if not member.is_dir()
    )
100
+
101
+
102
def _metadata_path(root: Path) -> Path:
    """Return the location of the pack-metadata JSON file inside *root*."""
    return root.joinpath(_PACK_METADATA_NAME)
104
+
105
+
106
def _write_pack_metadata(root: Path, entries: tuple[ArchiveEntryInfo, ...]) -> Path:
    """Write the pack-metadata JSON file into *root* and return its path.

    The document stores a ``format_version`` of 1 and one object per entry
    with the entry's dataclass fields.
    """
    document = {
        "format_version": 1,
        "entries": list(map(asdict, entries)),
    }
    target = _metadata_path(root)
    target.write_text(json.dumps(document, indent=2), encoding="utf-8")
    return target
114
+
115
+
116
def _read_pack_metadata(root: Path) -> tuple[ArchiveEntryInfo, ...]:
    """Load the entry records stored in the pack-metadata file of *root*.

    Returns an empty tuple when the metadata file does not exist. Entries
    with an empty path are dropped, backslashes in paths are converted to
    forward slashes, and a missing ``compress_type`` defaults to
    ``ZIP_DEFLATED``.
    """
    source = _metadata_path(root)
    if not source.is_file():
        return ()

    document = json.loads(source.read_text(encoding="utf-8"))

    def _parsed():
        for raw in document.get("entries", []):
            name = str(raw.get("path", "")).strip()
            if name:
                yield ArchiveEntryInfo(
                    path=name.replace("\\", "/"),
                    compress_type=int(raw.get("compress_type", ZIP_DEFLATED)),
                )

    return tuple(_parsed())
134
+
135
+
136
def _discover_files(root: Path) -> set[str]:
    """Return the POSIX-style relative paths of all files below *root*.

    The pack-metadata file in the top level of *root* is excluded.
    """
    relative_paths = (
        candidate.relative_to(root).as_posix()
        for candidate in root.rglob("*")
        if candidate.is_file()
    )
    return {name for name in relative_paths if name != _PACK_METADATA_NAME}
146
+
147
+
148
def _resolve_write_order(paths: set[str], metadata: tuple[ArchiveEntryInfo, ...]) -> tuple[str, ...]:
    """Decide the order in which *paths* are written into the archive.

    ``mimetype`` comes first when present, followed by paths in the order the
    metadata lists them, followed by the remaining paths in sorted order.
    Each path appears exactly once; metadata entries absent from *paths* are
    ignored.
    """
    candidates: list[str] = []
    if "mimetype" in paths:
        candidates.append("mimetype")
    candidates.extend(entry.path for entry in metadata if entry.path in paths)
    candidates.extend(sorted(paths))
    # dict.fromkeys keeps the first occurrence of every path, in order.
    return tuple(dict.fromkeys(candidates))
168
+
169
+
170
def _summarize_pack_validation(output_path: Path) -> None:
    """Validate the packed archive and raise if the validator reports issues.

    Raises:
        ValueError: If the validation report is not ok. The message lists at
            most the first ten issues.
    """
    report = validate_package(output_path)
    if not report.ok:
        bullet_lines = [f"- {issue}" for issue in report.issues[:10]]
        summary = "\n".join(bullet_lines)
        raise ValueError(f"packed archive failed validation:\n{summary}")
176
+
177
+
178
def _safe_entry_target(destination: Path, entry_path: str) -> Path:
    """Return ``destination / entry_path`` after checking it stays inside *destination*.

    The check is lexical (``os.path.abspath`` normalizes ``..`` segments and
    absolute entry names), so it does not depend on the directory existing.

    Raises:
        ValueError: If the entry would be written outside *destination*.
    """
    root = Path(os.path.abspath(destination))
    candidate = Path(os.path.abspath(root / entry_path))
    if root not in candidate.parents:
        raise ValueError(f"archive entry escapes output directory: {entry_path!r}")
    return destination / entry_path


def unpack_hwpx(
    source: str | Path,
    output_dir: str | Path,
    *,
    overwrite: bool = False,
    pretty_xml: bool = True,
) -> UnpackResult:
    """Extract an HWPX archive into a directory and record pack metadata.

    Args:
        source: Path of the ``.hwpx`` file to unpack.
        output_dir: Directory that receives the extracted files.
        overwrite: Allow deleting an existing non-empty *output_dir*.
        pretty_xml: Re-indent payloads whose names end in an XML suffix.

    Returns:
        The output directory, the metadata file path and the archive entries.

    Raises:
        FileNotFoundError: If *source* is not an existing file.
        ValueError: If an archive entry would be written outside *output_dir*.
        NotADirectoryError, FileExistsError: See ``_prepare_output_dir``.
    """
    source_path = Path(source)
    if not source_path.is_file():
        raise FileNotFoundError(f"input file not found: {source_path}")

    destination = Path(output_dir)

    with ZipFile(source_path, "r") as archive:
        entries = _iter_file_entries(archive)
        # Entry names come from the archive and are untrusted: reject "../"
        # and absolute names before anything on disk is touched.
        targets = [_safe_entry_target(destination, entry.path) for entry in entries]

        # Prepare (and possibly wipe) the output directory only once the
        # archive has been opened and every entry has been vetted.
        _prepare_output_dir(destination, overwrite=overwrite)

        for entry, target in zip(entries, targets):
            data = archive.read(entry.path)
            if pretty_xml and entry.path.endswith(_XML_SUFFIXES):
                data = _format_xml_bytes(data)
            target.parent.mkdir(parents=True, exist_ok=True)
            target.write_bytes(data)

    metadata_path = _write_pack_metadata(destination, entries)
    return UnpackResult(output_dir=destination, metadata_path=metadata_path, entries=entries)
204
+
205
+
206
def pack_hwpx(
    input_dir: str | Path,
    output_path: str | Path,
    *,
    overwrite: bool = False,
) -> PackResult:
    """Pack an unpacked HWPX directory into an archive file.

    The archive is built in a temporary file next to *output_path*, validated,
    and then moved into place with ``os.replace``. ``mimetype`` is written
    first and uncompressed; other entries are stored uncompressed only when
    the pack metadata says so and are deflated otherwise.

    Args:
        input_dir: Directory holding the unpacked package.
        output_path: Path of the archive to create.
        overwrite: Allow replacing an existing *output_path*.

    Returns:
        The output path and the resolved entry order.

    Raises:
        FileNotFoundError: If *input_dir* is not a directory or has no
            ``mimetype`` file.
        FileExistsError: If *output_path* exists and *overwrite* is false.
        ValueError: If the packed archive fails validation.
    """
    source_root = Path(input_dir)
    if not source_root.is_dir():
        raise FileNotFoundError(f"input directory not found: {source_root}")

    target_file = Path(output_path)
    _prepare_output_path(target_file, overwrite=overwrite)

    available = _discover_files(source_root)
    if "mimetype" not in available:
        raise FileNotFoundError(f"missing required 'mimetype' file in {source_root}")

    recorded = _read_pack_metadata(source_root)
    recorded_methods = {item.path: item.compress_type for item in recorded}
    write_order = _resolve_write_order(available, recorded)

    handle, scratch_name = tempfile.mkstemp(dir=str(target_file.parent), suffix=".hwpx.tmp")
    os.close(handle)
    scratch = Path(scratch_name)
    try:
        with ZipFile(scratch, "w", ZIP_DEFLATED) as bundle:
            bundle.write(source_root / "mimetype", "mimetype", compress_type=ZIP_STORED)
            for member in (name for name in write_order if name != "mimetype"):
                # Anything that is not explicitly "stored" is deflated.
                if recorded_methods.get(member, ZIP_DEFLATED) == ZIP_STORED:
                    method = ZIP_STORED
                else:
                    method = ZIP_DEFLATED
                bundle.write(source_root / member, member, compress_type=method)

        _summarize_pack_validation(scratch)
        os.replace(scratch, target_file)
    except BaseException:
        # Best-effort removal of the scratch file; the original error wins.
        try:
            scratch.unlink(missing_ok=True)
        except OSError:
            pass
        raise

    return PackResult(output_path=target_file, entries=write_order)
251
+
252
+
253
def unpack_main(argv: Sequence[str] | None = None) -> int:
    """Run the unpack command line interface.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when unpacking raised an exception (the
        error is printed).
    """
    cli = argparse.ArgumentParser(description="Unpack an HWPX file into a directory")
    cli.add_argument("input", help="Input .hwpx path")
    cli.add_argument("output", help="Output directory")
    for flag, description in (
        ("--force", "Allow deleting an existing non-empty output directory"),
        ("--no-pretty-xml", "Keep XML payloads in their original byte formatting"),
    ):
        cli.add_argument(flag, action="store_true", help=description)
    options = cli.parse_args(argv)

    try:
        outcome = unpack_hwpx(
            options.input,
            options.output,
            overwrite=options.force,
            pretty_xml=not options.no_pretty_xml,
        )
    except Exception as exc:
        print(f"ERROR: {exc}")
        return 1
    else:
        print(f"Unpacked {options.input} -> {outcome.output_dir}")
        print(f"Recorded archive metadata at {outcome.metadata_path}")
        return 0
283
+
284
+
285
def pack_main(argv: Sequence[str] | None = None) -> int:
    """Run the pack command line interface.

    Args:
        argv: Argument list; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when packing raised an exception (the error
        is printed).
    """
    cli = argparse.ArgumentParser(description="Pack a directory into an HWPX archive")
    cli.add_argument("input", help="Input directory")
    cli.add_argument("output", help="Output .hwpx path")
    cli.add_argument(
        "--force",
        action="store_true",
        help="Allow replacing an existing output file",
    )
    options = cli.parse_args(argv)

    try:
        outcome = pack_hwpx(options.input, options.output, overwrite=options.force)
    except Exception as exc:
        print(f"ERROR: {exc}")
        return 1
    else:
        print(f"Packed {options.input} -> {outcome.output_path}")
        return 0
304
+
305
+
306
def main(argv: Sequence[str] | None = None) -> int:
    """Run the combined ``unpack`` / ``pack`` command line interface.

    The chosen sub-command's arguments are rebuilt into an argument list and
    forwarded to :func:`unpack_main` or :func:`pack_main`, whose return value
    is passed through.
    """
    top = argparse.ArgumentParser(description="HWPX archive utility helpers")
    commands = top.add_subparsers(dest="command", required=True)

    unpack_cmd = commands.add_parser("unpack", help="Unpack an HWPX file")
    for positional in ("input", "output"):
        unpack_cmd.add_argument(positional)
    unpack_cmd.add_argument("--force", action="store_true")
    unpack_cmd.add_argument("--no-pretty-xml", action="store_true")

    pack_cmd = commands.add_parser("pack", help="Pack a directory into HWPX")
    for positional in ("input", "output"):
        pack_cmd.add_argument(positional)
    pack_cmd.add_argument("--force", action="store_true")

    options = top.parse_args(argv)

    # Both sub-commands share the positional arguments and the --force flag.
    forwarded = [options.input, options.output]
    if options.force:
        forwarded.append("--force")

    if options.command != "unpack":
        return pack_main(forwarded)

    if options.no_pretty_xml:
        forwarded.append("--no-pretty-xml")
    return unpack_main(forwarded)
334
+
335
+
336
# Running the module directly dispatches to the combined unpack/pack CLI and
# uses its return value as the process exit status.
if __name__ == "__main__":  # pragma: no cover - CLI convenience
    raise SystemExit(main())
@@ -68,13 +68,15 @@ def _parse_xml(payload: bytes) -> ET.Element:
68
68
def _container_rootfiles(container_root: ET.Element) -> list[str]:
    """Collect the path attribute of every ``rootfile`` element.

    Each namespace in ``CONTAINER_NS`` is searched in turn. The path is read
    from ``full-path``, falling back to ``fullPath`` and then ``full_path``;
    elements without a non-empty path are skipped.
    """
    candidates = (
        elem.get("full-path") or elem.get("fullPath") or elem.get("full_path")
        for namespace in CONTAINER_NS.values()
        for elem in container_root.findall(f".//{{{namespace}}}rootfile")
    )
    return [path for path in candidates if path]
78
80
 
79
81
 
80
82
  def _manifest_hrefs(manifest_root: ET.Element) -> set[str]:
@@ -1,3 +1,9 @@
1
+ """Proxy checks for layout drift between a reference and an output HWPX.
2
+
3
+ This module does not calculate rendered page counts. It compares structural and
4
+ textual metrics that often correlate with page-layout drift.
5
+ """
6
+
1
7
  from __future__ import annotations
2
8
 
3
9
  import argparse
@@ -16,6 +22,22 @@ NS = {
16
22
  "opf": "http://www.idpf.org/2007/opf/",
17
23
  }
18
24
 
25
# Namespace-stripped element names that are tallied as shapes when document
# metrics are collected.
_SHAPE_TAGS = {
    "line",
    "rect",
    "ellipse",
    "arc",
    "polygon",
    "curve",
    "connectLine",
    "textart",
    "pic",
    "compose",
    "equation",
    "ole",
    "container",
}
40
+
19
41
  __all__ = [
20
42
  "DocumentMetrics",
21
43
  "collect_metrics",
@@ -31,7 +53,11 @@ class DocumentMetrics:
31
53
  page_break_count: int
32
54
  column_break_count: int
33
55
  table_count: int
56
+ shape_count: int
57
+ control_count: int
34
58
  table_shapes: list[tuple[str, str, str, str, str, str]]
59
+ shape_types: list[tuple[str, int]]
60
+ control_types: list[tuple[str, int]]
35
61
  text_char_total: int
36
62
  text_char_total_nospace: int
37
63
  paragraph_text_lengths: list[int]
@@ -66,6 +92,12 @@ def _text_of_t_node(node: etree._Element) -> str:
66
92
  return "".join(node.itertext())
67
93
 
68
94
 
95
def _local_name(tag: str) -> str:
    """Return *tag* without its leading ``{namespace}`` part.

    Args:
        tag: An element tag, usually in ``{namespace}local`` form.

    Returns:
        The text after the first ``}``, the tag itself when it contains no
        ``}``, or an empty string when *tag* is not a string.
    """
    # lxml exposes comments and processing instructions with a non-string
    # ``tag``; iterating a tree without a tag filter yields those nodes too,
    # and a membership test on them would raise TypeError.
    if not isinstance(tag, str):
        return ""
    _, separator, local = tag.partition("}")
    return local if separator else tag
99
+
100
+
69
101
  def _iter_section_roots(source: str | Path | bytes | BinaryIO) -> Iterable[etree._Element]:
70
102
  if isinstance(source, bytes):
71
103
  archive = ZipFile(io.BytesIO(source), "r")
@@ -85,6 +117,8 @@ def collect_metrics(source: str | Path | bytes | BinaryIO) -> DocumentMetrics:
85
117
  paragraphs: list[etree._Element] = []
86
118
  tables: list[etree._Element] = []
87
119
  table_shapes: list[tuple[str, str, str, str, str, str]] = []
120
+ shape_types: dict[str, int] = {}
121
+ control_types: dict[str, int] = {}
88
122
  paragraph_text_lengths: list[int] = []
89
123
  text_char_total = 0
90
124
  text_char_total_nospace = 0
@@ -100,6 +134,19 @@ def collect_metrics(source: str | Path | bytes | BinaryIO) -> DocumentMetrics:
100
134
  section_tables = root.xpath(".//hp:tbl", namespaces=NS)
101
135
  tables.extend(section_tables)
102
136
 
137
+ for element in root.iter():
138
+ name = _local_name(element.tag)
139
+ if name in _SHAPE_TAGS:
140
+ shape_types[name] = shape_types.get(name, 0) + 1
141
+ if name == "ctrl":
142
+ control_counted = False
143
+ for child in element:
144
+ child_name = _local_name(child.tag)
145
+ control_types[child_name] = control_types.get(child_name, 0) + 1
146
+ control_counted = True
147
+ if not control_counted:
148
+ control_types["ctrl"] = control_types.get("ctrl", 0) + 1
149
+
103
150
  for table in section_tables:
104
151
  size = table.find("hp:sz", namespaces=NS)
105
152
  table_shapes.append(
@@ -132,7 +179,11 @@ def collect_metrics(source: str | Path | bytes | BinaryIO) -> DocumentMetrics:
132
179
  page_break_count=page_break_count,
133
180
  column_break_count=column_break_count,
134
181
  table_count=len(tables),
182
+ shape_count=sum(shape_types.values()),
183
+ control_count=sum(control_types.values()),
135
184
  table_shapes=table_shapes,
185
+ shape_types=sorted(shape_types.items()),
186
+ control_types=sorted(control_types.items()),
136
187
  text_char_total=text_char_total,
137
188
  text_char_total_nospace=text_char_total_nospace,
138
189
  paragraph_text_lengths=paragraph_text_lengths,
@@ -173,8 +224,18 @@ def compare_metrics(
173
224
  )
174
225
  if reference.table_count != output.table_count:
175
226
  errors.append(f"table count mismatch: ref={reference.table_count}, out={output.table_count}")
227
+ if reference.shape_count != output.shape_count:
228
+ errors.append(f"shape count mismatch: ref={reference.shape_count}, out={output.shape_count}")
229
+ if reference.control_count != output.control_count:
230
+ errors.append(
231
+ f"control count mismatch: ref={reference.control_count}, out={output.control_count}"
232
+ )
176
233
  if reference.table_shapes != output.table_shapes:
177
234
  errors.append("table shape mismatch (rowCnt/colCnt/width/height/repeatHeader/pageBreak)")
235
+ if reference.shape_types != output.shape_types:
236
+ errors.append("shape type histogram mismatch")
237
+ if reference.control_types != output.control_types:
238
+ errors.append("control type histogram mismatch")
178
239
 
179
240
  text_delta = _ratio_delta(reference.text_char_total_nospace, output.text_char_total_nospace)
180
241
  if text_delta > max_text_delta_ratio:
@@ -202,7 +263,9 @@ def compare_metrics(
202
263
 
203
264
 
204
265
  def main(argv: Sequence[str] | None = None) -> int:
205
- parser = argparse.ArgumentParser(description="Reference-vs-output HWPX page drift guard")
266
+ parser = argparse.ArgumentParser(
267
+ description="Reference-vs-output HWPX layout-drift proxy checker"
268
+ )
206
269
  parser.add_argument("--reference", "-r", required=True, help="Reference HWPX path")
207
270
  parser.add_argument("--output", "-o", required=True, help="Output HWPX path")
208
271
  parser.add_argument("--max-text-delta-ratio", type=float, default=0.15)