python-hwpx 2.4__tar.gz → 2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. {python_hwpx-2.4 → python_hwpx-2.6}/PKG-INFO +1 -1
  2. {python_hwpx-2.4 → python_hwpx-2.6}/pyproject.toml +4 -1
  3. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/document.py +9 -4
  4. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/document.py +6 -1
  5. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/tools/__init__.py +16 -0
  6. python_hwpx-2.6/src/hwpx/tools/package_validator.py +217 -0
  7. python_hwpx-2.6/src/hwpx/tools/page_guard.py +242 -0
  8. python_hwpx-2.6/src/hwpx/tools/text_extract_cli.py +66 -0
  9. {python_hwpx-2.4 → python_hwpx-2.6}/src/python_hwpx.egg-info/PKG-INFO +1 -1
  10. {python_hwpx-2.4 → python_hwpx-2.6}/src/python_hwpx.egg-info/SOURCES.txt +5 -0
  11. python_hwpx-2.6/src/python_hwpx.egg-info/entry_points.txt +5 -0
  12. python_hwpx-2.6/tests/test_gap_closure_tools.py +160 -0
  13. python_hwpx-2.6/tests/test_split_merged_cell.py +185 -0
  14. python_hwpx-2.4/src/python_hwpx.egg-info/entry_points.txt +0 -2
  15. {python_hwpx-2.4 → python_hwpx-2.6}/LICENSE +0 -0
  16. {python_hwpx-2.4 → python_hwpx-2.6}/README.md +0 -0
  17. {python_hwpx-2.4 → python_hwpx-2.6}/setup.cfg +0 -0
  18. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/__init__.py +0 -0
  19. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/data/Skeleton.hwpx +0 -0
  20. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/opc/package.py +0 -0
  21. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/opc/xml_utils.py +0 -0
  22. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/__init__.py +0 -0
  23. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/body.py +0 -0
  24. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/common.py +0 -0
  25. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/header.py +0 -0
  26. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/header_part.py +0 -0
  27. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/memo.py +0 -0
  28. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/namespaces.py +0 -0
  29. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/paragraph.py +0 -0
  30. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/parser.py +0 -0
  31. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/schema.py +0 -0
  32. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/section.py +0 -0
  33. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/table.py +0 -0
  34. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/oxml/utils.py +0 -0
  35. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/package.py +0 -0
  36. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/py.typed +0 -0
  37. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/templates.py +0 -0
  38. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/tools/_schemas/header.xsd +0 -0
  39. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/tools/_schemas/section.xsd +0 -0
  40. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/tools/exporter.py +0 -0
  41. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/tools/object_finder.py +0 -0
  42. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/tools/text_extractor.py +0 -0
  43. {python_hwpx-2.4 → python_hwpx-2.6}/src/hwpx/tools/validator.py +0 -0
  44. {python_hwpx-2.4 → python_hwpx-2.6}/src/python_hwpx.egg-info/dependency_links.txt +0 -0
  45. {python_hwpx-2.4 → python_hwpx-2.6}/src/python_hwpx.egg-info/requires.txt +0 -0
  46. {python_hwpx-2.4 → python_hwpx-2.6}/src/python_hwpx.egg-info/top_level.txt +0 -0
  47. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_coverage_targets.py +0 -0
  48. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_document_context_manager.py +0 -0
  49. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_document_formatting.py +0 -0
  50. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_document_save_api.py +0 -0
  51. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_inline_models.py +0 -0
  52. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_integration_hwpx_compatibility.py +0 -0
  53. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_integration_roundtrip.py +0 -0
  54. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_memo_and_style_editing.py +0 -0
  55. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_new_features.py +0 -0
  56. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_opc_package.py +0 -0
  57. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_oxml_parsing.py +0 -0
  58. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_packaging_py_typed.py +0 -0
  59. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_paragraph_section_management.py +0 -0
  60. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_repr_snapshots.py +0 -0
  61. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_section_headers.py +0 -0
  62. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_tables_default_border.py +0 -0
  63. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_text_extractor_annotations.py +0 -0
  64. {python_hwpx-2.4 → python_hwpx-2.6}/tests/test_version_metadata.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-hwpx
3
- Version: 2.4
3
+ Version: 2.6
4
4
  Summary: Hancom HWPX 패키지를 로드하고 편집하기 위한 Python 유틸리티 모음
5
5
  Author: python-hwpx Maintainers
6
6
  License: Non-Commercial License
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "python-hwpx"
7
- version = "2.4"
7
+ version = "2.6"
8
8
  description = "Hancom HWPX 패키지를 로드하고 편집하기 위한 Python 유틸리티 모음"
9
9
  readme = { file = "README.md", content-type = "text/markdown" }
10
10
  license = { file = "LICENSE" }
@@ -50,6 +50,9 @@ Issues = "https://github.com/airmang/python-hwpx/issues"
50
50
 
51
51
  [project.scripts]
52
52
  hwpx-validate = "hwpx.tools.validator:main"
53
+ hwpx-validate-package = "hwpx.tools.package_validator:main"
54
+ hwpx-page-guard = "hwpx.tools.page_guard:main"
55
+ hwpx-text-extract = "hwpx.tools.text_extract_cli:main"
53
56
 
54
57
  [tool.setuptools]
55
58
  package-dir = { "" = "src" }
@@ -1280,7 +1280,7 @@ class HwpxDocument:
1280
1280
  """
1281
1281
  from .tools.validator import validate_document
1282
1282
 
1283
- return validate_document(self._to_bytes_raw())
1283
+ return validate_document(self._to_bytes_raw(reset_dirty=False))
1284
1284
 
1285
1285
  def _run_pre_save_validation(self) -> None:
1286
1286
  """Raise if validate_on_save is enabled and the document is invalid."""
@@ -1318,11 +1318,16 @@ class HwpxDocument:
1318
1318
  self._run_pre_save_validation()
1319
1319
  return self._to_bytes_raw()
1320
1320
 
1321
- def _to_bytes_raw(self) -> bytes:
1322
- """Serialize without validation (used by :meth:`validate`)."""
1321
+ def _to_bytes_raw(self, *, reset_dirty: bool = True) -> bytes:
1322
+ """Serialize without validation.
1323
+
1324
+ When ``reset_dirty`` is ``False``, the document remains marked as
1325
+ modified after the archive snapshot is generated.
1326
+ """
1323
1327
  updates = self._root.serialize()
1324
1328
  result = self._package.save(None, updates)
1325
- self._root.reset_dirty()
1329
+ if reset_dirty:
1330
+ self._root.reset_dirty()
1326
1331
  if isinstance(result, bytes):
1327
1332
  return result
1328
1333
  raise TypeError("package.save(None) must return bytes")
@@ -2541,7 +2541,12 @@ class HwpxOxmlTable:
2541
2541
  existing_target.set_size(col_width, row_height)
2542
2542
  continue
2543
2543
 
2544
- new_cell_element = ET.Element(f"{_HP}tc", dict(template_attrs))
2544
+ # Use makeelement() so the new cell matches the XML engine
2545
+ # of the existing tree (stdlib ET or lxml). ET.Element()
2546
+ # always produces stdlib elements which cannot be appended to
2547
+ # an lxml tree (and vice-versa), causing TypeError at runtime
2548
+ # when splitting cells in documents parsed via lxml.
2549
+ new_cell_element = row_element.makeelement(f"{_HP}tc", dict(template_attrs))
2545
2550
  for child in preserved_children:
2546
2551
  new_cell_element.append(deepcopy(child))
2547
2552
 
@@ -6,6 +6,16 @@ from .exporter import (
6
6
  export_text,
7
7
  )
8
8
  from .object_finder import FoundElement, ObjectFinder
9
+ from .package_validator import (
10
+ PackageValidationIssue,
11
+ PackageValidationReport,
12
+ validate_package,
13
+ )
14
+ from .page_guard import (
15
+ DocumentMetrics,
16
+ collect_metrics,
17
+ compare_metrics,
18
+ )
9
19
  from .text_extractor import (
10
20
  DEFAULT_NAMESPACES,
11
21
  ParagraphInfo,
@@ -33,6 +43,12 @@ __all__ = [
33
43
  "strip_namespace",
34
44
  "FoundElement",
35
45
  "ObjectFinder",
46
+ "PackageValidationIssue",
47
+ "PackageValidationReport",
48
+ "validate_package",
49
+ "DocumentMetrics",
50
+ "collect_metrics",
51
+ "compare_metrics",
36
52
  "DocumentSchemas",
37
53
  "ValidationIssue",
38
54
  "ValidationReport",
@@ -0,0 +1,217 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import io
5
+ import xml.etree.ElementTree as ET
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import BinaryIO, Sequence
9
+ from zipfile import ZIP_STORED, BadZipFile, ZipFile
10
+
11
+ EXPECTED_MIMETYPE = "application/hwp+zip"
12
+ CONTAINER_PATH = "META-INF/container.xml"
13
+ MANIFEST_PATH = "Contents/content.hpf"
14
+ HEADER_PATH = "Contents/header.xml"
15
+ VERSION_PATH = "version.xml"
16
+ REQUIRED_CORE_FILES = ("mimetype", CONTAINER_PATH, MANIFEST_PATH, HEADER_PATH, VERSION_PATH)
17
+ OPF_NS = {"opf": "http://www.idpf.org/2007/opf/"}
18
+ CONTAINER_NS = {
19
+ "ct": "urn:oasis:names:tc:opendocument:xmlns:container",
20
+ "ocf": "urn:oasis:names:tc:opendocument:xmlns:container",
21
+ }
22
+
23
+ __all__ = [
24
+ "PackageValidationIssue",
25
+ "PackageValidationReport",
26
+ "validate_package",
27
+ "main",
28
+ ]
29
+
30
+
31
+ @dataclass(frozen=True)
32
+ class PackageValidationIssue:
33
+ part_name: str
34
+ message: str
35
+
36
+ def __str__(self) -> str: # pragma: no cover - human readable helper
37
+ return f"{self.part_name}: {self.message}"
38
+
39
+
40
+ @dataclass(frozen=True)
41
+ class PackageValidationReport:
42
+ checked_parts: tuple[str, ...]
43
+ issues: tuple[PackageValidationIssue, ...]
44
+
45
+ @property
46
+ def ok(self) -> bool:
47
+ return not self.issues
48
+
49
+ def __bool__(self) -> bool: # pragma: no cover - convenience alias
50
+ return self.ok
51
+
52
+
53
+ def _open_zip(source: str | Path | bytes | BinaryIO) -> ZipFile:
54
+ if isinstance(source, (str, Path)):
55
+ return ZipFile(source, "r")
56
+ if isinstance(source, bytes):
57
+ return ZipFile(io.BytesIO(source), "r")
58
+ return ZipFile(source, "r")
59
+
60
+
61
+ def _parse_xml(payload: bytes) -> ET.Element:
62
+ try:
63
+ return ET.fromstring(payload)
64
+ except ET.ParseError as exc:
65
+ raise ValueError(f"malformed XML: {exc}") from exc
66
+
67
+
68
+ def _container_rootfiles(container_root: ET.Element) -> list[str]:
69
+ paths: list[str] = []
70
+ for namespace in CONTAINER_NS.values():
71
+ paths.extend(
72
+ elem.get("full-path")
73
+ or elem.get("fullPath")
74
+ or elem.get("full_path")
75
+ for elem in container_root.findall(f".//{{{namespace}}}rootfile")
76
+ )
77
+ return [path for path in paths if path]
78
+
79
+
80
+ def _manifest_hrefs(manifest_root: ET.Element) -> set[str]:
81
+ hrefs: set[str] = set()
82
+ for item in manifest_root.findall(".//opf:item", OPF_NS):
83
+ href = item.get("href")
84
+ if href:
85
+ hrefs.add(href)
86
+ return hrefs
87
+
88
+
89
+ def _spine_hrefs(manifest_root: ET.Element) -> list[str]:
90
+ hrefs: list[str] = []
91
+ id_to_href: dict[str, str] = {}
92
+ for item in manifest_root.findall(".//opf:item", OPF_NS):
93
+ item_id = item.get("id")
94
+ href = item.get("href")
95
+ if item_id and href:
96
+ id_to_href[item_id] = href
97
+
98
+ for itemref in manifest_root.findall(".//opf:itemref", OPF_NS):
99
+ idref = itemref.get("idref")
100
+ if idref and idref in id_to_href:
101
+ hrefs.append(id_to_href[idref])
102
+ return hrefs
103
+
104
+
105
+ def validate_package(source: str | Path | bytes | BinaryIO) -> PackageValidationReport:
106
+ checked_parts: list[str] = []
107
+ issues: list[PackageValidationIssue] = []
108
+
109
+ try:
110
+ archive = _open_zip(source)
111
+ except BadZipFile:
112
+ return PackageValidationReport(
113
+ checked_parts=(),
114
+ issues=(PackageValidationIssue("archive", "not a valid ZIP archive"),),
115
+ )
116
+
117
+ with archive as zf:
118
+ names = zf.namelist()
119
+ checked_parts.extend(names)
120
+
121
+ for required in REQUIRED_CORE_FILES:
122
+ if required not in names:
123
+ issues.append(PackageValidationIssue(required, "missing required file"))
124
+
125
+ if not names:
126
+ issues.append(PackageValidationIssue("archive", "empty archive"))
127
+ return PackageValidationReport(tuple(checked_parts), tuple(issues))
128
+
129
+ if "mimetype" in names:
130
+ try:
131
+ mimetype = zf.read("mimetype").decode("utf-8").strip()
132
+ except UnicodeDecodeError:
133
+ mimetype = "<binary>"
134
+ if mimetype != EXPECTED_MIMETYPE:
135
+ issues.append(
136
+ PackageValidationIssue(
137
+ "mimetype",
138
+ f"expected {EXPECTED_MIMETYPE!r}, got {mimetype!r}",
139
+ )
140
+ )
141
+ if names[0] != "mimetype":
142
+ issues.append(PackageValidationIssue("mimetype", "must be the first ZIP entry"))
143
+ if zf.getinfo("mimetype").compress_type != ZIP_STORED:
144
+ issues.append(PackageValidationIssue("mimetype", "must use ZIP_STORED"))
145
+
146
+ xml_roots: dict[str, ET.Element] = {}
147
+ for name in names:
148
+ if not (name.endswith(".xml") or name.endswith(".hpf")):
149
+ continue
150
+ try:
151
+ xml_roots[name] = _parse_xml(zf.read(name))
152
+ except ValueError as exc:
153
+ issues.append(PackageValidationIssue(name, str(exc)))
154
+
155
+ container_root = xml_roots.get(CONTAINER_PATH)
156
+ if container_root is not None:
157
+ rootfiles = _container_rootfiles(container_root)
158
+ if not rootfiles:
159
+ issues.append(PackageValidationIssue(CONTAINER_PATH, "declares no rootfile entries"))
160
+ for rootfile in rootfiles:
161
+ if rootfile not in names:
162
+ issues.append(
163
+ PackageValidationIssue(
164
+ CONTAINER_PATH,
165
+ f"rootfile points to missing part {rootfile!r}",
166
+ )
167
+ )
168
+
169
+ manifest_root = xml_roots.get(MANIFEST_PATH)
170
+ if manifest_root is not None:
171
+ hrefs = _manifest_hrefs(manifest_root)
172
+ for href in sorted(hrefs):
173
+ if href not in names:
174
+ issues.append(
175
+ PackageValidationIssue(
176
+ MANIFEST_PATH,
177
+ f"manifest href missing from archive: {href}",
178
+ )
179
+ )
180
+
181
+ spine_hrefs = _spine_hrefs(manifest_root)
182
+ if not spine_hrefs:
183
+ issues.append(PackageValidationIssue(MANIFEST_PATH, "spine declares no section parts"))
184
+ for href in spine_hrefs:
185
+ if href not in names:
186
+ issues.append(
187
+ PackageValidationIssue(
188
+ MANIFEST_PATH,
189
+ f"spine item missing from archive: {href}",
190
+ )
191
+ )
192
+
193
+ if HEADER_PATH in names and HEADER_PATH not in hrefs:
194
+ issues.append(
195
+ PackageValidationIssue(MANIFEST_PATH, "header.xml is not referenced in manifest")
196
+ )
197
+
198
+ return PackageValidationReport(tuple(checked_parts), tuple(issues))
199
+
200
+
201
+ def main(argv: Sequence[str] | None = None) -> int:
202
+ parser = argparse.ArgumentParser(description="Validate HWPX package structure")
203
+ parser.add_argument("source", help="Path to the HWPX file")
204
+ args = parser.parse_args(argv)
205
+
206
+ report = validate_package(args.source)
207
+ if report.issues:
208
+ for issue in report.issues:
209
+ print(f"ERROR: {issue}")
210
+ return 1
211
+
212
+ print("All package validations passed.")
213
+ return 0
214
+
215
+
216
+ if __name__ == "__main__": # pragma: no cover - CLI convenience
217
+ raise SystemExit(main())
@@ -0,0 +1,242 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import io
5
+ import json
6
+ from dataclasses import asdict, dataclass
7
+ from pathlib import Path
8
+ from typing import BinaryIO, Iterable, Sequence
9
+ from zipfile import ZipFile
10
+
11
+ from lxml import etree
12
+
13
+ NS = {
14
+ "hp": "http://www.hancom.co.kr/hwpml/2011/paragraph",
15
+ "hs": "http://www.hancom.co.kr/hwpml/2011/section",
16
+ "opf": "http://www.idpf.org/2007/opf/",
17
+ }
18
+
19
+ __all__ = [
20
+ "DocumentMetrics",
21
+ "collect_metrics",
22
+ "compare_metrics",
23
+ "main",
24
+ ]
25
+
26
+
27
+ @dataclass(frozen=True)
28
+ class DocumentMetrics:
29
+ section_count: int
30
+ paragraph_count: int
31
+ page_break_count: int
32
+ column_break_count: int
33
+ table_count: int
34
+ table_shapes: list[tuple[str, str, str, str, str, str]]
35
+ text_char_total: int
36
+ text_char_total_nospace: int
37
+ paragraph_text_lengths: list[int]
38
+
39
+
40
+ def _section_files(zf: ZipFile) -> list[str]:
41
+ try:
42
+ root = etree.fromstring(zf.read("Contents/content.hpf"))
43
+ except KeyError:
44
+ return [
45
+ name
46
+ for name in zf.namelist()
47
+ if name.startswith("Contents/section") and name.endswith(".xml")
48
+ ]
49
+
50
+ id_to_href: dict[str, str] = {}
51
+ for item in root.findall(".//opf:item", namespaces=NS):
52
+ item_id = item.get("id")
53
+ href = item.get("href")
54
+ if item_id and href:
55
+ id_to_href[item_id] = href
56
+
57
+ files: list[str] = []
58
+ for itemref in root.findall(".//opf:itemref", namespaces=NS):
59
+ idref = itemref.get("idref")
60
+ if idref and idref in id_to_href:
61
+ files.append(id_to_href[idref])
62
+ return files
63
+
64
+
65
+ def _text_of_t_node(node: etree._Element) -> str:
66
+ return "".join(node.itertext())
67
+
68
+
69
+ def _iter_section_roots(source: str | Path | bytes | BinaryIO) -> Iterable[etree._Element]:
70
+ if isinstance(source, bytes):
71
+ archive = ZipFile(io.BytesIO(source), "r")
72
+ else:
73
+ archive = ZipFile(source, "r")
74
+
75
+ try:
76
+ for name in _section_files(archive):
77
+ yield etree.fromstring(archive.read(name))
78
+ finally:
79
+ archive.close()
80
+
81
+
82
+ def collect_metrics(source: str | Path | bytes | BinaryIO) -> DocumentMetrics:
83
+ section_roots = list(_iter_section_roots(source))
84
+
85
+ paragraphs: list[etree._Element] = []
86
+ tables: list[etree._Element] = []
87
+ table_shapes: list[tuple[str, str, str, str, str, str]] = []
88
+ paragraph_text_lengths: list[int] = []
89
+ text_char_total = 0
90
+ text_char_total_nospace = 0
91
+ page_break_count = 0
92
+ column_break_count = 0
93
+
94
+ for root in section_roots:
95
+ section_paragraphs = root.xpath(".//hs:sec/hp:p", namespaces=NS)
96
+ if not section_paragraphs:
97
+ section_paragraphs = root.xpath(".//hp:p", namespaces=NS)
98
+ paragraphs.extend(section_paragraphs)
99
+
100
+ section_tables = root.xpath(".//hp:tbl", namespaces=NS)
101
+ tables.extend(section_tables)
102
+
103
+ for table in section_tables:
104
+ size = table.find("hp:sz", namespaces=NS)
105
+ table_shapes.append(
106
+ (
107
+ table.get("rowCnt", ""),
108
+ table.get("colCnt", ""),
109
+ size.get("width", "") if size is not None else "",
110
+ size.get("height", "") if size is not None else "",
111
+ table.get("repeatHeader", ""),
112
+ table.get("pageBreak", ""),
113
+ )
114
+ )
115
+
116
+ for paragraph in section_paragraphs:
117
+ if paragraph.get("pageBreak") == "1":
118
+ page_break_count += 1
119
+ if paragraph.get("columnBreak") == "1":
120
+ column_break_count += 1
121
+ paragraph_length = 0
122
+ for text_node in paragraph.xpath(".//hp:t", namespaces=NS):
123
+ text = _text_of_t_node(text_node)
124
+ paragraph_length += len(text)
125
+ text_char_total += len(text)
126
+ text_char_total_nospace += len("".join(text.split()))
127
+ paragraph_text_lengths.append(paragraph_length)
128
+
129
+ return DocumentMetrics(
130
+ section_count=len(section_roots),
131
+ paragraph_count=len(paragraphs),
132
+ page_break_count=page_break_count,
133
+ column_break_count=column_break_count,
134
+ table_count=len(tables),
135
+ table_shapes=table_shapes,
136
+ text_char_total=text_char_total,
137
+ text_char_total_nospace=text_char_total_nospace,
138
+ paragraph_text_lengths=paragraph_text_lengths,
139
+ )
140
+
141
+
142
+ def _ratio_delta(reference_value: int, output_value: int) -> float:
143
+ base = max(reference_value, 1)
144
+ return abs(output_value - reference_value) / base
145
+
146
+
147
+ def compare_metrics(
148
+ reference: DocumentMetrics,
149
+ output: DocumentMetrics,
150
+ *,
151
+ max_text_delta_ratio: float = 0.15,
152
+ max_paragraph_delta_ratio: float = 0.25,
153
+ ) -> list[str]:
154
+ errors: list[str] = []
155
+
156
+ if reference.section_count != output.section_count:
157
+ errors.append(
158
+ f"section count mismatch: ref={reference.section_count}, out={output.section_count}"
159
+ )
160
+ if reference.paragraph_count != output.paragraph_count:
161
+ errors.append(
162
+ f"paragraph count mismatch: ref={reference.paragraph_count}, out={output.paragraph_count}"
163
+ )
164
+ if reference.page_break_count != output.page_break_count:
165
+ errors.append(
166
+ "pageBreak count mismatch: "
167
+ f"ref={reference.page_break_count}, out={output.page_break_count}"
168
+ )
169
+ if reference.column_break_count != output.column_break_count:
170
+ errors.append(
171
+ "columnBreak count mismatch: "
172
+ f"ref={reference.column_break_count}, out={output.column_break_count}"
173
+ )
174
+ if reference.table_count != output.table_count:
175
+ errors.append(f"table count mismatch: ref={reference.table_count}, out={output.table_count}")
176
+ if reference.table_shapes != output.table_shapes:
177
+ errors.append("table shape mismatch (rowCnt/colCnt/width/height/repeatHeader/pageBreak)")
178
+
179
+ text_delta = _ratio_delta(reference.text_char_total_nospace, output.text_char_total_nospace)
180
+ if text_delta > max_text_delta_ratio:
181
+ errors.append(
182
+ "total text length drift exceeded: "
183
+ f"ref={reference.text_char_total_nospace}, out={output.text_char_total_nospace}, "
184
+ f"delta={text_delta:.2%}, limit={max_text_delta_ratio:.2%}"
185
+ )
186
+
187
+ if len(reference.paragraph_text_lengths) == len(output.paragraph_text_lengths):
188
+ for index, (ref_len, out_len) in enumerate(
189
+ zip(reference.paragraph_text_lengths, output.paragraph_text_lengths),
190
+ start=1,
191
+ ):
192
+ if ref_len == 0 and out_len == 0:
193
+ continue
194
+ delta = _ratio_delta(ref_len, out_len)
195
+ if delta > max_paragraph_delta_ratio:
196
+ errors.append(
197
+ f"paragraph {index} text drift exceeded: "
198
+ f"ref={ref_len}, out={out_len}, delta={delta:.2%}, "
199
+ f"limit={max_paragraph_delta_ratio:.2%}"
200
+ )
201
+ return errors
202
+
203
+
204
+ def main(argv: Sequence[str] | None = None) -> int:
205
+ parser = argparse.ArgumentParser(description="Reference-vs-output HWPX page drift guard")
206
+ parser.add_argument("--reference", "-r", required=True, help="Reference HWPX path")
207
+ parser.add_argument("--output", "-o", required=True, help="Output HWPX path")
208
+ parser.add_argument("--max-text-delta-ratio", type=float, default=0.15)
209
+ parser.add_argument("--max-paragraph-delta-ratio", type=float, default=0.25)
210
+ parser.add_argument("--json", action="store_true", help="Print collected metrics as JSON")
211
+ args = parser.parse_args(argv)
212
+
213
+ reference = collect_metrics(args.reference)
214
+ output = collect_metrics(args.output)
215
+
216
+ if args.json:
217
+ print(
218
+ json.dumps(
219
+ {"reference": asdict(reference), "output": asdict(output)},
220
+ ensure_ascii=False,
221
+ indent=2,
222
+ )
223
+ )
224
+
225
+ errors = compare_metrics(
226
+ reference,
227
+ output,
228
+ max_text_delta_ratio=args.max_text_delta_ratio,
229
+ max_paragraph_delta_ratio=args.max_paragraph_delta_ratio,
230
+ )
231
+ if errors:
232
+ print("FAIL: page guard")
233
+ for error in errors:
234
+ print(f" - {error}")
235
+ return 1
236
+
237
+ print("PASS: page guard")
238
+ return 0
239
+
240
+
241
+ if __name__ == "__main__": # pragma: no cover - CLI convenience
242
+ raise SystemExit(main())
@@ -0,0 +1,66 @@
1
+ from __future__ import annotations
2
+
3
+ import argparse
4
+ import sys
5
+ from pathlib import Path
6
+ from typing import Sequence
7
+
8
+ from .text_extractor import TextExtractor
9
+
10
+ __all__ = [
11
+ "extract_plain",
12
+ "extract_markdown",
13
+ "main",
14
+ ]
15
+
16
+
17
+ def extract_plain(hwpx_path: str, *, include_tables: bool = False) -> str:
18
+ with TextExtractor(hwpx_path) as extractor:
19
+ return extractor.extract_text(
20
+ include_nested=include_tables,
21
+ object_behavior="skip",
22
+ skip_empty=True,
23
+ )
24
+
25
+
26
+ def extract_markdown(hwpx_path: str) -> str:
27
+ lines: list[str] = []
28
+ with TextExtractor(hwpx_path) as extractor:
29
+ for section in extractor.iter_sections():
30
+ if lines:
31
+ lines.extend(["", "---", ""])
32
+ for paragraph in extractor.iter_paragraphs(section, include_nested=True):
33
+ text = paragraph.text(object_behavior="skip")
34
+ if not text.strip():
35
+ continue
36
+ lines.append(f" {text}" if paragraph.is_nested else text)
37
+ return "\n".join(lines)
38
+
39
+
40
+ def main(argv: Sequence[str] | None = None) -> int:
41
+ parser = argparse.ArgumentParser(description="Extract text from an HWPX document")
42
+ parser.add_argument("input", help="Path to the .hwpx file")
43
+ parser.add_argument("--format", "-f", choices=["plain", "markdown"], default="plain")
44
+ parser.add_argument("--include-tables", action="store_true", help="Include nested table text")
45
+ parser.add_argument("--output", "-o", help="Write output to a file instead of stdout")
46
+ args = parser.parse_args(argv)
47
+
48
+ input_path = Path(args.input)
49
+ if not input_path.is_file():
50
+ print(f"Error: File not found: {args.input}", file=sys.stderr)
51
+ return 1
52
+
53
+ if args.format == "markdown":
54
+ result = extract_markdown(str(input_path))
55
+ else:
56
+ result = extract_plain(str(input_path), include_tables=args.include_tables)
57
+
58
+ if args.output:
59
+ Path(args.output).write_text(result, encoding="utf-8")
60
+ else:
61
+ print(result)
62
+ return 0
63
+
64
+
65
+ if __name__ == "__main__": # pragma: no cover - CLI convenience
66
+ raise SystemExit(main())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: python-hwpx
3
- Version: 2.4
3
+ Version: 2.6
4
4
  Summary: Hancom HWPX 패키지를 로드하고 편집하기 위한 Python 유틸리티 모음
5
5
  Author: python-hwpx Maintainers
6
6
  License: Non-Commercial License
@@ -26,6 +26,9 @@ src/hwpx/oxml/utils.py
26
26
  src/hwpx/tools/__init__.py
27
27
  src/hwpx/tools/exporter.py
28
28
  src/hwpx/tools/object_finder.py
29
+ src/hwpx/tools/package_validator.py
30
+ src/hwpx/tools/page_guard.py
31
+ src/hwpx/tools/text_extract_cli.py
29
32
  src/hwpx/tools/text_extractor.py
30
33
  src/hwpx/tools/validator.py
31
34
  src/hwpx/tools/_schemas/header.xsd
@@ -40,6 +43,7 @@ tests/test_coverage_targets.py
40
43
  tests/test_document_context_manager.py
41
44
  tests/test_document_formatting.py
42
45
  tests/test_document_save_api.py
46
+ tests/test_gap_closure_tools.py
43
47
  tests/test_inline_models.py
44
48
  tests/test_integration_hwpx_compatibility.py
45
49
  tests/test_integration_roundtrip.py
@@ -51,6 +55,7 @@ tests/test_packaging_py_typed.py
51
55
  tests/test_paragraph_section_management.py
52
56
  tests/test_repr_snapshots.py
53
57
  tests/test_section_headers.py
58
+ tests/test_split_merged_cell.py
54
59
  tests/test_tables_default_border.py
55
60
  tests/test_text_extractor_annotations.py
56
61
  tests/test_version_metadata.py
@@ -0,0 +1,5 @@
1
+ [console_scripts]
2
+ hwpx-page-guard = hwpx.tools.page_guard:main
3
+ hwpx-text-extract = hwpx.tools.text_extract_cli:main
4
+ hwpx-validate = hwpx.tools.validator:main
5
+ hwpx-validate-package = hwpx.tools.package_validator:main
@@ -0,0 +1,160 @@
1
+ from __future__ import annotations
2
+
3
+ import io
4
+ import subprocess
5
+ import sys
6
+ from pathlib import Path
7
+ from zipfile import ZIP_DEFLATED, ZipFile
8
+
9
+ from hwpx import HwpxDocument
10
+ from hwpx.tools.package_validator import validate_package
11
+ from hwpx.tools.page_guard import collect_metrics, compare_metrics
12
+ from hwpx.tools.text_extract_cli import extract_markdown, extract_plain
13
+
14
+ _REPO_ROOT = Path(__file__).resolve().parents[1]
15
+ _CONTAINER_XML = (
16
+ b'<?xml version="1.0" encoding="UTF-8"?>'
17
+ b'<container xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
18
+ b"<rootfiles>"
19
+ b'<rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>'
20
+ b"</rootfiles>"
21
+ b"</container>"
22
+ )
23
+ _MANIFEST_XML = (
24
+ b'<?xml version="1.0" encoding="UTF-8"?>'
25
+ b'<opf:package xmlns:opf="http://www.idpf.org/2007/opf/">'
26
+ b"<opf:manifest>"
27
+ b'<opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>'
28
+ b'<opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>'
29
+ b"</opf:manifest>"
30
+ b"<opf:spine>"
31
+ b'<opf:itemref idref="section0"/>'
32
+ b"</opf:spine>"
33
+ b"</opf:package>"
34
+ )
35
+ _VERSION_XML = b'<?xml version="1.0" encoding="UTF-8"?><version/>'
36
+ _HEADER_XML = b'<?xml version="1.0" encoding="UTF-8"?><header/>'
37
+ _SECTION_XML = b'<?xml version="1.0" encoding="UTF-8"?><section/>'
38
+
39
+
40
+ def _build_invalid_package() -> bytes:
41
+ buffer = io.BytesIO()
42
+ with ZipFile(buffer, "w", compression=ZIP_DEFLATED) as archive:
43
+ archive.writestr("META-INF/container.xml", _CONTAINER_XML)
44
+ archive.writestr("Contents/content.hpf", _MANIFEST_XML)
45
+ archive.writestr("Contents/header.xml", _HEADER_XML)
46
+ archive.writestr("Contents/section0.xml", _SECTION_XML)
47
+ archive.writestr("version.xml", _VERSION_XML)
48
+ return buffer.getvalue()
49
+
50
+
51
+ def test_package_validator_accepts_valid_document() -> None:
52
+ document = HwpxDocument.new()
53
+ document.add_paragraph("validator smoke test")
54
+
55
+ report = validate_package(document.to_bytes())
56
+
57
+ assert report.ok
58
+ assert "Contents/header.xml" in report.checked_parts
59
+
60
+
61
+ def test_package_validator_reports_missing_mimetype() -> None:
62
+ report = validate_package(_build_invalid_package())
63
+
64
+ assert not report.ok
65
+ assert any(issue.part_name == "mimetype" for issue in report.issues)
66
+
67
+
68
+ def test_page_guard_detects_text_drift() -> None:
69
+ reference = HwpxDocument.new()
70
+ reference.add_paragraph("alpha")
71
+ reference.add_paragraph("beta")
72
+
73
+ output = HwpxDocument.new()
74
+ output.add_paragraph("alpha" * 12)
75
+ output.add_paragraph("beta" * 12)
76
+
77
+ errors = compare_metrics(
78
+ collect_metrics(reference.to_bytes()),
79
+ collect_metrics(output.to_bytes()),
80
+ max_text_delta_ratio=0.05,
81
+ max_paragraph_delta_ratio=0.05,
82
+ )
83
+
84
+ assert any("total text length drift exceeded" in error for error in errors)
85
+
86
+
87
+ def test_text_extract_cli_functions_include_table_text(tmp_path: Path) -> None:
88
+ document = HwpxDocument.new()
89
+ document.add_paragraph("Title")
90
+ table = document.add_table(1, 1)
91
+ table.cell(0, 0).text = "Cell"
92
+
93
+ source = tmp_path / "sample.hwpx"
94
+ source.write_bytes(document.to_bytes())
95
+
96
+ plain = extract_plain(str(source), include_tables=True)
97
+ markdown = extract_markdown(str(source))
98
+
99
+ assert "Title" in plain
100
+ assert "Cell" in plain
101
+ assert "Title" in markdown
102
+ assert "Cell" in markdown
103
+
104
+
105
+ def test_validate_preserves_dirty_state() -> None:
106
+ document = HwpxDocument.new()
107
+ document.add_paragraph("dirty paragraph")
108
+
109
+ section = document.sections[-1]
110
+ assert section.dirty
111
+
112
+ document.validate()
113
+
114
+ assert section.dirty
115
+
116
+
117
+ def test_office_pack_unpack_roundtrip(tmp_path: Path) -> None:
118
+ document = HwpxDocument.new()
119
+ document.add_paragraph("Roundtrip")
120
+ table = document.add_table(1, 1)
121
+ table.cell(0, 0).text = "A1"
122
+
123
+ source = tmp_path / "source.hwpx"
124
+ unpack_dir = tmp_path / "unpacked"
125
+ repacked = tmp_path / "repacked.hwpx"
126
+ source.write_bytes(document.to_bytes())
127
+
128
+ subprocess.run(
129
+ [sys.executable, str(_REPO_ROOT / "scripts" / "office" / "unpack.py"), str(source), str(unpack_dir)],
130
+ check=True,
131
+ capture_output=True,
132
+ text=True,
133
+ )
134
+ subprocess.run(
135
+ [sys.executable, str(_REPO_ROOT / "scripts" / "office" / "pack.py"), str(unpack_dir), str(repacked)],
136
+ check=True,
137
+ capture_output=True,
138
+ text=True,
139
+ )
140
+
141
+ assert validate_package(repacked.read_bytes()).ok
142
+ assert compare_metrics(
143
+ collect_metrics(source.read_bytes()),
144
+ collect_metrics(repacked.read_bytes()),
145
+ ) == []
146
+
147
+
148
+ def test_analyze_template_script_smoke(tmp_path: Path) -> None:
149
+ source = tmp_path / "template.hwpx"
150
+ source.write_bytes(HwpxDocument.new().to_bytes())
151
+
152
+ result = subprocess.run(
153
+ [sys.executable, str(_REPO_ROOT / "scripts" / "analyze_template.py"), str(source)],
154
+ check=True,
155
+ capture_output=True,
156
+ text=True,
157
+ )
158
+
159
+ assert "== header summary ==" in result.stdout
160
+ assert "== section summary ==" in result.stdout
@@ -0,0 +1,185 @@
1
+ """Regression tests for split_merged_cell – ET / lxml mixing fix.
2
+
3
+ The root cause of the original crash (TypeError: append() argument 1
4
+ must be xml.etree.ElementTree.Element, not lxml.etree._Element) was that
5
+ ``split_merged_cell`` created new cell elements with stdlib
6
+ ``ET.Element()`` while the existing document tree consisted of lxml
7
+ elements (parsed via ``lxml.etree.fromstring``). The fix uses
8
+ ``row_element.makeelement()`` so that new cells always match the XML
9
+ engine of the surrounding tree.
10
+
11
+ Choice A was applied: *all runtime element creation inside
12
+ ``split_merged_cell`` is now engine-agnostic* by delegating to
13
+ ``makeelement`` / ``SubElement`` (which itself delegates to
14
+ ``makeelement``), so the code works identically with both stdlib ET
15
+ and lxml trees.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ import io
21
+
22
+ import pytest
23
+
24
+ from hwpx.document import HwpxDocument
25
+
26
+
27
+ # --------------------------------------------------------------------------- #
28
+ # Helpers
29
+ # --------------------------------------------------------------------------- #
30
+
31
+
32
def _new_doc_with_table(rows: int = 3, cols: int = 3) -> tuple[HwpxDocument, object]:
    """Create a new document and add one ``rows`` x ``cols`` table to it.

    Returns the ``(document, table)`` pair. Per this module's docstring, a
    document built with ``HwpxDocument.new()`` is expected to be lxml-backed.
    """
    document = HwpxDocument.new()
    return document, document.add_table(rows, cols)
37
+
38
+
39
+ # --------------------------------------------------------------------------- #
40
+ # Scenario 1 – horizontal merge then split
41
+ # --------------------------------------------------------------------------- #
42
+
43
+
44
def test_split_horizontal_merge_no_type_error() -> None:
    """Merge two cells in one row, split them, and expect no ``TypeError``.

    This exercises the code path that originally crashed when an lxml-backed
    table was modified with stdlib ET elements.
    """
    _, table = _new_doc_with_table(3, 3)

    # Horizontal merge of (0,0) with (0,1).
    table.merge_cells(0, 0, 0, 1)
    assert table.cell(0, 0).span == (1, 2), "pre-condition: cell should be merged"

    # The split itself is the call that used to raise TypeError.
    split_result = table.split_merged_cell(0, 0)
    assert split_result is not None

    master = table.cell(0, 0)
    restored = table.cell(0, 1)
    # Both cells are back to a 1x1 span ...
    assert master.span == (1, 1)
    assert restored.span == (1, 1)
    # ... and are backed by different elements.
    assert master.element is not restored.element
66
+
67
+
68
+ # --------------------------------------------------------------------------- #
69
+ # Scenario 2 – vertical merge then split
70
+ # --------------------------------------------------------------------------- #
71
+
72
+
73
def test_split_vertical_merge_no_type_error() -> None:
    """Merge two cells in one column, split them, and expect no ``TypeError``."""
    _, table = _new_doc_with_table(3, 3)

    # Vertical merge of (0,0) with (1,0).
    table.merge_cells(0, 0, 1, 0)
    assert table.cell(0, 0).span == (2, 1)

    split_result = table.split_merged_cell(0, 0)
    assert split_result is not None

    upper = table.cell(0, 0)
    lower = table.cell(1, 0)
    assert upper.span == (1, 1)
    assert lower.span == (1, 1)
    # The two positions must be backed by different elements.
    assert upper.element is not lower.element
87
+
88
+
89
+ # --------------------------------------------------------------------------- #
90
+ # Scenario 3 – 2×2 block merge then split
91
+ # --------------------------------------------------------------------------- #
92
+
93
+
94
def test_split_block_merge_restores_all_cells() -> None:
    """A 2×2 block merge should produce 4 independent cells after split.

    Checks that every position in the block has a ``(1, 1)`` span and that
    each position is backed by its own element, as the horizontal and
    vertical split tests do for their two cells.
    """
    _, table = _new_doc_with_table(3, 3)

    table.merge_cells(0, 0, 1, 1)
    assert table.cell(0, 0).span == (2, 2)

    table.split_merged_cell(0, 0)

    positions = [(r, c) for r in range(2) for c in range(2)]
    cells = [table.cell(r, c) for r, c in positions]

    for (r, c), cell in zip(positions, cells):
        assert cell.span == (1, 1), f"cell ({r},{c}) span should be (1,1)"

    # Independence: no two positions may share one underlying element.
    # Elements are kept in a list so every object stays alive while compared.
    elements = [cell.element for cell in cells]
    for index, first in enumerate(elements):
        for offset, second in enumerate(elements[index + 1:], start=index + 1):
            assert first is not second, (
                f"cells {positions[index]} and {positions[offset]} share an element"
            )
107
+
108
+
109
+ # --------------------------------------------------------------------------- #
110
+ # Scenario 4 – save → reopen round-trip after split
111
+ # --------------------------------------------------------------------------- #
112
+
113
+
114
def test_split_then_save_reopen_roundtrip(tmp_path) -> None:
    """Merge, split, save to memory, reopen, and verify spans and cell text."""
    document, table = _new_doc_with_table(3, 3)

    # Label the first row so each cell can be identified after the round-trip.
    for column, label in enumerate(("A", "B", "C")):
        table.set_cell_text(0, column, label)

    # Merge (0,0)-(0,1), split it again, then write into the restored cell.
    table.merge_cells(0, 0, 0, 1)
    table.split_merged_cell(0, 0)
    table.cell(0, 1).text = "B-restored"

    # Serialise into an in-memory buffer and reopen from the raw bytes.
    stream = io.BytesIO()
    document.save(stream)
    stream.seek(0)
    reopened = HwpxDocument.open(stream.getvalue())

    # Gather the tables of every paragraph of the reopened document.
    reopened_tables = []
    for paragraph in reopened.paragraphs:
        reopened_tables.extend(paragraph.tables)
    assert len(reopened_tables) >= 1

    first_table = reopened_tables[0]
    for column in (0, 1):
        assert first_table.cell(0, column).span == (1, 1)

    # Master cell keeps "A", the restored cell has the new text, "C" is untouched.
    expected_text = ((0, "A"), (1, "B-restored"), (2, "C"))
    for column, text in expected_text:
        assert first_table.cell(0, column).text == text
153
+
154
+
155
+ # --------------------------------------------------------------------------- #
156
+ # Scenario 5 – split via set_cell_text logical API
157
+ # --------------------------------------------------------------------------- #
158
+
159
+
160
def test_set_cell_text_split_merged_flag() -> None:
    """Writing with ``set_cell_text(..., split_merged=True)`` must split the merge."""
    _, table = _new_doc_with_table(3, 3)
    table.merge_cells(0, 0, 0, 1)

    # Target the column covered by the merge and ask for a split first.
    table.set_cell_text(0, 1, "Split-Write", logical=True, split_merged=True)

    assert table.cell(0, 0).span == (1, 1)

    written = table.cell(0, 1)
    assert written.text == "Split-Write"
    assert written.span == (1, 1)
171
+
172
+
173
+ # --------------------------------------------------------------------------- #
174
+ # Scenario 6 – splitting an already-unmerged cell is a no-op
175
+ # --------------------------------------------------------------------------- #
176
+
177
+
178
def test_split_unmerged_cell_is_noop() -> None:
    """Splitting a cell that was never merged must return that same cell."""
    _, table = _new_doc_with_table(2, 2)

    original_cell = table.cell(0, 0)
    returned_cell = table.split_merged_cell(0, 0)

    # Same underlying element, still a 1x1 span.
    assert original_cell.element is returned_cell.element
    assert returned_cell.span == (1, 1)
@@ -1,2 +0,0 @@
1
- [console_scripts]
2
- hwpx-validate = hwpx.tools.validator:main
File without changes
File without changes
File without changes
File without changes
File without changes