python-hwpx 2.5__py3-none-any.whl → 2.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hwpx/document.py +9 -4
- hwpx/tools/__init__.py +16 -0
- hwpx/tools/package_validator.py +217 -0
- hwpx/tools/page_guard.py +242 -0
- hwpx/tools/text_extract_cli.py +66 -0
- {python_hwpx-2.5.dist-info → python_hwpx-2.6.dist-info}/METADATA +1 -1
- {python_hwpx-2.5.dist-info → python_hwpx-2.6.dist-info}/RECORD +11 -8
- python_hwpx-2.6.dist-info/entry_points.txt +5 -0
- python_hwpx-2.5.dist-info/entry_points.txt +0 -2
- {python_hwpx-2.5.dist-info → python_hwpx-2.6.dist-info}/WHEEL +0 -0
- {python_hwpx-2.5.dist-info → python_hwpx-2.6.dist-info}/licenses/LICENSE +0 -0
- {python_hwpx-2.5.dist-info → python_hwpx-2.6.dist-info}/top_level.txt +0 -0
hwpx/document.py
CHANGED
|
@@ -1280,7 +1280,7 @@ class HwpxDocument:
|
|
|
1280
1280
|
"""
|
|
1281
1281
|
from .tools.validator import validate_document
|
|
1282
1282
|
|
|
1283
|
-
return validate_document(self._to_bytes_raw())
|
|
1283
|
+
return validate_document(self._to_bytes_raw(reset_dirty=False))
|
|
1284
1284
|
|
|
1285
1285
|
def _run_pre_save_validation(self) -> None:
|
|
1286
1286
|
"""Raise if validate_on_save is enabled and the document is invalid."""
|
|
@@ -1318,11 +1318,16 @@ class HwpxDocument:
|
|
|
1318
1318
|
self._run_pre_save_validation()
|
|
1319
1319
|
return self._to_bytes_raw()
|
|
1320
1320
|
|
|
1321
|
-
def _to_bytes_raw(self) -> bytes:
|
|
1322
|
-
"""Serialize without validation
|
|
1321
|
+
def _to_bytes_raw(self, *, reset_dirty: bool = True) -> bytes:
|
|
1322
|
+
"""Serialize without validation.
|
|
1323
|
+
|
|
1324
|
+
When ``reset_dirty`` is ``False``, the document remains marked as
|
|
1325
|
+
modified after the archive snapshot is generated.
|
|
1326
|
+
"""
|
|
1323
1327
|
updates = self._root.serialize()
|
|
1324
1328
|
result = self._package.save(None, updates)
|
|
1325
|
-
|
|
1329
|
+
if reset_dirty:
|
|
1330
|
+
self._root.reset_dirty()
|
|
1326
1331
|
if isinstance(result, bytes):
|
|
1327
1332
|
return result
|
|
1328
1333
|
raise TypeError("package.save(None) must return bytes")
|
hwpx/tools/__init__.py
CHANGED
|
@@ -6,6 +6,16 @@ from .exporter import (
|
|
|
6
6
|
export_text,
|
|
7
7
|
)
|
|
8
8
|
from .object_finder import FoundElement, ObjectFinder
|
|
9
|
+
from .package_validator import (
|
|
10
|
+
PackageValidationIssue,
|
|
11
|
+
PackageValidationReport,
|
|
12
|
+
validate_package,
|
|
13
|
+
)
|
|
14
|
+
from .page_guard import (
|
|
15
|
+
DocumentMetrics,
|
|
16
|
+
collect_metrics,
|
|
17
|
+
compare_metrics,
|
|
18
|
+
)
|
|
9
19
|
from .text_extractor import (
|
|
10
20
|
DEFAULT_NAMESPACES,
|
|
11
21
|
ParagraphInfo,
|
|
@@ -33,6 +43,12 @@ __all__ = [
|
|
|
33
43
|
"strip_namespace",
|
|
34
44
|
"FoundElement",
|
|
35
45
|
"ObjectFinder",
|
|
46
|
+
"PackageValidationIssue",
|
|
47
|
+
"PackageValidationReport",
|
|
48
|
+
"validate_package",
|
|
49
|
+
"DocumentMetrics",
|
|
50
|
+
"collect_metrics",
|
|
51
|
+
"compare_metrics",
|
|
36
52
|
"DocumentSchemas",
|
|
37
53
|
"ValidationIssue",
|
|
38
54
|
"ValidationReport",
|
|
hwpx/tools/package_validator.py
ADDED
|
@@ -0,0 +1,217 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import io
|
|
5
|
+
import xml.etree.ElementTree as ET
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import BinaryIO, Sequence
|
|
9
|
+
from zipfile import ZIP_STORED, BadZipFile, ZipFile
|
|
10
|
+
|
|
11
|
+
EXPECTED_MIMETYPE = "application/hwp+zip"
|
|
12
|
+
CONTAINER_PATH = "META-INF/container.xml"
|
|
13
|
+
MANIFEST_PATH = "Contents/content.hpf"
|
|
14
|
+
HEADER_PATH = "Contents/header.xml"
|
|
15
|
+
VERSION_PATH = "version.xml"
|
|
16
|
+
REQUIRED_CORE_FILES = ("mimetype", CONTAINER_PATH, MANIFEST_PATH, HEADER_PATH, VERSION_PATH)
|
|
17
|
+
OPF_NS = {"opf": "http://www.idpf.org/2007/opf/"}
|
|
18
|
+
CONTAINER_NS = {
|
|
19
|
+
"ct": "urn:oasis:names:tc:opendocument:xmlns:container",
|
|
20
|
+
"ocf": "urn:oasis:names:tc:opendocument:xmlns:container",
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"PackageValidationIssue",
|
|
25
|
+
"PackageValidationReport",
|
|
26
|
+
"validate_package",
|
|
27
|
+
"main",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass(frozen=True)
class PackageValidationIssue:
    """One structural problem found while validating an HWPX package.

    Instances are immutable. ``str(issue)`` renders the issue as
    ``"<part_name>: <message>"``.
    """

    # Name of the part the problem is attached to.
    part_name: str
    # Human-readable description of the problem.
    message: str

    def __str__(self) -> str:  # pragma: no cover - human readable helper
        location, detail = self.part_name, self.message
        return f"{location}: {detail}"
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass(frozen=True)
class PackageValidationReport:
    """Immutable outcome of a package validation run.

    The report is truthy exactly when no issues were recorded.
    """

    # Names of the parts that were examined.
    checked_parts: tuple[str, ...]
    # Problems found; empty when every check passed.
    issues: tuple[PackageValidationIssue, ...]

    @property
    def ok(self) -> bool:
        """Return ``True`` when ``issues`` is empty."""
        return False if self.issues else True

    def __bool__(self) -> bool:  # pragma: no cover - convenience alias
        return self.ok
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _open_zip(source: str | Path | bytes | BinaryIO) -> ZipFile:
    """Open *source* as a read-only ZIP archive.

    Args:
        source: A filesystem path (``str`` or ``Path``), an in-memory
            bytes-like buffer (``bytes``, ``bytearray`` or ``memoryview``),
            or a binary file object.

    Returns:
        The opened ``ZipFile``; the caller is responsible for closing it.

    Raises:
        Whatever ``ZipFile`` raises for the given source, for example
        ``BadZipFile`` for data that is not a ZIP archive.
    """
    if isinstance(source, (bytes, bytearray, memoryview)):
        # Raw buffers are neither paths nor file objects, so wrap them in a
        # file-like object before handing them to ZipFile.
        return ZipFile(io.BytesIO(source), "r")
    # Paths and file objects are both accepted by ZipFile directly.
    return ZipFile(source, "r")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _parse_xml(payload: bytes) -> ET.Element:
    """Parse *payload* and return the root element.

    Raises:
        ValueError: If the payload is not well-formed XML. The underlying
            ``ParseError`` is chained as the cause.
    """
    try:
        root = ET.fromstring(payload)
    except ET.ParseError as parse_error:
        raise ValueError(f"malformed XML: {parse_error}") from parse_error
    else:
        return root
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def _container_rootfiles(container_root: ET.Element) -> list[str]:
    """Return the part paths declared by ``rootfile`` elements.

    ``rootfile`` elements are looked up under every namespace URI listed in
    ``CONTAINER_NS``. The path is read from the ``full-path`` attribute,
    falling back to ``fullPath`` and then ``full_path``; elements carrying
    none of them (or only empty values) are skipped.
    """
    paths: list[str] = []
    # Several prefixes in CONTAINER_NS map to the same URI. Visit each URI
    # only once, otherwise every rootfile would be collected (and later
    # reported) once per prefix.
    for namespace in dict.fromkeys(CONTAINER_NS.values()):
        for elem in container_root.findall(f".//{{{namespace}}}rootfile"):
            path = (
                elem.get("full-path")
                or elem.get("fullPath")
                or elem.get("full_path")
            )
            if path:
                paths.append(path)
    return paths
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _manifest_hrefs(manifest_root: ET.Element) -> set[str]:
    """Collect the non-empty ``href`` values of every ``opf:item`` element."""
    candidates = (
        entry.get("href")
        for entry in manifest_root.findall(".//opf:item", OPF_NS)
    )
    return {target for target in candidates if target}
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _spine_hrefs(manifest_root: ET.Element) -> list[str]:
    """Resolve ``opf:itemref`` entries to manifest hrefs, in document order.

    An ``itemref`` is ignored when its ``idref`` is missing or does not match
    an ``opf:item`` that carries both ``id`` and ``href``.
    """
    # When an id is repeated, the last matching item wins.
    lookup: dict[str, str] = {
        entry.get("id"): entry.get("href")
        for entry in manifest_root.findall(".//opf:item", OPF_NS)
        if entry.get("id") and entry.get("href")
    }
    idrefs = (
        ref.get("idref")
        for ref in manifest_root.findall(".//opf:itemref", OPF_NS)
    )
    return [lookup[key] for key in idrefs if key and key in lookup]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def validate_package(source: str | Path | bytes | BinaryIO) -> PackageValidationReport:
    """Check the structural integrity of an HWPX package.

    The checks run in this order:

    * the source opens as a ZIP archive;
    * every entry of ``REQUIRED_CORE_FILES`` is present;
    * ``mimetype`` contains ``EXPECTED_MIMETYPE``, is the first ZIP entry and
      is stored uncompressed;
    * every ``.xml`` / ``.hpf`` member is well-formed XML;
    * the container declares at least one rootfile and each one exists;
    * every manifest href and spine item exists, the spine is not empty and
      the header part is referenced by the manifest.

    Args:
        source: Path, raw bytes or binary file object holding the package.

    Returns:
        A report listing every archive member name and all issues found.
        A source that is not a ZIP archive yields a report with no checked
        parts and a single ``"archive"`` issue.

    Raises:
        Any exception other than ``BadZipFile`` raised while opening the
        source (for example ``FileNotFoundError``) propagates unchanged.
    """
    try:
        archive = _open_zip(source)
    except BadZipFile:
        return PackageValidationReport(
            checked_parts=(),
            issues=(PackageValidationIssue("archive", "not a valid ZIP archive"),),
        )

    issues: list[PackageValidationIssue] = []

    with archive as zf:
        names = zf.namelist()
        # Membership is tested many times below; a set avoids rescanning the
        # name list for every lookup.
        present = set(names)

        for required in REQUIRED_CORE_FILES:
            if required not in present:
                issues.append(PackageValidationIssue(required, "missing required file"))

        if not names:
            issues.append(PackageValidationIssue("archive", "empty archive"))
            return PackageValidationReport(tuple(names), tuple(issues))

        if "mimetype" in present:
            try:
                mimetype = zf.read("mimetype").decode("utf-8").strip()
            except UnicodeDecodeError:
                # Placeholder that can never equal the expected value.
                mimetype = "<binary>"
            if mimetype != EXPECTED_MIMETYPE:
                issues.append(
                    PackageValidationIssue(
                        "mimetype",
                        f"expected {EXPECTED_MIMETYPE!r}, got {mimetype!r}",
                    )
                )
            if names[0] != "mimetype":
                issues.append(PackageValidationIssue("mimetype", "must be the first ZIP entry"))
            if zf.getinfo("mimetype").compress_type != ZIP_STORED:
                issues.append(PackageValidationIssue("mimetype", "must use ZIP_STORED"))

        # Parse every XML-like member once; malformed ones become issues and
        # are simply absent from xml_roots.
        xml_roots: dict[str, ET.Element] = {}
        for name in names:
            if not name.endswith((".xml", ".hpf")):
                continue
            try:
                xml_roots[name] = _parse_xml(zf.read(name))
            except ValueError as exc:
                issues.append(PackageValidationIssue(name, str(exc)))

        container_root = xml_roots.get(CONTAINER_PATH)
        if container_root is not None:
            rootfiles = _container_rootfiles(container_root)
            if not rootfiles:
                issues.append(PackageValidationIssue(CONTAINER_PATH, "declares no rootfile entries"))
            # dict.fromkeys keeps first-seen order while dropping repeats, so
            # a missing rootfile is reported once even if it is listed twice.
            for rootfile in dict.fromkeys(rootfiles):
                if rootfile not in present:
                    issues.append(
                        PackageValidationIssue(
                            CONTAINER_PATH,
                            f"rootfile points to missing part {rootfile!r}",
                        )
                    )

        manifest_root = xml_roots.get(MANIFEST_PATH)
        if manifest_root is not None:
            hrefs = _manifest_hrefs(manifest_root)
            for href in sorted(hrefs):
                if href not in present:
                    issues.append(
                        PackageValidationIssue(
                            MANIFEST_PATH,
                            f"manifest href missing from archive: {href}",
                        )
                    )

            spine_hrefs = _spine_hrefs(manifest_root)
            if not spine_hrefs:
                issues.append(PackageValidationIssue(MANIFEST_PATH, "spine declares no section parts"))
            for href in spine_hrefs:
                if href not in present:
                    issues.append(
                        PackageValidationIssue(
                            MANIFEST_PATH,
                            f"spine item missing from archive: {href}",
                        )
                    )

            if HEADER_PATH in present and HEADER_PATH not in hrefs:
                issues.append(
                    PackageValidationIssue(MANIFEST_PATH, "header.xml is not referenced in manifest")
                )

    return PackageValidationReport(tuple(names), tuple(issues))
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point: validate one HWPX file and print the result.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read
            ``sys.argv``.

    Returns:
        ``0`` when the package passes every check, ``1`` when issues were
        found or the source path is not an existing file.
    """
    import sys  # local import: only needed for the error path below

    parser = argparse.ArgumentParser(description="Validate HWPX package structure")
    parser.add_argument("source", help="Path to the HWPX file")
    args = parser.parse_args(argv)

    # Report a missing file as a normal CLI error instead of letting the
    # FileNotFoundError raised while opening the archive escape as a traceback.
    if not Path(args.source).is_file():
        print(f"Error: File not found: {args.source}", file=sys.stderr)
        return 1

    report = validate_package(args.source)
    if report.issues:
        for issue in report.issues:
            print(f"ERROR: {issue}")
        return 1

    print("All package validations passed.")
    return 0


if __name__ == "__main__":  # pragma: no cover - CLI convenience
    raise SystemExit(main())
|
hwpx/tools/page_guard.py
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import io
|
|
5
|
+
import json
|
|
6
|
+
from dataclasses import asdict, dataclass
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import BinaryIO, Iterable, Sequence
|
|
9
|
+
from zipfile import ZipFile
|
|
10
|
+
|
|
11
|
+
from lxml import etree
|
|
12
|
+
|
|
13
|
+
NS = {
|
|
14
|
+
"hp": "http://www.hancom.co.kr/hwpml/2011/paragraph",
|
|
15
|
+
"hs": "http://www.hancom.co.kr/hwpml/2011/section",
|
|
16
|
+
"opf": "http://www.idpf.org/2007/opf/",
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
__all__ = [
|
|
20
|
+
"DocumentMetrics",
|
|
21
|
+
"collect_metrics",
|
|
22
|
+
"compare_metrics",
|
|
23
|
+
"main",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass(frozen=True)
class DocumentMetrics:
    """Counts and text lengths collected from the sections of one document."""

    # Number of section parts that were read.
    section_count: int
    # Number of paragraph elements counted across all sections.
    paragraph_count: int
    # Paragraphs whose ``pageBreak`` attribute equals "1".
    page_break_count: int
    # Paragraphs whose ``columnBreak`` attribute equals "1".
    column_break_count: int
    # Number of table elements across all sections.
    table_count: int
    # One tuple per table, holding the attribute strings
    # (rowCnt, colCnt, width, height, repeatHeader, pageBreak).
    table_shapes: list[tuple[str, str, str, str, str, str]]
    # Total number of text characters.
    text_char_total: int
    # Total number of text characters after removing all whitespace.
    text_char_total_nospace: int
    # Text length of each counted paragraph, in collection order.
    paragraph_text_lengths: list[int]
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _section_files(zf: ZipFile) -> list[str]:
    """List the archive members to read as section parts.

    When ``Contents/content.hpf`` is present, the result is the href of every
    manifest item referenced by an ``opf:itemref``, in itemref order. When the
    manifest is absent, the result falls back to the members whose names start
    with ``Contents/section`` and end with ``.xml``, in archive order.
    """
    try:
        manifest = etree.fromstring(zf.read("Contents/content.hpf"))
    except KeyError:
        # ZipFile.read raises KeyError for a member that is not in the archive.
        return [
            member
            for member in zf.namelist()
            if member.startswith("Contents/section") and member.endswith(".xml")
        ]

    # When an id is repeated, the last matching item wins.
    lookup: dict[str, str] = {
        entry.get("id"): entry.get("href")
        for entry in manifest.findall(".//opf:item", namespaces=NS)
        if entry.get("id") and entry.get("href")
    }
    idrefs = (
        ref.get("idref")
        for ref in manifest.findall(".//opf:itemref", namespaces=NS)
    )
    return [lookup[key] for key in idrefs if key and key in lookup]
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _text_of_t_node(node: etree._Element) -> str:
    """Return every text fragment yielded by ``node.itertext()``, concatenated."""
    fragments = list(node.itertext())
    return "".join(fragments)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _iter_section_roots(source: str | Path | bytes | BinaryIO) -> Iterable[etree._Element]:
    """Yield the parsed root element of each section part in *source*.

    ``bytes`` input is wrapped in an in-memory buffer; anything else is
    passed to ``ZipFile`` as given. The archive is closed once the generator
    is exhausted or closed.
    """
    target = io.BytesIO(source) if isinstance(source, bytes) else source
    with ZipFile(target, "r") as archive:
        for member in _section_files(archive):
            yield etree.fromstring(archive.read(member))
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def collect_metrics(source: str | Path | bytes | BinaryIO) -> DocumentMetrics:
    """Measure the sections of an HWPX document.

    Every section part is parsed, then paragraphs, tables, page/column break
    flags and text lengths are tallied across all of them.

    Args:
        source: Path, raw bytes or binary file object holding the package.

    Returns:
        The collected ``DocumentMetrics``.
    """
    section_roots = list(_iter_section_roots(source))

    paragraph_count = 0
    table_count = 0
    page_breaks = 0
    column_breaks = 0
    chars_total = 0
    chars_total_nospace = 0
    shapes: list[tuple[str, str, str, str, str, str]] = []
    lengths: list[int] = []

    for root in section_roots:
        # Prefer paragraphs that are direct children of a section element and
        # fall back to every paragraph when that query finds nothing.
        # NOTE(review): if the part's root element is itself hs:sec, the first
        # query presumably matches nothing (it looks for hs:sec below the
        # root), so the fallback would also count nested paragraphs — confirm
        # this is intended.
        section_paragraphs = (
            root.xpath(".//hs:sec/hp:p", namespaces=NS)
            or root.xpath(".//hp:p", namespaces=NS)
        )
        paragraph_count += len(section_paragraphs)

        section_tables = root.xpath(".//hp:tbl", namespaces=NS)
        table_count += len(section_tables)

        for table in section_tables:
            size = table.find("hp:sz", namespaces=NS)
            if size is None:
                width, height = "", ""
            else:
                width, height = size.get("width", ""), size.get("height", "")
            shapes.append(
                (
                    table.get("rowCnt", ""),
                    table.get("colCnt", ""),
                    width,
                    height,
                    table.get("repeatHeader", ""),
                    table.get("pageBreak", ""),
                )
            )

        for paragraph in section_paragraphs:
            page_breaks += paragraph.get("pageBreak") == "1"
            column_breaks += paragraph.get("columnBreak") == "1"

            texts = [
                _text_of_t_node(text_node)
                for text_node in paragraph.xpath(".//hp:t", namespaces=NS)
            ]
            length = sum(len(text) for text in texts)
            chars_total += length
            # str.split() with no argument drops every whitespace run.
            chars_total_nospace += sum(len("".join(text.split())) for text in texts)
            lengths.append(length)

    return DocumentMetrics(
        section_count=len(section_roots),
        paragraph_count=paragraph_count,
        page_break_count=page_breaks,
        column_break_count=column_breaks,
        table_count=table_count,
        table_shapes=shapes,
        text_char_total=chars_total,
        text_char_total_nospace=chars_total_nospace,
        paragraph_text_lengths=lengths,
    )
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def _ratio_delta(reference_value: int, output_value: int) -> float:
    """Return the absolute difference relative to the reference value.

    A reference below 1 is treated as 1 so the division is always defined.
    """
    denominator = reference_value if reference_value > 1 else 1
    difference = output_value - reference_value
    return abs(difference) / denominator
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def compare_metrics(
    reference: DocumentMetrics,
    output: DocumentMetrics,
    *,
    max_text_delta_ratio: float = 0.15,
    max_paragraph_delta_ratio: float = 0.25,
) -> list[str]:
    """Describe every way *output* drifts from *reference*.

    Counts (sections, paragraphs, page breaks, column breaks, tables) and
    table shapes must match exactly. The whitespace-free text total may
    differ by at most ``max_text_delta_ratio``. When both documents have the
    same number of paragraph lengths, each paragraph may differ by at most
    ``max_paragraph_delta_ratio``.

    Returns:
        Human-readable error messages; an empty list means no drift.
    """
    errors: list[str] = []

    count_checks = (
        ("section", reference.section_count, output.section_count),
        ("paragraph", reference.paragraph_count, output.paragraph_count),
        ("pageBreak", reference.page_break_count, output.page_break_count),
        ("columnBreak", reference.column_break_count, output.column_break_count),
        ("table", reference.table_count, output.table_count),
    )
    for label, ref_count, out_count in count_checks:
        if ref_count != out_count:
            errors.append(f"{label} count mismatch: ref={ref_count}, out={out_count}")

    if reference.table_shapes != output.table_shapes:
        errors.append("table shape mismatch (rowCnt/colCnt/width/height/repeatHeader/pageBreak)")

    ref_chars = reference.text_char_total_nospace
    out_chars = output.text_char_total_nospace
    text_delta = _ratio_delta(ref_chars, out_chars)
    if text_delta > max_text_delta_ratio:
        errors.append(
            "total text length drift exceeded: "
            f"ref={ref_chars}, out={out_chars}, "
            f"delta={text_delta:.2%}, limit={max_text_delta_ratio:.2%}"
        )

    ref_lengths = reference.paragraph_text_lengths
    out_lengths = output.paragraph_text_lengths
    if len(ref_lengths) != len(out_lengths):
        # Paragraphs cannot be paired up one-to-one, so skip the per-paragraph
        # comparison.
        return errors

    for index, (ref_len, out_len) in enumerate(zip(ref_lengths, out_lengths), start=1):
        if ref_len == 0 and out_len == 0:
            continue
        delta = _ratio_delta(ref_len, out_len)
        if delta > max_paragraph_delta_ratio:
            errors.append(
                f"paragraph {index} text drift exceeded: "
                f"ref={ref_len}, out={out_len}, delta={delta:.2%}, "
                f"limit={max_paragraph_delta_ratio:.2%}"
            )
    return errors
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point: compare an output HWPX against a reference.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read
            ``sys.argv``.

    Returns:
        ``0`` when no drift was detected, ``1`` when drift was detected or
        one of the input paths is not an existing file.
    """
    import sys  # local import: only needed for the error path below

    parser = argparse.ArgumentParser(description="Reference-vs-output HWPX page drift guard")
    parser.add_argument("--reference", "-r", required=True, help="Reference HWPX path")
    parser.add_argument("--output", "-o", required=True, help="Output HWPX path")
    parser.add_argument("--max-text-delta-ratio", type=float, default=0.15)
    parser.add_argument("--max-paragraph-delta-ratio", type=float, default=0.25)
    parser.add_argument("--json", action="store_true", help="Print collected metrics as JSON")
    args = parser.parse_args(argv)

    # Report missing inputs as a normal CLI error instead of letting the
    # exception raised while opening the archive escape as a traceback.
    missing = [path for path in (args.reference, args.output) if not Path(path).is_file()]
    if missing:
        for path in missing:
            print(f"Error: File not found: {path}", file=sys.stderr)
        return 1

    reference = collect_metrics(args.reference)
    output = collect_metrics(args.output)

    if args.json:
        print(
            json.dumps(
                {"reference": asdict(reference), "output": asdict(output)},
                ensure_ascii=False,
                indent=2,
            )
        )

    errors = compare_metrics(
        reference,
        output,
        max_text_delta_ratio=args.max_text_delta_ratio,
        max_paragraph_delta_ratio=args.max_paragraph_delta_ratio,
    )
    if errors:
        print("FAIL: page guard")
        for error in errors:
            print(f" - {error}")
        return 1

    print("PASS: page guard")
    return 0


if __name__ == "__main__":  # pragma: no cover - CLI convenience
    raise SystemExit(main())
|
|
hwpx/tools/text_extract_cli.py
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
import sys
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Sequence
|
|
7
|
+
|
|
8
|
+
from .text_extractor import TextExtractor
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"extract_plain",
|
|
12
|
+
"extract_markdown",
|
|
13
|
+
"main",
|
|
14
|
+
]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def extract_plain(hwpx_path: str, *, include_tables: bool = False) -> str:
    """Return the text of the document at *hwpx_path* as one plain string.

    ``include_tables`` is forwarded to the extractor as ``include_nested``.
    The extractor is also asked to skip objects and empty paragraphs.
    """
    options = {
        "include_nested": include_tables,
        "object_behavior": "skip",
        "skip_empty": True,
    }
    with TextExtractor(hwpx_path) as extractor:
        return extractor.extract_text(**options)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def extract_markdown(hwpx_path: str) -> str:
    """Render the document at *hwpx_path* as simple Markdown-style text.

    Paragraphs whose text is blank are dropped, nested paragraphs are
    prefixed with a space, and a ``---`` rule (surrounded by blank lines) is
    inserted before a section whenever earlier output already exists.
    """
    rendered: list[str] = []
    with TextExtractor(hwpx_path) as extractor:
        for section in extractor.iter_sections():
            if rendered:
                rendered += ["", "---", ""]
            for paragraph in extractor.iter_paragraphs(section, include_nested=True):
                text = paragraph.text(object_behavior="skip")
                if text.strip():
                    if paragraph.is_nested:
                        rendered.append(f" {text}")
                    else:
                        rendered.append(text)
    return "\n".join(rendered)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Command-line entry point: extract text from an HWPX file.

    Args:
        argv: Argument list to parse; ``None`` lets ``argparse`` read
            ``sys.argv``.

    Returns:
        ``0`` on success, ``1`` when the input path is not an existing file.
    """
    parser = argparse.ArgumentParser(description="Extract text from an HWPX document")
    parser.add_argument("input", help="Path to the .hwpx file")
    parser.add_argument("--format", "-f", choices=["plain", "markdown"], default="plain")
    parser.add_argument("--include-tables", action="store_true", help="Include nested table text")
    parser.add_argument("--output", "-o", help="Write output to a file instead of stdout")
    options = parser.parse_args(argv)

    document = Path(options.input)
    if not document.is_file():
        print(f"Error: File not found: {options.input}", file=sys.stderr)
        return 1

    if options.format == "markdown":
        rendered = extract_markdown(str(document))
    else:
        rendered = extract_plain(str(document), include_tables=options.include_tables)

    if not options.output:
        print(rendered)
        return 0

    Path(options.output).write_text(rendered, encoding="utf-8")
    return 0


if __name__ == "__main__":  # pragma: no cover - CLI convenience
    raise SystemExit(main())
|
|
{python_hwpx-2.5.dist-info → python_hwpx-2.6.dist-info}/RECORD
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
hwpx/__init__.py,sha256=RZ4O84G3Zp_L8ELArtwO3KVPvhx1vLYyKC2Ka1M5mwc,857
|
|
2
|
-
hwpx/document.py,sha256=
|
|
2
|
+
hwpx/document.py,sha256=UnM61gSf9Hno5n0YWrVSTod9USmA3WtRQeeQadLbYdQ,48133
|
|
3
3
|
hwpx/package.py,sha256=YK4oYEPk7la2BZKZepoVHzrjGIPMDnDdPa02Hh-RTBw,1103
|
|
4
4
|
hwpx/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
5
|
hwpx/templates.py,sha256=kZ_gV0bP-DIvr5CJuzs-uGnt8XVncJCI3cGFq083uTg,1149
|
|
@@ -20,16 +20,19 @@ hwpx/oxml/schema.py,sha256=THswXdMNpAiSoLxpvUGbdbI66hW-SKuUqSw4vdkIYmA,1246
|
|
|
20
20
|
hwpx/oxml/section.py,sha256=WwxZ6PWPeMrj2L9mz4JlqFGXwd7E7qAuSBuM5dgRjZk,199
|
|
21
21
|
hwpx/oxml/table.py,sha256=pdO2TTAcbEC6Z4cnaOnB-bcmuZ1KVado7J3RiY_zOfE,193
|
|
22
22
|
hwpx/oxml/utils.py,sha256=to0yytS7vtLSvWl-dQyegT6MWClMK55b1Sp1uagEkI4,2591
|
|
23
|
-
hwpx/tools/__init__.py,sha256=
|
|
23
|
+
hwpx/tools/__init__.py,sha256=e1OaIVdbkmjTvLOzQ7qVRfuuQ1611225pNZByB2ln9w,1270
|
|
24
24
|
hwpx/tools/exporter.py,sha256=GcbNtV4rIWOJv5nBcgdX0yfkXQa-xQhfrCzXWgaNbTE,8862
|
|
25
25
|
hwpx/tools/object_finder.py,sha256=vbZ8FuIpGF-2vpbWDeZWi4UgZ2-3PK_ddQCs0oq1dRw,13440
|
|
26
|
+
hwpx/tools/package_validator.py,sha256=ZixhRjv_RYwBshk3NEJXyakRVsN7hM4WI47euvWFETU,7379
|
|
27
|
+
hwpx/tools/page_guard.py,sha256=nholL2cMv249yieVBWGqW3WHqkFR1qVutBVz59V_kYo,8351
|
|
28
|
+
hwpx/tools/text_extract_cli.py,sha256=pIBMIFuFX10IEegw7fQ3gtUbQyjNgbAUYkQWh2S3aQs,2150
|
|
26
29
|
hwpx/tools/text_extractor.py,sha256=r2OJRgDOiR6n14hXRcvkYuSFtEHpAV6jasHv-ZLHx1Y,24238
|
|
27
30
|
hwpx/tools/validator.py,sha256=KThqBQKKQfZkuLMGtzONbPkzy877-2FgT22FHPmt_gI,5979
|
|
28
31
|
hwpx/tools/_schemas/header.xsd,sha256=mJXuFMuHGT1JnFFaluUpYUglwjMCNlfbFCRVM26eHXE,664
|
|
29
32
|
hwpx/tools/_schemas/section.xsd,sha256=MgvavVHG05RDfUnVPxVU10H4FQOja5ON04_m9Uk_m7E,522
|
|
30
|
-
python_hwpx-2.
|
|
31
|
-
python_hwpx-2.
|
|
32
|
-
python_hwpx-2.
|
|
33
|
-
python_hwpx-2.
|
|
34
|
-
python_hwpx-2.
|
|
35
|
-
python_hwpx-2.
|
|
33
|
+
python_hwpx-2.6.dist-info/licenses/LICENSE,sha256=3F1-JUTcmjmxMpHGeB77ZzaSdhms3h8p1DBBa3lvV08,1609
|
|
34
|
+
python_hwpx-2.6.dist-info/METADATA,sha256=9LDnFWuCNTEqQpF_WMyH7Mm3Ah7uYYnGazGI3C8pE20,12974
|
|
35
|
+
python_hwpx-2.6.dist-info/WHEEL,sha256=YCfwYGOYMi5Jhw2fU4yNgwErybb2IX5PEwBKV4ZbdBo,91
|
|
36
|
+
python_hwpx-2.6.dist-info/entry_points.txt,sha256=KvwTIdfB-3OL8BEAmoiICdfqqndolqiET-zBEgbIyiM,216
|
|
37
|
+
python_hwpx-2.6.dist-info/top_level.txt,sha256=R1iToqDh80Nf2oQhRjTN0rbN2X6kyDUizIocZjkhuxc,5
|
|
38
|
+
python_hwpx-2.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|