python-hwpx 2.7__py3-none-any.whl → 2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hwpx/opc/package.py +62 -97
- hwpx/opc/relationships.py +227 -0
- hwpx/oxml/document.py +5 -2
- hwpx/tools/archive_cli.py +35 -11
- hwpx/tools/package_validator.py +239 -106
- hwpx/tools/page_guard.py +12 -40
- hwpx/tools/template_analyzer.py +35 -19
- hwpx/tools/text_extractor.py +44 -27
- {python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/METADATA +10 -3
- {python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/RECORD +14 -13
- {python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/WHEEL +0 -0
- {python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/entry_points.txt +0 -0
- {python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/licenses/LICENSE +0 -0
- {python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/top_level.txt +0 -0
hwpx/tools/package_validator.py
CHANGED
|
@@ -4,21 +4,25 @@ import argparse
|
|
|
4
4
|
import io
|
|
5
5
|
import xml.etree.ElementTree as ET
|
|
6
6
|
from dataclasses import dataclass
|
|
7
|
-
from pathlib import Path
|
|
8
|
-
from typing import BinaryIO, Sequence
|
|
7
|
+
from pathlib import Path, PurePosixPath
|
|
8
|
+
from typing import BinaryIO, Literal, Sequence
|
|
9
9
|
from zipfile import ZIP_STORED, BadZipFile, ZipFile
|
|
10
10
|
|
|
11
|
+
from ..opc.relationships import (
|
|
12
|
+
MAIN_ROOTFILE_MEDIA_TYPE,
|
|
13
|
+
is_section_part_name,
|
|
14
|
+
parse_container_rootfiles,
|
|
15
|
+
parse_manifest_relationships,
|
|
16
|
+
select_main_rootfile,
|
|
17
|
+
)
|
|
18
|
+
|
|
11
19
|
EXPECTED_MIMETYPE = "application/hwp+zip"
|
|
20
|
+
MIMETYPE_PATH = "mimetype"
|
|
12
21
|
CONTAINER_PATH = "META-INF/container.xml"
|
|
13
|
-
MANIFEST_PATH = "Contents/content.hpf"
|
|
14
22
|
HEADER_PATH = "Contents/header.xml"
|
|
15
23
|
VERSION_PATH = "version.xml"
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
CONTAINER_NS = {
|
|
19
|
-
"ct": "urn:oasis:names:tc:opendocument:xmlns:container",
|
|
20
|
-
"ocf": "urn:oasis:names:tc:opendocument:xmlns:container",
|
|
21
|
-
}
|
|
24
|
+
|
|
25
|
+
IssueLevel = Literal["error", "warning"]
|
|
22
26
|
|
|
23
27
|
__all__ = [
|
|
24
28
|
"PackageValidationIssue",
|
|
@@ -32,6 +36,11 @@ __all__ = [
|
|
|
32
36
|
class PackageValidationIssue:
|
|
33
37
|
part_name: str
|
|
34
38
|
message: str
|
|
39
|
+
level: IssueLevel = "error"
|
|
40
|
+
|
|
41
|
+
@property
|
|
42
|
+
def is_error(self) -> bool:
|
|
43
|
+
return self.level == "error"
|
|
35
44
|
|
|
36
45
|
def __str__(self) -> str: # pragma: no cover - human readable helper
|
|
37
46
|
return f"{self.part_name}: {self.message}"
|
|
@@ -42,9 +51,17 @@ class PackageValidationReport:
|
|
|
42
51
|
checked_parts: tuple[str, ...]
|
|
43
52
|
issues: tuple[PackageValidationIssue, ...]
|
|
44
53
|
|
|
54
|
+
@property
|
|
55
|
+
def errors(self) -> tuple[PackageValidationIssue, ...]:
|
|
56
|
+
return tuple(issue for issue in self.issues if issue.is_error)
|
|
57
|
+
|
|
58
|
+
@property
|
|
59
|
+
def warnings(self) -> tuple[PackageValidationIssue, ...]:
|
|
60
|
+
return tuple(issue for issue in self.issues if not issue.is_error)
|
|
61
|
+
|
|
45
62
|
@property
|
|
46
63
|
def ok(self) -> bool:
|
|
47
|
-
return not self.
|
|
64
|
+
return not self.errors
|
|
48
65
|
|
|
49
66
|
def __bool__(self) -> bool: # pragma: no cover - convenience alias
|
|
50
67
|
return self.ok
|
|
@@ -65,43 +82,31 @@ def _parse_xml(payload: bytes) -> ET.Element:
|
|
|
65
82
|
raise ValueError(f"malformed XML: {exc}") from exc
|
|
66
83
|
|
|
67
84
|
|
|
68
|
-
def
|
|
69
|
-
|
|
70
|
-
for namespace in CONTAINER_NS.values():
|
|
71
|
-
for elem in container_root.findall(f".//{{{namespace}}}rootfile"):
|
|
72
|
-
path = (
|
|
73
|
-
elem.get("full-path")
|
|
74
|
-
or elem.get("fullPath")
|
|
75
|
-
or elem.get("full_path")
|
|
76
|
-
)
|
|
77
|
-
if path:
|
|
78
|
-
paths.append(path)
|
|
79
|
-
return paths
|
|
85
|
+
def _error(issues: list[PackageValidationIssue], part_name: str, message: str) -> None:
|
|
86
|
+
issues.append(PackageValidationIssue(part_name, message, "error"))
|
|
80
87
|
|
|
81
88
|
|
|
82
|
-
def
|
|
83
|
-
|
|
84
|
-
for item in manifest_root.findall(".//opf:item", OPF_NS):
|
|
85
|
-
href = item.get("href")
|
|
86
|
-
if href:
|
|
87
|
-
hrefs.add(href)
|
|
88
|
-
return hrefs
|
|
89
|
+
def _warning(issues: list[PackageValidationIssue], part_name: str, message: str) -> None:
|
|
90
|
+
issues.append(PackageValidationIssue(part_name, message, "warning"))
|
|
89
91
|
|
|
90
92
|
|
|
91
|
-
def
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
if item_id and href:
|
|
98
|
-
id_to_href[item_id] = href
|
|
93
|
+
def _safe_read(zf: ZipFile, part_name: str) -> bytes | None:
|
|
94
|
+
try:
|
|
95
|
+
return zf.read(part_name)
|
|
96
|
+
except (BadZipFile, KeyError, OSError):
|
|
97
|
+
return None
|
|
98
|
+
|
|
99
99
|
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
100
|
+
def _fallback_named_parts(names: set[str], *, token: str, extra_token: str | None = None) -> list[str]:
|
|
101
|
+
matches: list[str] = []
|
|
102
|
+
for name in sorted(names):
|
|
103
|
+
part_name = PurePosixPath(name).name.lower()
|
|
104
|
+
if token not in part_name:
|
|
105
|
+
continue
|
|
106
|
+
if extra_token is not None and extra_token not in part_name:
|
|
107
|
+
continue
|
|
108
|
+
matches.append(name)
|
|
109
|
+
return matches
|
|
105
110
|
|
|
106
111
|
|
|
107
112
|
def validate_package(source: str | Path | bytes | BinaryIO) -> PackageValidationReport:
|
|
@@ -117,101 +122,229 @@ def validate_package(source: str | Path | bytes | BinaryIO) -> PackageValidation
|
|
|
117
122
|
)
|
|
118
123
|
|
|
119
124
|
with archive as zf:
|
|
120
|
-
|
|
125
|
+
infos = [info for info in zf.infolist() if not info.is_dir()]
|
|
126
|
+
names = [info.filename for info in infos]
|
|
127
|
+
name_set = set(names)
|
|
121
128
|
checked_parts.extend(names)
|
|
122
129
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
issues.append(PackageValidationIssue(required, "missing required file"))
|
|
126
|
-
|
|
127
|
-
if not names:
|
|
128
|
-
issues.append(PackageValidationIssue("archive", "empty archive"))
|
|
130
|
+
if not infos:
|
|
131
|
+
_error(issues, "archive", "empty archive")
|
|
129
132
|
return PackageValidationReport(tuple(checked_parts), tuple(issues))
|
|
130
133
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
134
|
+
bad_entry = zf.testzip()
|
|
135
|
+
if bad_entry is not None:
|
|
136
|
+
_error(issues, bad_entry, "ZIP CRC/integrity check failed")
|
|
137
|
+
|
|
138
|
+
if MIMETYPE_PATH not in name_set:
|
|
139
|
+
_error(issues, MIMETYPE_PATH, "missing required file")
|
|
140
|
+
else:
|
|
141
|
+
mimetype_bytes = _safe_read(zf, MIMETYPE_PATH)
|
|
142
|
+
if mimetype_bytes is None:
|
|
143
|
+
_error(issues, MIMETYPE_PATH, "unable to read entry for integrity validation")
|
|
144
|
+
else:
|
|
145
|
+
try:
|
|
146
|
+
mimetype = mimetype_bytes.decode("utf-8").strip()
|
|
147
|
+
except UnicodeDecodeError:
|
|
148
|
+
mimetype = "<binary>"
|
|
149
|
+
if mimetype != EXPECTED_MIMETYPE:
|
|
150
|
+
_error(
|
|
151
|
+
issues,
|
|
152
|
+
MIMETYPE_PATH,
|
|
140
153
|
f"expected {EXPECTED_MIMETYPE!r}, got {mimetype!r}",
|
|
141
154
|
)
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
155
|
+
if infos[0].filename != MIMETYPE_PATH:
|
|
156
|
+
_error(issues, MIMETYPE_PATH, "must be the first ZIP entry")
|
|
157
|
+
if zf.getinfo(MIMETYPE_PATH).compress_type != ZIP_STORED:
|
|
158
|
+
_error(issues, MIMETYPE_PATH, "must use ZIP_STORED")
|
|
159
|
+
|
|
160
|
+
if CONTAINER_PATH not in name_set:
|
|
161
|
+
_error(issues, CONTAINER_PATH, "missing required file")
|
|
162
|
+
if VERSION_PATH not in name_set:
|
|
163
|
+
_error(issues, VERSION_PATH, "missing required file under current engine semantics")
|
|
147
164
|
|
|
148
165
|
xml_roots: dict[str, ET.Element] = {}
|
|
149
166
|
for name in names:
|
|
150
167
|
if not (name.endswith(".xml") or name.endswith(".hpf")):
|
|
151
168
|
continue
|
|
169
|
+
payload = _safe_read(zf, name)
|
|
170
|
+
if payload is None:
|
|
171
|
+
_error(issues, name, "unable to read entry for XML parsing")
|
|
172
|
+
continue
|
|
152
173
|
try:
|
|
153
|
-
xml_roots[name] = _parse_xml(
|
|
174
|
+
xml_roots[name] = _parse_xml(payload)
|
|
154
175
|
except ValueError as exc:
|
|
155
|
-
issues
|
|
176
|
+
_error(issues, name, str(exc))
|
|
156
177
|
|
|
157
178
|
container_root = xml_roots.get(CONTAINER_PATH)
|
|
158
|
-
if container_root is
|
|
159
|
-
|
|
160
|
-
if not rootfiles:
|
|
161
|
-
issues.append(PackageValidationIssue(CONTAINER_PATH, "declares no rootfile entries"))
|
|
162
|
-
for rootfile in rootfiles:
|
|
163
|
-
if rootfile not in names:
|
|
164
|
-
issues.append(
|
|
165
|
-
PackageValidationIssue(
|
|
166
|
-
CONTAINER_PATH,
|
|
167
|
-
f"rootfile points to missing part {rootfile!r}",
|
|
168
|
-
)
|
|
169
|
-
)
|
|
179
|
+
if container_root is None:
|
|
180
|
+
return PackageValidationReport(tuple(checked_parts), tuple(issues))
|
|
170
181
|
|
|
171
|
-
|
|
172
|
-
if
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
+
rootfiles = parse_container_rootfiles(container_root)
|
|
183
|
+
if not rootfiles:
|
|
184
|
+
_error(issues, CONTAINER_PATH, "declares no rootfile entries")
|
|
185
|
+
return PackageValidationReport(tuple(checked_parts), tuple(issues))
|
|
186
|
+
|
|
187
|
+
for rootfile in rootfiles:
|
|
188
|
+
if rootfile.full_path not in name_set:
|
|
189
|
+
_error(
|
|
190
|
+
issues,
|
|
191
|
+
CONTAINER_PATH,
|
|
192
|
+
f"rootfile points to missing part {rootfile.full_path!r}",
|
|
193
|
+
)
|
|
194
|
+
|
|
195
|
+
selected_rootfile, used_rootfile_fallback = select_main_rootfile(rootfiles)
|
|
196
|
+
if selected_rootfile is None:
|
|
197
|
+
return PackageValidationReport(tuple(checked_parts), tuple(issues))
|
|
198
|
+
if used_rootfile_fallback:
|
|
199
|
+
_warning(
|
|
200
|
+
issues,
|
|
201
|
+
CONTAINER_PATH,
|
|
202
|
+
"no rootfile is marked as "
|
|
203
|
+
f"{MAIN_ROOTFILE_MEDIA_TYPE!r}; engine will use the first declaration "
|
|
204
|
+
f"{selected_rootfile.full_path!r}",
|
|
205
|
+
)
|
|
206
|
+
|
|
207
|
+
manifest_root = xml_roots.get(selected_rootfile.full_path)
|
|
208
|
+
if manifest_root is None:
|
|
209
|
+
_error(
|
|
210
|
+
issues,
|
|
211
|
+
selected_rootfile.full_path,
|
|
212
|
+
"selected main rootfile is missing or not well-formed XML",
|
|
213
|
+
)
|
|
214
|
+
return PackageValidationReport(tuple(checked_parts), tuple(issues))
|
|
215
|
+
|
|
216
|
+
relationships = parse_manifest_relationships(
|
|
217
|
+
manifest_root,
|
|
218
|
+
selected_rootfile.full_path,
|
|
219
|
+
known_parts=name_set,
|
|
220
|
+
)
|
|
182
221
|
|
|
183
|
-
|
|
184
|
-
if not
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
222
|
+
for item in relationships.items:
|
|
223
|
+
if item.resolved_path not in name_set:
|
|
224
|
+
_error(
|
|
225
|
+
issues,
|
|
226
|
+
selected_rootfile.full_path,
|
|
227
|
+
f"manifest href missing from archive: {item.href!r} -> {item.resolved_path!r}",
|
|
228
|
+
)
|
|
229
|
+
|
|
230
|
+
for idref in relationships.dangling_idrefs:
|
|
231
|
+
_warning(
|
|
232
|
+
issues,
|
|
233
|
+
selected_rootfile.full_path,
|
|
234
|
+
f"spine itemref references missing manifest id {idref!r}",
|
|
235
|
+
)
|
|
236
|
+
|
|
237
|
+
section_paths = [path for path in relationships.spine_paths if is_section_part_name(path)]
|
|
238
|
+
if section_paths:
|
|
239
|
+
for path in section_paths:
|
|
240
|
+
if path not in name_set:
|
|
241
|
+
_error(
|
|
242
|
+
issues,
|
|
243
|
+
selected_rootfile.full_path,
|
|
244
|
+
f"spine section part missing from archive: {path!r}",
|
|
193
245
|
)
|
|
246
|
+
else:
|
|
247
|
+
fallback_sections = [name for name in sorted(name_set) if is_section_part_name(name)]
|
|
248
|
+
if fallback_sections:
|
|
249
|
+
_warning(
|
|
250
|
+
issues,
|
|
251
|
+
selected_rootfile.full_path,
|
|
252
|
+
"manifest spine does not resolve any section parts; engine will fall back "
|
|
253
|
+
"to filename-based section discovery",
|
|
254
|
+
)
|
|
255
|
+
else:
|
|
256
|
+
_error(
|
|
257
|
+
issues,
|
|
258
|
+
selected_rootfile.full_path,
|
|
259
|
+
"no section parts found in manifest spine or archive fallback",
|
|
260
|
+
)
|
|
194
261
|
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
262
|
+
if not relationships.header_paths and HEADER_PATH in name_set:
|
|
263
|
+
_warning(
|
|
264
|
+
issues,
|
|
265
|
+
selected_rootfile.full_path,
|
|
266
|
+
"manifest spine does not resolve a header part; engine will fall back to "
|
|
267
|
+
f"{HEADER_PATH!r}",
|
|
268
|
+
)
|
|
269
|
+
|
|
270
|
+
for path in relationships.header_paths:
|
|
271
|
+
if path not in name_set:
|
|
272
|
+
_error(
|
|
273
|
+
issues,
|
|
274
|
+
selected_rootfile.full_path,
|
|
275
|
+
f"header part missing from archive: {path!r}",
|
|
276
|
+
)
|
|
277
|
+
|
|
278
|
+
if not relationships.master_page_paths:
|
|
279
|
+
fallback_master_pages = _fallback_named_parts(name_set, token="master", extra_token="page")
|
|
280
|
+
if fallback_master_pages:
|
|
281
|
+
_warning(
|
|
282
|
+
issues,
|
|
283
|
+
selected_rootfile.full_path,
|
|
284
|
+
"manifest does not reference masterPage parts; engine will fall back to "
|
|
285
|
+
"filename-based discovery",
|
|
286
|
+
)
|
|
287
|
+
for path in relationships.master_page_paths:
|
|
288
|
+
if path not in name_set:
|
|
289
|
+
_error(
|
|
290
|
+
issues,
|
|
291
|
+
selected_rootfile.full_path,
|
|
292
|
+
f"masterPage part missing from archive: {path!r}",
|
|
293
|
+
)
|
|
294
|
+
|
|
295
|
+
if not relationships.history_paths:
|
|
296
|
+
fallback_histories = _fallback_named_parts(name_set, token="history")
|
|
297
|
+
if fallback_histories:
|
|
298
|
+
_warning(
|
|
299
|
+
issues,
|
|
300
|
+
selected_rootfile.full_path,
|
|
301
|
+
"manifest does not reference history parts; engine will fall back to "
|
|
302
|
+
"filename-based discovery",
|
|
303
|
+
)
|
|
304
|
+
for path in relationships.history_paths:
|
|
305
|
+
if path not in name_set:
|
|
306
|
+
_error(
|
|
307
|
+
issues,
|
|
308
|
+
selected_rootfile.full_path,
|
|
309
|
+
f"history part missing from archive: {path!r}",
|
|
198
310
|
)
|
|
199
311
|
|
|
312
|
+
if relationships.version_path is None and VERSION_PATH in name_set:
|
|
313
|
+
_warning(
|
|
314
|
+
issues,
|
|
315
|
+
selected_rootfile.full_path,
|
|
316
|
+
"manifest does not reference a version part; engine will fall back to "
|
|
317
|
+
f"{VERSION_PATH!r}",
|
|
318
|
+
)
|
|
319
|
+
elif relationships.version_path is not None and relationships.version_path not in name_set:
|
|
320
|
+
_error(
|
|
321
|
+
issues,
|
|
322
|
+
selected_rootfile.full_path,
|
|
323
|
+
f"manifest version part missing from archive: {relationships.version_path!r}",
|
|
324
|
+
)
|
|
325
|
+
|
|
200
326
|
return PackageValidationReport(tuple(checked_parts), tuple(issues))
|
|
201
327
|
|
|
202
328
|
|
|
203
329
|
def main(argv: Sequence[str] | None = None) -> int:
|
|
204
|
-
parser = argparse.ArgumentParser(
|
|
330
|
+
parser = argparse.ArgumentParser(
|
|
331
|
+
description="Validate HWPX package structure using engine-aligned ZIP/container/manifest checks"
|
|
332
|
+
)
|
|
205
333
|
parser.add_argument("source", help="Path to the HWPX file")
|
|
206
334
|
args = parser.parse_args(argv)
|
|
207
335
|
|
|
208
336
|
report = validate_package(args.source)
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
337
|
+
for issue in report.issues:
|
|
338
|
+
prefix = "ERROR" if issue.is_error else "WARN"
|
|
339
|
+
print(f"{prefix}: {issue}")
|
|
340
|
+
|
|
341
|
+
if report.errors:
|
|
212
342
|
return 1
|
|
213
343
|
|
|
214
|
-
|
|
344
|
+
if report.warnings:
|
|
345
|
+
print("Package validation passed with warnings.")
|
|
346
|
+
else:
|
|
347
|
+
print("All package validations passed.")
|
|
215
348
|
return 0
|
|
216
349
|
|
|
217
350
|
|
hwpx/tools/page_guard.py
CHANGED
|
@@ -7,14 +7,14 @@ textual metrics that often correlate with page-layout drift.
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
9
|
import argparse
|
|
10
|
-
import io
|
|
11
10
|
import json
|
|
12
11
|
from dataclasses import asdict, dataclass
|
|
13
12
|
from pathlib import Path
|
|
14
13
|
from typing import BinaryIO, Iterable, Sequence
|
|
15
|
-
from zipfile import ZipFile
|
|
16
14
|
|
|
17
|
-
from lxml import etree
|
|
15
|
+
from lxml import etree # type: ignore[reportAttributeAccessIssue]
|
|
16
|
+
|
|
17
|
+
from ..opc.package import HwpxPackage
|
|
18
18
|
|
|
19
19
|
NS = {
|
|
20
20
|
"hp": "http://www.hancom.co.kr/hwpml/2011/paragraph",
|
|
@@ -63,31 +63,6 @@ class DocumentMetrics:
|
|
|
63
63
|
paragraph_text_lengths: list[int]
|
|
64
64
|
|
|
65
65
|
|
|
66
|
-
def _section_files(zf: ZipFile) -> list[str]:
|
|
67
|
-
try:
|
|
68
|
-
root = etree.fromstring(zf.read("Contents/content.hpf"))
|
|
69
|
-
except KeyError:
|
|
70
|
-
return [
|
|
71
|
-
name
|
|
72
|
-
for name in zf.namelist()
|
|
73
|
-
if name.startswith("Contents/section") and name.endswith(".xml")
|
|
74
|
-
]
|
|
75
|
-
|
|
76
|
-
id_to_href: dict[str, str] = {}
|
|
77
|
-
for item in root.findall(".//opf:item", namespaces=NS):
|
|
78
|
-
item_id = item.get("id")
|
|
79
|
-
href = item.get("href")
|
|
80
|
-
if item_id and href:
|
|
81
|
-
id_to_href[item_id] = href
|
|
82
|
-
|
|
83
|
-
files: list[str] = []
|
|
84
|
-
for itemref in root.findall(".//opf:itemref", namespaces=NS):
|
|
85
|
-
idref = itemref.get("idref")
|
|
86
|
-
if idref and idref in id_to_href:
|
|
87
|
-
files.append(id_to_href[idref])
|
|
88
|
-
return files
|
|
89
|
-
|
|
90
|
-
|
|
91
66
|
def _text_of_t_node(node: etree._Element) -> str:
|
|
92
67
|
return "".join(node.itertext())
|
|
93
68
|
|
|
@@ -99,16 +74,9 @@ def _local_name(tag: str) -> str:
|
|
|
99
74
|
|
|
100
75
|
|
|
101
76
|
def _iter_section_roots(source: str | Path | bytes | BinaryIO) -> Iterable[etree._Element]:
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
archive = ZipFile(source, "r")
|
|
106
|
-
|
|
107
|
-
try:
|
|
108
|
-
for name in _section_files(archive):
|
|
109
|
-
yield etree.fromstring(archive.read(name))
|
|
110
|
-
finally:
|
|
111
|
-
archive.close()
|
|
77
|
+
package = HwpxPackage.open(source)
|
|
78
|
+
for name in package.section_paths():
|
|
79
|
+
yield package.get_xml(name)
|
|
112
80
|
|
|
113
81
|
|
|
114
82
|
def collect_metrics(source: str | Path | bytes | BinaryIO) -> DocumentMetrics:
|
|
@@ -273,8 +241,12 @@ def main(argv: Sequence[str] | None = None) -> int:
|
|
|
273
241
|
parser.add_argument("--json", action="store_true", help="Print collected metrics as JSON")
|
|
274
242
|
args = parser.parse_args(argv)
|
|
275
243
|
|
|
276
|
-
|
|
277
|
-
|
|
244
|
+
try:
|
|
245
|
+
reference = collect_metrics(args.reference)
|
|
246
|
+
output = collect_metrics(args.output)
|
|
247
|
+
except Exception as exc:
|
|
248
|
+
print(f"ERROR: {exc}")
|
|
249
|
+
return 1
|
|
278
250
|
|
|
279
251
|
if args.json:
|
|
280
252
|
print(
|
hwpx/tools/template_analyzer.py
CHANGED
|
@@ -3,11 +3,13 @@ from __future__ import annotations
|
|
|
3
3
|
import argparse
|
|
4
4
|
import json
|
|
5
5
|
from dataclasses import asdict, dataclass
|
|
6
|
-
from pathlib import Path
|
|
6
|
+
from pathlib import Path, PurePosixPath
|
|
7
7
|
from typing import Sequence
|
|
8
8
|
from xml.etree import ElementTree as ET
|
|
9
9
|
|
|
10
10
|
from ..opc.package import HwpxPackage
|
|
11
|
+
from ..opc.relationships import parse_manifest_relationships
|
|
12
|
+
from .archive_cli import unpack_hwpx
|
|
11
13
|
from .page_guard import DocumentMetrics, collect_metrics
|
|
12
14
|
|
|
13
15
|
_HH_NS = "http://www.hancom.co.kr/hwpml/2011/head"
|
|
@@ -36,8 +38,12 @@ class TemplateAnalysis:
|
|
|
36
38
|
part_names: tuple[str, ...]
|
|
37
39
|
rootfiles: tuple[str, ...]
|
|
38
40
|
manifest_path: str
|
|
41
|
+
manifest_item_paths: tuple[str, ...]
|
|
39
42
|
header_paths: tuple[str, ...]
|
|
40
43
|
section_paths: tuple[str, ...]
|
|
44
|
+
master_page_paths: tuple[str, ...]
|
|
45
|
+
history_paths: tuple[str, ...]
|
|
46
|
+
bin_data_paths: tuple[str, ...]
|
|
41
47
|
version_path: str | None
|
|
42
48
|
header_summary: HeaderSummary
|
|
43
49
|
proxy_metrics: DocumentMetrics
|
|
@@ -59,23 +65,36 @@ def _summarize_header(element: ET.Element | None) -> HeaderSummary:
|
|
|
59
65
|
)
|
|
60
66
|
|
|
61
67
|
|
|
68
|
+
def _is_bindata_path(path: str) -> bool:
|
|
69
|
+
return any(part.lower() == "bindata" for part in PurePosixPath(path).parts)
|
|
70
|
+
|
|
71
|
+
|
|
62
72
|
def analyze_template(source: str | Path) -> TemplateAnalysis:
|
|
63
73
|
source_path = Path(source)
|
|
64
74
|
package = HwpxPackage.open(source_path)
|
|
75
|
+
relationships = parse_manifest_relationships(
|
|
76
|
+
package.manifest_tree(),
|
|
77
|
+
package.main_content.full_path,
|
|
78
|
+
known_parts=package.part_names(),
|
|
79
|
+
)
|
|
65
80
|
|
|
66
81
|
header_paths = tuple(package.header_paths())
|
|
67
82
|
header_xml = package.get_xml(header_paths[0]) if header_paths else None
|
|
68
|
-
manifest_path = package.main_content.full_path
|
|
69
|
-
version_path = package.version_path()
|
|
70
83
|
|
|
71
84
|
return TemplateAnalysis(
|
|
72
85
|
source_name=source_path.name,
|
|
73
86
|
part_names=tuple(package.part_names()),
|
|
74
87
|
rootfiles=tuple(rootfile.full_path for rootfile in package.iter_rootfiles()),
|
|
75
|
-
manifest_path=
|
|
88
|
+
manifest_path=package.main_content.full_path,
|
|
89
|
+
manifest_item_paths=tuple(item.resolved_path for item in relationships.items),
|
|
76
90
|
header_paths=header_paths,
|
|
77
91
|
section_paths=tuple(package.section_paths()),
|
|
78
|
-
|
|
92
|
+
master_page_paths=tuple(package.master_page_paths()),
|
|
93
|
+
history_paths=tuple(package.history_paths()),
|
|
94
|
+
bin_data_paths=tuple(
|
|
95
|
+
item.resolved_path for item in relationships.items if _is_bindata_path(item.resolved_path)
|
|
96
|
+
),
|
|
97
|
+
version_path=package.version_path(),
|
|
79
98
|
header_summary=_summarize_header(header_xml),
|
|
80
99
|
proxy_metrics=collect_metrics(source_path),
|
|
81
100
|
)
|
|
@@ -100,18 +119,9 @@ def extract_template_parts(
|
|
|
100
119
|
written: list[Path] = []
|
|
101
120
|
|
|
102
121
|
if extract_dir is not None:
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
written.append(
|
|
106
|
-
for part_name in package.header_paths():
|
|
107
|
-
written.append(_write_part(package, part_name, root / part_name))
|
|
108
|
-
for part_name in package.section_paths():
|
|
109
|
-
written.append(_write_part(package, part_name, root / part_name))
|
|
110
|
-
version_path = package.version_path()
|
|
111
|
-
if version_path and package.has_part(version_path):
|
|
112
|
-
written.append(_write_part(package, version_path, root / version_path))
|
|
113
|
-
if package.has_part(package.CONTAINER_PATH):
|
|
114
|
-
written.append(_write_part(package, package.CONTAINER_PATH, root / package.CONTAINER_PATH))
|
|
122
|
+
result = unpack_hwpx(source_path, extract_dir, pretty_xml=False)
|
|
123
|
+
written.extend(result.output_dir / entry.path for entry in result.entries)
|
|
124
|
+
written.append(result.metadata_path)
|
|
115
125
|
|
|
116
126
|
if extract_header is not None:
|
|
117
127
|
header_paths = package.header_paths()
|
|
@@ -141,6 +151,9 @@ def _print_summary(analysis: TemplateAnalysis) -> None:
|
|
|
141
151
|
print(f"rootfiles: {', '.join(analysis.rootfiles) or '(none)'}")
|
|
142
152
|
print(f"headers: {', '.join(analysis.header_paths) or '(none)'}")
|
|
143
153
|
print(f"sections: {', '.join(analysis.section_paths) or '(none)'}")
|
|
154
|
+
print(f"masterPages: {', '.join(analysis.master_page_paths) or '(none)'}")
|
|
155
|
+
print(f"histories: {', '.join(analysis.history_paths) or '(none)'}")
|
|
156
|
+
print(f"BinData: {', '.join(analysis.bin_data_paths) or '(none)'}")
|
|
144
157
|
if analysis.version_path:
|
|
145
158
|
print(f"version part: {analysis.version_path}")
|
|
146
159
|
print(
|
|
@@ -163,14 +176,17 @@ def _print_summary(analysis: TemplateAnalysis) -> None:
|
|
|
163
176
|
|
|
164
177
|
def main(argv: Sequence[str] | None = None) -> int:
|
|
165
178
|
parser = argparse.ArgumentParser(
|
|
166
|
-
description="Analyze a reference HWPX template for template-preserving workflows"
|
|
179
|
+
description="Analyze a reference HWPX template for pack-ready, template-preserving workflows"
|
|
167
180
|
)
|
|
168
181
|
parser.add_argument("input", help="Input HWPX path")
|
|
169
182
|
parser.add_argument("--json", action="store_true", help="Print machine-readable JSON summary")
|
|
170
183
|
parser.add_argument("--output-json", help="Write the JSON summary to a file")
|
|
171
184
|
parser.add_argument(
|
|
172
185
|
"--extract-dir",
|
|
173
|
-
help=
|
|
186
|
+
help=(
|
|
187
|
+
"Create a pack-ready extracted workspace that preserves archive-relative paths "
|
|
188
|
+
"and hwpx-pack metadata"
|
|
189
|
+
),
|
|
174
190
|
)
|
|
175
191
|
parser.add_argument("--extract-header", help="Copy the first header.xml part to a path")
|
|
176
192
|
parser.add_argument("--extract-section", help="Copy the first section XML part to a path")
|