PyPI - python-hwpx - Versions diffs - 2.7__py3-none-any.whl → 2.8__py3-none-any.whl - Mend

python-hwpx 2.7__py3-none-any.whl → 2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

hwpx/opc/package.py +62 -97
hwpx/opc/relationships.py +227 -0
hwpx/oxml/document.py +5 -2
hwpx/tools/archive_cli.py +35 -11
hwpx/tools/package_validator.py +239 -106
hwpx/tools/page_guard.py +12 -40
hwpx/tools/template_analyzer.py +35 -19
hwpx/tools/text_extractor.py +44 -27
{python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/METADATA +10 -3
{python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/RECORD +14 -13
{python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/WHEEL +0 -0
{python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/entry_points.txt +0 -0
{python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/licenses/LICENSE +0 -0
{python_hwpx-2.7.dist-info → python_hwpx-2.8.dist-info}/top_level.txt +0 -0

hwpx/tools/package_validator.py CHANGED Viewed

@@ -4,21 +4,25 @@ import argparse
 import io
 import xml.etree.ElementTree as ET
 from dataclasses import dataclass
-from pathlib import Path
-from typing import BinaryIO, Sequence
+from pathlib import Path, PurePosixPath
+from typing import BinaryIO, Literal, Sequence
 from zipfile import ZIP_STORED, BadZipFile, ZipFile
+from ..opc.relationships import (
+    MAIN_ROOTFILE_MEDIA_TYPE,
+    is_section_part_name,
+    parse_container_rootfiles,
+    parse_manifest_relationships,
+    select_main_rootfile,
+)
 EXPECTED_MIMETYPE = "application/hwp+zip"
+MIMETYPE_PATH = "mimetype"
 CONTAINER_PATH = "META-INF/container.xml"
-MANIFEST_PATH = "Contents/content.hpf"
 HEADER_PATH = "Contents/header.xml"
 VERSION_PATH = "version.xml"
-REQUIRED_CORE_FILES = ("mimetype", CONTAINER_PATH, MANIFEST_PATH, HEADER_PATH, VERSION_PATH)
-OPF_NS = {"opf": "http://www.idpf.org/2007/opf/"}
-CONTAINER_NS = {
-    "ct": "urn:oasis:names:tc:opendocument:xmlns:container",
-    "ocf": "urn:oasis:names:tc:opendocument:xmlns:container",
-}
+IssueLevel = Literal["error", "warning"]
 __all__ = [
     "PackageValidationIssue",
@@ -32,6 +36,11 @@ __all__ = [
 class PackageValidationIssue:
     part_name: str
     message: str
+    level: IssueLevel = "error"
+    @property
+    def is_error(self) -> bool:
+        return self.level == "error"
     def __str__(self) -> str:  # pragma: no cover - human readable helper
         return f"{self.part_name}: {self.message}"
@@ -42,9 +51,17 @@ class PackageValidationReport:
     checked_parts: tuple[str, ...]
     issues: tuple[PackageValidationIssue, ...]
+    @property
+    def errors(self) -> tuple[PackageValidationIssue, ...]:
+        return tuple(issue for issue in self.issues if issue.is_error)
+    @property
+    def warnings(self) -> tuple[PackageValidationIssue, ...]:
+        return tuple(issue for issue in self.issues if not issue.is_error)
     @property
     def ok(self) -> bool:
-        return not self.issues
+        return not self.errors
     def __bool__(self) -> bool:  # pragma: no cover - convenience alias
         return self.ok
@@ -65,43 +82,31 @@ def _parse_xml(payload: bytes) -> ET.Element:
         raise ValueError(f"malformed XML: {exc}") from exc
-def _container_rootfiles(container_root: ET.Element) -> list[str]:
-    paths: list[str] = []
-    for namespace in CONTAINER_NS.values():
-        for elem in container_root.findall(f".//{{{namespace}}}rootfile"):
-            path = (
-                elem.get("full-path")
-                or elem.get("fullPath")
-                or elem.get("full_path")
-            )
-            if path:
-                paths.append(path)
-    return paths
+def _error(issues: list[PackageValidationIssue], part_name: str, message: str) -> None:
+    issues.append(PackageValidationIssue(part_name, message, "error"))
-def _manifest_hrefs(manifest_root: ET.Element) -> set[str]:
-    hrefs: set[str] = set()
-    for item in manifest_root.findall(".//opf:item", OPF_NS):
-        href = item.get("href")
-        if href:
-            hrefs.add(href)
-    return hrefs
+def _warning(issues: list[PackageValidationIssue], part_name: str, message: str) -> None:
+    issues.append(PackageValidationIssue(part_name, message, "warning"))
-def _spine_hrefs(manifest_root: ET.Element) -> list[str]:
-    hrefs: list[str] = []
-    id_to_href: dict[str, str] = {}
-    for item in manifest_root.findall(".//opf:item", OPF_NS):
-        item_id = item.get("id")
-        href = item.get("href")
-        if item_id and href:
-            id_to_href[item_id] = href
+def _safe_read(zf: ZipFile, part_name: str) -> bytes | None:
+    try:
+        return zf.read(part_name)
+    except (BadZipFile, KeyError, OSError):
+        return None
-    for itemref in manifest_root.findall(".//opf:itemref", OPF_NS):
-        idref = itemref.get("idref")
-        if idref and idref in id_to_href:
-            hrefs.append(id_to_href[idref])
-    return hrefs
+def _fallback_named_parts(names: set[str], *, token: str, extra_token: str | None = None) -> list[str]:
+    matches: list[str] = []
+    for name in sorted(names):
+        part_name = PurePosixPath(name).name.lower()
+        if token not in part_name:
+            continue
+        if extra_token is not None and extra_token not in part_name:
+            continue
+        matches.append(name)
+    return matches
 def validate_package(source: str | Path | bytes | BinaryIO) -> PackageValidationReport:
@@ -117,101 +122,229 @@ def validate_package(source: str | Path | bytes | BinaryIO) -> PackageValidation
         )
     with archive as zf:
-        names = zf.namelist()
+        infos = [info for info in zf.infolist() if not info.is_dir()]
+        names = [info.filename for info in infos]
+        name_set = set(names)
         checked_parts.extend(names)
-        for required in REQUIRED_CORE_FILES:
-            if required not in names:
-                issues.append(PackageValidationIssue(required, "missing required file"))
-        if not names:
-            issues.append(PackageValidationIssue("archive", "empty archive"))
+        if not infos:
+            _error(issues, "archive", "empty archive")
             return PackageValidationReport(tuple(checked_parts), tuple(issues))
-        if "mimetype" in names:
-            try:
-                mimetype = zf.read("mimetype").decode("utf-8").strip()
-            except UnicodeDecodeError:
-                mimetype = "<binary>"
-            if mimetype != EXPECTED_MIMETYPE:
-                issues.append(
-                    PackageValidationIssue(
-                        "mimetype",
+        bad_entry = zf.testzip()
+        if bad_entry is not None:
+            _error(issues, bad_entry, "ZIP CRC/integrity check failed")
+        if MIMETYPE_PATH not in name_set:
+            _error(issues, MIMETYPE_PATH, "missing required file")
+        else:
+            mimetype_bytes = _safe_read(zf, MIMETYPE_PATH)
+            if mimetype_bytes is None:
+                _error(issues, MIMETYPE_PATH, "unable to read entry for integrity validation")
+            else:
+                try:
+                    mimetype = mimetype_bytes.decode("utf-8").strip()
+                except UnicodeDecodeError:
+                    mimetype = "<binary>"
+                if mimetype != EXPECTED_MIMETYPE:
+                    _error(
+                        issues,
+                        MIMETYPE_PATH,
                         f"expected {EXPECTED_MIMETYPE!r}, got {mimetype!r}",
                     )
-                )
-            if names[0] != "mimetype":
-                issues.append(PackageValidationIssue("mimetype", "must be the first ZIP entry"))
-            if zf.getinfo("mimetype").compress_type != ZIP_STORED:
-                issues.append(PackageValidationIssue("mimetype", "must use ZIP_STORED"))
+                if infos[0].filename != MIMETYPE_PATH:
+                    _error(issues, MIMETYPE_PATH, "must be the first ZIP entry")
+                if zf.getinfo(MIMETYPE_PATH).compress_type != ZIP_STORED:
+                    _error(issues, MIMETYPE_PATH, "must use ZIP_STORED")
+        if CONTAINER_PATH not in name_set:
+            _error(issues, CONTAINER_PATH, "missing required file")
+        if VERSION_PATH not in name_set:
+            _error(issues, VERSION_PATH, "missing required file under current engine semantics")
         xml_roots: dict[str, ET.Element] = {}
         for name in names:
             if not (name.endswith(".xml") or name.endswith(".hpf")):
                 continue
+            payload = _safe_read(zf, name)
+            if payload is None:
+                _error(issues, name, "unable to read entry for XML parsing")
+                continue
             try:
-                xml_roots[name] = _parse_xml(zf.read(name))
+                xml_roots[name] = _parse_xml(payload)
             except ValueError as exc:
-                issues.append(PackageValidationIssue(name, str(exc)))
+                _error(issues, name, str(exc))
         container_root = xml_roots.get(CONTAINER_PATH)
-        if container_root is not None:
-            rootfiles = _container_rootfiles(container_root)
-            if not rootfiles:
-                issues.append(PackageValidationIssue(CONTAINER_PATH, "declares no rootfile entries"))
-            for rootfile in rootfiles:
-                if rootfile not in names:
-                    issues.append(
-                        PackageValidationIssue(
-                            CONTAINER_PATH,
-                            f"rootfile points to missing part {rootfile!r}",
-                        )
-                    )
+        if container_root is None:
+            return PackageValidationReport(tuple(checked_parts), tuple(issues))
-        manifest_root = xml_roots.get(MANIFEST_PATH)
-        if manifest_root is not None:
-            hrefs = _manifest_hrefs(manifest_root)
-            for href in sorted(hrefs):
-                if href not in names:
-                    issues.append(
-                        PackageValidationIssue(
-                            MANIFEST_PATH,
-                            f"manifest href missing from archive: {href}",
-                        )
-                    )
+        rootfiles = parse_container_rootfiles(container_root)
+        if not rootfiles:
+            _error(issues, CONTAINER_PATH, "declares no rootfile entries")
+            return PackageValidationReport(tuple(checked_parts), tuple(issues))
+        for rootfile in rootfiles:
+            if rootfile.full_path not in name_set:
+                _error(
+                    issues,
+                    CONTAINER_PATH,
+                    f"rootfile points to missing part {rootfile.full_path!r}",
+                )
+        selected_rootfile, used_rootfile_fallback = select_main_rootfile(rootfiles)
+        if selected_rootfile is None:
+            return PackageValidationReport(tuple(checked_parts), tuple(issues))
+        if used_rootfile_fallback:
+            _warning(
+                issues,
+                CONTAINER_PATH,
+                "no rootfile is marked as "
+                f"{MAIN_ROOTFILE_MEDIA_TYPE!r}; engine will use the first declaration "
+                f"{selected_rootfile.full_path!r}",
+            )
+        manifest_root = xml_roots.get(selected_rootfile.full_path)
+        if manifest_root is None:
+            _error(
+                issues,
+                selected_rootfile.full_path,
+                "selected main rootfile is missing or not well-formed XML",
+            )
+            return PackageValidationReport(tuple(checked_parts), tuple(issues))
+        relationships = parse_manifest_relationships(
+            manifest_root,
+            selected_rootfile.full_path,
+            known_parts=name_set,
+        )
-            spine_hrefs = _spine_hrefs(manifest_root)
-            if not spine_hrefs:
-                issues.append(PackageValidationIssue(MANIFEST_PATH, "spine declares no section parts"))
-            for href in spine_hrefs:
-                if href not in names:
-                    issues.append(
-                        PackageValidationIssue(
-                            MANIFEST_PATH,
-                            f"spine item missing from archive: {href}",
-                        )
+        for item in relationships.items:
+            if item.resolved_path not in name_set:
+                _error(
+                    issues,
+                    selected_rootfile.full_path,
+                    f"manifest href missing from archive: {item.href!r} -> {item.resolved_path!r}",
+                )
+        for idref in relationships.dangling_idrefs:
+            _warning(
+                issues,
+                selected_rootfile.full_path,
+                f"spine itemref references missing manifest id {idref!r}",
+            )
+        section_paths = [path for path in relationships.spine_paths if is_section_part_name(path)]
+        if section_paths:
+            for path in section_paths:
+                if path not in name_set:
+                    _error(
+                        issues,
+                        selected_rootfile.full_path,
+                        f"spine section part missing from archive: {path!r}",
                     )
+        else:
+            fallback_sections = [name for name in sorted(name_set) if is_section_part_name(name)]
+            if fallback_sections:
+                _warning(
+                    issues,
+                    selected_rootfile.full_path,
+                    "manifest spine does not resolve any section parts; engine will fall back "
+                    "to filename-based section discovery",
+                )
+            else:
+                _error(
+                    issues,
+                    selected_rootfile.full_path,
+                    "no section parts found in manifest spine or archive fallback",
+                )
-            if HEADER_PATH in names and HEADER_PATH not in hrefs:
-                issues.append(
-                    PackageValidationIssue(MANIFEST_PATH, "header.xml is not referenced in manifest")
+        if not relationships.header_paths and HEADER_PATH in name_set:
+            _warning(
+                issues,
+                selected_rootfile.full_path,
+                "manifest spine does not resolve a header part; engine will fall back to "
+                f"{HEADER_PATH!r}",
+            )
+        for path in relationships.header_paths:
+            if path not in name_set:
+                _error(
+                    issues,
+                    selected_rootfile.full_path,
+                    f"header part missing from archive: {path!r}",
+                )
+        if not relationships.master_page_paths:
+            fallback_master_pages = _fallback_named_parts(name_set, token="master", extra_token="page")
+            if fallback_master_pages:
+                _warning(
+                    issues,
+                    selected_rootfile.full_path,
+                    "manifest does not reference masterPage parts; engine will fall back to "
+                    "filename-based discovery",
+                )
+        for path in relationships.master_page_paths:
+            if path not in name_set:
+                _error(
+                    issues,
+                    selected_rootfile.full_path,
+                    f"masterPage part missing from archive: {path!r}",
+                )
+        if not relationships.history_paths:
+            fallback_histories = _fallback_named_parts(name_set, token="history")
+            if fallback_histories:
+                _warning(
+                    issues,
+                    selected_rootfile.full_path,
+                    "manifest does not reference history parts; engine will fall back to "
+                    "filename-based discovery",
+                )
+        for path in relationships.history_paths:
+            if path not in name_set:
+                _error(
+                    issues,
+                    selected_rootfile.full_path,
+                    f"history part missing from archive: {path!r}",
                 )
+        if relationships.version_path is None and VERSION_PATH in name_set:
+            _warning(
+                issues,
+                selected_rootfile.full_path,
+                "manifest does not reference a version part; engine will fall back to "
+                f"{VERSION_PATH!r}",
+            )
+        elif relationships.version_path is not None and relationships.version_path not in name_set:
+            _error(
+                issues,
+                selected_rootfile.full_path,
+                f"manifest version part missing from archive: {relationships.version_path!r}",
+            )
     return PackageValidationReport(tuple(checked_parts), tuple(issues))
 def main(argv: Sequence[str] | None = None) -> int:
-    parser = argparse.ArgumentParser(description="Validate HWPX package structure")
+    parser = argparse.ArgumentParser(
+        description="Validate HWPX package structure using engine-aligned ZIP/container/manifest checks"
+    )
     parser.add_argument("source", help="Path to the HWPX file")
     args = parser.parse_args(argv)
     report = validate_package(args.source)
-    if report.issues:
-        for issue in report.issues:
-            print(f"ERROR: {issue}")
+    for issue in report.issues:
+        prefix = "ERROR" if issue.is_error else "WARN"
+        print(f"{prefix}: {issue}")
+    if report.errors:
         return 1
-    print("All package validations passed.")
+    if report.warnings:
+        print("Package validation passed with warnings.")
+    else:
+        print("All package validations passed.")
     return 0

hwpx/tools/page_guard.py CHANGED Viewed

@@ -7,14 +7,14 @@ textual metrics that often correlate with page-layout drift.
 from __future__ import annotations
 import argparse
-import io
 import json
 from dataclasses import asdict, dataclass
 from pathlib import Path
 from typing import BinaryIO, Iterable, Sequence
-from zipfile import ZipFile
-from lxml import etree
+from lxml import etree  # type: ignore[reportAttributeAccessIssue]
+from ..opc.package import HwpxPackage
 NS = {
     "hp": "http://www.hancom.co.kr/hwpml/2011/paragraph",
@@ -63,31 +63,6 @@ class DocumentMetrics:
     paragraph_text_lengths: list[int]
-def _section_files(zf: ZipFile) -> list[str]:
-    try:
-        root = etree.fromstring(zf.read("Contents/content.hpf"))
-    except KeyError:
-        return [
-            name
-            for name in zf.namelist()
-            if name.startswith("Contents/section") and name.endswith(".xml")
-        ]
-    id_to_href: dict[str, str] = {}
-    for item in root.findall(".//opf:item", namespaces=NS):
-        item_id = item.get("id")
-        href = item.get("href")
-        if item_id and href:
-            id_to_href[item_id] = href
-    files: list[str] = []
-    for itemref in root.findall(".//opf:itemref", namespaces=NS):
-        idref = itemref.get("idref")
-        if idref and idref in id_to_href:
-            files.append(id_to_href[idref])
-    return files
 def _text_of_t_node(node: etree._Element) -> str:
     return "".join(node.itertext())
@@ -99,16 +74,9 @@ def _local_name(tag: str) -> str:
 def _iter_section_roots(source: str | Path | bytes | BinaryIO) -> Iterable[etree._Element]:
-    if isinstance(source, bytes):
-        archive = ZipFile(io.BytesIO(source), "r")
-    else:
-        archive = ZipFile(source, "r")
-    try:
-        for name in _section_files(archive):
-            yield etree.fromstring(archive.read(name))
-    finally:
-        archive.close()
+    package = HwpxPackage.open(source)
+    for name in package.section_paths():
+        yield package.get_xml(name)
 def collect_metrics(source: str | Path | bytes | BinaryIO) -> DocumentMetrics:
@@ -273,8 +241,12 @@ def main(argv: Sequence[str] | None = None) -> int:
     parser.add_argument("--json", action="store_true", help="Print collected metrics as JSON")
     args = parser.parse_args(argv)
-    reference = collect_metrics(args.reference)
-    output = collect_metrics(args.output)
+    try:
+        reference = collect_metrics(args.reference)
+        output = collect_metrics(args.output)
+    except Exception as exc:
+        print(f"ERROR: {exc}")
+        return 1
     if args.json:
         print(

hwpx/tools/template_analyzer.py CHANGED Viewed

@@ -3,11 +3,13 @@ from __future__ import annotations
 import argparse
 import json
 from dataclasses import asdict, dataclass
-from pathlib import Path
+from pathlib import Path, PurePosixPath
 from typing import Sequence
 from xml.etree import ElementTree as ET
 from ..opc.package import HwpxPackage
+from ..opc.relationships import parse_manifest_relationships
+from .archive_cli import unpack_hwpx
 from .page_guard import DocumentMetrics, collect_metrics
 _HH_NS = "http://www.hancom.co.kr/hwpml/2011/head"
@@ -36,8 +38,12 @@ class TemplateAnalysis:
     part_names: tuple[str, ...]
     rootfiles: tuple[str, ...]
     manifest_path: str
+    manifest_item_paths: tuple[str, ...]
     header_paths: tuple[str, ...]
     section_paths: tuple[str, ...]
+    master_page_paths: tuple[str, ...]
+    history_paths: tuple[str, ...]
+    bin_data_paths: tuple[str, ...]
     version_path: str | None
     header_summary: HeaderSummary
     proxy_metrics: DocumentMetrics
@@ -59,23 +65,36 @@ def _summarize_header(element: ET.Element | None) -> HeaderSummary:
     )
+def _is_bindata_path(path: str) -> bool:
+    return any(part.lower() == "bindata" for part in PurePosixPath(path).parts)
 def analyze_template(source: str | Path) -> TemplateAnalysis:
     source_path = Path(source)
     package = HwpxPackage.open(source_path)
+    relationships = parse_manifest_relationships(
+        package.manifest_tree(),
+        package.main_content.full_path,
+        known_parts=package.part_names(),
+    )
     header_paths = tuple(package.header_paths())
     header_xml = package.get_xml(header_paths[0]) if header_paths else None
-    manifest_path = package.main_content.full_path
-    version_path = package.version_path()
     return TemplateAnalysis(
         source_name=source_path.name,
         part_names=tuple(package.part_names()),
         rootfiles=tuple(rootfile.full_path for rootfile in package.iter_rootfiles()),
-        manifest_path=manifest_path,
+        manifest_path=package.main_content.full_path,
+        manifest_item_paths=tuple(item.resolved_path for item in relationships.items),
         header_paths=header_paths,
         section_paths=tuple(package.section_paths()),
-        version_path=version_path,
+        master_page_paths=tuple(package.master_page_paths()),
+        history_paths=tuple(package.history_paths()),
+        bin_data_paths=tuple(
+            item.resolved_path for item in relationships.items if _is_bindata_path(item.resolved_path)
+        ),
+        version_path=package.version_path(),
         header_summary=_summarize_header(header_xml),
         proxy_metrics=collect_metrics(source_path),
     )
@@ -100,18 +119,9 @@ def extract_template_parts(
     written: list[Path] = []
     if extract_dir is not None:
-        root = Path(extract_dir)
-        root.mkdir(parents=True, exist_ok=True)
-        written.append(_write_part(package, package.main_content.full_path, root / package.main_content.full_path))
-        for part_name in package.header_paths():
-            written.append(_write_part(package, part_name, root / part_name))
-        for part_name in package.section_paths():
-            written.append(_write_part(package, part_name, root / part_name))
-        version_path = package.version_path()
-        if version_path and package.has_part(version_path):
-            written.append(_write_part(package, version_path, root / version_path))
-        if package.has_part(package.CONTAINER_PATH):
-            written.append(_write_part(package, package.CONTAINER_PATH, root / package.CONTAINER_PATH))
+        result = unpack_hwpx(source_path, extract_dir, pretty_xml=False)
+        written.extend(result.output_dir / entry.path for entry in result.entries)
+        written.append(result.metadata_path)
     if extract_header is not None:
         header_paths = package.header_paths()
@@ -141,6 +151,9 @@ def _print_summary(analysis: TemplateAnalysis) -> None:
     print(f"rootfiles: {', '.join(analysis.rootfiles) or '(none)'}")
     print(f"headers: {', '.join(analysis.header_paths) or '(none)'}")
     print(f"sections: {', '.join(analysis.section_paths) or '(none)'}")
+    print(f"masterPages: {', '.join(analysis.master_page_paths) or '(none)'}")
+    print(f"histories: {', '.join(analysis.history_paths) or '(none)'}")
+    print(f"BinData: {', '.join(analysis.bin_data_paths) or '(none)'}")
     if analysis.version_path:
         print(f"version part: {analysis.version_path}")
     print(
@@ -163,14 +176,17 @@ def _print_summary(analysis: TemplateAnalysis) -> None:
 def main(argv: Sequence[str] | None = None) -> int:
     parser = argparse.ArgumentParser(
-        description="Analyze a reference HWPX template for template-preserving workflows"
+        description="Analyze a reference HWPX template for pack-ready, template-preserving workflows"
     )
     parser.add_argument("input", help="Input HWPX path")
     parser.add_argument("--json", action="store_true", help="Print machine-readable JSON summary")
     parser.add_argument("--output-json", help="Write the JSON summary to a file")
     parser.add_argument(
         "--extract-dir",
-        help="Copy manifest, header, sections, version, and container.xml into a directory",
+        help=(
+            "Create a pack-ready extracted workspace that preserves archive-relative paths "
+            "and hwpx-pack metadata"
+        ),
     )
     parser.add_argument("--extract-header", help="Copy the first header.xml part to a path")
     parser.add_argument("--extract-section", help="Copy the first section XML part to a path")

python-hwpx 2.7__py3-none-any.whl → 2.8__py3-none-any.whl

python-hwpx 2.7__py3-none-any.whl → 2.8__py3-none-any.whl