python-hwpx 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hwpx/__init__.py +23 -0
- hwpx/document.py +518 -0
- hwpx/opc/package.py +274 -0
- hwpx/oxml/__init__.py +138 -0
- hwpx/oxml/body.py +151 -0
- hwpx/oxml/common.py +31 -0
- hwpx/oxml/document.py +1932 -0
- hwpx/oxml/header.py +543 -0
- hwpx/oxml/parser.py +62 -0
- hwpx/oxml/schema.py +41 -0
- hwpx/oxml/utils.py +82 -0
- hwpx/package.py +202 -0
- hwpx/tools/__init__.py +36 -0
- hwpx/tools/_schemas/header.xsd +14 -0
- hwpx/tools/_schemas/section.xsd +12 -0
- hwpx/tools/object_finder.py +347 -0
- hwpx/tools/text_extractor.py +726 -0
- hwpx/tools/validator.py +184 -0
- python_hwpx-1.0.dist-info/LICENSE +32 -0
- python_hwpx-1.0.dist-info/METADATA +199 -0
- python_hwpx-1.0.dist-info/RECORD +24 -0
- python_hwpx-1.0.dist-info/WHEEL +5 -0
- python_hwpx-1.0.dist-info/entry_points.txt +2 -0
- python_hwpx-1.0.dist-info/top_level.txt +1 -0
hwpx/oxml/utils.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from typing import Optional, Tuple, Union
|
|
5
|
+
|
|
6
|
+
from lxml import etree
|
|
7
|
+
|
|
8
|
+
_TRUE_VALUES = {"1", "true", "True", "TRUE"}
|
|
9
|
+
_FALSE_VALUES = {"0", "false", "False", "FALSE"}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def local_name(node: etree._Element) -> str:
    """Return the tag of *node* without its namespace part."""
    qualified = etree.QName(node)
    return qualified.localname
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def parse_int(value: Optional[str], *, allow_none: bool = True) -> Optional[int]:
    """Convert *value* to an ``int``.

    A ``None`` input is passed through as ``None`` while *allow_none* is
    true (the default); otherwise it is rejected with ``ValueError``.
    Any value that ``int()`` cannot convert also raises ``ValueError``.
    """
    if value is not None:
        try:
            return int(value)
        except (TypeError, ValueError) as exc:  # pragma: no cover - defensive branch
            raise ValueError(f"Invalid integer value: {value!r}") from exc

    # Only the "missing value" case reaches this point.
    if not allow_none:
        raise ValueError("Missing integer value")
    return None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def parse_bool(value: Optional[str], *, default: Optional[bool] = None) -> Optional[bool]:
    """Map an attribute string onto ``True``/``False``.

    ``None`` yields *default*. A value found in ``_TRUE_VALUES`` yields
    ``True`` and one found in ``_FALSE_VALUES`` yields ``False``; anything
    else raises ``ValueError``.
    """
    if value is None:
        return default

    # Check the accepted spellings in the same order as before: true, then false.
    for accepted, outcome in ((_TRUE_VALUES, True), (_FALSE_VALUES, False)):
        if value in accepted:
            return outcome

    raise ValueError(f"Invalid boolean value: {value!r}")
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def text_or_none(node: etree._Element) -> Optional[str]:
    """Return ``node.text`` with surrounding whitespace removed.

    ``None`` is returned when the node has no text or when only whitespace
    remains after stripping.
    """
    raw = node.text
    stripped = raw.strip() if raw is not None else ""
    return stripped or None
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
XmlSource = Union[str, bytes, Path, etree._Element, etree._ElementTree]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def coerce_xml_source(source: XmlSource) -> Tuple[etree._Element, etree._ElementTree]:
    """Return ``(root, tree)`` for *source*.

    *source* may be an ``lxml`` element, an element tree, a path to an XML
    file, or raw XML (``str``/``bytes``). The input is normalised so that
    callers always receive both the root element and its owning tree.

    A ``str`` whose first non-whitespace character is ``<`` is treated as
    markup and parsed directly, without touching the filesystem. Any other
    ``str`` or ``Path`` is parsed as a file when it names an existing path;
    otherwise its text is parsed as XML, as before.

    Raises whatever ``lxml`` raises for malformed XML.
    """

    if isinstance(source, etree._Element):
        return source, source.getroottree()
    if isinstance(source, etree._ElementTree):
        return source.getroot(), source

    if isinstance(source, (str, Path)):
        # Markup must not be probed as a path: ``Path.exists()`` can raise
        # ``OSError`` (for example "File name too long") on a long XML string.
        looks_like_markup = isinstance(source, str) and source.lstrip().startswith("<")
        if not looks_like_markup:
            path = Path(source)
            try:
                exists = path.exists()
            except (OSError, ValueError):
                # Not usable as a path at all; fall back to parsing the text.
                exists = False
            if exists:
                tree = etree.parse(str(path))
                return tree.getroot(), tree
        # Encode to bytes so lxml accepts documents carrying an encoding declaration.
        xml_bytes = str(source).encode("utf-8")
    else:
        xml_bytes = bytes(source)

    root = etree.fromstring(xml_bytes)
    return root, root.getroottree()
|
hwpx/package.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Utilities for working with the container format used by HWPX files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import BinaryIO, Dict, Iterable, Mapping, MutableMapping
|
|
8
|
+
import xml.etree.ElementTree as ET
|
|
9
|
+
from zipfile import ZIP_DEFLATED, ZipFile
|
|
10
|
+
|
|
11
|
+
_OPF_NS = "http://www.idpf.org/2007/opf/"
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _ensure_bytes(value: bytes | str | ET.Element) -> bytes:
|
|
15
|
+
if isinstance(value, bytes):
|
|
16
|
+
return value
|
|
17
|
+
if isinstance(value, str):
|
|
18
|
+
return value.encode("utf-8")
|
|
19
|
+
if isinstance(value, ET.Element):
|
|
20
|
+
return ET.tostring(value, encoding="utf-8", xml_declaration=True)
|
|
21
|
+
raise TypeError(f"unsupported part payload type: {type(value)!r}")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class HwpxPackage:
|
|
25
|
+
"""Represents the OPC-style package that stores HWPX parts."""
|
|
26
|
+
|
|
27
|
+
MANIFEST_PATH = "Contents/content.hpf"
|
|
28
|
+
HEADER_PATH = "Contents/header.xml"
|
|
29
|
+
|
|
30
|
+
def __init__(
|
|
31
|
+
self,
|
|
32
|
+
parts: MutableMapping[str, bytes],
|
|
33
|
+
source_path: Path | None = None,
|
|
34
|
+
):
|
|
35
|
+
self._parts: MutableMapping[str, bytes] = dict(parts)
|
|
36
|
+
self._source_path = source_path
|
|
37
|
+
self._manifest_tree: ET.Element | None = None
|
|
38
|
+
self._spine_cache: list[str] | None = None
|
|
39
|
+
self._section_paths_cache: list[str] | None = None
|
|
40
|
+
self._header_paths_cache: list[str] | None = None
|
|
41
|
+
|
|
42
|
+
# -- construction ----------------------------------------------------
|
|
43
|
+
@classmethod
|
|
44
|
+
def open(cls, source: str | Path | bytes | BinaryIO) -> "HwpxPackage":
|
|
45
|
+
if isinstance(source, (str, Path)):
|
|
46
|
+
path = Path(source)
|
|
47
|
+
with ZipFile(path) as archive:
|
|
48
|
+
parts = {info.filename: archive.read(info.filename) for info in archive.infolist()}
|
|
49
|
+
return cls(parts, source_path=path)
|
|
50
|
+
|
|
51
|
+
if isinstance(source, (bytes, bytearray)):
|
|
52
|
+
buffer = io.BytesIO(source)
|
|
53
|
+
with ZipFile(buffer) as archive:
|
|
54
|
+
parts = {info.filename: archive.read(info.filename) for info in archive.infolist()}
|
|
55
|
+
return cls(parts)
|
|
56
|
+
|
|
57
|
+
if hasattr(source, "read"):
|
|
58
|
+
data = source.read()
|
|
59
|
+
buffer = io.BytesIO(data)
|
|
60
|
+
with ZipFile(buffer) as archive:
|
|
61
|
+
parts = {info.filename: archive.read(info.filename) for info in archive.infolist()}
|
|
62
|
+
package = cls(parts)
|
|
63
|
+
package._source_path = None
|
|
64
|
+
return package
|
|
65
|
+
|
|
66
|
+
raise TypeError("unsupported source type for HwpxPackage")
|
|
67
|
+
|
|
68
|
+
# -- accessors -------------------------------------------------------
|
|
69
|
+
def part_names(self) -> Iterable[str]:
|
|
70
|
+
return list(self._parts.keys())
|
|
71
|
+
|
|
72
|
+
def has_part(self, part_name: str) -> bool:
|
|
73
|
+
return part_name in self._parts
|
|
74
|
+
|
|
75
|
+
def get_part(self, part_name: str) -> bytes:
|
|
76
|
+
try:
|
|
77
|
+
return self._parts[part_name]
|
|
78
|
+
except KeyError as exc:
|
|
79
|
+
raise KeyError(f"package does not contain part '{part_name}'") from exc
|
|
80
|
+
|
|
81
|
+
def set_part(self, part_name: str, payload: bytes | str | ET.Element) -> None:
|
|
82
|
+
self._parts[part_name] = _ensure_bytes(payload)
|
|
83
|
+
if part_name == self.MANIFEST_PATH:
|
|
84
|
+
self._manifest_tree = None
|
|
85
|
+
self._spine_cache = None
|
|
86
|
+
self._section_paths_cache = None
|
|
87
|
+
self._header_paths_cache = None
|
|
88
|
+
|
|
89
|
+
def get_xml(self, part_name: str) -> ET.Element:
|
|
90
|
+
return ET.fromstring(self.get_part(part_name))
|
|
91
|
+
|
|
92
|
+
def set_xml(self, part_name: str, element: ET.Element) -> None:
|
|
93
|
+
self.set_part(part_name, element)
|
|
94
|
+
|
|
95
|
+
def get_text(self, part_name: str, encoding: str = "utf-8") -> str:
|
|
96
|
+
return self.get_part(part_name).decode(encoding)
|
|
97
|
+
|
|
98
|
+
# -- manifest helpers ------------------------------------------------
|
|
99
|
+
def manifest_tree(self) -> ET.Element:
|
|
100
|
+
if self._manifest_tree is None:
|
|
101
|
+
self._manifest_tree = self.get_xml(self.MANIFEST_PATH)
|
|
102
|
+
return self._manifest_tree
|
|
103
|
+
|
|
104
|
+
def _resolve_spine_paths(self) -> list[str]:
|
|
105
|
+
if self._spine_cache is None:
|
|
106
|
+
manifest = self.manifest_tree()
|
|
107
|
+
ns = {"opf": _OPF_NS}
|
|
108
|
+
manifest_items: Dict[str, str] = {}
|
|
109
|
+
for item in manifest.findall("./opf:manifest/opf:item", ns):
|
|
110
|
+
item_id = item.attrib.get("id")
|
|
111
|
+
href = item.attrib.get("href", "")
|
|
112
|
+
if item_id and href:
|
|
113
|
+
manifest_items[item_id] = href
|
|
114
|
+
spine_paths: list[str] = []
|
|
115
|
+
for itemref in manifest.findall("./opf:spine/opf:itemref", ns):
|
|
116
|
+
idref = itemref.attrib.get("idref")
|
|
117
|
+
if not idref:
|
|
118
|
+
continue
|
|
119
|
+
href = manifest_items.get(idref)
|
|
120
|
+
if href:
|
|
121
|
+
spine_paths.append(href)
|
|
122
|
+
self._spine_cache = spine_paths
|
|
123
|
+
return self._spine_cache
|
|
124
|
+
|
|
125
|
+
def section_paths(self) -> list[str]:
|
|
126
|
+
if self._section_paths_cache is None:
|
|
127
|
+
from pathlib import PurePosixPath
|
|
128
|
+
|
|
129
|
+
paths = [
|
|
130
|
+
path
|
|
131
|
+
for path in self._resolve_spine_paths()
|
|
132
|
+
if path and PurePosixPath(path).name.startswith("section")
|
|
133
|
+
]
|
|
134
|
+
if not paths:
|
|
135
|
+
# Fallback: include known section files if they exist.
|
|
136
|
+
paths = [
|
|
137
|
+
name
|
|
138
|
+
for name in self._parts.keys()
|
|
139
|
+
if PurePosixPath(name).name.startswith("section")
|
|
140
|
+
]
|
|
141
|
+
self._section_paths_cache = paths
|
|
142
|
+
return list(self._section_paths_cache)
|
|
143
|
+
|
|
144
|
+
def header_paths(self) -> list[str]:
|
|
145
|
+
if self._header_paths_cache is None:
|
|
146
|
+
from pathlib import PurePosixPath
|
|
147
|
+
|
|
148
|
+
paths = [
|
|
149
|
+
path
|
|
150
|
+
for path in self._resolve_spine_paths()
|
|
151
|
+
if path and PurePosixPath(path).name.startswith("header")
|
|
152
|
+
]
|
|
153
|
+
if not paths and self.has_part(self.HEADER_PATH):
|
|
154
|
+
paths = [self.HEADER_PATH]
|
|
155
|
+
self._header_paths_cache = paths
|
|
156
|
+
return list(self._header_paths_cache)
|
|
157
|
+
|
|
158
|
+
# -- saving ----------------------------------------------------------
|
|
159
|
+
def save(
|
|
160
|
+
self,
|
|
161
|
+
path_or_stream: str | Path | BinaryIO | None = None,
|
|
162
|
+
updates: Mapping[str, bytes | str | ET.Element] | None = None,
|
|
163
|
+
) -> str | Path | BinaryIO | bytes | None:
|
|
164
|
+
if updates:
|
|
165
|
+
for part_name, payload in updates.items():
|
|
166
|
+
self.set_part(part_name, payload)
|
|
167
|
+
|
|
168
|
+
destination = path_or_stream or self._source_path
|
|
169
|
+
|
|
170
|
+
if destination is None:
|
|
171
|
+
buffer = io.BytesIO()
|
|
172
|
+
self._write_to_stream(buffer)
|
|
173
|
+
return buffer.getvalue()
|
|
174
|
+
|
|
175
|
+
if isinstance(destination, (str, Path)):
|
|
176
|
+
dest_path = Path(destination)
|
|
177
|
+
dest_path.parent.mkdir(parents=True, exist_ok=True)
|
|
178
|
+
with ZipFile(dest_path, "w", compression=ZIP_DEFLATED) as archive:
|
|
179
|
+
self._write_archive(archive)
|
|
180
|
+
self._source_path = dest_path
|
|
181
|
+
return dest_path
|
|
182
|
+
|
|
183
|
+
stream = destination
|
|
184
|
+
if hasattr(stream, "seek"):
|
|
185
|
+
stream.seek(0)
|
|
186
|
+
if hasattr(stream, "truncate"):
|
|
187
|
+
stream.truncate(0)
|
|
188
|
+
with ZipFile(stream, "w", compression=ZIP_DEFLATED) as archive:
|
|
189
|
+
self._write_archive(archive)
|
|
190
|
+
if hasattr(stream, "seek"):
|
|
191
|
+
stream.seek(0)
|
|
192
|
+
return stream
|
|
193
|
+
|
|
194
|
+
# -- internals -------------------------------------------------------
|
|
195
|
+
def _write_to_stream(self, stream: BinaryIO) -> None:
|
|
196
|
+
with ZipFile(stream, "w", compression=ZIP_DEFLATED) as archive:
|
|
197
|
+
self._write_archive(archive)
|
|
198
|
+
stream.seek(0)
|
|
199
|
+
|
|
200
|
+
def _write_archive(self, archive: ZipFile) -> None:
|
|
201
|
+
for part_name in sorted(self._parts.keys()):
|
|
202
|
+
archive.writestr(part_name, self._parts[part_name])
|
hwpx/tools/__init__.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Tooling helpers for inspecting HWPX archives."""
|
|
2
|
+
|
|
3
|
+
from .object_finder import FoundElement, ObjectFinder
|
|
4
|
+
from .text_extractor import (
|
|
5
|
+
DEFAULT_NAMESPACES,
|
|
6
|
+
ParagraphInfo,
|
|
7
|
+
SectionInfo,
|
|
8
|
+
TextExtractor,
|
|
9
|
+
build_parent_map,
|
|
10
|
+
describe_element_path,
|
|
11
|
+
strip_namespace,
|
|
12
|
+
)
|
|
13
|
+
from .validator import (
|
|
14
|
+
DocumentSchemas,
|
|
15
|
+
ValidationIssue,
|
|
16
|
+
ValidationReport,
|
|
17
|
+
load_default_schemas,
|
|
18
|
+
validate_document,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"DEFAULT_NAMESPACES",
|
|
23
|
+
"ParagraphInfo",
|
|
24
|
+
"SectionInfo",
|
|
25
|
+
"TextExtractor",
|
|
26
|
+
"build_parent_map",
|
|
27
|
+
"describe_element_path",
|
|
28
|
+
"strip_namespace",
|
|
29
|
+
"FoundElement",
|
|
30
|
+
"ObjectFinder",
|
|
31
|
+
"DocumentSchemas",
|
|
32
|
+
"ValidationIssue",
|
|
33
|
+
"ValidationReport",
|
|
34
|
+
"load_default_schemas",
|
|
35
|
+
"validate_document",
|
|
36
|
+
]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
|
3
|
+
targetNamespace="http://www.hancom.co.kr/hwpml/2011/head"
|
|
4
|
+
xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head"
|
|
5
|
+
elementFormDefault="qualified">
|
|
6
|
+
<xs:element name="head" type="hh:HeadType"/>
|
|
7
|
+
<xs:complexType name="HeadType">
|
|
8
|
+
<xs:sequence>
|
|
9
|
+
<xs:any namespace="##any" processContents="lax" minOccurs="0" maxOccurs="unbounded"/>
|
|
10
|
+
</xs:sequence>
|
|
11
|
+
<xs:attribute name="version" type="xs:string" use="required"/>
|
|
12
|
+
<xs:attribute name="secCnt" type="xs:nonNegativeInteger" use="required"/>
|
|
13
|
+
</xs:complexType>
|
|
14
|
+
</xs:schema>
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
|
2
|
+
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema"
|
|
3
|
+
targetNamespace="http://www.hancom.co.kr/hwpml/2011/section"
|
|
4
|
+
xmlns:hs="http://www.hancom.co.kr/hwpml/2011/section"
|
|
5
|
+
elementFormDefault="qualified">
|
|
6
|
+
<xs:element name="sec" type="hs:SectionType"/>
|
|
7
|
+
<xs:complexType name="SectionType">
|
|
8
|
+
<xs:sequence>
|
|
9
|
+
<xs:any namespace="##any" processContents="lax" minOccurs="0" maxOccurs="unbounded"/>
|
|
10
|
+
</xs:sequence>
|
|
11
|
+
</xs:complexType>
|
|
12
|
+
</xs:schema>
|
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
"""Helper utilities that locate XML objects inside HWPX archives."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import (
|
|
8
|
+
Callable,
|
|
9
|
+
Dict,
|
|
10
|
+
Iterator,
|
|
11
|
+
List,
|
|
12
|
+
Mapping,
|
|
13
|
+
Optional,
|
|
14
|
+
Pattern,
|
|
15
|
+
Sequence,
|
|
16
|
+
Tuple,
|
|
17
|
+
Union,
|
|
18
|
+
)
|
|
19
|
+
from xml.etree import ElementTree as ET
|
|
20
|
+
from zipfile import ZipFile
|
|
21
|
+
|
|
22
|
+
from .text_extractor import (
|
|
23
|
+
DEFAULT_NAMESPACES,
|
|
24
|
+
AnnotationOptions,
|
|
25
|
+
SectionInfo,
|
|
26
|
+
TextExtractor,
|
|
27
|
+
_resolve_control_nested_text,
|
|
28
|
+
_resolve_hyperlink_target,
|
|
29
|
+
_resolve_note_text,
|
|
30
|
+
build_parent_map,
|
|
31
|
+
describe_element_path,
|
|
32
|
+
strip_namespace,
|
|
33
|
+
tag_matches,
|
|
34
|
+
)
|
|
35
|
+
|
|
36
|
+
__all__ = ["AttrMatcher", "AnnotationMatch", "FoundElement", "ObjectFinder"]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
AttrMatcher = Union[str, Sequence[str], Pattern[str], Callable[[str], bool]]
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass(frozen=True)
class FoundElement:
    """A matched XML element together with the place it was found."""

    # Section whose XML tree contains the element.
    section: SectionInfo
    # Location string for the element; ``hierarchy`` splits it on "/".
    path: str
    # The matched element itself.
    element: ET.Element

    @property
    def tag(self) -> str:
        """Tag of the matched element with the namespace stripped."""
        raw_tag = self.element.tag
        return strip_namespace(raw_tag)

    @property
    def hierarchy(self) -> Tuple[str, ...]:
        """The components of :py:attr:`path`, split on ``/``."""
        segments = self.path.split("/")
        return tuple(segments)

    @property
    def text(self) -> Optional[str]:
        """Shortcut for ``element.text``."""
        return self.element.text

    def get(self, name: str, default: Optional[str] = None) -> Optional[str]:
        """Look up attribute *name* on the element, or return *default*."""
        attributes = self.element.attrib
        return attributes.get(name, default)

    def __str__(self) -> str:  # pragma: no cover - debugging helper
        return f"{self.section.name}:{self.path} <{self.tag}>"
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
@dataclass(frozen=True)
class AnnotationMatch:
    """One annotation reported by the finder."""

    # Annotation category, e.g. "highlight", "footnote", "endnote",
    # "hyperlink" or "control".
    kind: str
    # Where the annotation's element was found.
    element: FoundElement
    # Formatted text for the annotation; may be ``None``.
    value: Optional[str]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class ObjectFinder:
    """Perform element searches across the XML payload in an HWPX document.

    Each search opens the source through ``TextExtractor`` and walks the
    sections it yields; nothing is cached between calls.
    """

    # Annotation kinds reported when the caller does not restrict them.
    _DEFAULT_ANNOTATION_KINDS = ("highlight", "footnote", "endnote", "hyperlink", "control")

    def __init__(
        self,
        source: Union[str, Path, ZipFile],
        *,
        namespaces: Optional[Mapping[str, str]] = None,
    ) -> None:
        """Remember *source* and build the namespace map used for lookups.

        *namespaces* extends, and on conflict overrides, ``DEFAULT_NAMESPACES``.
        """
        self._source = source
        merged_namespaces = dict(DEFAULT_NAMESPACES)
        if namespaces:
            merged_namespaces.update(namespaces)
        self.namespaces: Dict[str, str] = merged_namespaces

    def iter(
        self,
        *,
        tag: Union[str, Sequence[str], None] = None,
        attrs: Optional[Mapping[str, AttrMatcher]] = None,
        xpath: Optional[str] = None,
        limit: Optional[int] = None,
        section_filter: Optional[Callable[[SectionInfo], bool]] = None,
    ) -> Iterator[FoundElement]:
        """Yield elements that match a combination of criteria.

        Args:
            tag: Tag name(s) the element must match (checked with ``tag_matches``).
            attrs: Attribute name -> matcher; every entry must match.
            xpath: When given, candidates come from ``findall(xpath)`` on each
                section root instead of a full traversal.
            limit: Maximum number of elements to yield; ``None`` means no
                limit and a value ``<= 0`` yields nothing.
            section_filter: Predicate selecting which sections to search.
        """
        # A non-positive limit means "no results"; without this guard the
        # first match would be yielded before the limit was checked.
        if limit is not None and limit <= 0:
            return
        remaining = limit

        with TextExtractor(self._source, namespaces=self.namespaces) as extractor:
            for section in extractor.iter_sections():
                if section_filter is not None and not section_filter(section):
                    continue
                parent_map = build_parent_map(section.element)
                if xpath is not None:
                    candidates = section.element.findall(xpath, namespaces=self.namespaces)
                else:
                    candidates = section.element.iter()
                for element in candidates:
                    if tag is not None and not tag_matches(element.tag, tag, self.namespaces):
                        continue
                    if attrs and not self._match_attributes(element, attrs):
                        continue
                    path = describe_element_path(element, parent_map)
                    yield FoundElement(section=section, path=path, element=element)
                    if remaining is not None:
                        remaining -= 1
                        if remaining <= 0:
                            return

    def find_first(
        self,
        *,
        tag: Union[str, Sequence[str], None] = None,
        attrs: Optional[Mapping[str, AttrMatcher]] = None,
        xpath: Optional[str] = None,
        section_filter: Optional[Callable[[SectionInfo], bool]] = None,
    ) -> Optional[FoundElement]:
        """Return the first element that matches or ``None`` when absent."""
        matches = self.iter(
            tag=tag,
            attrs=attrs,
            xpath=xpath,
            limit=1,
            section_filter=section_filter,
        )
        return next(matches, None)

    def find_all(
        self,
        *,
        tag: Union[str, Sequence[str], None] = None,
        attrs: Optional[Mapping[str, AttrMatcher]] = None,
        xpath: Optional[str] = None,
        section_filter: Optional[Callable[[SectionInfo], bool]] = None,
        limit: Optional[int] = None,
    ) -> List[FoundElement]:
        """Return every matching element eagerly as a list."""
        return list(
            self.iter(
                tag=tag,
                attrs=attrs,
                xpath=xpath,
                limit=limit,
                section_filter=section_filter,
            )
        )

    def iter_annotations(
        self,
        *,
        kinds: Optional[Sequence[str]] = None,
        options: Optional[AnnotationOptions] = None,
        section_filter: Optional[Callable[[SectionInfo], bool]] = None,
        preserve_breaks: bool = True,
    ) -> Iterator[AnnotationMatch]:
        """Yield annotations such as highlights or notes with formatted values.

        Args:
            kinds: Annotation kinds to report (case-insensitive). ``None``
                selects every supported kind; an empty sequence selects none.
            options: Formatting options; a default ``AnnotationOptions()`` is
                used when omitted.
            section_filter: Predicate selecting which sections to search.
            preserve_breaks: Forwarded to the note and nested-control text
                resolvers.

        Within each section, matches are yielded grouped by kind in the order
        highlight, footnote, endnote, hyperlink, control.
        """
        # Distinguish "not given" from "explicitly empty" so that an empty
        # selection really produces no annotations.
        selected = self._DEFAULT_ANNOTATION_KINDS if kinds is None else kinds
        requested = {kind.lower() for kind in selected}
        if not requested:
            return

        render_options = options or AnnotationOptions()

        with TextExtractor(self._source, namespaces=self.namespaces) as extractor:
            for section in extractor.iter_sections():
                if section_filter is not None and not section_filter(section):
                    continue
                parent_map = build_parent_map(section.element)

                if "highlight" in requested:
                    yield from self._iter_highlights(section, parent_map, render_options)

                if "footnote" in requested:
                    for element in section.element.findall(
                        ".//hp:footNote", namespaces=self.namespaces
                    ):
                        yield self._format_note_annotation(
                            extractor,
                            section,
                            parent_map,
                            element,
                            kind="footnote",
                            options=render_options,
                            preserve_breaks=preserve_breaks,
                        )

                if "endnote" in requested:
                    for element in section.element.findall(
                        ".//hp:endNote", namespaces=self.namespaces
                    ):
                        yield self._format_note_annotation(
                            extractor,
                            section,
                            parent_map,
                            element,
                            kind="endnote",
                            options=render_options,
                            preserve_breaks=preserve_breaks,
                        )

                if "hyperlink" in requested:
                    yield from self._iter_hyperlinks(section, parent_map, render_options)

                if "control" in requested:
                    yield from self._iter_controls(
                        extractor,
                        section,
                        parent_map,
                        render_options,
                        preserve_breaks=preserve_breaks,
                    )

    def _iter_highlights(
        self,
        section: SectionInfo,
        parent_map: Mapping[ET.Element, ET.Element],
        options: AnnotationOptions,
    ) -> Iterator[AnnotationMatch]:
        """Yield a "highlight" match for every ``hp:markpenBegin`` in *section*."""
        for element in section.element.findall(
            ".//hp:markpenBegin", namespaces=self.namespaces
        ):
            path = describe_element_path(element, parent_map)
            found = FoundElement(section=section, path=path, element=element)
            color = element.get("color") or ""
            if options.highlight == "markers":
                value = options.highlight_start.format(color=color)
            else:
                value = options.highlight_summary.format(color=color)
            yield AnnotationMatch("highlight", found, value)

    def _iter_hyperlinks(
        self,
        section: SectionInfo,
        parent_map: Mapping[ET.Element, ET.Element],
        options: AnnotationOptions,
    ) -> Iterator[AnnotationMatch]:
        """Yield a "hyperlink" match for every HYPERLINK ``hp:fieldBegin`` in *section*."""
        for element in section.element.findall(
            ".//hp:fieldBegin", namespaces=self.namespaces
        ):
            field_type = (element.get("type") or "").upper()
            if field_type != "HYPERLINK":
                continue
            path = describe_element_path(element, parent_map)
            found = FoundElement(section=section, path=path, element=element)
            target = _resolve_hyperlink_target(element, self.namespaces) or ""
            behavior = options.hyperlink
            if behavior == "target":
                value = options.hyperlink_target_format.format(target=target)
            elif behavior == "placeholder":
                value = options.hyperlink_placeholder.format(target=target)
            else:
                value = options.hyperlink_summary.format(target=target)
            yield AnnotationMatch("hyperlink", found, value)

    def _iter_controls(
        self,
        extractor: TextExtractor,
        section: SectionInfo,
        parent_map: Mapping[ET.Element, ET.Element],
        options: AnnotationOptions,
        *,
        preserve_breaks: bool,
    ) -> Iterator[AnnotationMatch]:
        """Yield a "control" match for every non-field ``hp:ctrl`` in *section*."""
        for element in section.element.findall(".//hp:ctrl", namespaces=self.namespaces):
            # Hyperlink fields are reported under the "hyperlink" kind, and a
            # control that only closes a field carries nothing to report.
            field_begin = element.find("hp:fieldBegin", namespaces=self.namespaces)
            if field_begin is not None and (field_begin.get("type") or "").upper() == "HYPERLINK":
                continue
            if element.find("hp:fieldEnd", namespaces=self.namespaces) is not None:
                continue
            path = describe_element_path(element, parent_map)
            found = FoundElement(section=section, path=path, element=element)
            first_child = next(iter(element), None)
            if first_child is not None:
                name = strip_namespace(first_child.tag)
                ctrl_type = first_child.get("type")
            else:
                name = "ctrl"
                ctrl_type = element.get("type")
            # Apply the empty-string fallback to both branches so a missing
            # "type" attribute is never formatted as the text "None".
            ctrl_type = ctrl_type or ""
            behavior = options.control
            if behavior == "nested":
                value = _resolve_control_nested_text(
                    extractor,
                    element,
                    options,
                    preserve_breaks=preserve_breaks,
                )
            elif behavior == "placeholder":
                value = options.control_placeholder.format(name=name, type=ctrl_type)
            else:
                value = options.control_summary.format(name=name, type=ctrl_type)
            yield AnnotationMatch("control", found, value)

    def _format_note_annotation(
        self,
        extractor: TextExtractor,
        section: SectionInfo,
        parent_map: Mapping[ET.Element, ET.Element],
        element: ET.Element,
        *,
        kind: str,
        options: AnnotationOptions,
        preserve_breaks: bool,
    ) -> AnnotationMatch:
        """Build the match for a footnote or endnote *element*.

        The footnote behaviour is read from ``options.footnote`` when *kind*
        is ``"footnote"``; any other *kind* uses ``options.endnote``.
        """
        path = describe_element_path(element, parent_map)
        found = FoundElement(section=section, path=path, element=element)
        inst_id = element.get("instId") or ""
        behavior = options.footnote if kind == "footnote" else options.endnote
        if behavior == "inline":
            text = _resolve_note_text(
                extractor,
                element,
                options,
                preserve_breaks=preserve_breaks,
            )
            value = options.note_inline_format.format(kind=kind, inst_id=inst_id, text=text)
        elif behavior == "placeholder":
            value = options.note_placeholder.format(kind=kind, inst_id=inst_id)
        else:
            value = options.note_summary.format(kind=kind, inst_id=inst_id)
        return AnnotationMatch(kind, found, value)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _match_attributes(
        element: ET.Element,
        expected: Mapping[str, AttrMatcher],
    ) -> bool:
        """Return ``True`` when *element* satisfies every matcher in *expected*.

        A matcher may be an exact string, a sequence of allowed strings, an
        object with a ``search`` method (such as a compiled pattern) or a
        predicate. A missing attribute never matches.

        Raises:
            TypeError: if a matcher is none of the supported kinds.
        """
        for name, matcher in expected.items():
            value = element.attrib.get(name)
            if value is None:
                return False
            if isinstance(matcher, str):
                if value != matcher:
                    return False
            elif isinstance(matcher, Sequence) and not isinstance(matcher, (str, bytes)):
                if value not in matcher:
                    return False
            elif hasattr(matcher, "search"):
                if not matcher.search(value):  # type: ignore[call-arg]
                    return False
            elif callable(matcher):
                if not matcher(value):
                    return False
            else:
                raise TypeError(
                    "Attribute matchers must be str, Sequence, Pattern or callable",
                )
        return True
|