pypxml 1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cli/__init__.py +0 -0
- cli/pypxml_cli.py +15 -0
- pypxml/__init__.py +9 -0
- pypxml/element.py +225 -0
- pypxml/page.py +244 -0
- pypxml/pxml.py +223 -0
- pypxml/resources/__init__.py +4 -0
- pypxml/resources/xml_schema.py +43 -0
- pypxml/resources/xml_types.py +59 -0
- pypxml-1.0.dist-info/LICENSE +21 -0
- pypxml-1.0.dist-info/METADATA +156 -0
- pypxml-1.0.dist-info/RECORD +15 -0
- pypxml-1.0.dist-info/WHEEL +5 -0
- pypxml-1.0.dist-info/entry_points.txt +2 -0
- pypxml-1.0.dist-info/top_level.txt +2 -0
cli/__init__.py
ADDED
|
File without changes
|
cli/pypxml_cli.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import click
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Root command group of the PyPXML command line interface.
# No sub-commands are registered on the group in this file.
@click.group()
@click.help_option('--help')
# The version number is hard-coded here; `%(prog)s` and `%(version)s` in the message are filled in by click.
@click.version_option('1.0', '--version',
                      prog_name='PyPXML',
                      message='%(prog)s v%(version)s - Developed at Centre for Philology and Digitality (ZPD), '
                              'University of Würzburg')
def cli():
    """
    PyPXML command line interface entry point.
    """
    # NOTE: click displays the docstring above as the help text of the group, so editing it changes CLI output.
    pass
|
|
15
|
+
|
pypxml/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# This file is licensed under the MIT License.
|
|
2
|
+
# Copyright (c) 2024 Janik Haitz
|
|
3
|
+
# See the LICENSE file in the root directory for more details.
|
|
4
|
+
|
|
5
|
+
from .pxml import PageXML
|
|
6
|
+
from .page import Page
|
|
7
|
+
from .element import Element
|
|
8
|
+
from .resources.xml_schema import XMLSchema
|
|
9
|
+
from .resources.xml_types import XMLType
|
pypxml/element.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# This file is licensed under the MIT License.
|
|
2
|
+
# Copyright (c) 2024 Janik Haitz
|
|
3
|
+
# See the LICENSE file in the root directory for more details.
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Union, Self
|
|
6
|
+
|
|
7
|
+
from lxml import etree
|
|
8
|
+
|
|
9
|
+
from .resources.xml_types import XMLType
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Element:
|
|
13
|
+
""" Represents an element inside a page. """
|
|
14
|
+
def __init__(self, _type: XMLType, attributes: Optional[dict[str, str]] = None):
|
|
15
|
+
self.__type: XMLType = _type
|
|
16
|
+
self.__attributes: dict[str, str] = attributes if attributes else {}
|
|
17
|
+
self.__elements: list[Element] = []
|
|
18
|
+
self.__text: Optional[str] = None
|
|
19
|
+
|
|
20
|
+
def __len__(self) -> int:
|
|
21
|
+
""" Returns the number of elements. """
|
|
22
|
+
return len(self.__elements)
|
|
23
|
+
|
|
24
|
+
def __iter__(self) -> Self:
|
|
25
|
+
""" Iterator: starting point for iterating over all elements. """
|
|
26
|
+
self.__n = 0
|
|
27
|
+
return self
|
|
28
|
+
|
|
29
|
+
def __next__(self) -> Self:
|
|
30
|
+
""" Iterator: yield next element. """
|
|
31
|
+
if self.__n < len(self.__elements):
|
|
32
|
+
self.__n += 1
|
|
33
|
+
return self.__elements[self.__n - 1]
|
|
34
|
+
else:
|
|
35
|
+
raise StopIteration
|
|
36
|
+
|
|
37
|
+
def __getitem__(self, key: Union[int, str]) -> Optional[Union[Self, str]]:
|
|
38
|
+
"""
|
|
39
|
+
Get an Element object by its index or an attribute value by its key
|
|
40
|
+
:param key: Index (integer) of an Element object or a key (string) of an attribute.
|
|
41
|
+
:return: The Element of passed index (returns last object if the key is out of range) or the value of the
|
|
42
|
+
selected attribute. Returns None, if no match was found.
|
|
43
|
+
"""
|
|
44
|
+
if isinstance(key, int) and len(self.__elements) > 0:
|
|
45
|
+
return self.__elements[min(key, len(self.__elements) - 1)]
|
|
46
|
+
elif isinstance(key, str) and key in self.__attributes:
|
|
47
|
+
return self.__attributes[key]
|
|
48
|
+
return None
|
|
49
|
+
|
|
50
|
+
def __setitem__(self, key: Union[int, str], value: Union[Self, str]) -> None:
|
|
51
|
+
"""
|
|
52
|
+
Set an Element object or an attribute value.
|
|
53
|
+
:param key: Index (integer) for an Element object or a key (string) for an attribute.
|
|
54
|
+
:param value: Element object (if key is of type integer) or a string (if key is of type string).
|
|
55
|
+
"""
|
|
56
|
+
if isinstance(key, int) and isinstance(value, Element) and len(self.__elements) > 0:
|
|
57
|
+
self.__elements[min(key, len(self.__elements) - 1)] = value
|
|
58
|
+
elif isinstance(key, str):
|
|
59
|
+
self.__attributes[key] = value
|
|
60
|
+
else:
|
|
61
|
+
raise ValueError('Invalid key or value')
|
|
62
|
+
|
|
63
|
+
def __contains__(self, key: Union[Self, str]) -> bool:
|
|
64
|
+
"""
|
|
65
|
+
Checks if an Element object or an attribute exists.
|
|
66
|
+
:param key: Element object or attribute key.
|
|
67
|
+
:return: True, if either the passed Element object or the attribute exists. Else return False.
|
|
68
|
+
"""
|
|
69
|
+
if isinstance(key, Element):
|
|
70
|
+
return key in self.__elements
|
|
71
|
+
elif isinstance(key, str):
|
|
72
|
+
return key in self.__attributes
|
|
73
|
+
return False
|
|
74
|
+
|
|
75
|
+
@property
|
|
76
|
+
def type(self) -> XMLType:
|
|
77
|
+
return self.__type
|
|
78
|
+
|
|
79
|
+
@type.setter
|
|
80
|
+
def type(self, value: XMLType) -> None:
|
|
81
|
+
self.__type = value
|
|
82
|
+
|
|
83
|
+
@property
|
|
84
|
+
def attributes(self) -> dict[str, str]:
|
|
85
|
+
return self.__attributes
|
|
86
|
+
|
|
87
|
+
@property
|
|
88
|
+
def elements(self) -> list[Self]:
|
|
89
|
+
return self.__elements
|
|
90
|
+
|
|
91
|
+
@property
|
|
92
|
+
def id(self) -> Optional[str]:
|
|
93
|
+
return self.__attributes.get('id', None)
|
|
94
|
+
|
|
95
|
+
@id.setter
|
|
96
|
+
def id(self, value: Optional[str]) -> None:
|
|
97
|
+
if value is None:
|
|
98
|
+
self.__attributes.pop('id', None)
|
|
99
|
+
else:
|
|
100
|
+
self.__attributes['id'] = value
|
|
101
|
+
|
|
102
|
+
@property
|
|
103
|
+
def text(self) -> Optional[str]:
|
|
104
|
+
return self.__text
|
|
105
|
+
|
|
106
|
+
@text.setter
|
|
107
|
+
def text(self, value: Optional[str]) -> None:
|
|
108
|
+
self.__text = None if value is None else str(value)
|
|
109
|
+
|
|
110
|
+
@classmethod
|
|
111
|
+
def new(cls, _type: XMLType, **attributes: str) -> Self:
|
|
112
|
+
"""
|
|
113
|
+
Create a new Element object from scratch.
|
|
114
|
+
:param _type: The type of element to create.
|
|
115
|
+
:param attributes: Named arguments that will be stores as xml attributes.
|
|
116
|
+
:return: The newly created Element object.
|
|
117
|
+
"""
|
|
118
|
+
attributes = {str(k): str(v) for k, v in attributes.items() if v is not None}
|
|
119
|
+
return cls(_type, attributes)
|
|
120
|
+
|
|
121
|
+
@classmethod
|
|
122
|
+
def from_etree(cls, tree: etree.Element) -> Self:
|
|
123
|
+
"""
|
|
124
|
+
Create a new Element object from a lxml etree object.
|
|
125
|
+
:param tree: lxml etree object.
|
|
126
|
+
:return: Element object that represents the passed etree object.
|
|
127
|
+
"""
|
|
128
|
+
element = cls(XMLType(tree.tag.split('}')[1]), dict(tree.items()))
|
|
129
|
+
element.text = tree.text
|
|
130
|
+
for child in tree:
|
|
131
|
+
element.add_element(Element.from_etree(child))
|
|
132
|
+
return element
|
|
133
|
+
|
|
134
|
+
def to_etree(self) -> etree.Element:
|
|
135
|
+
"""
|
|
136
|
+
Convert the Element object to a lxml etree object.
|
|
137
|
+
:return: A lxml etree object that represents this Element object.
|
|
138
|
+
"""
|
|
139
|
+
element = etree.Element(self.__type.value, **self.__attributes)
|
|
140
|
+
if self.__text is not None:
|
|
141
|
+
element.text = self.__text
|
|
142
|
+
for child in self.__elements:
|
|
143
|
+
element.append(child.to_etree())
|
|
144
|
+
return element
|
|
145
|
+
|
|
146
|
+
def is_region(self) -> bool:
|
|
147
|
+
""" Returns True, if the Element object is a region. """
|
|
148
|
+
return self.__type.value.endswith('Region')
|
|
149
|
+
|
|
150
|
+
def contains_text(self) -> bool:
|
|
151
|
+
""" Returns True, if the Element object contains text. """
|
|
152
|
+
return self.__text is not None
|
|
153
|
+
|
|
154
|
+
def set_attribute(self, key: str, value: Optional[str]) -> None:
|
|
155
|
+
"""
|
|
156
|
+
Set an attribute.
|
|
157
|
+
:param key: Key of attribute. Creates a new one if the key does not exist.
|
|
158
|
+
:param value: Value for the attribute. Deletes the attribute of None is passed.
|
|
159
|
+
"""
|
|
160
|
+
if value is None:
|
|
161
|
+
self.__attributes.pop(str(key), None)
|
|
162
|
+
else:
|
|
163
|
+
self.__attributes[str(key)] = str(value)
|
|
164
|
+
|
|
165
|
+
def delete_attribute(self, key: str) -> Optional[str]:
|
|
166
|
+
"""
|
|
167
|
+
Delete an attribute.
|
|
168
|
+
:param key: The key to delete.
|
|
169
|
+
:return: Returns the deleted attribute value. If the key does not exist, None is returned.
|
|
170
|
+
"""
|
|
171
|
+
return self.__attributes.pop(str(key), None)
|
|
172
|
+
|
|
173
|
+
def get_coords(self) -> Optional[Self]:
|
|
174
|
+
""" Return the first direct child Element object of type Coords. """
|
|
175
|
+
for element in self.__elements:
|
|
176
|
+
if element.type == XMLType.Coords:
|
|
177
|
+
return element
|
|
178
|
+
return None
|
|
179
|
+
|
|
180
|
+
def get_baseline(self) -> Optional[Self]:
|
|
181
|
+
""" Return the first direct child Element object of type Baseline. """
|
|
182
|
+
for element in self.__elements:
|
|
183
|
+
if element.type == XMLType.Baseline:
|
|
184
|
+
return element
|
|
185
|
+
return None
|
|
186
|
+
|
|
187
|
+
def add_element(self, element: Self, index: Optional[int] = None) -> None:
|
|
188
|
+
"""
|
|
189
|
+
Add an existing Element object to the list elements.
|
|
190
|
+
:param element: The element to add.
|
|
191
|
+
:param index: If set, insert the element at this index. Else append to the list.
|
|
192
|
+
"""
|
|
193
|
+
if index is None:
|
|
194
|
+
self.__elements.append(element)
|
|
195
|
+
else:
|
|
196
|
+
self.__elements.insert(min(index, len(self.__elements) - 1), element)
|
|
197
|
+
|
|
198
|
+
def create_element(self, _type: XMLType, index: int = None, **attributes: str) -> Self:
|
|
199
|
+
"""
|
|
200
|
+
Create a new Element object and add it to the list of elements.
|
|
201
|
+
:param _type: XMLType of new element.
|
|
202
|
+
:param index: If set, insert the new element at this index. Else append to the list.
|
|
203
|
+
:param attributes: Named arguments that will be stores as xml attributes.
|
|
204
|
+
:return: The newly created Element object.
|
|
205
|
+
"""
|
|
206
|
+
element = Element.new(_type, **attributes)
|
|
207
|
+
self.add_element(element, index)
|
|
208
|
+
return element
|
|
209
|
+
|
|
210
|
+
def remove_element(self, element: Union[Self, int]) -> Optional[Self]:
|
|
211
|
+
"""
|
|
212
|
+
Remove an element from the list of elements.
|
|
213
|
+
:param element: The Element object or the index of the element to remove.
|
|
214
|
+
:return: The removed element, if it existed.
|
|
215
|
+
"""
|
|
216
|
+
if isinstance(element, int) and element < len(self.__elements) - 1:
|
|
217
|
+
return self.__elements.pop(element)
|
|
218
|
+
elif isinstance(element, Element) and element in self.__elements:
|
|
219
|
+
self.__elements.remove(element)
|
|
220
|
+
return element
|
|
221
|
+
return None
|
|
222
|
+
|
|
223
|
+
def clear_elements(self) -> None:
|
|
224
|
+
""" Remove all Element objects from the list of elements. """
|
|
225
|
+
self.__elements.clear()
|
pypxml/page.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
# This file is licensed under the MIT License.
|
|
2
|
+
# Copyright (c) 2024 Janik Haitz
|
|
3
|
+
# See the LICENSE file in the root directory for more details.
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Union, Self
|
|
6
|
+
|
|
7
|
+
from lxml import etree
|
|
8
|
+
|
|
9
|
+
from .element import Element
|
|
10
|
+
from .resources.xml_types import XMLType
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class Page:
|
|
14
|
+
""" Represents a page of a PageXML file. """
|
|
15
|
+
def __init__(self, attributes: Optional[dict[str, str]] = None):
|
|
16
|
+
self.__attributes: dict[str, str] = attributes if attributes else {}
|
|
17
|
+
self.__reading_order: list[str] = [] # region id's
|
|
18
|
+
self.__elements: list[Element] = []
|
|
19
|
+
|
|
20
|
+
def __len__(self) -> int:
|
|
21
|
+
""" Returns the number of elements. """
|
|
22
|
+
return len(self.__elements)
|
|
23
|
+
|
|
24
|
+
def __iter__(self) -> Self:
|
|
25
|
+
""" Iterator: starting point for iterating over all elements that are regions. """
|
|
26
|
+
self.__n = 0
|
|
27
|
+
self.__regions = [element for element in self.__elements if element.is_region()]
|
|
28
|
+
return self
|
|
29
|
+
|
|
30
|
+
def __next__(self) -> Element:
|
|
31
|
+
""" Iterator: yield next element that is a region. """
|
|
32
|
+
if self.__n < len(self.__regions):
|
|
33
|
+
self.__n += 1
|
|
34
|
+
return self.__regions[self.__n - 1]
|
|
35
|
+
else:
|
|
36
|
+
raise StopIteration
|
|
37
|
+
|
|
38
|
+
def __getitem__(self, key: Union[int, str]) -> Optional[Union[Element, str]]:
|
|
39
|
+
"""
|
|
40
|
+
Get an Element object by its index or an attribute value by its key
|
|
41
|
+
:param key: Index (integer) of an Element object or a key (string) of an attribute.
|
|
42
|
+
:return: The Element of passed index (returns last object if the key is out of range) or the value of the
|
|
43
|
+
selected attribute. Returns None, if no match was found.
|
|
44
|
+
"""
|
|
45
|
+
if isinstance(key, int) and len(self.__elements) > 0:
|
|
46
|
+
return self.__elements[min(key, len(self.__elements) - 1)]
|
|
47
|
+
elif isinstance(key, str) and key in self.__attributes:
|
|
48
|
+
return self.__attributes[key]
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
def __setitem__(self, key: Union[int, str], value: Union[Element, str]) -> None:
|
|
52
|
+
"""
|
|
53
|
+
Set an Element object or an attribute value.
|
|
54
|
+
:param key: Index (integer) for an Element object or a key (string) for an attribute.
|
|
55
|
+
:param value: Element object (if key is of type integer) or a string (if key is of type string).
|
|
56
|
+
"""
|
|
57
|
+
if isinstance(key, int) and isinstance(value, Element) and len(self.__elements) > 0:
|
|
58
|
+
self.__elements[min(key, len(self.__elements) - 1)] = value
|
|
59
|
+
elif isinstance(key, str):
|
|
60
|
+
self.__attributes[key] = value
|
|
61
|
+
else:
|
|
62
|
+
raise ValueError('Invalid key or value')
|
|
63
|
+
|
|
64
|
+
def __contains__(self, key: Union[Element, str]) -> bool:
|
|
65
|
+
"""
|
|
66
|
+
Checks if an Element object or an attribute exists.
|
|
67
|
+
:param key: Element object or attribute key.
|
|
68
|
+
:return: True, if either the passed Element object or the attribute exists. Else return False.
|
|
69
|
+
"""
|
|
70
|
+
if isinstance(key, Element):
|
|
71
|
+
return key in self.__elements
|
|
72
|
+
elif isinstance(key, str):
|
|
73
|
+
return key in self.__attributes
|
|
74
|
+
return False
|
|
75
|
+
|
|
76
|
+
@property
|
|
77
|
+
def attributes(self) -> dict[str, str]:
|
|
78
|
+
return self.__attributes
|
|
79
|
+
|
|
80
|
+
@property
|
|
81
|
+
def reading_order(self) -> list[str]:
|
|
82
|
+
return self.__reading_order
|
|
83
|
+
|
|
84
|
+
@property
|
|
85
|
+
def elements(self) -> list[Element]:
|
|
86
|
+
return self.__elements
|
|
87
|
+
|
|
88
|
+
@property
|
|
89
|
+
def regions(self) -> list[Element]:
|
|
90
|
+
return list([element for element in self.__regions if element.is_region()])
|
|
91
|
+
|
|
92
|
+
@property
|
|
93
|
+
def image_filename(self) -> Optional[str]:
|
|
94
|
+
return self.__attributes.get('imageFilename', None)
|
|
95
|
+
|
|
96
|
+
@image_filename.setter
|
|
97
|
+
def image_filename(self, filename: Optional[str]) -> None:
|
|
98
|
+
if filename is None:
|
|
99
|
+
self.__attributes.pop('imageFilename', None)
|
|
100
|
+
else:
|
|
101
|
+
self.__attributes['imageFilename'] = str(filename)
|
|
102
|
+
|
|
103
|
+
@property
|
|
104
|
+
def width(self) -> Optional[int]:
|
|
105
|
+
if (w := self.__attributes.get('imageWidth', None)) is not None:
|
|
106
|
+
return int(w)
|
|
107
|
+
return None
|
|
108
|
+
|
|
109
|
+
@property
|
|
110
|
+
def height(self) -> Optional[int]:
|
|
111
|
+
if (h := self.__attributes.get('imageHeight', None)) is not None:
|
|
112
|
+
return int(h)
|
|
113
|
+
return None
|
|
114
|
+
|
|
115
|
+
@classmethod
|
|
116
|
+
def new(cls, **attributes: str) -> Self:
|
|
117
|
+
"""
|
|
118
|
+
Create a new Page object from scratch.
|
|
119
|
+
:param attributes: Named arguments that will be stored as attributes.
|
|
120
|
+
:return: The newly created Page object.
|
|
121
|
+
"""
|
|
122
|
+
attributes = {str(k): str(v) for k, v in attributes.items() if v is not None}
|
|
123
|
+
return cls(attributes)
|
|
124
|
+
|
|
125
|
+
@classmethod
|
|
126
|
+
def from_etree(cls, tree: etree.Element) -> Self:
|
|
127
|
+
"""
|
|
128
|
+
Create a new Page object from a lxml etree object.
|
|
129
|
+
:param tree: lxml etree object.
|
|
130
|
+
:return: Page object that represents the passed etree object.
|
|
131
|
+
"""
|
|
132
|
+
page = cls(dict(tree.items()))
|
|
133
|
+
if (ro := tree.find('./{*}ReadingOrder')) is not None:
|
|
134
|
+
if (ro_elements := tree.findall('../{*}RegionRefIndexed')) is not None:
|
|
135
|
+
page._ro = list([i.get('regionRef') for i in sorted(list(ro_elements), key=lambda i: i.get('index'))])
|
|
136
|
+
tree.remove(ro)
|
|
137
|
+
for element in tree:
|
|
138
|
+
page.add_element(Element.from_etree(element), ro=False)
|
|
139
|
+
return page
|
|
140
|
+
|
|
141
|
+
def to_etree(self) -> etree.Element:
|
|
142
|
+
"""
|
|
143
|
+
Convert the Page object to a lxml etree object.
|
|
144
|
+
:return: A lxml etree object that represents this Page object.
|
|
145
|
+
"""
|
|
146
|
+
page = etree.Element('Page', **self.__attributes)
|
|
147
|
+
if len(self.__reading_order) > 0:
|
|
148
|
+
reading_order = etree.SubElement(page, 'ReadingOrder')
|
|
149
|
+
order_group = etree.SubElement(reading_order, 'OrderedGroup', id='g0') # does id matter?
|
|
150
|
+
for i, rid in enumerate(self.__reading_order):
|
|
151
|
+
etree.SubElement(order_group, 'RegionRefIndexed', index=str(i), regionRef=rid)
|
|
152
|
+
for element in self.__elements:
|
|
153
|
+
page.append(element.to_etree())
|
|
154
|
+
return page
|
|
155
|
+
|
|
156
|
+
def set_attribute(self, key: str, value: Optional[str]) -> None:
|
|
157
|
+
"""
|
|
158
|
+
Set or create an attribute.
|
|
159
|
+
:param key: Key of attribute. Creates a new one if the key does not exist.
|
|
160
|
+
:param value: Value for the attribute. Deletes the attribute of None is passed.
|
|
161
|
+
"""
|
|
162
|
+
if value is None:
|
|
163
|
+
self.__attributes.pop(str(key), None)
|
|
164
|
+
else:
|
|
165
|
+
self.__attributes[str(key)] = str(value)
|
|
166
|
+
|
|
167
|
+
def delete_attribute(self, key: str) -> Optional[str]:
|
|
168
|
+
"""
|
|
169
|
+
Delete an attribute.
|
|
170
|
+
:param key: The key to delete.
|
|
171
|
+
:return: Returns the deleted attribute value. If the key does not exist, None is returned.
|
|
172
|
+
"""
|
|
173
|
+
return self.__attributes.pop(str(key), None)
|
|
174
|
+
|
|
175
|
+
def get_regions(self, region: Optional[XMLType] = None) -> list[Element]:
|
|
176
|
+
"""
|
|
177
|
+
Return a list of all direct child elements that are regions.
|
|
178
|
+
:param region: Only select a specific region type.
|
|
179
|
+
:return: List of matching Element objects.
|
|
180
|
+
"""
|
|
181
|
+
if region is None:
|
|
182
|
+
return list([e for e in self.__elements if e.is_region()])
|
|
183
|
+
return list([e for e in self.__elements if e.type == region])
|
|
184
|
+
|
|
185
|
+
def add_element(self, element: Element, index: Optional[int] = None, ro: bool = True) -> None:
|
|
186
|
+
"""
|
|
187
|
+
Add an existing Element object to the list elements.
|
|
188
|
+
:param element: The element to add.
|
|
189
|
+
:param index: If set, insert the element at this index. Else append to the list.
|
|
190
|
+
:param ro: If set to true, add the element to the reading order at the specified index.
|
|
191
|
+
Only if the element is a region.
|
|
192
|
+
"""
|
|
193
|
+
if index is None:
|
|
194
|
+
self.__elements.append(element)
|
|
195
|
+
if ro and element.is_region() and element.id:
|
|
196
|
+
self.__reading_order.append(element.id)
|
|
197
|
+
else:
|
|
198
|
+
self.__elements.insert(min(index, len(self.__elements) - 1), element)
|
|
199
|
+
if ro and element.is_region() and element.id:
|
|
200
|
+
self.__reading_order.insert(min(index, len(self.__elements) - 1), element.id)
|
|
201
|
+
|
|
202
|
+
def create_element(self, _type: XMLType, index: int = None, ro: bool = True, **attributes: str) -> Element:
|
|
203
|
+
"""
|
|
204
|
+
Create a new Element object and add it to the list of elements.
|
|
205
|
+
:param _type: XMLType of new element.
|
|
206
|
+
:param index: If set, insert the new element at this index. Else append to the list.
|
|
207
|
+
:param ro: If set to true, add the element to the reading order at the specified index.
|
|
208
|
+
Only if the element is a region.
|
|
209
|
+
:param attributes: Named arguments that will be stores as xml attributes.
|
|
210
|
+
:return: The newly created Element object.
|
|
211
|
+
"""
|
|
212
|
+
element = Element.new(_type, **attributes)
|
|
213
|
+
self.add_element(element, index, ro)
|
|
214
|
+
return element
|
|
215
|
+
|
|
216
|
+
def remove_element(self, element: Union[Element, int]) -> Optional[Element]:
|
|
217
|
+
"""
|
|
218
|
+
Remove an element from the list of elements.
|
|
219
|
+
:param element: The Element object or the index of the element to remove.
|
|
220
|
+
:return: The removed element, if it existed.
|
|
221
|
+
"""
|
|
222
|
+
if isinstance(element, int) and element < len(self.__elements) - 1:
|
|
223
|
+
return self.__elements.pop(element)
|
|
224
|
+
elif isinstance(element, Element) and element in self.__elements:
|
|
225
|
+
self.__elements.remove(element)
|
|
226
|
+
return element
|
|
227
|
+
return None
|
|
228
|
+
|
|
229
|
+
def clear_elements(self) -> None:
|
|
230
|
+
""" Remove all Element objects from the list of elements. """
|
|
231
|
+
self.__elements.clear()
|
|
232
|
+
self.clear_reading_order()
|
|
233
|
+
|
|
234
|
+
def clear_regions(self) -> None:
|
|
235
|
+
""" Remove all Element objects from the list of elements, that are regions. """
|
|
236
|
+
for element in self.__elements:
|
|
237
|
+
if element.is_region():
|
|
238
|
+
self.__elements.remove(element)
|
|
239
|
+
if element.id and element.id in self.__reading_order:
|
|
240
|
+
self.__reading_order.remove(element.id)
|
|
241
|
+
|
|
242
|
+
def clear_reading_order(self) -> None:
|
|
243
|
+
""" Reset the reading order. """
|
|
244
|
+
self.__reading_order.clear()
|
pypxml/pxml.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
1
|
+
# This file is licensed under the MIT License.
|
|
2
|
+
# Copyright (c) 2024 Janik Haitz
|
|
3
|
+
# See the LICENSE file in the root directory for more details.
|
|
4
|
+
|
|
5
|
+
from typing import Optional, Union, Self
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
import json
|
|
9
|
+
|
|
10
|
+
from lxml import etree
|
|
11
|
+
|
|
12
|
+
from .page import Page
|
|
13
|
+
from .resources.xml_schema import XMLSchema
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class PageXML:
|
|
17
|
+
""" Represents a PageXML file. """
|
|
18
|
+
def __init__(self, creator: str, created: str, changed: str) -> None:
|
|
19
|
+
self.__creator: str = creator
|
|
20
|
+
self.__created: str = created
|
|
21
|
+
self.__changed: str = changed
|
|
22
|
+
self.__pages: list[Page] = []
|
|
23
|
+
|
|
24
|
+
def __len__(self) -> int:
|
|
25
|
+
""" Returns the number of pages. """
|
|
26
|
+
return len(self.__pages)
|
|
27
|
+
|
|
28
|
+
def __iter__(self) -> Self:
|
|
29
|
+
""" Iterator: starting point for iterating over all pages. """
|
|
30
|
+
self.__n = 0
|
|
31
|
+
return self
|
|
32
|
+
|
|
33
|
+
def __next__(self) -> Page:
|
|
34
|
+
""" Iterator: yield next page. """
|
|
35
|
+
if self.__n < len(self.__pages):
|
|
36
|
+
self.__n += 1
|
|
37
|
+
return self.__pages[self.__n - 1]
|
|
38
|
+
else:
|
|
39
|
+
raise StopIteration
|
|
40
|
+
|
|
41
|
+
def __getitem__(self, key: int) -> Optional[Page]:
|
|
42
|
+
"""
|
|
43
|
+
Get the Page object of a given index.
|
|
44
|
+
:param key: The index value of the Page object.
|
|
45
|
+
:return: Page object if pages are available. Returns last page if index is out of range.
|
|
46
|
+
"""
|
|
47
|
+
if len(self.__pages) > 0:
|
|
48
|
+
return self.__pages[min(key, len(self.__pages) - 1)]
|
|
49
|
+
return None
|
|
50
|
+
|
|
51
|
+
def __setitem__(self, key: int, value: Page) -> None:
|
|
52
|
+
"""
|
|
53
|
+
Set a Page object for a given index.
|
|
54
|
+
:param key: The target index for the Page object.
|
|
55
|
+
:param value: The new Page object.
|
|
56
|
+
"""
|
|
57
|
+
if len(self.__pages) > 0:
|
|
58
|
+
self.__pages[min(key, len(self.__pages) - 1)] = value
|
|
59
|
+
|
|
60
|
+
def __contains__(self, key: Page) -> bool:
|
|
61
|
+
"""
|
|
62
|
+
Checks if a Page objects exists.
|
|
63
|
+
:param key: The Page object.
|
|
64
|
+
:return: True, if the Page object exists. Else return False.
|
|
65
|
+
"""
|
|
66
|
+
if isinstance(key, Page):
|
|
67
|
+
return key in self.__pages
|
|
68
|
+
return False
|
|
69
|
+
|
|
70
|
+
@property
|
|
71
|
+
def creator(self) -> Optional[str]:
|
|
72
|
+
return self.__creator
|
|
73
|
+
|
|
74
|
+
@creator.setter
|
|
75
|
+
def creator(self, creator: str) -> None:
|
|
76
|
+
self.__creator = str(creator)
|
|
77
|
+
|
|
78
|
+
@property
|
|
79
|
+
def created(self) -> Optional[str]:
|
|
80
|
+
return self.__created
|
|
81
|
+
|
|
82
|
+
@created.setter
|
|
83
|
+
def created(self, created: Union[datetime, str]) -> None:
|
|
84
|
+
if isinstance(created, datetime):
|
|
85
|
+
self.__created = created.isoformat()
|
|
86
|
+
else:
|
|
87
|
+
self.__created = str(created)
|
|
88
|
+
|
|
89
|
+
@property
|
|
90
|
+
def changed(self) -> Optional[str]:
|
|
91
|
+
return self.__changed
|
|
92
|
+
|
|
93
|
+
@changed.setter
|
|
94
|
+
def changed(self, changed: Union[datetime, str]) -> None:
|
|
95
|
+
if isinstance(changed, datetime):
|
|
96
|
+
self.__changed = changed.isoformat()
|
|
97
|
+
else:
|
|
98
|
+
self.__changed = str(changed)
|
|
99
|
+
|
|
100
|
+
@property
|
|
101
|
+
def pages(self) -> list[Page]:
|
|
102
|
+
return self.__pages
|
|
103
|
+
|
|
104
|
+
@classmethod
|
|
105
|
+
def new(cls, creator: str = 'PyPXML') -> Self:
|
|
106
|
+
"""
|
|
107
|
+
Create a new PageXML object from scratch.
|
|
108
|
+
:param creator: Specify creator tag in PageXMLs metadata.
|
|
109
|
+
:return: Newly created PageXML object.
|
|
110
|
+
"""
|
|
111
|
+
return cls(creator, datetime.now().isoformat(), datetime.now().isoformat())
|
|
112
|
+
|
|
113
|
+
@classmethod
|
|
114
|
+
def from_etree(cls, tree: etree.Element) -> Self:
|
|
115
|
+
"""
|
|
116
|
+
Create a new PageXML object from a lxml etree object.
|
|
117
|
+
:param tree: lxml etree object.
|
|
118
|
+
:return: PageXML object that represents the passed etree object.
|
|
119
|
+
"""
|
|
120
|
+
if (md_tree := tree.find('./{*}Metadata')) is not None:
|
|
121
|
+
if (creator := md_tree.find('./{*}Creator')) is not None:
|
|
122
|
+
creator = creator.text
|
|
123
|
+
if (created := md_tree.find('./{*}Created')) is not None:
|
|
124
|
+
created = created.text
|
|
125
|
+
if (last_change := md_tree.find('./{*}LastChange')) is not None:
|
|
126
|
+
last_change = last_change.text
|
|
127
|
+
pxml = cls(creator, created, last_change)
|
|
128
|
+
else:
|
|
129
|
+
pxml = cls.new()
|
|
130
|
+
if (pages := tree.findall('./{*}Page')) is not None:
|
|
131
|
+
for page_tree in pages:
|
|
132
|
+
pxml.add_page(Page.from_etree(page_tree))
|
|
133
|
+
return pxml
|
|
134
|
+
|
|
135
|
+
def to_etree(self, version: str = '2019', schema_file: Optional[Path] = None) -> etree.Element:
|
|
136
|
+
"""
|
|
137
|
+
Convert a PageXML object to a lxml etree element.
|
|
138
|
+
:param version: PageXML Version to use. Currently supported: `2019`.
|
|
139
|
+
:param schema_file: Custom schema in json format.
|
|
140
|
+
:return: A lxml etree object that represents this PageXML object.
|
|
141
|
+
"""
|
|
142
|
+
self.changed = datetime.now().isoformat()
|
|
143
|
+
if schema_file is not None:
|
|
144
|
+
with open(schema_file) as stream:
|
|
145
|
+
schema = XMLSchema.custom('pagexml', version, json.load(stream))
|
|
146
|
+
else:
|
|
147
|
+
schema = XMLSchema.pagexml(version)
|
|
148
|
+
xsi_qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", 'schemaLocation')
|
|
149
|
+
nsmap = { None: schema['xmlns'], 'xsi': schema['xmlns_xsi'] }
|
|
150
|
+
root = etree.Element('PcGts', { xsi_qname: schema['xsi_schema_location'] }, nsmap=nsmap)
|
|
151
|
+
metadata = etree.SubElement(root, 'Metadata')
|
|
152
|
+
etree.SubElement(metadata, 'Creator').text = self.__creator
|
|
153
|
+
etree.SubElement(metadata, 'Created').text = self.__created
|
|
154
|
+
etree.SubElement(metadata, 'LastChange').text = self.__changed
|
|
155
|
+
for page in self.__pages:
|
|
156
|
+
root.append(page.to_etree())
|
|
157
|
+
return root
|
|
158
|
+
|
|
159
|
+
@classmethod
|
|
160
|
+
def from_xml(cls, fp: Union[Path, str], encoding: Optional[str] = None) -> Self:
|
|
161
|
+
"""
|
|
162
|
+
Create a new PageXML object from a PageXML file.
|
|
163
|
+
:param fp: Path of PageXML file.
|
|
164
|
+
:param encoding: Set custom encoding.
|
|
165
|
+
:return: PageXML object.
|
|
166
|
+
"""
|
|
167
|
+
parser = etree.XMLParser(remove_blank_text=True, encoding=encoding)
|
|
168
|
+
tree = etree.parse(fp, parser).getroot()
|
|
169
|
+
return cls.from_etree(tree)
|
|
170
|
+
|
|
171
|
+
def to_xml(self, fp: Union[Path, str], version: str = '2019-07-15', schema_file: Optional[Path] = None,
|
|
172
|
+
encoding: str = 'utf-8') -> None:
|
|
173
|
+
"""
|
|
174
|
+
Create a PageXML file from a PageXML file.
|
|
175
|
+
:param fp: Path to new PageXML file.
|
|
176
|
+
:param version: The PageXML version to use. Currently supported: `2019`.
|
|
177
|
+
:param schema_file: Custom schema in json format.
|
|
178
|
+
:param encoding: Set custom encoding.
|
|
179
|
+
"""
|
|
180
|
+
with open(fp, 'wb') as f:
|
|
181
|
+
tree = etree.tostring(self.to_etree(version, schema_file),
|
|
182
|
+
pretty_print=True,
|
|
183
|
+
encoding=encoding,
|
|
184
|
+
xml_declaration=True)
|
|
185
|
+
f.write(tree)
|
|
186
|
+
|
|
187
|
+
def add_page(self, page: Page, index: Optional[int] = None) -> None:
|
|
188
|
+
"""
|
|
189
|
+
Add a Page object to the list of pages.
|
|
190
|
+
:param page: The Page object to add.
|
|
191
|
+
:param index: If set, insert the Page object at this index.
|
|
192
|
+
"""
|
|
193
|
+
if index is None or index >= len(self.__pages) - 1:
|
|
194
|
+
self.__pages.append(page)
|
|
195
|
+
else:
|
|
196
|
+
self.__pages.insert(index, page)
|
|
197
|
+
|
|
198
|
+
def create_page(self, index: Optional[int] = None, **attributes: str) -> Page:
|
|
199
|
+
"""
|
|
200
|
+
Create a new Page object and add it to the list of pages.
|
|
201
|
+
:param index: If set, insert the Page object at this index.
|
|
202
|
+
:param attributes: Named arguments that will be stores as xml attributes.
|
|
203
|
+
:return: The newly created Page object.
|
|
204
|
+
"""
|
|
205
|
+
page = Page.new(**attributes)
|
|
206
|
+
self.add_page(page, index)
|
|
207
|
+
return page
|
|
208
|
+
|
|
209
|
+
def remove_page(self, page: Union[Page, int]) -> Optional[Page]:
|
|
210
|
+
"""
|
|
211
|
+
Remove a Page object from the list of pages.
|
|
212
|
+
:param page: The index of the Page object to remove or the Page object itself.
|
|
213
|
+
:return: The Page object that was removed if it existed.
|
|
214
|
+
"""
|
|
215
|
+
if isinstance(page, Page) and page in self.__pages:
|
|
216
|
+
self.__pages.remove(page)
|
|
217
|
+
return page
|
|
218
|
+
elif isinstance(page, int) and page < len(self.__pages):
|
|
219
|
+
return self.__pages.pop(page)
|
|
220
|
+
|
|
221
|
+
def clear_pages(self) -> None:
    """ Empty the list of pages in place, dropping every Page object it holds. """
    del self.__pages[:]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# This file is licensed under the MIT License.
|
|
2
|
+
# Copyright (c) 2024 Janik Haitz
|
|
3
|
+
# See the LICENSE file in the root directory for more details.
|
|
4
|
+
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Header attributes (`xmlns`, `xmlns_xsi`, `xsi_schema_location`) of the built-in schemas,
# keyed first by schema name and then by version. Each version key must point to the namespace
# of the same year (the 2017 and 2019 entries were previously swapped).
DEFAULT_SCHEMA = {
    'pagexml': {
        '2019': {
            'xmlns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15',
            'xmlns_xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'xsi_schema_location': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd'
        },
        '2017': {
            'xmlns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15',
            'xmlns_xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'xsi_schema_location': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15/pagecontent.xsd'
        }
    }
}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class XMLSchema:
    """ Lookup helpers for the xml header attributes of built-in and user-supplied schemas. """

    @staticmethod
    def pagexml(version: Literal['2019', '2017'] = '2019') -> dict[str, str]:
        """
        Look up the built-in PageXML header attributes for one version.
        :param version: The PageXML version to use.
        :return: A dictionary containing all header attributes: `xmlns`, `xmlns_xsi`, `xsi_schema_location`
        """
        pagexml_versions = DEFAULT_SCHEMA['pagexml']
        return pagexml_versions[version]

    @staticmethod
    def custom(schema: str, version: str, custom: dict) -> dict[str, str]:
        """
        Look up the header attributes of one schema version inside a user-supplied schema table.
        :param schema: The schema to use (first-level key of `custom`).
        :param version: The version to use (second-level key of `custom`).
        :param custom: A dictionary containing custom xml schema values. See DEFAULT_SCHEMA as example.
        :return: A dictionary containing all header attributes provided by the custom schema.
        """
        schema_versions = custom[schema]
        return schema_versions[version]
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# This file is licensed under the MIT License.
|
|
2
|
+
# Copyright (c) 2024 Janik Haitz
|
|
3
|
+
# See the LICENSE file in the root directory for more details.
|
|
4
|
+
|
|
5
|
+
from enum import Enum
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class XMLType(Enum):
    """
    Element names that can occur inside a PageXML `Page`; each member's value is the xml tag name.
    Reference:
    https://ocr-d.de/de/gt-guidelines/pagexml/pagecontent_xsd_Complex_Type_pc_PcGtsType.html#PcGtsType_Page
    """

    # Reading order structure
    ReadingOrder = 'ReadingOrder'
    OrderedGroup = 'OrderedGroup'
    RegionRefIndexed = 'RegionRefIndexed'

    # Region types
    AdvertRegion = 'AdvertRegion'
    ChartRegion = 'ChartRegion'
    ChemRegion = 'ChemRegion'
    CustomRegion = 'CustomRegion'
    GraphicRegion = 'GraphicRegion'
    ImageRegion = 'ImageRegion'
    LineDrawingRegion = 'LineDrawingRegion'
    MapRegion = 'MapRegion'
    MathsRegion = 'MathsRegion'
    MusicRegion = 'MusicRegion'
    NoiseRegion = 'NoiseRegion'
    SeparatorRegion = 'SeparatorRegion'
    TableRegion = 'TableRegion'
    TextRegion = 'TextRegion'
    UnknownRegion = 'UnknownRegion'

    # Non-region elements
    AlternativeImage = 'AlternativeImage'
    Baseline = 'Baseline'
    Border = 'Border'
    Coords = 'Coords'
    Glyph = 'Glyph'
    GraphemeGroup = 'GraphemeGroup'
    Graphemes = 'Graphemes'
    Grid = 'Grid'
    Label = 'Label'
    Labels = 'Labels'
    Layers = 'Layers'
    Metadata = 'Metadata'
    NonPrintingChar = 'NonPrintingChar'
    PlainText = 'PlainText'
    PrintSpace = 'PrintSpace'
    Relations = 'Relations'
    Roles = 'Roles'
    TextEquiv = 'TextEquiv'
    TextLine = 'TextLine'
    TextStyle = 'TextStyle'
    Unicode = 'Unicode'
    UserAttribute = 'UserAttribute'
    UserDefined = 'UserDefined'
    Word = 'Word'
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Janik Haitz
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: pypxml
|
|
3
|
+
Version: 1.0
|
|
4
|
+
Summary: A python library for parsing, converting and modifying PageXML files.
|
|
5
|
+
Author-email: Janik Haitz <jahtz.dev@proton.me>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2024 Janik Haitz
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: repository, https://github.com/jahtz/pypxml
|
|
29
|
+
Keywords: PageXML,XML,OCR,optical character recognition
|
|
30
|
+
Classifier: Programming Language :: Python :: 3
|
|
31
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
32
|
+
Classifier: Operating System :: OS Independent
|
|
33
|
+
Requires-Python: >=3.11
|
|
34
|
+
Description-Content-Type: text/markdown
|
|
35
|
+
License-File: LICENSE
|
|
36
|
+
Requires-Dist: lxml ~=5.3.0
|
|
37
|
+
Requires-Dist: click ~=8.1.7
|
|
38
|
+
|
|
39
|
+
from src.pypxml import XMLSchema
|
|
40
|
+
|
|
41
|
+
# PyPXML
|
|
42
|
+
A python library for parsing, converting and modifying PageXML files.
|
|
43
|
+
|
|
44
|
+
## Setup
|
|
45
|
+
```shell
|
|
46
|
+
pip install pypxml
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
### Install from source
|
|
50
|
+
1. Clone repository: `git clone https://github.com/jahtz/pypxml`
|
|
51
|
+
2. Install package: `cd pypxml && pip install .`
|
|
52
|
+
3. Test with `pypxml --version`
|
|
53
|
+
|
|
54
|
+
## CLI
|
|
55
|
+
```
|
|
56
|
+
pypxml [OPTIONS] COMMAND [ARGS]...
|
|
57
|
+
```
|
|
58
|
+
Coming in version 2.x
|
|
59
|
+
|
|
60
|
+
## API
|
|
61
|
+
PyPXML provides a feature-rich Python API for working with PageXML files.
|
|
62
|
+
|
|
63
|
+
### Basics
|
|
64
|
+
```python
|
|
65
|
+
from pypxml import PageXML, Page, Element, XMLType
|
|
66
|
+
|
|
67
|
+
pxml = PageXML.from_xml('path_to_pagexml.xml')
|
|
68
|
+
page1 = pxml.create_page(imageFilename='0001.png',
|
|
69
|
+
imageWidth=1000,
|
|
70
|
+
imageHeight=2500)
|
|
71
|
+
page1.create_element(XMLType.TextRegion, id='ir01')
|
|
72
|
+
pxml.to_xml('path_to_output.xml')
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
### PageXML class
|
|
76
|
+
```python
|
|
77
|
+
from pypxml import PageXML
|
|
78
|
+
|
|
79
|
+
# open file
|
|
80
|
+
pxml = PageXML.from_xml('path_to.xml')
|
|
81
|
+
# or create new PageXML
|
|
82
|
+
pxml = PageXML.new()
|
|
83
|
+
|
|
84
|
+
# edit metadata
|
|
85
|
+
pxml.creator = 'yourname'
|
|
86
|
+
...
|
|
87
|
+
|
|
88
|
+
# create a page
|
|
89
|
+
page = pxml.create_page(imageFilename='0001.png',
|
|
90
|
+
imageWidth=1000,
|
|
91
|
+
imageHeight='2500')
|
|
92
|
+
# or add existing page
|
|
93
|
+
pxml.add_page(page) # see below
|
|
94
|
+
|
|
95
|
+
# iterate over pages
|
|
96
|
+
for page in pxml:
|
|
97
|
+
...
|
|
98
|
+
|
|
99
|
+
# delete or modify pages
|
|
100
|
+
pxml[0] = ...
|
|
101
|
+
pxml.remove_page(pxml[1])
|
|
102
|
+
|
|
103
|
+
# save object to file
|
|
104
|
+
pxml.to_xml('output.xml')
|
|
105
|
+
...
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Page class
|
|
109
|
+
```python
|
|
110
|
+
from pypxml import Page, XMLType
|
|
111
|
+
|
|
112
|
+
# create a page
|
|
113
|
+
page = Page.new(imageFilename='0001.png',
|
|
114
|
+
imageWidth=1000,
|
|
115
|
+
imageHeight=2500)
|
|
116
|
+
|
|
117
|
+
# modify attributes
|
|
118
|
+
page['imageFilename'] = '0002.png'
|
|
119
|
+
# or get element by index
|
|
120
|
+
element = page[3]
|
|
121
|
+
|
|
122
|
+
# add elements (automatically added to reading order if it is a region)
|
|
123
|
+
text_region = page.create_element(XMLType.TextRegion, id='tr1')
|
|
124
|
+
# or add existing element
|
|
125
|
+
page.add_element(element)
|
|
126
|
+
|
|
127
|
+
# iterate over regions
|
|
128
|
+
for region in page:
|
|
129
|
+
...
|
|
130
|
+
...
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
### Element class
|
|
134
|
+
```python
|
|
135
|
+
from pypxml import Element, XMLType
|
|
136
|
+
|
|
137
|
+
# create an element
|
|
138
|
+
coords = Element.new(XMLType.Coords,
|
|
139
|
+
points='1,2 3,4 5,6 7,8')
|
|
140
|
+
# modify attributes
|
|
141
|
+
coords['points'] = 'some other coords'
|
|
142
|
+
# or get element by index
|
|
143
|
+
baseline = text_region[2]
|
|
144
|
+
|
|
145
|
+
# check if element is a region
|
|
146
|
+
if text_region.is_region():
|
|
147
|
+
...
|
|
148
|
+
|
|
149
|
+
# get coords and baseline, if they exist
|
|
150
|
+
coords = text_line.get_coords()
|
|
151
|
+
baseline = text_line.get_baseline()
|
|
152
|
+
...
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## ZPD
|
|
156
|
+
Developed at Centre for [Philology and Digitality](https://www.uni-wuerzburg.de/en/zpd/) (ZPD), [University of Würzburg](https://www.uni-wuerzburg.de/en/).
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
cli/pypxml_cli.py,sha256=Ui5FqdPaYk43686AytkKB6L66mDmbxGdE2s9f8PC_eU,397
|
|
3
|
+
pypxml/__init__.py,sha256=gGyTUwm-H6Z_iESkAKUhjEVMUldRkZs7wqOKjpfHmKc,307
|
|
4
|
+
pypxml/element.py,sha256=ivvkS-ktcQzU7pxbJb1uZut7ShF0a3s7WXzFSebFTq8,8530
|
|
5
|
+
pypxml/page.py,sha256=6mqyGS9VsM2NMX87h6AzhagudjkSWuqnpiWH-x3Zuao,10131
|
|
6
|
+
pypxml/pxml.py,sha256=LHPiHI4gaKxzacqPTVEch2Zx1LnjZSA7yXMqJ_LLpdk,8385
|
|
7
|
+
pypxml/resources/__init__.py,sha256=5LWKUrEyh94LutFHtwZxjieOnSeS-o0afz6qZMmRydU,144
|
|
8
|
+
pypxml/resources/xml_schema.py,sha256=FDOX_tK-ctN3oxxXiL9-ocuPSV8vChmvom27cJ1fleA,1881
|
|
9
|
+
pypxml/resources/xml_types.py,sha256=UBHWVznBwSTzYXdp0hpbbeChXsYozkNeL1qvHxQgwwY,1620
|
|
10
|
+
pypxml-1.0.dist-info/LICENSE,sha256=YtURTiiG41gsIDU4gyMNba_R6Db-J_GvXZvtXGVUyZo,1068
|
|
11
|
+
pypxml-1.0.dist-info/METADATA,sha256=2har3hD2fOqyaq3Ac4AaIZwEYg2ODyoDwFaJSLsXP2o,4400
|
|
12
|
+
pypxml-1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
|
13
|
+
pypxml-1.0.dist-info/entry_points.txt,sha256=zkGOJGSpAat6xMZrzonPstt_BiHgTBkH0aEYb9SpX1w,46
|
|
14
|
+
pypxml-1.0.dist-info/top_level.txt,sha256=wz-ismkaMeQqzKDV6hKEH3gMDZT9mVbLSqnha04iUZE,11
|
|
15
|
+
pypxml-1.0.dist-info/RECORD,,
|