pypxml 1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
cli/__init__.py ADDED
File without changes
cli/pypxml_cli.py ADDED
@@ -0,0 +1,15 @@
1
+ import click
2
+
3
+
4
# Root command group of the PyPXML command line interface. The decorators register
# the `--help` and `--version` options; the function body itself is intentionally empty.
@click.group()
@click.help_option('--help')
@click.version_option(
    '1.0',
    '--version',
    prog_name='PyPXML',
    message=(
        '%(prog)s v%(version)s - Developed at Centre for Philology and Digitality (ZPD), '
        'University of Würzburg'
    ),
)
def cli():
    """
    PyPXML command line interface entry point.
    """
15
+
pypxml/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # This file is licensed under the MIT License.
2
+ # Copyright (c) 2024 Janik Haitz
3
+ # See the LICENSE file in the root directory for more details.
4
+
5
+ from .pxml import PageXML
6
+ from .page import Page
7
+ from .element import Element
8
+ from .resources.xml_schema import XMLSchema
9
+ from .resources.xml_types import XMLType
pypxml/element.py ADDED
@@ -0,0 +1,225 @@
1
+ # This file is licensed under the MIT License.
2
+ # Copyright (c) 2024 Janik Haitz
3
+ # See the LICENSE file in the root directory for more details.
4
+
5
+ from typing import Optional, Union, Self
6
+
7
+ from lxml import etree
8
+
9
+ from .resources.xml_types import XMLType
10
+
11
+
12
+ class Element:
13
+ """ Represents an element inside a page. """
14
+ def __init__(self, _type: XMLType, attributes: Optional[dict[str, str]] = None):
15
+ self.__type: XMLType = _type
16
+ self.__attributes: dict[str, str] = attributes if attributes else {}
17
+ self.__elements: list[Element] = []
18
+ self.__text: Optional[str] = None
19
+
20
+ def __len__(self) -> int:
21
+ """ Returns the number of elements. """
22
+ return len(self.__elements)
23
+
24
+ def __iter__(self) -> Self:
25
+ """ Iterator: starting point for iterating over all elements. """
26
+ self.__n = 0
27
+ return self
28
+
29
+ def __next__(self) -> Self:
30
+ """ Iterator: yield next element. """
31
+ if self.__n < len(self.__elements):
32
+ self.__n += 1
33
+ return self.__elements[self.__n - 1]
34
+ else:
35
+ raise StopIteration
36
+
37
+ def __getitem__(self, key: Union[int, str]) -> Optional[Union[Self, str]]:
38
+ """
39
+ Get an Element object by its index or an attribute value by its key
40
+ :param key: Index (integer) of an Element object or a key (string) of an attribute.
41
+ :return: The Element of passed index (returns last object if the key is out of range) or the value of the
42
+ selected attribute. Returns None, if no match was found.
43
+ """
44
+ if isinstance(key, int) and len(self.__elements) > 0:
45
+ return self.__elements[min(key, len(self.__elements) - 1)]
46
+ elif isinstance(key, str) and key in self.__attributes:
47
+ return self.__attributes[key]
48
+ return None
49
+
50
+ def __setitem__(self, key: Union[int, str], value: Union[Self, str]) -> None:
51
+ """
52
+ Set an Element object or an attribute value.
53
+ :param key: Index (integer) for an Element object or a key (string) for an attribute.
54
+ :param value: Element object (if key is of type integer) or a string (if key is of type string).
55
+ """
56
+ if isinstance(key, int) and isinstance(value, Element) and len(self.__elements) > 0:
57
+ self.__elements[min(key, len(self.__elements) - 1)] = value
58
+ elif isinstance(key, str):
59
+ self.__attributes[key] = value
60
+ else:
61
+ raise ValueError('Invalid key or value')
62
+
63
+ def __contains__(self, key: Union[Self, str]) -> bool:
64
+ """
65
+ Checks if an Element object or an attribute exists.
66
+ :param key: Element object or attribute key.
67
+ :return: True, if either the passed Element object or the attribute exists. Else return False.
68
+ """
69
+ if isinstance(key, Element):
70
+ return key in self.__elements
71
+ elif isinstance(key, str):
72
+ return key in self.__attributes
73
+ return False
74
+
75
+ @property
76
+ def type(self) -> XMLType:
77
+ return self.__type
78
+
79
+ @type.setter
80
+ def type(self, value: XMLType) -> None:
81
+ self.__type = value
82
+
83
+ @property
84
+ def attributes(self) -> dict[str, str]:
85
+ return self.__attributes
86
+
87
+ @property
88
+ def elements(self) -> list[Self]:
89
+ return self.__elements
90
+
91
+ @property
92
+ def id(self) -> Optional[str]:
93
+ return self.__attributes.get('id', None)
94
+
95
+ @id.setter
96
+ def id(self, value: Optional[str]) -> None:
97
+ if value is None:
98
+ self.__attributes.pop('id', None)
99
+ else:
100
+ self.__attributes['id'] = value
101
+
102
+ @property
103
+ def text(self) -> Optional[str]:
104
+ return self.__text
105
+
106
+ @text.setter
107
+ def text(self, value: Optional[str]) -> None:
108
+ self.__text = None if value is None else str(value)
109
+
110
+ @classmethod
111
+ def new(cls, _type: XMLType, **attributes: str) -> Self:
112
+ """
113
+ Create a new Element object from scratch.
114
+ :param _type: The type of element to create.
115
+ :param attributes: Named arguments that will be stores as xml attributes.
116
+ :return: The newly created Element object.
117
+ """
118
+ attributes = {str(k): str(v) for k, v in attributes.items() if v is not None}
119
+ return cls(_type, attributes)
120
+
121
+ @classmethod
122
+ def from_etree(cls, tree: etree.Element) -> Self:
123
+ """
124
+ Create a new Element object from a lxml etree object.
125
+ :param tree: lxml etree object.
126
+ :return: Element object that represents the passed etree object.
127
+ """
128
+ element = cls(XMLType(tree.tag.split('}')[1]), dict(tree.items()))
129
+ element.text = tree.text
130
+ for child in tree:
131
+ element.add_element(Element.from_etree(child))
132
+ return element
133
+
134
+ def to_etree(self) -> etree.Element:
135
+ """
136
+ Convert the Element object to a lxml etree object.
137
+ :return: A lxml etree object that represents this Element object.
138
+ """
139
+ element = etree.Element(self.__type.value, **self.__attributes)
140
+ if self.__text is not None:
141
+ element.text = self.__text
142
+ for child in self.__elements:
143
+ element.append(child.to_etree())
144
+ return element
145
+
146
+ def is_region(self) -> bool:
147
+ """ Returns True, if the Element object is a region. """
148
+ return self.__type.value.endswith('Region')
149
+
150
+ def contains_text(self) -> bool:
151
+ """ Returns True, if the Element object contains text. """
152
+ return self.__text is not None
153
+
154
+ def set_attribute(self, key: str, value: Optional[str]) -> None:
155
+ """
156
+ Set an attribute.
157
+ :param key: Key of attribute. Creates a new one if the key does not exist.
158
+ :param value: Value for the attribute. Deletes the attribute of None is passed.
159
+ """
160
+ if value is None:
161
+ self.__attributes.pop(str(key), None)
162
+ else:
163
+ self.__attributes[str(key)] = str(value)
164
+
165
+ def delete_attribute(self, key: str) -> Optional[str]:
166
+ """
167
+ Delete an attribute.
168
+ :param key: The key to delete.
169
+ :return: Returns the deleted attribute value. If the key does not exist, None is returned.
170
+ """
171
+ return self.__attributes.pop(str(key), None)
172
+
173
+ def get_coords(self) -> Optional[Self]:
174
+ """ Return the first direct child Element object of type Coords. """
175
+ for element in self.__elements:
176
+ if element.type == XMLType.Coords:
177
+ return element
178
+ return None
179
+
180
+ def get_baseline(self) -> Optional[Self]:
181
+ """ Return the first direct child Element object of type Baseline. """
182
+ for element in self.__elements:
183
+ if element.type == XMLType.Baseline:
184
+ return element
185
+ return None
186
+
187
+ def add_element(self, element: Self, index: Optional[int] = None) -> None:
188
+ """
189
+ Add an existing Element object to the list elements.
190
+ :param element: The element to add.
191
+ :param index: If set, insert the element at this index. Else append to the list.
192
+ """
193
+ if index is None:
194
+ self.__elements.append(element)
195
+ else:
196
+ self.__elements.insert(min(index, len(self.__elements) - 1), element)
197
+
198
+ def create_element(self, _type: XMLType, index: int = None, **attributes: str) -> Self:
199
+ """
200
+ Create a new Element object and add it to the list of elements.
201
+ :param _type: XMLType of new element.
202
+ :param index: If set, insert the new element at this index. Else append to the list.
203
+ :param attributes: Named arguments that will be stores as xml attributes.
204
+ :return: The newly created Element object.
205
+ """
206
+ element = Element.new(_type, **attributes)
207
+ self.add_element(element, index)
208
+ return element
209
+
210
+ def remove_element(self, element: Union[Self, int]) -> Optional[Self]:
211
+ """
212
+ Remove an element from the list of elements.
213
+ :param element: The Element object or the index of the element to remove.
214
+ :return: The removed element, if it existed.
215
+ """
216
+ if isinstance(element, int) and element < len(self.__elements) - 1:
217
+ return self.__elements.pop(element)
218
+ elif isinstance(element, Element) and element in self.__elements:
219
+ self.__elements.remove(element)
220
+ return element
221
+ return None
222
+
223
+ def clear_elements(self) -> None:
224
+ """ Remove all Element objects from the list of elements. """
225
+ self.__elements.clear()
pypxml/page.py ADDED
@@ -0,0 +1,244 @@
1
+ # This file is licensed under the MIT License.
2
+ # Copyright (c) 2024 Janik Haitz
3
+ # See the LICENSE file in the root directory for more details.
4
+
5
+ from typing import Optional, Union, Self
6
+
7
+ from lxml import etree
8
+
9
+ from .element import Element
10
+ from .resources.xml_types import XMLType
11
+
12
+
13
+ class Page:
14
+ """ Represents a page of a PageXML file. """
15
+ def __init__(self, attributes: Optional[dict[str, str]] = None):
16
+ self.__attributes: dict[str, str] = attributes if attributes else {}
17
+ self.__reading_order: list[str] = [] # region id's
18
+ self.__elements: list[Element] = []
19
+
20
+ def __len__(self) -> int:
21
+ """ Returns the number of elements. """
22
+ return len(self.__elements)
23
+
24
+ def __iter__(self) -> Self:
25
+ """ Iterator: starting point for iterating over all elements that are regions. """
26
+ self.__n = 0
27
+ self.__regions = [element for element in self.__elements if element.is_region()]
28
+ return self
29
+
30
+ def __next__(self) -> Element:
31
+ """ Iterator: yield next element that is a region. """
32
+ if self.__n < len(self.__regions):
33
+ self.__n += 1
34
+ return self.__regions[self.__n - 1]
35
+ else:
36
+ raise StopIteration
37
+
38
+ def __getitem__(self, key: Union[int, str]) -> Optional[Union[Element, str]]:
39
+ """
40
+ Get an Element object by its index or an attribute value by its key
41
+ :param key: Index (integer) of an Element object or a key (string) of an attribute.
42
+ :return: The Element of passed index (returns last object if the key is out of range) or the value of the
43
+ selected attribute. Returns None, if no match was found.
44
+ """
45
+ if isinstance(key, int) and len(self.__elements) > 0:
46
+ return self.__elements[min(key, len(self.__elements) - 1)]
47
+ elif isinstance(key, str) and key in self.__attributes:
48
+ return self.__attributes[key]
49
+ return None
50
+
51
+ def __setitem__(self, key: Union[int, str], value: Union[Element, str]) -> None:
52
+ """
53
+ Set an Element object or an attribute value.
54
+ :param key: Index (integer) for an Element object or a key (string) for an attribute.
55
+ :param value: Element object (if key is of type integer) or a string (if key is of type string).
56
+ """
57
+ if isinstance(key, int) and isinstance(value, Element) and len(self.__elements) > 0:
58
+ self.__elements[min(key, len(self.__elements) - 1)] = value
59
+ elif isinstance(key, str):
60
+ self.__attributes[key] = value
61
+ else:
62
+ raise ValueError('Invalid key or value')
63
+
64
+ def __contains__(self, key: Union[Element, str]) -> bool:
65
+ """
66
+ Checks if an Element object or an attribute exists.
67
+ :param key: Element object or attribute key.
68
+ :return: True, if either the passed Element object or the attribute exists. Else return False.
69
+ """
70
+ if isinstance(key, Element):
71
+ return key in self.__elements
72
+ elif isinstance(key, str):
73
+ return key in self.__attributes
74
+ return False
75
+
76
+ @property
77
+ def attributes(self) -> dict[str, str]:
78
+ return self.__attributes
79
+
80
+ @property
81
+ def reading_order(self) -> list[str]:
82
+ return self.__reading_order
83
+
84
+ @property
85
+ def elements(self) -> list[Element]:
86
+ return self.__elements
87
+
88
+ @property
89
+ def regions(self) -> list[Element]:
90
+ return list([element for element in self.__regions if element.is_region()])
91
+
92
+ @property
93
+ def image_filename(self) -> Optional[str]:
94
+ return self.__attributes.get('imageFilename', None)
95
+
96
+ @image_filename.setter
97
+ def image_filename(self, filename: Optional[str]) -> None:
98
+ if filename is None:
99
+ self.__attributes.pop('imageFilename', None)
100
+ else:
101
+ self.__attributes['imageFilename'] = str(filename)
102
+
103
+ @property
104
+ def width(self) -> Optional[int]:
105
+ if (w := self.__attributes.get('imageWidth', None)) is not None:
106
+ return int(w)
107
+ return None
108
+
109
+ @property
110
+ def height(self) -> Optional[int]:
111
+ if (h := self.__attributes.get('imageHeight', None)) is not None:
112
+ return int(h)
113
+ return None
114
+
115
+ @classmethod
116
+ def new(cls, **attributes: str) -> Self:
117
+ """
118
+ Create a new Page object from scratch.
119
+ :param attributes: Named arguments that will be stored as attributes.
120
+ :return: The newly created Page object.
121
+ """
122
+ attributes = {str(k): str(v) for k, v in attributes.items() if v is not None}
123
+ return cls(attributes)
124
+
125
+ @classmethod
126
+ def from_etree(cls, tree: etree.Element) -> Self:
127
+ """
128
+ Create a new Page object from a lxml etree object.
129
+ :param tree: lxml etree object.
130
+ :return: Page object that represents the passed etree object.
131
+ """
132
+ page = cls(dict(tree.items()))
133
+ if (ro := tree.find('./{*}ReadingOrder')) is not None:
134
+ if (ro_elements := tree.findall('../{*}RegionRefIndexed')) is not None:
135
+ page._ro = list([i.get('regionRef') for i in sorted(list(ro_elements), key=lambda i: i.get('index'))])
136
+ tree.remove(ro)
137
+ for element in tree:
138
+ page.add_element(Element.from_etree(element), ro=False)
139
+ return page
140
+
141
+ def to_etree(self) -> etree.Element:
142
+ """
143
+ Convert the Page object to a lxml etree object.
144
+ :return: A lxml etree object that represents this Page object.
145
+ """
146
+ page = etree.Element('Page', **self.__attributes)
147
+ if len(self.__reading_order) > 0:
148
+ reading_order = etree.SubElement(page, 'ReadingOrder')
149
+ order_group = etree.SubElement(reading_order, 'OrderedGroup', id='g0') # does id matter?
150
+ for i, rid in enumerate(self.__reading_order):
151
+ etree.SubElement(order_group, 'RegionRefIndexed', index=str(i), regionRef=rid)
152
+ for element in self.__elements:
153
+ page.append(element.to_etree())
154
+ return page
155
+
156
+ def set_attribute(self, key: str, value: Optional[str]) -> None:
157
+ """
158
+ Set or create an attribute.
159
+ :param key: Key of attribute. Creates a new one if the key does not exist.
160
+ :param value: Value for the attribute. Deletes the attribute of None is passed.
161
+ """
162
+ if value is None:
163
+ self.__attributes.pop(str(key), None)
164
+ else:
165
+ self.__attributes[str(key)] = str(value)
166
+
167
+ def delete_attribute(self, key: str) -> Optional[str]:
168
+ """
169
+ Delete an attribute.
170
+ :param key: The key to delete.
171
+ :return: Returns the deleted attribute value. If the key does not exist, None is returned.
172
+ """
173
+ return self.__attributes.pop(str(key), None)
174
+
175
+ def get_regions(self, region: Optional[XMLType] = None) -> list[Element]:
176
+ """
177
+ Return a list of all direct child elements that are regions.
178
+ :param region: Only select a specific region type.
179
+ :return: List of matching Element objects.
180
+ """
181
+ if region is None:
182
+ return list([e for e in self.__elements if e.is_region()])
183
+ return list([e for e in self.__elements if e.type == region])
184
+
185
+ def add_element(self, element: Element, index: Optional[int] = None, ro: bool = True) -> None:
186
+ """
187
+ Add an existing Element object to the list elements.
188
+ :param element: The element to add.
189
+ :param index: If set, insert the element at this index. Else append to the list.
190
+ :param ro: If set to true, add the element to the reading order at the specified index.
191
+ Only if the element is a region.
192
+ """
193
+ if index is None:
194
+ self.__elements.append(element)
195
+ if ro and element.is_region() and element.id:
196
+ self.__reading_order.append(element.id)
197
+ else:
198
+ self.__elements.insert(min(index, len(self.__elements) - 1), element)
199
+ if ro and element.is_region() and element.id:
200
+ self.__reading_order.insert(min(index, len(self.__elements) - 1), element.id)
201
+
202
+ def create_element(self, _type: XMLType, index: int = None, ro: bool = True, **attributes: str) -> Element:
203
+ """
204
+ Create a new Element object and add it to the list of elements.
205
+ :param _type: XMLType of new element.
206
+ :param index: If set, insert the new element at this index. Else append to the list.
207
+ :param ro: If set to true, add the element to the reading order at the specified index.
208
+ Only if the element is a region.
209
+ :param attributes: Named arguments that will be stores as xml attributes.
210
+ :return: The newly created Element object.
211
+ """
212
+ element = Element.new(_type, **attributes)
213
+ self.add_element(element, index, ro)
214
+ return element
215
+
216
+ def remove_element(self, element: Union[Element, int]) -> Optional[Element]:
217
+ """
218
+ Remove an element from the list of elements.
219
+ :param element: The Element object or the index of the element to remove.
220
+ :return: The removed element, if it existed.
221
+ """
222
+ if isinstance(element, int) and element < len(self.__elements) - 1:
223
+ return self.__elements.pop(element)
224
+ elif isinstance(element, Element) and element in self.__elements:
225
+ self.__elements.remove(element)
226
+ return element
227
+ return None
228
+
229
+ def clear_elements(self) -> None:
230
+ """ Remove all Element objects from the list of elements. """
231
+ self.__elements.clear()
232
+ self.clear_reading_order()
233
+
234
+ def clear_regions(self) -> None:
235
+ """ Remove all Element objects from the list of elements, that are regions. """
236
+ for element in self.__elements:
237
+ if element.is_region():
238
+ self.__elements.remove(element)
239
+ if element.id and element.id in self.__reading_order:
240
+ self.__reading_order.remove(element.id)
241
+
242
+ def clear_reading_order(self) -> None:
243
+ """ Reset the reading order. """
244
+ self.__reading_order.clear()
pypxml/pxml.py ADDED
@@ -0,0 +1,223 @@
1
+ # This file is licensed under the MIT License.
2
+ # Copyright (c) 2024 Janik Haitz
3
+ # See the LICENSE file in the root directory for more details.
4
+
5
+ from typing import Optional, Union, Self
6
+ from datetime import datetime
7
+ from pathlib import Path
8
+ import json
9
+
10
+ from lxml import etree
11
+
12
+ from .page import Page
13
+ from .resources.xml_schema import XMLSchema
14
+
15
+
16
+ class PageXML:
17
+ """ Represents a PageXML file. """
18
+ def __init__(self, creator: str, created: str, changed: str) -> None:
19
+ self.__creator: str = creator
20
+ self.__created: str = created
21
+ self.__changed: str = changed
22
+ self.__pages: list[Page] = []
23
+
24
+ def __len__(self) -> int:
25
+ """ Returns the number of pages. """
26
+ return len(self.__pages)
27
+
28
+ def __iter__(self) -> Self:
29
+ """ Iterator: starting point for iterating over all pages. """
30
+ self.__n = 0
31
+ return self
32
+
33
+ def __next__(self) -> Page:
34
+ """ Iterator: yield next page. """
35
+ if self.__n < len(self.__pages):
36
+ self.__n += 1
37
+ return self.__pages[self.__n - 1]
38
+ else:
39
+ raise StopIteration
40
+
41
+ def __getitem__(self, key: int) -> Optional[Page]:
42
+ """
43
+ Get the Page object of a given index.
44
+ :param key: The index value of the Page object.
45
+ :return: Page object if pages are available. Returns last page if index is out of range.
46
+ """
47
+ if len(self.__pages) > 0:
48
+ return self.__pages[min(key, len(self.__pages) - 1)]
49
+ return None
50
+
51
+ def __setitem__(self, key: int, value: Page) -> None:
52
+ """
53
+ Set a Page object for a given index.
54
+ :param key: The target index for the Page object.
55
+ :param value: The new Page object.
56
+ """
57
+ if len(self.__pages) > 0:
58
+ self.__pages[min(key, len(self.__pages) - 1)] = value
59
+
60
+ def __contains__(self, key: Page) -> bool:
61
+ """
62
+ Checks if a Page objects exists.
63
+ :param key: The Page object.
64
+ :return: True, if the Page object exists. Else return False.
65
+ """
66
+ if isinstance(key, Page):
67
+ return key in self.__pages
68
+ return False
69
+
70
+ @property
71
+ def creator(self) -> Optional[str]:
72
+ return self.__creator
73
+
74
+ @creator.setter
75
+ def creator(self, creator: str) -> None:
76
+ self.__creator = str(creator)
77
+
78
+ @property
79
+ def created(self) -> Optional[str]:
80
+ return self.__created
81
+
82
+ @created.setter
83
+ def created(self, created: Union[datetime, str]) -> None:
84
+ if isinstance(created, datetime):
85
+ self.__created = created.isoformat()
86
+ else:
87
+ self.__created = str(created)
88
+
89
+ @property
90
+ def changed(self) -> Optional[str]:
91
+ return self.__changed
92
+
93
+ @changed.setter
94
+ def changed(self, changed: Union[datetime, str]) -> None:
95
+ if isinstance(changed, datetime):
96
+ self.__changed = changed.isoformat()
97
+ else:
98
+ self.__changed = str(changed)
99
+
100
+ @property
101
+ def pages(self) -> list[Page]:
102
+ return self.__pages
103
+
104
+ @classmethod
105
+ def new(cls, creator: str = 'PyPXML') -> Self:
106
+ """
107
+ Create a new PageXML object from scratch.
108
+ :param creator: Specify creator tag in PageXMLs metadata.
109
+ :return: Newly created PageXML object.
110
+ """
111
+ return cls(creator, datetime.now().isoformat(), datetime.now().isoformat())
112
+
113
+ @classmethod
114
+ def from_etree(cls, tree: etree.Element) -> Self:
115
+ """
116
+ Create a new PageXML object from a lxml etree object.
117
+ :param tree: lxml etree object.
118
+ :return: PageXML object that represents the passed etree object.
119
+ """
120
+ if (md_tree := tree.find('./{*}Metadata')) is not None:
121
+ if (creator := md_tree.find('./{*}Creator')) is not None:
122
+ creator = creator.text
123
+ if (created := md_tree.find('./{*}Created')) is not None:
124
+ created = created.text
125
+ if (last_change := md_tree.find('./{*}LastChange')) is not None:
126
+ last_change = last_change.text
127
+ pxml = cls(creator, created, last_change)
128
+ else:
129
+ pxml = cls.new()
130
+ if (pages := tree.findall('./{*}Page')) is not None:
131
+ for page_tree in pages:
132
+ pxml.add_page(Page.from_etree(page_tree))
133
+ return pxml
134
+
135
+ def to_etree(self, version: str = '2019', schema_file: Optional[Path] = None) -> etree.Element:
136
+ """
137
+ Convert a PageXML object to a lxml etree element.
138
+ :param version: PageXML Version to use. Currently supported: `2019`.
139
+ :param schema_file: Custom schema in json format.
140
+ :return: A lxml etree object that represents this PageXML object.
141
+ """
142
+ self.changed = datetime.now().isoformat()
143
+ if schema_file is not None:
144
+ with open(schema_file) as stream:
145
+ schema = XMLSchema.custom('pagexml', version, json.load(stream))
146
+ else:
147
+ schema = XMLSchema.pagexml(version)
148
+ xsi_qname = etree.QName("http://www.w3.org/2001/XMLSchema-instance", 'schemaLocation')
149
+ nsmap = { None: schema['xmlns'], 'xsi': schema['xmlns_xsi'] }
150
+ root = etree.Element('PcGts', { xsi_qname: schema['xsi_schema_location'] }, nsmap=nsmap)
151
+ metadata = etree.SubElement(root, 'Metadata')
152
+ etree.SubElement(metadata, 'Creator').text = self.__creator
153
+ etree.SubElement(metadata, 'Created').text = self.__created
154
+ etree.SubElement(metadata, 'LastChange').text = self.__changed
155
+ for page in self.__pages:
156
+ root.append(page.to_etree())
157
+ return root
158
+
159
+ @classmethod
160
+ def from_xml(cls, fp: Union[Path, str], encoding: Optional[str] = None) -> Self:
161
+ """
162
+ Create a new PageXML object from a PageXML file.
163
+ :param fp: Path of PageXML file.
164
+ :param encoding: Set custom encoding.
165
+ :return: PageXML object.
166
+ """
167
+ parser = etree.XMLParser(remove_blank_text=True, encoding=encoding)
168
+ tree = etree.parse(fp, parser).getroot()
169
+ return cls.from_etree(tree)
170
+
171
+ def to_xml(self, fp: Union[Path, str], version: str = '2019-07-15', schema_file: Optional[Path] = None,
172
+ encoding: str = 'utf-8') -> None:
173
+ """
174
+ Create a PageXML file from a PageXML file.
175
+ :param fp: Path to new PageXML file.
176
+ :param version: The PageXML version to use. Currently supported: `2019`.
177
+ :param schema_file: Custom schema in json format.
178
+ :param encoding: Set custom encoding.
179
+ """
180
+ with open(fp, 'wb') as f:
181
+ tree = etree.tostring(self.to_etree(version, schema_file),
182
+ pretty_print=True,
183
+ encoding=encoding,
184
+ xml_declaration=True)
185
+ f.write(tree)
186
+
187
+ def add_page(self, page: Page, index: Optional[int] = None) -> None:
188
+ """
189
+ Add a Page object to the list of pages.
190
+ :param page: The Page object to add.
191
+ :param index: If set, insert the Page object at this index.
192
+ """
193
+ if index is None or index >= len(self.__pages) - 1:
194
+ self.__pages.append(page)
195
+ else:
196
+ self.__pages.insert(index, page)
197
+
198
+ def create_page(self, index: Optional[int] = None, **attributes: str) -> Page:
199
+ """
200
+ Create a new Page object and add it to the list of pages.
201
+ :param index: If set, insert the Page object at this index.
202
+ :param attributes: Named arguments that will be stores as xml attributes.
203
+ :return: The newly created Page object.
204
+ """
205
+ page = Page.new(**attributes)
206
+ self.add_page(page, index)
207
+ return page
208
+
209
+ def remove_page(self, page: Union[Page, int]) -> Optional[Page]:
210
+ """
211
+ Remove a Page object from the list of pages.
212
+ :param page: The index of the Page object to remove or the Page object itself.
213
+ :return: The Page object that was removed if it existed.
214
+ """
215
+ if isinstance(page, Page) and page in self.__pages:
216
+ self.__pages.remove(page)
217
+ return page
218
+ elif isinstance(page, int) and page < len(self.__pages):
219
+ return self.__pages.pop(page)
220
+
221
+ def clear_pages(self) -> None:
222
+ """ Remove all Page objects from the list of pages. """
223
+ self.__pages.clear()
@@ -0,0 +1,4 @@
1
+ # This file is licensed under the MIT License.
2
+ # Copyright (c) 2024 Janik Haitz
3
+ # See the LICENSE file in the root directory for more details.
4
+
@@ -0,0 +1,43 @@
1
+ # This file is licensed under the MIT License.
2
+ # Copyright (c) 2024 Janik Haitz
3
+ # See the LICENSE file in the root directory for more details.
4
+
5
+ from typing import Literal
6
+
7
+
8
# Header values of the supported PageXML schema versions, keyed by schema name and release year.
DEFAULT_SCHEMA = {
    'pagexml': {
        '2019': {
            'xmlns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15',
            'xmlns_xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'xsi_schema_location': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd'
        },
        '2017': {
            'xmlns': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15',
            'xmlns_xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'xsi_schema_location': 'http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2017-07-15/pagecontent.xsd'
        }
    }
}


class XMLSchema:
    """ Lookup helper for the xml header values of a schema version. """
    @staticmethod
    def pagexml(version: Literal['2019', '2017'] = '2019') -> dict[str, str]:
        """
        Returns the xml schema values of a specified PageXML version.
        :param version: The PageXML version to use. Besides the release year (`2019`, `2017`), the full
            schema date (e.g. `2019-07-15`) is accepted as well.
        :return: A dictionary containing all header attributes: `xmlns`, `xmlns_xsi`, `xsi_schema_location`
        :raises KeyError: If the version is not supported.
        """
        for year, schema in DEFAULT_SCHEMA['pagexml'].items():
            # The namespace ends with the schema date, which allows matching the long version format.
            if version == year or schema['xmlns'].endswith(f'/{version}'):
                return schema
        raise KeyError(f'Unsupported PageXML version: {version!r}')

    @staticmethod
    def custom(schema: str, version: str, custom: dict) -> dict[str, str]:
        """
        Returns the custom xml schema values of a specified version.
        :param schema: The schema to use.
        :param version: The version to use.
        :param custom: A dictionary containing custom xml schema values. See DEFAULT_SCHEMA as example.
        :return: A dictionary containing all header attributes provided by the custom schema.
        :raises KeyError: If the schema or the version is not part of the custom dictionary.
        """
        return custom[schema][version]
@@ -0,0 +1,59 @@
1
+ # This file is licensed under the MIT License.
2
+ # Copyright (c) 2024 Janik Haitz
3
+ # See the LICENSE file in the root directory for more details.
4
+
5
+ from enum import Enum
6
+
7
+
8
class XMLType(Enum):
    """
    Names of the PageXML tags known to this library. The value of each member is the tag name itself.

    Reference:
    https://ocr-d.de/de/gt-guidelines/pagexml/pagecontent_xsd_Complex_Type_pc_PcGtsType.html#PcGtsType_Page
    """

    # --- Reading order ---
    ReadingOrder = 'ReadingOrder'
    OrderedGroup = 'OrderedGroup'
    RegionRefIndexed = 'RegionRefIndexed'

    # --- Regions (every value ends with "Region") ---
    AdvertRegion = 'AdvertRegion'
    ChartRegion = 'ChartRegion'
    ChemRegion = 'ChemRegion'
    CustomRegion = 'CustomRegion'
    GraphicRegion = 'GraphicRegion'
    ImageRegion = 'ImageRegion'
    LineDrawingRegion = 'LineDrawingRegion'
    MapRegion = 'MapRegion'
    MathsRegion = 'MathsRegion'
    MusicRegion = 'MusicRegion'
    NoiseRegion = 'NoiseRegion'
    SeparatorRegion = 'SeparatorRegion'
    TableRegion = 'TableRegion'
    TextRegion = 'TextRegion'
    UnknownRegion = 'UnknownRegion'

    # --- Other elements ---
    AlternativeImage = 'AlternativeImage'
    Baseline = 'Baseline'
    Border = 'Border'
    Coords = 'Coords'
    Glyph = 'Glyph'
    GraphemeGroup = 'GraphemeGroup'
    Graphemes = 'Graphemes'
    Grid = 'Grid'
    Label = 'Label'
    Labels = 'Labels'
    Layers = 'Layers'
    Metadata = 'Metadata'
    NonPrintingChar = 'NonPrintingChar'
    PlainText = 'PlainText'
    PrintSpace = 'PrintSpace'
    Relations = 'Relations'
    Roles = 'Roles'
    TextEquiv = 'TextEquiv'
    TextLine = 'TextLine'
    TextStyle = 'TextStyle'
    Unicode = 'Unicode'
    UserAttribute = 'UserAttribute'
    UserDefined = 'UserDefined'
    Word = 'Word'
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Janik Haitz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.1
2
+ Name: pypxml
3
+ Version: 1.0
4
+ Summary: A python library for parsing, converting and modifying PageXML files.
5
+ Author-email: Janik Haitz <jahtz.dev@proton.me>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2024 Janik Haitz
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: repository, https://github.com/jahtz/pypxml
29
+ Keywords: PageXML,XML,OCR,optical character recognition
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Operating System :: OS Independent
33
+ Requires-Python: >=3.11
34
+ Description-Content-Type: text/markdown
35
+ License-File: LICENSE
36
+ Requires-Dist: lxml ~=5.3.0
37
+ Requires-Dist: click ~=8.1.7
38
+
39
+
40
+
41
+ # PyPXML
42
+ A Python library for parsing, converting and modifying PageXML files.
43
+
44
+ ## Setup
45
+ ```shell
46
+ pip install pypxml
47
+ ```
48
+
49
+ ### Install from source
50
+ 1. Clone repository: `git clone https://github.com/jahtz/pypxml`
51
+ 2. Install package: `cd pypxml && pip install .`
52
+ 3. Test with `pypxml --version`
53
+
54
+ ## CLI
55
+ ```
56
+ pypxml [OPTIONS] COMMAND [ARGS]...
57
+ ```
58
+ Coming in version 2.x
59
+
60
+ ## API
61
+ PyPXML provides a feature-rich Python API for working with PageXML files.
62
+
63
+ ### Basics
64
+ ```python
65
+ from pypxml import PageXML, Page, Element, XMLType
66
+
67
+ pxml = PageXML.from_xml('path_to_pagexml.xml')
68
+ page1 = pxml.create_page(imageFilename='0001.png',
69
+ imageWidth=1000,
70
+ imageHeight=2500)
71
+ page1.create_element(XMLType.TextRegion, id='ir01')
72
+ pxml.to_xml('path_to_output.xml')
73
+ ```
74
+
75
+ ### PageXML class
76
+ ```python
77
+ from pypxml import PageXML
78
+
79
+ # open file
80
+ pxml = PageXML.from_xml('path_to.xml')
81
+ # or create new PageXML
82
+ pxml = PageXML.new()
83
+
84
+ # edit metadata
85
+ pxml.creator = 'yourname'
86
+ ...
87
+
88
+ # create a page
89
+ page = pxml.create_page(imageFilename='0001.png',
90
+ imageWidth=1000,
91
+ imageHeight=2500)
92
+ # or add existing page
93
+ pxml.add_page(page) # see below
94
+
95
+ # iterate over pages
96
+ for page in pxml:
97
+ ...
98
+
99
+ # delete or modify pages
100
+ pxml[0] = ...
101
+ pxml.remove_page(pxml[1])
102
+
103
+ # save object to file
104
+ pxml.to_xml('output.xml')
105
+ ...
106
+ ```
107
+
108
+ ### Page class
109
+ ```python
110
+ from pypxml import Page, XMLType
111
+
112
+ # create a page
113
+ page = Page.new(imageFilename='0001.png',
114
+ imageWidth=1000,
115
+ imageHeight=2500)
116
+
117
+ # modify attributes
118
+ page['imageFilename'] = '0002.png'
119
+ # or get element by index
120
+ element = page[3]
121
+
122
+ # add elements (automatically added to reading order if it is a region)
123
+ text_region = page.create_element(XMLType.TextRegion, id='tr1')
124
+ # or add existing element
125
+ page.add_element(element)
126
+
127
+ # iterate over regions
128
+ for region in page:
129
+ ...
130
+ ...
131
+ ```
132
+
133
+ ### Element class
134
+ ```python
135
+ from pypxml import Element, XMLType
136
+
137
+ # create an element
138
+ coords = Element.new(XMLType.Coords,
139
+ points='1,2 3,4 5,6 7,8')
140
+ # modify attributes
141
+ coords['points'] = 'some other coords'
142
+ # or get element by index
143
+ baseline = text_region[2]
144
+
145
+ # check if element is a region
146
+ if text_region.is_region():
147
+ ...
148
+
149
+ # get coords and baseline, if they exist
150
+ coords = text_line.get_coords()
151
+ baseline = text_line.get_baseline()
152
+ ...
153
+ ```
154
+
155
+ ## ZPD
156
+ Developed at the [Centre for Philology and Digitality](https://www.uni-wuerzburg.de/en/zpd/) (ZPD), [University of Würzburg](https://www.uni-wuerzburg.de/en/).
@@ -0,0 +1,15 @@
1
+ cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ cli/pypxml_cli.py,sha256=Ui5FqdPaYk43686AytkKB6L66mDmbxGdE2s9f8PC_eU,397
3
+ pypxml/__init__.py,sha256=gGyTUwm-H6Z_iESkAKUhjEVMUldRkZs7wqOKjpfHmKc,307
4
+ pypxml/element.py,sha256=ivvkS-ktcQzU7pxbJb1uZut7ShF0a3s7WXzFSebFTq8,8530
5
+ pypxml/page.py,sha256=6mqyGS9VsM2NMX87h6AzhagudjkSWuqnpiWH-x3Zuao,10131
6
+ pypxml/pxml.py,sha256=LHPiHI4gaKxzacqPTVEch2Zx1LnjZSA7yXMqJ_LLpdk,8385
7
+ pypxml/resources/__init__.py,sha256=5LWKUrEyh94LutFHtwZxjieOnSeS-o0afz6qZMmRydU,144
8
+ pypxml/resources/xml_schema.py,sha256=FDOX_tK-ctN3oxxXiL9-ocuPSV8vChmvom27cJ1fleA,1881
9
+ pypxml/resources/xml_types.py,sha256=UBHWVznBwSTzYXdp0hpbbeChXsYozkNeL1qvHxQgwwY,1620
10
+ pypxml-1.0.dist-info/LICENSE,sha256=YtURTiiG41gsIDU4gyMNba_R6Db-J_GvXZvtXGVUyZo,1068
11
+ pypxml-1.0.dist-info/METADATA,sha256=2har3hD2fOqyaq3Ac4AaIZwEYg2ODyoDwFaJSLsXP2o,4400
12
+ pypxml-1.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
13
+ pypxml-1.0.dist-info/entry_points.txt,sha256=zkGOJGSpAat6xMZrzonPstt_BiHgTBkH0aEYb9SpX1w,46
14
+ pypxml-1.0.dist-info/top_level.txt,sha256=wz-ismkaMeQqzKDV6hKEH3gMDZT9mVbLSqnha04iUZE,11
15
+ pypxml-1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (75.1.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ pypxml = cli.pypxml_cli:cli
@@ -0,0 +1,2 @@
1
+ cli
2
+ pypxml