pypxml 1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pypxml-1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Janik Haitz
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
pypxml-1.0/MANIFEST.in ADDED
@@ -0,0 +1,7 @@
1
+ recursive-include src/pypxml/resources *
2
+ recursive-include src/pypxml *.py
3
+ recursive-include src/cli *.py
4
+
5
+ # Include the README and LICENSE files
6
+ include README.md
7
+ include LICENSE
pypxml-1.0/PKG-INFO ADDED
@@ -0,0 +1,156 @@
1
+ Metadata-Version: 2.1
2
+ Name: pypxml
3
+ Version: 1.0
4
+ Summary: A python library for parsing, converting and modifying PageXML files.
5
+ Author-email: Janik Haitz <jahtz.dev@proton.me>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2024 Janik Haitz
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: repository, https://github.com/jahtz/pypxml
29
+ Keywords: PageXML,XML,OCR,optical character recognition
30
+ Classifier: Programming Language :: Python :: 3
31
+ Classifier: License :: OSI Approved :: MIT License
32
+ Classifier: Operating System :: OS Independent
33
+ Requires-Python: >=3.11
34
+ Description-Content-Type: text/markdown
35
+ License-File: LICENSE
36
+ Requires-Dist: lxml~=5.3.0
37
+ Requires-Dist: click~=8.1.7
38
+
39
+
40
+
41
+ # PyPXML
42
+ A Python library for parsing, converting and modifying PageXML files.
43
+
44
+ ## Setup
45
+ ```shell
46
+ pip install pypxml
47
+ ```
48
+
49
+ ### Install from source
50
+ 1. Clone repository: `git clone https://github.com/jahtz/pypxml`
51
+ 2. Install package: `cd pypxml && pip install .`
52
+ 3. Test with `pypxml --version`
53
+
54
+ ## CLI
55
+ ```
56
+ pypxml [OPTIONS] COMMAND [ARGS]...
57
+ ```
58
+ Coming in version 2.x
59
+
60
+ ## API
61
+ PyPXML provides a feature-rich Python API for working with PageXML files.
62
+
63
+ ### Basics
64
+ ```python
65
+ from pypxml import PageXML, Page, Element, XMLType
66
+
67
+ pxml = PageXML.from_xml('path_to_pagexml.xml')
68
+ page1 = pxml.create_page(imageFilename='0001.png',
69
+ imageWidth=1000,
70
+ imageHeight=2500)
71
+ page1.create_element(XMLType.TextRegion, id='ir01')
72
+ pxml.to_xml('path_to_output.xml')
73
+ ```
74
+
75
+ ### PageXML class
76
+ ```python
77
+ from pypxml import PageXML
78
+
79
+ # open file
80
+ pxml = PageXML.from_xml('path_to.xml')
81
+ # or create new PageXML
82
+ pxml = PageXML.new()
83
+
84
+ # edit metadata
85
+ pxml.creator = 'yourname'
86
+ ...
87
+
88
+ # create a page
89
+ page = pxml.create_page(imageFilename='0001.png',
90
+ imageWidth=1000,
91
+ imageHeight=2500)
92
+ # or add existing page
93
+ pxml.add_page(page) # see below
94
+
95
+ # iterate over pages
96
+ for page in pxml:
97
+ ...
98
+
99
+ # delete or modify pages
100
+ pxml[0] = ...
101
+ pxml.remove_page(pxml[1])
102
+
103
+ # save object to file
104
+ pxml.to_xml('output.xml')
105
+ ...
106
+ ```
107
+
108
+ ### Page class
109
+ ```python
110
+ from pypxml import Page, XMLType
111
+
112
+ # create a page
113
+ page = Page.new(imageFilename='0001.png',
114
+ imageWidth=1000,
115
+ imageHeight=2500)
116
+
117
+ # modify attributes
118
+ page['imageFilename'] = '0002.png'
119
+ # or get element by index
120
+ element = page[3]
121
+
122
+ # add elements (automatically added to reading order if it is a region)
123
+ text_region = page.create_element(XMLType.TextRegion, id='tr1')
124
+ # or add existing element
125
+ page.add_element(element)
126
+
127
+ # iterate over regions
128
+ for region in page:
129
+ ...
130
+ ...
131
+ ```
132
+
133
+ ### Element class
134
+ ```python
135
+ from pypxml import Element, XMLType
136
+
137
+ # create an element
138
+ coords = Element.new(XMLType.Coords,
139
+ points='1,2 3,4 5,6 7,8')
140
+ # modify attributes
141
+ coords['points'] = 'some other coords'
142
+ # or get element by index
143
+ baseline = text_region[2]
144
+
145
+ # check if element is a region
146
+ if text_region.is_region():
147
+ ...
148
+
149
+ # get coords and baseline, if they exist
150
+ coords = text_line.get_coords()
151
+ baseline = text_line.get_baseline()
152
+ ...
153
+ ```
154
+
155
+ ## ZPD
156
+ Developed at the [Centre for Philology and Digitality](https://www.uni-wuerzburg.de/en/zpd/) (ZPD), [University of Würzburg](https://www.uni-wuerzburg.de/en/).
pypxml-1.0/README.md ADDED
@@ -0,0 +1,118 @@
1
+
2
+
3
+ # PyPXML
4
+ A Python library for parsing, converting and modifying PageXML files.
5
+
6
+ ## Setup
7
+ ```shell
8
+ pip install pypxml
9
+ ```
10
+
11
+ ### Install from source
12
+ 1. Clone repository: `git clone https://github.com/jahtz/pypxml`
13
+ 2. Install package: `cd pypxml && pip install .`
14
+ 3. Test with `pypxml --version`
15
+
16
+ ## CLI
17
+ ```
18
+ pypxml [OPTIONS] COMMAND [ARGS]...
19
+ ```
20
+ Coming in version 2.x
21
+
22
+ ## API
23
+ PyPXML provides a feature-rich Python API for working with PageXML files.
24
+
25
+ ### Basics
26
+ ```python
27
+ from pypxml import PageXML, Page, Element, XMLType
28
+
29
+ pxml = PageXML.from_xml('path_to_pagexml.xml')
30
+ page1 = pxml.create_page(imageFilename='0001.png',
31
+ imageWidth=1000,
32
+ imageHeight=2500)
33
+ page1.create_element(XMLType.TextRegion, id='ir01')
34
+ pxml.to_xml('path_to_output.xml')
35
+ ```
36
+
37
+ ### PageXML class
38
+ ```python
39
+ from pypxml import PageXML
40
+
41
+ # open file
42
+ pxml = PageXML.from_xml('path_to.xml')
43
+ # or create new PageXML
44
+ pxml = PageXML.new()
45
+
46
+ # edit metadata
47
+ pxml.creator = 'yourname'
48
+ ...
49
+
50
+ # create a page
51
+ page = pxml.create_page(imageFilename='0001.png',
52
+ imageWidth=1000,
53
+ imageHeight=2500)
54
+ # or add existing page
55
+ pxml.add_page(page) # see below
56
+
57
+ # iterate over pages
58
+ for page in pxml:
59
+ ...
60
+
61
+ # delete or modify pages
62
+ pxml[0] = ...
63
+ pxml.remove_page(pxml[1])
64
+
65
+ # save object to file
66
+ pxml.to_xml('output.xml')
67
+ ...
68
+ ```
69
+
70
+ ### Page class
71
+ ```python
72
+ from pypxml import Page, XMLType
73
+
74
+ # create a page
75
+ page = Page.new(imageFilename='0001.png',
76
+ imageWidth=1000,
77
+ imageHeight=2500)
78
+
79
+ # modify attributes
80
+ page['imageFilename'] = '0002.png'
81
+ # or get element by index
82
+ element = page[3]
83
+
84
+ # add elements (automatically added to reading order if it is a region)
85
+ text_region = page.create_element(XMLType.TextRegion, id='tr1')
86
+ # or add existing element
87
+ page.add_element(element)
88
+
89
+ # iterate over regions
90
+ for region in page:
91
+ ...
92
+ ...
93
+ ```
94
+
95
+ ### Element class
96
+ ```python
97
+ from pypxml import Element, XMLType
98
+
99
+ # create an element
100
+ coords = Element.new(XMLType.Coords,
101
+ points='1,2 3,4 5,6 7,8')
102
+ # modify attributes
103
+ coords['points'] = 'some other coords'
104
+ # or get element by index
105
+ baseline = text_region[2]
106
+
107
+ # check if element is a region
108
+ if text_region.is_region():
109
+ ...
110
+
111
+ # get coords and baseline, if they exist
112
+ coords = text_line.get_coords()
113
+ baseline = text_line.get_baseline()
114
+ ...
115
+ ```
116
+
117
+ ## ZPD
118
+ Developed at the [Centre for Philology and Digitality](https://www.uni-wuerzburg.de/en/zpd/) (ZPD), [University of Würzburg](https://www.uni-wuerzburg.de/en/).
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "pypxml"
7
+ description = "A python library for parsing, converting and modifying PageXML files. "
8
+ keywords = ["PageXML", "XML", "OCR", "optical character recognition"]
9
+ version = "1.0"
10
+ readme = "README.md"
11
+ license = { file = "LICENSE" }
12
+ authors = [
13
+ { name="Janik Haitz", email="jahtz.dev@proton.me" },
14
+ ]
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ ]
20
+ dependencies = [
21
+ "lxml ~= 5.3.0",
22
+ "click ~= 8.1.7",
23
+ ]
24
+ requires-python = ">=3.11"
25
+ scripts = { pypxml = "cli.pypxml_cli:cli" }
26
+
27
+ [project.urls]
28
+ repository = "https://github.com/jahtz/pypxml"
29
+
30
+ [tool.setuptools.packages.find]
31
+ where = ["src"]
32
+ exclude = ["tests", "assets"]
pypxml-1.0/setup.cfg ADDED
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,15 @@
1
+ import click
2
+
3
+
4
# Root command group of the PyPXML command line tool. Sub-commands attach to this group; the group itself
# only provides the `--help` and `--version` options.
# NOTE: click renders the function docstring as the help text, so the docstring is user-facing output.
@click.group()
@click.help_option('--help')
@click.version_option(
    '1.0',
    '--version',
    prog_name='PyPXML',
    message=(
        '%(prog)s v%(version)s - Developed at Centre for Philology and Digitality (ZPD), '
        'University of Würzburg'
    ),
)
def cli():
    """
    PyPXML command line interface entry point.
    """
15
+
@@ -0,0 +1,9 @@
1
+ # This file is licensed under the MIT License.
2
+ # Copyright (c) 2024 Janik Haitz
3
+ # See the LICENSE file in the root directory for more details.
4
+
5
+ from .pxml import PageXML
6
+ from .page import Page
7
+ from .element import Element
8
+ from .resources.xml_schema import XMLSchema
9
+ from .resources.xml_types import XMLType
@@ -0,0 +1,225 @@
1
+ # This file is licensed under the MIT License.
2
+ # Copyright (c) 2024 Janik Haitz
3
+ # See the LICENSE file in the root directory for more details.
4
+
5
+ from typing import Iterator, Optional, Union, Self
6
+
7
+ from lxml import etree
8
+
9
+ from .resources.xml_types import XMLType
10
+
11
+
12
+ class Element:
13
+ """ Represents an element inside a page. """
14
+ def __init__(self, _type: XMLType, attributes: Optional[dict[str, str]] = None):
15
+ self.__type: XMLType = _type
16
+ self.__attributes: dict[str, str] = attributes if attributes else {}
17
+ self.__elements: list[Element] = []
18
+ self.__text: Optional[str] = None
19
+
20
+ def __len__(self) -> int:
21
+ """ Returns the number of elements. """
22
+ return len(self.__elements)
23
+
24
+ def __iter__(self) -> Self:
25
+ """ Iterator: starting point for iterating over all elements. """
26
+ self.__n = 0
27
+ return self
28
+
29
+ def __next__(self) -> Self:
30
+ """ Iterator: yield next element. """
31
+ if self.__n < len(self.__elements):
32
+ self.__n += 1
33
+ return self.__elements[self.__n - 1]
34
+ else:
35
+ raise StopIteration
36
+
37
+ def __getitem__(self, key: Union[int, str]) -> Optional[Union[Self, str]]:
38
+ """
39
+ Get an Element object by its index or an attribute value by its key
40
+ :param key: Index (integer) of an Element object or a key (string) of an attribute.
41
+ :return: The Element of passed index (returns last object if the key is out of range) or the value of the
42
+ selected attribute. Returns None, if no match was found.
43
+ """
44
+ if isinstance(key, int) and len(self.__elements) > 0:
45
+ return self.__elements[min(key, len(self.__elements) - 1)]
46
+ elif isinstance(key, str) and key in self.__attributes:
47
+ return self.__attributes[key]
48
+ return None
49
+
50
+ def __setitem__(self, key: Union[int, str], value: Union[Self, str]) -> None:
51
+ """
52
+ Set an Element object or an attribute value.
53
+ :param key: Index (integer) for an Element object or a key (string) for an attribute.
54
+ :param value: Element object (if key is of type integer) or a string (if key is of type string).
55
+ """
56
+ if isinstance(key, int) and isinstance(value, Element) and len(self.__elements) > 0:
57
+ self.__elements[min(key, len(self.__elements) - 1)] = value
58
+ elif isinstance(key, str):
59
+ self.__attributes[key] = value
60
+ else:
61
+ raise ValueError('Invalid key or value')
62
+
63
+ def __contains__(self, key: Union[Self, str]) -> bool:
64
+ """
65
+ Checks if an Element object or an attribute exists.
66
+ :param key: Element object or attribute key.
67
+ :return: True, if either the passed Element object or the attribute exists. Else return False.
68
+ """
69
+ if isinstance(key, Element):
70
+ return key in self.__elements
71
+ elif isinstance(key, str):
72
+ return key in self.__attributes
73
+ return False
74
+
75
+ @property
76
+ def type(self) -> XMLType:
77
+ return self.__type
78
+
79
+ @type.setter
80
+ def type(self, value: XMLType) -> None:
81
+ self.__type = value
82
+
83
+ @property
84
+ def attributes(self) -> dict[str, str]:
85
+ return self.__attributes
86
+
87
+ @property
88
+ def elements(self) -> list[Self]:
89
+ return self.__elements
90
+
91
+ @property
92
+ def id(self) -> Optional[str]:
93
+ return self.__attributes.get('id', None)
94
+
95
+ @id.setter
96
+ def id(self, value: Optional[str]) -> None:
97
+ if value is None:
98
+ self.__attributes.pop('id', None)
99
+ else:
100
+ self.__attributes['id'] = value
101
+
102
+ @property
103
+ def text(self) -> Optional[str]:
104
+ return self.__text
105
+
106
+ @text.setter
107
+ def text(self, value: Optional[str]) -> None:
108
+ self.__text = None if value is None else str(value)
109
+
110
+ @classmethod
111
+ def new(cls, _type: XMLType, **attributes: str) -> Self:
112
+ """
113
+ Create a new Element object from scratch.
114
+ :param _type: The type of element to create.
115
+ :param attributes: Named arguments that will be stores as xml attributes.
116
+ :return: The newly created Element object.
117
+ """
118
+ attributes = {str(k): str(v) for k, v in attributes.items() if v is not None}
119
+ return cls(_type, attributes)
120
+
121
+ @classmethod
122
+ def from_etree(cls, tree: etree.Element) -> Self:
123
+ """
124
+ Create a new Element object from a lxml etree object.
125
+ :param tree: lxml etree object.
126
+ :return: Element object that represents the passed etree object.
127
+ """
128
+ element = cls(XMLType(tree.tag.split('}')[1]), dict(tree.items()))
129
+ element.text = tree.text
130
+ for child in tree:
131
+ element.add_element(Element.from_etree(child))
132
+ return element
133
+
134
+ def to_etree(self) -> etree.Element:
135
+ """
136
+ Convert the Element object to a lxml etree object.
137
+ :return: A lxml etree object that represents this Element object.
138
+ """
139
+ element = etree.Element(self.__type.value, **self.__attributes)
140
+ if self.__text is not None:
141
+ element.text = self.__text
142
+ for child in self.__elements:
143
+ element.append(child.to_etree())
144
+ return element
145
+
146
+ def is_region(self) -> bool:
147
+ """ Returns True, if the Element object is a region. """
148
+ return self.__type.value.endswith('Region')
149
+
150
+ def contains_text(self) -> bool:
151
+ """ Returns True, if the Element object contains text. """
152
+ return self.__text is not None
153
+
154
+ def set_attribute(self, key: str, value: Optional[str]) -> None:
155
+ """
156
+ Set an attribute.
157
+ :param key: Key of attribute. Creates a new one if the key does not exist.
158
+ :param value: Value for the attribute. Deletes the attribute of None is passed.
159
+ """
160
+ if value is None:
161
+ self.__attributes.pop(str(key), None)
162
+ else:
163
+ self.__attributes[str(key)] = str(value)
164
+
165
+ def delete_attribute(self, key: str) -> Optional[str]:
166
+ """
167
+ Delete an attribute.
168
+ :param key: The key to delete.
169
+ :return: Returns the deleted attribute value. If the key does not exist, None is returned.
170
+ """
171
+ return self.__attributes.pop(str(key), None)
172
+
173
+ def get_coords(self) -> Optional[Self]:
174
+ """ Return the first direct child Element object of type Coords. """
175
+ for element in self.__elements:
176
+ if element.type == XMLType.Coords:
177
+ return element
178
+ return None
179
+
180
+ def get_baseline(self) -> Optional[Self]:
181
+ """ Return the first direct child Element object of type Baseline. """
182
+ for element in self.__elements:
183
+ if element.type == XMLType.Baseline:
184
+ return element
185
+ return None
186
+
187
+ def add_element(self, element: Self, index: Optional[int] = None) -> None:
188
+ """
189
+ Add an existing Element object to the list elements.
190
+ :param element: The element to add.
191
+ :param index: If set, insert the element at this index. Else append to the list.
192
+ """
193
+ if index is None:
194
+ self.__elements.append(element)
195
+ else:
196
+ self.__elements.insert(min(index, len(self.__elements) - 1), element)
197
+
198
+ def create_element(self, _type: XMLType, index: int = None, **attributes: str) -> Self:
199
+ """
200
+ Create a new Element object and add it to the list of elements.
201
+ :param _type: XMLType of new element.
202
+ :param index: If set, insert the new element at this index. Else append to the list.
203
+ :param attributes: Named arguments that will be stores as xml attributes.
204
+ :return: The newly created Element object.
205
+ """
206
+ element = Element.new(_type, **attributes)
207
+ self.add_element(element, index)
208
+ return element
209
+
210
+ def remove_element(self, element: Union[Self, int]) -> Optional[Self]:
211
+ """
212
+ Remove an element from the list of elements.
213
+ :param element: The Element object or the index of the element to remove.
214
+ :return: The removed element, if it existed.
215
+ """
216
+ if isinstance(element, int) and element < len(self.__elements) - 1:
217
+ return self.__elements.pop(element)
218
+ elif isinstance(element, Element) and element in self.__elements:
219
+ self.__elements.remove(element)
220
+ return element
221
+ return None
222
+
223
+ def clear_elements(self) -> None:
224
+ """ Remove all Element objects from the list of elements. """
225
+ self.__elements.clear()