PyPI - pypxml - Versions diffs - 1.0__tar.gz - Mend

pypxml 1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21) hide show

pypxml-1.0/LICENSE +21 -0
pypxml-1.0/MANIFEST.in +7 -0
pypxml-1.0/PKG-INFO +156 -0
pypxml-1.0/README.md +118 -0
pypxml-1.0/pyproject.toml +32 -0
pypxml-1.0/setup.cfg +4 -0
pypxml-1.0/src/cli/__init__.py +0 -0
pypxml-1.0/src/cli/pypxml_cli.py +15 -0
pypxml-1.0/src/pypxml/__init__.py +9 -0
pypxml-1.0/src/pypxml/element.py +225 -0
pypxml-1.0/src/pypxml/page.py +244 -0
pypxml-1.0/src/pypxml/pxml.py +223 -0
pypxml-1.0/src/pypxml/resources/__init__.py +4 -0
pypxml-1.0/src/pypxml/resources/xml_schema.py +43 -0
pypxml-1.0/src/pypxml/resources/xml_types.py +59 -0
pypxml-1.0/src/pypxml.egg-info/PKG-INFO +156 -0
pypxml-1.0/src/pypxml.egg-info/SOURCES.txt +19 -0
pypxml-1.0/src/pypxml.egg-info/dependency_links.txt +1 -0
pypxml-1.0/src/pypxml.egg-info/entry_points.txt +2 -0
pypxml-1.0/src/pypxml.egg-info/requires.txt +2 -0
pypxml-1.0/src/pypxml.egg-info/top_level.txt +2 -0

pypxml-1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 Janik Haitz
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

pypxml-1.0/MANIFEST.in ADDED Viewed

@@ -0,0 +1,7 @@
+recursive-include src/pypxml/resources *
+recursive-include src/pypxml *.py
+recursive-include src/cli *.py
+# Include the README and LICENSE files
+include README.md
+include LICENSE

pypxml-1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,156 @@
+Metadata-Version: 2.1
+Name: pypxml
+Version: 1.0
+Summary: A python library for parsing, converting and modifying PageXML files.
+Author-email: Janik Haitz <jahtz.dev@proton.me>
+License: MIT License
+        Copyright (c) 2024 Janik Haitz
+        Permission is hereby granted, free of charge, to any person obtaining a copy
+        of this software and associated documentation files (the "Software"), to deal
+        in the Software without restriction, including without limitation the rights
+        to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+        copies of the Software, and to permit persons to whom the Software is
+        furnished to do so, subject to the following conditions:
+        The above copyright notice and this permission notice shall be included in all
+        copies or substantial portions of the Software.
+        THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+        IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+        FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+        AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+        LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+        OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+        SOFTWARE.
+Project-URL: repository, https://github.com/jahtz/pypxml
+Keywords: PageXML,XML,OCR,optical character recognition
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.11
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: lxml~=5.3.0
+Requires-Dist: click~=8.1.7
+# PyPXML
+A Python library for parsing, converting and modifying PageXML files.
+## Setup
+```shell
+pip install pypxml
+```
+### Install from source
+1. Clone repository: `git clone https://github.com/jahtz/pypxml`
+2. Install package: `cd pypxml && pip install .`
+3. Test with `pypxml --version`
+## CLI
+```
+pypxml [OPTIONS] COMMAND [ARGS]...
+```
+Coming in version 2.x
+## API
+PyPXML provides a feature-rich Python API for working with PageXML files.
+### Basics
+```python
+from pypxml import PageXML, Page, Element, XMLType
+pxml = PageXML.from_xml('path_to_pagexml.xml')
+page1 = pxml.create_page(imageFilename='0001.png',
+                         imageWidth=1000,
+                         imageHeight=2500)
+page1.create_element(XMLType.TextRegion, id='ir01')
+pxml.to_xml('path_to_output.xml')
+```
+### PageXML class
+```python
+from pypxml import PageXML
+# open file
+pxml = PageXML.from_xml('path_to.xml')
+# or create new PageXML
+pxml = PageXML.new()
+# edit metadata
+pxml.creator = 'yourname'
+...
+# create a page
+page = pxml.create_page(imageFilename='0001.png',
+                        imageWidth=1000,
+                        imageHeight=2500)
+# or add existing page
+pxml.add_page(page)  # see below
+# iterate over pages
+for page in pxml:
+    ...
+# delete or modify pages
+pxml[0] = ...
+pxml.remove_page(pxml[1])
+# save object to file
+pxml.to_xml('output.xml')
+...
+```
+### Page class
+```python
+from pypxml import Page, XMLType
+# create a page
+page = Page.new(imageFilename='0001.png',
+                imageWidth=1000,
+                imageHeight=2500)
+# modify attributes
+page['imageFilename'] = '0002.png'
+# or get element by index
+element = page[3]
+# add elements (automatically added to reading order if it is a region)
+text_region = page.create_element(XMLType.TextRegion, id='tr1')
+# or add existing element
+page.add_element(element)
+# iterate over regions
+for region in page:
+    ...
+...
+```
+### Element class
+```python
+from pypxml import Element, XMLType
+# create an element
+coords = Element.new(XMLType.Coords,
+                     points='1,2 3,4 5,6 7,8')
+# modify attributes
+coords['points'] = 'some other coords'
+# or get element by index
+baseline = text_region[2]
+# check if element is a region
+if text_region.is_region():
+    ...
+# get coords and baseline, if they exist
+coords = text_line.get_coords()
+baseline = text_line.get_baseline()
+...
+```
+## ZPD
+Developed at the [Centre for Philology and Digitality](https://www.uni-wuerzburg.de/en/zpd/) (ZPD), [University of Würzburg](https://www.uni-wuerzburg.de/en/).

pypxml-1.0/README.md ADDED Viewed

@@ -0,0 +1,118 @@
+# PyPXML
+A Python library for parsing, converting and modifying PageXML files.
+## Setup
+```shell
+pip install pypxml
+```
+### Install from source
+1. Clone repository: `git clone https://github.com/jahtz/pypxml`
+2. Install package: `cd pypxml && pip install .`
+3. Test with `pypxml --version`
+## CLI
+```
+pypxml [OPTIONS] COMMAND [ARGS]...
+```
+Coming in version 2.x
+## API
+PyPXML provides a feature-rich Python API for working with PageXML files.
+### Basics
+```python
+from pypxml import PageXML, Page, Element, XMLType
+pxml = PageXML.from_xml('path_to_pagexml.xml')
+page1 = pxml.create_page(imageFilename='0001.png',
+                         imageWidth=1000,
+                         imageHeight=2500)
+page1.create_element(XMLType.TextRegion, id='ir01')
+pxml.to_xml('path_to_output.xml')
+```
+### PageXML class
+```python
+from pypxml import PageXML
+# open file
+pxml = PageXML.from_xml('path_to.xml')
+# or create new PageXML
+pxml = PageXML.new()
+# edit metadata
+pxml.creator = 'yourname'
+...
+# create a page
+page = pxml.create_page(imageFilename='0001.png',
+                        imageWidth=1000,
+                        imageHeight=2500)
+# or add existing page
+pxml.add_page(page)  # see below
+# iterate over pages
+for page in pxml:
+    ...
+# delete or modify pages
+pxml[0] = ...
+pxml.remove_page(pxml[1])
+# save object to file
+pxml.to_xml('output.xml')
+...
+```
+### Page class
+```python
+from pypxml import Page, XMLType
+# create a page
+page = Page.new(imageFilename='0001.png',
+                imageWidth=1000,
+                imageHeight=2500)
+# modify attributes
+page['imageFilename'] = '0002.png'
+# or get element by index
+element = page[3]
+# add elements (automatically added to reading order if it is a region)
+text_region = page.create_element(XMLType.TextRegion, id='tr1')
+# or add existing element
+page.add_element(element)
+# iterate over regions
+for region in page:
+    ...
+...
+```
+### Element class
+```python
+from pypxml import Element, XMLType
+# create an element
+coords = Element.new(XMLType.Coords,
+                     points='1,2 3,4 5,6 7,8')
+# modify attributes
+coords['points'] = 'some other coords'
+# or get element by index
+baseline = text_region[2]
+# check if element is a region
+if text_region.is_region():
+    ...
+# get coords and baseline, if they exist
+coords = text_line.get_coords()
+baseline = text_line.get_baseline()
+...
+```
+## ZPD
+Developed at the [Centre for Philology and Digitality](https://www.uni-wuerzburg.de/en/zpd/) (ZPD), [University of Würzburg](https://www.uni-wuerzburg.de/en/).

pypxml-1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,32 @@
+[build-system]
+requires = ["setuptools>=61.0"]
+build-backend = "setuptools.build_meta"
+[project]
+name = "pypxml"
+description = "A Python library for parsing, converting and modifying PageXML files."
+keywords = ["PageXML", "XML", "OCR", "optical character recognition"]
+version = "1.0"
+readme = "README.md"
+license = { file = "LICENSE" }
+authors = [
+    { name="Janik Haitz", email="jahtz.dev@proton.me" },
+]
+classifiers = [
+    "Programming Language :: Python :: 3",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+]
+dependencies = [
+    "lxml ~= 5.3.0",
+    "click ~= 8.1.7",
+]
+requires-python = ">=3.11"
+scripts = { pypxml = "cli.pypxml_cli:cli" }
+[project.urls]
+repository = "https://github.com/jahtz/pypxml"
+[tool.setuptools.packages.find]
+where = ["src"]
+exclude = ["tests", "assets"]

pypxml-1.0/setup.cfg ADDED Viewed

@@ -0,0 +1,4 @@
+[egg_info]
+tag_build =
+tag_date = 0

pypxml-1.0/src/cli/__init__.py ADDED Viewed

File without changes

pypxml-1.0/src/cli/pypxml_cli.py ADDED Viewed

@@ -0,0 +1,15 @@
+import click
+@click.group()
+@click.help_option('--help')
+@click.version_option('1.0', '--version',
+                      prog_name='PyPXML',
+                      message='%(prog)s v%(version)s - Developed at Centre for Philology and Digitality (ZPD), '
+                              'University of Würzburg')
+def cli():
+    """
+    PyPXML command line interface entry point.
+    """
+    pass

pypxml-1.0/src/pypxml/__init__.py ADDED Viewed

@@ -0,0 +1,9 @@
+# This file is licensed under the MIT License.
+# Copyright (c) 2024 Janik Haitz
+# See the LICENSE file in the root directory for more details.
+from .pxml import PageXML
+from .page import Page
+from .element import Element
+from .resources.xml_schema import XMLSchema
+from .resources.xml_types import XMLType

pypxml-1.0/src/pypxml/element.py ADDED Viewed

@@ -0,0 +1,225 @@
+# This file is licensed under the MIT License.
+# Copyright (c) 2024 Janik Haitz
+# See the LICENSE file in the root directory for more details.
+from collections.abc import Iterator
+from typing import Optional, Union, Self
+from lxml import etree
+from .resources.xml_types import XMLType
+class Element:
+    """ Represents an element inside a page. """
+    def __init__(self, _type: XMLType, attributes: Optional[dict[str, str]] = None):
+        self.__type: XMLType = _type
+        self.__attributes: dict[str, str] = attributes if attributes else {}
+        self.__elements: list[Element] = []
+        self.__text: Optional[str] = None
+    def __len__(self) -> int:
+        """ Returns the number of elements. """
+        return len(self.__elements)
+    def __iter__(self) -> Self:
+        """ Iterator: starting point for iterating over all elements. """
+        self.__n = 0
+        return self
+    def __next__(self) -> Self:
+        """ Iterator: yield next element. """
+        if self.__n < len(self.__elements):
+            self.__n += 1
+            return self.__elements[self.__n - 1]
+        else:
+            raise StopIteration
+    def __getitem__(self, key: Union[int, str]) -> Optional[Union[Self, str]]:
+        """
+        Get an Element object by its index or an attribute value by its key
+        :param key: Index (integer) of an Element object or a key (string) of an attribute.
+        :return: The Element of passed index (returns last object if the key is out of range) or the value of the
+            selected attribute. Returns None, if no match was found.
+        """
+        if isinstance(key, int) and len(self.__elements) > 0:
+            return self.__elements[min(key, len(self.__elements) - 1)]
+        elif isinstance(key, str) and key in self.__attributes:
+            return self.__attributes[key]
+        return None
+    def __setitem__(self, key: Union[int, str], value: Union[Self, str]) -> None:
+        """
+        Set an Element object or an attribute value.
+        :param key: Index (integer) for an Element object or a key (string) for an attribute.
+        :param value: Element object (if key is of type integer) or a string (if key is of type string).
+        """
+        if isinstance(key, int) and isinstance(value, Element) and len(self.__elements) > 0:
+            self.__elements[min(key, len(self.__elements) - 1)] = value
+        elif isinstance(key, str):
+            self.__attributes[key] = value
+        else:
+            raise ValueError('Invalid key or value')
+    def __contains__(self, key: Union[Self, str]) -> bool:
+        """
+        Checks if an Element object or an attribute exists.
+        :param key: Element object or attribute key.
+        :return: True, if either the passed Element object or the attribute exists. Else return False.
+        """
+        if isinstance(key, Element):
+            return key in self.__elements
+        elif isinstance(key, str):
+            return key in self.__attributes
+        return False
+    @property
+    def type(self) -> XMLType:
+        """ The XMLType of this element. """
+        return self.__type
+    @type.setter
+    def type(self, value: XMLType) -> None:
+        """ Replace the XMLType of this element. The value is stored without any validation. """
+        self.__type = value
+    @property
+    def attributes(self) -> dict[str, str]:
+        """ The attribute dictionary of this element. The internal dictionary itself is returned, not a copy. """
+        return self.__attributes
+    @property
+    def elements(self) -> list[Self]:
+        """ The list of direct child Element objects. The internal list itself is returned, not a copy. """
+        return self.__elements
+    @property
+    def id(self) -> Optional[str]:
+        return self.__attributes.get('id', None)
+    @id.setter
+    def id(self, value: Optional[str]) -> None:
+        if value is None:
+            self.__attributes.pop('id', None)
+        else:
+            self.__attributes['id'] = value
+    @property
+    def text(self) -> Optional[str]:
+        return self.__text
+    @text.setter
+    def text(self, value: Optional[str]) -> None:
+        self.__text = None if value is None else str(value)
+    @classmethod
+    def new(cls, _type: XMLType, **attributes: str) -> Self:
+        """
+        Create a new Element object from scratch.
+        :param _type: The type of element to create.
+        :param attributes: Named arguments that will be stores as xml attributes.
+        :return: The newly created Element object.
+        """
+        attributes = {str(k): str(v) for k, v in attributes.items() if v is not None}
+        return cls(_type, attributes)
+    @classmethod
+    def from_etree(cls, tree: etree.Element) -> Self:
+        """
+        Create a new Element object from a lxml etree object.
+        :param tree: lxml etree object.
+        :return: Element object that represents the passed etree object.
+        """
+        element = cls(XMLType(tree.tag.split('}')[1]), dict(tree.items()))
+        element.text = tree.text
+        for child in tree:
+            element.add_element(Element.from_etree(child))
+        return element
+    def to_etree(self) -> etree.Element:
+        """
+        Convert the Element object to a lxml etree object.
+        :return: A lxml etree object that represents this Element object.
+        """
+        element = etree.Element(self.__type.value, **self.__attributes)
+        if self.__text is not None:
+            element.text = self.__text
+        for child in self.__elements:
+            element.append(child.to_etree())
+        return element
+    def is_region(self) -> bool:
+        """ Returns True, if the Element object is a region. """
+        return self.__type.value.endswith('Region')
+    def contains_text(self) -> bool:
+        """ Returns True, if the Element object contains text. """
+        return self.__text is not None
+    def set_attribute(self, key: str, value: Optional[str]) -> None:
+        """
+        Set an attribute.
+        :param key: Key of attribute. Creates a new one if the key does not exist.
+        :param value: Value for the attribute. Deletes the attribute of None is passed.
+        """
+        if value is None:
+            self.__attributes.pop(str(key), None)
+        else:
+            self.__attributes[str(key)] = str(value)
+    def delete_attribute(self, key: str) -> Optional[str]:
+        """
+        Delete an attribute.
+        :param key: The key to delete.
+        :return: Returns the deleted attribute value. If the key does not exist, None is returned.
+        """
+        return self.__attributes.pop(str(key), None)
+    def get_coords(self) -> Optional[Self]:
+        """ Return the first direct child Element object of type Coords. """
+        for element in self.__elements:
+            if element.type == XMLType.Coords:
+                return element
+        return None
+    def get_baseline(self) -> Optional[Self]:
+        """ Return the first direct child Element object of type Baseline. """
+        for element in self.__elements:
+            if element.type == XMLType.Baseline:
+                return element
+        return None
+    def add_element(self, element: Self, index: Optional[int] = None) -> None:
+        """
+        Add an existing Element object to the list elements.
+        :param element: The element to add.
+        :param index: If set, insert the element at this index. Else append to the list.
+        """
+        if index is None:
+            self.__elements.append(element)
+        else:
+            self.__elements.insert(min(index, len(self.__elements) - 1), element)
+    def create_element(self, _type: XMLType, index: int = None, **attributes: str) -> Self:
+        """
+        Create a new Element object and add it to the list of elements.
+        :param _type: XMLType of new element.
+        :param index: If set, insert the new element at this index. Else append to the list.
+        :param attributes: Named arguments that will be stores as xml attributes.
+        :return: The newly created Element object.
+        """
+        element = Element.new(_type, **attributes)
+        self.add_element(element, index)
+        return element
+    def remove_element(self, element: Union[Self, int]) -> Optional[Self]:
+        """
+        Remove an element from the list of elements.
+        :param element: The Element object or the index of the element to remove.
+        :return: The removed element, if it existed.
+        """
+        if isinstance(element, int) and element < len(self.__elements) - 1:
+            return self.__elements.pop(element)
+        elif isinstance(element, Element) and element in self.__elements:
+            self.__elements.remove(element)
+            return element
+        return None
+    def clear_elements(self) -> None:
+        """ Remove all Element objects from the list of elements. """
+        self.__elements.clear()