PyPI - docp - Versions diffs - 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

docp 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

docp/objects/_textobject.py CHANGED Viewed

@@ -0,0 +1,64 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the implementation for the
+            ``TextObject`` object.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+:Comments:  n/a
+"""
+class TextObject:
+    """This class provides the implementation for the ``TextObject``.
+    For each page (or slide) in a document, an instance of this class is
+    created, populated and appended into the page's ``texts`` list
+    attribute.
+    Args:
+        content (str): Page content as a single string.
+    Note:
+        No string cleaning is performed by this class. The string
+        contained in the :attr:`content` attribute is stored exactly as
+        extracted from the page or slide's text object.
+    """
+    __slots__ = ('_content', '_hastext')
+    def __init__(self, content: str):
+        """Text object class initialiser."""
+        self._content = content
+        self._hastext = bool(content)
+    def __str__(self) -> str:
+        """When printing this object, display the text contents."""
+        return self._content
+    @property
+    def content(self) -> str:
+        """Accessor to the textual content."""
+        return self._content
+    @content.setter
+    def content(self, value: str) -> None:
+        """Setter for the ``content`` attribute.
+        If the ``value`` argument is populated, the content is set and
+        the ``hastext`` attribute is set to ``True``.
+        """
+        if value:
+            self._content = value
+            self._hastext = True
+    @property
+    def hastext(self) -> bool:
+        """Flag indicating if the ``content`` attribute is populated."""
+        return self._hastext

docp/objects/pdfobject.py CHANGED Viewed

@@ -12,9 +12,13 @@
 :Comments:  n/a
 """
-# pylint: disable=import-error
-from objects._docbaseobject import _DocBase
+try:
+    from .objects._docbaseobject import _DocBase
+    from .objects._pageobject import PageObject
+except ImportError:
+    from objects._docbaseobject import _DocBase
+    from objects._pageobject import PageObject
 class DocPDF(_DocBase):
@@ -24,6 +28,24 @@ class DocPDF(_DocBase):
         """PDF document object class initialiser."""
         super().__init__()
         self._tags = False
+        # List of PageObjects, offset by 1 to align the index with page numbers.
+        self._pages = [PageObject(pageno=0)]
+    @property
+    def pages(self) -> list[PageObject]:
+        """A list containing an object for each page in the document.
+        .. tip::
+            The page number index aligns to the page number in the PDF
+            file.
+            For example, to access the ``PageObject`` for page 42, use::
+                pages[42]
+       """
+        return self._pages
     @property
     def parsed_using_tags(self) -> bool:

docp/objects/pptxobject.py ADDED Viewed

@@ -0,0 +1,46 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the 'PPTX Document' object structure
+            into which MS PowerPoint documents are parsed for
+            transport and onward use.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+:Comments:  n/a
+"""
+try:
+    from .objects._docbaseobject import _DocBase
+    from .objects._slideobject import SlideObject
+except ImportError:
+    from objects._docbaseobject import _DocBase
+    from objects._slideobject import SlideObject
+class DocPPTX(_DocBase):
+    """Container class for storing data parsed from a PPTX file."""
+    def __init__(self):
+        """PPTX document object class initialiser."""
+        super().__init__()
+        self._slides = [SlideObject(pageno=0)]
+    @property
+    def slides(self) -> list[SlideObject]:
+        """A list containing an object for each slide in the document.
+        .. tip::
+            The slide number index aligns to the slide number in the
+            PPTX file.
+            For example, to access the ``SlideObject`` for slide 42, use::
+                slides[42]
+       """
+        return self._slides

docp/parsers/_pdfbaseparser.py CHANGED Viewed

@@ -8,13 +8,17 @@
 :Developer: J Berendt
 :Email:     development@s3dev.uk
-Note:       This module is *not* designed to be interacted with
+:Comments:  n/a
+.. attention::
+            This module is *not* designed to be interacted with
             directly, only via the appropriate interface class(es).
             Rather, please create an instance of a PDF document parsing
-            object using the following:
+            object using the following class:
-                - :class:`pdfparser.PDFParser`
+                - :class:`~docp.parsers.pdfparser.PDFParser`
 """
 # pylint: disable=import-error
@@ -26,7 +30,12 @@ import pdfplumber
 from collections import Counter
 from unidecode import unidecode
 # locals
-from objects.pdfobject import DocPDF
+try:
+    from .libs.utilities import utilities
+    from .objects.pdfobject import DocPDF
+except ImportError:
+    from libs.utilities import utilities
+    from objects.pdfobject import DocPDF
 class _PDFBaseParser:
@@ -94,7 +103,7 @@ class _PDFBaseParser:
             case 1: num = 1
             case _ if npages in range(2, 11): num = 2
             case _: num = 5
-        pg = self._doc.parser.pages[num]  # The pages list has a has a page offset at [0].
+        pg = self._doc.parser.pages[num - 1]  # The parser does not have a page offset at [0].
         # Default coordinates to the whole page.
         coords = {'x0': 0, 'top': 0, 'x1': pg.width, 'bottom': pg.height}
         # If the header and/or footer is to be skipped, find and iterate
@@ -117,6 +126,13 @@ class _PDFBaseParser:
     def _open(self) -> None:
         """Open the PDF document for reading.
+        Before opening the file, a test is performed to ensure the PDF
+        is valid. The file must:
+            - exist
+            - be a valid PDF file, per the file signature
+            - have a .pdf file extension
         :Other Operations:
             - Store the ``pdfplumber`` parser object returned from the
@@ -127,10 +143,20 @@ class _PDFBaseParser:
             - Store the document's meta data into the
               :attr:`self._doc._meta` attribute.
+        Raises:
+            TypeError: Raised if the file type criteria above are not
+            met.
         """
-        self._doc._parser = pdfplumber.open(self._doc._fpath)
-        self._doc._npages = len(self._doc._parser.pages)
-        self._doc._meta = self._doc._parser.metadata
+        if all((os.path.exists(self._doc._fpath),
+                utilities.ispdf(self._doc._fpath),
+                os.path.splitext(self._doc._fpath)[1].lower() == '.pdf')):
+            self._doc._parser = pdfplumber.open(self._doc._fpath)
+            self._doc._npages = len(self._doc._parser.pages)
+            self._doc._meta = self._doc._parser.metadata
+        else:
+            msg = f'{self._doc._fname} is not a valid PDF file.'
+            raise TypeError(msg)
     @staticmethod
     def _prepare_row(row: list) -> str:
@@ -196,13 +222,13 @@ class _PDFBaseParser:
         # Only scan if document has more than three pages.
         if self._doc.npages < 4:
             return []
-        if self._doc.common is None:
+        if self._doc._common is None:
             # Create a line generator for all pages.
             lines = (l for p in self._doc.parser.pages for l in p.extract_text().split('\n'))
             # Return the lines whose occurrence rate is 90% of document pages.
             self._doc._common = [i[0] for i in Counter(lines).most_common()
                                  if i[1] > self._doc.npages * 0.9]
-        return self._doc.common
+        return self._doc._common
     def _set_paths(self) -> None:
         """Set the document's file path attributes."""

docp/parsers/_pdftableparser.py CHANGED Viewed

@@ -8,14 +8,15 @@
 :Developer: J Berendt
 :Email:     jeremy.berendt@rolls-royce.com
-Note:       This module is *not* designed to be interacted with
+.. attention::
+            This module is *not* designed to be interacted with
             directly, only via the appropriate interface class(es).
             Rather, please create an instance of a PDF document parsing
             object using the following:
-                - :class:`pdfparser.PDFParser`
+                - :class:`~docp.parsers.pdfparser.PDFParser`
 """
 # pylint: disable=import-error
@@ -35,7 +36,6 @@ _SETTINGS = {'vertical_strategy': 'lines',
              'snap_x_tolerance': 12}
-# TODO: Revise the docstring.
 class _PDFTableParser(_PDFBaseParser):
     """Private PDF document table parser intermediate class.
@@ -46,10 +46,9 @@ class _PDFTableParser(_PDFBaseParser):
         Extract tables from a PDF file::
-            >>> from docutils.parsers.pdf import PDFParser
+            >>> from docp import PDFParser
-            >>> path = '/path/to/myfile.pdf'
-            >>> pdf = PDFParser(path)
+            >>> pdf = PDFParser(path='/path/to/myfile.pdf')
             >>> pdf.extract_tables()
             >>> tables = pdf.doc.tables

docp/parsers/_pdftextparser.py CHANGED Viewed

@@ -8,17 +8,22 @@
 :Developer: J Berendt
 :Email:     development@s3dev.uk
-Note:       This module is *not* designed to be interacted with
+.. attention::
+            This module is *not* designed to be interacted with
             directly, only via the appropriate interface class(es).
             Rather, please create an instance of a PDF document parsing
             object using the following:
-                - :class:`pdfparser.PDFParser`
+                - :class:`~docp.parsers.pdfparser.PDFParser`
+.. note::
+            **Multi-processing**
-Note:       **Multi-processing:**
             Text extraction through multi-processing has been tested and
-            is not feesible due to an error indicating
+            is not feasible due to an error indicating
             the ``pdfplumber.page.Page`` object can not be pickled. This
             object was being passed into the extraction method as the
             object contains the :func:`extract_text` function.
@@ -35,17 +40,17 @@ Note:       **Multi-processing:**
             It has therefore been determined that this module will remain
             single-threaded.
-           **Multi-Thread Timings**
+            **Multi-Thread Timings**
-           **Single-threaded:**
+                - **Single-threaded:**
-                - 14 page document: ~2 seconds
-                - 92 page document: ~32 seconds
+                  - 14 page document: ~2 seconds
+                  - 92 page document: ~32 seconds
-           **Multi-threaded:**
+                - **Multi-threaded:**
-                - 14 page document: ~2 seconds
-                - 92 page document: ~35 seconds
+                  - 14 page document: ~2 seconds
+                  - 92 page document: ~35 seconds
 """
 # pylint: disable=import-error
@@ -83,7 +88,8 @@ class _PDFTextParser(_PDFBaseParser):
                      remove_footer: bool=False,
                      remove_newlines: bool=False,
                      ignore_tags: set=None,
-                     convert_to_ascii: bool=True):
+                     convert_to_ascii: bool=True,
+                     **kwargs):
         """Extract text from the document.
         If the PDF document contains 'marked content' tags, these tags
@@ -125,10 +131,14 @@ class _PDFTextParser(_PDFBaseParser):
                 converted, it is replaced with a ``'?'``.
                 Defaults to True.
+        :Keyword Args:
+            - None
         Returns:
             None.
         """
+        # pylint: disable=unused-argument  # **kwargs
         # pylint: disable=unnecessary-dunder-call
         if len(self.doc.pages) > 1:
             # Reinitialise the doc object and reopen the document.
@@ -216,7 +226,7 @@ class _PDFTextParser(_PDFBaseParser):
         yield ''
     def _uses_marked_content(self) -> bool:
-        """Test wether the document can be parsed using tags.
+        """Test whether the document can be parsed using tags.
         Marked content allows us to parse the PDF using tags (rather than
         OCR) which is more accurate not only in terms of character

docp/parsers/_pptxbaseparser.py ADDED Viewed

@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides generalised base functionality for
+            parsing PPTX documents.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+.. attention::
+            This module is *not* designed to be interacted with
+            directly, only via the appropriate interface class(es).
+            Rather, please create an instance of a PPTX document parsing
+            object using the following:
+                - :class:`~docp.parsers.pptxparser.PPTXParser`
+"""
+# pylint: disable=protected-access
+import os
+from pptx import Presentation
+# locals
+try:
+    from libs.utilities import utilities
+    from objects.pptxobject import DocPPTX
+except ImportError:
+    from .libs.utilities import utilities
+    from .objects.pptxobject import DocPPTX
+class _PPTXBaseParser:
+    """Base class containing generalised PPTX parsing functionality."""
+    def __init__(self, path: str):
+        """Private base parser class initialiser.
+        Args:
+            path (str): Full path to the document to be parsed.
+        """
+        self._path = path
+        self._doc = DocPPTX()
+        self._set_paths()
+        self._open()
+    @property
+    def doc(self) -> DocPPTX:
+        """Accessor to the document object."""
+        return self._doc
+    def _open(self) -> None:
+        """Open the PPTX document for reading.
+        Before opening the file, a test is performed to ensure the PPTX
+        is valid. The file must:
+            - exist
+            - be a ZIP archive, per the file signature
+            - have a .pptx file extension
+        :Other Operations:
+            - Store the ``pptx.Presentation`` parser object returned
+              from the :func:`pptx.Presentation` instance creation into
+              the :attr:`self._doc._parser` attribute.
+            - Store the number of pages into the
+              :attr:`self._doc._npages` attribute.
+            - Store the document's meta data into the
+              :attr:`self._doc._meta` attribute.
+        Raises:
+            TypeError: Raised if the file type criteria above are not
+            met.
+        """
+        if all((os.path.exists(self._doc._fpath),
+                utilities.iszip(self._doc._fpath),
+                os.path.splitext(self._doc._fpath)[1].lower() == '.pptx')):
+            self._doc._parser = Presentation(self._doc._fpath)
+            self._doc._npages = len(self._doc._parser.slides)
+            self._doc._meta = self._doc._parser.core_properties
+        else:
+            msg = f'{self._doc._fname} is not a valid PPTX file.'
+            raise TypeError(msg)
+    def _set_paths(self) -> None:
+        """Set the document's file path attributes."""
+        self._doc._fpath = os.path.realpath(self._path)
+        self._doc._fname = os.path.basename(self._path)

docp/parsers/_pptxtextparser.py ADDED Viewed

@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the logic for parsing text from a PPTX
+            document.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+.. attention::
+            This module is *not* designed to be interacted with
+            directly, only via the appropriate interface class(es).
+            Rather, please create an instance of a PPTX document parsing
+            object using the following:
+                - :class:`~docp.parsers.pptxparser.PPTXParser`
+"""
+from unidecode import unidecode
+# locals
+try:
+    from .objects._slideobject import SlideObject
+    from .objects._textobject import TextObject
+    from .parsers._pptxbaseparser import _PPTXBaseParser
+except ImportError:
+    from objects._slideobject import SlideObject
+    from objects._textobject import TextObject
+    from parsers._pptxbaseparser import _PPTXBaseParser
+class _PPTXTextParser(_PPTXBaseParser):
+    """Private PPTX document text parser intermediate class.
+    Args:
+        path (str): Full path to the PPTX document.
+    :Example:
+        Extract text from a PPTX file::
+            >>> from docp import PPTXParser
+            >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
+            >>> pptx.extract_text()
+            # Access the text on slide 1.
+            >>> pg1 = pptx.doc.slides[1].content
+    """
+    def extract_text(self,
+                     *,
+                     remove_newlines: bool=False,
+                     convert_to_ascii: bool=True,
+                     **kwargs) -> None:
+        """Extract text from the document.
+        A list of slides, with extracted content can be accessed using
+        the :attr:`self.doc.slides` attribute.
+        Args:
+            remove_newlines (bool, optional): If True, the newline
+                characters are replaced with a space. Defaults to False.
+            convert_to_ascii (bool, optional): When a non-ASCII character
+                is found, an attempt is made to convert it to an
+                associated ASCII character. If a character cannot be
+                converted, it is replaced with a ``'?'``.
+                Defaults to True.
+        :Keyword Args:
+            - None
+        Returns:
+            None.
+        """
+        # pylint: disable=unused-argument  # **kwargs
+        # pylint: disable=unnecessary-dunder-call
+        if len(self.doc.slides) > 1:
+            # Reinitialise the doc object and reopen the document.
+            self.__init__(path=self._path)
+        self._extract_text(remove_newlines=remove_newlines, convert_to_ascii=convert_to_ascii)
+    def _extract_text(self, remove_newlines: bool, convert_to_ascii: bool) -> None:
+        """Extract the text from all shapes on all slides.
+        Args:
+            remove_newlines (bool): Replace the newline characters with
+                a space.
+            convert_to_ascii (bool): Attempt to convert any non-ASCII
+                characters to their ASCII equivalent.
+        The text extracted from each slide is stored as a ``TextObject``
+        which is appended to the slide's ``texts`` attribute.
+        """
+        for idx, slide in enumerate(self.doc.parser.slides, 1):
+            _slideobj = SlideObject(pageno=idx, parser=slide)
+            for shape in slide.shapes:
+                if hasattr(shape, 'text'):
+                    if shape.text:
+                        text = shape.text
+                        if remove_newlines:
+                            text = text.replace('\n', ' ')
+                        if convert_to_ascii:
+                            text = unidecode(string=text,
+                                             errors='replace',
+                                             replace_str='?')
+                        _textobj = TextObject(content=text)
+                        _slideobj.texts.append(_textobj)
+            self.doc.slides.append(_slideobj)

docp/parsers/pptxparser.py ADDED Viewed

@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module serves as the public interface for interacting
+            with PPTX files and parsing their contents.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+:Comments:  n/a
+:Example:   For example code usage, please refer to the
+            :class:`PPTXParser` class docstring.
+"""
+# Set sys.path for relative imports.
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+# locals
+try:
+    from .parsers._pptxtextparser import _PPTXTextParser
+except ImportError:
+    from parsers._pptxtextparser import _PPTXTextParser
+class PPTXParser(_PPTXTextParser):
+    """PPTX document parser.
+    Args:
+        path (str): Full path to the PPTX document to be parsed.
+    :Example:
+        Extract text from a PPTX file::
+            >>> from docp import PPTXParser
+            >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
+            >>> pptx.extract_text()
+            # Access the text on slide 1.
+            >>> pg1 = pptx.doc.slides[1].content
+    """
+    def __init__(self, path: str):
+        """PPTX parser class initialiser."""
+        super().__init__(path=path)

docp/parsers/putilities.py ADDED Viewed

@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides parser-specific utility functions for
+            the project.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+:Comments:  n/a
+"""
+# locals
+try:
+    from .libs.utilities import utilities
+    from .parsers.pdfparser import PDFParser
+    from .parsers.pptxparser import PPTXParser
+except ImportError:
+    from libs.utilities import utilities
+    from parsers.pdfparser import PDFParser
+    from parsers.pptxparser import PPTXParser
+class ParserUtilities:
+    """Parser-based (cross-project) utility functions."""
+    def get_parser(self, path: str) -> type[PDFParser] | type[PPTXParser]:
+        """Return the appropriate parser class for the file type.
+        Args:
+            path (str): Full path to the file to be tested.
+        Raises:
+            NotImplementedError: Raised if the file signature is neither
+            that of a PDF nor that of a ZIP archive.
+        Returns:
+            type[PDFParser] | type[PPTXParser]: The appropriate parser
+            *class* (not an instance) for the file, given the
+            *file signature*; this test is not file extension based.
+            The caller is responsible for instantiating the returned
+            class with the file path.
+        """
+        # Imported locally, as ``os`` is not imported at module level.
+        import os
+        if utilities.ispdf(path=path):
+            return PDFParser
+        # Any file carrying a ZIP signature is routed to the PPTX parser.
+        if utilities.iszip(path=path):
+            return PPTXParser
+        # The message must be an f-string, otherwise the literal text
+        # 'os.path.basename(path)' is reported rather than the file name.
+        raise NotImplementedError(f'A parser is not available for: {os.path.basename(path)}')
+putilities = ParserUtilities()

docp 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl

docp 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl