docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
- docp/__init__.py +35 -6
- docp/dbs/__init__.py +0 -0
- docp/dbs/chroma.py +197 -0
- docp/libs/_version.py +1 -0
- docp/libs/changelog.py +7 -0
- docp/libs/utilities.py +107 -0
- docp/loaders/__init__.py +38 -0
- docp/loaders/_chromabaseloader.py +338 -0
- docp/loaders/_chromabaseloader.py.bak +378 -0
- docp/loaders/_chromabasepdfloader.py +121 -0
- docp/loaders/_chromabasepptxloader.py +123 -0
- docp/loaders/chroma.py.bak +196 -0
- docp/loaders/chromapdfloader.py +199 -0
- docp/loaders/chromapptxloader.py +192 -0
- docp/loaders/lutilities.py +52 -0
- docp/objects/__init__.py +0 -0
- docp/objects/_docbaseobject.py +65 -0
- docp/objects/_imgobject.py +0 -0
- docp/objects/_pageobject.py +127 -0
- docp/objects/_slideobject.py +110 -0
- docp/objects/_tableobject.py +0 -0
- docp/objects/_textobject.py +64 -0
- docp/objects/pdfobject.py +61 -0
- docp/objects/pptxobject.py +46 -0
- docp/parsers/__init__.py +0 -0
- docp/parsers/_pdfbaseparser.py +236 -0
- docp/parsers/_pdftableparser.py +272 -0
- docp/parsers/_pdftextparser.py +263 -0
- docp/parsers/_pptxbaseparser.py +93 -0
- docp/parsers/_pptxtextparser.py +115 -0
- docp/parsers/pdfparser.py +62 -0
- docp/parsers/pptxparser.py +51 -0
- docp/parsers/putilities.py +48 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
- docp-0.2.0.dist-info/METADATA +110 -0
- docp-0.2.0.dist-info/RECORD +49 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
- docp/_version.py +0 -1
- docp-0.0.0.dev1.dist-info/METADATA +0 -55
- docp-0.0.0.dev1.dist-info/RECORD +0 -7
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
docp/parsers/_pdftextparser.py
@@ -0,0 +1,263 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the logic for parsing text from a PDF
+            document.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+.. attention::
+
+    This module is *not* designed to be interacted with
+    directly, only via the appropriate interface class(es).
+
+    Rather, please create an instance of a PDF document parsing
+    object using the following:
+
+        - :class:`~docp.parsers.pdfparser.PDFParser`
+
+.. note::
+
+    **Multi-processing**
+
+    Text extraction through multi-processing has been tested and
+    is not feasible due to an error indicating
+    the ``pdfplumber.page.Page`` object can not be pickled. This
+    object was being passed into the extraction method as the
+    object contains the :func:`extract_text` function.
+
+    Additionally, multi-threading has also been tested and
+    it was determined to be too complex and inefficient. This was
+    tested using the ``concurrent.futures.ThreadPoolExecutor``
+    class and two documents, 14 and 92 pages; the timings are
+    shown below. The multi-threaded approach took longer to
+    process and added unnecessary complexity to the code base.
+    As a side-effect, the pages are processed and stored out of
+    order which would require a re-order, adding more complexity.
+
+    It has therefore been determined that this module will remain
+    single-threaded.
+
+    **Multi-Thread Timings**
+
+    - **Single-threaded:**
+
+      - 14 page document: ~2 seconds
+      - 92 page document: ~32 seconds
+
+    - **Multi-threaded:**
+
+      - 14 page document: ~2 seconds
+      - 92 page document: ~35 seconds
+
+"""
+# pylint: disable=import-error
+
+from __future__ import annotations
+from unidecode import unidecode
+# locals
+from objects._pageobject import PageObject
+from parsers._pdfbaseparser import _PDFBaseParser
+
+
+class _PDFTextParser(_PDFBaseParser):
+    """Private PDF document text parser intermediate class.
+
+    Args:
+        path (str): Full path to the PDF document.
+
+    :Example:
+
+        Extract text from a PDF file::
+
+            >>> from docp import PDFParser
+
+            >>> pdf = PDFParser(path='/path/to/myfile.pdf')
+            >>> pdf.extract_text()
+
+            # Access the content of page 1.
+            >>> pg1 = pdf.doc.pages[1].content
+
+    """
+
+    def extract_text(self,
+                     *,
+                     remove_header: bool=False,
+                     remove_footer: bool=False,
+                     remove_newlines: bool=False,
+                     ignore_tags: set=None,
+                     convert_to_ascii: bool=True,
+                     **kwargs):
+        """Extract text from the document.
+
+        If the PDF document contains 'marked content' tags, these tags
+        are used to extract the text as this is a more accurate approach
+        and respects the structure of the page(s). Otherwise, a bounding
+        box method is used to extract the text. If instructed, the
+        header and/or footer regions can be excluded.
+
+        .. tip:
+            If a tag-based extract is used, the header/footer should be
+            automatically excluded as these will often have an 'Artifact'
+            tag, which is excluded by default, by passing
+            ``ignore_tags=None``.
+
+            To *keep* the header and footer, pass ``ignore_tags='na'``.
+
+        A list of pages, with extracted content can be accessed using
+        the :attr:`self.doc.pages` attribute.
+
+        Args:
+            remove_header (bool, optional): If True, the header is
+                cropped (skipped) from text extraction. This only applies
+                to the bounding box extraction method. Defaults to False.
+            remove_footer (bool, optional): If True, the footer is
+                cropped (skipped) from text extraction. This only applies
+                to the bounding box extraction method. Defaults to False.
+            remove_newlines (bool, optional): If True, the newline
+                characters are replaced with a space. Defaults to False.
+            ignore_tags (set, optional): If provided, these are the
+                PDF 'marked content' tags which will be ignored. Note
+                that the PDF document must contain tags, otherwise the
+                bounding box method is used and this argument is ignored.
+                Defaults to ``{'Artifact'}``, as these generally
+                relate to a header and/or footer. To include all tags,
+                (not skip any) pass this argument as ``'na'``.
+            convert_to_ascii (bool, optional): When a non-ASCII character
+                is found, an attempt is made to convert it to an
+                associated ASCII character. If a character cannot be
+                converted, it is replaced with a ``'?'``.
+                Defaults to True.
+
+        :Keyword Args:
+            - None
+
+        Returns:
+            None.
+
+        """
+        # pylint: disable=unused-argument  # **kwargs
+        # pylint: disable=unnecessary-dunder-call
+        if len(self.doc.pages) > 1:
+            # Reinitialise the doc object and reopen the document.
+            self.__init__(path=self._path)
+        # If tags are found, these are used for text extraction. If tags
+        # are not found, a bounding box is used to remove the header and
+        # footer, if instructed.
+        if self._uses_marked_content():
+            match ignore_tags:
+                case None: ignore_tags = {'Artifact'}
+                case 'na': ignore_tags = set()
+            # Involves more processing, but also more accurate.
+            self._extract_text_using_tags(ignore_tags=ignore_tags, remove_newlines=remove_newlines)
+        else:
+            bbox = self._get_crop_coordinates(skip_header=remove_header, skip_footer=remove_footer)
+            self._extract_text_using_bbox(bbox=bbox, remove_newlines=remove_newlines)
+        if convert_to_ascii:
+            for page in self.doc.pages:
+                page.content = unidecode(string=page.content,
+                                         errors='replace',
+                                         replace_str='?')
+
+    def _extract_text_using_bbox(self, **kwargs):
+        """Extract text using a bbox for finding the header and footer.
+
+        :Keyword Arguments:
+            Those passed by the caller, :meth:`~extract_text`.
+
+        """
+        for page in self.doc.parser.pages:
+            text = page.within_bbox(bbox=kwargs['bbox']).extract_text().strip()
+            if kwargs['remove_newlines']:
+                text = text.replace('\n', ' ')
+            self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
+
+    def _extract_text_using_tags(self, **kwargs):
+        """Extract text using tags.
+
+        The tags defined by the ``ignore_tags`` are skipped.
+
+        :Keyword Arguments:
+            Those passed by the caller, :meth:`~extract_text`.
+
+        """
+        # pylint: disable=protected-access
+        ignored = kwargs['ignore_tags']
+        self.doc._tags = True  # Set the doc's 'parsed_using_tags' flag.
+        for page in self.doc.parser.pages:
+            text = ''.join(self._text_from_tags(page=page, ignored=ignored))
+            if kwargs['remove_newlines']:
+                text = text.replace('\n', ' ')
+            self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
+
+    @staticmethod
+    def _text_from_tags(page: pdfplumber.page.Page, ignored: set) -> str:  # pylint: disable=undefined-variable  # noqa
+        """Generate a page of text extracted from tags.
+
+        When extracting text from tags, newlines are not encoded and must
+        be derived. For each character on the page, the top and bottom
+        coordinates are compared to determine when a newline should be
+        inserted. If both the top and bottom of the current character
+        are greater than the previous character, a newline is inserted
+        into the text stream.
+
+        Args:
+            page (pdfplumber.page.Page): Page to be parsed.
+            ignored (set): A set containing the tags to be ignored.
+
+        Yields:
+            str: Each character on the page, providing its tag is not to
+            be ignored. Or, a newline character if the current
+            character's coordinates are greater than (lower on the page)
+            than the previous character.
+
+        """
+        if page.chars:
+            # Micro-optimisation: Push tag filtering down to the C-level.
+            chars = filter(lambda x: x['tag'] not in ignored, page.chars)
+            top, btm = 999, 999
+            for c in chars:
+                if top < c['top'] and btm < c['bottom']:
+                    yield '\n'
+                yield c['text']
+                top, btm = c['top'], c['bottom']
+        yield ''
+
+    def _uses_marked_content(self) -> bool:
+        """Test whether the document can be parsed using tags.
+
+        Marked content allows us to parse the PDF using tags (rather than
+        OCR) which is more accurate not only in terms of character
+        recognition, but also with regard to the structure of the text on
+        a page.
+
+        :Logic:
+            If the document's catalog shows ``Marked: True``, then
+            ``True`` is returned immediately.
+
+            Otherwise, a second attempt is made which detects marked
+            content tags on the first three pages. If no tags are found,
+            a third attempt is made by searching the first 10 pages. If
+            tags are found during either of these attempts, ``True`` is
+            returned immediately.
+
+            Finally, if no marked content or tags were found, ``False``
+            is returned.
+
+        Returns:
+            bool: Returns True if the document can be parsed using marked
+            content tags, otherwise False.
+
+        """
+        # Use pdfminer.six to get the document's catalog.
+        if self.doc.parser.doc.catalog.get('MarkInfo', {}).get('Marked', False):
+            return True
+        # Check only first three pages for tags first, if found, get out.
+        # If not, retry with the first 10 pages.
+        for i in [3, 10]:
+            tags = set(c['tag'] for p in self.doc.parser.pages[:i] for c in p.chars)
+            if tags != {None}:
+                return True
+        return False
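The _text_from_tags generator above derives newlines by comparing each character's top/bottom coordinates with those of the previous character, after filtering out ignored tags. The following standalone sketch illustrates that logic outside the class; the text_from_tags helper and the sample character dicts are hypothetical and only mimic the 'text', 'top', 'bottom' and 'tag' keys of pdfplumber's page.chars records:

    def text_from_tags(chars: list, ignored: set) -> str:
        """Rebuild a text stream, inserting '\n' when a character sits lower
        on the page (larger top *and* bottom) than the previous one."""
        out = []
        top, btm = 999, 999
        for c in (c for c in chars if c['tag'] not in ignored):
            if top < c['top'] and btm < c['bottom']:
                out.append('\n')
            out.append(c['text'])
            top, btm = c['top'], c['bottom']
        return ''.join(out)

    chars = [{'text': 'H', 'top': 72.0, 'bottom': 84.0, 'tag': 'P'},
             {'text': 'i', 'top': 72.0, 'bottom': 84.0, 'tag': 'P'},
             {'text': 'B', 'top': 96.0, 'bottom': 108.0, 'tag': 'P'},   # Next line down the page.
             {'text': 'y', 'top': 96.0, 'bottom': 108.0, 'tag': 'P'},
             {'text': 'e', 'top': 96.0, 'bottom': 108.0, 'tag': 'P'},
             {'text': 'X', 'top': 780.0, 'bottom': 790.0, 'tag': 'Artifact'}]  # Footer char, skipped.

    print(text_from_tags(chars, ignored={'Artifact'}))   # -> 'Hi\nBye'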
docp/parsers/_pptxbaseparser.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides generalised base functionality for
+            parsing PPTX documents.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+.. attention::
+
+    This module is *not* designed to be interacted with
+    directly, only via the appropriate interface class(es).
+
+    Rather, please create an instance of a PPTX document parsing
+    object using the following:
+
+        - :class:`~docp.parsers.pptxparser.PPTXParser`
+
+"""
+# pylint: disable=protected-access
+
+import os
+from pptx import Presentation
+# locals
+try:
+    from libs.utilities import utilities
+    from objects.pptxobject import DocPPTX
+except ImportError:
+    from .libs.utilities import utilities
+    from .objects.pptxobject import DocPPTX
+
+
+class _PPTXBaseParser:
+    """Base class containing generalised PPTX parsing functionality."""
+
+    def __init__(self, path: str):
+        """Private base parser class initialiser.
+
+        Args:
+            path (str): Full path to the document to be parsed.
+
+        """
+        self._path = path
+        self._doc = DocPPTX()
+        self._set_paths()
+        self._open()
+
+    @property
+    def doc(self) -> DocPPTX:
+        """Accessor to the document object."""
+        return self._doc
+
+    def _open(self) -> None:
+        """Open the PPTX document for reading.
+
+        Before opening the file, a test is performed to ensure the PPTX
+        is valid. The file must:
+
+            - exist
+            - be a ZIP archive, per the file signature
+            - have a .pptx file extension
+
+        :Other Operations:
+
+            - Store the ``pptx.Presentation`` parser object returned
+              from the :func:`pptx.Presentation` instance creation into
+              the :attr:`self._doc._parser` attribute.
+            - Store the number of pages into the
+              :attr:`self._doc._npages` attribute.
+            - Store the document's meta data into the
+              :attr:`self._doc._meta` attribute.
+
+        Raises:
+            TypeError: Raised if the file type criteria above are not
+                met.
+
+        """
+        if all((os.path.exists(self._doc._fpath),
+                utilities.iszip(self._doc._fpath),
+                os.path.splitext(self._doc._fpath)[1].lower() == '.pptx')):
+            self._doc._parser = Presentation(self._doc._fpath)
+            self._doc._npages = len(self._doc._parser.slides)
+            self._doc._meta = self._doc._parser.core_properties
+        else:
+            msg = f'{self._doc._fname} is not a valid PPTX file.'
+            raise TypeError(msg)
+
+    def _set_paths(self) -> None:
+        """Set the document's file path attributes."""
+        self._doc._fpath = os.path.realpath(self._path)
+        self._doc._fname = os.path.basename(self._path)
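_PPTXBaseParser._open validates the file before handing it to python-pptx: it must exist, carry a ZIP file signature, and have a .pptx extension. The iszip helper lives in docp/libs/utilities.py, which is not shown in this section; a minimal signature-based check along the same lines might look like this (an illustrative sketch, the packaged implementation may differ):

    import os


    def iszip(path: str) -> bool:
        """Return True if the file starts with one of the standard ZIP magic numbers."""
        with open(path, 'rb') as f:
            return f.read(4) in (b'PK\x03\x04', b'PK\x05\x06', b'PK\x07\x08')


    def looks_like_pptx(path: str) -> bool:
        """Mirror the three checks performed by _PPTXBaseParser._open."""
        return (os.path.exists(path)
                and iszip(path)
                and os.path.splitext(path)[1].lower() == '.pptx')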
docp/parsers/_pptxtextparser.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the logic for parsing text from a PPTX
+            document.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+.. attention::
+
+    This module is *not* designed to be interacted with
+    directly, only via the appropriate interface class(es).
+
+    Rather, please create an instance of a PPTX document parsing
+    object using the following:
+
+        - :class:`~docp.parsers.pptxparser.PPTXParser`
+
+"""
+
+from unidecode import unidecode
+# locals
+try:
+    from .objects._slideobject import SlideObject
+    from .objects._textobject import TextObject
+    from .parsers._pptxbaseparser import _PPTXBaseParser
+except ImportError:
+    from objects._slideobject import SlideObject
+    from objects._textobject import TextObject
+    from parsers._pptxbaseparser import _PPTXBaseParser
+
+
+class _PPTXTextParser(_PPTXBaseParser):
+    """Private PPTX document text parser intermediate class.
+
+    Args:
+        path (str): Full path to the PPTX document.
+
+    :Example:
+
+        Extract text from a PPTX file::
+
+            >>> from docp import PPTXParser
+
+            >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
+            >>> pptx.extract_text()
+
+            # Access the text on slide 1.
+            >>> pg1 = pptx.doc.slides[1].content
+
+    """
+
+    def extract_text(self,
+                     *,
+                     remove_newlines: bool=False,
+                     convert_to_ascii: bool=True,
+                     **kwargs) -> None:
+        """Extract text from the document.
+
+        A list of slides, with extracted content can be accessed using
+        the :attr:`self.doc.slides` attribute.
+
+        Args:
+            remove_newlines (bool, optional): If True, the newline
+                characters are replaced with a space. Defaults to False.
+            convert_to_ascii (bool, optional): When a non-ASCII character
+                is found, an attempt is made to convert it to an
+                associated ASCII character. If a character cannot be
+                converted, it is replaced with a ``'?'``.
+                Defaults to True.
+
+        :Keyword Args:
+            - None
+
+        Returns:
+            None.
+
+        """
+        # pylint: disable=unused-argument  # **kwargs
+        # pylint: disable=unnecessary-dunder-call
+        if len(self.doc.slides) > 1:
+            # Reinitialise the doc object and reopen the document.
+            self.__init__(path=self._path)
+        self._extract_text(remove_newlines=remove_newlines, convert_to_ascii=convert_to_ascii)
+
+    def _extract_text(self, remove_newlines: bool, convert_to_ascii: bool) -> None:
+        """Extract the text from all shapes on all slides.
+
+        Args:
+            remove_newlines (bool): Replace the newline characters with
+                a space.
+            convert_to_ascii (bool): Attempt to convert any non-ASCII
+                characters to their ASCII equivalent.
+
+        The text extracted from each slide is stored as a ``TextObject``
+        which is appended to the slide's ``texts`` attribute.
+
+        """
+        for idx, slide in enumerate(self.doc.parser.slides, 1):
+            _slideobj = SlideObject(pageno=idx, parser=slide)
+            for shape in slide.shapes:
+                if hasattr(shape, 'text'):
+                    if shape.text:
+                        text = shape.text
+                        if remove_newlines:
+                            text = text.replace('\n', ' ')
+                        if convert_to_ascii:
+                            text = unidecode(string=text,
+                                             errors='replace',
+                                             replace_str='?')
+                        _textobj = TextObject(content=text)
+                        _slideobj.texts.append(_textobj)
+            self.doc.slides.append(_slideobj)
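_PPTXTextParser._extract_text walks each slide, then each shape, keeping only shapes that expose a non-empty text attribute. The same traversal written directly against python-pptx looks roughly like this (the 'deck.pptx' path is illustrative):

    from pptx import Presentation

    prs = Presentation('deck.pptx')
    for pageno, slide in enumerate(prs.slides, 1):
        # Pictures, connectors, etc. have no 'text' attribute; skip them.
        texts = [shape.text for shape in slide.shapes
                 if hasattr(shape, 'text') and shape.text]
        print(pageno, texts)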
docp/parsers/pdfparser.py
@@ -0,0 +1,62 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module serves as the public interface for interacting
+            with PDF files and parsing their contents.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+:Comments:  n/a
+
+:Example:   For example code usage, please refer to the
+            :class:`PDFParser` class docstring.
+
+"""
+# pylint: disable=import-error
+# pylint: disable=wrong-import-position
+
+# Set sys.path for relative imports.
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+# locals
+from parsers._pdftableparser import _PDFTableParser
+from parsers._pdftextparser import _PDFTextParser
+
+
+class PDFParser(_PDFTableParser, _PDFTextParser):
+    """PDF document parser.
+
+    Args:
+        path (str): Full path to the PDF document to be parsed.
+
+    :Example:
+
+        Extract text from a PDF file::
+
+            >>> from docp import PDFParser
+
+            >>> pdf = PDFParser(path='/path/to/myfile.pdf')
+            >>> pdf.extract_text()
+
+            # Access the content of page 1.
+            >>> pg1 = pdf.doc.pages[1].content
+
+
+        Extract tables from a PDF file::
+
+            >>> from docp import PDFParser
+
+            >>> pdf = PDFParser('/path/to/myfile.pdf')
+            >>> pdf.extract_tables()
+
+            # Access the first table on page 1.
+            >>> tbl1 = pdf.doc.pages[1].tables[1]
+
+    """
+
+    def __init__(self, path: str):
+        """PDF parser class initialiser."""
+        super().__init__(path=path)
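PDFParser exposes both text and table extraction by inheriting from _PDFTableParser and _PDFTextParser (the table parser is not shown in this section), with super().__init__ walking the method resolution order down to the shared base. A toy sketch of that composition pattern; the class names here are illustrative, not docp's internals:

    class _Base:
        def __init__(self, path: str):
            self.path = path


    class _TableMixin(_Base):
        def extract_tables(self) -> str:
            return f'tables from {self.path}'


    class _TextMixin(_Base):
        def extract_text(self) -> str:
            return f'text from {self.path}'


    class Parser(_TableMixin, _TextMixin):
        """One public object exposing both extract_text() and extract_tables()."""


    p = Parser('/path/to/myfile.pdf')
    print(p.extract_text(), p.extract_tables())
    print([c.__name__ for c in Parser.__mro__])  # Parser, _TableMixin, _TextMixin, _Base, object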
docp/parsers/pptxparser.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module serves as the public interface for interacting
+            with PPTX files and parsing their contents.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+:Comments:  n/a
+
+:Example:   For example code usage, please refer to the
+            :class:`PPTXParser` class docstring.
+
+"""
+
+# Set sys.path for relative imports.
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
+# locals
+try:
+    from .parsers._pptxtextparser import _PPTXTextParser
+except ImportError:
+    from parsers._pptxtextparser import _PPTXTextParser
+
+
+class PPTXParser(_PPTXTextParser):
+    """PPTX document parser.
+
+    Args:
+        path (str): Full path to the PPTX document to be parsed.
+
+    :Example:
+
+        Extract text from a PPTX file::
+
+            >>> from docp import PPTXParser
+
+            >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
+            >>> pptx.extract_text()
+
+            # Access the text on slide 1.
+            >>> pg1 = pptx.doc.slides[1].content
+
+    """
+
+    def __init__(self, path: str):
+        """PPTX parser class initialiser."""
+        super().__init__(path=path)
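Beyond the docstring example, extract_text accepts the remove_newlines and convert_to_ascii switches defined by the underlying _PPTXTextParser. A short usage sketch, assuming an installed docp package and an existing file at the illustrative path:

    from docp import PPTXParser

    pptx = PPTXParser(path='/path/to/myfile.pptx')
    pptx.extract_text(remove_newlines=True, convert_to_ascii=True)
    print(pptx.doc.slides[1].content)  # Text content of slide 1.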
docp/parsers/putilities.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides parser-specific utility functions for
+            the project.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+:Comments:  n/a
+
+"""
+
+# locals
+try:
+    from .libs.utilities import utilities
+    from .parsers.pdfparser import PDFParser
+    from .parsers.pptxparser import PPTXParser
+except ImportError:
+    from libs.utilities import utilities
+    from parsers.pdfparser import PDFParser
+    from parsers.pptxparser import PPTXParser
+
+
+class ParserUtilities:
+    """Parser-based (cross-project) utility functions."""
+
+    def get_parser(self, path: str) -> PDFParser | PPTXParser:
+        """Return the appropriate parser for the file type.
+
+        Args:
+            path (str): Full path to the file to be tested.
+
+        Returns:
+            PDFParser | PPTXParser: The appropriate parser for the file,
+            given the *file signature*; this test is not file extension
+            based.
+
+        """
+        if utilities.ispdf(path=path):
+            return PDFParser
+        if utilities.iszip(path=path):
+            return PPTXParser
+        raise NotImplementedError('A parser is not available for: os.path.basename(path)')
+
+
+putilities = ParserUtilities()
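get_parser selects a parser class from the file signature via utilities.ispdf and utilities.iszip, both defined in docp/libs/utilities.py and not shown in this section. A self-contained sketch of that kind of signature sniffing; the magic numbers are the standard PDF and ZIP headers, while the sniff helper and the dispatch mapping are illustrative:

    def sniff(path: str) -> str:
        """Classify a file by its leading bytes rather than its extension."""
        with open(path, 'rb') as f:
            head = f.read(4)
        if head.startswith(b'%PDF'):
            return 'pdf'
        if head.startswith(b'PK'):      # ZIP container (e.g. .pptx)
            return 'zip'
        return 'unknown'

    # Usage sketch: map the detected signature onto the docp parser classes.
    #   parser_cls = {'pdf': PDFParser, 'zip': PPTXParser}[sniff('/path/to/file')]
    #   doc = parser_cls(path='/path/to/file')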