PyPI - docp - Versions diffs - 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

docp 0.1.0b1py3-none-any.whl → 0.2.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

docp/loaders/chromapdfloader.py ADDED Viewed

@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the entry point for loading PDF files
+            into a Chroma database.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+:Comments:  n/a
+:Examples:
+    Parse and load a *single* PDF file into a Chroma database
+    collection::
+        >>> from docp.loaders import ChromaPDFLoader
+        >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
+                                collection='spam')
+        >>> l.load(path='/path/to/directory/myfile.pdf')
+    Parse and load a *directory* of PDF files into a Chroma database
+    collection::
+        >>> from docp.loaders import ChromaPDFLoader
+        >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
+                                collection='spam')
+        >>> l.load(path='/path/to/directory', ext='pdf')
+    For further example code use, please refer to the
+    :class:`ChromaPDFLoader` class docstring.
+"""
+import os
+# locals
+try:
+    from .libs.utilities import utilities
+    from .loaders._chromabasepdfloader import _ChromaBasePDFLoader
+except ImportError:
+    from libs.utilities import utilities
+    from loaders._chromabasepdfloader import _ChromaBasePDFLoader
+class ChromaPDFLoader(_ChromaBasePDFLoader):
+    """Chroma database PDF-specific document loader.
+    Args:
+        dbpath (str | ChromaDB): Either the full path to the Chroma
+            database *directory*, or an instance of a
+            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
+            passed, the ``collection`` argument is ignored.
+        collection (str, optional): Name of the Chroma database
+            collection. Only required if the ``dbpath`` parameter is a
+            path. Defaults to None.
+        split_text (bool, optional): Split the document into chunks,
+            before loading it into the database. Defaults to True.
+        load_keywords (bool, optional): Use an LLM to derive keywords
+            from the document and load these keywords into the sister
+            keywords collection. Defaults to False.
+        llm (object, optional): If deriving keywords, this is the LLM
+            which will do the derivation. Defaults to None.
+        offline (bool, optional): Remain offline and use the locally
+            cached embedding function model. Defaults to False.
+    .. important::
+        The *deriving and loading of keywords* is only recommended for
+        **GPU-bound processing** as the LLM is invoked to infer the
+        keywords for each given document.
+        If called on a 'standard' PC, this will take a *long* time to
+        complete, if it completes at all.
+    :Examples:
+        Parse and load a *single* PDF file into a Chroma database
+        collection::
+            >>> from docp.loaders import ChromaPDFLoader
+            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
+                                    collection='spam')
+            >>> l.load(path='/path/to/directory/myfile.pdf')
+        Parse and load a *directory* of PDF files into a Chroma
+        database collection::
+            >>> from docp.loaders import ChromaPDFLoader
+            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
+                                    collection='spam')
+            >>> l.load(path='/path/to/directory', ext='pdf')
+    """
+    #
+    # No __init__ method here to ensure the ultimate base class'
+    # signature is used and to save passing loads of stuff around, if we
+    # don't have to.
+    #
+    def load(self,
+             path: str,
+             *,
+             ext: str='**',
+             recursive: bool=True,
+             remove_header: bool=True,
+             remove_footer: bool=True,
+             remove_newlines: bool=True,
+             ignore_tags: set=None,
+             convert_to_ascii: bool=True,
+             **unused) -> None:
+        """Load a PDF file (or files) into a Chroma database.
+        If ``path`` is a directory, the matching files are collected and
+        loaded one at a time; otherwise ``path`` is loaded as a single
+        file. A progress message is printed for each file.
+        Args:
+            path (str): Full path to the file (or *directory*) to be
+                parsed and loaded. Note: If this is a directory, a
+                specific file extension can be passed into the
+                :meth:`load` method using the ``ext`` argument.
+            ext (str, optional): If the ``path`` argument refers to a
+                *directory*, a specific file extension can be specified
+                here. For example: ``ext = 'pdf'``.
+                If anything other than ``'**'`` is provided, all
+                alpha-characters are parsed from the string, and prefixed
+                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
+                characters ``'pdf'`` are parsed and prefixed with ``*.``
+                to create ``'*.pdf'``. However, if ``'things.foo'`` is
+                passed, the derived extension will be ``'*.thingsfoo'``.
+                Defaults to '**', for a recursive search.
+            recursive (bool, optional): If True, subdirectories are
+                searched. Defaults to True.
+            remove_header (bool, optional): Attempt to remove the header
+                from each page. Defaults to True.
+            remove_footer (bool, optional): Attempt to remove the footer
+                from each page. Defaults to True.
+            remove_newlines (bool, optional): Replace newline characters
+                with a space. Defaults to True, as this helps with
+                document chunk splitting.
+            ignore_tags (set, optional): If provided, these are the
+                PDF 'marked content' tags which will be ignored. Note
+                that the PDF document must contain tags, otherwise the
+                bounding box method is used and this argument is ignored.
+                Defaults to ``{'Artifact'}``, as these generally
+                relate to a header and/or footer. To include all tags,
+                (not skip any) pass this argument as ``'na'``.
+            convert_to_ascii (bool, optional): Convert all characters to
+                ASCII. Defaults to True.
+        :Keyword Args:
+            unused (dict): This enables keywords to be passed into a
+                loader-agnostic ``.load()`` function without raising an
+                'unexpected keyword argument' ``TypeError``.
+        """
+        # pylint: disable=unused-argument  # They are 'used' via locals().
+        # NOTE(review): ``ignore_tags`` defaults to None in the signature;
+        # the documented ``{'Artifact'}`` default is presumably applied
+        # further downstream - confirm against the parser.
+        # Prepare the arguments being sent to the doc parser.
+        # locals() is captured before any other local name is bound, so it
+        # holds only this method's parameters. Only 'self' and 'path' are
+        # removed; everything else (including ``ext``, ``recursive`` and
+        # the ``unused`` dict, under the key 'unused') is forwarded to
+        # ``_load``.
+        kwargs = self._set_kwargs(locals_=locals())
+        # Load multi
+        if os.path.isdir(path):
+            files = utilities.collect_files(path=path, ext=ext, recursive=recursive)
+            count = len(files)
+            # Start the index at 1 so the progress message reads '1 of N'.
+            for idx, f in enumerate(files, 1):
+                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
+                self._load(path=f, **kwargs)
+        # Load single
+        else:
+            print(f'Processing: {os.path.basename(path)} ...')
+            self._load(path=path, **kwargs)
+    @staticmethod
+    def _set_kwargs(locals_: dict) -> dict:
+        r"""Prepare the arguments which are sent to the doc parser.
+        As :func:`locals()` is used to capture the :meth:`load` method's
+        arguments for passing into the doc parser, some arguments must be
+        removed first. Specifically, the ``'self'`` and ``'path'`` keys
+        are dropped.
+        Args:
+            locals\_ (dict): The return value from a :func:`locals` call.
+        Raises:
+            KeyError: If ``'self'`` or ``'path'`` is not a key in the
+                provided dictionary.
+        Returns:
+            dict: A *copy* of the provided dictionary with specific
+            key/value pairs removed.
+        """
+        # ^^^ The backslash in locals\_ is required for documentation to render correctly.
+        # Work on a (shallow) copy so the caller's mapping is not modified.
+        kwargs = locals_.copy()
+        for k in ['self', 'path']:
+            kwargs.pop(k)
+        return kwargs

docp/loaders/chromapptxloader.py ADDED Viewed

@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the entry point for loading PPTX files
+            into a Chroma database.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+:Comments:  n/a
+:Examples:
+    Parse and load a *single* PPTX file into a Chroma database
+    collection::
+        >>> from docp.loaders import ChromaPPTXLoader
+        >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
+                                 collection='spam',
+                                 split_text=False)
+        >>> l.load(path='/path/to/directory/myfile.pptx')
+    Parse and load a *directory* of PPTX files into a Chroma database
+    collection::
+        >>> from docp.loaders import ChromaPPTXLoader
+        >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
+                                 collection='spam',
+                                 split_text=False)
+        >>> l.load(path='/path/to/directory', ext='pptx')
+    For further example code use, please refer to the
+    :class:`ChromaPPTXLoader` class docstring.
+"""
+import os
+# locals
+try:
+    from .libs.utilities import utilities
+    from .loaders._chromabasepptxloader import _ChromaBasePPTXLoader
+except ImportError:
+    from libs.utilities import utilities
+    from loaders._chromabasepptxloader import _ChromaBasePPTXLoader
+class ChromaPPTXLoader(_ChromaBasePPTXLoader):
+    """Chroma database PPTX-specific document loader.
+    Args:
+        dbpath (str | ChromaDB): Either the full path to the Chroma
+            database *directory*, or an instance of a
+            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
+            passed, the ``collection`` argument is ignored.
+        collection (str, optional): Name of the Chroma database
+            collection. Only required if the ``dbpath`` parameter is a
+            path. Defaults to None.
+        split_text (bool, optional): Split the document into chunks,
+            before loading it into the database. Defaults to True.
+        load_keywords (bool, optional): Derive keywords from the document
+            and load these into the sister keywords collection.
+            Defaults to False.
+        llm (object, optional): If deriving keywords, this is the LLM
+            which will do the derivation. Defaults to None.
+        offline (bool, optional): Remain offline and use the locally
+            cached embedding function model. Defaults to False.
+    .. important::
+        The *deriving and loading of keywords* is only recommended for
+        **GPU-bound processing**, as the LLM is invoked to infer the
+        keywords for each given document.
+        If called on a 'standard' PC, this will take a *long* time to
+        complete, if it completes at all.
+    .. tip::
+        It is recommended to pass ``split_text=False`` into the
+        :class:`ChromaPPTXLoader` constructor.
+        Often, PowerPoint presentations are structured such that related
+        text is found in the same 'shape' (textbox) on a slide.
+        Splitting the text in these shapes may have undesired results.
+    :Examples:
+        Parse and load a *single* PPTX file into a Chroma database
+        collection::
+            >>> from docp.loaders import ChromaPPTXLoader
+            >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
+                                     collection='spam',
+                                     split_text=False)  # <-- Note this
+            >>> l.load(path='/path/to/directory/myfile.pptx')
+        Parse and load a *directory* of PPTX files into a Chroma database
+        collection::
+            >>> from docp.loaders import ChromaPPTXLoader
+            >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
+                                     collection='spam',
+                                     split_text=False)  # <-- Note this
+            >>> l.load(path='/path/to/directory', ext='pptx')
+    """
+    def load(self,
+             path: str,
+             *,
+             ext: str='**',
+             recursive: bool=True,
+             remove_newlines: bool=True,
+             convert_to_ascii: bool=True,
+             **unused) -> None:
+        """Load a PPTX file (or files) into a Chroma database.
+        If ``path`` is a directory, the matching files are collected and
+        loaded one at a time; otherwise ``path`` is loaded as a single
+        file. A progress message is printed for each file.
+        Args:
+            path (str): Full path to the file (or *directory*) to be
+                parsed and loaded. Note: If this is a directory, a
+                specific file extension can be passed into the
+                :meth:`load` method using the ``ext`` argument.
+            ext (str, optional): If the ``path`` argument refers to a
+                *directory*, a specific file extension can be specified
+                here. For example: ``ext = 'pptx'``.
+                If anything other than ``'**'`` is provided, all
+                alpha-characters are parsed from the string, and prefixed
+                with ``*.``. Meaning, if ``'.pptx'`` is passed, the
+                characters ``'pptx'`` are parsed and prefixed with ``*.``
+                to create ``'*.pptx'``. However, if ``'things.foo'`` is
+                passed, the derived extension will be ``'*.thingsfoo'``.
+                Defaults to '**', for a recursive search.
+            recursive (bool, optional): If True, subdirectories are
+                searched. Defaults to True.
+            remove_newlines (bool, optional): Replace newline characters
+                with a space. Defaults to True, as this helps with
+                document chunk splitting.
+            convert_to_ascii (bool, optional): Convert all characters to
+                ASCII. Defaults to True.
+        :Keyword Args:
+            unused (dict): This enables keywords such as ``remove_header``
+                and ``remove_footer`` (for example) to be passed into a
+                loader-agnostic ``.load()`` function without raising an
+                'unexpected keyword argument' ``TypeError``.
+        """
+        # pylint: disable=unused-argument  # They are 'used' via locals().
+        # Prepare the arguments being sent to the doc parser.
+        # locals() is captured before any other local name is bound, so it
+        # holds only this method's parameters. Only 'self' and 'path' are
+        # removed; everything else (including ``ext``, ``recursive`` and
+        # the ``unused`` dict, under the key 'unused') is forwarded to
+        # ``_load``.
+        kwargs = self._set_kwargs(locals_=locals())
+        # Load multi
+        if os.path.isdir(path):
+            files = utilities.collect_files(path=path, ext=ext, recursive=recursive)
+            count = len(files)
+            # Start the index at 1 so the progress message reads '1 of N'.
+            for idx, f in enumerate(files, 1):
+                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
+                self._load(path=f, **kwargs)
+        # Load single
+        else:
+            print(f'Processing: {os.path.basename(path)} ...')
+            self._load(path=path, **kwargs)
+    @staticmethod
+    def _set_kwargs(locals_: dict) -> dict:
+        r"""Prepare the arguments which are sent to the doc parser.
+        As :func:`locals()` is used to capture the :meth:`load` method's
+        arguments for passing into the doc parser, some arguments must be
+        removed first. Specifically, the ``'self'`` and ``'path'`` keys
+        are dropped.
+        Args:
+            locals\_ (dict): The return value from a :func:`locals` call.
+        Raises:
+            KeyError: If ``'self'`` or ``'path'`` is not a key in the
+                provided dictionary.
+        Returns:
+            dict: A *copy* of the provided dictionary with specific
+            key/value pairs removed.
+        """
+        # ^^^ The backslash in locals\_ is required for documentation to render correctly.
+        # Work on a (shallow) copy so the caller's mapping is not modified.
+        kwargs = locals_.copy()
+        for k in ['self', 'path']:
+            kwargs.pop(k)
+        return kwargs

docp/loaders/lutilities.py ADDED Viewed

@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides loader-specific utility functions for
+            the project.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+:Comments:  This module is here (in the ``docp/loaders``) directory
+            rather than merged with the ``docp/parsers/putilities.py``
+            module as the loaders' dependencies are *heavy*. Keeping the
+            loader functionality separate helps to ease the dependency
+            requirements for parser-only projects.
+"""
+# locals
+try:
+    from .libs.utilities import utilities
+    from .loaders.chromapdfloader import ChromaPDFLoader
+    from .loaders.chromapptxloader import ChromaPPTXLoader
+except ImportError:
+    from libs.utilities import utilities
+    from loaders.chromapdfloader import ChromaPDFLoader
+    from loaders.chromapptxloader import ChromaPPTXLoader
+class LoaderUtilities:
+    """Loader-based (cross-project) utility functions."""
+    def get_loader(self, path: str) -> ChromaPDFLoader | ChromaPPTXLoader:
+        """Return the appropriate loader for the file type.
+        Args:
+            path (str): Full path to the file to be tested.
+        Returns:
+            ChromaPDFLoader | ChromaPPTXLoader: The appropriate loader
+            for the file, given the *file signature*; this test is not
+            file extension based.
+        """
+        if utilities.ispdf(path=path):
+            return ChromaPDFLoader
+        if utilities.iszip(path=path):
+            return ChromaPPTXLoader
+        raise NotImplementedError('A loader is not available for: os.path.basename(path)')
+# Module-level instance of the loader utilities class.
+lutilities = LoaderUtilities()

docp/objects/_docbaseobject.py CHANGED Viewed

@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-:Purpose:   This module provides the implementation for the
-            document-type-specific base class.
+:Purpose:   This module provides the generalised base functionality for
+            the document-type-specific base classes.
 :Platform:  Linux/Windows | Python 3.10+
 :Developer: J Berendt
@@ -12,19 +12,15 @@
 """
-from __future__ import annotations
-try:
-    from .objects._pageobject import PageObject
-except ImportError:
-    from objects._pageobject import PageObject
 class _DocBase:
     """Private document base class.
-    This class is *not* designed to be interacted with directly, but
-    rather to be inherited by the document-type-specific document
-    objects.
+    .. attention::
+        This class is *not* designed to be interacted with directly, but
+        rather to be inherited by the document-type-specific document
+        objects.
     """
@@ -37,8 +33,6 @@ class _DocBase:
         self._npages = 0        # Number of pages in the document
         self._ntables = 0       # Number of tables extracted
         self._parser = None     # Underlying document parser functionality
-        # List of PageObjects, offset by 1 to align the index with page numbers.
-        self._pages = [PageObject(pageno=0)]
     @property
     def basename(self) -> str:
@@ -65,11 +59,6 @@ class _DocBase:
         """The number of tables successfully extracted from the source."""
         return self._ntables
-    @property
-    def pages(self) -> list[PageObject]:  # noqa pylint: disable=undefined-variable
-        """A list of containing an object for each page in the document."""
-        return self._pages
     @property
     def parser(self) -> object:
         """Accessor to the underlying document parser's functionality."""

docp/objects/_imgobject.py ADDED Viewed

File without changes

docp/objects/_pageobject.py CHANGED Viewed

@@ -1,7 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-:Purpose:   This module provides the ``page`` object implementation.
+:Purpose:   This module provides the implementation for the
+            ``PageObject`` object.
 :Platform:  Linux/Windows | Python 3.10+
 :Developer: J Berendt
@@ -82,7 +83,7 @@ class PageObject:
         """Accessor to the page number.
         Note:
-            This is the page number 1-n, concerning the page's *sequence
+            This is the page number with regard to the page's *sequence
             in the overall document*. This is *not* guaranteed to be the
             page's number per the document's page labeling scheme.

docp/objects/_slideobject.py ADDED Viewed

@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the implementation for the
+            ``SlideObject`` object.
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+:Comments:  n/a
+"""
+class SlideObject:
+    r"""This class provides the implementation for the ``SlideObject``.
+    For each slide in a document (e.g. PowerPoint), an instance of this
+    class is created, populated and appended into the PPTX document's
+    ``slides`` list attribute.
+    Args:
+        pageno (int, optional): Page (slide) number. A value of 0 is
+            rendered by :meth:`__str__` as an index-offset placeholder.
+            Defaults to 0.
+        parser (object, optional): The underlying document parser object.
+            Defaults to None.
+    .. tip::
+        To display the textual contents of a slide, simply call the
+        following, where 42 is the slide to be displayed::
+            >>> print(*pptx.doc.slides[42].texts, sep='\n\n')
+    """
+    # Fixed attribute set; no per-instance __dict__ is created.
+    __slots__ = ('_imgs', '_tables', '_texts', '_pageno', '_parser')
+    def __init__(self, pageno: int=0, parser: object=None):
+        """Slide object class initialiser."""
+        # The containers start empty; this class does not populate them.
+        self._imgs = []
+        self._tables = []
+        self._texts = []
+        self._pageno = pageno
+        self._parser = parser
+    def __repr__(self) -> str:
+        """Formatted representation of this object."""
+        return f'<Slide: {self._pageno}>'
+    def __str__(self) -> str:
+        """Formatted representation of this object, when printed.
+        Slide number 0 is reported as an index-offset placeholder;
+        any other slide reports its text block, table and image counts
+        and whether a parser object is attached.
+        """
+        if self._pageno == 0:
+            return f'<Slide: {self._pageno}; <index offset>>'
+        return (f'<Slide: {self._pageno}; '
+                f'Text blocks: {len(self._texts)}; '
+                f'Tables: {len(self._tables)}; '
+                f'Images: {len(self._imgs)}; '
+                f'Parser: {bool(self._parser)}>')
+    @property
+    def content(self) -> str:
+        """Accessor to the textual content of a slide.
+        Each object in the slide's text list is expected to expose a
+        ``content`` attribute, which is what gets concatenated.
+        Returns:
+            str: A concatenated string for all text objects found on the
+            slide; each object separated by a double-newline.
+        """
+        return '\n\n'.join(i.content for i in self._texts)
+    @property
+    def images(self) -> list:
+        """Accessor to a slide's image objects."""
+        return self._imgs
+    @property
+    def pageno(self) -> int:
+        """Accessor to the page number.
+        Note:
+            This is the page number with regard to the page's *sequence
+            in the overall document*. This is *not* guaranteed to be the
+            page's number per the document's page labeling scheme.
+        """
+        return self._pageno
+    @property
+    def parser(self) -> object:
+        """Accessor to the document parser's internal functionality.
+        Note:
+            The population of this property is determined by the
+            document-type-specific ``docp`` parser. If the underlying
+            parsing library has functionality worth preserving and making
+            available to the user, it is stored to this property.
+            Otherwise, this property will remain as ``None``.
+        """
+        return self._parser
+    @property
+    def tables(self) -> list:
+        """Accessor to a slide's table objects."""
+        return self._tables
+    @property
+    def texts(self) -> list:
+        """Accessor to a slide's text objects."""
+        return self._texts

docp 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl

docp 0.1.0b1py3-none-any.whl → 0.2.0py3-none-any.whl