docp 0.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docp/loaders/chroma.py ADDED
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the entry point for loading a document
5
+ into a Chroma database.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Example: For example code use, please refer to the
14
+ :class:`ChromaLoader` class docstring.
15
+
16
+ # pylint: disable=import-error
17
+ # pylint: disable=wrong-import-position
18
+ """
19
+
20
+ import os
21
+ import re
22
+ from glob import glob
23
+ # locals
24
+ try:
25
+ from .loaders._chromabaseloader import _ChromaBaseLoader
26
+ except ImportError:
27
+ from loaders._chromabaseloader import _ChromaBaseLoader
28
+
29
+
30
class ChromaLoader(_ChromaBaseLoader):
    """Chroma database document loader.

    Args:
        path (str): Full path to the file (or *directory*) to be parsed
            and loaded. Note: If this is a directory, a specific file
            extension can be passed into the :meth:`load` method using
            the ``ext`` argument.
        dbpath (str): Full path to the Chroma database *directory*.
        collection (str): Name of the Chroma database collection into
            which the data is to be loaded.
        load_keywords (bool, optional): Use the provided LLM
            (via the ``llm`` parameter) to read the document and infer
            keywords to be loaded into the ``<collection>-kwds``
            database, for keyword-driven document filtering.
            Note: This *requires* the ``llm`` parameter and is
            recommended only for GPU-bound processing. Defaults to False.
        llm (object, optional): An LLM *instance* which can be provided
            directly into the
            :func:`langchain.chains.RetrievalQA.from_chain_type` function
            for keyword inference. This is *required* for keyword
            loading. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing**, as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    :Example:

        Parse and load a *single* document into a Chroma database
        collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/file.pdf',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load()


        Parse and load a *directory* of PDF documents into a Chroma
        database collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/directory',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load(ext='pdf')

    """

    def __init__(self,
                 path: str,
                 dbpath: str,
                 collection: str,
                 *,
                 load_keywords: bool=False,
                 llm: object=None,
                 offline: bool=False):
        """Chroma database loader class initialiser."""
        super().__init__(dbpath=dbpath,
                         collection=collection,
                         load_keywords=load_keywords,
                         llm=llm,
                         offline=offline)
        # Stored for the load() method; may be a file or a directory.
        self._path = path

    def load(self,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_header: bool=True,
             remove_footer: bool=True,
             remove_newlines: bool=True,
             ignore_tags: set=None,
             convert_to_ascii: bool=True) -> None:
        """Load a document (or documents) into a Chroma database.

        Args:
            ext (str): If the ``path`` argument refers to a *directory*,
                a specific file extension can be specified here.
                For example::

                    ext = 'pdf'

                If anything other than ``'**'`` is provided, the *first*
                run of alpha-characters is parsed from the string and
                prefixed with ``*.``. Meaning, if ``'.pdf'`` is passed,
                the characters ``'pdf'`` are parsed and prefixed with
                ``*.`` to create ``'*.pdf'``. However, if
                ``'things.foo'`` is passed, the derived extension will
                be ``'*.things'``. Defaults to '**', for a recursive
                search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the
                PDF 'marked content' tags which will be ignored. Note
                that the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is ignored.
                Defaults to ``{'Artifact'}``, as these generally
                relate to a header and/or footer. To include all tags,
                (not skip any) pass this argument as ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        Raises:
            ValueError: If ``ext`` contains no alpha-characters from
                which a file extension can be derived.

        """
        if os.path.isdir(self._path):
            if ext != '**':
                # Derive a '*.<ext>' pattern from the first alpha run.
                found = re.findall('[a-zA-Z]+', ext)
                if not found:
                    raise ValueError(f'A file extension could not be derived from: {ext!r}')
                ext = f'*.{found[0]}'
            # Filter to files only: a recursive '**' pattern can also
            # match directories, which must not be passed to the loader.
            files = [f for f in glob(os.path.join(self._path, ext), recursive=recursive)
                     if os.path.isfile(f)]
            count = len(files)
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                # Forward the cleaning options for each file; previously
                # these were silently dropped for directory processing.
                self._load(path=f,
                           remove_header=remove_header,
                           remove_footer=remove_footer,
                           remove_newlines=remove_newlines,
                           ignore_tags=ignore_tags,
                           convert_to_ascii=convert_to_ascii)
        else:
            print(f'Processing: {os.path.basename(self._path)} ...')
            self._load(path=self._path,
                       remove_header=remove_header,
                       remove_footer=remove_footer,
                       remove_newlines=remove_newlines,
                       ignore_tags=ignore_tags,
                       convert_to_ascii=convert_to_ascii)
File without changes
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the implementation for the
5
+ document-type-specific base class.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+ from __future__ import annotations
16
+ try:
17
+ from .objects._pageobject import PageObject
18
+ except ImportError:
19
+ from objects._pageobject import PageObject
20
+
21
+
22
+ class _DocBase:
23
+ """Private document base class.
24
+
25
+ This class is *not* designed to be interacted with directly, but
26
+ rather to be inherited by the document-type-specific document
27
+ objects.
28
+
29
+ """
30
+
31
+ def __init__(self):
32
+ """Base document object class initialiser."""
33
+ self._common = None # Used by the header/footer scanner.
34
+ self._fname = None # Filename (basename)
35
+ self._fpath = None # Full file path
36
+ self._meta = None # Metadata from the document parger
37
+ self._npages = 0 # Number of pages in the document
38
+ self._ntables = 0 # Number of tables extracted
39
+ self._parser = None # Underlying document parser functionality
40
+ # List of PageObjects, offset by 1 to align the index with page numbers.
41
+ self._pages = [PageObject(pageno=0)]
42
+
43
+ @property
44
+ def basename(self) -> str:
45
+ """Accessor for the file's basename."""
46
+ return self._fname
47
+
48
+ @property
49
+ def filepath(self) -> str:
50
+ """Accessor for the explicit path to this file."""
51
+ return self._fpath
52
+
53
+ @property
54
+ def metadata(self) -> dict | object:
55
+ """The meta data as extracted from the document."""
56
+ return self._meta
57
+
58
+ @property
59
+ def npages(self) -> int:
60
+ """The number of pages successfully extracted from the source."""
61
+ return self._npages
62
+
63
+ @property
64
+ def ntables(self) -> int:
65
+ """The number of tables successfully extracted from the source."""
66
+ return self._ntables
67
+
68
+ @property
69
+ def pages(self) -> list[PageObject]: # noqa pylint: disable=undefined-variable
70
+ """A list of containing an object for each page in the document."""
71
+ return self._pages
72
+
73
+ @property
74
+ def parser(self) -> object:
75
+ """Accessor to the underlying document parser's functionality."""
76
+ return self._parser
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the ``page`` object implementation.
5
+
6
+ :Platform: Linux/Windows | Python 3.10+
7
+ :Developer: J Berendt
8
+ :Email: development@s3dev.uk
9
+
10
+ :Comments: n/a
11
+
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+
17
class PageObject:
    """Implementation of a single document page.

    One instance of this class is created per page of a document,
    populated, and appended to the owning document's ``pages`` list.

    Args:
        content (str, optional): Page content as a single string.
            Defaults to ''.
        pageno (int, optional): Page number. Defaults to 0.
        parser (object, optional): The underlying document parser object.
            Defaults to None.

    """

    __slots__ = ('_content', '_hastext', '_pageno', '_parser', '_tables')

    def __init__(self, content: str='', pageno: int=0, parser: object=None):
        """Page object class initialiser."""
        self._content = content
        self._hastext = bool(content)
        self._pageno = pageno
        self._parser = parser
        self._tables = []

    def __repr__(self) -> str:
        """Formatted representation of this object."""
        if self._pageno != 0:
            return f'<Page: {self._pageno}; Chars: {len(self._content)}>'
        return f'<Page: {self._pageno}; <index offset>>'

    def __str__(self) -> str:
        """Formatted string displayed when printing this object."""
        preview = ''
        if self._content:
            preview = self._content[:25].replace('\n', ' ') + ' ...'
        return (f'Page no: {self._pageno}; '
                f'Content: "{preview}"; '
                f'Chars: {len(self._content)}; '
                f'nTables: {len(self._tables)}; '
                f'Parser avail: {bool(self._parser)}')

    @property
    def content(self) -> str:
        """Accessor to the page's textual content."""
        return self._content

    @content.setter
    def content(self, value: str) -> None:
        """Setter for the ``content`` attribute.

        A falsy ``value`` is ignored; otherwise the content is stored
        and the ``hastext`` flag is raised.

        """
        if not value:
            return
        self._content = value
        self._hastext = True

    @property
    def hastext(self) -> bool:
        """Flag indicating if the ``content`` attribute is populated."""
        return self._hastext

    @property
    def pageno(self) -> int:
        """Accessor to the page number.

        Note:
            This is the page number 1-n, concerning the page's *sequence
            in the overall document*. This is *not* guaranteed to be the
            page's number per the document's page labeling scheme.

        """
        return self._pageno

    @property
    def parser(self) -> object:
        """Accessor to the document parser's internal functionality.

        Note:
            The population of this property is determined by the
            document-type-specific ``docp`` parser. If the underlying
            parsing library has functionality worth preserving and making
            available to the user, it is stored to this property.
            Otherwise, this property will remain as ``None``.

        """
        return self._parser

    @property
    def tables(self) -> list:
        """Accessor to the page's tables, if parsed."""
        return self._tables

    def show(self) -> pdfplumber.display.PageImage:  # pylint: disable=undefined-variable # noqa
        """Display the page as an image.

        Additionally, the return value exposes access to the underlying
        ``pdfplumber`` debugging visualisation methods such as:

            - :func:`img.debug_tablefinder`
            - :func:`img.draw_*`
            - :func:`img.outline_chars`
            - :func:`img.outline_words`
            - :func:`img.reset`
            - etc.

        """
        return self.parser.to_image()
File without changes
File without changes
@@ -0,0 +1,39 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the 'PDF Document' object structure into
5
+ which PDF documents are parsed into for transport and onward
6
+ use.
7
+
8
+ :Platform: Linux/Windows | Python 3.10+
9
+ :Developer: J Berendt
10
+ :Email: development@s3dev.uk
11
+
12
+ :Comments: n/a
13
+
14
+ """
15
+ # pylint: disable=import-error
16
+
17
+ from objects._docbaseobject import _DocBase
18
+
19
+
20
class DocPDF(_DocBase):
    """Container object holding the data parsed from a PDF file."""

    def __init__(self):
        """PDF document object class initialiser."""
        super().__init__()
        # Set True by the parser when 'marked content' tags were used.
        self._tags = False

    @property
    def parsed_using_tags(self) -> bool:
        """Flag indicating if the document was parsed using tags.

        PDF documents may contain 'marked content' tags. When a document
        is parsed *with* tags (as this flag indicates), the parser
        respects columns and other page formatting schemes. Parsing a
        multi-column page *without* tags causes the parser to read
        straight across the line, thus corrupting the text.

        """
        return self._tags
File without changes
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides generalised base functionality for
5
+ parsing PDF documents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ Note: This module is *not* designed to be interacted with
12
+ directly, only via the appropriate interface class(es).
13
+
14
+ Rather, please create an instance of a PDF document parsing
15
+ object using the following:
16
+
17
+ - :class:`pdfparser.PDFParser`
18
+
19
+ """
20
+ # pylint: disable=import-error
21
+ # pylint: disable=protected-access
22
+ # pylint: disable=wrong-import-order
23
+
24
+ import os
25
+ import pdfplumber
26
+ from collections import Counter
27
+ from unidecode import unidecode
28
+ # locals
29
+ from objects.pdfobject import DocPDF
30
+
31
+
32
+ class _PDFBaseParser:
33
+ """Base class containing generalised PDF parsing functionality."""
34
+
35
+ def __init__(self, path: str):
36
+ """Private base parser class initialiser.
37
+
38
+ Args:
39
+ path (str): Full path to the document to be parsed.
40
+
41
+ """
42
+ self._path = path
43
+ self._doc = DocPDF()
44
+ self._tbl_opath = None
45
+ self._set_paths()
46
+ self._open()
47
+
48
+ def __del__(self):
49
+ """Class deconstructor.
50
+
51
+ :Tasks:
52
+ - Ensure the PDF document is closed.
53
+
54
+ """
55
+ if hasattr(self._doc, '_parser'):
56
+ self._doc._parser.close()
57
+
58
+ @property
59
+ def doc(self) -> DocPDF:
60
+ """Accessor to the document object."""
61
+ return self._doc
62
+
63
+ def _get_crop_coordinates(self,
64
+ skip_header: bool=False,
65
+ skip_footer: bool=False) -> tuple[float]:
66
+ """Determine the bounding box coordinates.
67
+
68
+ These coordinates are used for removing the header and/or footer.
69
+
70
+ Args:
71
+ skip_header (bool, optional): If True, set the coordinates
72
+ such that the header is skipped. Defaults to False.
73
+ skip_footer (bool, optional): If True, set the coordinates
74
+ such that the footer is skipped. Defaults to False.
75
+
76
+ :Logic:
77
+ When excluding a header and/or footer, the following page
78
+ numbers are used for header/footer *position* detection,
79
+ given the length of the document:
80
+
81
+ - Number of pages [1]: 1
82
+ - Number of pages [2,10]: 2
83
+ - Number of pages [11,]: 5
84
+
85
+ Returns:
86
+ tuple: A bounding box tuple of the following form, to be
87
+ passed directly into the :func:`Page.crop` method::
88
+
89
+ (x0, top, x1, bottom)
90
+
91
+ """
92
+ npages = self._doc.npages
93
+ match npages:
94
+ case 1: num = 1
95
+ case _ if npages in range(2, 11): num = 2
96
+ case _: num = 5
97
+ pg = self._doc.parser.pages[num] # The pages list has a has a page offset at [0].
98
+ # Default coordinates to the whole page.
99
+ coords = {'x0': 0, 'top': 0, 'x1': pg.width, 'bottom': pg.height}
100
+ # If the header and/or footer is to be skipped, find and iterate
101
+ # through the common lines and overwrite the coordinates as
102
+ # appropriate, given the key and the line's location on the page.
103
+ if skip_header or skip_footer:
104
+ lines = self._scan_common()
105
+ for line in lines:
106
+ s = pg.search(line)
107
+ if s:
108
+ for key in coords:
109
+ v = s[0][key]
110
+ match key:
111
+ case 'top' if v < pg.height/2 and skip_header:
112
+ coords[key] = max(coords[key], v+2)
113
+ case 'bottom' if v > pg.height/2 and skip_footer:
114
+ coords[key] = min(coords[key], v-2)
115
+ return tuple(coords.values())
116
+
117
+ def _open(self) -> None:
118
+ """Open the PDF document for reading.
119
+
120
+ :Other Operations:
121
+
122
+ - Store the ``pdfplumber`` parser object returned from the
123
+ :func:`pdfplumber.open` function into the
124
+ :attr:`self._doc._parser` attribute.
125
+ - Store the number of pages into the
126
+ :attr:`self._doc._npages` attribute.
127
+ - Store the document's meta data into the
128
+ :attr:`self._doc._meta` attribute.
129
+
130
+ """
131
+ self._doc._parser = pdfplumber.open(self._doc._fpath)
132
+ self._doc._npages = len(self._doc._parser.pages)
133
+ self._doc._meta = self._doc._parser.metadata
134
+
135
+ @staticmethod
136
+ def _prepare_row(row: list) -> str:
137
+ """Prepare the table row for writing a table to to CSV.
138
+
139
+ Args:
140
+ row (list): A list of strings, constituting a table row.
141
+
142
+ :Processing Tasks:
143
+
144
+ For each element in the row:
145
+
146
+ - Remove any double quote characters (ASCII and Unicode).
147
+ - Replace any empty values with ``'None'``.
148
+ - If the element contains a comma, wrap the element in
149
+ double quotes.
150
+ - Attempt to convert any non-ASCII characters to an
151
+ associated ASCII character. If the replacement cannot
152
+ be made, the character is replaced with a ``'?'``.
153
+
154
+ Returns:
155
+ str: A processed comma-separated string, ready to be written
156
+ to a CSV file.
157
+
158
+ """
159
+ trans = {34: '', 8220: '', 8221: ''} # Remove double quotes in Unicode.
160
+ row = [e.translate(trans) if e else 'None' for e in row] # Cannot be a generator.
161
+ for idx, e in enumerate(row):
162
+ if ',' in e:
163
+ row[idx] = f'"{e}"' # Escape comma-separation by quoting.
164
+ line = unidecode(','.join(row).replace('\n', ' '), errors='replace', replace_str='?')
165
+ return line
166
+
167
+ def _scan_common(self) -> list[str]:
168
+ """Scan the PDF document to find the most common lines.
169
+
170
+ :Rationale:
171
+ Generally, the most common lines in a document will be the
172
+ header and footer, as these are expected to be repeated on
173
+ each page of the document.
174
+
175
+ 'Most common' is defined as line occurring on 90% of the
176
+ pages throughout the document. Therefore, only documents with
177
+ more than three pages are scanned. Otherwise, the 90% may
178
+ exclude relevant pieces of the document (as was discovered in
179
+ testing).
180
+
181
+ :Logic:
182
+ For documents with more than three pages, the entire PDF is
183
+ read through and each line extracted. The occurrence of each
184
+ line is counted, with the most common occurrences returned
185
+ to the caller.
186
+
187
+ The returned lines are to be passed into a page search to
188
+ determine the x/y coordinates of the header and footer.
189
+
190
+ Returns:
191
+ list: For documents with more than three pages, a list
192
+ containing the most common lines in the document. Otherwise,
193
+ an empty list if returned.
194
+
195
+ """
196
+ # Only scan if document has more than three pages.
197
+ if self._doc.npages < 4:
198
+ return []
199
+ if self._doc.common is None:
200
+ # Create a line generator for all pages.
201
+ lines = (l for p in self._doc.parser.pages for l in p.extract_text().split('\n'))
202
+ # Return the lines whose occurrence rate is 90% of document pages.
203
+ self._doc._common = [i[0] for i in Counter(lines).most_common()
204
+ if i[1] > self._doc.npages * 0.9]
205
+ return self._doc.common
206
+
207
+ def _set_paths(self) -> None:
208
+ """Set the document's file path attributes."""
209
+ self._doc._fpath = os.path.realpath(self._path)
210
+ self._doc._fname = os.path.basename(self._path)