docp 0.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,273 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing tables from a PDF
5
+ document.
6
+
7
+ :Platform: Linux
8
+ :Developer: J Berendt
9
+ :Email: jeremy.berendt@rolls-royce.com
10
+
11
+ Note: This module is *not* designed to be interacted with
12
+ directly, only via the appropriate interface class(es).
13
+
14
+ Rather, please create an instance of a PDF document parsing
15
+ object using the following:
16
+
17
+ - :class:`pdfparser.PDFParser`
18
+
19
+
20
+ """
21
+ # pylint: disable=import-error
22
+ # pylint: disable=protected-access
23
+ # pylint: disable=wrong-import-order
24
+
25
+ import io
26
+ import os
27
+ import pandas as pd
28
+ import shutil
29
+ # locals
30
+ from parsers._pdfbaseparser import _PDFBaseParser
31
+
32
# TODO: Move to a config file/class. (TOML?)
# Default pdfplumber table-extraction settings: detect both table axes
# from ruled lines, snapping near-aligned vertical lines within 12px.
_SETTINGS = {'vertical_strategy': 'lines',
             'horizontal_strategy':'lines',
             'snap_x_tolerance': 12}
36
+
37
+
38
+ # TODO: Revise the docstring.
39
class _PDFTableParser(_PDFBaseParser):
    """Private PDF document table parser intermediate class.

    Args:
        path (str): Full path to the PDF document.

    :Example:

        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> path = '/path/to/myfile.pdf'
            >>> pdf = PDFParser(path)
            >>> pdf.extract_tables()

            >>> tables = pdf.doc.tables

    """

    def extract_tables(self,
                       table_settings: dict = None,
                       as_dataframe: bool = False,
                       to_csv: bool = True,
                       verbose: bool = False):
        """Extract tables from the document.

        Before a table is extracted, a number of validation tests are
        performed to verify what has been identified as a 'table' is
        actually a table which might be useful to the user.

        Each 'valid' table is written as a CSV file on the user's
        desktop.

        Additionally, the extracted table data is stored to the class'
        :attr:`self.tables` attribute.

        Args:
            table_settings (dict, optional): Table settings to be used
                for the table extraction. Defaults to None, which is
                replaced by the module-level ``_SETTINGS`` value.
            as_dataframe (bool, optional): By default, the extracted
                tables are returned as a list of (lists of lists), for
                example: all_tables[table[rows[data]]]. However, if this
                argument is ``True``, the table data is returned as a
                list of ``pandas.DataFrame`` objects. In this case, the
                first row of the table is used as the header, and all
                remaining rows are treated as data. **Note:** This will
                *not* work properly for all tables. Defaults to False.
            to_csv (bool, optional): Dump extracted table data to a CSV
                file, one per table. Defaults to True.
            verbose (bool, optional): Display how many tables were
                extracted, and the path to their location.

        """
        # pylint: disable=invalid-name
        # pylint: disable=too-many-nested-blocks
        # pylint: disable=unnecessary-dunder-call
        if self._doc.tables:
            # Reinitialise the doc object and reopen the document so a
            # repeated call does not duplicate previously stored tables.
            self.__init__(path=self._path)
        c = 0
        if to_csv:
            self._create_table_directory_path()
        if table_settings is None:
            table_settings = _SETTINGS
        for p in self._doc._pdf.pages:
            tblno = 1
            # Drop 'tables' which are too small to be real tables.
            tables = self._filter_tables(tables=p.find_tables(), threshold=5000)
            for table in tables:
                pc = p.crop(table.bbox)
                data = pc.extract_table(table_settings=table_settings)
                # A useful table has more than one row, and more than
                # one column in every row.
                if all(len(row) > 1 for row in data) and len(data) > 1:
                    # Verify no table rows are found in the most common
                    # rows (header/footer).
                    if not self._table_header_footer(table=data):
                        if not as_dataframe:
                            self._doc._tables.append(data)
                        if to_csv or as_dataframe:
                            buffer = self._to_buffer(data=data)
                            if to_csv:
                                c += self._to_csv(buffer=buffer,
                                                  pageno=p.page_number,
                                                  tableno=tblno)
                            if as_dataframe:
                                self._to_df(buffer=buffer)
                            buffer.close()
                tblno += 1
        if verbose and to_csv:
            print('',
                  'Complete.',
                  f'{c} tables were extracted and stored at the path below.',
                  f'Path: {self._tbl_opath}',
                  sep='\n')

    def _create_table_directory_path(self):
        """Create the output directory for table data.

        The directory path is derived from the document's basename,
        lower-cased, with spaces and hyphens replaced by underscores,
        and rooted under ``~/Desktop/docutils/pdf_tables/``. The path is
        stored to :attr:`self._tbl_opath`. If the directory does not
        exist, it is created.

        """
        # Defined in parent class.
        # pylint: disable=attribute-defined-outside-init
        # Translate ' ' (32) and '-' (45) to '_' for a filesystem-friendly name.
        trans = {32: '_', 45: '_'}
        name = (os.path.splitext(os.path.basename(self._path))[0]
                .lower()
                .translate(trans))
        # NOTE(review): assumes a POSIX-style $HOME with a Desktop dir,
        # per the module's 'Linux' platform statement.
        path = os.path.join(os.environ['HOME'],
                            'Desktop',
                            'docutils',
                            'pdf_tables',
                            name)
        self._tbl_opath = path
        # exist_ok avoids the race between an exists() test and creation.
        os.makedirs(path, exist_ok=True)

    def _create_table_file_path(self, pageno: int, tblno: int) -> str:
        """Create the filename for the table.

        Args:
            pageno (int): Page from which the table was extracted.
            tblno (int): Number of the table on the page, starting at 1.

        Returns:
            str: Explicit path to the file to be written, of the form
            ``<output_dir>/pgNNN_tbNNN.csv``.

        """
        path = os.path.join(self._tbl_opath,
                            f'pg{pageno:03d}_tb{tblno:03d}.csv')
        return path

    @staticmethod
    def _filter_tables(tables: list, threshold: int = 5000) -> list:
        """Remove tables from the passed list which are deemed invalid.

        Args:
            tables (list): A list of tables as detected by the
                :meth:`Page.find_table()` method.
            threshold (int, optional): Minimum pixel area for a detected
                table to be returned. Defaults to 5000.

        :Rationale:
            An 'invalid' table is determined by the number of pixels
            which the table covered. Any table which is less than (N)
            pixels is likely a block of text which has been categorised
            as a 'table', but is not.

        Returns:
            list: A list of tables whose pixel area is greater than
            ``threshold``.

        """
        # bbox is (x0, y0, x1, y1); keep tables whose area exceeds the threshold.
        return [table
                for table in tables
                if ((table.bbox[2] - table.bbox[0])
                    * (table.bbox[3] - table.bbox[1])) > threshold]

    def _table_header_footer(self, table: list[list]) -> bool:
        """Verify a table is not a header or footer.

        Args:
            table (list[list]): Table (a list of lists) to be analysed.

        :Rationale:
            A table is determined to be a header or footer if any of the
            lines contained in the 'common lines list' are found in the
            table.

            If any of these lines are found, the table is determined to
            be a header/footer, True is returned.

        Returns:
            bool: False if the table is *not* a header/footer, otherwise
            True.

        """
        lines = self._scan_common()  # Only re-runs if not already run.
        # r: row; c: cell; l: line
        return any(l in c for l in lines for r in table for c in r if c)

    def _to_buffer(self, data: list[list]) -> io.StringIO:
        """Write the table data into a string buffer.

        Args:
            data (list[list]): The table data as a list of lists to be
                written to a buffer.

        Returns:
            io.StringIO: A string buffer as an ``io.StringIO`` object,
            rewound to position 0, ready for reading.

        """
        b = io.StringIO()
        for row in data:
            line = self._prepare_row(row=row)
            b.write(line)
            b.write('\n')
        b.seek(0)
        return b

    def _to_csv(self, buffer: io.StringIO, pageno: int, tableno: int) -> int:
        """Write a table (from the buffer) to CSV.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.
            pageno (int): Page number from the ``Page`` object.
            tableno (int): Number of the table on the page, based at 1.

        Returns:
            int: 1 if the file was written, otherwise 0. This is used by
            the caller to track the number of CSV files written.

        """
        # seek() returns the new absolute position; a populated buffer
        # therefore yields a truthy (non-zero) end offset.
        if not buffer.seek(0, os.SEEK_END):
            return 0
        path = self._create_table_file_path(pageno=pageno, tblno=tableno)
        with open(path, 'w', encoding='utf-8') as f:
            buffer.seek(0)
            shutil.copyfileobj(buffer, f)
        return 1

    def _to_df(self, buffer: io.StringIO):
        """Write a table (from the buffer) to a DataFrame.

        Once written, the DataFrame is appended to
        :attr:`self._doc._tables` list of tables.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.

        """
        # Only parse a populated buffer (see note in _to_csv).
        if buffer.seek(0, os.SEEK_END):
            buffer.seek(0)
            self._doc._tables.append(pd.read_csv(buffer))
@@ -0,0 +1,253 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing text from a PDF
5
+ document.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ Note: This module is *not* designed to be interacted with
12
+ directly, only via the appropriate interface class(es).
13
+
14
+ Rather, please create an instance of a PDF document parsing
15
+ object using the following:
16
+
17
+ - :class:`pdfparser.PDFParser`
18
+
19
+ Note: **Multi-processing:**
20
+ Text extraction through multi-processing has been tested and
21
+ is not feasible due to an error indicating
22
+ the ``pdfplumber.page.Page`` object can not be pickled. This
23
+ object was being passed into the extraction method as the
24
+ object contains the :func:`extract_text` function.
25
+
26
+ Additionally, multi-threading has also been tested and
27
+ it was determined to be too complex and inefficient. This was
28
+ tested using the ``concurrent.futures.ThreadPoolExecutor``
29
+ class and two documents, 14 and 92 pages; the timings are
30
+ shown below. The multi-threaded approach took longer to
31
+ process and added unnecessary complexity to the code base.
32
+ As a side-effect, the pages are processed and stored out of
33
+ order which would require a re-order, adding more complexity.
34
+
35
+ It has therefore been determined that this module will remain
36
+ single-threaded.
37
+
38
+ **Multi-Thread Timings**
39
+
40
+ **Single-threaded:**
41
+
42
+ - 14 page document: ~2 seconds
43
+ - 92 page document: ~32 seconds
44
+
45
+ **Multi-threaded:**
46
+
47
+ - 14 page document: ~2 seconds
48
+ - 92 page document: ~35 seconds
49
+
50
+ """
51
+ # pylint: disable=import-error
52
+
53
+ from __future__ import annotations
54
+ from unidecode import unidecode
55
+ # locals
56
+ from objects._pageobject import PageObject
57
+ from parsers._pdfbaseparser import _PDFBaseParser
58
+
59
+
60
+ class _PDFTextParser(_PDFBaseParser):
61
+ """Private PDF document text parser intermediate class.
62
+
63
+ Args:
64
+ path (str): Full path to the PDF document.
65
+
66
+ :Example:
67
+
68
+ Extract text from a PDF file::
69
+
70
+ >>> from docp import PDFParser
71
+
72
+ >>> pdf = PDFParser(path='/path/to/myfile.pdf')
73
+ >>> pdf.extract_text()
74
+
75
+ # Access the content of page 1.
76
+ >>> pg1 = pdf.doc.pages[1].content
77
+
78
+ """
79
+
80
+ def extract_text(self,
81
+ *,
82
+ remove_header: bool=False,
83
+ remove_footer: bool=False,
84
+ remove_newlines: bool=False,
85
+ ignore_tags: set=None,
86
+ convert_to_ascii: bool=True):
87
+ """Extract text from the document.
88
+
89
+ If the PDF document contains 'marked content' tags, these tags
90
+ are used to extract the text as this is a more accurate approach
91
+ and respects the structure of the page(s). Otherwise, a bounding
92
+ box method is used to extract the text. If instructed, the
93
+ header and/or footer regions can be excluded.
94
+
95
+ .. tip:
96
+ If a tag-based extract is used, the header/footer should be
97
+ automatically excluded as these will often have an 'Artifact'
98
+ tag, which is excluded by default, by passing
99
+ ``ignore_tags=None``.
100
+
101
+ To *keep* the header and footer, pass ``ignore_tags='na'``.
102
+
103
+ A list of pages, with extracted content can be accessed using
104
+ the :attr:`self.doc.pages` attribute.
105
+
106
+ Args:
107
+ remove_header (bool, optional): If True, the header is
108
+ cropped (skipped) from text extraction. This only applies
109
+ to the bounding box extraction method. Defaults to False.
110
+ remove_footer (bool, optional): If True, the footer is
111
+ cropped (skipped) from text extraction. This only applies
112
+ to the bounding box extraction method. Defaults to False.
113
+ remove_newlines (bool, optional): If True, the newline
114
+ characters are replaced with a space. Defaults to False.
115
+ ignore_tags (set, optional): If provided, these are the
116
+ PDF 'marked content' tags which will be ignored. Note
117
+ that the PDF document must contain tags, otherwise the
118
+ bounding box method is used and this argument is ignored.
119
+ Defaults to ``{'Artifact'}``, as these generally
120
+ relate to a header and/or footer. To include all tags,
121
+ (not skip any) pass this argument as ``'na'``.
122
+ convert_to_ascii (bool, optional): When a non-ASCII character
123
+ is found, an attempt is made to convert it to an
124
+ associated ASCII character. If a character cannot be
125
+ converted, it is replaced with a ``'?'``.
126
+ Defaults to True.
127
+
128
+ Returns:
129
+ None.
130
+
131
+ """
132
+ # pylint: disable=unnecessary-dunder-call
133
+ if len(self.doc.pages) > 1:
134
+ # Reinitialise the doc object and reopen the document.
135
+ self.__init__(path=self._path)
136
+ # If tags are found, these are used for text extraction. If tags
137
+ # are not found, a bounding box is used to remove the header and
138
+ # footer, if instructed.
139
+ if self._uses_marked_content():
140
+ match ignore_tags:
141
+ case None: ignore_tags = {'Artifact'}
142
+ case 'na': ignore_tags = set()
143
+ # Involves more processing, but also more accurate.
144
+ self._extract_text_using_tags(ignore_tags=ignore_tags, remove_newlines=remove_newlines)
145
+ else:
146
+ bbox = self._get_crop_coordinates(skip_header=remove_header, skip_footer=remove_footer)
147
+ self._extract_text_using_bbox(bbox=bbox, remove_newlines=remove_newlines)
148
+ if convert_to_ascii:
149
+ for page in self.doc.pages:
150
+ page.content = unidecode(string=page.content,
151
+ errors='replace',
152
+ replace_str='?')
153
+
154
+ def _extract_text_using_bbox(self, **kwargs):
155
+ """Extract text using a bbox for finding the header and footer.
156
+
157
+ :Keyword Arguments:
158
+ Those passed by the caller, :meth:`~extract_text`.
159
+
160
+ """
161
+ for page in self.doc.parser.pages:
162
+ text = page.within_bbox(bbox=kwargs['bbox']).extract_text().strip()
163
+ if kwargs['remove_newlines']:
164
+ text = text.replace('\n', ' ')
165
+ self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
166
+
167
+ def _extract_text_using_tags(self, **kwargs):
168
+ """Extract text using tags.
169
+
170
+ The tags defined by the ``ignore_tags`` are skipped.
171
+
172
+ :Keyword Arguments:
173
+ Those passed by the caller, :meth:`~extract_text`.
174
+
175
+ """
176
+ # pylint: disable=protected-access
177
+ ignored = kwargs['ignore_tags']
178
+ self.doc._tags = True # Set the doc's 'parsed_using_tags' flag.
179
+ for page in self.doc.parser.pages:
180
+ text = ''.join(self._text_from_tags(page=page, ignored=ignored))
181
+ if kwargs['remove_newlines']:
182
+ text = text.replace('\n', ' ')
183
+ self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
184
+
185
+ @staticmethod
186
+ def _text_from_tags(page: pdfplumber.page.Page, ignored: set) -> str: # pylint: disable=undefined-variable # noqa
187
+ """Generate a page of text extracted from tags.
188
+
189
+ When extracting text from tags, newlines are not encoded and must
190
+ be derived. For each character on the page, the top and bottom
191
+ coordinates are compared to determine when a newline should be
192
+ inserted. If both the top and bottom of the current character
193
+ are greater than the previous character, a newline is inserted
194
+ into the text stream.
195
+
196
+ Args:
197
+ page (pdfplumber.page.Page): Page to be parsed.
198
+ ignored (set): A set containing the tags to be ignored.
199
+
200
+ Yields:
201
+ str: Each character on the page, providing its tag is not to
202
+ be ignored. Or, a newline character if the current
203
+ character's coordinates are greater than (lower on the page)
204
+ than the previous character.
205
+
206
+ """
207
+ if page.chars:
208
+ # Micro-optimisation: Push tag filtering down to the C-level.
209
+ chars = filter(lambda x: x['tag'] not in ignored, page.chars)
210
+ top, btm = 999, 999
211
+ for c in chars:
212
+ if top < c['top'] and btm < c['bottom']:
213
+ yield '\n'
214
+ yield c['text']
215
+ top, btm = c['top'], c['bottom']
216
+ yield ''
217
+
218
+ def _uses_marked_content(self) -> bool:
219
+ """Test wether the document can be parsed using tags.
220
+
221
+ Marked content allows us to parse the PDF using tags (rather than
222
+ OCR) which is more accurate not only in terms of character
223
+ recognition, but also with regard to the structure of the text on
224
+ a page.
225
+
226
+ :Logic:
227
+ If the document's catalog shows ``Marked: True``, then
228
+ ``True`` is returned immediately.
229
+
230
+ Otherwise, a second attempt is made which detects marked
231
+ content tags on the first three pages. If no tags are found,
232
+ a third attempt is made by searching the first 10 pages. If
233
+ tags are found during either of these attempts, ``True`` is
234
+ returned immediately.
235
+
236
+ Finally, if no marked content or tags were found, ``False``
237
+ is returned.
238
+
239
+ Returns:
240
+ bool: Returns True if the document can be parsed using marked
241
+ content tags, otherwise False.
242
+
243
+ """
244
+ # Use pdfminer.six to get the document's catalog.
245
+ if self.doc.parser.doc.catalog.get('MarkInfo', {}).get('Marked', False):
246
+ return True
247
+ # Check only first three pages for tags first, if found, get out.
248
+ # If not, retry with the first 10 pages.
249
+ for i in [3, 10]:
250
+ tags = set(c['tag'] for p in self.doc.parser.pages[:i] for c in p.chars)
251
+ if tags != {None}:
252
+ return True
253
+ return False
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module serves as the public interface for interacting
5
+ with PDF files and parsing their contents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Example: For example code usage, please refer to the
14
+ :class:`PDFParser` class docstring.
15
+
16
+ """
17
+ # pylint: disable=import-error
18
+ # pylint: disable=wrong-import-position
19
+
20
+ # Set sys.path for relative imports.
21
+ import os
22
+ import sys
23
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
24
+ # locals
25
+ from parsers._pdftableparser import _PDFTableParser
26
+ from parsers._pdftextparser import _PDFTextParser
27
+
28
+
29
class PDFParser(_PDFTableParser, _PDFTextParser):
    """PDF document parser.

    This is the public interface class: it combines the table and text
    extraction functionality provided by its (private) parent classes,
    :class:`_PDFTableParser` and :class:`_PDFTextParser`.

    Args:
        path (str): Full path to the PDF document to be parsed.

    :Example:

        Extract text from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser(path='/path/to/myfile.pdf')
            >>> pdf.extract_text()

            # Access the content of page 1.
            >>> pg1 = pdf.doc.pages[1].content


        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser('/path/to/myfile.pdf')
            >>> pdf.extract_tables()

            # Access the first table on page 1.
            >>> tbl1 = pdf.doc.pages[1].tables[1]

    """

    def __init__(self, path: str):
        """PDF parser class initialiser."""
        # No local state: all setup (opening the document, etc.) is
        # delegated up the MRO to the shared base parser class.
        super().__init__(path=path)