docp 0.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docp/loaders/chroma.py ADDED
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the entry point for loading a document
5
+ into a Chroma database.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Example: For example code use, please refer to the
14
+ :class:`ChromaLoader` class docstring.
15
+
16
+ # pylint: disable=import-error
17
+ # pylint: disable=wrong-import-position
18
+ """
19
+
20
+ import os
21
+ import re
22
+ from glob import glob
23
+ # locals
24
+ try:
25
+ from .loaders._chromabaseloader import _ChromaBaseLoader
26
+ except ImportError:
27
+ from loaders._chromabaseloader import _ChromaBaseLoader
28
+
29
+
30
class ChromaLoader(_ChromaBaseLoader):
    """Chroma database document loader.

    Args:
        path (str): Full path to the file (or *directory*) to be parsed
            and loaded. Note: If this is a directory, a specific file
            extension can be passed into the :meth:`load` method using
            the ``ext`` argument.
        dbpath (str): Full path to the Chroma database *directory*.
        collection (str): Name of the Chroma database collection into
            which the data is to be loaded.
        load_keywords (bool, optional): Use the provided LLM
            (via the ``llm`` parameter) to read the document and infer
            keywords to be loaded into the ``<collection>-kwds``
            database, for keyword-driven document filtering.
            Note: This *requires* the ``llm`` parameter and is
            recommended only for GPU-bound processing. Defaults to False.
        llm (object, optional): An LLM *instance* which can be provided
            directly into the
            :func:`langchain.chains.RetrievalQA.from_chain_type` function
            for keyword inference. This is *required* for keyword
            loading. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing**, as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    :Example:

        Parse and load a *single* document into a Chroma database
        collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/file.pdf',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load()


        Parse and load a *directory* of PDF documents into a Chroma
        database collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/directory',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load(ext='pdf')

    """

    def __init__(self,
                 path: str,
                 dbpath: str,
                 collection: str,
                 *,
                 load_keywords: bool=False,
                 llm: object=None,
                 offline: bool=False):
        """Chroma database loader class initialiser."""
        super().__init__(dbpath=dbpath,
                         collection=collection,
                         load_keywords=load_keywords,
                         llm=llm,
                         offline=offline)
        # Stored for the load() method; may be a file or a directory.
        self._path = path

    def load(self,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_header: bool=True,
             remove_footer: bool=True,
             remove_newlines: bool=True,
             ignore_tags: set=None,
             convert_to_ascii: bool=True) -> None:
        """Load a document (or documents) into a Chroma database.

        Args:
            ext (str): If the ``path`` argument refers to a *directory*,
                a specific file extension can be specified here.
                For example::

                    ext = 'pdf'

                If anything other than ``'**'`` is provided, the *first*
                run of alpha-characters is parsed from the string and
                prefixed with ``*.``. Meaning, if ``'.pdf'`` is passed,
                the characters ``'pdf'`` are parsed and prefixed with
                ``*.`` to create ``'*.pdf'``. However, if
                ``'things.foo'`` is passed, the derived extension will
                be ``'*.things'``. Defaults to '**', for a recursive
                search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the
                PDF 'marked content' tags which will be ignored. Note
                that the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is ignored.
                Defaults to ``{'Artifact'}``, as these generally
                relate to a header and/or footer. To include all tags,
                (not skip any) pass this argument as ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        Raises:
            ValueError: If ``ext`` contains no alpha-characters from
                which a file extension can be derived.

        """
        if os.path.isdir(self._path):
            if ext != '**':
                # Derive a '*.<ext>' pattern from the first alpha run.
                found = re.findall('[a-zA-Z]+', ext)
                if not found:
                    raise ValueError(f'A file extension could not be derived from: {ext!r}')
                ext = f'*.{found[0]}'
            # Filter to files only: a recursive '**' pattern can also
            # match directories, which must not be passed to the loader.
            files = [f for f in glob(os.path.join(self._path, ext), recursive=recursive)
                     if os.path.isfile(f)]
            count = len(files)
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                # Forward the cleaning options for each file; previously
                # these were silently dropped for directory processing.
                self._load(path=f,
                           remove_header=remove_header,
                           remove_footer=remove_footer,
                           remove_newlines=remove_newlines,
                           ignore_tags=ignore_tags,
                           convert_to_ascii=convert_to_ascii)
        else:
            print(f'Processing: {os.path.basename(self._path)} ...')
            self._load(path=self._path,
                       remove_header=remove_header,
                       remove_footer=remove_footer,
                       remove_newlines=remove_newlines,
                       ignore_tags=ignore_tags,
                       convert_to_ascii=convert_to_ascii)
File without changes
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the implementation for the
5
+ document-type-specific base class.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+ from __future__ import annotations
16
+ try:
17
+ from .objects._pageobject import PageObject
18
+ except ImportError:
19
+ from objects._pageobject import PageObject
20
+
21
+
22
+ class _DocBase:
23
+ """Private document base class.
24
+
25
+ This class is *not* designed to be interacted with directly, but
26
+ rather to be inherited by the document-type-specific document
27
+ objects.
28
+
29
+ """
30
+
31
+ def __init__(self):
32
+ """Base document object class initialiser."""
33
+ self._common = None # Used by the header/footer scanner.
34
+ self._fname = None # Filename (basename)
35
+ self._fpath = None # Full file path
36
+ self._meta = None # Metadata from the document parger
37
+ self._npages = 0 # Number of pages in the document
38
+ self._ntables = 0 # Number of tables extracted
39
+ self._parser = None # Underlying document parser functionality
40
+ # List of PageObjects, offset by 1 to align the index with page numbers.
41
+ self._pages = [PageObject(pageno=0)]
42
+
43
+ @property
44
+ def basename(self) -> str:
45
+ """Accessor for the file's basename."""
46
+ return self._fname
47
+
48
+ @property
49
+ def filepath(self) -> str:
50
+ """Accessor for the explicit path to this file."""
51
+ return self._fpath
52
+
53
+ @property
54
+ def metadata(self) -> dict | object:
55
+ """The meta data as extracted from the document."""
56
+ return self._meta
57
+
58
+ @property
59
+ def npages(self) -> int:
60
+ """The number of pages successfully extracted from the source."""
61
+ return self._npages
62
+
63
+ @property
64
+ def ntables(self) -> int:
65
+ """The number of tables successfully extracted from the source."""
66
+ return self._ntables
67
+
68
+ @property
69
+ def pages(self) -> list[PageObject]: # noqa pylint: disable=undefined-variable
70
+ """A list of containing an object for each page in the document."""
71
+ return self._pages
72
+
73
+ @property
74
+ def parser(self) -> object:
75
+ """Accessor to the underlying document parser's functionality."""
76
+ return self._parser
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the ``page`` object implementation.
5
+
6
+ :Platform: Linux/Windows | Python 3.10+
7
+ :Developer: J Berendt
8
+ :Email: development@s3dev.uk
9
+
10
+ :Comments: n/a
11
+
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+
17
class PageObject:
    """Implementation of a single document page.

    One instance of this class is created per page of a document,
    populated, and appended to the owning document's ``pages`` list.

    Args:
        content (str, optional): Page content as a single string.
            Defaults to ''.
        pageno (int, optional): Page number. Defaults to 0.
        parser (object, optional): The underlying document parser object.
            Defaults to None.

    """

    __slots__ = ('_content', '_hastext', '_pageno', '_parser', '_tables')

    def __init__(self, content: str='', pageno: int=0, parser: object=None):
        """Page object class initialiser."""
        self._content = content
        self._hastext = bool(content)
        self._pageno = pageno
        self._parser = parser
        self._tables = []

    def __repr__(self) -> str:
        """Formatted representation of this object."""
        if self._pageno != 0:
            return f'<Page: {self._pageno}; Chars: {len(self._content)}>'
        return f'<Page: {self._pageno}; <index offset>>'

    def __str__(self) -> str:
        """Formatted string displayed when printing this object."""
        preview = ''
        if self._content:
            preview = self._content[:25].replace('\n', ' ') + ' ...'
        return (f'Page no: {self._pageno}; '
                f'Content: "{preview}"; '
                f'Chars: {len(self._content)}; '
                f'nTables: {len(self._tables)}; '
                f'Parser avail: {bool(self._parser)}')

    @property
    def content(self) -> str:
        """Accessor to the page's textual content."""
        return self._content

    @content.setter
    def content(self, value: str) -> None:
        """Setter for the ``content`` attribute.

        A falsy ``value`` is ignored; otherwise the content is stored
        and the ``hastext`` flag is raised.

        """
        if not value:
            return
        self._content = value
        self._hastext = True

    @property
    def hastext(self) -> bool:
        """Flag indicating if the ``content`` attribute is populated."""
        return self._hastext

    @property
    def pageno(self) -> int:
        """Accessor to the page number.

        Note:
            This is the page number 1-n, concerning the page's *sequence
            in the overall document*. This is *not* guaranteed to be the
            page's number per the document's page labeling scheme.

        """
        return self._pageno

    @property
    def parser(self) -> object:
        """Accessor to the document parser's internal functionality.

        Note:
            The population of this property is determined by the
            document-type-specific ``docp`` parser. If the underlying
            parsing library has functionality worth preserving and making
            available to the user, it is stored to this property.
            Otherwise, this property will remain as ``None``.

        """
        return self._parser

    @property
    def tables(self) -> list:
        """Accessor to the page's tables, if parsed."""
        return self._tables

    def show(self) -> pdfplumber.display.PageImage:  # pylint: disable=undefined-variable # noqa
        """Display the page as an image.

        Additionally, the return value exposes access to the underlying
        ``pdfplumber`` debugging visualisation methods such as:

            - :func:`img.debug_tablefinder`
            - :func:`img.draw_*`
            - :func:`img.outline_chars`
            - :func:`img.outline_words`
            - :func:`img.reset`
            - etc.

        """
        return self.parser.to_image()
File without changes
File without changes
@@ -0,0 +1,39 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the 'PDF Document' object structure into
5
+ which PDF documents are parsed into for transport and onward
6
+ use.
7
+
8
+ :Platform: Linux/Windows | Python 3.10+
9
+ :Developer: J Berendt
10
+ :Email: development@s3dev.uk
11
+
12
+ :Comments: n/a
13
+
14
+ """
15
+ # pylint: disable=import-error
16
+
17
+ from objects._docbaseobject import _DocBase
18
+
19
+
20
class DocPDF(_DocBase):
    """Container object holding the data parsed from a PDF file."""

    def __init__(self):
        """PDF document object class initialiser."""
        super().__init__()
        # Set True by the parser when 'marked content' tags were used.
        self._tags = False

    @property
    def parsed_using_tags(self) -> bool:
        """Flag indicating if the document was parsed using tags.

        PDF documents may contain 'marked content' tags. When a document
        is parsed *with* tags (as this flag indicates), the parser
        respects columns and other page formatting schemes. Parsing a
        multi-column page *without* tags causes the parser to read
        straight across the line, thus corrupting the text.

        """
        return self._tags
File without changes
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides generalised base functionality for
5
+ parsing PDF documents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ Note: This module is *not* designed to be interacted with
12
+ directly, only via the appropriate interface class(es).
13
+
14
+ Rather, please create an instance of a PDF document parsing
15
+ object using the following:
16
+
17
+ - :class:`pdfparser.PDFParser`
18
+
19
+ """
20
+ # pylint: disable=import-error
21
+ # pylint: disable=protected-access
22
+ # pylint: disable=wrong-import-order
23
+
24
+ import os
25
+ import pdfplumber
26
+ from collections import Counter
27
+ from unidecode import unidecode
28
+ # locals
29
+ from objects.pdfobject import DocPDF
30
+
31
+
32
+ class _PDFBaseParser:
33
+ """Base class containing generalised PDF parsing functionality."""
34
+
35
+ def __init__(self, path: str):
36
+ """Private base parser class initialiser.
37
+
38
+ Args:
39
+ path (str): Full path to the document to be parsed.
40
+
41
+ """
42
+ self._path = path
43
+ self._doc = DocPDF()
44
+ self._tbl_opath = None
45
+ self._set_paths()
46
+ self._open()
47
+
48
+ def __del__(self):
49
+ """Class deconstructor.
50
+
51
+ :Tasks:
52
+ - Ensure the PDF document is closed.
53
+
54
+ """
55
+ if hasattr(self._doc, '_parser'):
56
+ self._doc._parser.close()
57
+
58
+ @property
59
+ def doc(self) -> DocPDF:
60
+ """Accessor to the document object."""
61
+ return self._doc
62
+
63
+ def _get_crop_coordinates(self,
64
+ skip_header: bool=False,
65
+ skip_footer: bool=False) -> tuple[float]:
66
+ """Determine the bounding box coordinates.
67
+
68
+ These coordinates are used for removing the header and/or footer.
69
+
70
+ Args:
71
+ skip_header (bool, optional): If True, set the coordinates
72
+ such that the header is skipped. Defaults to False.
73
+ skip_footer (bool, optional): If True, set the coordinates
74
+ such that the footer is skipped. Defaults to False.
75
+
76
+ :Logic:
77
+ When excluding a header and/or footer, the following page
78
+ numbers are used for header/footer *position* detection,
79
+ given the length of the document:
80
+
81
+ - Number of pages [1]: 1
82
+ - Number of pages [2,10]: 2
83
+ - Number of pages [11,]: 5
84
+
85
+ Returns:
86
+ tuple: A bounding box tuple of the following form, to be
87
+ passed directly into the :func:`Page.crop` method::
88
+
89
+ (x0, top, x1, bottom)
90
+
91
+ """
92
+ npages = self._doc.npages
93
+ match npages:
94
+ case 1: num = 1
95
+ case _ if npages in range(2, 11): num = 2
96
+ case _: num = 5
97
+ pg = self._doc.parser.pages[num] # The pages list has a has a page offset at [0].
98
+ # Default coordinates to the whole page.
99
+ coords = {'x0': 0, 'top': 0, 'x1': pg.width, 'bottom': pg.height}
100
+ # If the header and/or footer is to be skipped, find and iterate
101
+ # through the common lines and overwrite the coordinates as
102
+ # appropriate, given the key and the line's location on the page.
103
+ if skip_header or skip_footer:
104
+ lines = self._scan_common()
105
+ for line in lines:
106
+ s = pg.search(line)
107
+ if s:
108
+ for key in coords:
109
+ v = s[0][key]
110
+ match key:
111
+ case 'top' if v < pg.height/2 and skip_header:
112
+ coords[key] = max(coords[key], v+2)
113
+ case 'bottom' if v > pg.height/2 and skip_footer:
114
+ coords[key] = min(coords[key], v-2)
115
+ return tuple(coords.values())
116
+
117
+ def _open(self) -> None:
118
+ """Open the PDF document for reading.
119
+
120
+ :Other Operations:
121
+
122
+ - Store the ``pdfplumber`` parser object returned from the
123
+ :func:`pdfplumber.open` function into the
124
+ :attr:`self._doc._parser` attribute.
125
+ - Store the number of pages into the
126
+ :attr:`self._doc._npages` attribute.
127
+ - Store the document's meta data into the
128
+ :attr:`self._doc._meta` attribute.
129
+
130
+ """
131
+ self._doc._parser = pdfplumber.open(self._doc._fpath)
132
+ self._doc._npages = len(self._doc._parser.pages)
133
+ self._doc._meta = self._doc._parser.metadata
134
+
135
+ @staticmethod
136
+ def _prepare_row(row: list) -> str:
137
+ """Prepare the table row for writing a table to to CSV.
138
+
139
+ Args:
140
+ row (list): A list of strings, constituting a table row.
141
+
142
+ :Processing Tasks:
143
+
144
+ For each element in the row:
145
+
146
+ - Remove any double quote characters (ASCII and Unicode).
147
+ - Replace any empty values with ``'None'``.
148
+ - If the element contains a comma, wrap the element in
149
+ double quotes.
150
+ - Attempt to convert any non-ASCII characters to an
151
+ associated ASCII character. If the replacement cannot
152
+ be made, the character is replaced with a ``'?'``.
153
+
154
+ Returns:
155
+ str: A processed comma-separated string, ready to be written
156
+ to a CSV file.
157
+
158
+ """
159
+ trans = {34: '', 8220: '', 8221: ''} # Remove double quotes in Unicode.
160
+ row = [e.translate(trans) if e else 'None' for e in row] # Cannot be a generator.
161
+ for idx, e in enumerate(row):
162
+ if ',' in e:
163
+ row[idx] = f'"{e}"' # Escape comma-separation by quoting.
164
+ line = unidecode(','.join(row).replace('\n', ' '), errors='replace', replace_str='?')
165
+ return line
166
+
167
+ def _scan_common(self) -> list[str]:
168
+ """Scan the PDF document to find the most common lines.
169
+
170
+ :Rationale:
171
+ Generally, the most common lines in a document will be the
172
+ header and footer, as these are expected to be repeated on
173
+ each page of the document.
174
+
175
+ 'Most common' is defined as line occurring on 90% of the
176
+ pages throughout the document. Therefore, only documents with
177
+ more than three pages are scanned. Otherwise, the 90% may
178
+ exclude relevant pieces of the document (as was discovered in
179
+ testing).
180
+
181
+ :Logic:
182
+ For documents with more than three pages, the entire PDF is
183
+ read through and each line extracted. The occurrence of each
184
+ line is counted, with the most common occurrences returned
185
+ to the caller.
186
+
187
+ The returned lines are to be passed into a page search to
188
+ determine the x/y coordinates of the header and footer.
189
+
190
+ Returns:
191
+ list: For documents with more than three pages, a list
192
+ containing the most common lines in the document. Otherwise,
193
+ an empty list if returned.
194
+
195
+ """
196
+ # Only scan if document has more than three pages.
197
+ if self._doc.npages < 4:
198
+ return []
199
+ if self._doc.common is None:
200
+ # Create a line generator for all pages.
201
+ lines = (l for p in self._doc.parser.pages for l in p.extract_text().split('\n'))
202
+ # Return the lines whose occurrence rate is 90% of document pages.
203
+ self._doc._common = [i[0] for i in Counter(lines).most_common()
204
+ if i[1] > self._doc.npages * 0.9]
205
+ return self._doc.common
206
+
207
+ def _set_paths(self) -> None:
208
+ """Set the document's file path attributes."""
209
+ self._doc._fpath = os.path.realpath(self._path)
210
+ self._doc._fname = os.path.basename(self._path)