docp 0.1.0b1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
docp/loaders/chroma.py ADDED
@@ -0,0 +1,166 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the entry point for loading a document
5
+ into a Chroma database.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Example: For example code use, please refer to the
14
+ :class:`ChromaLoader` class docstring.
15
+
16
+ # pylint: disable=import-error
17
+ # pylint: disable=wrong-import-position
18
+ """
19
+
20
+ import os
21
+ import re
22
+ from glob import glob
23
+ # locals
24
+ try:
25
+ from .loaders._chromabaseloader import _ChromaBaseLoader
26
+ except ImportError:
27
+ from loaders._chromabaseloader import _ChromaBaseLoader
28
+
29
+
30
class ChromaLoader(_ChromaBaseLoader):
    """Chroma database document loader.

    Args:
        path (str): Full path to the file (or *directory*) to be parsed
            and loaded. Note: If this is a directory, a specific file
            extension can be passed into the :meth:`load` method using
            the ``ext`` argument.
        dbpath (str): Full path to the Chroma database *directory*.
        collection (str): Name of the Chroma database collection into
            which the data is to be loaded.
        load_keywords (bool, optional): Use the provided LLM
            (via the ``llm`` parameter) to read the document and infer
            keywords to be loaded into the ``<collection>-kwds``
            database, for keyword-driven document filtering.
            Note: This *requires* the ``llm`` parameter and is
            recommended only for GPU-bound processing. Defaults to False.
        llm (object, optional): An LLM *instance* which can be provided
            directly into the
            :func:`langchain.chains.RetrievalQA.from_chain_type` function
            for keyword inference. This is *required* for keyword
            loading. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing**, as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    :Example:

        Parse and load a *single* document into a Chroma database
        collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/file.pdf',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load()


        Parse and load a *directory* of PDF documents into a Chroma
        database collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/directory',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load(ext='pdf')

    """

    def __init__(self,
                 path: str,
                 dbpath: str,
                 collection: str,
                 *,
                 load_keywords: bool=False,
                 llm: object=None,
                 offline: bool=False):
        """Chroma database loader class initialiser."""
        super().__init__(dbpath=dbpath,
                         collection=collection,
                         load_keywords=load_keywords,
                         llm=llm,
                         offline=offline)
        self._path = path

    def load(self,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_header: bool=True,
             remove_footer: bool=True,
             remove_newlines: bool=True,
             ignore_tags: set=None,
             convert_to_ascii: bool=True) -> None:
        """Load a document (or documents) into a Chroma database.

        Args:
            ext (str): If the ``path`` argument refers to a *directory*,
                a specific file extension can be specified here.
                For example::

                    ext = 'pdf'

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
                characters ``'pdf'`` are parsed and prefixed with ``*.``
                to create ``'*.pdf'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.
                Defaults to '**', which matches every file.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the
                PDF 'marked content' tags which will be ignored. Note
                that the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is ignored.
                Defaults to None, which is passed through unchanged to
                the base loader (expected to fall back to
                ``{'Artifact'}`` there - confirm against the base
                class). To include all tags, (not skip any) pass this
                argument as ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        Raises:
            ValueError: If ``path`` is a directory and ``ext`` (other
                than ``'**'``) contains no alphabetic characters.

        """
        # The same clean-up options apply to every document, whether a
        # single file or each file found in a directory.
        options = {'remove_header': remove_header,
                   'remove_footer': remove_footer,
                   'remove_newlines': remove_newlines,
                   'ignore_tags': ignore_tags,
                   'convert_to_ascii': convert_to_ascii}
        if os.path.isdir(self._path):
            files = self._find_files(ext=ext, recursive=recursive)
            count = len(files)
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                self._load(path=f, **options)
        else:
            print(f'Processing: {os.path.basename(self._path)} ...')
            self._load(path=self._path, **options)

    def _find_files(self, ext: str, recursive: bool) -> list:
        """Collect the files to be loaded from the ``path`` directory.

        Args:
            ext (str): Either ``'**'`` (all files) or a string from which
                the alphabetic characters are taken as the extension.
            recursive (bool): If True, subdirectories are searched.

        Raises:
            ValueError: If ``ext`` contains no alphabetic characters.

        Returns:
            list: Paths of the matching *files*; directories matched by
            the glob pattern are excluded.

        """
        if ext == '**':
            pattern = os.path.join(self._path, '**')
        else:
            # Join *all* alphabetic runs, so '.pdf' -> 'pdf' and
            # 'things.foo' -> 'thingsfoo', as documented in ``load``.
            alpha = ''.join(re.findall('[a-zA-Z]+', ext))
            if not alpha:
                raise ValueError(f'No alphabetic characters found in ext: {ext!r}')
            # A bare '*.ext' pattern never descends into subdirectories;
            # the '**' path segment is required for a recursive search.
            if recursive:
                pattern = os.path.join(self._path, '**', f'*.{alpha}')
            else:
                pattern = os.path.join(self._path, f'*.{alpha}')
        # The '**' pattern also yields directories, which cannot be loaded.
        return [f for f in glob(pattern, recursive=recursive) if os.path.isfile(f)]
File without changes
@@ -0,0 +1,76 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the implementation for the
5
+ document-type-specific base class.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+ from __future__ import annotations
16
+ try:
17
+ from .objects._pageobject import PageObject
18
+ except ImportError:
19
+ from objects._pageobject import PageObject
20
+
21
+
22
+ class _DocBase:
23
+ """Private document base class.
24
+
25
+ This class is *not* designed to be interacted with directly, but
26
+ rather to be inherited by the document-type-specific document
27
+ objects.
28
+
29
+ """
30
+
31
+ def __init__(self):
32
+ """Base document object class initialiser."""
33
+ self._common = None # Used by the header/footer scanner.
34
+ self._fname = None # Filename (basename)
35
+ self._fpath = None # Full file path
36
+ self._meta = None # Metadata from the document parger
37
+ self._npages = 0 # Number of pages in the document
38
+ self._ntables = 0 # Number of tables extracted
39
+ self._parser = None # Underlying document parser functionality
40
+ # List of PageObjects, offset by 1 to align the index with page numbers.
41
+ self._pages = [PageObject(pageno=0)]
42
+
43
+ @property
44
+ def basename(self) -> str:
45
+ """Accessor for the file's basename."""
46
+ return self._fname
47
+
48
+ @property
49
+ def filepath(self) -> str:
50
+ """Accessor for the explicit path to this file."""
51
+ return self._fpath
52
+
53
+ @property
54
+ def metadata(self) -> dict | object:
55
+ """The meta data as extracted from the document."""
56
+ return self._meta
57
+
58
+ @property
59
+ def npages(self) -> int:
60
+ """The number of pages successfully extracted from the source."""
61
+ return self._npages
62
+
63
+ @property
64
+ def ntables(self) -> int:
65
+ """The number of tables successfully extracted from the source."""
66
+ return self._ntables
67
+
68
+ @property
69
+ def pages(self) -> list[PageObject]: # noqa pylint: disable=undefined-variable
70
+ """A list of containing an object for each page in the document."""
71
+ return self._pages
72
+
73
+ @property
74
+ def parser(self) -> object:
75
+ """Accessor to the underlying document parser's functionality."""
76
+ return self._parser
@@ -0,0 +1,126 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the ``page`` object implementation.
5
+
6
+ :Platform: Linux/Windows | Python 3.10+
7
+ :Developer: J Berendt
8
+ :Email: development@s3dev.uk
9
+
10
+ :Comments: n/a
11
+
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+
17
class PageObject:
    """Container for the data extracted from a single document page.

    One instance is created per page and stored in the owning document's
    ``pages`` list.

    Args:
        content (str, optional): Page content as a single string.
            Defaults to ''.
        pageno (int, optional): Page number. Defaults to 0.
        parser (object, optional): The underlying document parser object.
            Defaults to None.

    """

    __slots__ = ('_content', '_hastext', '_pageno', '_parser', '_tables')

    def __init__(self, content: str='', pageno: int=0, parser: object=None):
        """Store the page's content, number and parser reference."""
        self._pageno = pageno
        self._parser = parser
        self._tables = []
        self._content = content
        # The text flag mirrors the truthiness of the initial content.
        self._hastext = bool(content)

    def __repr__(self) -> str:
        """Short representation; page 0 is shown as the index offset."""
        detail = '<index offset>' if self._pageno == 0 else f'Chars: {len(self._content)}'
        return f'<Page: {self._pageno}; {detail}>'

    def __str__(self) -> str:
        """Summary string displayed when printing this object."""
        preview = ''
        if self._content:
            # First 25 characters, flattened onto a single line.
            preview = self._content[:25].replace('\n', ' ') + ' ...'
        fields = (f'Page no: {self._pageno}',
                  f'Content: "{preview}"',
                  f'Chars: {len(self._content)}',
                  f'nTables: {len(self._tables)}',
                  f'Parser avail: {bool(self._parser)}')
        return '; '.join(fields)

    @property
    def content(self) -> str:
        """The page's textual content."""
        return self._content

    @content.setter
    def content(self, value: str) -> None:
        """Set the page content, ignoring empty values.

        A falsy ``value`` leaves both the content and the ``hastext``
        flag untouched. Otherwise the content is replaced and ``hastext``
        becomes ``True``.

        """
        if not value:
            return
        self._content = value
        self._hastext = True

    @property
    def hastext(self) -> bool:
        """Flag indicating if the ``content`` attribute is populated."""
        return self._hastext

    @property
    def pageno(self) -> int:
        """The page number.

        Note:
            This is the page number 1-n, concerning the page's *sequence
            in the overall document*. This is *not* guaranteed to be the
            page's number per the document's page labeling scheme.

        """
        return self._pageno

    @property
    def parser(self) -> object:
        """The parser object supplied for this page, if any.

        Note:
            The population of this property is determined by the
            document-type-specific ``docp`` parser. If nothing was
            supplied, this property remains ``None``.

        """
        return self._parser

    @property
    def tables(self) -> list:
        """The page's tables, if parsed."""
        return self._tables

    def show(self) -> pdfplumber.display.PageImage:  # pylint: disable=undefined-variable  # noqa
        """Display the page as an image.

        The call is delegated to the ``to_image`` method of the object
        held in :attr:`parser`. For a ``pdfplumber`` page, the returned
        image also exposes the debugging visualisation methods such as:

            - :func:`img.debug_tablefinder`
            - :func:`img.draw_*`
            - :func:`img.outline_chars`
            - :func:`img.outline_words`
            - :func:`img.reset`
            - etc.

        """
        image = self.parser.to_image()
        return image
File without changes
File without changes
@@ -0,0 +1,39 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the 'PDF Document' object structure into
5
+ which PDF documents are parsed for transport and onward
6
+ use.
7
+
8
+ :Platform: Linux/Windows | Python 3.10+
9
+ :Developer: J Berendt
10
+ :Email: development@s3dev.uk
11
+
12
+ :Comments: n/a
13
+
14
+ """
15
+ # pylint: disable=import-error
16
+
17
+ from objects._docbaseobject import _DocBase
18
+
19
+
20
class DocPDF(_DocBase):
    """Document container holding the data parsed from a PDF file."""

    def __init__(self) -> None:
        """Initialise the base document attributes and the tags flag."""
        super().__init__()
        # Starts as False; presumably set by the PDF parser when the
        # document is parsed via its tags.
        self._tags: bool = False

    @property
    def parsed_using_tags(self) -> bool:
        """Flag indicating if the document was parsed using tags.

        A PDF can carry 'marked content' tags. When the document is
        parsed using those tags (as reported by this flag), the parser
        respects columns and other page formatting schemes. Without
        tags, a multi-column page is read straight across the line,
        which corrupts the text.

        """
        return self._tags
File without changes
@@ -0,0 +1,210 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides generalised base functionality for
5
+ parsing PDF documents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ Note: This module is *not* designed to be interacted with
12
+ directly, only via the appropriate interface class(es).
13
+
14
+ Rather, please create an instance of a PDF document parsing
15
+ object using the following:
16
+
17
+ - :class:`pdfparser.PDFParser`
18
+
19
+ """
20
+ # pylint: disable=import-error
21
+ # pylint: disable=protected-access
22
+ # pylint: disable=wrong-import-order
23
+
24
+ import os
25
+ import pdfplumber
26
+ from collections import Counter
27
+ from unidecode import unidecode
28
+ # locals
29
+ from objects.pdfobject import DocPDF
30
+
31
+
32
+ class _PDFBaseParser:
33
+ """Base class containing generalised PDF parsing functionality."""
34
+
35
+ def __init__(self, path: str):
36
+ """Private base parser class initialiser.
37
+
38
+ Args:
39
+ path (str): Full path to the document to be parsed.
40
+
41
+ """
42
+ self._path = path
43
+ self._doc = DocPDF()
44
+ self._tbl_opath = None
45
+ self._set_paths()
46
+ self._open()
47
+
48
+ def __del__(self):
49
+ """Class deconstructor.
50
+
51
+ :Tasks:
52
+ - Ensure the PDF document is closed.
53
+
54
+ """
55
+ if hasattr(self._doc, '_parser'):
56
+ self._doc._parser.close()
57
+
58
+ @property
59
+ def doc(self) -> DocPDF:
60
+ """Accessor to the document object."""
61
+ return self._doc
62
+
63
+ def _get_crop_coordinates(self,
64
+ skip_header: bool=False,
65
+ skip_footer: bool=False) -> tuple[float]:
66
+ """Determine the bounding box coordinates.
67
+
68
+ These coordinates are used for removing the header and/or footer.
69
+
70
+ Args:
71
+ skip_header (bool, optional): If True, set the coordinates
72
+ such that the header is skipped. Defaults to False.
73
+ skip_footer (bool, optional): If True, set the coordinates
74
+ such that the footer is skipped. Defaults to False.
75
+
76
+ :Logic:
77
+ When excluding a header and/or footer, the following page
78
+ numbers are used for header/footer *position* detection,
79
+ given the length of the document:
80
+
81
+ - Number of pages [1]: 1
82
+ - Number of pages [2,10]: 2
83
+ - Number of pages [11,]: 5
84
+
85
+ Returns:
86
+ tuple: A bounding box tuple of the following form, to be
87
+ passed directly into the :func:`Page.crop` method::
88
+
89
+ (x0, top, x1, bottom)
90
+
91
+ """
92
+ npages = self._doc.npages
93
+ match npages:
94
+ case 1: num = 1
95
+ case _ if npages in range(2, 11): num = 2
96
+ case _: num = 5
97
+ pg = self._doc.parser.pages[num] # The pages list has a has a page offset at [0].
98
+ # Default coordinates to the whole page.
99
+ coords = {'x0': 0, 'top': 0, 'x1': pg.width, 'bottom': pg.height}
100
+ # If the header and/or footer is to be skipped, find and iterate
101
+ # through the common lines and overwrite the coordinates as
102
+ # appropriate, given the key and the line's location on the page.
103
+ if skip_header or skip_footer:
104
+ lines = self._scan_common()
105
+ for line in lines:
106
+ s = pg.search(line)
107
+ if s:
108
+ for key in coords:
109
+ v = s[0][key]
110
+ match key:
111
+ case 'top' if v < pg.height/2 and skip_header:
112
+ coords[key] = max(coords[key], v+2)
113
+ case 'bottom' if v > pg.height/2 and skip_footer:
114
+ coords[key] = min(coords[key], v-2)
115
+ return tuple(coords.values())
116
+
117
+ def _open(self) -> None:
118
+ """Open the PDF document for reading.
119
+
120
+ :Other Operations:
121
+
122
+ - Store the ``pdfplumber`` parser object returned from the
123
+ :func:`pdfplumber.open` function into the
124
+ :attr:`self._doc._parser` attribute.
125
+ - Store the number of pages into the
126
+ :attr:`self._doc._npages` attribute.
127
+ - Store the document's meta data into the
128
+ :attr:`self._doc._meta` attribute.
129
+
130
+ """
131
+ self._doc._parser = pdfplumber.open(self._doc._fpath)
132
+ self._doc._npages = len(self._doc._parser.pages)
133
+ self._doc._meta = self._doc._parser.metadata
134
+
135
+ @staticmethod
136
+ def _prepare_row(row: list) -> str:
137
+ """Prepare the table row for writing a table to to CSV.
138
+
139
+ Args:
140
+ row (list): A list of strings, constituting a table row.
141
+
142
+ :Processing Tasks:
143
+
144
+ For each element in the row:
145
+
146
+ - Remove any double quote characters (ASCII and Unicode).
147
+ - Replace any empty values with ``'None'``.
148
+ - If the element contains a comma, wrap the element in
149
+ double quotes.
150
+ - Attempt to convert any non-ASCII characters to an
151
+ associated ASCII character. If the replacement cannot
152
+ be made, the character is replaced with a ``'?'``.
153
+
154
+ Returns:
155
+ str: A processed comma-separated string, ready to be written
156
+ to a CSV file.
157
+
158
+ """
159
+ trans = {34: '', 8220: '', 8221: ''} # Remove double quotes in Unicode.
160
+ row = [e.translate(trans) if e else 'None' for e in row] # Cannot be a generator.
161
+ for idx, e in enumerate(row):
162
+ if ',' in e:
163
+ row[idx] = f'"{e}"' # Escape comma-separation by quoting.
164
+ line = unidecode(','.join(row).replace('\n', ' '), errors='replace', replace_str='?')
165
+ return line
166
+
167
+ def _scan_common(self) -> list[str]:
168
+ """Scan the PDF document to find the most common lines.
169
+
170
+ :Rationale:
171
+ Generally, the most common lines in a document will be the
172
+ header and footer, as these are expected to be repeated on
173
+ each page of the document.
174
+
175
+ 'Most common' is defined as line occurring on 90% of the
176
+ pages throughout the document. Therefore, only documents with
177
+ more than three pages are scanned. Otherwise, the 90% may
178
+ exclude relevant pieces of the document (as was discovered in
179
+ testing).
180
+
181
+ :Logic:
182
+ For documents with more than three pages, the entire PDF is
183
+ read through and each line extracted. The occurrence of each
184
+ line is counted, with the most common occurrences returned
185
+ to the caller.
186
+
187
+ The returned lines are to be passed into a page search to
188
+ determine the x/y coordinates of the header and footer.
189
+
190
+ Returns:
191
+ list: For documents with more than three pages, a list
192
+ containing the most common lines in the document. Otherwise,
193
+ an empty list if returned.
194
+
195
+ """
196
+ # Only scan if document has more than three pages.
197
+ if self._doc.npages < 4:
198
+ return []
199
+ if self._doc.common is None:
200
+ # Create a line generator for all pages.
201
+ lines = (l for p in self._doc.parser.pages for l in p.extract_text().split('\n'))
202
+ # Return the lines whose occurrence rate is 90% of document pages.
203
+ self._doc._common = [i[0] for i in Counter(lines).most_common()
204
+ if i[1] > self._doc.npages * 0.9]
205
+ return self._doc.common
206
+
207
+ def _set_paths(self) -> None:
208
+ """Set the document's file path attributes."""
209
+ self._doc._fpath = os.path.realpath(self._path)
210
+ self._doc._fname = os.path.basename(self._path)