docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
- docp/__init__.py +35 -6
- docp/dbs/__init__.py +0 -0
- docp/dbs/chroma.py +197 -0
- docp/libs/_version.py +1 -0
- docp/libs/changelog.py +7 -0
- docp/libs/utilities.py +107 -0
- docp/loaders/__init__.py +38 -0
- docp/loaders/_chromabaseloader.py +338 -0
- docp/loaders/_chromabaseloader.py.bak +378 -0
- docp/loaders/_chromabasepdfloader.py +121 -0
- docp/loaders/_chromabasepptxloader.py +123 -0
- docp/loaders/chroma.py.bak +196 -0
- docp/loaders/chromapdfloader.py +199 -0
- docp/loaders/chromapptxloader.py +192 -0
- docp/loaders/lutilities.py +52 -0
- docp/objects/__init__.py +0 -0
- docp/objects/_docbaseobject.py +65 -0
- docp/objects/_imgobject.py +0 -0
- docp/objects/_pageobject.py +127 -0
- docp/objects/_slideobject.py +110 -0
- docp/objects/_tableobject.py +0 -0
- docp/objects/_textobject.py +64 -0
- docp/objects/pdfobject.py +61 -0
- docp/objects/pptxobject.py +46 -0
- docp/parsers/__init__.py +0 -0
- docp/parsers/_pdfbaseparser.py +236 -0
- docp/parsers/_pdftableparser.py +272 -0
- docp/parsers/_pdftextparser.py +263 -0
- docp/parsers/_pptxbaseparser.py +93 -0
- docp/parsers/_pptxtextparser.py +115 -0
- docp/parsers/pdfparser.py +62 -0
- docp/parsers/pptxparser.py +51 -0
- docp/parsers/putilities.py +48 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
- docp-0.2.0.dist-info/METADATA +110 -0
- docp-0.2.0.dist-info/RECORD +49 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
- docp/_version.py +0 -1
- docp-0.0.0.dev1.dist-info/METADATA +0 -55
- docp-0.0.0.dev1.dist-info/RECORD +0 -7
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,338 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the base functionality for parsing and
|
5
|
+
storing a document's data into a Chroma vector database.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
.. attention::
|
14
|
+
|
15
|
+
This module is *not* designed to be interacted with
|
16
|
+
directly, only via the appropriate interface class(es).
|
17
|
+
|
18
|
+
Rather, please create an instance of a Chroma
|
19
|
+
document-type-specific loader object using one of the
|
20
|
+
following classes:
|
21
|
+
|
22
|
+
- :class:`~docp.loaders.chromapdfloader.ChromaPDFLoader`
|
23
|
+
- :class:`~docp.loaders.chromapptxloader.ChromaPPTXLoader`
|
24
|
+
|
25
|
+
"""
|
26
|
+
# pylint: disable=no-name-in-module # langchain.chains.RetrievalQA
|
27
|
+
|
28
|
+
import contextlib
|
29
|
+
import os
|
30
|
+
from chromadb.api.types import errors as chromadberrors
|
31
|
+
from langchain.chains import RetrievalQA
|
32
|
+
from langchain.docstore.document import Document
|
33
|
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
34
|
+
from utils4.reporterror import reporterror
|
35
|
+
from utils4.user_interface import ui
|
36
|
+
# locals
|
37
|
+
try:
|
38
|
+
from .dbs.chroma import ChromaDB
|
39
|
+
from .libs.utilities import utilities
|
40
|
+
except ImportError:
|
41
|
+
from dbs.chroma import ChromaDB
|
42
|
+
from libs.utilities import utilities
|
43
|
+
|
44
|
+
|
45
|
+
class _ChromaBaseLoader:
    """Base class for loading documents into a Chroma vector database.

    Args:
        dbpath (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``dbpath`` parameter is a
            path. Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        load_keywords (bool, optional): Derive keywords from the document
            and load these into the sister keywords collection.
            Defaults to False.
        llm (object, optional): If deriving keywords, this is the LLM
            which will do the derivation. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    Raises:
        ValueError: If only one of ``load_keywords`` and ``llm`` is
            provided. Raised *before* the database client is created.

    """
    # pylint: disable=assignment-from-no-return  # These are stub methods.

    _PFX_ERR = '\n[ERROR]:'
    _PFX_WARN = '\n[WARNING]:'

    def __init__(self,
                 dbpath: str | ChromaDB,
                 collection: str=None,
                 *,
                 split_text: bool=True,
                 load_keywords: bool=False,
                 llm: object=None,
                 offline: bool=False):
        """Chroma database class initialiser."""
        self._dbpath = dbpath
        self._cname = collection
        self._split_text = split_text
        self._load_keywords = load_keywords
        self._llm = llm
        self._offline = offline
        self._dbo = None        # Database object.
        self._docs = []         # List of 'Document' objects.
        self._docss = []        # List of 'Document' objects *with splits*.
        self._fbase = None      # Basename of the document currently being loaded.
        self._fpath = None      # Full path to the document currently being loaded.
        self._p = None          # Document parser object.
        self._splitter = None   # Text splitter.
        # Validate first, so invalid arguments never open/create a database.
        self._check_parameters()
        self._set_db_client()

    @property
    def chroma(self):
        """Accessor to the database client object."""
        return self._dbo

    @property
    def parser(self):
        """Accessor to the document parser object."""
        return self._p

    def _already_loaded(self) -> bool:
        """Test if the file has already been loaded into the collection.

        :Logic:
            This test is performed by querying the collection for a
            metadata 'source' which equals the filename. As this uses
            a chromadb 'filter' (i.e. ``$eq``), testing for partial
            matches is not possible at this time.

            If the filename is different (in any way) from the source's
            filename in the database, the file will be loaded again.

        Returns:
            bool: True if the *exact* filename was found in the
            collection's metadata, otherwise False.

        """
        if self._dbo.collection.get(where={'source': {'$eq': self._fbase}})['ids']:
            print(f'-- File already loaded: {self._fbase} - skipping')
            return True
        return False

    def _check_parameters(self) -> None:
        """Verify the class parameters are viable.

        Raises:
            ValueError: If the ``load_keywords`` argument is True and the
            ``llm`` argument is None, or the inverse. Either both must
            be provided, or neither.

        """
        # Coerce to bool so a truthy non-bool flag (e.g. 2) cannot slip
        # through as if an LLM had also been supplied.
        if bool(self._load_keywords) != (self._llm is not None):
            raise ValueError('For keyword loading, the load_keywords argument '
                             'must be True and a model instance must be provided.')

    def _create_documents(self) -> bool:
        """Stub method; overridden by the child class."""

    def _get_keywords(self) -> str:
        """Query the document (using the LLM) to extract the keywords.

        The retriever is restricted to records whose 'source' metadata
        equals the current file, and the number of records retrieved
        (``k``) is roughly 10% of the file's records, bounded to the
        range 25 to 50, and never more than the number of records stored
        for the file.

        Returns:
            str: The keywords, as returned by
            ``utilities.parse_to_keywords`` from the LLM's response.

        """
        # pylint: disable=line-too-long
        print('- Extracting keywords ...')
        qry = ('List the important keywords which can be used to summarize this '
               f'document: "{self._fbase}". Use only phrases which are found in the document.')
        # Suppress stdout; the chain is built with verbose=True.
        with contextlib.redirect_stdout(None):
            nids = len(self._dbo.get(where={'source': self._fbase})['ids'])
            # Max of 50, min n records; prefer n records or 10%.
            filter_ = {'k': min(nids, max(25, min(nids//10, 50))),
                       'filter': {'source': {'$eq': self._fbase}}}
            # TODO: Replace this with the module.class.method once created.
            qa = RetrievalQA.from_chain_type(llm=self._llm,
                                             chain_type="stuff",
                                             retriever=self._dbo.as_retriever(search_kwargs=filter_),
                                             return_source_documents=True,
                                             verbose=True)
            resp = qa.invoke(qry)
        kwds = utilities.parse_to_keywords(resp=resp['result'])
        return kwds

    def _load(self, path: str, **kwargs):
        """Load the provided file into the vector store.

        Each processing step returns a success flag; the next step only
        runs if the previous one succeeded. A file which is already in
        the collection is skipped silently (no summary is printed).

        Args:
            path (str): Full path to the file to be loaded.

        :Keyword Arguments:
            Those passed from the document-type-specific loader's
            :func:`load` method.

        """
        # pylint: disable=multiple-statements
        self._fpath = path
        self._fbase = os.path.basename(path)
        if self._already_loaded():
            return
        self._set_parser()
        s = self._set_text_splitter()
        if s: s = self._parse_text(**kwargs)
        if s: s = self._create_documents()
        if s: s = self._split_texts()
        if s: s = self._load_worker()
        # Same 'is not None' test as used by the parameter validation.
        if s and self._load_keywords and self._llm is not None:
            kwds = self._get_keywords()
            s = self._store_keywords(kwds=kwds)
        self._print_summary(success=s)

    def _load_worker(self) -> bool:
        """Load the split documents into the database collection.

        Returns:
            bool: True if loaded successfully, otherwise False. Success
            means no exception was raised and the number of records
            after the load equals the number of records before the load
            plus the number of split documents (see :meth:`_test_load`).

        """
        # pylint: disable=line-too-long
        try:
            print('- Loading the document into the database ...')
            nrecs_b = self._dbo.collection.count()  # Count records before.
            self._dbo.add_documents(self._docss)
            nrecs_a = self._dbo.collection.count()  # Count records after.
            return self._test_load(nrecs_b=nrecs_b, nrecs_a=nrecs_a)
        except chromadberrors.DuplicateIDError:
            print('  -- Document *chunk* already loaded, duplication detected. File may be corrupt.')
            return False  # Prevent from loading keywords.
        except Exception as err:
            reporterror(err)
            return False

    def _parse_text(self, **kwargs) -> bool:
        """Stub method, overridden by the child class."""

    @staticmethod
    def _print_summary(success: bool):
        """Print an end of processing summary.

        Args:
            success (bool): Success flag from the processor.

        """
        if success:
            print('Processing complete. Success.')
        else:
            print('Processing aborted due to error. Failure.')

    def _set_db_client(self) -> bool:
        """Set the database client object.

        If the ``_dbpath`` object is a string, this is inferred as the
        *path* to the database. Otherwise, it is inferred as the database
        object itself.

        Returns:
            bool: True if the database object is set without error.
            Otherwise False.

        """
        try:
            if isinstance(self._dbpath, str):
                self._dbo = ChromaDB(path=self._dbpath,
                                     collection=self._cname,
                                     offline=self._offline)
            else:
                self._dbo = self._dbpath
        except Exception as err:
            reporterror(err)
            return False
        return True

    def _set_parser(self):
        """Stub method, overridden by the child class."""

    # TODO: Add these to a config file.
    def _set_text_splitter(self) -> bool:
        """Define the text splitter to be used.

        Returns:
            bool: True, always.

        """
        self._splitter = RecursiveCharacterTextSplitter(chunk_size=256,
                                                        chunk_overlap=25,
                                                        separators=['\n\n\n', '\n\n', '\n', ' '])
        return True

    def _split_texts(self) -> bool:
        """Split the document text using a recursive text splitter.

        Note:
            If the ``split_text`` parameter was passed as ``False`` on
            instantiation, the texts will not be split. Rather, the
            :attr:`_docs` list is simply *copied* to the :attr:`_docss`
            attribute.

        Returns:
            bool: True if the text was split (or copied) successfully,
            otherwise False.

        """
        if self._split_text:
            self._docss = self._splitter.split_documents(self._docs)
        else:
            self._docss = self._docs[:]
        if not self._docss:
            msg = (f'{self._PFX_ERR} An error occurred while splitting the documents for '
                   f'{self._fbase}.')
            ui.print_warning(msg)
            return False
        return True

    def _kwds_collection_name(self) -> str:
        """Derive the name of the sister keywords collection.

        The name of the collection actually held by the database object
        is preferred, as the ``collection`` argument is ignored (and is
        usually None) when a database *instance* was passed on
        instantiation. The ``collection`` argument is the fallback.

        Returns:
            str: The documents collection name, suffixed with ``-kwds``.

        """
        # NOTE(review): assumes the collection object exposes ``name``
        # (as chromadb's Collection does) -- falls back if it does not.
        name = getattr(getattr(self._dbo, 'collection', None), 'name', None)
        return f'{name or self._cname}-kwds'

    def _store_keywords(self, kwds: str) -> bool:
        """Store the extracted keywords into the keywords collection.

        The keywords are stored as a single document, in a collection
        named after the documents collection with a ``-kwds`` suffix,
        in the same database.

        Args:
            kwds (str): A string containing the keywords extracted from
                the document.

        Returns:
            bool: True if loaded successfully (i.e. exactly one record
            was added), otherwise False.

        """
        print('- Storing keywords ...')
        db = ChromaDB(path=self._dbo.path,
                      collection=self._kwds_collection_name(),
                      offline=self._offline)
        nrecs_b = db.collection.count()  # Count records before.
        docs = [Document(page_content=kwds, metadata={'source': self._fbase})]
        db.add_documents(docs)
        nrecs_a = db.collection.count()  # Count records after.
        return nrecs_a - nrecs_b == 1

    def _test_load(self, nrecs_b: int, nrecs_a: int) -> bool:
        """Test the document was loaded successfully.

        :Test:
            - Given a count of records before the load, verify the number
              of records after the load is equal to the number of records
              before, plus the number of split documents.

        Args:
            nrecs_b (int): Number of records *before* the load.
            nrecs_a (int): Number of records *after* the load.

        Returns:
            bool: True if the number of records before the load plus the
            number of splits is equal to the number of records after the
            load.

        """
        if nrecs_a == nrecs_b:
            ui.print_warning(f'{self._PFX_WARN} No new documents added. Possibly already loaded?')
        return nrecs_a == nrecs_b + len(self._docss)
|