docp-0.1.0b1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
docp/__init__.py ADDED
@@ -0,0 +1,31 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ :Purpose: This module provides the project initialisation logic.
+
+ :Platform: Linux/Windows | Python 3.10+
+ :Developer: J Berendt
+ :Email: development@s3dev.uk
+
+ :Comments: n/a
+
+ """
+
+ import os
+ import sys
+ sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
+
+ # Bring entry-points to the surface.
+ try:
+     from loaders.chroma import ChromaLoader
+ except ImportError as err:
+     # The Chroma loader requires a lot of backend dependencies which are not required for the parser.
+     msg = f'An error occurred while importing the Chroma loader:\n- {err}'
+     raise ImportError(msg) from err
+
+ try:
+     from .parsers.pdfparser import PDFParser
+     from ._version import __version__
+ except ImportError:
+     from parsers.pdfparser import PDFParser
+     from _version import __version__
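Taken together, the package root surfaces both entry points, so they can be imported directly from ``docp``. A minimal usage sketch, assuming the package and its loader dependencies are installed; 'sample.pdf' is a hypothetical input file:

    import docp

    print(docp.__version__)                     # '0.1.0b1'
    parser = docp.PDFParser(path='sample.pdf')  # Parser entry point; hypothetical file.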
docp/_version.py ADDED
@@ -0,0 +1 @@
+ __version__ = '0.1.0b1'
docp/dbs/__init__.py ADDED
File without changes
docp/dbs/chroma.py ADDED
@@ -0,0 +1,184 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ :Purpose: This module provides a localised wrapper and specialised
+           functionality around the
+           ``langchain_community.vectorstores.Chroma`` class, for
+           interacting with a Chroma database.
+
+ :Platform: Linux/Windows | Python 3.10+
+ :Developer: J Berendt
+ :Email: development@s3dev.uk
+
+ :Comments: n/a
+
+ """
+ # pylint: disable=wrong-import-order
+
+ import chromadb
+ import os
+ import torch
+ from glob import glob
+ from hashlib import md5
+ from langchain_huggingface import HuggingFaceEmbeddings
+ # langchain's Chroma is used rather than the base chromadb as it provides
+ # the add_texts method, which supports GPU processing and parallelisation.
+ from langchain_community.vectorstores import Chroma as _Chroma
+
+
+ class ChromaDB(_Chroma):
+     """Wrapper class around the ``chromadb`` library.
+
+     Args:
+         path (str): Path to the chroma database's *directory*.
+         collection (str): Collection name.
+         offline (bool, optional): Remain offline and use the cached
+             embedding function model rather than obtaining one online.
+             Defaults to False.
+     """
+     # pylint: disable=line-too-long
+
+     _MODEL_CACHE = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), '.cache')
+     # Installing torch is a huge overhead, just for this. However, torch
+     # will already be installed as part of the sentence-transformers library,
+     # so we'll use it here.
+     _MODEL_KWARGS = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
+     # TODO: Add this to a config file.
+     _MODEL_NAME = 'all-MiniLM-L6-v2'
+
+     def __init__(self, path: str, collection: str, offline: bool=False):
+         """Chroma database class initialiser."""
+         self._path = os.path.realpath(path)
+         self._cname = collection
+         self._offline = offline
+         self._client = None  # Database 'client' object.
+         self._dbc = None     # Database 'collection' object.
+         self._set_client()
+         self._set_embedding_fn()
+         super().__init__(client=self._client,
+                          collection_name=self._cname,
+                          embedding_function=self._embfn,
+                          persist_directory=self._path)
+         self._set_collection()
+
+     @property
+     def client(self):
+         """Accessor to the :class:`chromadb.PersistentClient` class."""
+         return self._client
+
+     @property
+     def collection(self):
+         """Accessor to the chromadb client's collection object."""
+         return self._dbc
+
+     @property
+     def embedding_function(self):
+         """Accessor to the embedding function used."""
+         return self._embfn
+
+     @property
+     def path(self) -> str:
+         """Accessor to the database's path."""
+         return self._path
+
+     def add_documents(self, docs: list):
+         """Add multiple documents to the collection.
+
+         This method wraps the ``Chroma.add_texts`` method, which supports
+         GPU processing and parallelisation. The ID is derived locally
+         from the file's basename, page number and page content.
+
+         Args:
+             docs (list): A list of ``langchain_core.documents.base.Document``
+                 document objects.
+
+         """
+         # This method overrides the base class' add_documents method.
+         # pylint: disable=arguments-differ
+         # pylint: disable=arguments-renamed
+         if not isinstance(docs, list):
+             docs = [docs]
+         ids_, docs_, meta_ = self._preproc(docs=docs)
+         self.add_texts(ids=ids_, texts=docs_, metadatas=meta_)
+
+     def show_all(self):
+         """Return the entire contents of the collection.
+
+         This is an alias for ``.collection.get()``.
+
+         """
+         return self._dbc.get()
+
+     def _get_embedding_function_model(self) -> str:
+         """Derive the path to the embedding function model.
+
+         :Note:
+             If ``offline=True`` was passed into the class constructor,
+             the model cache is used, if available; otherwise an error
+             is raised.
+
+             If online usage is allowed, the model is obtained by the
+             means defined by the embedding function constructor.
+
+         Returns:
+             str: The name of the model or, if offline, the path to the
+             model's cache, to be passed into the embedding function
+             constructor.
+
+         """
+         if self._offline:
+             if not os.path.exists(self._MODEL_CACHE):
+                 os.makedirs(self._MODEL_CACHE)
+                 msg = ('Offline mode has been chosen, yet the embedding function model cache does not exist. '
+                        'Therefore, a model must be downloaded. Please enable online usage for the first run '
+                        'so a model can be downloaded and stored into the cache for future (offline) use.')
+                 raise FileNotFoundError(msg)
+             # Find the cache directory containing the named model; this enables offline use.
+             model_loc = os.path.commonpath(filter(lambda x: 'config.json' in x,
+                                                   glob(os.path.join(self._MODEL_CACHE,
+                                                                     f'*{self._MODEL_NAME}*',
+                                                                     '**'),
+                                                        recursive=True)))
+             return model_loc
+         return self._MODEL_NAME
+
+     @staticmethod
+     def _preproc(docs: list):
+         """Pre-process the document objects to create the IDs.
+
+         Parse the ``Document`` object into its parts for storage.
+         Additionally, create the ID as a hash of the source document's
+         basename, page number and content.
+
+         """
+         ids = []
+         txts = []
+         metas = []
+         for doc in docs:
+             pc = doc.page_content
+             m = doc.metadata
+             pc_, src_ = map(str.encode, (pc, m['source']))
+             pg_ = str(m.get('pageno', 0)).zfill(4)
+             id_ = f'id_{md5(src_).hexdigest()}_{pg_}_{md5(pc_).hexdigest()}'
+             ids.append(id_)
+             txts.append(pc)
+             metas.append(m)
+         return ids, txts, metas
+
+     def _set_client(self):
+         """Set the database client object."""
+         settings = chromadb.Settings(anonymized_telemetry=False)
+         self._client = chromadb.PersistentClient(path=self._path,
+                                                  settings=settings)
+
+     def _set_collection(self):
+         """Set the database collection object."""
+         self._dbc = self._client.get_or_create_collection(self._cname,
+                                                           metadata={'hnsw:space': 'cosine'})
+
+     def _set_embedding_fn(self):
+         """Set the embeddings function object."""
+         model_name = self._get_embedding_function_model()
+         self._embfn = HuggingFaceEmbeddings(model_name=model_name,
+                                             model_kwargs=self._MODEL_KWARGS,
+                                             cache_folder=self._MODEL_CACHE)
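For reference, a minimal sketch of how this wrapper might be used directly; the database directory, collection name and document values are illustrative, and the first run must be online so the embedding model can be downloaded into the cache:

    from langchain.docstore.document import Document
    from docp.dbs.chroma import ChromaDB

    # Open (or create) a persistent collection backed by the ./db directory.
    db = ChromaDB(path='./db', collection='docs')

    # 'source' is required by _preproc; 'pageno' defaults to 0 if omitted.
    doc = Document(page_content='Some page text.',
                   metadata={'source': 'sample.pdf', 'pageno': 1})
    db.add_documents([doc])       # ID derived from basename, page number and content.
    print(db.collection.count())  # Record count after the load.

Because the IDs are deterministic hashes of the source, page and content, re-adding the same page produces the same ID, which is how duplicate loads are detected downstream.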
File without changes
@@ -0,0 +1,362 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ :Purpose: This module provides functionality to parse and store
+           document data into a Chroma vector database.
+
+ :Platform: Linux/Windows | Python 3.10+
+ :Developer: J Berendt
+ :Email: development@s3dev.uk
+
+ :Comments: n/a
+
+ """
+ # pylint: disable=no-name-in-module  # langchain.chains.RetrievalQA
+
+ import contextlib
+ import os
+ import re
+ from chromadb.api.types import errors as chromadberrors
+ from langchain.chains import RetrievalQA
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from utils4.reporterror import reporterror
+ from utils4.user_interface import ui
+ # locals
+ try:
+     from .dbs.chroma import ChromaDB
+     from .parsers.pdfparser import PDFParser
+ except ImportError:
+     from dbs.chroma import ChromaDB
+     from parsers.pdfparser import PDFParser
+
+ _PRE_ERR = '\n[ERROR]:'
+ _PRE_WARN = '\n[WARNING]:'
+
+
+ class Tools:
+     """General tools used for loading documents."""
+
+     @staticmethod
+     def parse_to_keywords(resp: str) -> str:
+         """Parse the bot's response into a string of keywords.
+
+         Args:
+             resp (str): Text response directly from the bot.
+
+         Returns:
+             str: A comma-separated string of the keywords extracted
+             from the response, or an empty string if no keywords were
+             found.
+
+         """
+         # Capture asterisk bullet points or a numbered list.
+         rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)\n')
+         trans = {45: ' ', 47: ' '}  # Translate '-' and '/' to spaces.
+         resp_ = resp.translate(trans).lower()
+         kwds = rexp.findall(resp_)
+         if kwds:
+             return ', '.join(kwds)
+         return ''
+
+
+ class _ChromaBaseLoader:
+     """Base class for loading documents into a Chroma vector database.
+
+     Args:
+         dbpath (str | ChromaDB): Either the full path to the Chroma
+             database *directory*, or an instance of a
+             :class:`~dbs.chroma.ChromaDB` database. If the instance is
+             passed, the ``collection`` argument is ignored.
+         collection (str, optional): Name of the Chroma database
+             collection. Only required if the ``dbpath`` parameter is a
+             path. Defaults to None.
+         load_keywords (bool, optional): Extract keywords from the
+             document (using the LLM) and store them in a separate
+             keywords collection. Requires ``llm``. Defaults to False.
+         llm (object, optional): LLM instance used for the keyword
+             extraction. Defaults to None.
+         offline (bool, optional): Remain offline and use the locally
+             cached embedding function model. Defaults to False.
+
+     """
+
+     _PARSERS = {'.pdf': PDFParser}
+
+     def __init__(self,
+                  dbpath: str | ChromaDB,
+                  collection: str=None,
+                  *,
+                  load_keywords: bool=False,
+                  llm: object=None,
+                  offline: bool=False):
+         """Chroma database loader class initialiser."""
+         self._dbpath = dbpath
+         self._cname = collection
+         self._load_keywords = load_keywords
+         self._llm = llm
+         self._offline = offline
+         self._dbo = None       # Database object.
+         self._docs = []        # List of 'Document' objects.
+         self._docss = []       # List of 'Document' objects *with splits*.
+         self._fbase = None     # Basename of the document currently being loaded.
+         self._fpath = None     # Full path to the document currently being loaded.
+         self._p = None         # Document parser object.
+         self._splitter = None  # Text splitter.
+         self._set_db_client()
+         self._check_parameters()
+
+     @property
+     def chroma(self):
+         """Accessor to the database client object."""
+         return self._dbo
+
+     @property
+     def parser(self):
+         """Accessor to the document parser object."""
+         return self._p
+
+     def _check_parameters(self) -> None:
+         """Verify the class parameters are viable.
+
+         Raises:
+             ValueError: If the ``load_keywords`` argument is True and
+                 the ``llm`` argument is None, or the inverse. The two
+                 arguments must either both be set, or both be unset.
+
+         """
+         if sum((self._load_keywords, self._llm is not None)) not in (0, 2):
+             raise ValueError('For keyword loading, the load_keywords argument '
+                              'must be True and a model instance must be provided.')
+
+     def _create_documents(self) -> bool:
+         """Convert each extracted page into a ``Document`` object.
+
+         Returns:
+             bool: True if the pages are loaded as ``Document`` objects
+             successfully. Otherwise False.
+
+         """
+         self._docs = [Document(page_content=page.content,
+                                metadata={'source': self._p.doc.basename,
+                                          'pageno': page.pageno})
+                       for page in self._p.doc.pages if page.hastext]
+         if not self._docs:
+             msg = f'{_PRE_WARN} Text could not be parsed from {self._p.doc.basename}.'
+             ui.print_warning(msg)
+             return False
+         return True
+
+     def _get_keywords(self) -> str:
+         """Query the document (using the LLM) to extract the keywords."""
+         # pylint: disable=line-too-long
+         print('- Extracting keywords ...')
+         qry = ('List the important keywords which can be used to summarize this '
+                f'document: "{self._fbase}". Use only phrases which are found in the document.')
+         # Suppress stdout.
+         with contextlib.redirect_stdout(None):
+             nids = len(self._dbo.get(where={'source': self._fbase})['ids'])
+             # Retrieve ~10% of the document's records, bounded to [25, 50],
+             # and never more than the total number of records.
+             filter_ = {'k': min(nids, max(25, min(nids//10, 50))),
+                        'filter': {'source': {'$eq': self._fbase}}}
+             # TODO: Replace this with the module.class.method once created.
+             qa = RetrievalQA.from_chain_type(llm=self._llm,
+                                              chain_type="stuff",
+                                              retriever=self._dbo.as_retriever(search_kwargs=filter_),
+                                              return_source_documents=True,
+                                              verbose=True)
+             resp = qa.invoke(qry)
+             kwds = Tools.parse_to_keywords(resp=resp['result'])
+         return kwds
+
+     def _load(self, path: str, **kwargs):
+         """Load the selected files into the vector store.
+
+         Args:
+             path (str): Full path to the file to be loaded.
+
+         :Keyword Arguments:
+             Those passed from the loader-specific ``load`` method.
+
+         """
+         # pylint: disable=multiple-statements
+         self._fpath = path
+         self._fbase = os.path.basename(path)
+         s = self._set_parser()
+         if s: s = self._set_text_splitter()
+         if s: s = self._parse_text(**kwargs)
+         if s: s = self._create_documents()
+         if s: s = self._split_texts()
+         if s: s = self._load_worker()
+         if s and self._load_keywords and self._llm:
+             kwds = self._get_keywords()
+             s = self._store_keywords(kwds=kwds)
+         self._print_summary(success=s)
+
+     def _load_worker(self) -> bool:
+         """Load the split documents into the database collection.
+
+         Returns:
+             bool: True if loaded successfully, otherwise False. Success
+             is based on the number of records after the load being
+             greater than the number of records before the load, and no
+             exceptions being raised.
+
+         """
+         try:
+             print('- Loading the document into the database ...')
+             nrecs_b = self._dbo.collection.count()  # Count records before.
+             self._dbo.add_documents(self._docss)
+             nrecs_a = self._dbo.collection.count()  # Count records after.
+             return self._test_load(nrecs_b=nrecs_b, nrecs_a=nrecs_a)
+         except chromadberrors.DuplicateIDError:
+             print('-- Document already loaded; duplicate detected.')
+             return False  # Prevent from loading keywords.
+         except Exception as err:
+             reporterror(err)
+             return False
+
+     def _parse_text(self, **kwargs) -> bool:
+         """Parse text from the document.
+
+         :Keyword Arguments:
+             Those to be passed into the text extraction method.
+
+         Returns:
+             bool: True if the parser's 'text' object is populated,
+             otherwise False.
+
+         """
+         print('- Extracting text ...')
+         self._p.extract_text(**kwargs)
+         if len(self._p.doc.pages) < 2:
+             ui.print_warning(f'No text extracted from {self._p.doc.basename}')
+             return False
+         return True
+
+     @staticmethod
+     def _print_summary(success: bool):
+         """Print an end of processing summary.
+
+         Args:
+             success (bool): Success flag from the processor.
+
+         """
+         if success:
+             print('Processing complete. Success.')
+         else:
+             print('Processing aborted due to error. Failure.')
+
+     def _set_db_client(self) -> bool:
+         """Set the database client object.
+
+         If the ``_dbpath`` attribute is a string, it is inferred as the
+         *path* to the database. Otherwise, it is inferred as the
+         database object itself.
+
+         Returns:
+             bool: True if the database object is set without error.
+             Otherwise False.
+
+         """
+         try:
+             if isinstance(self._dbpath, str):
+                 self._dbo = ChromaDB(path=self._dbpath,
+                                      collection=self._cname,
+                                      offline=self._offline)
+             else:
+                 self._dbo = self._dbpath
+         except Exception as err:
+             reporterror(err)
+             return False
+         return True
+
+     def _set_parser(self) -> bool:
+         """Set the appropriate document parser.
+
+         :Rationale:
+             The parser is set by the file extension. For example, a
+             file extension ``.pdf`` will set the
+             :class:`parsers.pdfparser.PDFParser` class.
+
+         Returns:
+             bool: True if a parser appropriate to the file extension
+             was found. Otherwise, False.
+
+         """
+         # pylint: disable=invalid-name  # OK as the variable (Parser) is a class.
+         # TODO: Update this to use the (not-yet-available) ispdf utility
+         #       function, rather than relying on the file extension.
+         ext = os.path.splitext(self._fpath)[1]
+         Parser = self._PARSERS.get(ext)
+         if not Parser:
+             msg = f'{_PRE_WARN} Document parser not set for {os.path.basename(self._fpath)}.'
+             ui.print_warning(msg)
+             return False
+         self._p = Parser(path=self._fpath)
+         return True
+
+     # TODO: Add these to a config file.
+     def _set_text_splitter(self) -> bool:
+         """Define the text splitter to be used.
+
+         Returns:
+             bool: True, always.
+
+         """
+         self._splitter = RecursiveCharacterTextSplitter(chunk_size=256,
+                                                         chunk_overlap=25,
+                                                         separators=['\n\n\n', '\n\n', '\n', ' '])
+         return True
+
+     def _split_texts(self) -> bool:
+         """Split the document text using a recursive text splitter.
+
+         Returns:
+             bool: True if the text was split successfully, otherwise
+             False.
+
+         """
+         self._docss = self._splitter.split_documents(self._docs)
+         if not self._docss:
+             msg = (f'{_PRE_ERR} An error occurred while splitting the documents for '
+                    f'{self._p.doc.basename}.')
+             ui.print_warning(msg)
+             return False
+         return True
+
+     def _store_keywords(self, kwds: str) -> bool:
+         """Store the extracted keywords into the keywords collection.
+
+         Args:
+             kwds (str): A string containing the keywords extracted from
+                 the document.
+
+         Returns:
+             bool: True if loaded successfully, otherwise False.
+
+         """
+         print('- Storing keywords ...')
+         db = ChromaDB(path=self._dbo.path, collection=f'{self._cname}-kwds', offline=self._offline)
+         nrecs_b = db.collection.count()  # Count records before.
+         docs = [Document(page_content=kwds, metadata={'source': self._fbase})]
+         db.add_documents(docs)
+         nrecs_a = db.collection.count()  # Count records after.
+         return 1 == nrecs_a - nrecs_b
+
+     def _test_load(self, nrecs_b: int, nrecs_a: int) -> bool:
+         """Test the document was loaded successfully.
+
+         :Test:
+             - Given a count of records before the load, verify the
+               number of records after the load is equal to the number
+               of records before, plus the number of split documents.
+
+         Args:
+             nrecs_b (int): Number of records *before* the load.
+             nrecs_a (int): Number of records *after* the load.
+
+         Returns:
+             bool: True if the number of records before the load plus
+             the number of splits is equal to the number of records
+             after the load.
+
+         """
+         if nrecs_a == nrecs_b:
+             ui.print_warning(f'{_PRE_WARN} No new documents added. Possibly already loaded?')
+         return nrecs_a == nrecs_b + len(self._docss)
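The public ``load`` entry point is left to loader-specific subclasses, such as the ``ChromaLoader`` surfaced by ``__init__.py`` (not shown in this diff). A minimal sketch of how a subclass might drive the pipeline; the subclass name and its ``load`` signature are hypothetical and are not the package's actual ``ChromaLoader``:

    class _DemoLoader(_ChromaBaseLoader):
        # Hypothetical loader, used only to illustrate the base pipeline.

        def load(self, path: str, **kwargs):
            # _load runs: set parser -> set splitter -> parse text ->
            # create Documents -> split -> load (-> optional keywords).
            self._load(path=path, **kwargs)

    loader = _DemoLoader(dbpath='./db', collection='docs')
    loader.load('/path/to/sample.pdf')

Each pipeline stage returns a boolean, so a failure at any step (unsupported extension, no text extracted, duplicate IDs) short-circuits the chain and is reported by ``_print_summary``.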