docp-0.1.0b1-py3-none-any.whl

docp/__init__.py ADDED
@@ -0,0 +1,31 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ :Purpose: This module provides the project initialisation logic.
+
+ :Platform: Linux/Windows | Python 3.10+
+ :Developer: J Berendt
+ :Email: development@s3dev.uk
+
+ :Comments: n/a
+
+ """
+
+ import os
+ import sys
+ sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
+
+ # Bring entry-points to the surface.
+ try:
+     from loaders.chroma import ChromaLoader
+ except ImportError as err:
+     # The chroma loader pulls in a heavy backend stack which is not required for the parser alone.
+     msg = f'An error occurred while importing the Chroma loader:\n- {err}'
+     raise ImportError(msg) from err
+
+ try:
+     from .parsers.pdfparser import PDFParser
+     from ._version import __version__
+ except ImportError:
+     from parsers.pdfparser import PDFParser
+     from _version import __version__
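
With the package installed, the surfaced entry points can be driven directly from the top-level namespace. A minimal sketch; the document path is hypothetical, and the ``ChromaLoader`` signature is assumed from the base loader class shown later in this listing:

import docp

print(docp.__version__)                                 # -> '0.1.0b1'
parser = docp.PDFParser(path='/path/to/sample.pdf')     # Hypothetical path.
loader = docp.ChromaLoader(dbpath='/path/to/chroma',    # Assumed signature.
                           collection='docs')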
docp/_version.py ADDED
@@ -0,0 +1 @@
+ __version__ = '0.1.0b1'
docp/dbs/__init__.py ADDED
File without changes
docp/dbs/chroma.py ADDED
@@ -0,0 +1,184 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ :Purpose: This module provides a localised wrapper and specialised
+           functionality around the
+           ``langchain_community.vectorstores.Chroma`` class, for
+           interacting with a Chroma database.
+
+ :Platform: Linux/Windows | Python 3.10+
+ :Developer: J Berendt
+ :Email: development@s3dev.uk
+
+ :Comments: n/a
+
+ """
+ # pylint: disable=wrong-import-order
+
+ import chromadb
+ import os
+ import torch
+ from glob import glob
+ from hashlib import md5
+ from langchain_huggingface import HuggingFaceEmbeddings
+ # langchain's Chroma is used rather than the base chromadb as it provides
+ # the add_texts method, which supports GPU processing and parallelisation.
+ from langchain_community.vectorstores import Chroma as _Chroma
+
+
+ class ChromaDB(_Chroma):
+     """Wrapper class around the ``chromadb`` library.
+
+     Args:
+         path (str): Path to the chroma database's *directory*.
+         collection (str): Collection name.
+         offline (bool, optional): Remain offline, using the cached
+             embedding function model rather than obtaining one online.
+             Defaults to False.
+     """
+     # pylint: disable=line-too-long
+
+     _MODEL_CACHE = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), '.cache')
+     # Installing torch is a huge overhead, just for this. However, torch
+     # will already be installed as part of the sentence-transformers library,
+     # so we'll use it here.
+     _MODEL_KWARGS = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
+     # TODO: Add this to a config file.
+     _MODEL_NAME = 'all-MiniLM-L6-v2'
+
+     def __init__(self, path: str, collection: str, offline: bool=False):
+         """Chroma database class initialiser."""
+         self._path = os.path.realpath(path)
+         self._cname = collection
+         self._offline = offline
+         self._client = None  # Database 'client' object.
+         self._dbc = None     # Database 'collection' object.
+         self._set_client()
+         self._set_embedding_fn()
+         super().__init__(client=self._client,
+                          collection_name=self._cname,
+                          embedding_function=self._embfn,
+                          persist_directory=self._path)
+         self._set_collection()
+
+     @property
+     def client(self):
+         """Accessor to the :class:`chromadb.PersistentClient` class."""
+         return self._client
+
+     @property
+     def collection(self):
+         """Accessor to the chromadb client's collection object."""
+         return self._dbc
+
+     @property
+     def embedding_function(self):
+         """Accessor to the embedding function used."""
+         return self._embfn
+
+     @property
+     def path(self) -> str:
+         """Accessor to the database's path."""
+         return self._path
+
+     def add_documents(self, docs: list):
+         """Add multiple documents to the collection.
+
+         This method wraps the ``Chroma.add_texts`` method, which supports
+         GPU processing and parallelisation. The ID is derived locally
+         from the file's basename, page number and page content.
+
+         Args:
+             docs (list): A list of ``langchain_core.documents.base.Document``
+                 document objects.
+
+         """
+         # This method overrides the base class' add_documents method.
+         # pylint: disable=arguments-differ
+         # pylint: disable=arguments-renamed
+         if not isinstance(docs, list):
+             docs = [docs]
+         ids_, docs_, meta_ = self._preproc(docs=docs)
+         self.add_texts(ids=ids_, texts=docs_, metadatas=meta_)
+
+     def show_all(self):
+         """Return the entire contents of the collection.
+
+         This is an alias for ``.collection.get()``.
+
+         """
+         return self._dbc.get()
+
+     def _get_embedding_function_model(self) -> str:
+         """Derive the path to the embedding function model.
+
+         :Note:
+             If ``offline=True`` was passed into the class constructor,
+             the model cache is used if available; otherwise an error
+             is raised.
+
+             If online usage is allowed, the model is obtained by the
+             means defined by the embedding function constructor.
+
+         Returns:
+             str: The name of the model or, if offline, the path to the
+             model's cache, to be passed into the embedding function
+             constructor.
+
+         """
+         if self._offline:
+             if not os.path.exists(self._MODEL_CACHE):
+                 os.makedirs(self._MODEL_CACHE)
+                 msg = ('Offline mode has been chosen, yet the embedding function model cache does not exist. '
+                        'Therefore, a model must be downloaded. Please enable online usage for the first run '
+                        'so a model can be downloaded and stored into the cache for future (offline) use.')
+                 raise FileNotFoundError(msg)
+             # Find the cache directory containing the named model; this enables offline use.
+             model_loc = os.path.commonpath(filter(lambda x: 'config.json' in x,
+                                                   glob(os.path.join(self._MODEL_CACHE,
+                                                                     f'*{self._MODEL_NAME}*',
+                                                                     '**'),
+                                                        recursive=True)))
+             return model_loc
+         return self._MODEL_NAME
+
+     @staticmethod
+     def _preproc(docs: list):
+         """Pre-process the document objects to create the IDs.
+
+         Parse the ``Document`` object into its parts for storage.
+         Additionally, create the ID as a hash of the source document's
+         basename, page number and content.
+
+         """
+         ids = []
+         txts = []
+         metas = []
+         for doc in docs:
+             pc = doc.page_content
+             m = doc.metadata
+             pc_, src_ = map(str.encode, (pc, m['source']))
+             pg_ = str(m.get('pageno', 0)).zfill(4)
+             id_ = f'id_{md5(src_).hexdigest()}_{pg_}_{md5(pc_).hexdigest()}'
+             ids.append(id_)
+             txts.append(pc)
+             metas.append(m)
+         return ids, txts, metas
+
+     def _set_client(self):
+         """Set the database client object."""
+         settings = chromadb.Settings(anonymized_telemetry=False)
+         self._client = chromadb.PersistentClient(path=self._path,
+                                                  settings=settings)
+
+     def _set_collection(self):
+         """Set the database collection object."""
+         self._dbc = self._client.get_or_create_collection(self._cname,
+                                                           metadata={'hnsw:space': 'cosine'})
+
+     def _set_embedding_fn(self):
+         """Set the embeddings function object."""
+         model_name = self._get_embedding_function_model()
+         self._embfn = HuggingFaceEmbeddings(model_name=model_name,
+                                             model_kwargs=self._MODEL_KWARGS,
+                                             cache_folder=self._MODEL_CACHE)
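
Because the record IDs are deterministic hashes of the source basename, page number and page content, re-adding the same page raises a duplicate-ID error rather than silently double-loading. A minimal usage sketch; the path and document content are hypothetical, and with the default ``offline=False`` the embedding model is fetched and cached on first use:

from langchain.docstore.document import Document
from docp.dbs.chroma import ChromaDB

db = ChromaDB(path='/path/to/chroma', collection='docs')
doc = Document(page_content='Example page text.',
               metadata={'source': 'sample.pdf', 'pageno': 1})
db.add_documents([doc])        # ID: 'id_<md5(source)>_0001_<md5(content)>'
print(db.collection.count())   # Number of records in the collection.
print(db.show_all()['ids'])    # All record IDs.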
File without changes
@@ -0,0 +1,362 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ :Purpose: This module provides functionality to parse and store
+           document data into a Chroma vector database.
+
+ :Platform: Linux/Windows | Python 3.10+
+ :Developer: J Berendt
+ :Email: development@s3dev.uk
+
+ :Comments: n/a
+
+ """
+ # pylint: disable=no-name-in-module  # langchain.chains.RetrievalQA
+
+ import contextlib
+ import os
+ import re
+ from chromadb.api.types import errors as chromadberrors
+ from langchain.chains import RetrievalQA
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from utils4.reporterror import reporterror
+ from utils4.user_interface import ui
+ # locals
+ try:
+     from .dbs.chroma import ChromaDB
+     from .parsers.pdfparser import PDFParser
+ except ImportError:
+     from dbs.chroma import ChromaDB
+     from parsers.pdfparser import PDFParser
+
+ _PRE_ERR = '\n[ERROR]:'
+ _PRE_WARN = '\n[WARNING]:'
+
+
+ class Tools:
+     """General tools used for loading documents."""
+
+     @staticmethod
+     def parse_to_keywords(resp: str) -> str:
+         """Parse the bot's response into a string of keywords.
+
+         Args:
+             resp (str): Text response directly from the bot.
+
+         Returns:
+             str: A comma-separated string of the keywords extracted
+             from the response.
+
+         """
+         # Capture asterisk bullet points or a numbered list.
+         rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)\n')
+         # Translate hyphens (45) and slashes (47) to spaces before matching.
+         trans = {45: ' ', 47: ' '}
+         resp_ = resp.translate(trans).lower()
+         kwds = rexp.findall(resp_)
+         if kwds:
+             return ', '.join(kwds)
+         return ''
+
+
+ class _ChromaBaseLoader:
+     """Base class for loading documents into a Chroma vector database.
+
+     Args:
+         dbpath (str | ChromaDB): Either the full path to the Chroma
+             database *directory*, or an instance of a
+             :class:`~dbs.chroma.ChromaDB` database. If the instance is
+             passed, the ``collection`` argument is ignored.
+         collection (str, optional): Name of the Chroma database
+             collection. Only required if the ``dbpath`` parameter is a
+             path. Defaults to None.
+         load_keywords (bool, optional): Extract and store keywords for
+             each loaded document. Requires an ``llm`` instance.
+             Defaults to False.
+         llm (object, optional): LLM instance used for keyword
+             extraction. Defaults to None.
+         offline (bool, optional): Remain offline and use the locally
+             cached embedding function model. Defaults to False.
+
+     """
+
+     _PARSERS = {'.pdf': PDFParser}
+
+
+     def __init__(self,
+                  dbpath: str | ChromaDB,
+                  collection: str=None,
+                  *,
+                  load_keywords: bool=False,
+                  llm: object=None,
+                  offline: bool=False):
+         """Chroma database class initialiser."""
+         self._dbpath = dbpath
+         self._cname = collection
+         self._load_keywords = load_keywords
+         self._llm = llm
+         self._offline = offline
+         self._dbo = None       # Database object.
+         self._docs = []        # List of 'Document' objects.
+         self._docss = []       # List of 'Document' objects *with splits*.
+         self._fbase = None     # Basename of the document currently being loaded.
+         self._fpath = None     # Full path to the document currently being loaded.
+         self._p = None         # Document parser object.
+         self._splitter = None  # Text splitter.
+         self._set_db_client()
+         self._check_parameters()
+
+     @property
+     def chroma(self):
+         """Accessor to the database client object."""
+         return self._dbo
+
+     @property
+     def parser(self):
+         """Accessor to the document parser object."""
+         return self._p
+
+     def _check_parameters(self) -> None:
+         """Verify the class parameters are viable.
+
+         Raises:
+             ValueError: If the ``load_keywords`` argument is True and
+                 the ``llm`` argument is None, or the inverse. The two
+                 arguments must be provided together, or not at all.
+
+         """
+         if sum((self._load_keywords, self._llm is not None)) not in (0, 2):
+             raise ValueError('For keyword loading, the load_keywords argument '
+                              'must be True and a model instance must be provided.')
+
+     def _create_documents(self) -> bool:
+         """Convert each extracted page into a ``Document`` object.
+
+         Returns:
+             bool: True if the pages are loaded as ``Document`` objects
+             successfully. Otherwise, False.
+
+         """
+         self._docs = [Document(page_content=page.content,
+                                metadata={'source': self._p.doc.basename,
+                                          'pageno': page.pageno})
+                       for page in self._p.doc.pages if page.hastext]
+         if not self._docs:
+             msg = f'{_PRE_WARN} Text could not be parsed from {self._p.doc.basename}.'
+             ui.print_warning(msg)
+             return False
+         return True
+
+     def _get_keywords(self) -> str:
+         """Query the document (using the LLM) to extract the keywords."""
+         # pylint: disable=line-too-long
+         print('- Extracting keywords ...')
+         qry = ('List the important keywords which can be used to summarize this '
+                f'document: "{self._fbase}". Use only phrases which are found in the document.')
+         # Suppress stdout.
+         with contextlib.redirect_stdout(None):
+             nids = len(self._dbo.get(where={'source': self._fbase})['ids'])
+             # k is 10% of the document's records, floored at 25 and capped at 50,
+             # never exceeding the total record count. For example, 400 records
+             # gives k = min(400, max(25, min(40, 50))) = 40.
+             filter_ = {'k': min(nids, max(25, min(nids//10, 50))),
+                        'filter': {'source': {'$eq': self._fbase}}}
+             # TODO: Replace this with the module.class.method once created.
+             qa = RetrievalQA.from_chain_type(llm=self._llm,
+                                              chain_type="stuff",
+                                              retriever=self._dbo.as_retriever(search_kwargs=filter_),
+                                              return_source_documents=True,
+                                              verbose=True)
+             resp = qa.invoke(qry)
+             kwds = Tools.parse_to_keywords(resp=resp['result'])
+         return kwds
+
+     def _load(self, path: str, **kwargs):
+         """Load the selected files into the vector store.
+
+         Args:
+             path (str): Full path to the file to be loaded.
+
+         :Keyword Arguments:
+             Those passed from the loader-specific ``load`` method.
+
+         """
+         # pylint: disable=multiple-statements
+         self._fpath = path
+         self._fbase = os.path.basename(path)
+         s = self._set_parser()
+         if s: s = self._set_text_splitter()
+         if s: s = self._parse_text(**kwargs)
+         if s: s = self._create_documents()
+         if s: s = self._split_texts()
+         if s: s = self._load_worker()
+         if s and self._load_keywords and self._llm:
+             kwds = self._get_keywords()
+             s = self._store_keywords(kwds=kwds)
+         self._print_summary(success=s)
+
+     def _load_worker(self) -> bool:
+         """Load the split documents into the database collection.
+
+         Returns:
+             bool: True if loaded successfully, otherwise False. Success
+             is based on the number of records after the load being
+             greater than the number of records before the load, and no
+             exceptions being raised.
+
+         """
+         try:
+             print('- Loading the document into the database ...')
+             nrecs_b = self._dbo.collection.count()  # Count records before.
+             self._dbo.add_documents(self._docss)
+             nrecs_a = self._dbo.collection.count()  # Count records after.
+             return self._test_load(nrecs_b=nrecs_b, nrecs_a=nrecs_a)
+         except chromadberrors.DuplicateIDError:
+             print('-- Document already loaded; duplicate detected.')
+             return False  # Prevent the keywords from being loaded.
+         except Exception as err:
+             reporterror(err)
+             return False
+
+     def _parse_text(self, **kwargs) -> bool:
+         """Parse text from the document.
+
+         :Keyword Arguments:
+             Those to be passed into the text extraction method.
+
+         Returns:
+             bool: True if the parser's 'text' object is populated,
+             otherwise False.
+
+         """
+         print('- Extracting text ...')
+         self._p.extract_text(**kwargs)
+         # Fewer than two pages implies no text was extracted, as the first
+         # element of the parser's page list is a placeholder.
+         if len(self._p.doc.pages) < 2:
+             ui.print_warning(f'No text extracted from {self._p.doc.basename}')
+             return False
+         return True
+
+     @staticmethod
+     def _print_summary(success: bool):
+         """Print an end of processing summary.
+
+         Args:
+             success (bool): Success flag from the processor.
+
+         """
+         if success:
+             print('Processing complete. Success.')
+         else:
+             print('Processing aborted due to error. Failure.')
+
+     def _set_db_client(self) -> bool:
+         """Set the database client object.
+
+         If the ``_dbpath`` object is a string, this is inferred as the
+         *path* to the database. Otherwise, it is inferred as the
+         database object itself.
+
+         Returns:
+             bool: True if the database object is set without error.
+             Otherwise, False.
+
+         """
+         try:
+             if isinstance(self._dbpath, str):
+                 self._dbo = ChromaDB(path=self._dbpath,
+                                      collection=self._cname,
+                                      offline=self._offline)
+             else:
+                 self._dbo = self._dbpath
+         except Exception as err:
+             reporterror(err)
+             return False
+         return True
+
+     def _set_parser(self) -> bool:
+         """Set the appropriate document parser.
+
+         :Rationale:
+             The parser is set by the file extension. For example, a file
+             extension ``.pdf`` will set the
+             :class:`parsers.pdfparser.PDFParser` class.
+
+         Returns:
+             bool: True if a parser appropriate to the file extension
+             was found. Otherwise, False.
+
+         """
+         # pylint: disable=invalid-name  # OK as the variable (Parser) is a class.
+         # TODO: Update this to use the (not-yet-available) ispdf utility
+         #       function, rather than relying on the file extension.
+         ext = os.path.splitext(self._fpath)[1]
+         Parser = self._PARSERS.get(ext)
+         if not Parser:
+             msg = f'{_PRE_WARN} Document parser not set for {os.path.basename(self._fpath)}.'
+             ui.print_warning(msg)
+             return False
+         self._p = Parser(path=self._fpath)
+         return True
+
+     # TODO: Add these to a config file.
+     def _set_text_splitter(self) -> bool:
+         """Define the text splitter to be used.
+
+         Returns:
+             bool: True, always.
+
+         """
+         self._splitter = RecursiveCharacterTextSplitter(chunk_size=256,
+                                                         chunk_overlap=25,
+                                                         separators=['\n\n\n', '\n\n', '\n', ' '])
+         return True
+
+     def _split_texts(self) -> bool:
+         """Split the document text using a recursive text splitter.
+
+         Returns:
+             bool: True if the text was split successfully, otherwise
+             False.
+
+         """
+         self._docss = self._splitter.split_documents(self._docs)
+         if not self._docss:
+             msg = (f'{_PRE_ERR} An error occurred while splitting the documents for '
+                    f'{self._p.doc.basename}.')
+             ui.print_warning(msg)
+             return False
+         return True
+
+     def _store_keywords(self, kwds: str) -> bool:
+         """Store the extracted keywords into the keywords collection.
+
+         Args:
+             kwds (str): A string containing the keywords extracted from
+                 the document.
+
+         Returns:
+             bool: True if loaded successfully, otherwise False.
+
+         """
+         print('- Storing keywords ...')
+         db = ChromaDB(path=self._dbo.path, collection=f'{self._cname}-kwds', offline=self._offline)
+         nrecs_b = db.collection.count()  # Count records before.
+         docs = [Document(page_content=kwds, metadata={'source': self._fbase})]
+         db.add_documents(docs)
+         nrecs_a = db.collection.count()  # Count records after.
+         return nrecs_a - nrecs_b == 1
+
+     def _test_load(self, nrecs_b: int, nrecs_a: int) -> bool:
+         """Test the document was loaded successfully.
+
+         :Test:
+             - Given a count of records before the load, verify the
+               number of records after the load is equal to the number
+               of records before, plus the number of split documents.
+
+         Args:
+             nrecs_b (int): Number of records *before* the load.
+             nrecs_a (int): Number of records *after* the load.
+
+         Returns:
+             bool: True if the number of records before the load, plus
+             the number of splits, is equal to the number of records
+             after the load.
+
+         """
+         if nrecs_a == nrecs_b:
+             ui.print_warning(f'{_PRE_WARN} No new documents added. Possibly already loaded?')
+         return nrecs_a == nrecs_b + len(self._docss)
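
The concrete ``ChromaLoader`` imported by ``docp/__init__.py`` does not appear in this listing; it builds on this base class. A minimal, hypothetical sketch of such a subclass (not the packaged implementation) exposing the protected pipeline:

class ChromaLoader(_ChromaBaseLoader):
    """Illustrative loader built on _ChromaBaseLoader."""

    def load(self, path: str, **kwargs):
        # Run the full pipeline: set the parser and splitter, extract the
        # text, create and split the documents, then load them (and,
        # optionally, the keywords) into the collection.
        self._load(path=path, **kwargs)

loader = ChromaLoader(dbpath='/path/to/chroma', collection='docs')
loader.load(path='/path/to/sample.pdf')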