docp-0.1.0b1-py3-none-any.whl
- docp/__init__.py +31 -0
- docp/_version.py +1 -0
- docp/dbs/__init__.py +0 -0
- docp/dbs/chroma.py +184 -0
- docp/loaders/__init__.py +0 -0
- docp/loaders/_chromabaseloader.py +362 -0
- docp/loaders/chroma.py +166 -0
- docp/objects/__init__.py +0 -0
- docp/objects/_docbaseobject.py +76 -0
- docp/objects/_pageobject.py +126 -0
- docp/objects/_tableobject.py +0 -0
- docp/objects/_textobject.py +0 -0
- docp/objects/pdfobject.py +39 -0
- docp/parsers/__init__.py +0 -0
- docp/parsers/_pdfbaseparser.py +210 -0
- docp/parsers/_pdftableparser.py +273 -0
- docp/parsers/_pdftextparser.py +253 -0
- docp/parsers/pdfparser.py +62 -0
- docp-0.1.0b1.dist-info/LICENSE +622 -0
- docp-0.1.0b1.dist-info/METADATA +55 -0
- docp-0.1.0b1.dist-info/RECORD +23 -0
- docp-0.1.0b1.dist-info/WHEEL +5 -0
- docp-0.1.0b1.dist-info/top_level.txt +1 -0
docp/__init__.py
ADDED
@@ -0,0 +1,31 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the project initialisation logic.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""

import os
import sys
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))

# Bring entry-points to the surface.
try:
    from loaders.chroma import ChromaLoader
except ImportError as err:
    # The chroma loader requires a lot of backend which is not required for the parser.
    msg = f'An error occurred while importing the Chroma loader:\n- {err}'
    raise ImportError(msg) from err

try:
    from .parsers.pdfparser import PDFParser
    from ._version import __version__
except ImportError:
    from parsers.pdfparser import PDFParser
    from _version import __version__
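For orientation, here is a minimal usage sketch of the surface this ``__init__`` exposes. The file path and database directory are placeholders, and the ``ChromaLoader`` call assumes its constructor mirrors the ``_ChromaBaseLoader`` initialiser shown later in this diff; only ``PDFParser(path=...)``, ``extract_text()`` and ``__version__`` are confirmed by the sources below.

import docp

print(docp.__version__)                           # '0.1.0b1'

# Parse a PDF (no vector-store backend is needed for this path).
parser = docp.PDFParser(path='/tmp/sample.pdf')   # placeholder file
parser.extract_text()

# Load a document into a Chroma collection (assumed constructor,
# mirroring _ChromaBaseLoader(dbpath, collection, ...)).
loader = docp.ChromaLoader(dbpath='/tmp/chroma', collection='docs')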
docp/_version.py
ADDED
@@ -0,0 +1 @@
__version__ = '0.1.0b1'
docp/dbs/__init__.py
ADDED
File without changes
docp/dbs/chroma.py
ADDED
@@ -0,0 +1,184 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides a localised wrapper and specialised
            functionality around the
            ``langchain_community.vectorstores.Chroma`` class, for
            interacting with a Chroma database.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""
# pylint: disable=wrong-import-order

import chromadb
import os
import torch
from glob import glob
from hashlib import md5
from langchain_huggingface import HuggingFaceEmbeddings
# langchain's Chroma is used rather than the base chromadb as it provides
# the add_texts method, which supports GPU processing and parallelisation.
from langchain_community.vectorstores import Chroma as _Chroma


class ChromaDB(_Chroma):
    """Wrapper class around the ``chromadb`` library.

    Args:
        path (str): Path to the chroma database's *directory*.
        collection (str): Collection name.
        offline (bool, optional): Remain offline, using the cached
            embedding function model rather than obtaining one online.
            Defaults to False.

    """
    # pylint: disable=line-too-long

    _MODEL_CACHE = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), '.cache')
    # Installing torch is a huge overhead, just for this. However, torch
    # will already be installed as part of the sentence-transformers library,
    # so we'll use it here.
    _MODEL_KWARGS = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
    # TODO: Add this to a config file.
    _MODEL_NAME = 'all-MiniLM-L6-v2'

    def __init__(self, path: str, collection: str, offline: bool=False):
        """Chroma database class initialiser."""
        self._path = os.path.realpath(path)
        self._cname = collection
        self._offline = offline
        self._client = None  # Database 'client' object.
        self._dbc = None     # Database 'collection' object.
        self._set_client()
        self._set_embedding_fn()
        super().__init__(client=self._client,
                         collection_name=self._cname,
                         embedding_function=self._embfn,
                         persist_directory=self._path)
        self._set_collection()

    @property
    def client(self):
        """Accessor to the :class:`chromadb.PersistentClient` class."""
        return self._client

    @property
    def collection(self):
        """Accessor to the chromadb client's collection object."""
        return self._dbc

    @property
    def embedding_function(self):
        """Accessor to the embedding function used."""
        return self._embfn

    @property
    def path(self) -> str:
        """Accessor to the database's path."""
        return self._path

    def add_documents(self, docs: list):
        """Add multiple documents to the collection.

        This method wraps the ``Chroma.add_texts`` method, which supports
        GPU processing and parallelisation. The ID is derived locally from
        the file's basename, page number and page content.

        Args:
            docs (list): A list of ``langchain_core.documents.base.Document``
                document objects.

        """
        # This method overrides the base class' add_documents method.
        # pylint: disable=arguments-differ
        # pylint: disable=arguments-renamed
        if not isinstance(docs, list):
            docs = [docs]
        ids_, docs_, meta_ = self._preproc(docs=docs)
        self.add_texts(ids=ids_, texts=docs_, metadatas=meta_)

    def show_all(self):
        """Return the entire contents of the collection.

        This is an alias around ``.collection.get()``.

        """
        return self._dbc.get()

    def _get_embedding_function_model(self) -> str:
        """Derive the path to the embedding function model.

        :Note:
            If ``offline=True`` was passed into the class constructor,
            the model cache is used, if available - otherwise an error
            is raised.

            If online usage is allowed, the model is obtained by the
            means defined by the embedding function constructor.

        Returns:
            str: The name of the model. Or, if offline, the path to the
            model's cache to be passed into the embedding function
            constructor is returned.

        """
        if self._offline:
            if not os.path.exists(self._MODEL_CACHE):
                os.makedirs(self._MODEL_CACHE)
                msg = ('Offline mode has been chosen, yet the embedding function model cache does not exist. '
                       'Therefore, a model must be downloaded. Please enable online usage for the first run '
                       'so a model can be downloaded and stored into the cache for future (offline) use.')
                raise FileNotFoundError(msg)
            # Find the cache directory containing the named model; this enables offline use.
            model_loc = os.path.commonpath(filter(lambda x: 'config.json' in x,
                                                  glob(os.path.join(self._MODEL_CACHE,
                                                                    f'*{self._MODEL_NAME}*',
                                                                    '**'),
                                                       recursive=True)))
            return model_loc
        return self._MODEL_NAME

    @staticmethod
    def _preproc(docs: list):
        """Pre-process the document objects to create the IDs.

        Parse the ``Document`` object into its parts for storage.
        Additionally, create the ID as a hash of the source document's
        basename, page number and content.

        """
        ids = []
        txts = []
        metas = []
        for doc in docs:
            pc = doc.page_content
            m = doc.metadata
            pc_, src_ = map(str.encode, (pc, m['source']))
            pg_ = str(m.get('pageno', 0)).zfill(4)
            id_ = f'id_{md5(src_).hexdigest()}_{pg_}_{md5(pc_).hexdigest()}'
            ids.append(id_)
            txts.append(pc)
            metas.append(m)
        return ids, txts, metas

    def _set_client(self):
        """Set the database client object."""
        settings = chromadb.Settings(anonymized_telemetry=False)
        self._client = chromadb.PersistentClient(path=self._path,
                                                 settings=settings)

    def _set_collection(self):
        """Set the database collection object."""
        self._dbc = self._client.get_or_create_collection(self._cname,
                                                          metadata={'hnsw:space': 'cosine'})

    def _set_embedding_fn(self):
        """Set the embeddings function object."""
        model_name = self._get_embedding_function_model()
        self._embfn = HuggingFaceEmbeddings(model_name=model_name,
                                            model_kwargs=self._MODEL_KWARGS,
                                            cache_folder=self._MODEL_CACHE)
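As a quick illustration of the wrapper above, this sketch creates a ``ChromaDB`` instance and adds a single ``Document``. The database directory, collection name and document content are placeholders, and the first run needs online access so the ``all-MiniLM-L6-v2`` embedding model can be downloaded into the package's ``.cache`` directory.

from langchain.docstore.document import Document
from docp.dbs.chroma import ChromaDB

# Placeholder directory and collection name.
db = ChromaDB(path='/tmp/chroma', collection='docs', offline=False)

doc = Document(page_content='Example page text.',
               metadata={'source': 'sample.pdf', 'pageno': 1})

# The record ID is derived as md5(source) + zero-padded page number +
# md5(page_content), so re-adding the same page is flagged as a
# duplicate rather than stored twice.
db.add_documents([doc])

print(db.collection.count())   # Record count in the collection.
print(db.show_all()['ids'])    # e.g. ['id_<md5>_0001_<md5>']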
docp/loaders/__init__.py
ADDED
File without changes
docp/loaders/_chromabaseloader.py
ADDED
@@ -0,0 +1,362 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides functionality to parse and store
            document data into a Chroma vector database.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""
# pylint: disable=no-name-in-module  # langchain.chains.RetrievalQA

import contextlib
import os
import re
from chromadb.api.types import errors as chromadberrors
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from utils4.reporterror import reporterror
from utils4.user_interface import ui
# locals
try:
    from .dbs.chroma import ChromaDB
    from .parsers.pdfparser import PDFParser
except ImportError:
    from dbs.chroma import ChromaDB
    from parsers.pdfparser import PDFParser

_PRE_ERR = '\n[ERROR]:'
_PRE_WARN = '\n[WARNING]:'


class Tools:
    """General tools used for loading documents."""

    @staticmethod
    def parse_to_keywords(resp: str) -> str:
        """Parse the bot's response into a string of keywords.

        Args:
            resp (str): Text response directly from the bot.

        Returns:
            str: A comma-separated string of keywords extracted from
            the response, which is expected to list them as asterisk
            or numbered bullet points.

        """
        # Capture asterisk bullet points or a numbered list.
        rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)\n')
        trans = {45: ' ', 47: ' '}
        resp_ = resp.translate(trans).lower()
        kwds = rexp.findall(resp_)
        if kwds:
            return ', '.join(kwds)
        return ''


class _ChromaBaseLoader:
    """Base class for loading documents into a Chroma vector database.

    Args:
        dbpath (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~dbs.chroma.ChromaDB` database. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``dbpath`` parameter is a
            path. Defaults to None.
        load_keywords (bool, optional): Extract keywords from the
            document (using the LLM) and store them alongside it.
            Defaults to False.
        llm (object, optional): LLM instance used for keyword
            extraction. Required if ``load_keywords`` is True.
            Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    """

    _PARSERS = {'.pdf': PDFParser}

    def __init__(self,
                 dbpath: str | ChromaDB,
                 collection: str=None,
                 *,
                 load_keywords: bool=False,
                 llm: object=None,
                 offline: bool=False):
        """Chroma database class initialiser."""
        self._dbpath = dbpath
        self._cname = collection
        self._load_keywords = load_keywords
        self._llm = llm
        self._offline = offline
        self._dbo = None       # Database object.
        self._docs = []        # List of 'Document' objects.
        self._docss = []       # List of 'Document' objects *with splits*.
        self._fbase = None     # Basename of the document currently being loaded.
        self._fpath = None     # Full path to the document currently being loaded.
        self._p = None         # Document parser object.
        self._splitter = None  # Text splitter.
        self._set_db_client()
        self._check_parameters()

    @property
    def chroma(self):
        """Accessor to the database client object."""
        return self._dbo

    @property
    def parser(self):
        """Accessor to the document parser object."""
        return self._p

    def _check_parameters(self) -> None:
        """Verify the class parameters are viable.

        Raises:
            ValueError: If the ``load_keywords`` argument is True and
                the ``llm`` argument is None, or the inverse. The two
                arguments must sum to either 0 or 2.

        """
        if sum((self._load_keywords, self._llm is not None)) not in (0, 2):
            raise ValueError('For keyword loading, the load_keywords argument '
                             'must be True and a model instance must be provided.')

    def _create_documents(self) -> bool:
        """Convert each extracted page into a ``Document`` object.

        Returns:
            bool: True if the pages are loaded as ``Document`` objects
            successfully, otherwise False.

        """
        self._docs = [Document(page_content=page.content,
                               metadata={'source': self._p.doc.basename,
                                         'pageno': page.pageno})
                      for page in self._p.doc.pages if page.hastext]
        if not self._docs:
            msg = f'{_PRE_WARN} Text could not be parsed from {self._p.doc.basename}.'
            ui.print_warning(msg)
            return False
        return True

    def _get_keywords(self) -> str:
        """Query the document (using the LLM) to extract the keywords."""
        # pylint: disable=line-too-long
        print('- Extracting keywords ...')
        qry = ('List the important keywords which can be used to summarize this '
               f'document: "{self._fbase}". Use only phrases which are found in the document.')
        # Suppress stdout.
        with contextlib.redirect_stdout(None):
            nids = len(self._dbo.get(where={'source': self._fbase})['ids'])
            # k: prefer 10% of the document's records, bounded to [25, 50],
            # and never more than the number of records.
            filter_ = {'k': min(nids, max(25, min(nids//10, 50))),
                       'filter': {'source': {'$eq': self._fbase}}}
            # TODO: Replace this with the module.class.method once created.
            qa = RetrievalQA.from_chain_type(llm=self._llm,
                                             chain_type='stuff',
                                             retriever=self._dbo.as_retriever(search_kwargs=filter_),
                                             return_source_documents=True,
                                             verbose=True)
            resp = qa.invoke(qry)
            kwds = Tools.parse_to_keywords(resp=resp['result'])
        return kwds

    def _load(self, path: str, **kwargs):
        """Load the selected file into the vector store.

        Args:
            path (str): Full path to the file to be loaded.

        :Keyword Arguments:
            Those passed from the loader-specific ``load`` method.

        """
        # pylint: disable=multiple-statements
        self._fpath = path
        self._fbase = os.path.basename(path)
        s = self._set_parser()
        if s: s = self._set_text_splitter()
        if s: s = self._parse_text(**kwargs)
        if s: s = self._create_documents()
        if s: s = self._split_texts()
        if s: s = self._load_worker()
        if s and self._load_keywords and self._llm:
            kwds = self._get_keywords()
            s = self._store_keywords(kwds=kwds)
        self._print_summary(success=s)

    def _load_worker(self) -> bool:
        """Load the split documents into the database collection.

        Returns:
            bool: True if loaded successfully, otherwise False. Success
            is based on the number of records after the load being
            greater than the number of records before the load, and no
            exceptions being raised.

        """
        try:
            print('- Loading the document into the database ...')
            nrecs_b = self._dbo.collection.count()  # Count records before.
            self._dbo.add_documents(self._docss)
            nrecs_a = self._dbo.collection.count()  # Count records after.
            return self._test_load(nrecs_b=nrecs_b, nrecs_a=nrecs_a)
        except chromadberrors.DuplicateIDError:
            print('-- Document already loaded; duplicate detected.')
            return False  # Prevent the keywords from being loaded.
        except Exception as err:
            reporterror(err)
            return False

    def _parse_text(self, **kwargs) -> bool:
        """Parse text from the document.

        :Keyword Arguments:
            Those to be passed into the text extraction method.

        Returns:
            bool: True if the parser's 'text' object is populated,
            otherwise False.

        """
        print('- Extracting text ...')
        self._p.extract_text(**kwargs)
        if len(self._p.doc.pages) < 2:
            ui.print_warning(f'No text extracted from {self._p.doc.basename}')
            return False
        return True

    @staticmethod
    def _print_summary(success: bool):
        """Print an end-of-processing summary.

        Args:
            success (bool): Success flag from the processor.

        """
        if success:
            print('Processing complete. Success.')
        else:
            print('Processing aborted due to error. Failure.')

    def _set_db_client(self) -> bool:
        """Set the database client object.

        If the ``_dbpath`` object is a string, it is treated as the
        *path* to the database. Otherwise, it is treated as the database
        object itself.

        Returns:
            bool: True if the database object is set without error,
            otherwise False.

        """
        try:
            if isinstance(self._dbpath, str):
                self._dbo = ChromaDB(path=self._dbpath,
                                     collection=self._cname,
                                     offline=self._offline)
            else:
                self._dbo = self._dbpath
        except Exception as err:
            reporterror(err)
            return False
        return True

    def _set_parser(self) -> bool:
        """Set the appropriate document parser.

        :Rationale:
            The parser is set by the file extension. For example, a file
            extension of ``.pdf`` will set the
            :class:`parsers.pdfparser.PDFParser` class.

        Returns:
            bool: True if a parser appropriate to the file extension was
            found, otherwise False.

        """
        # pylint: disable=invalid-name  # OK as the variable (Parser) is a class.
        # TODO: Update this to use the (not-yet-available) ispdf utility
        #       function, rather than relying on the file extension.
        ext = os.path.splitext(self._fpath)[1]
        Parser = self._PARSERS.get(ext)
        if not Parser:
            msg = f'{_PRE_WARN} Document parser not set for {os.path.basename(self._fpath)}.'
            ui.print_warning(msg)
            return False
        self._p = Parser(path=self._fpath)
        return True

    # TODO: Add these to a config file.
    def _set_text_splitter(self) -> bool:
        """Define the text splitter to be used.

        Returns:
            bool: True, always.

        """
        self._splitter = RecursiveCharacterTextSplitter(chunk_size=256,
                                                        chunk_overlap=25,
                                                        separators=['\n\n\n', '\n\n', '\n', ' '])
        return True

    def _split_texts(self) -> bool:
        """Split the document text using a recursive text splitter.

        Returns:
            bool: True if the text was split successfully, otherwise
            False.

        """
        self._docss = self._splitter.split_documents(self._docs)
        if not self._docss:
            msg = (f'{_PRE_ERR} An error occurred while splitting the documents for '
                   f'{self._p.doc.basename}.')
            ui.print_warning(msg)
            return False
        return True

    def _store_keywords(self, kwds: str) -> bool:
        """Store the extracted keywords into the keywords collection.

        Args:
            kwds (str): A string containing the keywords extracted from
                the document.

        Returns:
            bool: True if loaded successfully, otherwise False.

        """
        print('- Storing keywords ...')
        db = ChromaDB(path=self._dbo.path, collection=f'{self._cname}-kwds', offline=self._offline)
        nrecs_b = db.collection.count()  # Count records before.
        docs = [Document(page_content=kwds, metadata={'source': self._fbase})]
        db.add_documents(docs)
        nrecs_a = db.collection.count()  # Count records after.
        return 1 == nrecs_a - nrecs_b

    def _test_load(self, nrecs_b: int, nrecs_a: int) -> bool:
        """Test the document was loaded successfully.

        :Test:
            - Given a count of records before the load, verify the number
              of records after the load is equal to the number of records
              before, plus the number of split documents.

        Args:
            nrecs_b (int): Number of records *before* the load.
            nrecs_a (int): Number of records *after* the load.

        Returns:
            bool: True if the number of records before the load, plus
            the number of splits, equals the number of records after
            the load.

        """
        if nrecs_a == nrecs_b:
            ui.print_warning(f'{_PRE_WARN} No new documents added. Possibly already loaded?')
        return nrecs_a == nrecs_b + len(self._docss)
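To show how the steps in ``_load`` chain together (parser, text splitter, ``Document`` creation, splitting, the collection load and the optional keyword pass), here is a minimal hypothetical subclass in the spirit of ``loaders/chroma.py``, which ships in this wheel but is not reproduced in this section. The paths are placeholders and the public ``load`` signature is an assumption.

from docp.loaders._chromabaseloader import _ChromaBaseLoader

class MinimalChromaLoader(_ChromaBaseLoader):
    """Illustrative loader exposing the base class' _load pipeline."""

    def load(self, path: str, **kwargs):
        # Runs: _set_parser -> _set_text_splitter -> _parse_text ->
        # _create_documents -> _split_texts -> _load_worker, then an
        # optional keyword-extraction pass if an LLM was supplied.
        self._load(path, **kwargs)

# Placeholder paths; offline=True reuses the cached embedding model.
loader = MinimalChromaLoader(dbpath='/tmp/chroma', collection='docs', offline=True)
loader.load('/tmp/sample.pdf')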