docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
- docp/__init__.py +35 -6
- docp/dbs/__init__.py +0 -0
- docp/dbs/chroma.py +197 -0
- docp/libs/_version.py +1 -0
- docp/libs/changelog.py +7 -0
- docp/libs/utilities.py +107 -0
- docp/loaders/__init__.py +38 -0
- docp/loaders/_chromabaseloader.py +338 -0
- docp/loaders/_chromabaseloader.py.bak +378 -0
- docp/loaders/_chromabasepdfloader.py +121 -0
- docp/loaders/_chromabasepptxloader.py +123 -0
- docp/loaders/chroma.py.bak +196 -0
- docp/loaders/chromapdfloader.py +199 -0
- docp/loaders/chromapptxloader.py +192 -0
- docp/loaders/lutilities.py +52 -0
- docp/objects/__init__.py +0 -0
- docp/objects/_docbaseobject.py +65 -0
- docp/objects/_imgobject.py +0 -0
- docp/objects/_pageobject.py +127 -0
- docp/objects/_slideobject.py +110 -0
- docp/objects/_tableobject.py +0 -0
- docp/objects/_textobject.py +64 -0
- docp/objects/pdfobject.py +61 -0
- docp/objects/pptxobject.py +46 -0
- docp/parsers/__init__.py +0 -0
- docp/parsers/_pdfbaseparser.py +236 -0
- docp/parsers/_pdftableparser.py +272 -0
- docp/parsers/_pdftextparser.py +263 -0
- docp/parsers/_pptxbaseparser.py +93 -0
- docp/parsers/_pptxtextparser.py +115 -0
- docp/parsers/pdfparser.py +62 -0
- docp/parsers/pptxparser.py +51 -0
- docp/parsers/putilities.py +48 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
- docp-0.2.0.dist-info/METADATA +110 -0
- docp-0.2.0.dist-info/RECORD +49 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
- docp/_version.py +0 -1
- docp-0.0.0.dev1.dist-info/METADATA +0 -55
- docp-0.0.0.dev1.dist-info/RECORD +0 -7
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
docp/__init__.py
CHANGED
@@ -1,11 +1,40 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the project initialisation logic.
|
5
|
+
|
6
|
+
:Platform: Linux/Windows | Python 3.10+
|
7
|
+
:Developer: J Berendt
|
8
|
+
:Email: development@s3dev.uk
|
9
|
+
|
10
|
+
:Comments: The loader modules/classes have *not* been imported due to the
|
11
|
+
heavy dependency requirements. Refer to the loaders/__init__.py
|
12
|
+
module instead.
|
13
|
+
|
14
|
+
"""
|
15
|
+
|
1
16
|
import os
|
2
17
|
import sys
|
3
18
|
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
|
19
|
+
from utils4.user_interface import ui
|
20
|
+
# locals
|
21
|
+
from .libs._version import __version__
|
22
|
+
|
23
|
+
# TODO: Change these to use logging.
|
24
|
+
|
4
25
|
# Bring entry-points to the surface.
|
5
26
|
try:
|
6
|
-
from
|
7
|
-
except ImportError:
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
27
|
+
from .parsers.pdfparser import PDFParser
|
28
|
+
except ImportError as err:
|
29
|
+
msg = ( 'An error occurred while importing the PDF parser:\n'
|
30
|
+
f'- {err}\n'
|
31
|
+
' - This can be ignored if the parser is not in use.\n')
|
32
|
+
ui.print_warning(f'\n[ImportError]: {msg}')
|
33
|
+
|
34
|
+
try:
|
35
|
+
from .parsers.pptxparser import PPTXParser
|
36
|
+
except ImportError as err:
|
37
|
+
msg = ( 'An error occurred while importing the PPTX parser:\n'
|
38
|
+
f'- {err}\n'
|
39
|
+
' - This can be ignored if the parser is not in use.\n')
|
40
|
+
ui.print_warning(f'\n[ImportError]: {msg}')
|
docp/dbs/__init__.py
ADDED
File without changes
|
docp/dbs/chroma.py
ADDED
@@ -0,0 +1,197 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides a localised wrapper and specialised
|
5
|
+
functionality around the
|
6
|
+
``langchain_community.vectorstores.Chroma`` class, for
|
7
|
+
interacting with a Chroma database.
|
8
|
+
|
9
|
+
:Platform: Linux/Windows | Python 3.10+
|
10
|
+
:Developer: J Berendt
|
11
|
+
:Email: development@s3dev.uk
|
12
|
+
|
13
|
+
:Comments: This module uses the
|
14
|
+
``langchain_community.vectorstores.Chroma`` wrapper class,
|
15
|
+
rather than the base ``chromadb`` library as it provides the
|
16
|
+
``add_texts`` method which supports GPU processing and
|
17
|
+
parallelisation; which is implemented by this module's
|
18
|
+
:meth:`~ChromaDB.add_documents` method.
|
19
|
+
|
20
|
+
"""
|
21
|
+
# pylint: disable=import-error
|
22
|
+
# pylint: disable=wrong-import-order
|
23
|
+
|
24
|
+
from __future__ import annotations
|
25
|
+
import chromadb
|
26
|
+
import os
|
27
|
+
import torch
|
28
|
+
from glob import glob
|
29
|
+
from hashlib import md5
|
30
|
+
from langchain_huggingface import HuggingFaceEmbeddings
|
31
|
+
# langchain's Chroma is used rather than the base chromadb as it provides
|
32
|
+
# the add_texts method which supports GPU processing and parallelisation.
|
33
|
+
from langchain_community.vectorstores import Chroma as _Chroma
|
34
|
+
|
35
|
+
|
36
|
+
class ChromaDB(_Chroma):
|
37
|
+
"""Wrapper class around the ``chromadb`` library.
|
38
|
+
|
39
|
+
Args:
|
40
|
+
path (str): Path to the chroma database's *directory*.
|
41
|
+
collection (str): Collection name.
|
42
|
+
offline (bool, optional): Remain offline, use the cached
|
43
|
+
embedding function model rather than obtaining one online.
|
44
|
+
Defaults to False.
|
45
|
+
"""
|
46
|
+
# pylint: disable=line-too-long
|
47
|
+
|
48
|
+
_MODEL_CACHE = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), '.cache')
|
49
|
+
# Installing torch is a huge overhead, just for this. However, torch
|
50
|
+
# will already be installed as part of the sentence-transformers library,
|
51
|
+
# so we'll use it here.
|
52
|
+
_MODEL_KWARGS = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
|
53
|
+
# TODO: Add this to a config file.
|
54
|
+
_MODEL_NAME = 'all-MiniLM-L6-v2'
|
55
|
+
|
56
|
+
def __init__(self, path: str, collection: str, offline: bool=False):
|
57
|
+
"""Chroma database class initialiser."""
|
58
|
+
self._path = os.path.realpath(path)
|
59
|
+
self._cname = collection
|
60
|
+
self._offline = offline
|
61
|
+
self._client = None # Database 'client' object
|
62
|
+
self._dbc = None # Database 'collection' object.
|
63
|
+
self._set_client()
|
64
|
+
self._set_embedding_fn()
|
65
|
+
super().__init__(client=self._client,
|
66
|
+
collection_name=self._cname,
|
67
|
+
embedding_function=self._embfn,
|
68
|
+
persist_directory=self._path)
|
69
|
+
self._set_collection()
|
70
|
+
|
71
|
+
@property
|
72
|
+
def client(self):
|
73
|
+
"""Accessor to the :class:`chromadb.PersistentClient` class."""
|
74
|
+
return self._client
|
75
|
+
|
76
|
+
@property
|
77
|
+
def collection(self):
|
78
|
+
"""Accessor to the chromadb client's collection object."""
|
79
|
+
return self._dbc
|
80
|
+
|
81
|
+
@property
|
82
|
+
def embedding_function(self):
|
83
|
+
"""Accessor to the embedding function used."""
|
84
|
+
return self._embfn
|
85
|
+
|
86
|
+
@property
|
87
|
+
def path(self) -> str:
|
88
|
+
"""Accessor to the database's path."""
|
89
|
+
return self._path
|
90
|
+
|
91
|
+
def add_documents(self, docs: list[langchain_core.documents.base.Document]): # noqa # pylint: disable=undefined-variable
|
92
|
+
"""Add multiple documents to the collection.
|
93
|
+
|
94
|
+
This method overrides the base class' ``add_documents`` method
|
95
|
+
to enable local ID derivation. Knowing *how* the IDs are derived
|
96
|
+
gives us greater understanding and querying ability of the
|
97
|
+
documents in the database. Each ID is derived locally by the
|
98
|
+
:meth:`_preproc` method from the file's basename, page number
|
99
|
+
and page content.
|
100
|
+
|
101
|
+
Additionally, this method wraps the
|
102
|
+
:func:`langchain_community.vectorstores.Chroma.add_texts`
|
103
|
+
method which supports GPU processing and parallelisation.
|
104
|
+
|
105
|
+
Args:
|
106
|
+
docs (list): A list of ``langchain_core.documents.base.Document``
|
107
|
+
document objects.
|
108
|
+
|
109
|
+
"""
|
110
|
+
# pylint: disable=arguments-differ
|
111
|
+
# pylint: disable=arguments-renamed
|
112
|
+
if not isinstance(docs, list):
|
113
|
+
docs = [docs]
|
114
|
+
ids_, docs_, meta_ = self._preproc(docs=docs)
|
115
|
+
self.add_texts(ids=ids_, texts=docs_, metadatas=meta_)
|
116
|
+
|
117
|
+
def show_all(self):
|
118
|
+
"""Return the entire contents of the collection.
|
119
|
+
|
120
|
+
This is an alias around ``.collection.get()``.
|
121
|
+
|
122
|
+
"""
|
123
|
+
return self._dbc.get()
|
124
|
+
|
125
|
+
def _get_embedding_function_model(self) -> str:
|
126
|
+
"""Derive the path to the embedding function model.
|
127
|
+
|
128
|
+
:Note:
|
129
|
+
If ``offline=True`` was passed into the class constructor,
|
130
|
+
the model cache is used, if available - otherwise the user
|
131
|
+
is warned.
|
132
|
+
|
133
|
+
If online usage is allowed, the model is obtained by the
|
134
|
+
means defined by the embedding function constructor.
|
135
|
+
|
136
|
+
Returns:
|
137
|
+
str: The name of the model. Or, if offline, the path to the
|
138
|
+
model's cache to be passed into the embedding function
|
139
|
+
constructor is returned.
|
140
|
+
|
141
|
+
"""
|
142
|
+
if self._offline:
|
143
|
+
if not os.path.exists(self._MODEL_CACHE):
|
144
|
+
os.makedirs(self._MODEL_CACHE)
|
145
|
+
msg = ('Offline mode has been chosen, yet the embedding function model cache does not exist. '
|
146
|
+
'Therefore, a model must be downloaded. Please enable online usage for the first run '
|
147
|
+
'so a model can be downloaded and stored into the cache for future (offline) use.')
|
148
|
+
raise FileNotFoundError(msg)
|
149
|
+
# Find the cache directory containing the named model, this enables offline use.
|
150
|
+
model_loc = os.path.commonpath(filter(lambda x: 'config.json' in x,
|
151
|
+
glob(os.path.join(self._MODEL_CACHE,
|
152
|
+
f'*{self._MODEL_NAME}*',
|
153
|
+
'**'),
|
154
|
+
recursive=True)))
|
155
|
+
return model_loc
|
156
|
+
return self._MODEL_NAME
|
157
|
+
|
158
|
+
@staticmethod
|
159
|
+
def _preproc(docs: list):
|
160
|
+
"""Pre-process the document objects to create the IDs.
|
161
|
+
|
162
|
+
Parse the ``Document`` object into its parts for storage.
|
163
|
+
Additionally, create the ID as a hash of the source document's
|
164
|
+
basename, page number and content.
|
165
|
+
|
166
|
+
"""
|
167
|
+
ids = []
|
168
|
+
txts = []
|
169
|
+
metas = []
|
170
|
+
for doc in docs:
|
171
|
+
pc = doc.page_content
|
172
|
+
m = doc.metadata
|
173
|
+
pc_, src_ = map(str.encode, (pc, m['source']))
|
174
|
+
pg_ = str(m.get('pageno', 0)).zfill(4)
|
175
|
+
id_ = f'id_{md5(src_).hexdigest()}_{pg_}_{md5(pc_).hexdigest()}'
|
176
|
+
ids.append(id_)
|
177
|
+
txts.append(pc)
|
178
|
+
metas.append(m)
|
179
|
+
return ids, txts, metas
|
180
|
+
|
181
|
+
def _set_client(self):
|
182
|
+
"""Set the database client object."""
|
183
|
+
settings = chromadb.Settings(anonymized_telemetry=False)
|
184
|
+
self._client = chromadb.PersistentClient(path=self._path,
|
185
|
+
settings=settings)
|
186
|
+
|
187
|
+
def _set_collection(self):
|
188
|
+
"""Set the database collection object."""
|
189
|
+
self._dbc = self._client.get_or_create_collection(self._cname,
|
190
|
+
metadata={'hnsw:space': 'cosine'})
|
191
|
+
|
192
|
+
def _set_embedding_fn(self):
|
193
|
+
"""Set the embeddings function object."""
|
194
|
+
model_name = self._get_embedding_function_model()
|
195
|
+
self._embfn = HuggingFaceEmbeddings(model_name=model_name,
|
196
|
+
model_kwargs=self._MODEL_KWARGS,
|
197
|
+
cache_folder=self._MODEL_CACHE)
|
docp/libs/_version.py
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = '0.2.0'
|
docp/libs/changelog.py
ADDED
docp/libs/utilities.py
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides utility-based functionality for the
|
5
|
+
project.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
"""
|
14
|
+
|
15
|
+
import os
|
16
|
+
import sys
|
17
|
+
sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../'))
|
18
|
+
import re
|
19
|
+
from glob import glob
|
20
|
+
from utils4 import futils
|
21
|
+
|
22
|
+
|
23
|
+
class Utilities:
|
24
|
+
"""General (cross-project) utility functions."""
|
25
|
+
|
26
|
+
@staticmethod
|
27
|
+
def collect_files(path: str, ext: str, recursive: bool) -> list:
|
28
|
+
"""Collect all files for a given extension from a path.
|
29
|
+
|
30
|
+
Args:
|
31
|
+
path (str): Full path serving as the root for the search.
|
32
|
+
ext (str, optional): If the ``path`` argument refers to a
|
33
|
+
*directory*, a specific file extension can be specified
|
34
|
+
here. For example: ``ext = 'pdf'``.
|
35
|
+
|
36
|
+
If anything other than ``'**'`` is provided, all
|
37
|
+
alpha-characters are parsed from the string, and prefixed
|
38
|
+
with ``*.``. Meaning, if ``'.pdf'`` is passed, the
|
39
|
+
characters ``'pdf'`` are parsed and prefixed with ``*.``
|
40
|
+
to create ``'*.pdf'``. However, if ``'things.foo'`` is
|
41
|
+
passed, the derived extension will be ``'*.thingsfoo'``.
|
42
|
+
Defaults to '**', for a recursive search.
|
43
|
+
|
44
|
+
recursive (bool): Instruct the search to recurse into
|
45
|
+
sub-directories.
|
46
|
+
|
47
|
+
Returns:
|
48
|
+
list: The list of full file paths returned by the ``glob``
|
49
|
+
call. Any directory-only paths are removed.
|
50
|
+
|
51
|
+
"""
|
52
|
+
if ext != '**':
|
53
|
+
ext = f'*.{re.findall("[a-zA-Z]+", ext)[0]}'
|
54
|
+
return list(filter(os.path.isfile, glob(os.path.join(path, ext), recursive=recursive)))
|
55
|
+
|
56
|
+
# !!!: Replace this with utils4.futils when available.
|
57
|
+
@staticmethod
|
58
|
+
def ispdf(path: str) -> bool:
|
59
|
+
"""Test the file signature. Verify this is a valid PDF file.
|
60
|
+
|
61
|
+
Args:
|
62
|
+
path (str): Path to the file being tested.
|
63
|
+
|
64
|
+
Returns:
|
65
|
+
bool: True if this is a valid PDF file, otherwise False.
|
66
|
+
|
67
|
+
"""
|
68
|
+
with open(path, 'rb') as f:
|
69
|
+
sig = f.read(5)
|
70
|
+
return sig == b'\x25\x50\x44\x46\x2d'
|
71
|
+
|
72
|
+
@staticmethod
|
73
|
+
def iszip(path: str) -> bool:
|
74
|
+
"""Test the file signature. Verify this is a valid ZIP archive.
|
75
|
+
|
76
|
+
Args:
|
77
|
+
path (str): Path to the file being tested.
|
78
|
+
|
79
|
+
Returns:
|
80
|
+
bool: True if this is a valid ZIP archive, otherwise False.
|
81
|
+
|
82
|
+
"""
|
83
|
+
return futils.iszip(path)
|
84
|
+
|
85
|
+
@staticmethod
|
86
|
+
def parse_to_keywords(resp: str) -> list:
|
87
|
+
"""Parse the bot's response into a list of keywords.
|
88
|
+
|
89
|
+
Args:
|
90
|
+
resp (str): Text response directly from the bot.
|
91
|
+
|
92
|
+
Returns:
|
93
|
+
list: A list of keywords extracted from the response,
|
94
|
+
separated by asterisks as bullet points.
|
95
|
+
|
96
|
+
"""
|
97
|
+
# Capture asterisk bullet points or a numbered list.
|
98
|
+
rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)\n')
|
99
|
+
trans = {45: ' ', 47: ' '}
|
100
|
+
resp_ = resp.translate(trans).lower()
|
101
|
+
kwds = rexp.findall(resp_)
|
102
|
+
if kwds:
|
103
|
+
return ', '.join(kwds)
|
104
|
+
return ''
|
105
|
+
|
106
|
+
|
107
|
+
utilities = Utilities()
|
docp/loaders/__init__.py
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the project initialisation logic.
|
5
|
+
|
6
|
+
:Platform: Linux/Windows | Python 3.10+
|
7
|
+
:Developer: J Berendt
|
8
|
+
:Email: development@s3dev.uk
|
9
|
+
|
10
|
+
:Comments: n/a
|
11
|
+
|
12
|
+
"""
|
13
|
+
|
14
|
+
import os
|
15
|
+
import sys
|
16
|
+
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
|
17
|
+
from utils4.user_interface import ui
|
18
|
+
|
19
|
+
# TODO: Change these to use logging.
|
20
|
+
|
21
|
+
# Bring entry-points to the surface.
|
22
|
+
try:
|
23
|
+
from .chromapdfloader import ChromaPDFLoader
|
24
|
+
except ImportError as err:
|
25
|
+
# The chroma loader requires a lot of backend which is not required for the parser.
|
26
|
+
msg = ( 'An error occurred while importing the Chroma PDF loader:\n'
|
27
|
+
f'- {err}\n'
|
28
|
+
' - This can be ignored if the loader is not in use.\n')
|
29
|
+
ui.print_warning(f'\n[ImportError]: {msg}')
|
30
|
+
|
31
|
+
try:
|
32
|
+
from .chromapptxloader import ChromaPPTXLoader
|
33
|
+
except ImportError as err:
|
34
|
+
# The chroma loader requires a lot of backend which is not required for the parser.
|
35
|
+
msg = ( 'An error occurred while importing the Chroma PPTX loader:\n'
|
36
|
+
f'- {err}\n'
|
37
|
+
' - This can be ignored if the loader is not in use.\n')
|
38
|
+
ui.print_warning(f'\n[ImportError]: {msg}')
|