docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  2. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  3. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  4. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
  5. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  6. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  7. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  8. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  9. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  10. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  11. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  12. docp/__init__.py +35 -6
  13. docp/dbs/__init__.py +0 -0
  14. docp/dbs/chroma.py +197 -0
  15. docp/libs/_version.py +1 -0
  16. docp/libs/changelog.py +7 -0
  17. docp/libs/utilities.py +107 -0
  18. docp/loaders/__init__.py +38 -0
  19. docp/loaders/_chromabaseloader.py +338 -0
  20. docp/loaders/_chromabaseloader.py.bak +378 -0
  21. docp/loaders/_chromabasepdfloader.py +121 -0
  22. docp/loaders/_chromabasepptxloader.py +123 -0
  23. docp/loaders/chroma.py.bak +196 -0
  24. docp/loaders/chromapdfloader.py +199 -0
  25. docp/loaders/chromapptxloader.py +192 -0
  26. docp/loaders/lutilities.py +52 -0
  27. docp/objects/__init__.py +0 -0
  28. docp/objects/_docbaseobject.py +65 -0
  29. docp/objects/_imgobject.py +0 -0
  30. docp/objects/_pageobject.py +127 -0
  31. docp/objects/_slideobject.py +110 -0
  32. docp/objects/_tableobject.py +0 -0
  33. docp/objects/_textobject.py +64 -0
  34. docp/objects/pdfobject.py +61 -0
  35. docp/objects/pptxobject.py +46 -0
  36. docp/parsers/__init__.py +0 -0
  37. docp/parsers/_pdfbaseparser.py +236 -0
  38. docp/parsers/_pdftableparser.py +272 -0
  39. docp/parsers/_pdftextparser.py +263 -0
  40. docp/parsers/_pptxbaseparser.py +93 -0
  41. docp/parsers/_pptxtextparser.py +115 -0
  42. docp/parsers/pdfparser.py +62 -0
  43. docp/parsers/pptxparser.py +51 -0
  44. docp/parsers/putilities.py +48 -0
  45. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
  46. docp-0.2.0.dist-info/METADATA +110 -0
  47. docp-0.2.0.dist-info/RECORD +49 -0
  48. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
  49. docp/_version.py +0 -1
  50. docp-0.0.0.dev1.dist-info/METADATA +0 -55
  51. docp-0.0.0.dev1.dist-info/RECORD +0 -7
  52. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
docp/__init__.py CHANGED
@@ -1,11 +1,40 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the project initialisation logic.
5
+
6
+ :Platform: Linux/Windows | Python 3.10+
7
+ :Developer: J Berendt
8
+ :Email: development@s3dev.uk
9
+
10
+ :Comments: The loader modules/classes have *not* been imported due to the
11
+ heavy dependency requirements. Refer to the loaders/__init__.py
12
+ module instead.
13
+
14
+ """
15
+
1
16
  import os
2
17
  import sys
3
18
  sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
19
+ from utils4.user_interface import ui
20
+ # locals
21
+ from .libs._version import __version__
22
+
23
+ # TODO: Change these to use logging.
24
+
4
25
  # Bring entry-points to the surface.
5
26
  try:
6
- from loaders.chroma import LoadChroma
7
- except ImportError:
8
- # The chroma loader requires a lot of backend which is not required for the parser.
9
- pass
10
- from parsers.pdf import ParsePDF
11
- from ._version import __version__
27
+ from .parsers.pdfparser import PDFParser
28
+ except ImportError as err:
29
+ msg = ( 'An error occurred while importing the PDF parser:\n'
30
+ f'- {err}\n'
31
+ ' - This can be ignored if the parser is not in use.\n')
32
+ ui.print_warning(f'\n[ImportError]: {msg}')
33
+
34
+ try:
35
+ from .parsers.pptxparser import PPTXParser
36
+ except ImportError as err:
37
+ msg = ( 'An error occurred while importing the PPTX parser:\n'
38
+ f'- {err}\n'
39
+ ' - This can be ignored if the parser is not in use.\n')
40
+ ui.print_warning(f'\n[ImportError]: {msg}')
docp/dbs/__init__.py ADDED
File without changes
docp/dbs/chroma.py ADDED
@@ -0,0 +1,197 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides a localised wrapper and specialised
5
+ functionality around the
6
+ ``langchain_community.vectorstores.Chroma`` class, for
7
+ interacting with a Chroma database.
8
+
9
+ :Platform: Linux/Windows | Python 3.10+
10
+ :Developer: J Berendt
11
+ :Email: development@s3dev.uk
12
+
13
+ :Comments: This module uses the
14
+ ``langchain_community.vectorstores.Chroma`` wrapper class,
15
+ rather than the base ``chromadb`` library as it provides the
16
+ ``add_texts`` method which supports GPU processing and
17
+ parallelisation; which is implemented by this module's
18
+ :meth:`~ChromaDB.add_documents` method.
19
+
20
+ """
21
+ # pylint: disable=import-error
22
+ # pylint: disable=wrong-import-order
23
+
24
+ from __future__ import annotations
25
+ import chromadb
26
+ import os
27
+ import torch
28
+ from glob import glob
29
+ from hashlib import md5
30
+ from langchain_huggingface import HuggingFaceEmbeddings
31
+ # langchain's Chroma is used rather than the base chromadb as it provides
32
+ # the add_texts method which supports GPU processing and parallelisation.
33
+ from langchain_community.vectorstores import Chroma as _Chroma
34
+
35
+
36
class ChromaDB(_Chroma):
    """Wrapper class around the ``chromadb`` library.

    Args:
        path (str): Path to the chroma database's *directory*.
        collection (str): Collection name.
        offline (bool, optional): Remain offline, use the cached
            embedding function model rather than obtaining one online.
            Defaults to False.
    """
    # pylint: disable=line-too-long

    # The model cache lives in a '.cache' directory, one level above
    # this module's own directory.
    _MODEL_CACHE = os.path.join(os.path.dirname(os.path.dirname(os.path.realpath(__file__))), '.cache')
    # Installing torch is a huge overhead, just for this. However, torch
    # will already be installed as part of the sentence-transformers library,
    # so we'll use it here.
    _MODEL_KWARGS = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
    # TODO: Add this to a config file.
    _MODEL_NAME = 'all-MiniLM-L6-v2'

    def __init__(self, path: str, collection: str, offline: bool=False):
        """Chroma database class initialiser."""
        self._path = os.path.realpath(path)
        self._cname = collection
        self._offline = offline
        self._client = None  # Database 'client' object
        self._dbc = None     # Database 'collection' object.
        # The client and embedding function must exist before the base
        # class is initialised, as both are passed into its constructor.
        self._set_client()
        self._set_embedding_fn()
        super().__init__(client=self._client,
                         collection_name=self._cname,
                         embedding_function=self._embfn,
                         persist_directory=self._path)
        self._set_collection()

    @property
    def client(self):
        """Accessor to the :class:`chromadb.PersistentClient` class."""
        return self._client

    @property
    def collection(self):
        """Accessor to the chromadb client's collection object."""
        return self._dbc

    @property
    def embedding_function(self):
        """Accessor to the embedding function used."""
        return self._embfn

    @property
    def path(self) -> str:
        """Accessor to the database's path."""
        return self._path

    def add_documents(self, docs: list[langchain_core.documents.base.Document]):  # noqa  # pylint: disable=undefined-variable
        """Add multiple documents to the collection.

        This method overrides the base class' ``add_documents`` method
        to enable local ID derivation. Knowing *how* the IDs are derived
        gives us greater understanding and querying ability of the
        documents in the database. Each ID is derived locally by the
        :meth:`_preproc` method from the document's ``source`` metadata
        value, page number and page content.

        Additionally, this method wraps the
        :func:`langchain_community.vectorstores.Chroma.add_texts`
        method which supports GPU processing and parallelisation.

        Args:
            docs (list): A list of ``langchain_core.documents.base.Document``
                document objects. A single (non-list) object is wrapped
                into a one-element list.

        """
        # pylint: disable=arguments-differ
        # pylint: disable=arguments-renamed
        if not isinstance(docs, list):
            docs = [docs]
        ids_, docs_, meta_ = self._preproc(docs=docs)
        self.add_texts(ids=ids_, texts=docs_, metadatas=meta_)

    def show_all(self):
        """Return the entire contents of the collection.

        This is an alias around ``.collection.get()``.

        """
        return self._dbc.get()

    def _get_embedding_function_model(self) -> str:
        """Derive the path to the embedding function model.

        :Note:
            If ``offline=True`` was passed into the class constructor,
            the model cache is used, if available - otherwise an error
            is raised.

            If online usage is allowed, the model is obtained by the
            means defined by the embedding function constructor.

        Raises:
            FileNotFoundError: Raised in offline mode if the model cache
                directory does not exist (it is created before raising),
                or if the cache does not contain the named model.

        Returns:
            str: The name of the model. Or, if offline, the path to the
            model's cache to be passed into the embedding function
            constructor is returned.

        """
        if not self._offline:
            return self._MODEL_NAME
        if not os.path.exists(self._MODEL_CACHE):
            os.makedirs(self._MODEL_CACHE)
            msg = ('Offline mode has been chosen, yet the embedding function model cache does not exist. '
                   'Therefore, a model must be downloaded. Please enable online usage for the first run '
                   'so a model can be downloaded and stored into the cache for future (offline) use.')
            raise FileNotFoundError(msg)
        # Find the cache directory containing the named model, this enables offline use.
        pattern = os.path.join(self._MODEL_CACHE, f'*{self._MODEL_NAME}*', '**')
        configs = [p for p in glob(pattern, recursive=True) if 'config.json' in p]
        # An existing-but-empty cache (e.g. the directory created by a
        # previous failed offline run) yields no matches; commonpath would
        # raise an unhelpful ValueError on an empty sequence.
        if not configs:
            msg = ('Offline mode has been chosen, yet the embedding function model was not found in the '
                   'model cache. Please enable online usage for the first run so a model can be '
                   'downloaded and stored into the cache for future (offline) use.')
            raise FileNotFoundError(msg)
        return os.path.commonpath(configs)

    @staticmethod
    def _preproc(docs: list):
        """Pre-process the document objects to create the IDs.

        Parse the ``Document`` object into its parts for storage.
        Additionally, create the ID as::

            id_<md5 of metadata['source']>_<zero-padded page number>_<md5 of page content>

        The page number is read from the ``pageno`` metadata key, using
        0 if the key is absent.

        Args:
            docs (list): A list of ``Document`` objects.

        Returns:
            tuple: Three parallel lists, as ``(ids, texts, metadatas)``.

        """
        ids = []
        txts = []
        metas = []
        for doc in docs:
            pc = doc.page_content
            m = doc.metadata
            # md5 operates on bytes; encode both hashed fields.
            pc_, src_ = map(str.encode, (pc, m['source']))
            pg_ = str(m.get('pageno', 0)).zfill(4)
            id_ = f'id_{md5(src_).hexdigest()}_{pg_}_{md5(pc_).hexdigest()}'
            ids.append(id_)
            txts.append(pc)
            metas.append(m)
        return ids, txts, metas

    def _set_client(self):
        """Set the database client object."""
        settings = chromadb.Settings(anonymized_telemetry=False)
        self._client = chromadb.PersistentClient(path=self._path,
                                                 settings=settings)

    def _set_collection(self):
        """Set the database collection object."""
        self._dbc = self._client.get_or_create_collection(self._cname,
                                                          metadata={'hnsw:space': 'cosine'})

    def _set_embedding_fn(self):
        """Set the embeddings function object."""
        model_name = self._get_embedding_function_model()
        self._embfn = HuggingFaceEmbeddings(model_name=model_name,
                                            model_kwargs=self._MODEL_KWARGS,
                                            cache_folder=self._MODEL_CACHE)
docp/libs/_version.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = '0.2.0'
docp/libs/changelog.py ADDED
@@ -0,0 +1,7 @@
1
+ # Changed.
2
+ # ENABLE SPHINX TO ACCESS THE GIT LOG
3
+ """
4
+ .. git_changelog::
5
+ :revisions: 99
6
+ :detailed-message-pre: True
7
+ """
docp/libs/utilities.py ADDED
@@ -0,0 +1,107 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides utility-based functionality for the
5
+ project.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+ import os
16
+ import sys
17
+ sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../'))
18
+ import re
19
+ from glob import glob
20
+ from utils4 import futils
21
+
22
+
23
class Utilities:
    """General (cross-project) utility functions."""

    @staticmethod
    def collect_files(path: str, ext: str, recursive: bool) -> list:
        """Collect all files for a given extension from a path.

        Args:
            path (str): Full path serving as the root for the search.
            ext (str): The file extension to be collected. For example:
                ``ext = 'pdf'``. Pass ``'**'`` to collect all files.

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
                characters ``'pdf'`` are parsed and prefixed with ``*.``
                to create ``'*.pdf'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.

            recursive (bool): Instruct the search to recurse into
                sub-directories.

        Raises:
            ValueError: Raised if ``ext`` is not ``'**'`` and contains
                no alphabetic characters.

        Returns:
            list: The list of full file paths returned by the ``glob``
            call. Any directory-only paths are removed.

        """
        if ext == '**':
            pattern = os.path.join(path, ext)
        else:
            suffix = ''.join(re.findall('[a-zA-Z]+', ext))
            if not suffix:
                raise ValueError(f'No alphabetic characters found in the extension: {ext!r}')
            # glob only recurses through a '**' path component, so one
            # must be added for the recursive flag to have any effect.
            if recursive:
                pattern = os.path.join(path, '**', f'*.{suffix}')
            else:
                pattern = os.path.join(path, f'*.{suffix}')
        return [p for p in glob(pattern, recursive=recursive) if os.path.isfile(p)]

    # !!!: Replace this with utils4.futils when available.
    @staticmethod
    def ispdf(path: str) -> bool:
        """Test the file signature. Verify this is a valid PDF file.

        Args:
            path (str): Path to the file being tested.

        Returns:
            bool: True if the file's first five bytes are the PDF
            signature (``%PDF-``), otherwise False.

        """
        with open(path, 'rb') as f:
            sig = f.read(5)
        return sig == b'\x25\x50\x44\x46\x2d'

    @staticmethod
    def iszip(path: str) -> bool:
        """Test the file signature. Verify this is a valid ZIP archive.

        This is a thin wrapper around ``utils4.futils.iszip``.

        Args:
            path (str): Path to the file being tested.

        Returns:
            bool: True if this is a valid ZIP archive, otherwise False.

        """
        return futils.iszip(path)

    @staticmethod
    def parse_to_keywords(resp: str) -> str:
        """Parse the bot's response into a string of keywords.

        Hyphens and forward slashes are replaced with a space and the
        text is lower-cased before the keywords are extracted.

        Args:
            resp (str): Text response directly from the bot.

        Returns:
            str: A comma-separated string of the keywords extracted from
            the response's asterisk bullet points or numbered list items.
            An empty string is returned if no keywords are found.

        """
        # Capture asterisk bullet points or a numbered list. The final
        # item is captured even if the response has no trailing newline.
        rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)(?:\n|$)')
        trans = {45: ' ', 47: ' '}  # '-' and '/' --> space
        resp_ = resp.translate(trans).lower()
        # Drop empty captures (e.g. a bare bullet marker).
        kwds = [kw for kw in rexp.findall(resp_) if kw]
        if kwds:
            return ', '.join(kwds)
        return ''
105
+
106
+
107
+ utilities = Utilities()
docp/loaders/__init__.py ADDED
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the project initialisation logic.
5
+
6
+ :Platform: Linux/Windows | Python 3.10+
7
+ :Developer: J Berendt
8
+ :Email: development@s3dev.uk
9
+
10
+ :Comments: n/a
11
+
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
17
+ from utils4.user_interface import ui
18
+
19
+ # TODO: Change these to use logging.
20
+
21
+ # Bring entry-points to the surface.
22
+ try:
23
+ from .chromapdfloader import ChromaPDFLoader
24
+ except ImportError as err:
25
+ # The chroma loader requires a lot of backend which is not required for the parser.
26
+ msg = ( 'An error occurred while importing the Chroma PDF loader:\n'
27
+ f'- {err}\n'
28
+ ' - This can be ignored if the loader is not in use.\n')
29
+ ui.print_warning(f'\n[ImportError]: {msg}')
30
+
31
+ try:
32
+ from .chromapptxloader import ChromaPPTXLoader
33
+ except ImportError as err:
34
+ # The chroma loader requires a lot of backend which is not required for the parser.
35
+ msg = ( 'An error occurred while importing the Chroma PPTX loader:\n'
36
+ f'- {err}\n'
37
+ ' - This can be ignored if the loader is not in use.\n')
38
+ ui.print_warning(f'\n[ImportError]: {msg}')