docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  2. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  3. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  4. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
  5. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  6. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  7. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  8. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  9. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  10. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  11. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  12. docp/__init__.py +35 -6
  13. docp/dbs/__init__.py +0 -0
  14. docp/dbs/chroma.py +197 -0
  15. docp/libs/_version.py +1 -0
  16. docp/libs/changelog.py +7 -0
  17. docp/libs/utilities.py +107 -0
  18. docp/loaders/__init__.py +38 -0
  19. docp/loaders/_chromabaseloader.py +338 -0
  20. docp/loaders/_chromabaseloader.py.bak +378 -0
  21. docp/loaders/_chromabasepdfloader.py +121 -0
  22. docp/loaders/_chromabasepptxloader.py +123 -0
  23. docp/loaders/chroma.py.bak +196 -0
  24. docp/loaders/chromapdfloader.py +199 -0
  25. docp/loaders/chromapptxloader.py +192 -0
  26. docp/loaders/lutilities.py +52 -0
  27. docp/objects/__init__.py +0 -0
  28. docp/objects/_docbaseobject.py +65 -0
  29. docp/objects/_imgobject.py +0 -0
  30. docp/objects/_pageobject.py +127 -0
  31. docp/objects/_slideobject.py +110 -0
  32. docp/objects/_tableobject.py +0 -0
  33. docp/objects/_textobject.py +64 -0
  34. docp/objects/pdfobject.py +61 -0
  35. docp/objects/pptxobject.py +46 -0
  36. docp/parsers/__init__.py +0 -0
  37. docp/parsers/_pdfbaseparser.py +236 -0
  38. docp/parsers/_pdftableparser.py +272 -0
  39. docp/parsers/_pdftextparser.py +263 -0
  40. docp/parsers/_pptxbaseparser.py +93 -0
  41. docp/parsers/_pptxtextparser.py +115 -0
  42. docp/parsers/pdfparser.py +62 -0
  43. docp/parsers/pptxparser.py +51 -0
  44. docp/parsers/putilities.py +48 -0
  45. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
  46. docp-0.2.0.dist-info/METADATA +110 -0
  47. docp-0.2.0.dist-info/RECORD +49 -0
  48. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
  49. docp/_version.py +0 -1
  50. docp-0.0.0.dev1.dist-info/METADATA +0 -55
  51. docp-0.0.0.dev1.dist-info/RECORD +0 -7
  52. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
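
The listing shows the package's new shape: standalone parsers under docp/parsers, Chroma loaders under docp/loaders, and the vector-store wrapper in docp/dbs/chroma.py. As orientation, a minimal sketch of the import surface implied by these paths (the module paths and class names appear in the diff below; whether docp/__init__.py re-exports them at the top level is an assumption):

# Module paths are taken from the file listing above; whether
# docp/__init__.py re-exports these names at the top level is an
# assumption, so the fully qualified imports are shown.
from docp.dbs.chroma import ChromaDB                        # Chroma vector-store wrapper.
from docp.loaders.chromapdfloader import ChromaPDFLoader    # PDF -> Chroma loader.
from docp.loaders.chromapptxloader import ChromaPPTXLoader  # PPTX -> Chroma loader.
from docp.parsers.pdfparser import PDFParser                # Standalone PDF parser.
from docp.parsers.pptxparser import PPTXParser              # Standalone PPTX parser.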
docp/loaders/_chromabaseloader.py.bak
@@ -0,0 +1,378 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ :Purpose:   This module provides functionality to parse and store
+             document data into a Chroma vector database.
+
+ :Platform:  Linux/Windows | Python 3.10+
+ :Developer: J Berendt
+ :Email:     development@s3dev.uk
+
+ :Comments:  n/a
+
+ .. attention::
+
+     This module is *not* designed to be interacted with
+     directly, only via the appropriate interface class(es).
+
+     Rather, please create an instance of a Chroma document
+     loading object using the following class:
+
+         - :class:`~docp.loaders.chroma.ChromaLoader`
+
+ """
+ # pylint: disable=import-error
+ # pylint: disable=no-name-in-module  # langchain.chains.RetrievalQA
+
+ import contextlib
+ import os
+ import re
+ from chromadb.api.types import errors as chromadberrors
+ from langchain.chains import RetrievalQA
+ from langchain.docstore.document import Document
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from utils4.reporterror import reporterror
+ from utils4.user_interface import ui
+ # locals
+ try:
+     from .dbs.chroma import ChromaDB
+     from .parsers.pdfparser import PDFParser
+     from .parsers.pptxparser import PPTXParser
+ except ImportError:
+     from dbs.chroma import ChromaDB
+     from parsers.pdfparser import PDFParser
+     from parsers.pptxparser import PPTXParser
+
+ _PRE_ERR = '\n[ERROR]:'
+ _PRE_WARN = '\n[WARNING]:'
+
+
+ class Tools:
+     """General tools used for loading documents."""
+
+     @staticmethod
+     def parse_to_keywords(resp: str) -> str:
+         """Parse the bot's response into a string of keywords.
+
+         Args:
+             resp (str): Text response directly from the bot.
+
+         Returns:
+             str: A comma-separated string of keywords extracted from
+             the bot's bulleted (or numbered) response, or an empty
+             string if no keywords were found.
+
+         """
+         # Capture asterisk bullet points or a numbered list.
+         rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)\n')
+         # Translate hyphens and slashes to spaces before matching.
+         trans = {45: ' ', 47: ' '}
+         resp_ = resp.translate(trans).lower()
+         kwds = rexp.findall(resp_)
+         if kwds:
+             return ', '.join(kwds)
+         return ''
+
+
+ class _ChromaBaseLoader:
+     """Base class for loading documents into a Chroma vector database.
+
+     Args:
+         dbpath (str | ChromaDB): Either the full path to the Chroma
+             database *directory*, or an instance of a
+             :class:`~docp.dbs.chroma.ChromaDB` class. If the instance
+             is passed, the ``collection`` argument is ignored.
+         collection (str, optional): Name of the Chroma database
+             collection. Only required if the ``dbpath`` parameter is a
+             path. Defaults to None.
+         load_keywords (bool, optional): Derive keywords from the
+             document and load these into the sister keywords
+             collection. Defaults to False.
+         llm (object, optional): If deriving keywords, this is the LLM
+             which will do the derivation. Defaults to None.
+         offline (bool, optional): Remain offline and use the locally
+             cached embedding function model. Defaults to False.
+
+     """
+
+     _PARSERS = {'.pdf': PDFParser,
+                 '.pptx': PPTXParser}
+
+     def __init__(self,
+                  dbpath: str | ChromaDB,
+                  collection: str=None,
+                  *,
+                  load_keywords: bool=False,
+                  llm: object=None,
+                  offline: bool=False):
+         """Chroma database class initialiser."""
+         self._dbpath = dbpath
+         self._cname = collection
+         self._load_keywords = load_keywords
+         self._llm = llm
+         self._offline = offline
+         self._dbo = None        # Database object.
+         self._docs = []         # List of 'Document' objects.
+         self._docss = []        # List of 'Document' objects *with splits*.
+         self._fbase = None      # Basename of the document currently being loaded.
+         self._fpath = None      # Full path to the document currently being loaded.
+         self._p = None          # Document parser object.
+         self._splitter = None   # Text splitter.
+         self._set_db_client()
+         self._check_parameters()
+
+     @property
+     def chroma(self):
+         """Accessor to the database client object."""
+         return self._dbo
+
+     @property
+     def parser(self):
+         """Accessor to the document parser object."""
+         return self._p
+
+     def _check_parameters(self) -> None:
+         """Verify the class parameters are viable.
+
+         Raises:
+             ValueError: If the ``load_keywords`` argument is True and
+                 the ``llm`` argument is None, or the inverse. Either
+                 both arguments must be provided, or neither.
+
+         """
+         if sum((self._load_keywords, self._llm is not None)) not in (0, 2):
+             raise ValueError('For keyword loading, the load_keywords argument '
+                              'must be True and a model instance must be provided.')
+
+     # TODO: Need a document creation routine for PPTX files as the internal
+     #       structure of the objects is slightly different.
+     def _create_documents(self) -> bool:
+         """Convert each extracted page into a ``Document`` object.
+
+         Returns:
+             bool: True if the pages are loaded as ``Document`` objects
+             successfully. Otherwise False.
+
+         """
+         self._docs = [Document(page_content=page.content,
+                                metadata={'source': self._p.doc.basename,
+                                          'pageno': page.pageno})
+                       for page in self._p.doc.pages if page.hastext]
+         if not self._docs:
+             msg = f'{_PRE_WARN} Text could not be parsed from {self._p.doc.basename}.'
+             ui.print_warning(msg)
+             return False
+         return True
+
+     def _get_keywords(self) -> str:
+         """Query the document (using the LLM) to extract the keywords."""
+         # pylint: disable=line-too-long
+         print('- Extracting keywords ...')
+         qry = ('List the important keywords which can be used to summarize this '
+                f'document: "{self._fbase}". Use only phrases which are found in the document.')
+         # Suppress stdout.
+         with contextlib.redirect_stdout(None):
+             nids = len(self._dbo.get(where={'source': self._fbase})['ids'])
+             # Retrieve 10% of the document's records, clamped to [25, 50]
+             # and capped at the number of available records.
+             filter_ = {'k': min(nids, max(25, min(nids//10, 50))),
+                        'filter': {'source': {'$eq': self._fbase}}}
+             # TODO: Replace this with the module.class.method once created.
+             qa = RetrievalQA.from_chain_type(llm=self._llm,
+                                              chain_type="stuff",
+                                              retriever=self._dbo.as_retriever(search_kwargs=filter_),
+                                              return_source_documents=True,
+                                              verbose=True)
+             resp = qa.invoke(qry)
+             kwds = Tools.parse_to_keywords(resp=resp['result'])
+         return kwds
+
+     def _load(self, path: str, **kwargs):
+         """Load the selected files into the vector store.
+
+         Args:
+             path (str): Full path to the file to be loaded.
+
+         :Keyword Arguments:
+             Those passed from the loader-specific ``load`` method.
+
+         """
+         # pylint: disable=multiple-statements
+         self._fpath = path
+         self._fbase = os.path.basename(path)
+         s = self._set_parser()
+         if s: s = self._set_text_splitter()
+         if s: s = self._parse_text(**kwargs)
+         if s: s = self._create_documents()
+         if s: s = self._split_texts()
+         if s: s = self._load_worker()
+         if s and self._load_keywords and self._llm:
+             kwds = self._get_keywords()
+             s = self._store_keywords(kwds=kwds)
+         self._print_summary(success=s)
+
+     def _load_worker(self) -> bool:
+         """Load the split documents into the database collection.
+
+         Returns:
+             bool: True if loaded successfully, otherwise False. Success
+             is based on the number of records after the load being
+             greater than the number of records before the load, and no
+             exceptions being raised.
+
+         """
+         try:
+             print('- Loading the document into the database ...')
+             nrecs_b = self._dbo.collection.count()  # Count records before.
+             self._dbo.add_documents(self._docss)
+             nrecs_a = self._dbo.collection.count()  # Count records after.
+             return self._test_load(nrecs_b=nrecs_b, nrecs_a=nrecs_a)
+         except chromadberrors.DuplicateIDError:
+             print('-- Document already loaded; duplicate detected.')
+             return False  # Prevent from loading keywords.
+         except Exception as err:
+             reporterror(err)
+             return False
+
+     def _parse_text(self, **kwargs) -> bool:
+         """Parse text from the document.
+
+         :Keyword Arguments:
+             Those to be passed into the text extraction method.
+
+         Returns:
+             bool: True if the parser's 'text' object is populated,
+             otherwise False.
+
+         """
+         print('- Extracting text ...')
+         self._p.extract_text(**kwargs)
+         if len(self._p.doc.pages) < 2:
+             ui.print_warning(f'No text extracted from {self._p.doc.basename}')
+             return False
+         return True
+
+     @staticmethod
+     def _print_summary(success: bool):
+         """Print an end of processing summary.
+
+         Args:
+             success (bool): Success flag from the processor.
+
+         """
+         if success:
+             print('Processing complete. Success.')
+         else:
+             print('Processing aborted due to error. Failure.')
+
+     def _set_db_client(self) -> bool:
+         """Set the database client object.
+
+         If the ``_dbpath`` object is a string, this is inferred as the
+         *path* to the database. Otherwise, it is inferred as the
+         database object itself.
+
+         Returns:
+             bool: True if the database object is set without error.
+             Otherwise False.
+
+         """
+         try:
+             if isinstance(self._dbpath, str):
+                 self._dbo = ChromaDB(path=self._dbpath,
+                                      collection=self._cname,
+                                      offline=self._offline)
+             else:
+                 self._dbo = self._dbpath
+         except Exception as err:
+             reporterror(err)
+             return False
+         return True
+
+     def _set_parser(self) -> bool:
+         """Set the appropriate document parser.
+
+         :Rationale:
+             The parser is set by the file extension. For example, a file
+             extension ``.pdf`` will set the
+             :class:`parsers.pdfparser.PDFParser` class.
+
+         Returns:
+             bool: True if a parser appropriate to the file extension
+             was found. Otherwise, False.
+
+         """
+         # pylint: disable=invalid-name  # OK as the variable (Parser) is a class.
+         # TODO: Update this to use the (not-yet-available) ispdf utility
+         #       function, rather than relying on the file extension.
+         ext = os.path.splitext(self._fpath)[1]
+         Parser = self._PARSERS.get(ext)
+         if not Parser:
+             msg = f'{_PRE_WARN} Document parser not set for {os.path.basename(self._fpath)}.'
+             ui.print_warning(msg)
+             return False
+         self._p = Parser(path=self._fpath)
+         return True
+
+     # TODO: Add these to a config file.
+     def _set_text_splitter(self) -> bool:
+         """Define the text splitter to be used.
+
+         Returns:
+             bool: True, always.
+
+         """
+         self._splitter = RecursiveCharacterTextSplitter(chunk_size=256,
+                                                         chunk_overlap=25,
+                                                         separators=['\n\n\n', '\n\n', '\n', ' '])
+         return True
+
+     def _split_texts(self) -> bool:
+         """Split the document text using a recursive text splitter.
+
+         Returns:
+             bool: True if the text was split successfully, otherwise
+             False.
+
+         """
+         self._docss = self._splitter.split_documents(self._docs)
+         if not self._docss:
+             msg = (f'{_PRE_ERR} An error occurred while splitting the documents for '
+                    f'{self._p.doc.basename}.')
+             ui.print_warning(msg)
+             return False
+         return True
+
+     def _store_keywords(self, kwds: str) -> bool:
+         """Store the extracted keywords into the keywords collection.
+
+         Args:
+             kwds (str): A string containing the keywords extracted from
+                 the document.
+
+         Returns:
+             bool: True if loaded successfully, otherwise False.
+
+         """
+         print('- Storing keywords ...')
+         db = ChromaDB(path=self._dbo.path, collection=f'{self._cname}-kwds', offline=self._offline)
+         nrecs_b = db.collection.count()  # Count records before.
+         docs = [Document(page_content=kwds, metadata={'source': self._fbase})]
+         db.add_documents(docs)
+         nrecs_a = db.collection.count()  # Count records after.
+         return 1 == nrecs_a - nrecs_b
+
+     def _test_load(self, nrecs_b: int, nrecs_a: int) -> bool:
+         """Test the document was loaded successfully.
+
+         :Test:
+             - Given a count of records before the load, verify the number
+               of records after the load is equal to the number of records
+               before, plus the number of split documents.
+
+         Args:
+             nrecs_b (int): Number of records *before* the load.
+             nrecs_a (int): Number of records *after* the load.
+
+         Returns:
+             bool: True if the number of records before the load plus the
+             number of splits is equal to the number of records after the
+             load.
+
+         """
+         if nrecs_a == nrecs_b:
+             ui.print_warning(f'{_PRE_WARN} No new documents added. Possibly already loaded?')
+         return nrecs_a == nrecs_b + len(self._docss)
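
A note on the retriever sizing in _get_keywords above: k is 10% of the document's record count, clamped to the 25-50 band and capped at the count itself. A standalone check of that arithmetic (the helper name is hypothetical; the expression is the one used in the code):

def k_for(nids: int) -> int:
    # Same bound as _get_keywords: 10% of the document's records,
    # clamped to [25, 50], and never more than nids itself.
    return min(nids, max(25, min(nids // 10, 50)))

print([(n, k_for(n)) for n in (10, 100, 400, 1000)])
# [(10, 10), (100, 25), (400, 40), (1000, 50)]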
docp/loaders/_chromabasepdfloader.py
@@ -0,0 +1,121 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ :Purpose:   This module provides the mid-level functionality to parse
+             and store PDF files into a Chroma vector database.
+
+ :Platform:  Linux/Windows | Python 3.10+
+ :Developer: J Berendt
+ :Email:     development@s3dev.uk
+
+ :Comments:  n/a
+
+ .. attention::
+
+     This module is *not* designed to be interacted with
+     directly, only via the appropriate interface class(es).
+
+     Rather, please create an instance of a Chroma PDF document
+     loading object using the following class:
+
+         - :class:`~docp.loaders.chromapdfloader.ChromaPDFLoader`
+
+ """
+
+ from langchain.docstore.document import Document
+ from utils4.user_interface import ui
+ # locals
+ try:
+     from .loaders._chromabaseloader import _ChromaBaseLoader
+     from .parsers.pdfparser import PDFParser
+ except ImportError:
+     from loaders._chromabaseloader import _ChromaBaseLoader
+     from parsers.pdfparser import PDFParser
+
+
+ class _ChromaBasePDFLoader(_ChromaBaseLoader):
+     """Base class for loading PDF documents into a Chroma vector database.
+
+     This class is a specialised version of the
+     :class:`~docp.loaders._chromabaseloader._ChromaBaseLoader` class,
+     designed to handle PDF documents.
+
+     Args:
+         dbpath (str | ChromaDB): Either the full path to the Chroma
+             database *directory*, or an instance of a
+             :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
+             passed, the ``collection`` argument is ignored.
+         collection (str, optional): Name of the Chroma database
+             collection. Only required if the ``dbpath`` parameter is a
+             path. Defaults to None.
+         split_text (bool, optional): Split the document into chunks
+             before loading it into the database. Defaults to True.
+         load_keywords (bool, optional): Derive keywords from the document
+             and load these into the sister keywords collection.
+             Defaults to False.
+         llm (object, optional): If deriving keywords, this is the LLM
+             which will do the derivation. Defaults to None.
+         offline (bool, optional): Remain offline and use the locally
+             cached embedding function model. Defaults to False.
+
+     """
+     # pylint: disable=attribute-defined-outside-init  # These are defined in the base class.
+
+     #
+     # No __init__ method here to ensure the ultimate base class'
+     # signature is used and to save passing loads of stuff around, if we
+     # don't have to.
+     #
+
+     def _create_documents(self) -> bool:
+         """Convert each extracted page into a ``Document`` object.
+
+         Returns:
+             bool: True if the pages are loaded as ``Document`` objects
+             successfully. Otherwise False.
+
+         """
+         for page in self._p.doc.pages:
+             if page.hastext:
+                 doc = Document(page_content=page.content,
+                                metadata={'source': self._p.doc.basename,
+                                          'pageno': page.pageno})
+                 # Prevent duplicates which cause chroma to fall over on load.
+                 if doc not in self._docs:
+                     self._docs.append(doc)
+         if not self._docs:
+             msg = f'{self._PFX_WARN} Text could not be parsed from {self._p.doc.basename}.'
+             ui.print_warning(msg)
+             return False
+         return True
+
+     def _parse_text(self, **kwargs) -> bool:
+         """Parse text from the document.
+
+         :Keyword Arguments:
+             Those to be passed into the text extraction method.
+
+         Returns:
+             bool: True if the parser's 'text' object is populated,
+             otherwise False.
+
+         """
+         print('- Extracting text ...')
+         self._p.extract_text(**kwargs)
+         if len(self._p.doc.pages) < 2:
+             ui.print_warning(f'No text extracted from {self._p.doc.basename}')
+             return False
+         return True
+
+     def _set_parser(self):
+         """Set the appropriate document parser.
+
+         Setting the parser creates a parser instance as an attribute of
+         this class. When the parser instance is created, various file
+         verification checks are made. For detail, refer to the following
+         parser method:
+
+             - :meth:`docp.parsers._pdfbaseparser._PDFBaseParser._open`
+
+         """
+         self._p = PDFParser(path=self._fpath)
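
As the module docstring directs, the class above is reached through ChromaPDFLoader in docp/loaders/chromapdfloader.py. A minimal usage sketch, assuming the public class mirrors the base-class constructor shown earlier and exposes a load(path) method wrapping the base _load(path, **kwargs) hook (the exact public signature is not shown in this diff):

from docp.loaders.chromapdfloader import ChromaPDFLoader

loader = ChromaPDFLoader(dbpath='/path/to/chroma',  # Chroma database *directory*.
                         collection='docs',         # Target collection name.
                         offline=True)              # Use the locally cached embedding model.
loader.load('report.pdf')                           # Parse, split, embed and store.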
docp/loaders/_chromabasepptxloader.py
@@ -0,0 +1,123 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ :Purpose:   This module provides the mid-level functionality to parse
+             and store PPTX files into a Chroma vector database.
+
+ :Platform:  Linux/Windows | Python 3.10+
+ :Developer: J Berendt
+ :Email:     development@s3dev.uk
+
+ :Comments:  n/a
+
+ .. attention::
+
+     This module is *not* designed to be interacted with
+     directly, only via the appropriate interface class(es).
+
+     Rather, please create an instance of a Chroma PPTX document
+     loading object using the following class:
+
+         - :class:`~docp.loaders.chromapptxloader.ChromaPPTXLoader`
+
+ """
+
+ from langchain.docstore.document import Document
+ from utils4.user_interface import ui
+ # locals
+ try:
+     from .loaders._chromabaseloader import _ChromaBaseLoader
+     from .parsers.pptxparser import PPTXParser
+ except ImportError:
+     from loaders._chromabaseloader import _ChromaBaseLoader
+     from parsers.pptxparser import PPTXParser
+
+
+ class _ChromaBasePPTXLoader(_ChromaBaseLoader):
+     """Base class for loading PPTX files into a Chroma vector database.
+
+     This class is a specialised version of the
+     :class:`~docp.loaders._chromabaseloader._ChromaBaseLoader` class,
+     designed to handle PPTX presentations.
+
+     Args:
+         dbpath (str | ChromaDB): Either the full path to the Chroma
+             database *directory*, or an instance of a
+             :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
+             passed, the ``collection`` argument is ignored.
+         collection (str, optional): Name of the Chroma database
+             collection. Only required if the ``dbpath`` parameter is a
+             path. Defaults to None.
+         split_text (bool, optional): Split the document into chunks
+             before loading it into the database. Defaults to True.
+         load_keywords (bool, optional): Derive keywords from the document
+             and load these into the sister keywords collection.
+             Defaults to False.
+         llm (object, optional): If deriving keywords, this is the LLM
+             which will do the derivation. Defaults to None.
+         offline (bool, optional): Remain offline and use the locally
+             cached embedding function model. Defaults to False.
+
+     """
+     # pylint: disable=attribute-defined-outside-init  # These are defined in the base class.
+
+     #
+     # No __init__ method here to ensure the ultimate base class'
+     # signature is used and to save passing loads of stuff around, if we
+     # don't have to.
+     #
+
+     def _create_documents(self) -> bool:
+         """Convert each extracted slide into a ``Document`` object.
+
+         Returns:
+             bool: True if the slides are loaded as ``Document`` objects
+             successfully. Otherwise False.
+
+         """
+         self._docs = []
+         for slide in self._p.doc.slides:
+             for txtobj in slide.texts:
+                 if txtobj and txtobj.hastext:
+                     doc = Document(page_content=txtobj.content,
+                                    metadata={'source': self._p.doc.basename,
+                                              'pageno': slide.pageno})
+                     # Prevent duplicates which cause chroma to fall over on load.
+                     if doc not in self._docs:
+                         self._docs.append(doc)
+         if not self._docs:
+             msg = f'{self._PFX_WARN} Text could not be parsed from {self._p.doc.basename}.'
+             ui.print_warning(msg)
+             return False
+         return True
+
+     def _parse_text(self, **kwargs) -> bool:
+         """Parse text from the presentation.
+
+         :Keyword Arguments:
+             Those to be passed into the text extraction method.
+
+         Returns:
+             bool: True if the parser's 'text' object is populated,
+             otherwise False.
+
+         """
+         print('- Extracting text ...')
+         self._p.extract_text(**kwargs)
+         if len(self._p.doc.slides) < 2:
+             ui.print_warning(f'No text extracted from {self._p.doc.basename}')
+             return False
+         return True
+
+     def _set_parser(self):
+         """Set the appropriate document parser.
+
+         Setting the parser creates a parser instance as an attribute of
+         this class. When the parser instance is created, various file
+         verification checks are made. For detail, refer to the following
+         parser method:
+
+             - :meth:`docp.parsers._pptxbaseparser._PPTXBaseParser._open`
+
+         """
+         self._p = PPTXParser(path=self._fpath)
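
The keyword-derivation mode works the same way for both loaders: per _check_parameters in the base class, load_keywords and llm must be supplied together, and _store_keywords writes the result to the sister '<collection>-kwds' collection. A sketch, again assuming a public load(path) method on ChromaPPTXLoader:

from docp.loaders.chromapptxloader import ChromaPPTXLoader

llm = ...  # A LangChain-compatible LLM instance (assumption: any model
           # accepted by RetrievalQA.from_chain_type will do).
loader = ChromaPPTXLoader(dbpath='/path/to/chroma',
                          collection='decks',
                          load_keywords=True,  # Requires llm (see _check_parameters).
                          llm=llm)
loader.load('slides.pptx')  # Slides into 'decks'; keywords into 'decks-kwds'.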