docp 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  2. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  3. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  4. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
  5. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  6. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  7. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  8. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  9. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  10. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  11. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  12. docp/__init__.py +19 -10
  13. docp/dbs/chroma.py +19 -6
  14. docp/libs/_version.py +1 -0
  15. docp/libs/changelog.py +7 -0
  16. docp/libs/utilities.py +107 -0
  17. docp/loaders/__init__.py +38 -0
  18. docp/loaders/_chromabaseloader.py +83 -107
  19. docp/loaders/_chromabaseloader.py.bak +378 -0
  20. docp/loaders/_chromabasepdfloader.py +121 -0
  21. docp/loaders/_chromabasepptxloader.py +123 -0
  22. docp/loaders/{chroma.py → chroma.py.bak} +38 -8
  23. docp/loaders/chromapdfloader.py +199 -0
  24. docp/loaders/chromapptxloader.py +192 -0
  25. docp/loaders/lutilities.py +52 -0
  26. docp/objects/_docbaseobject.py +7 -18
  27. docp/objects/_imgobject.py +0 -0
  28. docp/objects/_pageobject.py +3 -2
  29. docp/objects/_slideobject.py +110 -0
  30. docp/objects/_textobject.py +64 -0
  31. docp/objects/pdfobject.py +24 -2
  32. docp/objects/pptxobject.py +46 -0
  33. docp/parsers/_pdfbaseparser.py +36 -10
  34. docp/parsers/_pdftableparser.py +6 -7
  35. docp/parsers/_pdftextparser.py +23 -13
  36. docp/parsers/_pptxbaseparser.py +93 -0
  37. docp/parsers/_pptxtextparser.py +115 -0
  38. docp/parsers/pptxparser.py +51 -0
  39. docp/parsers/putilities.py +48 -0
  40. docp-0.2.0.dist-info/METADATA +110 -0
  41. docp-0.2.0.dist-info/RECORD +49 -0
  42. {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
  43. docp/_version.py +0 -1
  44. docp-0.1.0b1.dist-info/METADATA +0 -55
  45. docp-0.1.0b1.dist-info/RECORD +0 -23
  46. {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/LICENSE +0 -0
  47. {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
docp/__init__.py CHANGED
@@ -7,25 +7,34 @@
 :Developer: J Berendt
 :Email: development@s3dev.uk
 
-:Comments: n/a
+:Comments: Ths loader modules/classes have *not* been imported due to the
+           heavy dependency requirements. Refer to the loaders/__init__.py
+           module instead.
 
 """
 
 import os
 import sys
 sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
+from utils4.user_interface import ui
+# locals
+from .libs._version import __version__
+
+# TODO: Change these to use logging.
 
 # Bring entry-points to the surface.
 try:
-    from loaders.chroma import ChromaLoader
+    from .parsers.pdfparser import PDFParser
 except ImportError as err:
-    # The chroma loader requires a lot of backend which is not required for the parser.
-    msg = f'An error occurred while importing the Chroma loader:\n- {err}'
-    raise ImportError(msg) from err
+    msg = ('An error occurred while importing the PDF parser:\n'
+           f'- {err}\n'
+           ' - This can be ignored if the parser is not in use.\n')
+    ui.print_warning(f'\n[ImportError]: {msg}')
 
 try:
-    from .parsers.pdfparser import PDFParser
-    from ._version import __version__
-except ImportError:
-    from parsers.pdfparser import PDFParser
-    from _version import __version__
+    from .parsers.pptxparser import PPTXParser
+except ImportError as err:
+    msg = ('An error occurred while importing the PPTX parser:\n'
+           f'- {err}\n'
+           ' - This can be ignored if the parser is not in use.\n')
+    ui.print_warning(f'\n[ImportError]: {msg}')
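For context, the net effect of this change on the package surface: the parsers are imported at package level with a warning (rather than an `ImportError`) when their dependencies are missing, and the Chroma loaders move to `docp.loaders`. A minimal sketch, assuming the parser dependencies are installed (the file path is hypothetical):

    import docp

    print(docp.__version__)                       # '0.2.0', now sourced from docp/libs/_version.py
    parser = docp.PDFParser(path='./sample.pdf')  # PPTXParser is surfaced the same way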
docp/dbs/chroma.py CHANGED
@@ -10,11 +10,18 @@
 :Developer: J Berendt
 :Email: development@s3dev.uk
 
-:Comments: n/a
+:Comments: This module uses the
+           ``langchain_community.vectorstores.Chroma`` wrapper class,
+           rather than the base ``chromadb`` library as it provides the
+           ``add_texts`` method which supports GPU processing and
+           parallelisation; which is implemented by this module's
+           :meth:`~ChromaDB.add_documents` method.
 
 """
+# pylint: disable=import-error
 # pylint: disable=wrong-import-order
 
+from __future__ import annotations
 import chromadb
 import os
 import torch
@@ -81,19 +88,25 @@ class ChromaDB(_Chroma):
         """Accessor to the database's path."""
         return self._path
 
-    def add_documents(self, docs: list):
+    def add_documents(self, docs: list[langchain_core.documents.base.Document]):  # noqa  # pylint: disable=undefined-variable
         """Add multiple documents to the collection.
 
-        This method wraps ``Chroma.add_texts`` method which supports GPU
-        processing and parallelisation. The ID is derived locally from
-        the file's basename, page number and page content.
+        This method overrides the base class' ``add_documents`` method
+        to enable local ID derivation. Knowing *how* the IDs are derived
+        gives us greater understanding and querying ability of the
+        documents in the database. Each ID is derived locally by the
+        :meth:`_preproc` method from the file's basename, page number
+        and page content.
+
+        Additionally, this method wraps the
+        :func:`langchain_community.vectorstores.Chroma.add_texts`
+        method which supports GPU processing and parallelisation.
 
         Args:
             docs (list): A list of ``langchain_core.documents.base.Document``
                 document objects.
 
         """
-        # This method overrides the base class' add_documents method.
         # pylint: disable=arguments-differ
         # pylint: disable=arguments-renamed
         if not isinstance(docs, list):
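The new docstring delegates ID derivation to a `_preproc` method which is not shown in this diff. As an illustrative sketch only (the real `_preproc` may differ), a deterministic ID built from the three components named above could look like:

    import hashlib

    def derive_id(basename: str, pageno: int, content: str) -> str:
        # Hypothetical stand-in for ChromaDB._preproc: hashing the same
        # basename/page/content always yields the same ID, so re-loading
        # the same page produces a duplicate ID rather than a new record.
        key = f'{basename}_{pageno}_{content}'
        return hashlib.sha256(key.encode('utf-8')).hexdigest()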
docp/libs/_version.py ADDED
@@ -0,0 +1 @@
+__version__ = '0.2.0'
docp/libs/changelog.py ADDED
@@ -0,0 +1,7 @@
+# Changed.
+# ENABLE SPHINX TO ACCESS THE GIT LOG
+"""
+.. git_changelog::
+   :revisions: 99
+   :detailed-message-pre: True
+"""
docp/libs/utilities.py ADDED
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose: This module provides utility-based functionality for the
+          project.
+
+:Platform: Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email: development@s3dev.uk
+
+:Comments: n/a
+
+"""
+
+import os
+import sys
+sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../'))
+import re
+from glob import glob
+from utils4 import futils
+
+
+class Utilities:
+    """General (cross-project) utility functions."""
+
+    @staticmethod
+    def collect_files(path: str, ext: str, recursive: bool) -> list:
+        """Collect all files for a given extension from a path.
+
+        Args:
+            path (str): Full path serving as the root for the search.
+            ext (str, optional): If the ``path`` argument refers to a
+                *directory*, a specific file extension can be specified
+                here. For example: ``ext = 'pdf'``.
+
+                If anything other than ``'**'`` is provided, all
+                alpha-characters are parsed from the string, and prefixed
+                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
+                characters ``'pdf'`` are parsed and prefixed with ``*.``
+                to create ``'*.pdf'``. However, if ``'things.foo'`` is
+                passed, the derived extension will be ``'*.thingsfoo'``.
+                Defaults to '**', for a recursive search.
+            recursive (bool): Instruct the search to recurse into
+                sub-directories.
+
+        Returns:
+            list: The list of full file paths returned by the ``glob``
+            call. Any directory-only paths are removed.
+
+        """
+        if ext != '**':
+            ext = f'*.{re.findall("[a-zA-Z]+", ext)[0]}'
+        return list(filter(os.path.isfile, glob(os.path.join(path, ext), recursive=recursive)))
+
+    # !!!: Replace this with utils4.futils when available.
+    @staticmethod
+    def ispdf(path: str) -> bool:
+        """Test the file signature. Verify this is a valid PDF file.
+
+        Args:
+            path (str): Path to the file being tested.
+
+        Returns:
+            bool: True if this is a valid PDF file, otherwise False.
+
+        """
+        with open(path, 'rb') as f:
+            sig = f.read(5)
+        return sig == b'\x25\x50\x44\x46\x2d'
+
+    @staticmethod
+    def iszip(path: str) -> bool:
+        """Test the file signature. Verify this is a valid ZIP archive.
+
+        Args:
+            path (str): Path to the file being tested.
+
+        Returns:
+            bool: True if this is a valid ZIP archive, otherwise False.
+
+        """
+        return futils.iszip(path)
+
+    @staticmethod
+    def parse_to_keywords(resp: str) -> list:
+        """Parse the bot's response into a list of keywords.
+
+        Args:
+            resp (str): Text response directly from the bot.
+
+        Returns:
+            list: A list of keywords extracted from the response,
+            separated by asterisks as bullet points.
+
+        """
+        # Capture asterisk bullet points or a numbered list.
+        rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)\n')
+        trans = {45: ' ', 47: ' '}
+        resp_ = resp.translate(trans).lower()
+        kwds = rexp.findall(resp_)
+        if kwds:
+            return ', '.join(kwds)
+        return ''
+
+
+utilities = Utilities()
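Two points worth noting on this module. First, `collect_files` filters directory-only paths out of the `glob` result, so the return is files only. Second, although `parse_to_keywords` is annotated as returning a `list`, the code actually returns a comma-separated string (or an empty string). A quick usage sketch, with hypothetical bot output:

    from docp.libs.utilities import utilities

    resp = ('Keywords:\n'
            '* Vector Databases\n'
            '* GPU-Accelerated Search\n')
    # Hyphens and slashes are translated to spaces, the text is lower-cased,
    # then the bullet/numbered-list regex captures each item.
    print(utilities.parse_to_keywords(resp=resp))
    # -> 'vector databases, gpu accelerated search'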
docp/loaders/__init__.py CHANGED
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose: This module provides the project initilisation logic.
+
+:Platform: Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email: development@s3dev.uk
+
+:Comments: n/a
+
+"""
+
+import os
+import sys
+sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
+from utils4.user_interface import ui
+
+# TODO: Change these to use logging.
+
+# Bring entry-points to the surface.
+try:
+    from .chromapdfloader import ChromaPDFLoader
+except ImportError as err:
+    # The chroma loader requires a lot of backend which is not required for the parser.
+    msg = ('An error occurred while importing the Chroma PDF loader:\n'
+           f'- {err}\n'
+           ' - This can be ignored if the loader is not in use.\n')
+    ui.print_warning(f'\n[ImportError]: {msg}')
+
+try:
+    from .chromapptxloader import ChromaPPTXLoader
+except ImportError as err:
+    # The chroma loader requires a lot of backend which is not required for the parser.
+    msg = ('An error occurred while importing the Chroma PPTX loader:\n'
+           f'- {err}\n'
+           ' - This can be ignored if the loader is not in use.\n')
+    ui.print_warning(f'\n[ImportError]: {msg}')
docp/loaders/_chromabaseloader.py CHANGED
@@ -1,8 +1,8 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-:Purpose: This module provides functionality to parse and store
-          document data into a Chroma vector database.
+:Purpose: This module provides the base functionality for parsing and
+          storing a document's data into a Chroma vector database.
 
 :Platform: Linux/Windows | Python 3.10+
 :Developer: J Berendt
@@ -10,12 +10,23 @@
 
 :Comments: n/a
 
+    .. attention::
+
+        This module is *not* designed to be interacted with
+        directly, only via the appropriate interface class(es).
+
+        Rather, please create an instance of a Chroma
+        document-type-specific loader object using one of the
+        following classes:
+
+        - :class:`~docp.loaders.chromapdfloader.ChromaPDFLoader`
+        - :class:`~docp.loaders.chromapptxloader.ChromaPPTXLoader`
+
 """
 # pylint: disable=no-name-in-module  # langchain.chains.RetrievalQA
 
 import contextlib
 import os
-import re
 from chromadb.api.types import errors as chromadberrors
 from langchain.chains import RetrievalQA
 from langchain.docstore.document import Document
@@ -25,69 +36,51 @@ from utils4.user_interface import ui
 # locals
 try:
     from .dbs.chroma import ChromaDB
-    from .parsers.pdfparser import PDFParser
+    from .libs.utilities import utilities
 except ImportError:
     from dbs.chroma import ChromaDB
-    from parsers.pdfparser import PDFParser
-
-_PRE_ERR = '\n[ERROR]:'
-_PRE_WARN = '\n[WARNING]:'
-
-
-class Tools:
-    """General tools used for loading documents."""
-
-    @staticmethod
-    def parse_to_keywords(resp: str) -> list:
-        """Parse the bot's response into a list of keywords.
-
-        Args:
-            resp (str): Text response directly from the bot.
-
-        Returns:
-            list: A list of keywords extracted from the response,
-            separated by asterisks as bullet points.
-
-        """
-        # Capture asterisk bullet points or a numbered list.
-        rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)\n')
-        trans = {45: ' ', 47: ' '}
-        resp_ = resp.translate(trans).lower()
-        kwds = rexp.findall(resp_)
-        if kwds:
-            return ', '.join(kwds)
-        return ''
+    from libs.utilities import utilities
 
 
 class _ChromaBaseLoader:
     """Base class for loading documents into a Chroma vector database.
 
     Args:
-        path (str): Full path to the file to be parsed and loaded.
-        dbpath (str | Chroma): Either the full path to the Chroma database
-            *directory*, or an instance of a :class:`~dbs.chroma.Chroma`
-            database. If the instance is passed, the ``collection``
-            argument is ignored.
+        dbpath (str | ChromaDB): Either the full path to the Chroma
+            database *directory*, or an instance of a
+            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
+            passed, the ``collection`` argument is ignored.
         collection (str, optional): Name of the Chroma database
             collection. Only required if the ``db`` parameter is a path.
            Defaults to None.
+        split_text (bool, optional): Split the document into chunks,
+            before loading it into the database. Defaults to True.
+        load_keywords (bool, optional): Derive keywords from the document
+            and load these into the sister keywords collection.
+            Defaults to False.
+        llm (object, optional): If deriving keywords, this is the LLM
+            which will do the derivation. Defaults to None.
         offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.
 
     """
+    # pylint: disable=assignment-from-no-return  # These are stub methods.
 
-    _PARSERS = {'.pdf': PDFParser}
+    _PFX_ERR = '\n[ERROR]:'
+    _PFX_WARN = '\n[WARNING]:'
 
     def __init__(self,
                  dbpath: str | ChromaDB,
                  collection: str=None,
                  *,
+                 split_text: bool=True,
                  load_keywords: bool=False,
                  llm: object=None,
                  offline: bool=False):
         """Chroma database class initialiser."""
         self._dbpath = dbpath
         self._cname = collection
+        self._split_text = split_text
         self._load_keywords = load_keywords
         self._llm = llm
         self._offline = offline
@@ -111,6 +104,28 @@
         """Accessor to the document parser object."""
         return self._p
 
+    def _already_loaded(self) -> bool:
+        """Test if the file has already been loaded into the collection.
+
+        :Logic:
+            This test is performed by querying the collection for a
+            metadata 'source' which equals the filename. As this uses
+            a chromadb 'filter' (i.e. ``$eq``), testing for partial
+            matches is not possible at this time.
+
+            If the filename is different (in any way) from the source's
+            filename in the database, the file will be loaded again.
+
+        Returns:
+            bool: True is the *exact* filename was found in the
+            collection's metadata, otherwise False.
+
+        """
+        if self._dbo.collection.get(where={'source': {'$eq': self._fbase}})['ids']:
+            print(f'-- File already loaded: {self._fbase} - skipping')
+            return True
+        return False
+
     def _check_parameters(self) -> None:
         """Verify the class parameters are viable.
 
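The `$eq` expression used by `_already_loaded` above is standard chromadb `where`-filter syntax. A standalone equivalent of the same exact-match check (the path and names are hypothetical):

    import chromadb

    client = chromadb.PersistentClient(path='/tmp/chroma')
    collection = client.get_or_create_collection('docs')
    # Exact metadata match only; chromadb where-filters have no partial matching.
    hits = collection.get(where={'source': {'$eq': 'report.pdf'}})
    already_loaded = bool(hits['ids'])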
@@ -125,22 +140,7 @@
             'must be True and a model instance must be provided.')
 
     def _create_documents(self) -> bool:
-        """Convert each extracted page into a ``Document`` object.
-
-        Returns:
-            bool: True of the pages are loaded as ``Document`` objects
-                successfully. Otherwise False.
-
-        """
-        self._docs = [Document(page_content=page.content,
-                               metadata={'source': self._p.doc.basename,
-                                         'pageno': page.pageno})
-                      for page in self._p.doc.pages if page.hastext]
-        if not self._docs:
-            msg = f'{_PRE_WARN} Text could not be parsed from {self._p.doc.basename}.'
-            ui.print_warning(msg)
-            return False
-        return True
+        """Stub method; overridden by the child class."""
 
     def _get_keywords(self) -> str:
         """Query the document (using the LLM) to extract the keywords."""
@@ -161,24 +161,27 @@
                          return_source_documents=True,
                          verbose=True)
         resp = qa.invoke(qry)
-        kwds = Tools.parse_to_keywords(resp=resp['result'])
+        kwds = utilities.parse_to_keywords(resp=resp['result'])
         return kwds
 
     def _load(self, path: str, **kwargs):
-        """Load the selected files into the vector store.
+        """Load the provided file into the vector store.
 
         Args:
             path (str): Full path to the file to be loaded.
 
         :Keyword Arguments:
-            Those passed from the loader-specific ``load`` method.
+            Those passed from the document-type-specific loader's
+            :func:`load` method.
 
         """
         # pylint: disable=multiple-statements
         self._fpath = path
         self._fbase = os.path.basename(path)
-        s = self._set_parser()
-        if s: s = self._set_text_splitter()
+        if self._already_loaded():
+            return
+        self._set_parser()
+        s = self._set_text_splitter()
         if s: s = self._parse_text(**kwargs)
         if s: s = self._create_documents()
         if s: s = self._split_texts()
@@ -198,6 +201,7 @@
             exceptions being raised.
 
         """
+        # pylint: disable=line-too-long
         try:
             print('- Loading the document into the database ...')
             nrecs_b = self._dbo.collection.count()  # Count records before.
@@ -205,29 +209,14 @@
             nrecs_a = self._dbo.collection.count()  # Count records after.
             return self._test_load(nrecs_b=nrecs_b, nrecs_a=nrecs_a)
         except chromadberrors.DuplicateIDError:
-            print('-- Document already loaded; duplicate detected.')
+            print(' -- Document *chunk* already loaded, duplication detected. File may be corrupt.')
             return False  # Prevent from loading keywords.
         except Exception as err:
             reporterror(err)
             return False
 
     def _parse_text(self, **kwargs) -> bool:
-        """Parse text from the document.
-
-        :Keyword Arguments:
-            Those to be passed into the text extraction method.
-
-        Returns:
-            bool: True if the parser's 'text' object is populated,
-                otherwise False.
-
-        """
-        print('- Extracting text ...')
-        self._p.extract_text(**kwargs)
-        if len(self._p.doc.pages) < 2:
-            ui.print_warning(f'No text extracted from {self._p.doc.basename}')
-            return False
-        return True
+        """Stub method, overridden by the child class."""
 
     @staticmethod
     def _print_summary(success: bool):
@@ -266,30 +255,8 @@
             return False
         return True
 
-    def _set_parser(self) -> bool:
-        """Set the appropriate document parser.
-
-        :Rationale:
-            The parser is set by the file extension. For example, a file
-            extension ``.pdf`` will set the
-            :class:`parsers.pdfparser.PDFParser` class.
-
-        Returns:
-            bool: True if a file extension appropriate parser was found.
-                Otherwise, False.
-
-        """
-        # pylint: disable=invalid-name  # OK as the variable (Parser) is a class.
-        # TODO: Updated this to use the (not-yet-available) ispdf utility
-        #       function, rather than relying on the file extension.
-        ext = os.path.splitext(self._fpath)[1]
-        Parser = self._PARSERS.get(ext)
-        if not Parser:
-            msg = f'{_PRE_WARN} Document parser not set for {os.path.basename(self._fpath)}.'
-            ui.print_warning(msg)
-            return False
-        self._p = Parser(path=self._fpath)
-        return True
+    def _set_parser(self):
+        """Stub method, overridden by the child class."""
 
     # TODO: Add these to a config file.
     def _set_text_splitter(self) -> bool:
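The parser-selection, text-extraction and document-creation logic removed above moves into the document-type-specific loaders (files 20-24 in the list, not shown here). A rough sketch of how a child class plausibly fills in the three stubs, reassembled from the removed code; the class name is hypothetical, see docp/loaders/chromapdfloader.py for the real implementation:

    from langchain.docstore.document import Document

    from docp.loaders._chromabaseloader import _ChromaBaseLoader
    from docp.parsers.pdfparser import PDFParser


    class _SketchPDFLoader(_ChromaBaseLoader):
        """Illustrative only: one parser per loader, so no extension lookup."""

        def _set_parser(self):
            self._p = PDFParser(path=self._fpath)

        def _parse_text(self, **kwargs) -> bool:
            # Mirrors the removed _parse_text: extract, then verify pages exist.
            print('- Extracting text ...')
            self._p.extract_text(**kwargs)
            return len(self._p.doc.pages) > 1

        def _create_documents(self) -> bool:
            # Mirrors the removed _create_documents: one Document per page with text.
            self._docs = [Document(page_content=page.content,
                                   metadata={'source': self._p.doc.basename,
                                             'pageno': page.pageno})
                          for page in self._p.doc.pages if page.hastext]
            return bool(self._docs)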
@@ -307,15 +274,24 @@
     def _split_texts(self) -> bool:
         """Split the document text using a recursive text splitter.
 
+        Note:
+            If the ``split_text`` parameter was passed as ``False`` on
+            instantiation, the texts will not be split. Rather, the
+            :attr:`_docs` list is simply *copied* to the :attr:`_docss`
+            attribute.
+
         Returns:
-            bool: True if the text was split successfully, otherwise
-                False.
+            bool: True if the text was split (or copied) successfully,
+                otherwise False.
 
         """
-        self._docss = self._splitter.split_documents(self._docs)
+        if self._split_text:
+            self._docss = self._splitter.split_documents(self._docs)
+        else:
+            self._docss = self._docs[:]
         if not self._docss:
-            msg = (f'{_PRE_ERR} An error occurred while splitting the documents for '
-                   f'{self._p.doc.basename}.')
+            msg = (f'{self._PFX_ERR} An error occurred while splitting the documents for '
+                   f'{self._fbase}.')
             ui.print_warning(msg)
             return False
         return True
@@ -358,5 +334,5 @@
 
         """
         if nrecs_a == nrecs_b:
-            ui.print_warning(f'{_PRE_WARN} No new documents added. Possibly already loaded?')
+            ui.print_warning(f'{self._PFX_WARN} No new documents added. Possibly already loaded?')
         return nrecs_a == nrecs_b + len(self._docss)
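Putting the pieces together, end-to-end usage of the new loaders presumably looks like the following. The constructor arguments come from the base class above; the `load` call, paths and collection name are assumptions, as the concrete loader modules are not included in this section:

    from docp.loaders import ChromaPDFLoader

    loader = ChromaPDFLoader(dbpath='/tmp/chroma',   # hypothetical database directory
                             collection='docs',
                             split_text=True,
                             offline=True)
    loader.load('/path/to/report.pdf')               # hypothetical document path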