docp 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
- docp/__init__.py +19 -10
- docp/dbs/chroma.py +19 -6
- docp/libs/_version.py +1 -0
- docp/libs/changelog.py +7 -0
- docp/libs/utilities.py +107 -0
- docp/loaders/__init__.py +38 -0
- docp/loaders/_chromabaseloader.py +83 -107
- docp/loaders/_chromabaseloader.py.bak +378 -0
- docp/loaders/_chromabasepdfloader.py +121 -0
- docp/loaders/_chromabasepptxloader.py +123 -0
- docp/loaders/{chroma.py → chroma.py.bak} +38 -8
- docp/loaders/chromapdfloader.py +199 -0
- docp/loaders/chromapptxloader.py +192 -0
- docp/loaders/lutilities.py +52 -0
- docp/objects/_docbaseobject.py +7 -18
- docp/objects/_imgobject.py +0 -0
- docp/objects/_pageobject.py +3 -2
- docp/objects/_slideobject.py +110 -0
- docp/objects/_textobject.py +64 -0
- docp/objects/pdfobject.py +24 -2
- docp/objects/pptxobject.py +46 -0
- docp/parsers/_pdfbaseparser.py +36 -10
- docp/parsers/_pdftableparser.py +6 -7
- docp/parsers/_pdftextparser.py +23 -13
- docp/parsers/_pptxbaseparser.py +93 -0
- docp/parsers/_pptxtextparser.py +115 -0
- docp/parsers/pptxparser.py +51 -0
- docp/parsers/putilities.py +48 -0
- docp-0.2.0.dist-info/METADATA +110 -0
- docp-0.2.0.dist-info/RECORD +49 -0
- {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
- docp/_version.py +0 -1
- docp-0.1.0b1.dist-info/METADATA +0 -55
- docp-0.1.0b1.dist-info/RECORD +0 -23
- {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/LICENSE +0 -0
- {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
docp/__init__.py
CHANGED
@@ -7,25 +7,34 @@
|
|
7
7
|
:Developer: J Berendt
|
8
8
|
:Email: development@s3dev.uk
|
9
9
|
|
10
|
-
:Comments:
|
10
|
+
:Comments: The loader modules/classes have *not* been imported due to the
|
11
|
+
heavy dependency requirements. Refer to the loaders/__init__.py
|
12
|
+
module instead.
|
11
13
|
|
12
14
|
"""
|
13
15
|
|
14
16
|
import os
|
15
17
|
import sys
|
16
18
|
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
|
19
|
+
from utils4.user_interface import ui
|
20
|
+
# locals
|
21
|
+
from .libs._version import __version__
|
22
|
+
|
23
|
+
# TODO: Change these to use logging.
|
17
24
|
|
18
25
|
# Bring entry-points to the surface.
|
19
26
|
try:
|
20
|
-
from
|
27
|
+
from .parsers.pdfparser import PDFParser
|
21
28
|
except ImportError as err:
|
22
|
-
|
23
|
-
|
24
|
-
|
29
|
+
msg = ( 'An error occurred while importing the PDF parser:\n'
|
30
|
+
f'- {err}\n'
|
31
|
+
' - This can be ignored if the parser is not in use.\n')
|
32
|
+
ui.print_warning(f'\n[ImportError]: {msg}')
|
25
33
|
|
26
34
|
try:
|
27
|
-
from .parsers.
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
35
|
+
from .parsers.pptxparser import PPTXParser
|
36
|
+
except ImportError as err:
|
37
|
+
msg = ( 'An error occurred while importing the PPTX parser:\n'
|
38
|
+
f'- {err}\n'
|
39
|
+
' - This can be ignored if the parser is not in use.\n')
|
40
|
+
ui.print_warning(f'\n[ImportError]: {msg}')
|
docp/dbs/chroma.py
CHANGED
@@ -10,11 +10,18 @@
|
|
10
10
|
:Developer: J Berendt
|
11
11
|
:Email: development@s3dev.uk
|
12
12
|
|
13
|
-
:Comments:
|
13
|
+
:Comments: This module uses the
|
14
|
+
``langchain_community.vectorstores.Chroma`` wrapper class,
|
15
|
+
rather than the base ``chromadb`` library as it provides the
|
16
|
+
``add_texts`` method which supports GPU processing and
|
17
|
+
parallelisation; which is implemented by this module's
|
18
|
+
:meth:`~ChromaDB.add_documents` method.
|
14
19
|
|
15
20
|
"""
|
21
|
+
# pylint: disable=import-error
|
16
22
|
# pylint: disable=wrong-import-order
|
17
23
|
|
24
|
+
from __future__ import annotations
|
18
25
|
import chromadb
|
19
26
|
import os
|
20
27
|
import torch
|
@@ -81,19 +88,25 @@ class ChromaDB(_Chroma):
|
|
81
88
|
"""Accessor to the database's path."""
|
82
89
|
return self._path
|
83
90
|
|
84
|
-
def add_documents(self, docs: list):
|
91
|
+
def add_documents(self, docs: list[langchain_core.documents.base.Document]): # noqa # pylint: disable=undefined-variable
|
85
92
|
"""Add multiple documents to the collection.
|
86
93
|
|
87
|
-
This method
|
88
|
-
|
89
|
-
|
94
|
+
This method overrides the base class' ``add_documents`` method
|
95
|
+
to enable local ID derivation. Knowing *how* the IDs are derived
|
96
|
+
gives us greater understanding and querying ability of the
|
97
|
+
documents in the database. Each ID is derived locally by the
|
98
|
+
:meth:`_preproc` method from the file's basename, page number
|
99
|
+
and page content.
|
100
|
+
|
101
|
+
Additionally, this method wraps the
|
102
|
+
:func:`langchain_community.vectorstores.Chroma.add_texts`
|
103
|
+
method which supports GPU processing and parallelisation.
|
90
104
|
|
91
105
|
Args:
|
92
106
|
docs (list): A list of ``langchain_core.documents.base.Document``
|
93
107
|
document objects.
|
94
108
|
|
95
109
|
"""
|
96
|
-
# This method overrides the base class' add_documents method.
|
97
110
|
# pylint: disable=arguments-differ
|
98
111
|
# pylint: disable=arguments-renamed
|
99
112
|
if not isinstance(docs, list):
|
docp/libs/_version.py
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = '0.2.0'
|
docp/libs/changelog.py
ADDED
docp/libs/utilities.py
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides utility-based functionality for the
|
5
|
+
project.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
"""
|
14
|
+
|
15
|
+
import os
|
16
|
+
import sys
|
17
|
+
sys.path.insert(0, os.path.join(os.path.dirname(os.path.realpath(__file__)), '../../'))
|
18
|
+
import re
|
19
|
+
from glob import glob
|
20
|
+
from utils4 import futils
|
21
|
+
|
22
|
+
|
23
|
+
class Utilities:
|
24
|
+
"""General (cross-project) utility functions."""
|
25
|
+
|
26
|
+
@staticmethod
|
27
|
+
def collect_files(path: str, ext: str, recursive: bool) -> list:
|
28
|
+
"""Collect all files for a given extension from a path.
|
29
|
+
|
30
|
+
Args:
|
31
|
+
path (str): Full path serving as the root for the search.
|
32
|
+
ext (str, optional): If the ``path`` argument refers to a
|
33
|
+
*directory*, a specific file extension can be specified
|
34
|
+
here. For example: ``ext = 'pdf'``.
|
35
|
+
|
36
|
+
If anything other than ``'**'`` is provided, all
|
37
|
+
alpha-characters are parsed from the string, and prefixed
|
38
|
+
with ``*.``. Meaning, if ``'.pdf'`` is passed, the
|
39
|
+
characters ``'pdf'`` are parsed and prefixed with ``*.``
|
40
|
+
to create ``'*.pdf'``. However, if ``'things.foo'`` is
|
41
|
+
passed, the derived extension will be ``'*.thingsfoo'``.
|
42
|
+
Defaults to '**', for a recursive search.
|
43
|
+
|
44
|
+
recursive (bool): Instruct the search to recurse into
|
45
|
+
sub-directories.
|
46
|
+
|
47
|
+
Returns:
|
48
|
+
list: The list of full file paths returned by the ``glob``
|
49
|
+
call. Any directory-only paths are removed.
|
50
|
+
|
51
|
+
"""
|
52
|
+
if ext != '**':
|
53
|
+
ext = f'*.{re.findall("[a-zA-Z]+", ext)[0]}'
|
54
|
+
return list(filter(os.path.isfile, glob(os.path.join(path, ext), recursive=recursive)))
|
55
|
+
|
56
|
+
# !!!: Replace this with utils4.futils when available.
|
57
|
+
@staticmethod
|
58
|
+
def ispdf(path: str) -> bool:
|
59
|
+
"""Test the file signature. Verify this is a valid PDF file.
|
60
|
+
|
61
|
+
Args:
|
62
|
+
path (str): Path to the file being tested.
|
63
|
+
|
64
|
+
Returns:
|
65
|
+
bool: True if this is a valid PDF file, otherwise False.
|
66
|
+
|
67
|
+
"""
|
68
|
+
with open(path, 'rb') as f:
|
69
|
+
sig = f.read(5)
|
70
|
+
return sig == b'\x25\x50\x44\x46\x2d'
|
71
|
+
|
72
|
+
@staticmethod
|
73
|
+
def iszip(path: str) -> bool:
|
74
|
+
"""Test the file signature. Verify this is a valid ZIP archive.
|
75
|
+
|
76
|
+
Args:
|
77
|
+
path (str): Path to the file being tested.
|
78
|
+
|
79
|
+
Returns:
|
80
|
+
bool: True if this is a valid ZIP archive, otherwise False.
|
81
|
+
|
82
|
+
"""
|
83
|
+
return futils.iszip(path)
|
84
|
+
|
85
|
+
@staticmethod
|
86
|
+
def parse_to_keywords(resp: str) -> list:
|
87
|
+
"""Parse the bot's response into a list of keywords.
|
88
|
+
|
89
|
+
Args:
|
90
|
+
resp (str): Text response directly from the bot.
|
91
|
+
|
92
|
+
Returns:
|
93
|
+
list: A list of keywords extracted from the response,
|
94
|
+
separated by asterisks as bullet points.
|
95
|
+
|
96
|
+
"""
|
97
|
+
# Capture asterisk bullet points or a numbered list.
|
98
|
+
rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)\n')
|
99
|
+
trans = {45: ' ', 47: ' '}
|
100
|
+
resp_ = resp.translate(trans).lower()
|
101
|
+
kwds = rexp.findall(resp_)
|
102
|
+
if kwds:
|
103
|
+
return ', '.join(kwds)
|
104
|
+
return ''
|
105
|
+
|
106
|
+
|
107
|
+
utilities = Utilities()
|
docp/loaders/__init__.py
CHANGED
@@ -0,0 +1,38 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the project initialisation logic.
|
5
|
+
|
6
|
+
:Platform: Linux/Windows | Python 3.10+
|
7
|
+
:Developer: J Berendt
|
8
|
+
:Email: development@s3dev.uk
|
9
|
+
|
10
|
+
:Comments: n/a
|
11
|
+
|
12
|
+
"""
|
13
|
+
|
14
|
+
import os
|
15
|
+
import sys
|
16
|
+
sys.path.insert(0, os.path.dirname(os.path.realpath(__file__)))
|
17
|
+
from utils4.user_interface import ui
|
18
|
+
|
19
|
+
# TODO: Change these to use logging.
|
20
|
+
|
21
|
+
# Bring entry-points to the surface.
|
22
|
+
try:
|
23
|
+
from .chromapdfloader import ChromaPDFLoader
|
24
|
+
except ImportError as err:
|
25
|
+
# The chroma loader requires a lot of backend which is not required for the parser.
|
26
|
+
msg = ( 'An error occurred while importing the Chroma PDF loader:\n'
|
27
|
+
f'- {err}\n'
|
28
|
+
' - This can be ignored if the loader is not in use.\n')
|
29
|
+
ui.print_warning(f'\n[ImportError]: {msg}')
|
30
|
+
|
31
|
+
try:
|
32
|
+
from .chromapptxloader import ChromaPPTXLoader
|
33
|
+
except ImportError as err:
|
34
|
+
# The chroma loader requires a lot of backend which is not required for the parser.
|
35
|
+
msg = ( 'An error occurred while importing the Chroma PPTX loader:\n'
|
36
|
+
f'- {err}\n'
|
37
|
+
' - This can be ignored if the loader is not in use.\n')
|
38
|
+
ui.print_warning(f'\n[ImportError]: {msg}')
|
@@ -1,8 +1,8 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
2
|
# -*- coding: utf-8 -*-
|
3
3
|
"""
|
4
|
-
:Purpose: This module provides functionality
|
5
|
-
document data into a Chroma vector database.
|
4
|
+
:Purpose: This module provides the base functionality for parsing and
|
5
|
+
storing a document's data into a Chroma vector database.
|
6
6
|
|
7
7
|
:Platform: Linux/Windows | Python 3.10+
|
8
8
|
:Developer: J Berendt
|
@@ -10,12 +10,23 @@
|
|
10
10
|
|
11
11
|
:Comments: n/a
|
12
12
|
|
13
|
+
.. attention::
|
14
|
+
|
15
|
+
This module is *not* designed to be interacted with
|
16
|
+
directly, only via the appropriate interface class(es).
|
17
|
+
|
18
|
+
Rather, please create an instance of a Chroma
|
19
|
+
document-type-specific loader object using one of the
|
20
|
+
following classes:
|
21
|
+
|
22
|
+
- :class:`~docp.loaders.chromapdfloader.ChromaPDFLoader`
|
23
|
+
- :class:`~docp.loaders.chromapptxloader.ChromaPPTXLoader`
|
24
|
+
|
13
25
|
"""
|
14
26
|
# pylint: disable=no-name-in-module # langchain.chains.RetrievalQA
|
15
27
|
|
16
28
|
import contextlib
|
17
29
|
import os
|
18
|
-
import re
|
19
30
|
from chromadb.api.types import errors as chromadberrors
|
20
31
|
from langchain.chains import RetrievalQA
|
21
32
|
from langchain.docstore.document import Document
|
@@ -25,69 +36,51 @@ from utils4.user_interface import ui
|
|
25
36
|
# locals
|
26
37
|
try:
|
27
38
|
from .dbs.chroma import ChromaDB
|
28
|
-
from .
|
39
|
+
from .libs.utilities import utilities
|
29
40
|
except ImportError:
|
30
41
|
from dbs.chroma import ChromaDB
|
31
|
-
from
|
32
|
-
|
33
|
-
_PRE_ERR = '\n[ERROR]:'
|
34
|
-
_PRE_WARN = '\n[WARNING]:'
|
35
|
-
|
36
|
-
|
37
|
-
class Tools:
|
38
|
-
"""General tools used for loading documents."""
|
39
|
-
|
40
|
-
@staticmethod
|
41
|
-
def parse_to_keywords(resp: str) -> list:
|
42
|
-
"""Parse the bot's response into a list of keywords.
|
43
|
-
|
44
|
-
Args:
|
45
|
-
resp (str): Text response directly from the bot.
|
46
|
-
|
47
|
-
Returns:
|
48
|
-
list: A list of keywords extracted from the response,
|
49
|
-
separated by asterisks as bullet points.
|
50
|
-
|
51
|
-
"""
|
52
|
-
# Capture asterisk bullet points or a numbered list.
|
53
|
-
rexp = re.compile(r'(?:\*|[0-9]+\.)\s*(.*)\n')
|
54
|
-
trans = {45: ' ', 47: ' '}
|
55
|
-
resp_ = resp.translate(trans).lower()
|
56
|
-
kwds = rexp.findall(resp_)
|
57
|
-
if kwds:
|
58
|
-
return ', '.join(kwds)
|
59
|
-
return ''
|
42
|
+
from libs.utilities import utilities
|
60
43
|
|
61
44
|
|
62
45
|
class _ChromaBaseLoader:
|
63
46
|
"""Base class for loading documents into a Chroma vector database.
|
64
47
|
|
65
48
|
Args:
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
argument is ignored.
|
49
|
+
dbpath (str | ChromaDB): Either the full path to the Chroma
|
50
|
+
database *directory*, or an instance of a
|
51
|
+
:class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
|
52
|
+
passed, the ``collection`` argument is ignored.
|
71
53
|
collection (str, optional): Name of the Chroma database
|
72
54
|
collection. Only required if the ``db`` parameter is a path.
|
73
55
|
Defaults to None.
|
56
|
+
split_text (bool, optional): Split the document into chunks,
|
57
|
+
before loading it into the database. Defaults to True.
|
58
|
+
load_keywords (bool, optional): Derive keywords from the document
|
59
|
+
and load these into the sister keywords collection.
|
60
|
+
Defaults to False.
|
61
|
+
llm (object, optional): If deriving keywords, this is the LLM
|
62
|
+
which will do the derivation. Defaults to None.
|
74
63
|
offline (bool, optional): Remain offline and use the locally
|
75
64
|
cached embedding function model. Defaults to False.
|
76
65
|
|
77
66
|
"""
|
67
|
+
# pylint: disable=assignment-from-no-return # These are stub methods.
|
78
68
|
|
79
|
-
|
69
|
+
_PFX_ERR = '\n[ERROR]:'
|
70
|
+
_PFX_WARN = '\n[WARNING]:'
|
80
71
|
|
81
72
|
def __init__(self,
|
82
73
|
dbpath: str | ChromaDB,
|
83
74
|
collection: str=None,
|
84
75
|
*,
|
76
|
+
split_text: bool=True,
|
85
77
|
load_keywords: bool=False,
|
86
78
|
llm: object=None,
|
87
79
|
offline: bool=False):
|
88
80
|
"""Chroma database class initialiser."""
|
89
81
|
self._dbpath = dbpath
|
90
82
|
self._cname = collection
|
83
|
+
self._split_text = split_text
|
91
84
|
self._load_keywords = load_keywords
|
92
85
|
self._llm = llm
|
93
86
|
self._offline = offline
|
@@ -111,6 +104,28 @@ class _ChromaBaseLoader:
|
|
111
104
|
"""Accessor to the document parser object."""
|
112
105
|
return self._p
|
113
106
|
|
107
|
+
def _already_loaded(self) -> bool:
|
108
|
+
"""Test if the file has already been loaded into the collection.
|
109
|
+
|
110
|
+
:Logic:
|
111
|
+
This test is performed by querying the collection for a
|
112
|
+
metadata 'source' which equals the filename. As this uses
|
113
|
+
a chromadb 'filter' (i.e. ``$eq``), testing for partial
|
114
|
+
matches is not possible at this time.
|
115
|
+
|
116
|
+
If the filename is different (in any way) from the source's
|
117
|
+
filename in the database, the file will be loaded again.
|
118
|
+
|
119
|
+
Returns:
|
120
|
+
bool: True if the *exact* filename was found in the
|
121
|
+
collection's metadata, otherwise False.
|
122
|
+
|
123
|
+
"""
|
124
|
+
if self._dbo.collection.get(where={'source': {'$eq': self._fbase}})['ids']:
|
125
|
+
print(f'-- File already loaded: {self._fbase} - skipping')
|
126
|
+
return True
|
127
|
+
return False
|
128
|
+
|
114
129
|
def _check_parameters(self) -> None:
|
115
130
|
"""Verify the class parameters are viable.
|
116
131
|
|
@@ -125,22 +140,7 @@ class _ChromaBaseLoader:
|
|
125
140
|
'must be True and a model instance must be provided.')
|
126
141
|
|
127
142
|
def _create_documents(self) -> bool:
|
128
|
-
"""
|
129
|
-
|
130
|
-
Returns:
|
131
|
-
bool: True of the pages are loaded as ``Document`` objects
|
132
|
-
successfully. Otherwise False.
|
133
|
-
|
134
|
-
"""
|
135
|
-
self._docs = [Document(page_content=page.content,
|
136
|
-
metadata={'source': self._p.doc.basename,
|
137
|
-
'pageno': page.pageno})
|
138
|
-
for page in self._p.doc.pages if page.hastext]
|
139
|
-
if not self._docs:
|
140
|
-
msg = f'{_PRE_WARN} Text could not be parsed from {self._p.doc.basename}.'
|
141
|
-
ui.print_warning(msg)
|
142
|
-
return False
|
143
|
-
return True
|
143
|
+
"""Stub method; overridden by the child class."""
|
144
144
|
|
145
145
|
def _get_keywords(self) -> str:
|
146
146
|
"""Query the document (using the LLM) to extract the keywords."""
|
@@ -161,24 +161,27 @@ class _ChromaBaseLoader:
|
|
161
161
|
return_source_documents=True,
|
162
162
|
verbose=True)
|
163
163
|
resp = qa.invoke(qry)
|
164
|
-
kwds =
|
164
|
+
kwds = utilities.parse_to_keywords(resp=resp['result'])
|
165
165
|
return kwds
|
166
166
|
|
167
167
|
def _load(self, path: str, **kwargs):
|
168
|
-
"""Load the
|
168
|
+
"""Load the provided file into the vector store.
|
169
169
|
|
170
170
|
Args:
|
171
171
|
path (str): Full path to the file to be loaded.
|
172
172
|
|
173
173
|
:Keyword Arguments:
|
174
|
-
Those passed from the
|
174
|
+
Those passed from the document-type-specific loader's
|
175
|
+
:func:`load` method.
|
175
176
|
|
176
177
|
"""
|
177
178
|
# pylint: disable=multiple-statements
|
178
179
|
self._fpath = path
|
179
180
|
self._fbase = os.path.basename(path)
|
180
|
-
|
181
|
-
|
181
|
+
if self._already_loaded():
|
182
|
+
return
|
183
|
+
self._set_parser()
|
184
|
+
s = self._set_text_splitter()
|
182
185
|
if s: s = self._parse_text(**kwargs)
|
183
186
|
if s: s = self._create_documents()
|
184
187
|
if s: s = self._split_texts()
|
@@ -198,6 +201,7 @@ class _ChromaBaseLoader:
|
|
198
201
|
exceptions being raised.
|
199
202
|
|
200
203
|
"""
|
204
|
+
# pylint: disable=line-too-long
|
201
205
|
try:
|
202
206
|
print('- Loading the document into the database ...')
|
203
207
|
nrecs_b = self._dbo.collection.count() # Count records before.
|
@@ -205,29 +209,14 @@ class _ChromaBaseLoader:
|
|
205
209
|
nrecs_a = self._dbo.collection.count() # Count records after.
|
206
210
|
return self._test_load(nrecs_b=nrecs_b, nrecs_a=nrecs_a)
|
207
211
|
except chromadberrors.DuplicateIDError:
|
208
|
-
print('-- Document already loaded
|
212
|
+
print(' -- Document *chunk* already loaded, duplication detected. File may be corrupt.')
|
209
213
|
return False # Prevent from loading keywords.
|
210
214
|
except Exception as err:
|
211
215
|
reporterror(err)
|
212
216
|
return False
|
213
217
|
|
214
218
|
def _parse_text(self, **kwargs) -> bool:
|
215
|
-
"""
|
216
|
-
|
217
|
-
:Keyword Arguments:
|
218
|
-
Those to be passed into the text extraction method.
|
219
|
-
|
220
|
-
Returns:
|
221
|
-
bool: True if the parser's 'text' object is populated,
|
222
|
-
otherwise False.
|
223
|
-
|
224
|
-
"""
|
225
|
-
print('- Extracting text ...')
|
226
|
-
self._p.extract_text(**kwargs)
|
227
|
-
if len(self._p.doc.pages) < 2:
|
228
|
-
ui.print_warning(f'No text extracted from {self._p.doc.basename}')
|
229
|
-
return False
|
230
|
-
return True
|
219
|
+
"""Stub method, overridden by the child class."""
|
231
220
|
|
232
221
|
@staticmethod
|
233
222
|
def _print_summary(success: bool):
|
@@ -266,30 +255,8 @@ class _ChromaBaseLoader:
|
|
266
255
|
return False
|
267
256
|
return True
|
268
257
|
|
269
|
-
def _set_parser(self)
|
270
|
-
"""
|
271
|
-
|
272
|
-
:Rationale:
|
273
|
-
The parser is set by the file extension. For example, a file
|
274
|
-
extension ``.pdf`` will set the
|
275
|
-
:class:`parsers.pdfparser.PDFParser` class.
|
276
|
-
|
277
|
-
Returns:
|
278
|
-
bool: True if a file extension appropriate parser was found.
|
279
|
-
Otherwise, False.
|
280
|
-
|
281
|
-
"""
|
282
|
-
# pylint: disable=invalid-name # OK as the variable (Parser) is a class.
|
283
|
-
# TODO: Updated this to use the (not-yet-available) ispdf utility
|
284
|
-
# function, rather than relying on the file extension.
|
285
|
-
ext = os.path.splitext(self._fpath)[1]
|
286
|
-
Parser = self._PARSERS.get(ext)
|
287
|
-
if not Parser:
|
288
|
-
msg = f'{_PRE_WARN} Document parser not set for {os.path.basename(self._fpath)}.'
|
289
|
-
ui.print_warning(msg)
|
290
|
-
return False
|
291
|
-
self._p = Parser(path=self._fpath)
|
292
|
-
return True
|
258
|
+
def _set_parser(self):
|
259
|
+
"""Stub method, overridden by the child class."""
|
293
260
|
|
294
261
|
# TODO: Add these to a config file.
|
295
262
|
def _set_text_splitter(self) -> bool:
|
@@ -307,15 +274,24 @@ class _ChromaBaseLoader:
|
|
307
274
|
def _split_texts(self) -> bool:
|
308
275
|
"""Split the document text using a recursive text splitter.
|
309
276
|
|
277
|
+
Note:
|
278
|
+
If the ``split_text`` parameter was passed as ``False`` on
|
279
|
+
instantiation, the texts will not be split. Rather, the
|
280
|
+
:attr:`_docs` list is simply *copied* to the :attr:`_docss`
|
281
|
+
attribute.
|
282
|
+
|
310
283
|
Returns:
|
311
|
-
bool: True if the text was split successfully,
|
312
|
-
False.
|
284
|
+
bool: True if the text was split (or copied) successfully,
|
285
|
+
otherwise False.
|
313
286
|
|
314
287
|
"""
|
315
|
-
|
288
|
+
if self._split_text:
|
289
|
+
self._docss = self._splitter.split_documents(self._docs)
|
290
|
+
else:
|
291
|
+
self._docss = self._docs[:]
|
316
292
|
if not self._docss:
|
317
|
-
msg = (f'{
|
318
|
-
f'{self.
|
293
|
+
msg = (f'{self._PFX_ERR} An error occurred while splitting the documents for '
|
294
|
+
f'{self._fbase}.')
|
319
295
|
ui.print_warning(msg)
|
320
296
|
return False
|
321
297
|
return True
|
@@ -358,5 +334,5 @@ class _ChromaBaseLoader:
|
|
358
334
|
|
359
335
|
"""
|
360
336
|
if nrecs_a == nrecs_b:
|
361
|
-
ui.print_warning(f'{
|
337
|
+
ui.print_warning(f'{self._PFX_WARN} No new documents added. Possibly already loaded?')
|
362
338
|
return nrecs_a == nrecs_b + len(self._docss)
|