docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
- docp/__init__.py +35 -6
- docp/dbs/__init__.py +0 -0
- docp/dbs/chroma.py +197 -0
- docp/libs/_version.py +1 -0
- docp/libs/changelog.py +7 -0
- docp/libs/utilities.py +107 -0
- docp/loaders/__init__.py +38 -0
- docp/loaders/_chromabaseloader.py +338 -0
- docp/loaders/_chromabaseloader.py.bak +378 -0
- docp/loaders/_chromabasepdfloader.py +121 -0
- docp/loaders/_chromabasepptxloader.py +123 -0
- docp/loaders/chroma.py.bak +196 -0
- docp/loaders/chromapdfloader.py +199 -0
- docp/loaders/chromapptxloader.py +192 -0
- docp/loaders/lutilities.py +52 -0
- docp/objects/__init__.py +0 -0
- docp/objects/_docbaseobject.py +65 -0
- docp/objects/_imgobject.py +0 -0
- docp/objects/_pageobject.py +127 -0
- docp/objects/_slideobject.py +110 -0
- docp/objects/_tableobject.py +0 -0
- docp/objects/_textobject.py +64 -0
- docp/objects/pdfobject.py +61 -0
- docp/objects/pptxobject.py +46 -0
- docp/parsers/__init__.py +0 -0
- docp/parsers/_pdfbaseparser.py +236 -0
- docp/parsers/_pdftableparser.py +272 -0
- docp/parsers/_pdftextparser.py +263 -0
- docp/parsers/_pptxbaseparser.py +93 -0
- docp/parsers/_pptxtextparser.py +115 -0
- docp/parsers/pdfparser.py +62 -0
- docp/parsers/pptxparser.py +51 -0
- docp/parsers/putilities.py +48 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
- docp-0.2.0.dist-info/METADATA +110 -0
- docp-0.2.0.dist-info/RECORD +49 -0
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
- docp/_version.py +0 -1
- docp-0.0.0.dev1.dist-info/METADATA +0 -55
- docp-0.0.0.dev1.dist-info/RECORD +0 -7
- {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
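
The hunks below reproduce the new loader modules in full. As a quick orientation, the 0.2.0 wheel adds file-type-specific loaders (ChromaPDFLoader and ChromaPPTXLoader) alongside the parser, object and database modules listed above. The following is a minimal usage sketch assembled from the docstrings shown in the hunks below; the import path and constructor arguments are taken from those docstrings and are not otherwise verified here:

    # Minimal sketch based on the docstrings shown in the hunks below.
    from docp.loaders import ChromaPDFLoader

    loader = ChromaPDFLoader(dbpath='/path/to/chroma', collection='spam')
    loader.load(path='/path/to/directory', ext='pdf')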
docp/loaders/chroma.py.bak ADDED
@@ -0,0 +1,196 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the entry point for loading a document
+            into a Chroma database.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+:Comments:  n/a
+
+:Examples:
+
+    Parse and load a *single* document into a Chroma database
+    collection::
+
+        >>> from docp import ChromaLoader
+
+        >>> l = ChromaLoader(path='/path/to/file.pdf',
+                             dbpath='/path/to/chroma',
+                             collection='spam')
+        >>> l.load()
+
+
+    Parse and load a *directory* of PDF documents into a Chroma
+    database collection::
+
+        >>> from docp import ChromaLoader
+
+        >>> l = ChromaLoader(path='/path/to/directory',
+                             dbpath='/path/to/chroma',
+                             collection='spam')
+        >>> l.load(ext='pdf')
+
+
+    For further example code use, please refer to the
+    :class:`ChromaLoader` class docstring.
+
+"""
+
+import os
+import re
+from glob import glob
+# locals
+try:
+    # from .loaders._chromabaseloader import _ChromaBaseLoader
+    from .loaders._chromabaseloader import _ChromaBasePDFLoader
+    from .loaders._chromabaseloader import _ChromaBasePPTXLoader
+except ImportError:
+    # from loaders._chromabaseloader import _ChromaBaseLoader
+    from loaders._chromabaseloader import _ChromaBasePDFLoader
+    from loaders._chromabaseloader import _ChromaBasePPTXLoader
+
+
+# TODO: The document-type-specific loader will be determined by this
+#       class.
+
+# !!!: This won't work. This class will have to create an instance of either.
+class ChromaLoader(_ChromaBasePDFLoader):
+    """Chroma database document loader.
+
+    Args:
+        path (str): Full path to the file (or *directory*) to be parsed
+            and loaded. Note: If this is a directory, a specific file
+            extension can be passed into the :meth:`load` method using
+            the ``ext`` argument.
+        dbpath (str): Full path to the Chroma database *directory*.
+        collection (str): Name of the Chroma database collection into
+            which the data is to be loaded.
+        load_keywords (bool, optional): Use the provided LLM
+            (via the ``llm`` parameter) to read the document and infer
+            keywords to be loaded into the ``<collection>-kwds``
+            database, for keyword-driven document filtering.
+            Note: This *requires* the ``llm`` parameter and is
+            recommended only for GPU-bound processing. Defaults to False.
+        llm (object, optional): An LLM *instance* which can be provided
+            directly into the
+            :func:`langchain.chains.RetrievalQA.from_chain_type` function
+            for keyword inference. This is *required* for keyword
+            loading. Defaults to None.
+        offline (bool, optional): Remain offline and use the locally
+            cached embedding function model. Defaults to False.
+
+    .. important::
+
+        The *deriving and loading of keywords* is only recommended for
+        **GPU-bound processing**, as the LLM is invoked to infer the
+        keywords for each given document.
+
+        If called on a 'standard' PC, this will take a *long* time to
+        complete, if it completes at all.
+
+    :Example:
+
+        Parse and load a *single* document into a Chroma database
+        collection::
+
+            >>> from docp import ChromaLoader
+
+            >>> l = ChromaLoader(path='/path/to/file.pdf',
+                                 dbpath='/path/to/chroma',
+                                 collection='spam')
+            >>> l.load()
+
+
+        Parse and load a *directory* of PDF documents into a Chroma
+        database collection::
+
+            >>> from docp import ChromaLoader
+
+            >>> l = ChromaLoader(path='/path/to/directory',
+                                 dbpath='/path/to/chroma',
+                                 collection='spam')
+            >>> l.load(ext='pdf')
+
+    """
+
+    def __init__(self,
+                 path: str,
+                 dbpath: str,
+                 collection: str,
+                 *,
+                 load_keywords: bool=False,
+                 llm: object=None,
+                 offline: bool=False):
+        """Chroma database loader class initialiser."""
+        super().__init__(dbpath=dbpath,
+                         collection=collection,
+                         load_keywords=load_keywords,
+                         llm=llm,
+                         offline=offline)
+        self._path = path
+
+    def load(self,
+             *,
+             ext: str='**',
+             recursive: bool=True,
+             remove_header: bool=True,
+             remove_footer: bool=True,
+             remove_newlines: bool=True,
+             ignore_tags: set=None,
+             convert_to_ascii: bool=True) -> None:
+        """Load a document (or documents) into a Chroma database.
+
+        Args:
+            ext (str): If the ``path`` argument refers to a *directory*,
+                a specific file extension can be specified here.
+                For example::
+
+                    ext = 'pdf'
+
+                If anything other than ``'**'`` is provided, all
+                alpha-characters are parsed from the string, and prefixed
+                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
+                characters ``'pdf'`` are parsed and prefixed with ``*.``
+                to create ``'*.pdf'``. However, if ``'things.foo'`` is
+                passed, the derived extension will be ``'*.thingsfoo'``.
+                Defaults to '**', for a recursive search.
+
+            recursive (bool, optional): If True, subdirectories are
+                searched. Defaults to True.
+            remove_header (bool, optional): Attempt to remove the header
+                from each page. Defaults to True.
+            remove_footer (bool, optional): Attempt to remove the footer
+                from each page. Defaults to True.
+            remove_newlines (bool, optional): Replace newline characters
+                with a space. Defaults to True, as this helps with
+                document chunk splitting.
+            ignore_tags (set, optional): If provided, these are the
+                PDF 'marked content' tags which will be ignored. Note
+                that the PDF document must contain tags, otherwise the
+                bounding box method is used and this argument is ignored.
+                Defaults to ``{'Artifact'}``, as these generally
+                relate to a header and/or footer. To include all tags,
+                (not skip any) pass this argument as ``'na'``.
+            convert_to_ascii (bool, optional): Convert all characters to
+                ASCII. Defaults to True.
+
+        """
+        if os.path.isdir(self._path):
+            if ext != '**':
+                ext = f'*.{re.findall("[a-zA-Z]+", ext)[0]}'
+            files = glob(os.path.join(self._path, ext), recursive=recursive)
+            count = len(files)
+            for idx, f in enumerate(files, 1):
+                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
+                self._load(path=f)
+        else:
+            print(f'Processing: {os.path.basename(self._path)} ...')
+            self._load(path=self._path,
+                       remove_header=remove_header,
+                       remove_footer=remove_footer,
+                       remove_newlines=remove_newlines,
+                       ignore_tags=ignore_tags,
+                       convert_to_ascii=convert_to_ascii)
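
The ext handling described in the load docstring above (alpha characters extracted from the argument and prefixed with '*.') can be exercised in isolation. The helper below is a sketch that mirrors the two relevant lines of ChromaLoader.load; it is not part of the package:

    import re

    def derive_pattern(ext: str) -> str:
        # Mirrors the ext handling in ChromaLoader.load above (sketch only).
        if ext == '**':
            return ext  # recursive, match-everything pattern
        # Keep the first run of alpha characters and prefix with '*.'.
        return f'*.{re.findall("[a-zA-Z]+", ext)[0]}'

    print(derive_pattern('.pdf'))   # '*.pdf'
    print(derive_pattern('pdf'))    # '*.pdf'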
docp/loaders/chromapdfloader.py ADDED
@@ -0,0 +1,199 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the entry point for loading PDF files
+            into a Chroma database.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+:Comments:  n/a
+
+:Examples:
+
+    Parse and load a *single* PDF file into a Chroma database
+    collection::
+
+        >>> from docp.loaders import ChromaPDFLoader
+
+        >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
+                                collection='spam')
+        >>> l.load(path='/path/to/directory/myfile.pdf')
+
+
+    Parse and load a *directory* of PDF files into a Chroma database
+    collection::
+
+        >>> from docp.loaders import ChromaPDFLoader
+
+        >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
+                                collection='spam')
+        >>> l.load(path='/path/to/directory', ext='pdf')
+
+
+    For further example code use, please refer to the
+    :class:`ChromaPDFLoader` class docstring.
+
+"""
+
+import os
+# locals
+try:
+    from .libs.utilities import utilities
+    from .loaders._chromabasepdfloader import _ChromaBasePDFLoader
+except ImportError:
+    from libs.utilities import utilities
+    from loaders._chromabasepdfloader import _ChromaBasePDFLoader
+
+
+class ChromaPDFLoader(_ChromaBasePDFLoader):
+    """Chroma database PDF-specific document loader.
+
+    Args:
+        dbpath (str | ChromaDB): Either the full path to the Chroma
+            database *directory*, or an instance of a
+            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
+            passed, the ``collection`` argument is ignored.
+        collection (str, optional): Name of the Chroma database
+            collection. Only required if the ``dbpath`` parameter is a
+            path. Defaults to None.
+        split_text (bool, optional): Split the document into chunks,
+            before loading it into the database. Defaults to True.
+        load_keywords (bool, optional): Use an LLM to derive keywords
+            from the document and load these keywords into the sister
+            keywords collection. Defaults to False.
+        llm (object, optional): If deriving keywords, this is the LLM
+            which will do the derivation. Defaults to None.
+        offline (bool, optional): Remain offline and use the locally
+            cached embedding function model. Defaults to False.
+
+    .. important::
+
+        The *deriving and loading of keywords* is only recommended for
+        **GPU-bound processing** as the LLM is invoked to infer the
+        keywords for each given document.
+
+        If called on a 'standard' PC, this will take a *long* time to
+        complete, if it completes at all.
+
+    :Examples:
+
+        Parse and load a *single* PDF file into a Chroma database
+        collection::
+
+            >>> from docp.loaders import ChromaPDFLoader
+
+            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
+                                    collection='spam')
+            >>> l.load(path='/path/to/directory/myfile.pdf')
+
+
+        Parse and load a *directory* of PDF files into a Chroma
+        database collection::
+
+            >>> from docp.loaders import ChromaPDFLoader
+
+            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
+                                    collection='spam')
+            >>> l.load(path='/path/to/directory', ext='pdf')
+
+    """
+
+    #
+    # No __init__ method here to ensure the ultimate base class'
+    # signature is used and to save passing loads of stuff around, if we
+    # don't have to.
+    #
+
+    def load(self,
+             path: str,
+             *,
+             ext: str='**',
+             recursive: bool=True,
+             remove_header: bool=True,
+             remove_footer: bool=True,
+             remove_newlines: bool=True,
+             ignore_tags: set=None,
+             convert_to_ascii: bool=True,
+             **unused) -> None:
+        """Load a PDF file (or files) into a Chroma database.
+
+        Args:
+            path (str): Full path to the file (or *directory*) to be
+                parsed and loaded. Note: If this is a directory, a
+                specific file extension can be passed into the
+                :meth:`load` method using the ``ext`` argument.
+            ext (str, optional): If the ``path`` argument refers to a
+                *directory*, a specific file extension can be specified
+                here. For example: ``ext = 'pdf'``.
+
+                If anything other than ``'**'`` is provided, all
+                alpha-characters are parsed from the string, and prefixed
+                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
+                characters ``'pdf'`` are parsed and prefixed with ``*.``
+                to create ``'*.pdf'``. However, if ``'things.foo'`` is
+                passed, the derived extension will be ``'*.thingsfoo'``.
+                Defaults to '**', for a recursive search.
+
+            recursive (bool, optional): If True, subdirectories are
+                searched. Defaults to True.
+            remove_header (bool, optional): Attempt to remove the header
+                from each page. Defaults to True.
+            remove_footer (bool, optional): Attempt to remove the footer
+                from each page. Defaults to True.
+            remove_newlines (bool, optional): Replace newline characters
+                with a space. Defaults to True, as this helps with
+                document chunk splitting.
+            ignore_tags (set, optional): If provided, these are the
+                PDF 'marked content' tags which will be ignored. Note
+                that the PDF document must contain tags, otherwise the
+                bounding box method is used and this argument is ignored.
+                Defaults to ``{'Artifact'}``, as these generally
+                relate to a header and/or footer. To include all tags,
+                (not skip any) pass this argument as ``'na'``.
+            convert_to_ascii (bool, optional): Convert all characters to
+                ASCII. Defaults to True.
+
+        :Keyword Args:
+            unused (dict): This enables keywords to be passed into a
+                loader-agnostic ``.load()`` function without raising a
+                'unexpected keyword argument` ``TypeError``.
+
+        """
+        # pylint: disable=unused-argument  # They are 'used' via locals().
+        # Prepare the arguments being sent to the doc parser.
+        kwargs = self._set_kwargs(locals_=locals())
+        # Load multi
+        if os.path.isdir(path):
+            files = utilities.collect_files(path=path, ext=ext, recursive=recursive)
+            count = len(files)
+            for idx, f in enumerate(files, 1):
+                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
+                self._load(path=f, **kwargs)
+        # Load single
+        else:
+            print(f'Processing: {os.path.basename(path)} ...')
+            self._load(path=path, **kwargs)
+
+    @staticmethod
+    def _set_kwargs(locals_: dict) -> dict:
+        r"""Prepare the arguments which are sent to the doc parser.
+
+        As :func:`locals()` is used to capture the :meth:`load` method's
+        arguments for passing into the doc parser, some argument must be
+        removed first.
+
+        Args:
+            locals\_ (dict): The return value from a :func:`locals` call.
+
+        Returns:
+            dict: A *copy* of the provided dictionary with specific
+            key/value pairs removed.
+
+        """
+        # ^^^ The backslash in locals\_ is required for documentation to render correctly.
+        kwargs = locals_.copy()
+        for k in ['self', 'path']:
+            kwargs.pop(k)
+        return kwargs
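
The _set_kwargs helper above depends on calling locals() at the top of load() so that every argument is captured by name, after which the entries the parser must not receive are popped. A stripped-down illustration of the idiom (a sketch, not the packaged code):

    class Demo:
        def load(self, path: str, *, ext: str = '**', recursive: bool = True) -> dict:
            # Capture every call argument by name, then drop the entries
            # the downstream parser should not receive.
            return self._set_kwargs(locals_=locals())

        @staticmethod
        def _set_kwargs(locals_: dict) -> dict:
            kwargs = locals_.copy()
            for k in ['self', 'path']:
                kwargs.pop(k)
            return kwargs

    print(Demo().load('/tmp/file.pdf', ext='pdf'))
    # {'ext': 'pdf', 'recursive': True}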
docp/loaders/chromapptxloader.py ADDED
@@ -0,0 +1,192 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides the entry point for loading PPTX files
+            into a Chroma database.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+:Comments:  n/a
+
+:Examples:
+
+    Parse and load a *single* PPTX file into a Chroma database
+    collection::
+
+        >>> from docp.loaders import ChromaPPTXLoader
+
+        >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
+                                 collection='spam',
+                                 split_text=False)
+        >>> l.load(path='/path/to/directory/myfile.pptx')
+
+
+    Parse and load a *directory* of PPTX files into a Chroma database
+    collection::
+
+        >>> from docp.loaders import ChromaPPTXLoader
+
+        >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
+                                 collection='spam',
+                                 split_text=False)
+        >>> l.load(path='/path/to/directory', ext='pptx')
+
+
+    For further example code use, please refer to the
+    :class:`ChromaPPTXLoader` class docstring.
+
+"""
+
+import os
+# locals
+try:
+    from .libs.utilities import utilities
+    from .loaders._chromabasepptxloader import _ChromaBasePPTXLoader
+except ImportError:
+    from libs.utilities import utilities
+    from loaders._chromabasepptxloader import _ChromaBasePPTXLoader
+
+
+class ChromaPPTXLoader(_ChromaBasePPTXLoader):
+    """Chroma database PPTX-specific document loader.
+
+    Args:
+        dbpath (str | ChromaDB): Either the full path to the Chroma
+            database *directory*, or an instance of a
+            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
+            passed, the ``collection`` argument is ignored.
+        collection (str, optional): Name of the Chroma database
+            collection. Only required if the ``db`` parameter is a path.
+            Defaults to None.
+        split_text (bool, optional): Split the document into chunks,
+            before loading it into the database. Defaults to True.
+        load_keywords (bool, optional): Derive keywords from the document
+            and load these into the sister keywords collection.
+            Defaults to False.
+        llm (object, optional): If deriving keywords, this is the LLM
+            which will do the derivation. Defaults to None.
+        offline (bool, optional): Remain offline and use the locally
+            cached embedding function model. Defaults to False.
+
+    .. important::
+
+        The *deriving and loading of keywords* is only recommended for
+        **GPU-bound processing**, as the LLM is invoked to infer the
+        keywords for each given document.
+
+        If called on a 'standard' PC, this will take a *long* time to
+        complete, if it completes at all.
+
+    .. tip::
+
+        It is recommended to pass ``split_text=False`` into the
+        :class:`ChromaPPTXLoader` constructor.
+
+        Often, PowerPoint presentations are structured such that related
+        text is found in the same 'shape' (textbox) on a slide.
+        Splitting the text in these shapes may have undesired results.
+
+    :Examples:
+
+        Parse and load a *single* PPTX file into a Chroma database
+        collection::
+
+            >>> from docp.loaders import ChromaPPTXLoader
+
+            >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
+                                     collection='spam',
+                                     split_text=False)  # <-- Note this
+            >>> l.load(path='/path/to/directory/myfile.pptx')
+
+
+        Parse and load a *directory* of PPTX files into a Chroma database
+        collection::
+
+            >>> from docp.loaders import ChromaPPTXLoader
+
+            >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
+                                     collection='spam',
+                                     split_text=False)  # <-- Note this
+            >>> l.load(path='/path/to/directory', ext='pptx')
+
+    """
+    def load(self,
+             path: str,
+             *,
+             ext: str='**',
+             recursive: bool=True,
+             remove_newlines: bool=True,
+             convert_to_ascii: bool=True,
+             **unused) -> None:
+        """Load a PDF file (or files) into a Chroma database.
+
+        Args:
+            path (str): Full path to the file (or *directory*) to be
+                parsed and loaded. Note: If this is a directory, a
+                specific file extension can be passed into the
+                :meth:`load` method using the ``ext`` argument.
+            ext (str, optional): If the ``path`` argument refers to a
+                *directory*, a specific file extension can be specified
+                here. For example: ``ext = 'pptx'``.
+
+                If anything other than ``'**'`` is provided, all
+                alpha-characters are parsed from the string, and prefixed
+                with ``*.``. Meaning, if ``'.pptx'`` is passed, the
+                characters ``'pptx'`` are parsed and prefixed with ``*.``
+                to create ``'*.pptx'``. However, if ``'things.foo'`` is
+                passed, the derived extension will be ``'*.thingsfoo'``.
+                Defaults to '**', for a recursive search.
+
+            recursive (bool, optional): If True, subdirectories are
+                searched. Defaults to True.
+            remove_newlines (bool, optional): Replace newline characters
+                with a space. Defaults to True, as this helps with
+                document chunk splitting.
+            convert_to_ascii (bool, optional): Convert all characters to
+                ASCII. Defaults to True.
+
+        :Keyword Args:
+            unused (dict): This enables keywords such as ``remove_header``
+                and ``remove_footer`` (for example) to be passed into a
+                loader-agnostic ``.load()`` function without raising a
+                'unexpected keyword argument` ``TypeError``.
+
+        """
+        # pylint: disable=unused-argument  # They are 'used' via locals().
+        # Prepare the arguments being sent to the doc parser.
+        kwargs = self._set_kwargs(locals_=locals())
+        # Load multi
+        if os.path.isdir(path):
+            files = utilities.collect_files(path=path, ext=ext, recursive=recursive)
+            count = len(files)
+            for idx, f in enumerate(files, 1):
+                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
+                self._load(path=f, **kwargs)
+        # Load single
+        else:
+            print(f'Processing: {os.path.basename(path)} ...')
+            self._load(path=path, **kwargs)
+
+    @staticmethod
+    def _set_kwargs(locals_: dict) -> dict:
+        r"""Prepare the arguments which are sent to the doc parser.
+
+        As :func:`locals()` is used to capture the :meth:`load` method's
+        arguments for passing into the doc parser, some argument must be
+        removed first.
+
+        Args:
+            locals\_ (dict): The return value from a :func:`locals` call.
+
+        Returns:
+            dict: A *copy* of the provided dictionary with specific
+            key/value pairs removed.
+
+        """
+        # ^^^ The backslash in locals\_ is required for documentation to render correctly.
+        kwargs = locals_.copy()
+        for k in ['self', 'path']:
+            kwargs.pop(k)
+        return kwargs
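
As the Keyword Args note above explains, the trailing **unused parameter lets PDF-only options such as remove_header and remove_footer pass through a loader-agnostic call without raising a TypeError. A hypothetical illustration (the load_any helper below is not part of the package):

    def load_any(loader, path: str, **kwargs) -> None:
        # One keyword set for either loader; options a given loader does
        # not understand are absorbed by its **unused parameter.
        loader.load(path=path, **kwargs)

    # Hypothetical call, assuming a constructed ChromaPPTXLoader instance:
    # load_any(pptx_loader, '/path/to/deck.pptx',
    #          remove_header=True, remove_footer=True)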
docp/loaders/lutilities.py ADDED
@@ -0,0 +1,52 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+:Purpose:   This module provides loader-specific utility functions for
+            the project.
+
+:Platform:  Linux/Windows | Python 3.10+
+:Developer: J Berendt
+:Email:     development@s3dev.uk
+
+:Comments:  This module is here (in the ``docp/loaders``) directory
+            rather than merged with the ``docp/parsers/putilities.py``
+            module as the loaders' dependencies are *heavy*. Keeping the
+            loader functionality separate helps to ease the dependency
+            requirements for parser-only projects.
+
+"""
+
+# locals
+try:
+    from .libs.utilities import utilities
+    from .loaders.chromapdfloader import ChromaPDFLoader
+    from .loaders.chromapptxloader import ChromaPPTXLoader
+except ImportError:
+    from libs.utilities import utilities
+    from loaders.chromapdfloader import ChromaPDFLoader
+    from loaders.chromapptxloader import ChromaPPTXLoader
+
+
+class LoaderUtilities:
+    """Loader-based (cross-project) utility functions."""
+
+    def get_loader(self, path: str) -> ChromaPDFLoader | ChromaPPTXLoader:
+        """Return the appropriate loader for the file type.
+
+        Args:
+            path (str): Full path to the file to be tested.
+
+        Returns:
+            ChromaPDFLoader | ChromaPPTXLoader: The appropriate loader
+            for the file, given the *file signature*; this test is not
+            file extension based.
+
+        """
+        if utilities.ispdf(path=path):
+            return ChromaPDFLoader
+        if utilities.iszip(path=path):
+            return ChromaPPTXLoader
+        raise NotImplementedError('A loader is not available for: os.path.basename(path)')
+
+
+lutilities = LoaderUtilities()
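
Note that get_loader returns the loader class itself rather than an instance, so the caller is expected to construct and drive it. A minimal sketch, assuming the constructor arguments shown in the loader docstrings above:

    # Sketch only; not part of the package.
    loader_cls = lutilities.get_loader(path='/path/to/file.pdf')   # e.g. ChromaPDFLoader
    loader = loader_cls(dbpath='/path/to/chroma', collection='spam')
    loader.load(path='/path/to/file.pdf')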
docp/objects/__init__.py
ADDED
File without changes