docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  2. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  3. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  4. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
  5. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  6. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  7. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  8. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  9. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  10. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  11. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  12. docp/__init__.py +35 -6
  13. docp/dbs/__init__.py +0 -0
  14. docp/dbs/chroma.py +197 -0
  15. docp/libs/_version.py +1 -0
  16. docp/libs/changelog.py +7 -0
  17. docp/libs/utilities.py +107 -0
  18. docp/loaders/__init__.py +38 -0
  19. docp/loaders/_chromabaseloader.py +338 -0
  20. docp/loaders/_chromabaseloader.py.bak +378 -0
  21. docp/loaders/_chromabasepdfloader.py +121 -0
  22. docp/loaders/_chromabasepptxloader.py +123 -0
  23. docp/loaders/chroma.py.bak +196 -0
  24. docp/loaders/chromapdfloader.py +199 -0
  25. docp/loaders/chromapptxloader.py +192 -0
  26. docp/loaders/lutilities.py +52 -0
  27. docp/objects/__init__.py +0 -0
  28. docp/objects/_docbaseobject.py +65 -0
  29. docp/objects/_imgobject.py +0 -0
  30. docp/objects/_pageobject.py +127 -0
  31. docp/objects/_slideobject.py +110 -0
  32. docp/objects/_tableobject.py +0 -0
  33. docp/objects/_textobject.py +64 -0
  34. docp/objects/pdfobject.py +61 -0
  35. docp/objects/pptxobject.py +46 -0
  36. docp/parsers/__init__.py +0 -0
  37. docp/parsers/_pdfbaseparser.py +236 -0
  38. docp/parsers/_pdftableparser.py +272 -0
  39. docp/parsers/_pdftextparser.py +263 -0
  40. docp/parsers/_pptxbaseparser.py +93 -0
  41. docp/parsers/_pptxtextparser.py +115 -0
  42. docp/parsers/pdfparser.py +62 -0
  43. docp/parsers/pptxparser.py +51 -0
  44. docp/parsers/putilities.py +48 -0
  45. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
  46. docp-0.2.0.dist-info/METADATA +110 -0
  47. docp-0.2.0.dist-info/RECORD +49 -0
  48. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
  49. docp/_version.py +0 -1
  50. docp-0.0.0.dev1.dist-info/METADATA +0 -55
  51. docp-0.0.0.dev1.dist-info/RECORD +0 -7
  52. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,196 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the entry point for loading a document
5
+ into a Chroma database.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Examples:
14
+
15
+ Parse and load a *single* document into a Chroma database
16
+ collection::
17
+
18
+ >>> from docp import ChromaLoader
19
+
20
+ >>> l = ChromaLoader(path='/path/to/file.pdf',
21
+ dbpath='/path/to/chroma',
22
+ collection='spam')
23
+ >>> l.load()
24
+
25
+
26
+ Parse and load a *directory* of PDF documents into a Chroma
27
+ database collection::
28
+
29
+ >>> from docp import ChromaLoader
30
+
31
+ >>> l = ChromaLoader(path='/path/to/directory',
32
+ dbpath='/path/to/chroma',
33
+ collection='spam')
34
+ >>> l.load(ext='pdf')
35
+
36
+
37
+ For further example code use, please refer to the
38
+ :class:`ChromaLoader` class docstring.
39
+
40
+ """
41
+
42
+ import os
43
+ import re
44
+ from glob import glob
45
+ # locals
46
+ try:
47
+ # from .loaders._chromabaseloader import _ChromaBaseLoader
48
+ from .loaders._chromabaseloader import _ChromaBasePDFLoader
49
+ from .loaders._chromabaseloader import _ChromaBasePPTXLoader
50
+ except ImportError:
51
+ # from loaders._chromabaseloader import _ChromaBaseLoader
52
+ from loaders._chromabaseloader import _ChromaBasePDFLoader
53
+ from loaders._chromabaseloader import _ChromaBasePPTXLoader
54
+
55
+
56
+ # TODO: The document-type-specific loader will be determined by this
57
+ # class.
58
+
59
+ # !!!: This won't work. This class will have to create an instance of either.
60
class ChromaLoader(_ChromaBasePDFLoader):
    """Chroma database document loader.

    Args:
        path (str): Full path to the file (or *directory*) to be parsed
            and loaded. Note: If this is a directory, a specific file
            extension can be passed into the :meth:`load` method using
            the ``ext`` argument.
        dbpath (str): Full path to the Chroma database *directory*.
        collection (str): Name of the Chroma database collection into
            which the data is to be loaded.
        load_keywords (bool, optional): Use the provided LLM
            (via the ``llm`` parameter) to read the document and infer
            keywords to be loaded into the ``<collection>-kwds``
            database, for keyword-driven document filtering.
            Note: This *requires* the ``llm`` parameter and is
            recommended only for GPU-bound processing. Defaults to False.
        llm (object, optional): An LLM *instance* which can be provided
            directly into the
            :func:`langchain.chains.RetrievalQA.from_chain_type` function
            for keyword inference. This is *required* for keyword
            loading. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing**, as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    :Example:

        Parse and load a *single* document into a Chroma database
        collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/file.pdf',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load()


        Parse and load a *directory* of PDF documents into a Chroma
        database collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/directory',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load(ext='pdf')

    """

    def __init__(self,
                 path: str,
                 dbpath: str,
                 collection: str,
                 *,
                 load_keywords: bool=False,
                 llm: object=None,
                 offline: bool=False):
        """Chroma database loader class initialiser."""
        super().__init__(dbpath=dbpath,
                         collection=collection,
                         load_keywords=load_keywords,
                         llm=llm,
                         offline=offline)
        # Path to the file or directory to be parsed; consumed by load().
        self._path = path

    def load(self,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_header: bool=True,
             remove_footer: bool=True,
             remove_newlines: bool=True,
             ignore_tags: set=None,
             convert_to_ascii: bool=True) -> None:
        """Load a document (or documents) into a Chroma database.

        Args:
            ext (str): If the ``path`` argument refers to a *directory*,
                a specific file extension can be specified here.
                For example::

                    ext = 'pdf'

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
                characters ``'pdf'`` are parsed and prefixed with ``*.``
                to create ``'*.pdf'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.
                Defaults to '**', for a recursive search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the
                PDF 'marked content' tags which will be ignored. Note
                that the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is ignored.
                Defaults to ``{'Artifact'}``, as these generally
                relate to a header and/or footer. To include all tags,
                (not skip any) pass this argument as ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        """
        if os.path.isdir(self._path):
            if ext != '**':
                # Derive a '*.<alpha>' glob pattern from whatever the
                # caller passed (e.g. '.pdf' -> '*.pdf').
                ext = f'*.{re.findall("[a-zA-Z]+", ext)[0]}'
            files = glob(os.path.join(self._path, ext), recursive=recursive)
            count = len(files)
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                # Bug fix: the parsing options are now forwarded for each
                # file. Previously only the single-file branch received
                # them, so directory loads silently ignored all options.
                self._load(path=f,
                           remove_header=remove_header,
                           remove_footer=remove_footer,
                           remove_newlines=remove_newlines,
                           ignore_tags=ignore_tags,
                           convert_to_ascii=convert_to_ascii)
        else:
            print(f'Processing: {os.path.basename(self._path)} ...')
            self._load(path=self._path,
                       remove_header=remove_header,
                       remove_footer=remove_footer,
                       remove_newlines=remove_newlines,
                       ignore_tags=ignore_tags,
                       convert_to_ascii=convert_to_ascii)
@@ -0,0 +1,199 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the entry point for loading PDF files
5
+ into a Chroma database.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Examples:
14
+
15
+ Parse and load a *single* PDF file into a Chroma database
16
+ collection::
17
+
18
+ >>> from docp.loaders import ChromaPDFLoader
19
+
20
+ >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
21
+ collection='spam')
22
+ >>> l.load(path='/path/to/directory/myfile.pdf')
23
+
24
+
25
+ Parse and load a *directory* of PDF files into a Chroma database
26
+ collection::
27
+
28
+ >>> from docp.loaders import ChromaPDFLoader
29
+
30
+ >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
31
+ collection='spam')
32
+ >>> l.load(path='/path/to/directory', ext='pdf')
33
+
34
+
35
+ For further example code use, please refer to the
36
+ :class:`ChromaPDFLoader` class docstring.
37
+
38
+ """
39
+
40
+ import os
41
+ # locals
42
+ try:
43
+ from .libs.utilities import utilities
44
+ from .loaders._chromabasepdfloader import _ChromaBasePDFLoader
45
+ except ImportError:
46
+ from libs.utilities import utilities
47
+ from loaders._chromabasepdfloader import _ChromaBasePDFLoader
48
+
49
+
50
class ChromaPDFLoader(_ChromaBasePDFLoader):
    """Chroma database PDF-specific document loader.

    Args:
        dbpath (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``dbpath`` parameter is a
            path. Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        load_keywords (bool, optional): Use an LLM to derive keywords
            from the document and load these keywords into the sister
            keywords collection. Defaults to False.
        llm (object, optional): If deriving keywords, this is the LLM
            which will do the derivation. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing** as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    :Examples:

        Parse and load a *single* PDF file into a Chroma database
        collection::

            >>> from docp.loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory/myfile.pdf')


        Parse and load a *directory* of PDF files into a Chroma
        database collection::

            >>> from docp.loaders import ChromaPDFLoader

            >>> l = ChromaPDFLoader(dbpath='/path/to/chroma',
                                    collection='spam')
            >>> l.load(path='/path/to/directory', ext='pdf')

    """

    # No __init__ is defined here deliberately: the ultimate base class'
    # signature is used directly, which saves passing arguments through
    # an extra layer.

    def load(self,
             path: str,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_header: bool=True,
             remove_footer: bool=True,
             remove_newlines: bool=True,
             ignore_tags: set=None,
             convert_to_ascii: bool=True,
             **unused) -> None:
        """Load a PDF file (or files) into a Chroma database.

        Args:
            path (str): Full path to the file (or *directory*) to be
                parsed and loaded. Note: If this is a directory, a
                specific file extension can be passed into the
                :meth:`load` method using the ``ext`` argument.
            ext (str, optional): If the ``path`` argument refers to a
                *directory*, a specific file extension can be specified
                here. For example: ``ext = 'pdf'``.

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pdf'`` is passed, the
                characters ``'pdf'`` are parsed and prefixed with ``*.``
                to create ``'*.pdf'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.
                Defaults to '**', for a recursive search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the
                PDF 'marked content' tags which will be ignored. Note
                that the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is ignored.
                Defaults to ``{'Artifact'}``, as these generally
                relate to a header and/or footer. To include all tags,
                (not skip any) pass this argument as ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        :Keyword Args:
            unused (dict): This enables keywords to be passed into a
                loader-agnostic ``.load()`` function without raising an
                'unexpected keyword argument' ``TypeError``.

        """
        # pylint: disable=unused-argument  # They are 'used' via locals().
        # This call MUST remain the first statement: locals() captures
        # the method's arguments for forwarding to the doc parser.
        parser_args = self._set_kwargs(locals_=locals())
        if not os.path.isdir(path):
            # Single file.
            print(f'Processing: {os.path.basename(path)} ...')
            self._load(path=path, **parser_args)
            return
        # Directory: collect matching files and load each in turn.
        targets = utilities.collect_files(path=path, ext=ext, recursive=recursive)
        total = len(targets)
        for num, fpath in enumerate(targets, 1):
            print(f'\nProcessing {num} of {total}: {os.path.basename(fpath)}')
            self._load(path=fpath, **parser_args)

    @staticmethod
    def _set_kwargs(locals_: dict) -> dict:
        r"""Prepare the arguments which are sent to the doc parser.

        As :func:`locals()` is used to capture the :meth:`load` method's
        arguments for passing into the doc parser, some argument must be
        removed first.

        Args:
            locals\_ (dict): The return value from a :func:`locals` call.

        Returns:
            dict: A *copy* of the provided dictionary with specific
            key/value pairs removed.

        """
        # ^^^ The backslash in locals\_ is required for documentation to render correctly.
        # Build a filtered copy; 'self' and 'path' are passed separately.
        return {key: value
                for key, value in locals_.items()
                if key not in ('self', 'path')}
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the entry point for loading PPTX files
5
+ into a Chroma database.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Examples:
14
+
15
+ Parse and load a *single* PPTX file into a Chroma database
16
+ collection::
17
+
18
+ >>> from docp.loaders import ChromaPPTXLoader
19
+
20
+ >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
21
+ collection='spam',
22
+ split_text=False)
23
+ >>> l.load(path='/path/to/directory/myfile.pptx')
24
+
25
+
26
+ Parse and load a *directory* of PPTX files into a Chroma database
27
+ collection::
28
+
29
+ >>> from docp.loaders import ChromaPPTXLoader
30
+
31
+ >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
32
+ collection='spam',
33
+ split_text=False)
34
+ >>> l.load(path='/path/to/directory', ext='pptx')
35
+
36
+
37
+ For further example code use, please refer to the
38
+ :class:`ChromaPPTXLoader` class docstring.
39
+
40
+ """
41
+
42
+ import os
43
+ # locals
44
+ try:
45
+ from .libs.utilities import utilities
46
+ from .loaders._chromabasepptxloader import _ChromaBasePPTXLoader
47
+ except ImportError:
48
+ from libs.utilities import utilities
49
+ from loaders._chromabasepptxloader import _ChromaBasePPTXLoader
50
+
51
+
52
class ChromaPPTXLoader(_ChromaBasePPTXLoader):
    """Chroma database PPTX-specific document loader.

    Args:
        dbpath (str | ChromaDB): Either the full path to the Chroma
            database *directory*, or an instance of a
            :class:`~docp.dbs.chroma.ChromaDB` class. If the instance is
            passed, the ``collection`` argument is ignored.
        collection (str, optional): Name of the Chroma database
            collection. Only required if the ``db`` parameter is a path.
            Defaults to None.
        split_text (bool, optional): Split the document into chunks,
            before loading it into the database. Defaults to True.
        load_keywords (bool, optional): Derive keywords from the document
            and load these into the sister keywords collection.
            Defaults to False.
        llm (object, optional): If deriving keywords, this is the LLM
            which will do the derivation. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing**, as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    .. tip::

        It is recommended to pass ``split_text=False`` into the
        :class:`ChromaPPTXLoader` constructor.

        Often, PowerPoint presentations are structured such that related
        text is found in the same 'shape' (textbox) on a slide.
        Splitting the text in these shapes may have undesired results.

    :Examples:

        Parse and load a *single* PPTX file into a Chroma database
        collection::

            >>> from docp.loaders import ChromaPPTXLoader

            >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
                                     collection='spam',
                                     split_text=False)  # <-- Note this
            >>> l.load(path='/path/to/directory/myfile.pptx')


        Parse and load a *directory* of PPTX files into a Chroma database
        collection::

            >>> from docp.loaders import ChromaPPTXLoader

            >>> l = ChromaPPTXLoader(dbpath='/path/to/chroma',
                                     collection='spam',
                                     split_text=False)  # <-- Note this
            >>> l.load(path='/path/to/directory', ext='pptx')

    """

    def load(self,
             path: str,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_newlines: bool=True,
             convert_to_ascii: bool=True,
             **unused) -> None:
        """Load a PPTX file (or files) into a Chroma database.

        Args:
            path (str): Full path to the file (or *directory*) to be
                parsed and loaded. Note: If this is a directory, a
                specific file extension can be passed into the
                :meth:`load` method using the ``ext`` argument.
            ext (str, optional): If the ``path`` argument refers to a
                *directory*, a specific file extension can be specified
                here. For example: ``ext = 'pptx'``.

                If anything other than ``'**'`` is provided, all
                alpha-characters are parsed from the string, and prefixed
                with ``*.``. Meaning, if ``'.pptx'`` is passed, the
                characters ``'pptx'`` are parsed and prefixed with ``*.``
                to create ``'*.pptx'``. However, if ``'things.foo'`` is
                passed, the derived extension will be ``'*.thingsfoo'``.
                Defaults to '**', for a recursive search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        :Keyword Args:
            unused (dict): This enables keywords such as ``remove_header``
                and ``remove_footer`` (for example) to be passed into a
                loader-agnostic ``.load()`` function without raising an
                'unexpected keyword argument' ``TypeError``.

        """
        # pylint: disable=unused-argument # They are 'used' via locals().
        # Prepare the arguments being sent to the doc parser. This call
        # must come first so locals() captures only the method arguments.
        kwargs = self._set_kwargs(locals_=locals())
        # Load multi
        if os.path.isdir(path):
            files = utilities.collect_files(path=path, ext=ext, recursive=recursive)
            count = len(files)
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                self._load(path=f, **kwargs)
        # Load single
        else:
            print(f'Processing: {os.path.basename(path)} ...')
            self._load(path=path, **kwargs)

    @staticmethod
    def _set_kwargs(locals_: dict) -> dict:
        r"""Prepare the arguments which are sent to the doc parser.

        As :func:`locals()` is used to capture the :meth:`load` method's
        arguments for passing into the doc parser, some argument must be
        removed first.

        Args:
            locals\_ (dict): The return value from a :func:`locals` call.

        Returns:
            dict: A *copy* of the provided dictionary with specific
            key/value pairs removed.

        """
        # ^^^ The backslash in locals\_ is required for documentation to render correctly.
        kwargs = locals_.copy()
        for k in ['self', 'path']:
            kwargs.pop(k)
        return kwargs
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides loader-specific utility functions for
5
+ the project.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: This module is here (in the ``docp/loaders``) directory
12
+ rather than merged with the ``docp/parsers/putilities.py``
13
+ module as the loaders' dependencies are *heavy*. Keeping the
14
+ loader functionality separate helps to ease the dependency
15
+ requirements for parser-only projects.
16
+
17
+ """
18
+
19
+ # locals
20
+ try:
21
+ from .libs.utilities import utilities
22
+ from .loaders.chromapdfloader import ChromaPDFLoader
23
+ from .loaders.chromapptxloader import ChromaPPTXLoader
24
+ except ImportError:
25
+ from libs.utilities import utilities
26
+ from loaders.chromapdfloader import ChromaPDFLoader
27
+ from loaders.chromapptxloader import ChromaPPTXLoader
28
+
29
+
30
class LoaderUtilities:
    """Loader-based (cross-project) utility functions."""

    def get_loader(self, path: str) -> ChromaPDFLoader | ChromaPPTXLoader:
        """Return the appropriate loader for the file type.

        Args:
            path (str): Full path to the file to be tested.

        Returns:
            ChromaPDFLoader | ChromaPPTXLoader: The appropriate loader
            for the file, given the *file signature*; this test is not
            file extension based.

        Raises:
            NotImplementedError: If the file's signature matches neither
                a PDF nor a zip archive.

        """
        if utilities.ispdf(path=path):
            return ChromaPDFLoader
        if utilities.iszip(path=path):
            # NOTE(review): a zip signature is not unique to PPTX (DOCX
            # and XLSX are also zip archives) -- presumably callers have
            # already filtered by type; confirm upstream.
            return ChromaPPTXLoader
        # Local import: this module does not import os at the top level,
        # and the loaders' top-level imports are kept deliberately lean.
        import os
        # Bug fix: the original passed a plain (non-f) string, so the
        # filename was never interpolated into the error message.
        raise NotImplementedError('A loader is not available for: '
                                  f'{os.path.basename(path)}')


lutilities = LoaderUtilities()
File without changes