docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  2. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  3. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  4. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
  5. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  6. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  7. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  8. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  9. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  10. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  11. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  12. docp/__init__.py +35 -6
  13. docp/dbs/__init__.py +0 -0
  14. docp/dbs/chroma.py +197 -0
  15. docp/libs/_version.py +1 -0
  16. docp/libs/changelog.py +7 -0
  17. docp/libs/utilities.py +107 -0
  18. docp/loaders/__init__.py +38 -0
  19. docp/loaders/_chromabaseloader.py +338 -0
  20. docp/loaders/_chromabaseloader.py.bak +378 -0
  21. docp/loaders/_chromabasepdfloader.py +121 -0
  22. docp/loaders/_chromabasepptxloader.py +123 -0
  23. docp/loaders/chroma.py.bak +196 -0
  24. docp/loaders/chromapdfloader.py +199 -0
  25. docp/loaders/chromapptxloader.py +192 -0
  26. docp/loaders/lutilities.py +52 -0
  27. docp/objects/__init__.py +0 -0
  28. docp/objects/_docbaseobject.py +65 -0
  29. docp/objects/_imgobject.py +0 -0
  30. docp/objects/_pageobject.py +127 -0
  31. docp/objects/_slideobject.py +110 -0
  32. docp/objects/_tableobject.py +0 -0
  33. docp/objects/_textobject.py +64 -0
  34. docp/objects/pdfobject.py +61 -0
  35. docp/objects/pptxobject.py +46 -0
  36. docp/parsers/__init__.py +0 -0
  37. docp/parsers/_pdfbaseparser.py +236 -0
  38. docp/parsers/_pdftableparser.py +272 -0
  39. docp/parsers/_pdftextparser.py +263 -0
  40. docp/parsers/_pptxbaseparser.py +93 -0
  41. docp/parsers/_pptxtextparser.py +115 -0
  42. docp/parsers/pdfparser.py +62 -0
  43. docp/parsers/pptxparser.py +51 -0
  44. docp/parsers/putilities.py +48 -0
  45. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
  46. docp-0.2.0.dist-info/METADATA +110 -0
  47. docp-0.2.0.dist-info/RECORD +49 -0
  48. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
  49. docp/_version.py +0 -1
  50. docp-0.0.0.dev1.dist-info/METADATA +0 -55
  51. docp-0.0.0.dev1.dist-info/RECORD +0 -7
  52. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,263 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing text from a PDF
5
+ document.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ .. attention::
12
+
13
+ This module is *not* designed to be interacted with
14
+ directly, only via the appropriate interface class(es).
15
+
16
+ Rather, please create an instance of a PDF document parsing
17
+ object using the following:
18
+
19
+ - :class:`~docp.parsers.pdfparser.PDFParser`
20
+
21
+ .. note::
22
+
23
+ **Multi-processing**
24
+
25
+ Text extraction through multi-processing has been tested and
26
+ is not feasible due to an error indicating
27
+ the ``pdfplumber.page.Page`` object can not be pickled. This
28
+ object was being passed into the extraction method as the
29
+ object contains the :func:`extract_text` function.
30
+
31
+ Additionally, multi-threading has also been tested and
32
+ it was determined to be too complex and inefficient. This was
33
+ tested using the ``concurrent.futures.ThreadPoolExecutor``
34
+ class and two documents, 14 and 92 pages; the timings are
35
+ shown below. The multi-threaded approach took longer to
36
+ process and added unnecessary complexity to the code base.
37
+ As a side-effect, the pages are processed and stored out of
38
+ order which would require a re-order, adding more complexity.
39
+
40
+ It has therefore been determined that this module will remain
41
+ single-threaded.
42
+
43
+ **Multi-Thread Timings**
44
+
45
+ - **Single-threaded:**
46
+
47
+ - 14 page document: ~2 seconds
48
+ - 92 page document: ~32 seconds
49
+
50
+ - **Multi-threaded:**
51
+
52
+ - 14 page document: ~2 seconds
53
+ - 92 page document: ~35 seconds
54
+
55
+ """
56
+ # pylint: disable=import-error
57
+
58
+ from __future__ import annotations
59
+ from unidecode import unidecode
60
+ # locals
61
+ from objects._pageobject import PageObject
62
+ from parsers._pdfbaseparser import _PDFBaseParser
63
+
64
+
65
+ class _PDFTextParser(_PDFBaseParser):
66
+ """Private PDF document text parser intermediate class.
67
+
68
+ Args:
69
+ path (str): Full path to the PDF document.
70
+
71
+ :Example:
72
+
73
+ Extract text from a PDF file::
74
+
75
+ >>> from docp import PDFParser
76
+
77
+ >>> pdf = PDFParser(path='/path/to/myfile.pdf')
78
+ >>> pdf.extract_text()
79
+
80
+ # Access the content of page 1.
81
+ >>> pg1 = pdf.doc.pages[1].content
82
+
83
+ """
84
+
85
+ def extract_text(self,
86
+ *,
87
+ remove_header: bool=False,
88
+ remove_footer: bool=False,
89
+ remove_newlines: bool=False,
90
+ ignore_tags: set=None,
91
+ convert_to_ascii: bool=True,
92
+ **kwargs):
93
+ """Extract text from the document.
94
+
95
+ If the PDF document contains 'marked content' tags, these tags
96
+ are used to extract the text as this is a more accurate approach
97
+ and respects the structure of the page(s). Otherwise, a bounding
98
+ box method is used to extract the text. If instructed, the
99
+ header and/or footer regions can be excluded.
100
+
101
+ .. tip:
102
+ If a tag-based extract is used, the header/footer should be
103
+ automatically excluded as these will often have an 'Artifact'
104
+ tag, which is excluded by default, by passing
105
+ ``ignore_tags=None``.
106
+
107
+ To *keep* the header and footer, pass ``ignore_tags='na'``.
108
+
109
+ A list of pages, with extracted content can be accessed using
110
+ the :attr:`self.doc.pages` attribute.
111
+
112
+ Args:
113
+ remove_header (bool, optional): If True, the header is
114
+ cropped (skipped) from text extraction. This only applies
115
+ to the bounding box extraction method. Defaults to False.
116
+ remove_footer (bool, optional): If True, the footer is
117
+ cropped (skipped) from text extraction. This only applies
118
+ to the bounding box extraction method. Defaults to False.
119
+ remove_newlines (bool, optional): If True, the newline
120
+ characters are replaced with a space. Defaults to False.
121
+ ignore_tags (set, optional): If provided, these are the
122
+ PDF 'marked content' tags which will be ignored. Note
123
+ that the PDF document must contain tags, otherwise the
124
+ bounding box method is used and this argument is ignored.
125
+ Defaults to ``{'Artifact'}``, as these generally
126
+ relate to a header and/or footer. To include all tags,
127
+ (not skip any) pass this argument as ``'na'``.
128
+ convert_to_ascii (bool, optional): When a non-ASCII character
129
+ is found, an attempt is made to convert it to an
130
+ associated ASCII character. If a character cannot be
131
+ converted, it is replaced with a ``'?'``.
132
+ Defaults to True.
133
+
134
+ :Keyword Args:
135
+ - None
136
+
137
+ Returns:
138
+ None.
139
+
140
+ """
141
+ # pylint: disable=unused-argument # **kwargs
142
+ # pylint: disable=unnecessary-dunder-call
143
+ if len(self.doc.pages) > 1:
144
+ # Reinitialise the doc object and reopen the document.
145
+ self.__init__(path=self._path)
146
+ # If tags are found, these are used for text extraction. If tags
147
+ # are not found, a bounding box is used to remove the header and
148
+ # footer, if instructed.
149
+ if self._uses_marked_content():
150
+ match ignore_tags:
151
+ case None: ignore_tags = {'Artifact'}
152
+ case 'na': ignore_tags = set()
153
+ # Involves more processing, but also more accurate.
154
+ self._extract_text_using_tags(ignore_tags=ignore_tags, remove_newlines=remove_newlines)
155
+ else:
156
+ bbox = self._get_crop_coordinates(skip_header=remove_header, skip_footer=remove_footer)
157
+ self._extract_text_using_bbox(bbox=bbox, remove_newlines=remove_newlines)
158
+ if convert_to_ascii:
159
+ for page in self.doc.pages:
160
+ page.content = unidecode(string=page.content,
161
+ errors='replace',
162
+ replace_str='?')
163
+
164
+ def _extract_text_using_bbox(self, **kwargs):
165
+ """Extract text using a bbox for finding the header and footer.
166
+
167
+ :Keyword Arguments:
168
+ Those passed by the caller, :meth:`~extract_text`.
169
+
170
+ """
171
+ for page in self.doc.parser.pages:
172
+ text = page.within_bbox(bbox=kwargs['bbox']).extract_text().strip()
173
+ if kwargs['remove_newlines']:
174
+ text = text.replace('\n', ' ')
175
+ self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
176
+
177
+ def _extract_text_using_tags(self, **kwargs):
178
+ """Extract text using tags.
179
+
180
+ The tags defined by the ``ignore_tags`` are skipped.
181
+
182
+ :Keyword Arguments:
183
+ Those passed by the caller, :meth:`~extract_text`.
184
+
185
+ """
186
+ # pylint: disable=protected-access
187
+ ignored = kwargs['ignore_tags']
188
+ self.doc._tags = True # Set the doc's 'parsed_using_tags' flag.
189
+ for page in self.doc.parser.pages:
190
+ text = ''.join(self._text_from_tags(page=page, ignored=ignored))
191
+ if kwargs['remove_newlines']:
192
+ text = text.replace('\n', ' ')
193
+ self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
194
+
195
+ @staticmethod
196
+ def _text_from_tags(page: pdfplumber.page.Page, ignored: set) -> str: # pylint: disable=undefined-variable # noqa
197
+ """Generate a page of text extracted from tags.
198
+
199
+ When extracting text from tags, newlines are not encoded and must
200
+ be derived. For each character on the page, the top and bottom
201
+ coordinates are compared to determine when a newline should be
202
+ inserted. If both the top and bottom of the current character
203
+ are greater than the previous character, a newline is inserted
204
+ into the text stream.
205
+
206
+ Args:
207
+ page (pdfplumber.page.Page): Page to be parsed.
208
+ ignored (set): A set containing the tags to be ignored.
209
+
210
+ Yields:
211
+ str: Each character on the page, providing its tag is not to
212
+ be ignored. Or, a newline character if the current
213
+ character's coordinates are greater than (lower on the page)
214
+ than the previous character.
215
+
216
+ """
217
+ if page.chars:
218
+ # Micro-optimisation: Push tag filtering down to the C-level.
219
+ chars = filter(lambda x: x['tag'] not in ignored, page.chars)
220
+ top, btm = 999, 999
221
+ for c in chars:
222
+ if top < c['top'] and btm < c['bottom']:
223
+ yield '\n'
224
+ yield c['text']
225
+ top, btm = c['top'], c['bottom']
226
+ yield ''
227
+
228
+ def _uses_marked_content(self) -> bool:
229
+ """Test whether the document can be parsed using tags.
230
+
231
+ Marked content allows us to parse the PDF using tags (rather than
232
+ OCR) which is more accurate not only in terms of character
233
+ recognition, but also with regard to the structure of the text on
234
+ a page.
235
+
236
+ :Logic:
237
+ If the document's catalog shows ``Marked: True``, then
238
+ ``True`` is returned immediately.
239
+
240
+ Otherwise, a second attempt is made which detects marked
241
+ content tags on the first three pages. If no tags are found,
242
+ a third attempt is made by searching the first 10 pages. If
243
+ tags are found during either of these attempts, ``True`` is
244
+ returned immediately.
245
+
246
+ Finally, if no marked content or tags were found, ``False``
247
+ is returned.
248
+
249
+ Returns:
250
+ bool: Returns True if the document can be parsed using marked
251
+ content tags, otherwise False.
252
+
253
+ """
254
+ # Use pdfminer.six to get the document's catalog.
255
+ if self.doc.parser.doc.catalog.get('MarkInfo', {}).get('Marked', False):
256
+ return True
257
+ # Check only first three pages for tags first, if found, get out.
258
+ # If not, retry with the first 10 pages.
259
+ for i in [3, 10]:
260
+ tags = set(c['tag'] for p in self.doc.parser.pages[:i] for c in p.chars)
261
+ if tags != {None}:
262
+ return True
263
+ return False
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides generalised base functionality for
5
+ parsing PPTX documents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ .. attention::
12
+
13
+ This module is *not* designed to be interacted with
14
+ directly, only via the appropriate interface class(es).
15
+
16
+ Rather, please create an instance of a PPTX document parsing
17
+ object using the following:
18
+
19
+ - :class:`~docp.parsers.pptxparser.PPTXParser`
20
+
21
+ """
22
+ # pylint: disable=protected-access
23
+
24
+ import os
25
+ from pptx import Presentation
26
+ # locals
27
+ try:
28
+ from libs.utilities import utilities
29
+ from objects.pptxobject import DocPPTX
30
+ except ImportError:
31
+ from .libs.utilities import utilities
32
+ from .objects.pptxobject import DocPPTX
33
+
34
+
35
class _PPTXBaseParser:
    """Base class containing generalised PPTX parsing functionality."""

    def __init__(self, path: str):
        """Private base parser class initialiser.

        Creates the document object, resolves the file path attributes
        and opens the document.

        Args:
            path (str): Full path to the document to be parsed.

        Raises:
            TypeError: Raised by :meth:`_open` if the file is not a
                valid PPTX file.

        """
        self._path = path
        self._doc = DocPPTX()
        self._set_paths()
        self._open()

    @property
    def doc(self) -> DocPPTX:
        """Accessor to the document object."""
        return self._doc

    def _open(self) -> None:
        """Open the PPTX document for reading.

        Before opening the file, a test is performed to ensure the PPTX
        is valid. The file must:

            - exist
            - have a .pptx file extension
            - be a ZIP archive, per the file signature

        The checks are evaluated in this order and stop at the first
        failure, so the file signature is only inspected for a path
        which exists.

        :Other Operations:

            - Store the ``pptx.Presentation`` parser object returned
              from the :func:`pptx.Presentation` instance creation into
              the :attr:`self._doc._parser` attribute.
            - Store the number of pages into the
              :attr:`self._doc._npages` attribute.
            - Store the document's meta data into the
              :attr:`self._doc._meta` attribute.

        Raises:
            TypeError: Raised if the file type criteria above are not
                met.

        """
        fpath = self._doc._fpath
        # Use short-circuiting ``and`` rather than ``all((...))`` over a
        # tuple: building the tuple evaluates every check up front, which
        # would run the signature test against a missing file.
        valid = (os.path.exists(fpath)
                 and os.path.splitext(fpath)[1].lower() == '.pptx'
                 and utilities.iszip(fpath))
        if not valid:
            msg = f'{self._doc._fname} is not a valid PPTX file.'
            raise TypeError(msg)
        self._doc._parser = Presentation(fpath)
        self._doc._npages = len(self._doc._parser.slides)
        self._doc._meta = self._doc._parser.core_properties

    def _set_paths(self) -> None:
        """Set the document's file path attributes.

        Stores the resolved (real) path in :attr:`self._doc._fpath` and
        the base file name in :attr:`self._doc._fname`.

        """
        self._doc._fpath = os.path.realpath(self._path)
        self._doc._fname = os.path.basename(self._path)
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing text from a PPTX
5
+ document.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ .. attention::
12
+
13
+ This module is *not* designed to be interacted with
14
+ directly, only via the appropriate interface class(es).
15
+
16
+ Rather, please create an instance of a PPTX document parsing
17
+ object using the following:
18
+
19
+ - :class:`~docp.parsers.pptxparser.PPTXParser`
20
+
21
+ """
22
+
23
+ from unidecode import unidecode
24
+ # locals
25
+ try:
26
+ from .objects._slideobject import SlideObject
27
+ from .objects._textobject import TextObject
28
+ from .parsers._pptxbaseparser import _PPTXBaseParser
29
+ except ImportError:
30
+ from objects._slideobject import SlideObject
31
+ from objects._textobject import TextObject
32
+ from parsers._pptxbaseparser import _PPTXBaseParser
33
+
34
+
35
class _PPTXTextParser(_PPTXBaseParser):
    """Private intermediate class which extracts text from a PPTX file.

    Args:
        path (str): Full path to the PPTX document.

    :Example:

        Extract text from a PPTX file::

            >>> from docp import PPTXParser

            >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
            >>> pptx.extract_text()

            # Access the text on slide 1.
            >>> pg1 = pptx.doc.slides[1].content

    """

    def extract_text(self,
                     *,
                     remove_newlines: bool=False,
                     convert_to_ascii: bool=True,
                     **kwargs) -> None:
        """Extract the text from every slide of the document.

        The resulting slide objects are available through the
        :attr:`self.doc.slides` attribute.

        Args:
            remove_newlines (bool, optional): Replace each newline
                character with a space. Defaults to False.
            convert_to_ascii (bool, optional): Attempt to convert each
                non-ASCII character to an associated ASCII character;
                a character which cannot be converted is replaced with
                a ``'?'``. Defaults to True.

        :Keyword Args:
            - None

        Returns:
            None.

        """
        # pylint: disable=unused-argument  # **kwargs
        # pylint: disable=unnecessary-dunder-call
        previously_extracted = len(self.doc.slides) > 1
        if previously_extracted:
            # Start over with a new doc object and a reopened document.
            self.__init__(path=self._path)
        self._extract_text(remove_newlines=remove_newlines,
                           convert_to_ascii=convert_to_ascii)

    def _extract_text(self, remove_newlines: bool, convert_to_ascii: bool) -> None:
        """Collect the text of all shapes on all slides.

        Args:
            remove_newlines (bool): Replace the newline characters with
                a space.
            convert_to_ascii (bool): Attempt to convert any non-ASCII
                characters to their ASCII equivalent.

        For each shape carrying non-empty text, a ``TextObject`` is
        appended to the slide object's ``texts`` attribute. One slide
        object per slide is appended to :attr:`self.doc.slides`.

        """
        for number, slide in enumerate(self.doc.parser.slides, start=1):
            slide_obj = SlideObject(pageno=number, parser=slide)
            for shape in slide.shapes:
                # Skip shapes which have no text attribute, or no text.
                if not hasattr(shape, 'text') or not shape.text:
                    continue
                content = shape.text
                if remove_newlines:
                    content = content.replace('\n', ' ')
                if convert_to_ascii:
                    content = unidecode(string=content, errors='replace', replace_str='?')
                slide_obj.texts.append(TextObject(content=content))
            self.doc.slides.append(slide_obj)
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module serves as the public interface for interacting
5
+ with PDF files and parsing their contents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Example: For example code usage, please refer to the
14
+ :class:`PDFParser` class docstring.
15
+
16
+ """
17
+ # pylint: disable=import-error
18
+ # pylint: disable=wrong-import-position
19
+
20
+ # Set sys.path for relative imports.
21
+ import os
22
+ import sys
23
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
24
+ # locals
25
+ from parsers._pdftableparser import _PDFTableParser
26
+ from parsers._pdftextparser import _PDFTextParser
27
+
28
+
29
class PDFParser(_PDFTableParser, _PDFTextParser):
    """Public parser for PDF documents.

    Combines the table and text parsing functionality inherited from
    the private intermediate parser classes.

    Args:
        path (str): Full path to the PDF document to be parsed.

    :Example:

        Extract text from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser(path='/path/to/myfile.pdf')
            >>> pdf.extract_text()

            # Access the content of page 1.
            >>> pg1 = pdf.doc.pages[1].content


        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser('/path/to/myfile.pdf')
            >>> pdf.extract_tables()

            # Access the first table on page 1.
            >>> tbl1 = pdf.doc.pages[1].tables[1]

    """

    def __init__(self, path: str):
        """Initialise the PDF parser by delegating to the parent classes.

        Args:
            path (str): Full path to the PDF document to be parsed.

        """
        super().__init__(path=path)
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module serves as the public interface for interacting
5
+ with PPTX files and parsing their contents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Example: For example code usage, please refer to the
14
+ :class:`PPTXParser` class docstring.
15
+
16
+ """
17
+
18
+ # Set sys.path for relative imports.
19
+ import os
20
+ import sys
21
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
22
+ # locals
23
+ try:
24
+ from .parsers._pptxtextparser import _PPTXTextParser
25
+ except ImportError:
26
+ from parsers._pptxtextparser import _PPTXTextParser
27
+
28
+
29
class PPTXParser(_PPTXTextParser):
    """Public parser for PPTX documents.

    Exposes the text parsing functionality inherited from the private
    intermediate parser class.

    Args:
        path (str): Full path to the PPTX document to be parsed.

    :Example:

        Extract text from a PPTX file::

            >>> from docp import PPTXParser

            >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
            >>> pptx.extract_text()

            # Access the text on slide 1.
            >>> pg1 = pptx.doc.slides[1].content

    """

    def __init__(self, path: str):
        """Initialise the PPTX parser by delegating to the parent class.

        Args:
            path (str): Full path to the PPTX document to be parsed.

        """
        super().__init__(path=path)
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides parser-specific utility functions for
5
+ the project.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+ # locals
16
+ try:
17
+ from .libs.utilities import utilities
18
+ from .parsers.pdfparser import PDFParser
19
+ from .parsers.pptxparser import PPTXParser
20
+ except ImportError:
21
+ from libs.utilities import utilities
22
+ from parsers.pdfparser import PDFParser
23
+ from parsers.pptxparser import PPTXParser
24
+
25
+
26
class ParserUtilities:
    """Parser-based (cross-project) utility functions."""

    def get_parser(self, path: str) -> type[PDFParser] | type[PPTXParser]:
        """Return the appropriate parser class for the file type.

        Note that the parser *class* is returned, not an instance; the
        caller is responsible for instantiating it with the file path.

        Args:
            path (str): Full path to the file to be tested.

        Raises:
            NotImplementedError: Raised if the file is recognised as
                neither a PDF nor a ZIP archive.

        Returns:
            type[PDFParser] | type[PPTXParser]: The appropriate parser
            class for the file, given the *file signature*; this test is
            not file extension based.

        """
        # pylint: disable=import-outside-toplevel
        if utilities.ispdf(path=path):
            return PDFParser
        # Any file identified as a ZIP archive is handed to the PPTX parser.
        if utilities.iszip(path=path):
            return PPTXParser
        # Imported locally as this module does not import ``os`` at the
        # top level; only needed to build the error message.
        import os
        raise NotImplementedError(
            f'A parser is not available for: {os.path.basename(path)}')


putilities = ParserUtilities()