docp 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  2. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  3. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  4. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
  5. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  6. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  7. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  8. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  9. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  10. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  11. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  12. docp/__init__.py +19 -10
  13. docp/dbs/chroma.py +19 -6
  14. docp/libs/_version.py +1 -0
  15. docp/libs/changelog.py +7 -0
  16. docp/libs/utilities.py +107 -0
  17. docp/loaders/__init__.py +38 -0
  18. docp/loaders/_chromabaseloader.py +83 -107
  19. docp/loaders/_chromabaseloader.py.bak +378 -0
  20. docp/loaders/_chromabasepdfloader.py +121 -0
  21. docp/loaders/_chromabasepptxloader.py +123 -0
  22. docp/loaders/{chroma.py → chroma.py.bak} +38 -8
  23. docp/loaders/chromapdfloader.py +199 -0
  24. docp/loaders/chromapptxloader.py +192 -0
  25. docp/loaders/lutilities.py +52 -0
  26. docp/objects/_docbaseobject.py +7 -18
  27. docp/objects/_imgobject.py +0 -0
  28. docp/objects/_pageobject.py +3 -2
  29. docp/objects/_slideobject.py +110 -0
  30. docp/objects/_textobject.py +64 -0
  31. docp/objects/pdfobject.py +24 -2
  32. docp/objects/pptxobject.py +46 -0
  33. docp/parsers/_pdfbaseparser.py +36 -10
  34. docp/parsers/_pdftableparser.py +6 -7
  35. docp/parsers/_pdftextparser.py +23 -13
  36. docp/parsers/_pptxbaseparser.py +93 -0
  37. docp/parsers/_pptxtextparser.py +115 -0
  38. docp/parsers/pptxparser.py +51 -0
  39. docp/parsers/putilities.py +48 -0
  40. docp-0.2.0.dist-info/METADATA +110 -0
  41. docp-0.2.0.dist-info/RECORD +49 -0
  42. {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
  43. docp/_version.py +0 -1
  44. docp-0.1.0b1.dist-info/METADATA +0 -55
  45. docp-0.1.0b1.dist-info/RECORD +0 -23
  46. {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/LICENSE +0 -0
  47. {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,64 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the implementation for the
5
+ ``TextObject`` object.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+
16
+ class TextObject:
17
+ """This class provides the implementation for the ``TextObject``.
18
+
19
+ For each page (or slide) in a document, an instance of this class is
20
+ created, populated and appended into the page's ``texts`` list
21
+ attribute.
22
+
23
+ Args:
24
+ content (str): Page content as a single string.
25
+
26
+ Note:
27
+ No string cleaning is performed by this class. The string
28
+ contained in the :attr:`content` attribute is stored exactly as
29
+ extracted from the page or slide's text object.
30
+
31
+ """
32
+
33
+ __slots__ = ('_content', '_hastext')
34
+
35
+ def __init__(self, content: str):
36
+ """Text object class initialiser."""
37
+ self._content = content
38
+ self._hastext = bool(content)
39
+
40
+ def __str__(self) -> str:
41
+ """When printing this object, display the text contents."""
42
+ return self._content
43
+
44
+ @property
45
+ def content(self) -> str:
46
+ """Accessor to the textual content."""
47
+ return self._content
48
+
49
+ @content.setter
50
+ def content(self, value: str) -> None:
51
+ """Setter for the ``content`` attribute.
52
+
53
+ If the ``value`` argument is populated, the content is set and
54
+ the ``hastext`` attribute is set to ``True``.
55
+
56
+ """
57
+ if value:
58
+ self._content = value
59
+ self._hastext = True
60
+
61
+ @property
62
+ def hastext(self) -> bool:
63
+ """Flag indicating if the ``content`` attribute is populated."""
64
+ return self._hastext
docp/objects/pdfobject.py CHANGED
@@ -12,9 +12,13 @@
12
12
  :Comments: n/a
13
13
 
14
14
  """
15
- # pylint: disable=import-error
16
15
 
17
- from objects._docbaseobject import _DocBase
16
+ try:
17
+ from .objects._docbaseobject import _DocBase
18
+ from .objects._pageobject import PageObject
19
+ except ImportError:
20
+ from objects._docbaseobject import _DocBase
21
+ from objects._pageobject import PageObject
18
22
 
19
23
 
20
24
  class DocPDF(_DocBase):
@@ -24,6 +28,24 @@ class DocPDF(_DocBase):
24
28
  """PDF document object class initialiser."""
25
29
  super().__init__()
26
30
  self._tags = False
31
+ # List of PageObjects, offset by 1 to align the index with page numbers.
32
+ self._pages = [PageObject(pageno=0)]
33
+
34
+ @property
35
+ def pages(self) -> list[PageObject]:
36
+ """A list containing an object for each page in the document.
37
+
38
+ .. tip::
39
+
40
+ The page number index aligns to the page number in the PDF
41
+ file.
42
+
43
+ For example, to access the ``PageObject`` for page 42, use::
44
+
45
+ pages[42]
46
+
47
+ """
48
+ return self._pages
27
49
 
28
50
  @property
29
51
  def parsed_using_tags(self) -> bool:
@@ -0,0 +1,46 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the 'PPTX Document' object structure
5
+ into which MS PowerPoint documents are parsed for
6
+ transport and onward use.
7
+
8
+ :Platform: Linux/Windows | Python 3.10+
9
+ :Developer: J Berendt
10
+ :Email: development@s3dev.uk
11
+
12
+ :Comments: n/a
13
+
14
+ """
15
+
16
+ try:
17
+ from .objects._docbaseobject import _DocBase
18
+ from .objects._slideobject import SlideObject
19
+ except ImportError:
20
+ from objects._docbaseobject import _DocBase
21
+ from objects._slideobject import SlideObject
22
+
23
+
24
+ class DocPPTX(_DocBase):
25
+ """Container class for storing data parsed from a PPTX file."""
26
+
27
+ def __init__(self):
28
+ """PPTX document object class initialiser."""
29
+ super().__init__()
30
+ self._slides = [SlideObject(pageno=0)]
31
+
32
+ @property
33
+ def slides(self) -> list[SlideObject]:
34
+ """A list containing an object for each slide in the document.
35
+
36
+ .. tip::
37
+
38
+ The slide number index aligns to the slide number in the
39
+ PPTX file.
40
+
41
+ For example, to access the ``SlideObject`` for slide 42, use::
42
+
43
+ slides[42]
44
+
45
+ """
46
+ return self._slides
@@ -8,13 +8,17 @@
8
8
  :Developer: J Berendt
9
9
  :Email: development@s3dev.uk
10
10
 
11
- Note: This module is *not* designed to be interacted with
11
+ :Comments: n/a
12
+
13
+ .. attention::
14
+
15
+ This module is *not* designed to be interacted with
12
16
  directly, only via the appropriate interface class(es).
13
17
 
14
18
  Rather, please create an instance of a PDF document parsing
15
- object using the following:
19
+ object using the following class:
16
20
 
17
- - :class:`pdfparser.PDFParser`
21
+ - :class:`~docp.parsers.pdfparser.PDFParser`
18
22
 
19
23
  """
20
24
  # pylint: disable=import-error
@@ -26,7 +30,12 @@ import pdfplumber
26
30
  from collections import Counter
27
31
  from unidecode import unidecode
28
32
  # locals
29
- from objects.pdfobject import DocPDF
33
+ try:
34
+ from .libs.utilities import utilities
35
+ from .objects.pdfobject import DocPDF
36
+ except ImportError:
37
+ from libs.utilities import utilities
38
+ from objects.pdfobject import DocPDF
30
39
 
31
40
 
32
41
  class _PDFBaseParser:
@@ -94,7 +103,7 @@ class _PDFBaseParser:
94
103
  case 1: num = 1
95
104
  case _ if npages in range(2, 11): num = 2
96
105
  case _: num = 5
97
- pg = self._doc.parser.pages[num] # The pages list has a has a page offset at [0].
106
+ pg = self._doc.parser.pages[num - 1] # The parser does not have a page offset at [0].
98
107
  # Default coordinates to the whole page.
99
108
  coords = {'x0': 0, 'top': 0, 'x1': pg.width, 'bottom': pg.height}
100
109
  # If the header and/or footer is to be skipped, find and iterate
@@ -117,6 +126,13 @@ class _PDFBaseParser:
117
126
  def _open(self) -> None:
118
127
  """Open the PDF document for reading.
119
128
 
129
+ Before opening the file, a test is performed to ensure the PDF
130
+ is valid. The file must:
131
+
132
+ - exist
133
+ - be a valid PDF file, per the file signature
134
+ - have a .pdf file extension
135
+
120
136
  :Other Operations:
121
137
 
122
138
  - Store the ``pdfplumber`` parser object returned from the
@@ -127,10 +143,20 @@ class _PDFBaseParser:
127
143
  - Store the document's meta data into the
128
144
  :attr:`self._doc._meta` attribute.
129
145
 
146
+ Raises:
147
+ TypeError: Raised if the file type criteria above are not
148
+ met.
149
+
130
150
  """
131
- self._doc._parser = pdfplumber.open(self._doc._fpath)
132
- self._doc._npages = len(self._doc._parser.pages)
133
- self._doc._meta = self._doc._parser.metadata
151
+ if all((os.path.exists(self._doc._fpath),
152
+ utilities.ispdf(self._doc._fpath),
153
+ os.path.splitext(self._doc._fpath)[1].lower() == '.pdf')):
154
+ self._doc._parser = pdfplumber.open(self._doc._fpath)
155
+ self._doc._npages = len(self._doc._parser.pages)
156
+ self._doc._meta = self._doc._parser.metadata
157
+ else:
158
+ msg = f'{self._doc._fname} is not a valid PDF file.'
159
+ raise TypeError(msg)
134
160
 
135
161
  @staticmethod
136
162
  def _prepare_row(row: list) -> str:
@@ -196,13 +222,13 @@ class _PDFBaseParser:
196
222
  # Only scan if document has more than three pages.
197
223
  if self._doc.npages < 4:
198
224
  return []
199
- if self._doc.common is None:
225
+ if self._doc._common is None:
200
226
  # Create a line generator for all pages.
201
227
  lines = (l for p in self._doc.parser.pages for l in p.extract_text().split('\n'))
202
228
  # Return the lines whose occurrence rate is 90% of document pages.
203
229
  self._doc._common = [i[0] for i in Counter(lines).most_common()
204
230
  if i[1] > self._doc.npages * 0.9]
205
- return self._doc.common
231
+ return self._doc._common
206
232
 
207
233
  def _set_paths(self) -> None:
208
234
  """Set the document's file path attributes."""
@@ -8,14 +8,15 @@
8
8
  :Developer: J Berendt
9
9
  :Email: jeremy.berendt@rolls-royce.com
10
10
 
11
- Note: This module is *not* designed to be interacted with
11
+ .. attention::
12
+
13
+ This module is *not* designed to be interacted with
12
14
  directly, only via the appropriate interface class(es).
13
15
 
14
16
  Rather, please create an instance of a PDF document parsing
15
17
  object using the following:
16
18
 
17
- - :class:`pdfparser.PDFParser`
18
-
19
+ - :class:`~docp.parsers.pdfparser.PDFParser`
19
20
 
20
21
  """
21
22
  # pylint: disable=import-error
@@ -35,7 +36,6 @@ _SETTINGS = {'vertical_strategy': 'lines',
35
36
  'snap_x_tolerance': 12}
36
37
 
37
38
 
38
- # TODO: Revise the docstring.
39
39
  class _PDFTableParser(_PDFBaseParser):
40
40
  """Private PDF document table parser intermediate class.
41
41
 
@@ -46,10 +46,9 @@ class _PDFTableParser(_PDFBaseParser):
46
46
 
47
47
  Extract tables from a PDF file::
48
48
 
49
- >>> from docutils.parsers.pdf import PDFParser
49
+ >>> from docp import PDFParser
50
50
 
51
- >>> path = '/path/to/myfile.pdf'
52
- >>> pdf = PDFParser(path)
51
+ >>> pdf = PDFParser(path='/path/to/myfile.pdf')
53
52
  >>> pdf.extract_tables()
54
53
 
55
54
  >>> tables = pdf.doc.tables
@@ -8,17 +8,22 @@
8
8
  :Developer: J Berendt
9
9
  :Email: development@s3dev.uk
10
10
 
11
- Note: This module is *not* designed to be interacted with
11
+ .. attention::
12
+
13
+ This module is *not* designed to be interacted with
12
14
  directly, only via the appropriate interface class(es).
13
15
 
14
16
  Rather, please create an instance of a PDF document parsing
15
17
  object using the following:
16
18
 
17
- - :class:`pdfparser.PDFParser`
19
+ - :class:`~docp.parsers.pdfparser.PDFParser`
20
+
21
+ .. note::
22
+
23
+ **Multi-processing**
18
24
 
19
- Note: **Multi-processing:**
20
25
  Text extraction through multi-processing has been tested and
21
- is not feesible due to an error indicating
26
+ is not feasible due to an error indicating
22
27
  the ``pdfplumber.page.Page`` object can not be pickled. This
23
28
  object was being passed into the extraction method as the
24
29
  object contains the :func:`extract_text` function.
@@ -35,17 +40,17 @@ Note: **Multi-processing:**
35
40
  It has therefore been determined that this module will remain
36
41
  single-threaded.
37
42
 
38
- **Multi-Thread Timings**
43
+ **Multi-Thread Timings**
39
44
 
40
- **Single-threaded:**
45
+ - **Single-threaded:**
41
46
 
42
- - 14 page document: ~2 seconds
43
- - 92 page document: ~32 seconds
47
+ - 14 page document: ~2 seconds
48
+ - 92 page document: ~32 seconds
44
49
 
45
- **Multi-threaded:**
50
+ - **Multi-threaded:**
46
51
 
47
- - 14 page document: ~2 seconds
48
- - 92 page document: ~35 seconds
52
+ - 14 page document: ~2 seconds
53
+ - 92 page document: ~35 seconds
49
54
 
50
55
  """
51
56
  # pylint: disable=import-error
@@ -83,7 +88,8 @@ class _PDFTextParser(_PDFBaseParser):
83
88
  remove_footer: bool=False,
84
89
  remove_newlines: bool=False,
85
90
  ignore_tags: set=None,
86
- convert_to_ascii: bool=True):
91
+ convert_to_ascii: bool=True,
92
+ **kwargs):
87
93
  """Extract text from the document.
88
94
 
89
95
  If the PDF document contains 'marked content' tags, these tags
@@ -125,10 +131,14 @@ class _PDFTextParser(_PDFBaseParser):
125
131
  converted, it is replaced with a ``'?'``.
126
132
  Defaults to True.
127
133
 
134
+ :Keyword Args:
135
+ - None
136
+
128
137
  Returns:
129
138
  None.
130
139
 
131
140
  """
141
+ # pylint: disable=unused-argument # **kwargs
132
142
  # pylint: disable=unnecessary-dunder-call
133
143
  if len(self.doc.pages) > 1:
134
144
  # Reinitialise the doc object and reopen the document.
@@ -216,7 +226,7 @@ class _PDFTextParser(_PDFBaseParser):
216
226
  yield ''
217
227
 
218
228
  def _uses_marked_content(self) -> bool:
219
- """Test wether the document can be parsed using tags.
229
+ """Test whether the document can be parsed using tags.
220
230
 
221
231
  Marked content allows us to parse the PDF using tags (rather than
222
232
  OCR) which is more accurate not only in terms of character
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides generalised base functionality for
5
+ parsing PPTX documents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ .. attention::
12
+
13
+ This module is *not* designed to be interacted with
14
+ directly, only via the appropriate interface class(es).
15
+
16
+ Rather, please create an instance of a PPTX document parsing
17
+ object using the following:
18
+
19
+ - :class:`~docp.parsers.pptxparser.PPTXParser`
20
+
21
+ """
22
+ # pylint: disable=protected-access
23
+
24
+ import os
25
+ from pptx import Presentation
26
+ # locals
27
+ try:
28
+ from libs.utilities import utilities
29
+ from objects.pptxobject import DocPPTX
30
+ except ImportError:
31
+ from .libs.utilities import utilities
32
+ from .objects.pptxobject import DocPPTX
33
+
34
+
35
+ class _PPTXBaseParser:
36
+ """Base class containing generalised PPTX parsing functionality."""
37
+
38
+ def __init__(self, path: str):
39
+ """Private base parser class initialiser.
40
+
41
+ Args:
42
+ path (str): Full path to the document to be parsed.
43
+
44
+ """
45
+ self._path = path
46
+ self._doc = DocPPTX()
47
+ self._set_paths()
48
+ self._open()
49
+
50
+ @property
51
+ def doc(self) -> DocPPTX:
52
+ """Accessor to the document object."""
53
+ return self._doc
54
+
55
+ def _open(self) -> None:
56
+ """Open the PPTX document for reading.
57
+
58
+ Before opening the file, a test is performed to ensure the PPTX
59
+ is valid. The file must:
60
+
61
+ - exist
62
+ - be a ZIP archive, per the file signature
63
+ - have a .pptx file extension
64
+
65
+ :Other Operations:
66
+
67
+ - Store the ``pptx.Presentation`` parser object returned
68
+ from the :func:`pptx.Presentation` instance creation into
69
+ the :attr:`self._doc._parser` attribute.
70
+ - Store the number of slides into the
71
+ :attr:`self._doc._npages` attribute.
72
+ - Store the document's meta data into the
73
+ :attr:`self._doc._meta` attribute.
74
+
75
+ Raises:
76
+ TypeError: Raised if the file type criteria above are not
77
+ met.
78
+
79
+ """
80
+ if all((os.path.exists(self._doc._fpath),
81
+ utilities.iszip(self._doc._fpath),
82
+ os.path.splitext(self._doc._fpath)[1].lower() == '.pptx')):
83
+ self._doc._parser = Presentation(self._doc._fpath)
84
+ self._doc._npages = len(self._doc._parser.slides)
85
+ self._doc._meta = self._doc._parser.core_properties
86
+ else:
87
+ msg = f'{self._doc._fname} is not a valid PPTX file.'
88
+ raise TypeError(msg)
89
+
90
+ def _set_paths(self) -> None:
91
+ """Set the document's file path attributes."""
92
+ self._doc._fpath = os.path.realpath(self._path)
93
+ self._doc._fname = os.path.basename(self._path)
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing text from a PPTX
5
+ document.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ .. attention::
12
+
13
+ This module is *not* designed to be interacted with
14
+ directly, only via the appropriate interface class(es).
15
+
16
+ Rather, please create an instance of a PPTX document parsing
17
+ object using the following:
18
+
19
+ - :class:`~docp.parsers.pptxparser.PPTXParser`
20
+
21
+ """
22
+
23
+ from unidecode import unidecode
24
+ # locals
25
+ try:
26
+ from .objects._slideobject import SlideObject
27
+ from .objects._textobject import TextObject
28
+ from .parsers._pptxbaseparser import _PPTXBaseParser
29
+ except ImportError:
30
+ from objects._slideobject import SlideObject
31
+ from objects._textobject import TextObject
32
+ from parsers._pptxbaseparser import _PPTXBaseParser
33
+
34
+
35
+ class _PPTXTextParser(_PPTXBaseParser):
36
+ """Private PPTX document text parser intermediate class.
37
+
38
+ Args:
39
+ path (str): Full path to the PPTX document.
40
+
41
+ :Example:
42
+
43
+ Extract text from a PPTX file::
44
+
45
+ >>> from docp import PPTXParser
46
+
47
+ >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
48
+ >>> pptx.extract_text()
49
+
50
+ # Access the text on slide 1.
51
+ >>> pg1 = pptx.doc.slides[1].content
52
+
53
+ """
54
+
55
+ def extract_text(self,
56
+ *,
57
+ remove_newlines: bool=False,
58
+ convert_to_ascii: bool=True,
59
+ **kwargs) -> None:
60
+ """Extract text from the document.
61
+
62
+ A list of slides, with extracted content can be accessed using
63
+ the :attr:`self.doc.slides` attribute.
64
+
65
+ Args:
66
+ remove_newlines (bool, optional): If True, the newline
67
+ characters are replaced with a space. Defaults to False.
68
+ convert_to_ascii (bool, optional): When a non-ASCII character
69
+ is found, an attempt is made to convert it to an
70
+ associated ASCII character. If a character cannot be
71
+ converted, it is replaced with a ``'?'``.
72
+ Defaults to True.
73
+
74
+ :Keyword Args:
75
+ - None
76
+
77
+ Returns:
78
+ None.
79
+
80
+ """
81
+ # pylint: disable=unused-argument # **kwargs
82
+ # pylint: disable=unnecessary-dunder-call
83
+ if len(self.doc.slides) > 1:
84
+ # Reinitialise the doc object and reopen the document.
85
+ self.__init__(path=self._path)
86
+ self._extract_text(remove_newlines=remove_newlines, convert_to_ascii=convert_to_ascii)
87
+
88
+ def _extract_text(self, remove_newlines: bool, convert_to_ascii: bool) -> None:
89
+ """Extract the text from all shapes on all slides.
90
+
91
+ Args:
92
+ remove_newlines (bool): Replace the newline characters with
93
+ a space.
94
+ convert_to_ascii (bool): Attempt to convert any non-ASCII
95
+ characters to their ASCII equivalent.
96
+
97
+ The text extracted from each slide is stored as a ``TextObject``
98
+ which is appended to the slide's ``texts`` attribute.
99
+
100
+ """
101
+ for idx, slide in enumerate(self.doc.parser.slides, 1):
102
+ _slideobj = SlideObject(pageno=idx, parser=slide)
103
+ for shape in slide.shapes:
104
+ if hasattr(shape, 'text'):
105
+ if shape.text:
106
+ text = shape.text
107
+ if remove_newlines:
108
+ text = text.replace('\n', ' ')
109
+ if convert_to_ascii:
110
+ text = unidecode(string=text,
111
+ errors='replace',
112
+ replace_str='?')
113
+ _textobj = TextObject(content=text)
114
+ _slideobj.texts.append(_textobj)
115
+ self.doc.slides.append(_slideobj)
@@ -0,0 +1,51 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module serves as the public interface for interacting
5
+ with PPTX files and parsing their contents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Example: For example code usage, please refer to the
14
+ :class:`PPTXParser` class docstring.
15
+
16
+ """
17
+
18
+ # Set sys.path for relative imports.
19
+ import os
20
+ import sys
21
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
22
+ # locals
23
+ try:
24
+ from .parsers._pptxtextparser import _PPTXTextParser
25
+ except ImportError:
26
+ from parsers._pptxtextparser import _PPTXTextParser
27
+
28
+
29
+ class PPTXParser(_PPTXTextParser):
30
+ """PPTX document parser.
31
+
32
+ Args:
33
+ path (str): Full path to the PPTX document to be parsed.
34
+
35
+ :Example:
36
+
37
+ Extract text from a PPTX file::
38
+
39
+ >>> from docp import PPTXParser
40
+
41
+ >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
42
+ >>> pptx.extract_text()
43
+
44
+ # Access the text on slide 1.
45
+ >>> pg1 = pptx.doc.slides[1].content
46
+
47
+ """
48
+
49
+ def __init__(self, path: str):
50
+ """PPTX parser class initialiser."""
51
+ super().__init__(path=path)
@@ -0,0 +1,48 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides parser-specific utility functions for
5
+ the project.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ """
14
+
15
+ # locals
16
+ try:
17
+ from .libs.utilities import utilities
18
+ from .parsers.pdfparser import PDFParser
19
+ from .parsers.pptxparser import PPTXParser
20
+ except ImportError:
21
+ from libs.utilities import utilities
22
+ from parsers.pdfparser import PDFParser
23
+ from parsers.pptxparser import PPTXParser
24
+
25
+
26
+ class ParserUtilities:
27
+ """Parser-based (cross-project) utility functions."""
28
+
29
+ def get_parser(self, path: str) -> PDFParser | PPTXParser:
30
+ """Return the appropriate parser for the file type.
31
+
32
+ Args:
33
+ path (str): Full path to the file to be tested.
34
+
35
+ Returns:
36
+ PDFParser | PPTXParser: The appropriate parser for the file,
37
+ given the *file signature*; this test is not file extension
38
+ based.
39
+
40
+ """
41
+ if utilities.ispdf(path=path):
42
+ return PDFParser
43
+ if utilities.iszip(path=path):
44
+ return PPTXParser
45
+ raise NotImplementedError('A parser is not available for: os.path.basename(path)')
46
+
47
+
48
+ putilities = ParserUtilities()