docp 0.1.0b1__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
- docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
- docp/__init__.py +19 -10
- docp/dbs/chroma.py +19 -6
- docp/libs/_version.py +1 -0
- docp/libs/changelog.py +7 -0
- docp/libs/utilities.py +107 -0
- docp/loaders/__init__.py +38 -0
- docp/loaders/_chromabaseloader.py +83 -107
- docp/loaders/_chromabaseloader.py.bak +378 -0
- docp/loaders/_chromabasepdfloader.py +121 -0
- docp/loaders/_chromabasepptxloader.py +123 -0
- docp/loaders/{chroma.py → chroma.py.bak} +38 -8
- docp/loaders/chromapdfloader.py +199 -0
- docp/loaders/chromapptxloader.py +192 -0
- docp/loaders/lutilities.py +52 -0
- docp/objects/_docbaseobject.py +7 -18
- docp/objects/_imgobject.py +0 -0
- docp/objects/_pageobject.py +3 -2
- docp/objects/_slideobject.py +110 -0
- docp/objects/_textobject.py +64 -0
- docp/objects/pdfobject.py +24 -2
- docp/objects/pptxobject.py +46 -0
- docp/parsers/_pdfbaseparser.py +36 -10
- docp/parsers/_pdftableparser.py +6 -7
- docp/parsers/_pdftextparser.py +23 -13
- docp/parsers/_pptxbaseparser.py +93 -0
- docp/parsers/_pptxtextparser.py +115 -0
- docp/parsers/pptxparser.py +51 -0
- docp/parsers/putilities.py +48 -0
- docp-0.2.0.dist-info/METADATA +110 -0
- docp-0.2.0.dist-info/RECORD +49 -0
- {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
- docp/_version.py +0 -1
- docp-0.1.0b1.dist-info/METADATA +0 -55
- docp-0.1.0b1.dist-info/RECORD +0 -23
- {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/LICENSE +0 -0
- {docp-0.1.0b1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
docp/objects/_textobject.py
CHANGED
@@ -0,0 +1,64 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the implementation for the
|
5
|
+
``TextObject`` object.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
"""
|
14
|
+
|
15
|
+
|
16
|
+
class TextObject:
    """Container for a piece of text extracted from a page or slide.

    Instances are created by the parsers, populated with the extracted
    text and appended to the owning page (or slide) object's ``texts``
    list attribute.

    Args:
        content (str): Text content, as a single string.

    Note:
        The string held by the :attr:`content` attribute is stored
        verbatim; no string cleaning is performed by this class.

    """

    __slots__ = ('_content', '_hastext')

    def __init__(self, content: str):
        """Store the text and record whether it is populated."""
        self._hastext = bool(content)
        self._content = content

    def __str__(self) -> str:
        """Return the stored text, so printing the object displays it."""
        return self._content

    @property
    def content(self) -> str:
        """Accessor to the stored text."""
        return self._content

    @content.setter
    def content(self, value: str) -> None:
        """Replace the stored text with a populated ``value``.

        A falsy ``value`` (e.g. an empty string) is ignored: both the
        existing content and the ``hastext`` flag are left untouched.
        Otherwise the content is replaced and ``hastext`` becomes
        ``True``.

        """
        if not value:
            return
        self._content, self._hastext = value, True

    @property
    def hastext(self) -> bool:
        """Flag indicating whether populated text has been stored."""
        return self._hastext
|
docp/objects/pdfobject.py
CHANGED
@@ -12,9 +12,13 @@
|
|
12
12
|
:Comments: n/a
|
13
13
|
|
14
14
|
"""
|
15
|
-
# pylint: disable=import-error
|
16
15
|
|
17
|
-
|
16
|
+
try:
|
17
|
+
from .objects._docbaseobject import _DocBase
|
18
|
+
from .objects._pageobject import PageObject
|
19
|
+
except ImportError:
|
20
|
+
from objects._docbaseobject import _DocBase
|
21
|
+
from objects._pageobject import PageObject
|
18
22
|
|
19
23
|
|
20
24
|
class DocPDF(_DocBase):
|
@@ -24,6 +28,24 @@ class DocPDF(_DocBase):
|
|
24
28
|
"""PDF document object class initialiser."""
|
25
29
|
super().__init__()
|
26
30
|
self._tags = False
|
31
|
+
# List of PageObjects, offset by 1 to align the index with page numbers.
|
32
|
+
self._pages = [PageObject(pageno=0)]
|
33
|
+
|
34
|
+
@property
|
35
|
+
def pages(self) -> list[PageObject]:
|
36
|
+
"""A list containing an object for each page in the document.
|
37
|
+
|
38
|
+
.. tip::
|
39
|
+
|
40
|
+
The page number index aligns to the page number in the PDF
|
41
|
+
file.
|
42
|
+
|
43
|
+
For example, to access the ``PageObject`` for page 42, use::
|
44
|
+
|
45
|
+
pages[42]
|
46
|
+
|
47
|
+
"""
|
48
|
+
return self._pages
|
27
49
|
|
28
50
|
@property
|
29
51
|
def parsed_using_tags(self) -> bool:
|
@@ -0,0 +1,46 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the 'PPTX Document' object structure
|
5
|
+
into which MS PowerPoint documents are parsed for
|
6
|
+
transport and onward use.
|
7
|
+
|
8
|
+
:Platform: Linux/Windows | Python 3.10+
|
9
|
+
:Developer: J Berendt
|
10
|
+
:Email: development@s3dev.uk
|
11
|
+
|
12
|
+
:Comments: n/a
|
13
|
+
|
14
|
+
"""
|
15
|
+
|
16
|
+
try:
|
17
|
+
from .objects._docbaseobject import _DocBase
|
18
|
+
from .objects._slideobject import SlideObject
|
19
|
+
except ImportError:
|
20
|
+
from objects._docbaseobject import _DocBase
|
21
|
+
from objects._slideobject import SlideObject
|
22
|
+
|
23
|
+
|
24
|
+
class DocPPTX(_DocBase):
    """Document object holding the data parsed from a PPTX file."""

    def __init__(self):
        """Initialise the base document and the slide container."""
        super().__init__()
        # Index 0 holds a placeholder slide so that the list index lines
        # up with the (1-based) slide number.
        placeholder = SlideObject(pageno=0)
        self._slides = [placeholder]

    @property
    def slides(self) -> list[SlideObject]:
        """A list containing an object for each slide in the document.

        .. tip::

            The list index aligns to the slide number in the PPTX
            file.

            For example, to access the ``SlideObject`` for slide 42,
            use::

                slides[42]

        """
        return self._slides
|
docp/parsers/_pdfbaseparser.py
CHANGED
@@ -8,13 +8,17 @@
|
|
8
8
|
:Developer: J Berendt
|
9
9
|
:Email: development@s3dev.uk
|
10
10
|
|
11
|
-
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
.. attention::
|
14
|
+
|
15
|
+
This module is *not* designed to be interacted with
|
12
16
|
directly, only via the appropriate interface class(es).
|
13
17
|
|
14
18
|
Rather, please create an instance of a PDF document parsing
|
15
|
-
object using the following:
|
19
|
+
object using the following class:
|
16
20
|
|
17
|
-
- :class
|
21
|
+
- :class:`~docp.parsers.pdfparser.PDFParser`
|
18
22
|
|
19
23
|
"""
|
20
24
|
# pylint: disable=import-error
|
@@ -26,7 +30,12 @@ import pdfplumber
|
|
26
30
|
from collections import Counter
|
27
31
|
from unidecode import unidecode
|
28
32
|
# locals
|
29
|
-
|
33
|
+
try:
|
34
|
+
from .libs.utilities import utilities
|
35
|
+
from .objects.pdfobject import DocPDF
|
36
|
+
except ImportError:
|
37
|
+
from libs.utilities import utilities
|
38
|
+
from objects.pdfobject import DocPDF
|
30
39
|
|
31
40
|
|
32
41
|
class _PDFBaseParser:
|
@@ -94,7 +103,7 @@ class _PDFBaseParser:
|
|
94
103
|
case 1: num = 1
|
95
104
|
case _ if npages in range(2, 11): num = 2
|
96
105
|
case _: num = 5
|
97
|
-
pg = self._doc.parser.pages[num] # The
|
106
|
+
pg = self._doc.parser.pages[num - 1] # The parser does not have a page offset at [0].
|
98
107
|
# Default coordinates to the whole page.
|
99
108
|
coords = {'x0': 0, 'top': 0, 'x1': pg.width, 'bottom': pg.height}
|
100
109
|
# If the header and/or footer is to be skipped, find and iterate
|
@@ -117,6 +126,13 @@ class _PDFBaseParser:
|
|
117
126
|
def _open(self) -> None:
|
118
127
|
"""Open the PDF document for reading.
|
119
128
|
|
129
|
+
Before opening the file, a test is performed to ensure the PDF
|
130
|
+
is valid. The file must:
|
131
|
+
|
132
|
+
- exist
|
133
|
+
- be a valid PDF file, per the file signature
|
134
|
+
- have a .pdf file extension
|
135
|
+
|
120
136
|
:Other Operations:
|
121
137
|
|
122
138
|
- Store the ``pdfplumber`` parser object returned from the
|
@@ -127,10 +143,20 @@ class _PDFBaseParser:
|
|
127
143
|
- Store the document's meta data into the
|
128
144
|
:attr:`self._doc._meta` attribute.
|
129
145
|
|
146
|
+
Raises:
|
147
|
+
TypeError: Raised if the file type criteria above are not
|
148
|
+
met.
|
149
|
+
|
130
150
|
"""
|
131
|
-
|
132
|
-
|
133
|
-
|
151
|
+
if all((os.path.exists(self._doc._fpath),
|
152
|
+
utilities.ispdf(self._doc._fpath),
|
153
|
+
os.path.splitext(self._doc._fpath)[1].lower() == '.pdf')):
|
154
|
+
self._doc._parser = pdfplumber.open(self._doc._fpath)
|
155
|
+
self._doc._npages = len(self._doc._parser.pages)
|
156
|
+
self._doc._meta = self._doc._parser.metadata
|
157
|
+
else:
|
158
|
+
msg = f'{self._doc._fname} is not a valid PDF file.'
|
159
|
+
raise TypeError(msg)
|
134
160
|
|
135
161
|
@staticmethod
|
136
162
|
def _prepare_row(row: list) -> str:
|
@@ -196,13 +222,13 @@ class _PDFBaseParser:
|
|
196
222
|
# Only scan if document has more than three pages.
|
197
223
|
if self._doc.npages < 4:
|
198
224
|
return []
|
199
|
-
if self._doc.
|
225
|
+
if self._doc._common is None:
|
200
226
|
# Create a line generator for all pages.
|
201
227
|
lines = (l for p in self._doc.parser.pages for l in p.extract_text().split('\n'))
|
202
228
|
# Return the lines whose occurrence rate is 90% of document pages.
|
203
229
|
self._doc._common = [i[0] for i in Counter(lines).most_common()
|
204
230
|
if i[1] > self._doc.npages * 0.9]
|
205
|
-
return self._doc.
|
231
|
+
return self._doc._common
|
206
232
|
|
207
233
|
def _set_paths(self) -> None:
|
208
234
|
"""Set the document's file path attributes."""
|
docp/parsers/_pdftableparser.py
CHANGED
@@ -8,14 +8,15 @@
|
|
8
8
|
:Developer: J Berendt
|
9
9
|
:Email: jeremy.berendt@rolls-royce.com
|
10
10
|
|
11
|
-
|
11
|
+
.. attention::
|
12
|
+
|
13
|
+
This module is *not* designed to be interacted with
|
12
14
|
directly, only via the appropriate interface class(es).
|
13
15
|
|
14
16
|
Rather, please create an instance of a PDF document parsing
|
15
17
|
object using the following:
|
16
18
|
|
17
|
-
- :class
|
18
|
-
|
19
|
+
- :class:`~docp.parsers.pdfparser.PDFParser`
|
19
20
|
|
20
21
|
"""
|
21
22
|
# pylint: disable=import-error
|
@@ -35,7 +36,6 @@ _SETTINGS = {'vertical_strategy': 'lines',
|
|
35
36
|
'snap_x_tolerance': 12}
|
36
37
|
|
37
38
|
|
38
|
-
# TODO: Revise the docstring.
|
39
39
|
class _PDFTableParser(_PDFBaseParser):
|
40
40
|
"""Private PDF document table parser intermediate class.
|
41
41
|
|
@@ -46,10 +46,9 @@ class _PDFTableParser(_PDFBaseParser):
|
|
46
46
|
|
47
47
|
Extract tables from a PDF file::
|
48
48
|
|
49
|
-
>>> from
|
49
|
+
>>> from docp import PDFParser
|
50
50
|
|
51
|
-
>>>
|
52
|
-
>>> pdf = PDFParser(path)
|
51
|
+
>>> pdf = PDFParser(path='/path/to/myfile.pdf')
|
53
52
|
>>> pdf.extract_tables()
|
54
53
|
|
55
54
|
>>> tables = pdf.doc.tables
|
docp/parsers/_pdftextparser.py
CHANGED
@@ -8,17 +8,22 @@
|
|
8
8
|
:Developer: J Berendt
|
9
9
|
:Email: development@s3dev.uk
|
10
10
|
|
11
|
-
|
11
|
+
.. attention::
|
12
|
+
|
13
|
+
This module is *not* designed to be interacted with
|
12
14
|
directly, only via the appropriate interface class(es).
|
13
15
|
|
14
16
|
Rather, please create an instance of a PDF document parsing
|
15
17
|
object using the following:
|
16
18
|
|
17
|
-
- :class
|
19
|
+
- :class:`~docp.parsers.pdfparser.PDFParser`
|
20
|
+
|
21
|
+
.. note::
|
22
|
+
|
23
|
+
**Multi-processing**
|
18
24
|
|
19
|
-
Note: **Multi-processing:**
|
20
25
|
Text extraction through multi-processing has been tested and
|
21
|
-
is not
|
26
|
+
is not feasible due to an error indicating
|
22
27
|
the ``pdfplumber.page.Page`` object can not be pickled. This
|
23
28
|
object was being passed into the extraction method as the
|
24
29
|
object contains the :func:`extract_text` function.
|
@@ -35,17 +40,17 @@ Note: **Multi-processing:**
|
|
35
40
|
It has therefore been determined that this module will remain
|
36
41
|
single-threaded.
|
37
42
|
|
38
|
-
|
43
|
+
**Multi-Thread Timings**
|
39
44
|
|
40
|
-
|
45
|
+
- **Single-threaded:**
|
41
46
|
|
42
|
-
|
43
|
-
|
47
|
+
- 14 page document: ~2 seconds
|
48
|
+
- 92 page document: ~32 seconds
|
44
49
|
|
45
|
-
|
50
|
+
- **Multi-threaded:**
|
46
51
|
|
47
|
-
|
48
|
-
|
52
|
+
- 14 page document: ~2 seconds
|
53
|
+
- 92 page document: ~35 seconds
|
49
54
|
|
50
55
|
"""
|
51
56
|
# pylint: disable=import-error
|
@@ -83,7 +88,8 @@ class _PDFTextParser(_PDFBaseParser):
|
|
83
88
|
remove_footer: bool=False,
|
84
89
|
remove_newlines: bool=False,
|
85
90
|
ignore_tags: set=None,
|
86
|
-
convert_to_ascii: bool=True
|
91
|
+
convert_to_ascii: bool=True,
|
92
|
+
**kwargs):
|
87
93
|
"""Extract text from the document.
|
88
94
|
|
89
95
|
If the PDF document contains 'marked content' tags, these tags
|
@@ -125,10 +131,14 @@ class _PDFTextParser(_PDFBaseParser):
|
|
125
131
|
converted, it is replaced with a ``'?'``.
|
126
132
|
Defaults to True.
|
127
133
|
|
134
|
+
:Keyword Args:
|
135
|
+
- None
|
136
|
+
|
128
137
|
Returns:
|
129
138
|
None.
|
130
139
|
|
131
140
|
"""
|
141
|
+
# pylint: disable=unused-argument # **kwargs
|
132
142
|
# pylint: disable=unnecessary-dunder-call
|
133
143
|
if len(self.doc.pages) > 1:
|
134
144
|
# Reinitialise the doc object and reopen the document.
|
@@ -216,7 +226,7 @@ class _PDFTextParser(_PDFBaseParser):
|
|
216
226
|
yield ''
|
217
227
|
|
218
228
|
def _uses_marked_content(self) -> bool:
|
219
|
-
"""Test
|
229
|
+
"""Test whether the document can be parsed using tags.
|
220
230
|
|
221
231
|
Marked content allows us to parse the PDF using tags (rather than
|
222
232
|
OCR) which is more accurate not only in terms of character
|
@@ -0,0 +1,93 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides generalised base functionality for
|
5
|
+
parsing PPTX documents.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
.. attention::
|
12
|
+
|
13
|
+
This module is *not* designed to be interacted with
|
14
|
+
directly, only via the appropriate interface class(es).
|
15
|
+
|
16
|
+
Rather, please create an instance of a PPTX document parsing
|
17
|
+
object using the following:
|
18
|
+
|
19
|
+
- :class:`~docp.parsers.pptxparser.PPTXParser`
|
20
|
+
|
21
|
+
"""
|
22
|
+
# pylint: disable=protected-access
|
23
|
+
|
24
|
+
import os
|
25
|
+
from pptx import Presentation
|
26
|
+
# locals
|
27
|
+
try:
|
28
|
+
from libs.utilities import utilities
|
29
|
+
from objects.pptxobject import DocPPTX
|
30
|
+
except ImportError:
|
31
|
+
from .libs.utilities import utilities
|
32
|
+
from .objects.pptxobject import DocPPTX
|
33
|
+
|
34
|
+
|
35
|
+
class _PPTXBaseParser:
    """Base class containing generalised PPTX parsing functionality."""

    def __init__(self, path: str):
        """Private base parser class initialiser.

        A new ``DocPPTX`` document object is created, its path
        attributes are set and the file is opened for reading.

        Args:
            path (str): Full path to the document to be parsed.

        Raises:
            TypeError: Raised by :meth:`_open` if the file is not a valid
                PPTX file.

        """
        self._path = path
        self._doc = DocPPTX()
        self._set_paths()
        self._open()

    @property
    def doc(self) -> DocPPTX:
        """Accessor to the document object."""
        return self._doc

    def _open(self) -> None:
        """Open the PPTX document for reading.

        Before opening the file, a test is performed to ensure the PPTX
        is valid. The file must:

            - exist
            - have a .pptx file extension
            - be a ZIP archive, per the file signature

        The tests are short-circuited in the order listed, so the file
        signature is only inspected for an existing path carrying the
        expected extension.

        :Other Operations:

            - Store the ``pptx.Presentation`` parser object returned
              from the :func:`pptx.Presentation` instance creation into
              the :attr:`self._doc._parser` attribute.
            - Store the number of slides into the
              :attr:`self._doc._npages` attribute.
            - Store the document's meta data (core properties) into the
              :attr:`self._doc._meta` attribute.

        Raises:
            TypeError: Raised if the file type criteria above are not
                met.

        """
        fpath = self._doc._fpath
        # Chained ``and`` (rather than ``all`` over a pre-built tuple)
        # ensures the signature test is never run against a missing path.
        # NOTE(review): presumably ``utilities.iszip`` reads the file's
        # leading bytes - confirm against its implementation.
        isvalid = (os.path.exists(fpath)
                   and os.path.splitext(fpath)[1].lower() == '.pptx'
                   and utilities.iszip(fpath))
        if not isvalid:
            msg = f'{self._doc._fname} is not a valid PPTX file.'
            raise TypeError(msg)
        self._doc._parser = Presentation(fpath)
        self._doc._npages = len(self._doc._parser.slides)
        self._doc._meta = self._doc._parser.core_properties

    def _set_paths(self) -> None:
        """Set the document's file path attributes.

        The resolved (real) path is stored in ``_fpath`` and the base
        file name in ``_fname``.

        """
        self._doc._fpath = os.path.realpath(self._path)
        self._doc._fname = os.path.basename(self._path)
|
@@ -0,0 +1,115 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the logic for parsing text from a PPTX
|
5
|
+
document.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
.. attention::
|
12
|
+
|
13
|
+
This module is *not* designed to be interacted with
|
14
|
+
directly, only via the appropriate interface class(es).
|
15
|
+
|
16
|
+
Rather, please create an instance of a PPTX document parsing
|
17
|
+
object using the following:
|
18
|
+
|
19
|
+
- :class:`~docp.parsers.pptxparser.PPTXParser`
|
20
|
+
|
21
|
+
"""
|
22
|
+
|
23
|
+
from unidecode import unidecode
|
24
|
+
# locals
|
25
|
+
try:
|
26
|
+
from .objects._slideobject import SlideObject
|
27
|
+
from .objects._textobject import TextObject
|
28
|
+
from .parsers._pptxbaseparser import _PPTXBaseParser
|
29
|
+
except ImportError:
|
30
|
+
from objects._slideobject import SlideObject
|
31
|
+
from objects._textobject import TextObject
|
32
|
+
from parsers._pptxbaseparser import _PPTXBaseParser
|
33
|
+
|
34
|
+
|
35
|
+
class _PPTXTextParser(_PPTXBaseParser):
    """Private intermediate class implementing PPTX text extraction.

    Args:
        path (str): Full path to the PPTX document.

    :Example:

        Extract text from a PPTX file::

            >>> from docp import PPTXParser

            >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
            >>> pptx.extract_text()

            # Access the text on slide 1.
            >>> pg1 = pptx.doc.slides[1].content

    """

    def extract_text(self,
                     *,
                     remove_newlines: bool=False,
                     convert_to_ascii: bool=True,
                     **kwargs) -> None:
        """Extract text from the document.

        Once extracted, the list of populated slides is available via
        the :attr:`self.doc.slides` attribute.

        Args:
            remove_newlines (bool, optional): If True, the newline
                characters are replaced with a space. Defaults to False.
            convert_to_ascii (bool, optional): When a non-ASCII character
                is found, an attempt is made to convert it to an
                associated ASCII character. If a character cannot be
                converted, it is replaced with a ``'?'``.
                Defaults to True.

        :Keyword Args:
            - None

        Returns:
            None.

        """
        # pylint: disable=unused-argument  # **kwargs
        # pylint: disable=unnecessary-dunder-call
        # The slides list starts with a single placeholder entry, so more
        # than one entry means a previous extraction has been run.
        already_extracted = len(self.doc.slides) > 1
        if already_extracted:
            # Reinitialise the doc object and reopen the document.
            self.__init__(path=self._path)
        self._extract_text(remove_newlines=remove_newlines, convert_to_ascii=convert_to_ascii)

    def _extract_text(self, remove_newlines: bool, convert_to_ascii: bool) -> None:
        """Extract the text from all shapes on all slides.

        Args:
            remove_newlines (bool): Replace the newline characters with
                a space.
            convert_to_ascii (bool): Attempt to convert any non-ASCII
                characters to their ASCII equivalent.

        For every slide a ``SlideObject`` is created; the text of each
        shape carrying non-empty text is wrapped in a ``TextObject`` and
        appended to that slide's ``texts`` attribute. The slide object
        is then appended to the document's ``slides`` list.

        """
        for number, slide in enumerate(self.doc.parser.slides, 1):
            slideobj = SlideObject(pageno=number, parser=slide)
            for shape in slide.shapes:
                # Skip shapes which cannot carry text.
                if not hasattr(shape, 'text'):
                    continue
                content = shape.text
                # Skip shapes whose text is empty.
                if not content:
                    continue
                if remove_newlines:
                    content = content.replace('\n', ' ')
                if convert_to_ascii:
                    content = unidecode(string=content,
                                        errors='replace',
                                        replace_str='?')
                slideobj.texts.append(TextObject(content=content))
            self.doc.slides.append(slideobj)
|
@@ -0,0 +1,51 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module serves as the public interface for interacting
|
5
|
+
with PPTX files and parsing their contents.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
:Example: For example code usage, please refer to the
|
14
|
+
:class:`PPTXParser` class docstring.
|
15
|
+
|
16
|
+
"""
|
17
|
+
|
18
|
+
# Set sys.path for relative imports.
|
19
|
+
import os
|
20
|
+
import sys
|
21
|
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
|
22
|
+
# locals
|
23
|
+
try:
|
24
|
+
from .parsers._pptxtextparser import _PPTXTextParser
|
25
|
+
except ImportError:
|
26
|
+
from parsers._pptxtextparser import _PPTXTextParser
|
27
|
+
|
28
|
+
|
29
|
+
class PPTXParser(_PPTXTextParser):
    """Public interface class for parsing PPTX documents.

    Args:
        path (str): Full path to the PPTX document to be parsed.

    :Example:

        Extract text from a PPTX file::

            >>> from docp import PPTXParser

            >>> pptx = PPTXParser(path='/path/to/myfile.pptx')
            >>> pptx.extract_text()

            # Access the text on slide 1.
            >>> pg1 = pptx.doc.slides[1].content

    """

    def __init__(self, path: str):
        """Initialise the parser by delegating to the parent class."""
        super().__init__(path=path)
|
@@ -0,0 +1,48 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides parser-specific utility functions for
|
5
|
+
the project.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
"""
|
14
|
+
|
15
|
+
# locals
|
16
|
+
try:
|
17
|
+
from .libs.utilities import utilities
|
18
|
+
from .parsers.pdfparser import PDFParser
|
19
|
+
from .parsers.pptxparser import PPTXParser
|
20
|
+
except ImportError:
|
21
|
+
from libs.utilities import utilities
|
22
|
+
from parsers.pdfparser import PDFParser
|
23
|
+
from parsers.pptxparser import PPTXParser
|
24
|
+
|
25
|
+
|
26
|
+
class ParserUtilities:
    """Parser-based (cross-project) utility functions."""

    def get_parser(self, path: str) -> type[PDFParser] | type[PPTXParser]:
        """Return the appropriate parser class for the file type.

        Args:
            path (str): Full path to the file to be tested.

        Raises:
            NotImplementedError: Raised if the file is neither detected
                as a PDF nor as a ZIP archive.

        Returns:
            type[PDFParser] | type[PPTXParser]: The parser *class* (not
            an instance) appropriate for the file, given the
            *file signature*; this test is not file extension based.

        Note:
            Any file detected as a ZIP archive is mapped to
            ``PPTXParser``; no further check is made here that the
            archive is actually a PPTX document.

        """
        # pylint: disable=import-outside-toplevel
        # Imported locally as this module does not import ``os`` at the
        # top level.
        import os
        if utilities.ispdf(path=path):
            return PDFParser
        if utilities.iszip(path=path):
            return PPTXParser
        # f-string so the offending file's name is reported, rather than
        # the literal expression text.
        raise NotImplementedError(f'A parser is not available for: {os.path.basename(path)}')


putilities = ParserUtilities()
|