docp 0.1.0b1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- docp/__init__.py +31 -0
- docp/_version.py +1 -0
- docp/dbs/__init__.py +0 -0
- docp/dbs/chroma.py +184 -0
- docp/loaders/__init__.py +0 -0
- docp/loaders/_chromabaseloader.py +362 -0
- docp/loaders/chroma.py +166 -0
- docp/objects/__init__.py +0 -0
- docp/objects/_docbaseobject.py +76 -0
- docp/objects/_pageobject.py +126 -0
- docp/objects/_tableobject.py +0 -0
- docp/objects/_textobject.py +0 -0
- docp/objects/pdfobject.py +39 -0
- docp/parsers/__init__.py +0 -0
- docp/parsers/_pdfbaseparser.py +210 -0
- docp/parsers/_pdftableparser.py +273 -0
- docp/parsers/_pdftextparser.py +253 -0
- docp/parsers/pdfparser.py +62 -0
- docp-0.1.0b1.dist-info/LICENSE +622 -0
- docp-0.1.0b1.dist-info/METADATA +55 -0
- docp-0.1.0b1.dist-info/RECORD +23 -0
- docp-0.1.0b1.dist-info/WHEEL +5 -0
- docp-0.1.0b1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,273 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the logic for parsing tables from a PDF
|
5
|
+
document.
|
6
|
+
|
7
|
+
:Platform: Linux
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: jeremy.berendt@rolls-royce.com
|
10
|
+
|
11
|
+
Note: This module is *not* designed to be interacted with
|
12
|
+
directly, only via the appropriate interface class(es).
|
13
|
+
|
14
|
+
Rather, please create an instance of a PDF document parsing
|
15
|
+
object using the following:
|
16
|
+
|
17
|
+
- :class:`pdfparser.PDFParser`
|
18
|
+
|
19
|
+
|
20
|
+
"""
|
21
|
+
# pylint: disable=import-error
|
22
|
+
# pylint: disable=protected-access
|
23
|
+
# pylint: disable=wrong-import-order
|
24
|
+
|
25
|
+
import io
|
26
|
+
import os
|
27
|
+
import pandas as pd
|
28
|
+
import shutil
|
29
|
+
# locals
|
30
|
+
from parsers._pdfbaseparser import _PDFBaseParser
|
31
|
+
|
32
|
+
# TODO: Move to a config file/class. (TOML?)
|
33
|
+
_SETTINGS = {'vertical_strategy': 'lines',
|
34
|
+
'horizontal_strategy':'lines',
|
35
|
+
'snap_x_tolerance': 12}
|
36
|
+
|
37
|
+
|
38
|
+
class _PDFTableParser(_PDFBaseParser):
    """Private PDF document table parser intermediate class.

    Args:
        path (str): Full path to the PDF document.

    :Example:

        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> path = '/path/to/myfile.pdf'
            >>> pdf = PDFParser(path)
            >>> pdf.extract_tables()

            >>> tables = pdf.doc.tables

    """

    def extract_tables(self,
                       table_settings: dict=None,
                       as_dataframe: bool=False,
                       to_csv: bool=True,
                       verbose: bool=False):
        """Extract tables from the document.

        Before a table is extracted, a number of validation tests are
        performed to verify what has been identified as a 'table' is
        actually a table which might be useful to the user.

        Each 'valid' table is written as a CSV file on the user's
        desktop.

        Additionally, the extracted table data is stored to the class'
        :attr:`self.tables` attribute.

        Args:
            table_settings (dict, optional): Table settings to be used
                for the table extraction. Defaults to None, which is
                replaced by the value in the config.
            as_dataframe (bool, optional): By default, the extracted
                tables are returned as a list of (lists of lists), for
                example: all_tables[table[rows[data]]]. However, if this
                argument is ``True``, the table data is returned as a
                list of ``pandas.DataFrame`` objects. In this case, the
                first row of the table is used as the header, and all
                remaining rows are treated as data. **Note:** This will
                *not* work properly for all tables. Defaults to False.
            to_csv (bool, optional): Dump extracted table data to a CSV
                file, one per table. Defaults to True.
            verbose (bool, optional): Display how many tables were
                extracted, and the path to their location.

        """
        # pylint: disable=invalid-name
        # pylint: disable=too-many-nested-blocks
        # pylint: disable=unnecessary-dunder-call
        if self._doc.tables:
            # Reinitialise the doc object and reopen the document.
            self.__init__(path=self._path)
        c = 0
        if to_csv:
            self._create_table_directory_path()
        if table_settings is None:
            table_settings = _SETTINGS
        for p in self._doc._pdf.pages:
            tblno = 1
            tables = self._filter_tables(tables=p.find_tables(), threshold=5000)
            for table in tables:
                pc = p.crop(table.bbox)
                data = pc.extract_table(table_settings=table_settings)
                # Guard against None (extract_table found nothing) and
                # single-row or single-column 'tables', which are most
                # likely blocks of regular text.
                if data and len(data) > 1 and all(len(row) > 1 for row in data):
                    # Verify no table rows are found in the most common
                    # rows (header/footer).
                    if not self._table_header_footer(table=data):
                        if not as_dataframe:
                            self._doc._tables.append(data)
                        if to_csv or as_dataframe:
                            buffer = self._to_buffer(data=data)
                            if to_csv:
                                c += self._to_csv(buffer=buffer,
                                                  pageno=p.page_number,
                                                  tableno=tblno)
                            if as_dataframe:
                                self._to_df(buffer=buffer)
                            buffer.close()
                        tblno += 1
        if verbose and to_csv:
            print('',
                  'Complete.',
                  f'{c} tables were extracted and stored at the path below.',
                  f'Path: {self._tbl_opath}',
                  sep='\n')

    def _create_table_directory_path(self):
        """Create the output directory for table data.

        If the directory does not exist, it is created.

        """
        # Defined in parent class.
        # pylint: disable=attribute-defined-outside-init
        # Replace spaces (32) and hyphens (45) with underscores in the
        # document's base filename.
        trans = {32: '_', 45: '_'}
        fname = (os.path.splitext(os.path.basename(self._path))[0]
                 .lower()
                 .translate(trans))
        # expanduser('~') is portable; os.environ['HOME'] raises KeyError
        # on Windows, where HOME is generally not set.
        path = os.path.join(os.path.expanduser('~'),
                            'Desktop',
                            'docutils',
                            'pdf_tables',
                            fname)
        self._tbl_opath = path
        # exist_ok avoids the test-then-create race condition.
        os.makedirs(path, exist_ok=True)

    def _create_table_file_path(self, pageno: int, tblno: int) -> str:
        """Create the filename for the table.

        Args:
            pageno (int): Page from which the table was extracted.
            tblno (int): Number of the table on the page, starting at 1.

        Returns:
            str: Explicit path to the file to be written, of the form
            ``<output_dir>/pgNNN_tbNNN.csv``.

        """
        path = os.path.join(self._tbl_opath,
                            f'pg{str(pageno).zfill(3)}_tb{str(tblno).zfill(3)}.csv')
        return path

    @staticmethod
    def _filter_tables(tables: list, threshold: int=5000) -> list:
        """Remove tables from the passed list which are deemed invalid.

        Args:
            tables (list): A list of tables as detected by the
                :meth:`Page.find_table()` method.
            threshold (int, optional): Minimum pixel area for a detected
                table to be returned. Defaults to 5000.

        :Rationale:
            An 'invalid' table is determined by the number of pixels
            which the table covered. Any table which is less than (N)
            pixels is likely a block of text which has been categorised
            as a 'table', but is not.

        Returns:
            list: A list of tables whose pixel area is greater than
            ``threshold``.

        """
        # pylint: disable=invalid-name
        t = []
        for table in tables:
            x0, y0, x1, y1 = table.bbox
            # Keep only tables whose bbox area exceeds the threshold.
            if (x1-x0) * (y1-y0) > threshold:
                t.append(table)
        return t

    def _table_header_footer(self, table: list[list]) -> bool:
        """Verify a table is not a header or footer.

        Args:
            table (list[list]): Table (a list of lists) to be analysed.

        :Rationale:
            A table is determined to be a header or footer if any of the
            lines contained in the 'common lines list' are found in the
            table.

            If any of these lines are found, the table is determined to
            be a header/footer, True is returned.

        Returns:
            bool: False if the table is *not* a header/footer, otherwise
            True.

        """
        lines = self._scan_common()  # Only re-runs if not already run.
        # r: row; c: cell; l: line
        return any(l in c for l in lines for r in table for c in r if c)

    def _to_buffer(self, data: list[list]) -> io.StringIO:
        """Write the table data into a string buffer.

        Args:
            data (list[list]): The table data as a list of lists to be
                written to a buffer.

        Returns:
            io.StringIO: A string buffer as an ``io.StringIO`` object,
            rewound to position 0, ready for reading.

        """
        b = io.StringIO()
        for row in data:
            line = self._prepare_row(row=row)
            b.write(line)
            b.write('\n')
        b.seek(0)
        return b

    def _to_csv(self, buffer: io.StringIO, pageno: int, tableno: int) -> int:
        """Write a table (from the buffer) to CSV.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.
            pageno (int): Page number from the ``Page`` object.
            tableno (int): Number of the table on the page, based at 1.

        Returns:
            int: 1 if the file was written, otherwise 0. This is used by
            the caller to track the number of CSV files written.

        """
        if buffer.seek(0, os.SEEK_END):  # Test buffer is populated.
            path = self._create_table_file_path(pageno=pageno, tblno=tableno)
            with open(path, 'w', encoding='utf-8') as f:
                buffer.seek(0)
                shutil.copyfileobj(buffer, f)
            return 1
        return 0

    def _to_df(self, buffer: io.StringIO):
        """Write a table (from the buffer) to a DataFrame.

        Once written, the DataFrame is appended to the
        :attr:`self._doc._tables` list of tables.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.

        """
        if buffer.seek(0, os.SEEK_END):  # Test buffer is populated.
            buffer.seek(0)
            self._doc._tables.append(pd.read_csv(buffer))
|
@@ -0,0 +1,253 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module provides the logic for parsing text from a PDF
|
5
|
+
document.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
Note: This module is *not* designed to be interacted with
|
12
|
+
directly, only via the appropriate interface class(es).
|
13
|
+
|
14
|
+
Rather, please create an instance of a PDF document parsing
|
15
|
+
object using the following:
|
16
|
+
|
17
|
+
- :class:`pdfparser.PDFParser`
|
18
|
+
|
19
|
+
Note: **Multi-processing:**
|
20
|
+
Text extraction through multi-processing has been tested and
|
21
|
+
is not feasible due to an error indicating
|
22
|
+
the ``pdfplumber.page.Page`` object can not be pickled. This
|
23
|
+
object was being passed into the extraction method as the
|
24
|
+
object contains the :func:`extract_text` function.
|
25
|
+
|
26
|
+
Additionally, multi-threading has also been tested and
|
27
|
+
it was determined to be too complex and inefficient. This was
|
28
|
+
tested using the ``concurrent.futures.ThreadPoolExecutor``
|
29
|
+
class and two documents, 14 and 92 pages; the timings are
|
30
|
+
shown below. The multi-threaded approach took longer to
|
31
|
+
process and added unnecessary complexity to the code base.
|
32
|
+
As a side-effect, the pages are processed and stored out of
|
33
|
+
order which would require a re-order, adding more complexity.
|
34
|
+
|
35
|
+
It has therefore been determined that this module will remain
|
36
|
+
single-threaded.
|
37
|
+
|
38
|
+
**Multi-Thread Timings**
|
39
|
+
|
40
|
+
**Single-threaded:**
|
41
|
+
|
42
|
+
- 14 page document: ~2 seconds
|
43
|
+
- 92 page document: ~32 seconds
|
44
|
+
|
45
|
+
**Multi-threaded:**
|
46
|
+
|
47
|
+
- 14 page document: ~2 seconds
|
48
|
+
- 92 page document: ~35 seconds
|
49
|
+
|
50
|
+
"""
|
51
|
+
# pylint: disable=import-error
|
52
|
+
|
53
|
+
from __future__ import annotations
|
54
|
+
from unidecode import unidecode
|
55
|
+
# locals
|
56
|
+
from objects._pageobject import PageObject
|
57
|
+
from parsers._pdfbaseparser import _PDFBaseParser
|
58
|
+
|
59
|
+
|
60
|
+
class _PDFTextParser(_PDFBaseParser):
|
61
|
+
"""Private PDF document text parser intermediate class.
|
62
|
+
|
63
|
+
Args:
|
64
|
+
path (str): Full path to the PDF document.
|
65
|
+
|
66
|
+
:Example:
|
67
|
+
|
68
|
+
Extract text from a PDF file::
|
69
|
+
|
70
|
+
>>> from docp import PDFParser
|
71
|
+
|
72
|
+
>>> pdf = PDFParser(path='/path/to/myfile.pdf')
|
73
|
+
>>> pdf.extract_text()
|
74
|
+
|
75
|
+
# Access the content of page 1.
|
76
|
+
>>> pg1 = pdf.doc.pages[1].content
|
77
|
+
|
78
|
+
"""
|
79
|
+
|
80
|
+
def extract_text(self,
|
81
|
+
*,
|
82
|
+
remove_header: bool=False,
|
83
|
+
remove_footer: bool=False,
|
84
|
+
remove_newlines: bool=False,
|
85
|
+
ignore_tags: set=None,
|
86
|
+
convert_to_ascii: bool=True):
|
87
|
+
"""Extract text from the document.
|
88
|
+
|
89
|
+
If the PDF document contains 'marked content' tags, these tags
|
90
|
+
are used to extract the text as this is a more accurate approach
|
91
|
+
and respects the structure of the page(s). Otherwise, a bounding
|
92
|
+
box method is used to extract the text. If instructed, the
|
93
|
+
header and/or footer regions can be excluded.
|
94
|
+
|
95
|
+
.. tip:
|
96
|
+
If a tag-based extract is used, the header/footer should be
|
97
|
+
automatically excluded as these will often have an 'Artifact'
|
98
|
+
tag, which is excluded by default, by passing
|
99
|
+
``ignore_tags=None``.
|
100
|
+
|
101
|
+
To *keep* the header and footer, pass ``ignore_tags='na'``.
|
102
|
+
|
103
|
+
A list of pages, with extracted content can be accessed using
|
104
|
+
the :attr:`self.doc.pages` attribute.
|
105
|
+
|
106
|
+
Args:
|
107
|
+
remove_header (bool, optional): If True, the header is
|
108
|
+
cropped (skipped) from text extraction. This only applies
|
109
|
+
to the bounding box extraction method. Defaults to False.
|
110
|
+
remove_footer (bool, optional): If True, the footer is
|
111
|
+
cropped (skipped) from text extraction. This only applies
|
112
|
+
to the bounding box extraction method. Defaults to False.
|
113
|
+
remove_newlines (bool, optional): If True, the newline
|
114
|
+
characters are replaced with a space. Defaults to False.
|
115
|
+
ignore_tags (set, optional): If provided, these are the
|
116
|
+
PDF 'marked content' tags which will be ignored. Note
|
117
|
+
that the PDF document must contain tags, otherwise the
|
118
|
+
bounding box method is used and this argument is ignored.
|
119
|
+
Defaults to ``{'Artifact'}``, as these generally
|
120
|
+
relate to a header and/or footer. To include all tags,
|
121
|
+
(not skip any) pass this argument as ``'na'``.
|
122
|
+
convert_to_ascii (bool, optional): When a non-ASCII character
|
123
|
+
is found, an attempt is made to convert it to an
|
124
|
+
associated ASCII character. If a character cannot be
|
125
|
+
converted, it is replaced with a ``'?'``.
|
126
|
+
Defaults to True.
|
127
|
+
|
128
|
+
Returns:
|
129
|
+
None.
|
130
|
+
|
131
|
+
"""
|
132
|
+
# pylint: disable=unnecessary-dunder-call
|
133
|
+
if len(self.doc.pages) > 1:
|
134
|
+
# Reinitialise the doc object and reopen the document.
|
135
|
+
self.__init__(path=self._path)
|
136
|
+
# If tags are found, these are used for text extraction. If tags
|
137
|
+
# are not found, a bounding box is used to remove the header and
|
138
|
+
# footer, if instructed.
|
139
|
+
if self._uses_marked_content():
|
140
|
+
match ignore_tags:
|
141
|
+
case None: ignore_tags = {'Artifact'}
|
142
|
+
case 'na': ignore_tags = set()
|
143
|
+
# Involves more processing, but also more accurate.
|
144
|
+
self._extract_text_using_tags(ignore_tags=ignore_tags, remove_newlines=remove_newlines)
|
145
|
+
else:
|
146
|
+
bbox = self._get_crop_coordinates(skip_header=remove_header, skip_footer=remove_footer)
|
147
|
+
self._extract_text_using_bbox(bbox=bbox, remove_newlines=remove_newlines)
|
148
|
+
if convert_to_ascii:
|
149
|
+
for page in self.doc.pages:
|
150
|
+
page.content = unidecode(string=page.content,
|
151
|
+
errors='replace',
|
152
|
+
replace_str='?')
|
153
|
+
|
154
|
+
def _extract_text_using_bbox(self, **kwargs):
|
155
|
+
"""Extract text using a bbox for finding the header and footer.
|
156
|
+
|
157
|
+
:Keyword Arguments:
|
158
|
+
Those passed by the caller, :meth:`~extract_text`.
|
159
|
+
|
160
|
+
"""
|
161
|
+
for page in self.doc.parser.pages:
|
162
|
+
text = page.within_bbox(bbox=kwargs['bbox']).extract_text().strip()
|
163
|
+
if kwargs['remove_newlines']:
|
164
|
+
text = text.replace('\n', ' ')
|
165
|
+
self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
|
166
|
+
|
167
|
+
def _extract_text_using_tags(self, **kwargs):
|
168
|
+
"""Extract text using tags.
|
169
|
+
|
170
|
+
The tags defined by the ``ignore_tags`` are skipped.
|
171
|
+
|
172
|
+
:Keyword Arguments:
|
173
|
+
Those passed by the caller, :meth:`~extract_text`.
|
174
|
+
|
175
|
+
"""
|
176
|
+
# pylint: disable=protected-access
|
177
|
+
ignored = kwargs['ignore_tags']
|
178
|
+
self.doc._tags = True # Set the doc's 'parsed_using_tags' flag.
|
179
|
+
for page in self.doc.parser.pages:
|
180
|
+
text = ''.join(self._text_from_tags(page=page, ignored=ignored))
|
181
|
+
if kwargs['remove_newlines']:
|
182
|
+
text = text.replace('\n', ' ')
|
183
|
+
self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
|
184
|
+
|
185
|
+
@staticmethod
|
186
|
+
def _text_from_tags(page: pdfplumber.page.Page, ignored: set) -> str: # pylint: disable=undefined-variable # noqa
|
187
|
+
"""Generate a page of text extracted from tags.
|
188
|
+
|
189
|
+
When extracting text from tags, newlines are not encoded and must
|
190
|
+
be derived. For each character on the page, the top and bottom
|
191
|
+
coordinates are compared to determine when a newline should be
|
192
|
+
inserted. If both the top and bottom of the current character
|
193
|
+
are greater than the previous character, a newline is inserted
|
194
|
+
into the text stream.
|
195
|
+
|
196
|
+
Args:
|
197
|
+
page (pdfplumber.page.Page): Page to be parsed.
|
198
|
+
ignored (set): A set containing the tags to be ignored.
|
199
|
+
|
200
|
+
Yields:
|
201
|
+
str: Each character on the page, providing its tag is not to
|
202
|
+
be ignored. Or, a newline character if the current
|
203
|
+
character's coordinates are greater than (lower on the page)
|
204
|
+
than the previous character.
|
205
|
+
|
206
|
+
"""
|
207
|
+
if page.chars:
|
208
|
+
# Micro-optimisation: Push tag filtering down to the C-level.
|
209
|
+
chars = filter(lambda x: x['tag'] not in ignored, page.chars)
|
210
|
+
top, btm = 999, 999
|
211
|
+
for c in chars:
|
212
|
+
if top < c['top'] and btm < c['bottom']:
|
213
|
+
yield '\n'
|
214
|
+
yield c['text']
|
215
|
+
top, btm = c['top'], c['bottom']
|
216
|
+
yield ''
|
217
|
+
|
218
|
+
def _uses_marked_content(self) -> bool:
|
219
|
+
"""Test wether the document can be parsed using tags.
|
220
|
+
|
221
|
+
Marked content allows us to parse the PDF using tags (rather than
|
222
|
+
OCR) which is more accurate not only in terms of character
|
223
|
+
recognition, but also with regard to the structure of the text on
|
224
|
+
a page.
|
225
|
+
|
226
|
+
:Logic:
|
227
|
+
If the document's catalog shows ``Marked: True``, then
|
228
|
+
``True`` is returned immediately.
|
229
|
+
|
230
|
+
Otherwise, a second attempt is made which detects marked
|
231
|
+
content tags on the first three pages. If no tags are found,
|
232
|
+
a third attempt is made by searching the first 10 pages. If
|
233
|
+
tags are found during either of these attempts, ``True`` is
|
234
|
+
returned immediately.
|
235
|
+
|
236
|
+
Finally, if no marked content or tags were found, ``False``
|
237
|
+
is returned.
|
238
|
+
|
239
|
+
Returns:
|
240
|
+
bool: Returns True if the document can be parsed using marked
|
241
|
+
content tags, otherwise False.
|
242
|
+
|
243
|
+
"""
|
244
|
+
# Use pdfminer.six to get the document's catalog.
|
245
|
+
if self.doc.parser.doc.catalog.get('MarkInfo', {}).get('Marked', False):
|
246
|
+
return True
|
247
|
+
# Check only first three pages for tags first, if found, get out.
|
248
|
+
# If not, retry with the first 10 pages.
|
249
|
+
for i in [3, 10]:
|
250
|
+
tags = set(c['tag'] for p in self.doc.parser.pages[:i] for c in p.chars)
|
251
|
+
if tags != {None}:
|
252
|
+
return True
|
253
|
+
return False
|
@@ -0,0 +1,62 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
# -*- coding: utf-8 -*-
|
3
|
+
"""
|
4
|
+
:Purpose: This module serves as the public interface for interacting
|
5
|
+
with PDF files and parsing their contents.
|
6
|
+
|
7
|
+
:Platform: Linux/Windows | Python 3.10+
|
8
|
+
:Developer: J Berendt
|
9
|
+
:Email: development@s3dev.uk
|
10
|
+
|
11
|
+
:Comments: n/a
|
12
|
+
|
13
|
+
:Example: For example code usage, please refer to the
|
14
|
+
:class:`PDFParser` class docstring.
|
15
|
+
|
16
|
+
"""
|
17
|
+
# pylint: disable=import-error
|
18
|
+
# pylint: disable=wrong-import-position
|
19
|
+
|
20
|
+
# Set sys.path for relative imports.
|
21
|
+
import os
|
22
|
+
import sys
|
23
|
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
|
24
|
+
# locals
|
25
|
+
from parsers._pdftableparser import _PDFTableParser
|
26
|
+
from parsers._pdftextparser import _PDFTextParser
|
27
|
+
|
28
|
+
|
29
|
+
class PDFParser(_PDFTableParser, _PDFTextParser):
    """PDF document parser.

    This is the public interface class; all parsing functionality is
    inherited from the private table and text parser intermediate
    classes (and, through them, from ``_PDFBaseParser``).

    Args:
        path (str): Full path to the PDF document to be parsed.

    :Example:

        Extract text from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser(path='/path/to/myfile.pdf')
            >>> pdf.extract_text()

            # Access the content of page 1.
            >>> pg1 = pdf.doc.pages[1].content


        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser('/path/to/myfile.pdf')
            >>> pdf.extract_tables()

            # Access the first table on page 1.
            >>> tbl1 = pdf.doc.pages[1].tables[1]

    """

    def __init__(self, path: str):
        """PDF parser class initialiser."""
        # Delegate document opening/setup to the base class via the MRO
        # (_PDFTableParser -> _PDFTextParser -> _PDFBaseParser).
        super().__init__(path=path)
|