docp 0.1.0b1__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,273 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing tables from a PDF
5
+ document.
6
+
7
+ :Platform: Linux
8
+ :Developer: J Berendt
9
+ :Email: jeremy.berendt@rolls-royce.com
10
+
11
+ Note: This module is *not* designed to be interacted with
12
+ directly, only via the appropriate interface class(es).
13
+
14
+ Rather, please create an instance of a PDF document parsing
15
+ object using the following:
16
+
17
+ - :class:`pdfparser.PDFParser`
18
+
19
+
20
+ """
21
+ # pylint: disable=import-error
22
+ # pylint: disable=protected-access
23
+ # pylint: disable=wrong-import-order
24
+
25
+ import io
26
+ import os
27
+ import pandas as pd
28
+ import shutil
29
+ # locals
30
+ from parsers._pdfbaseparser import _PDFBaseParser
31
+
32
+ # TODO: Move to a config file/class. (TOML?)
33
+ _SETTINGS = {'vertical_strategy': 'lines',
34
+ 'horizontal_strategy':'lines',
35
+ 'snap_x_tolerance': 12}
36
+
37
+
38
+ # TODO: Revise the docstring.
39
class _PDFTableParser(_PDFBaseParser):
    """Private PDF document table parser intermediate class.

    Args:
        path (str): Full path to the PDF document.

    :Example:

        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> path = '/path/to/myfile.pdf'
            >>> pdf = PDFParser(path)
            >>> pdf.extract_tables()

            >>> tables = pdf.doc.tables

    """

    def extract_tables(self,
                       table_settings: dict=None,
                       as_dataframe: bool=False,
                       to_csv: bool=True,
                       verbose: bool=False):
        """Extract tables from the document.

        Before a table is extracted, a number of validation tests are
        performed to verify what has been identified as a 'table' is
        actually a table which might be useful to the user.

        Each 'valid' table is written as a CSV file on the user's
        desktop.

        Additionally, the extracted table data is stored to the class'
        :attr:`self.tables` attribute.

        Args:
            table_settings (dict, optional): Table settings to be used
                for the table extraction. Defaults to None, which is
                replaced by the value in the config.
            as_dataframe (bool, optional): By default, the extracted
                tables are returned as a list of (lists of lists), for
                example: all_tables[table[rows[data]]]. However, if this
                argument is ``True``, the table data is returned as a
                list of ``pandas.DataFrame`` objects. In this case, the
                first row of the table is used as the header, and all
                remaining rows are treated as data. **Note:** This will
                *not* work properly for all tables. Defaults to False.
            to_csv (bool, optional): Dump extracted table data to a CSV
                file, one per table. Defaults to True.
            verbose (bool, optional): Display how many tables were
                extracted, and the path to their location.

        """
        # pylint: disable=invalid-name
        # pylint: disable=too-many-nested-blocks
        # pylint: disable=unnecessary-dunder-call
        if self._doc.tables:
            # Reinitialise the doc object and reopen the document.
            self.__init__(path=self._path)
        c = 0
        if to_csv:
            self._create_table_directory_path()
        if table_settings is None:
            # Copy the module-level defaults so any downstream mutation
            # cannot corrupt _SETTINGS for subsequent calls.
            table_settings = dict(_SETTINGS)
        for p in self._doc._pdf.pages:
            tblno = 1
            tables = self._filter_tables(tables=p.find_tables(), threshold=5000)
            for table in tables:
                pc = p.crop(table.bbox)
                data = pc.extract_table(table_settings=table_settings)
                # extract_table() can return None if nothing could be
                # extracted from the cropped region; guard before the
                # row-level validation tests.
                if data and len(data) > 1 and all(len(row) > 1 for row in data):
                    # Verify no table rows are found in the most common rows (header/footer).
                    if not self._table_header_footer(table=data):
                        if not as_dataframe:
                            self._doc._tables.append(data)
                        if to_csv or as_dataframe:
                            buffer = self._to_buffer(data=data)
                            if to_csv:
                                c += self._to_csv(buffer=buffer,
                                                  pageno=p.page_number,
                                                  tableno=tblno)
                            if as_dataframe:
                                self._to_df(buffer=buffer)
                            buffer.close()
                # NOTE(review): the table counter advances for every
                # detected table on the page — TODO confirm numbering
                # should also count tables rejected by validation.
                tblno += 1
        if verbose and to_csv:
            print('',
                  'Complete.',
                  f'{c} tables were extracted and stored at the path below.',
                  f'Path: {self._tbl_opath}',
                  sep='\n')

    def _create_table_directory_path(self):
        """Create the output directory for table data.

        If the directory does not exist, it is created.

        """
        # Defined in parent class.
        # pylint: disable=attribute-defined-outside-init
        # Map spaces (32) and hyphens (45) to underscores in the
        # directory name derived from the source filename.
        trans = {32: '_', 45: '_'}
        path = os.path.join(os.environ['HOME'],
                            'Desktop',
                            'docutils',
                            'pdf_tables',
                            (os.path.splitext(os.path.basename(self._path))[0]
                             .lower()
                             .translate(trans)))
        self._tbl_opath = path
        # exist_ok removes the race between the existence test and the
        # directory creation.
        os.makedirs(path, exist_ok=True)

    def _create_table_file_path(self, pageno: int, tblno: int) -> str:
        """Create the filename for the table.

        Args:
            pageno (int): Page from which the table was extracted.
            tblno (int): Number of the table on the page, starting at 1.

        Returns:
            str: Explicit path to the file to be written.

        """
        path = os.path.join(self._tbl_opath,
                            f'pg{str(pageno).zfill(3)}_tb{str(tblno).zfill(3)}.csv')
        return path

    @staticmethod
    def _filter_tables(tables: list, threshold: int=5000) -> list:
        """Remove tables from the passed list which are deemed invalid.

        Args:
            tables (list): A list of tables as detected by the
                :meth:`Page.find_table()` method.
            threshold (int, optional): Minimum pixel area for a detected
                table to be returned. Defaults to 5000.

        :Rationale:
            An 'invalid' table is determined by the number of pixels
            which the table covered. Any table which is less than (N)
            pixels is likely a block of text which has been categorised
            as a 'table', but is not.

        Returns:
            list: A list of tables whose pixel area is greater than
            ``threshold``.

        """
        # pylint: disable=invalid-name
        t = []
        for table in tables:
            x0, y0, x1, y1 = table.bbox
            if (x1-x0) * (y1-y0) > threshold:
                t.append(table)
        return t

    def _table_header_footer(self, table: list[list]) -> bool:
        """Verify a table is not a header or footer.

        Args:
            table (list[list]): Table (a list of lists) to be analysed.

        :Rationale:
            A table is determined to be a header or footer if any of the
            lines contained in the 'common lines list' are found in the
            table.

            If any of these lines are found, the table is determined to
            be a header/footer, True is returned.

        Returns:
            bool: False if the table is *not* a header/footer, otherwise
            True.

        """
        lines = self._scan_common()  # Only re-runs if not already run.
        # r: row; c: cell; l: line
        return any(l in c for l in lines for r in table for c in r if c)

    def _to_buffer(self, data: list[list]) -> io.StringIO:
        """Write the table data into a string buffer.

        Args:
            data (list[list]): The table data as a list of lists to be
                written to a buffer.

        Returns:
            io.StringIO: A string buffer as an ``io.StringIO`` object.

        """
        b = io.StringIO()
        for row in data:
            line = self._prepare_row(row=row)
            b.write(line)
            b.write('\n')
        b.seek(0)
        return b

    def _to_csv(self, buffer: io.StringIO, pageno: int, tableno: int) -> int:
        """Write a table (from the buffer) to CSV.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.
            pageno (int): Page number from the ``Page`` object.
            tableno (int): Number of the table on the page, based at 1.

        Returns:
            int: 1 if the file was written, otherwise 0. This is used by
            the caller to track the number of CSV files written.

        """
        if buffer.seek(0, os.SEEK_END):  # Test buffer is populated.
            path = self._create_table_file_path(pageno=pageno, tblno=tableno)
            with open(path, 'w', encoding='utf-8') as f:
                buffer.seek(0)
                shutil.copyfileobj(buffer, f)
            return 1
        return 0

    def _to_df(self, buffer: io.StringIO):
        """Write a table (from the buffer) to a DataFrame.

        Once written, the DataFrame is appended to
        :attr:`self._doc._tables` list of tables.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.

        """
        if buffer.seek(0, os.SEEK_END):
            buffer.seek(0)
            self._doc._tables.append(pd.read_csv(buffer))
@@ -0,0 +1,253 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing text from a PDF
5
+ document.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ Note: This module is *not* designed to be interacted with
12
+ directly, only via the appropriate interface class(es).
13
+
14
+ Rather, please create an instance of a PDF document parsing
15
+ object using the following:
16
+
17
+ - :class:`pdfparser.PDFParser`
18
+
19
+ Note: **Multi-processing:**
20
+ Text extraction through multi-processing has been tested and
21
+ is not feasible due to an error indicating
22
+ the ``pdfplumber.page.Page`` object can not be pickled. This
23
+ object was being passed into the extraction method as the
24
+ object contains the :func:`extract_text` function.
25
+
26
+ Additionally, multi-threading has also been tested and
27
+ it was determined to be too complex and inefficient. This was
28
+ tested using the ``concurrent.futures.ThreadPoolExecutor``
29
+ class and two documents, 14 and 92 pages; the timings are
30
+ shown below. The multi-threaded approach took longer to
31
+ process and added unnecessary complexity to the code base.
32
+ As a side-effect, the pages are processed and stored out of
33
+ order which would require a re-order, adding more complexity.
34
+
35
+ It has therefore been determined that this module will remain
36
+ single-threaded.
37
+
38
+ **Multi-Thread Timings**
39
+
40
+ **Single-threaded:**
41
+
42
+ - 14 page document: ~2 seconds
43
+ - 92 page document: ~32 seconds
44
+
45
+ **Multi-threaded:**
46
+
47
+ - 14 page document: ~2 seconds
48
+ - 92 page document: ~35 seconds
49
+
50
+ """
51
+ # pylint: disable=import-error
52
+
53
+ from __future__ import annotations
54
+ from unidecode import unidecode
55
+ # locals
56
+ from objects._pageobject import PageObject
57
+ from parsers._pdfbaseparser import _PDFBaseParser
58
+
59
+
60
+ class _PDFTextParser(_PDFBaseParser):
61
+ """Private PDF document text parser intermediate class.
62
+
63
+ Args:
64
+ path (str): Full path to the PDF document.
65
+
66
+ :Example:
67
+
68
+ Extract text from a PDF file::
69
+
70
+ >>> from docp import PDFParser
71
+
72
+ >>> pdf = PDFParser(path='/path/to/myfile.pdf')
73
+ >>> pdf.extract_text()
74
+
75
+ # Access the content of page 1.
76
+ >>> pg1 = pdf.doc.pages[1].content
77
+
78
+ """
79
+
80
+ def extract_text(self,
81
+ *,
82
+ remove_header: bool=False,
83
+ remove_footer: bool=False,
84
+ remove_newlines: bool=False,
85
+ ignore_tags: set=None,
86
+ convert_to_ascii: bool=True):
87
+ """Extract text from the document.
88
+
89
+ If the PDF document contains 'marked content' tags, these tags
90
+ are used to extract the text as this is a more accurate approach
91
+ and respects the structure of the page(s). Otherwise, a bounding
92
+ box method is used to extract the text. If instructed, the
93
+ header and/or footer regions can be excluded.
94
+
95
+ .. tip:
96
+ If a tag-based extract is used, the header/footer should be
97
+ automatically excluded as these will often have an 'Artifact'
98
+ tag, which is excluded by default, by passing
99
+ ``ignore_tags=None``.
100
+
101
+ To *keep* the header and footer, pass ``ignore_tags='na'``.
102
+
103
+ A list of pages, with extracted content can be accessed using
104
+ the :attr:`self.doc.pages` attribute.
105
+
106
+ Args:
107
+ remove_header (bool, optional): If True, the header is
108
+ cropped (skipped) from text extraction. This only applies
109
+ to the bounding box extraction method. Defaults to False.
110
+ remove_footer (bool, optional): If True, the footer is
111
+ cropped (skipped) from text extraction. This only applies
112
+ to the bounding box extraction method. Defaults to False.
113
+ remove_newlines (bool, optional): If True, the newline
114
+ characters are replaced with a space. Defaults to False.
115
+ ignore_tags (set, optional): If provided, these are the
116
+ PDF 'marked content' tags which will be ignored. Note
117
+ that the PDF document must contain tags, otherwise the
118
+ bounding box method is used and this argument is ignored.
119
+ Defaults to ``{'Artifact'}``, as these generally
120
+ relate to a header and/or footer. To include all tags,
121
+ (not skip any) pass this argument as ``'na'``.
122
+ convert_to_ascii (bool, optional): When a non-ASCII character
123
+ is found, an attempt is made to convert it to an
124
+ associated ASCII character. If a character cannot be
125
+ converted, it is replaced with a ``'?'``.
126
+ Defaults to True.
127
+
128
+ Returns:
129
+ None.
130
+
131
+ """
132
+ # pylint: disable=unnecessary-dunder-call
133
+ if len(self.doc.pages) > 1:
134
+ # Reinitialise the doc object and reopen the document.
135
+ self.__init__(path=self._path)
136
+ # If tags are found, these are used for text extraction. If tags
137
+ # are not found, a bounding box is used to remove the header and
138
+ # footer, if instructed.
139
+ if self._uses_marked_content():
140
+ match ignore_tags:
141
+ case None: ignore_tags = {'Artifact'}
142
+ case 'na': ignore_tags = set()
143
+ # Involves more processing, but also more accurate.
144
+ self._extract_text_using_tags(ignore_tags=ignore_tags, remove_newlines=remove_newlines)
145
+ else:
146
+ bbox = self._get_crop_coordinates(skip_header=remove_header, skip_footer=remove_footer)
147
+ self._extract_text_using_bbox(bbox=bbox, remove_newlines=remove_newlines)
148
+ if convert_to_ascii:
149
+ for page in self.doc.pages:
150
+ page.content = unidecode(string=page.content,
151
+ errors='replace',
152
+ replace_str='?')
153
+
154
+ def _extract_text_using_bbox(self, **kwargs):
155
+ """Extract text using a bbox for finding the header and footer.
156
+
157
+ :Keyword Arguments:
158
+ Those passed by the caller, :meth:`~extract_text`.
159
+
160
+ """
161
+ for page in self.doc.parser.pages:
162
+ text = page.within_bbox(bbox=kwargs['bbox']).extract_text().strip()
163
+ if kwargs['remove_newlines']:
164
+ text = text.replace('\n', ' ')
165
+ self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
166
+
167
+ def _extract_text_using_tags(self, **kwargs):
168
+ """Extract text using tags.
169
+
170
+ The tags defined by the ``ignore_tags`` are skipped.
171
+
172
+ :Keyword Arguments:
173
+ Those passed by the caller, :meth:`~extract_text`.
174
+
175
+ """
176
+ # pylint: disable=protected-access
177
+ ignored = kwargs['ignore_tags']
178
+ self.doc._tags = True # Set the doc's 'parsed_using_tags' flag.
179
+ for page in self.doc.parser.pages:
180
+ text = ''.join(self._text_from_tags(page=page, ignored=ignored))
181
+ if kwargs['remove_newlines']:
182
+ text = text.replace('\n', ' ')
183
+ self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
184
+
185
+ @staticmethod
186
+ def _text_from_tags(page: pdfplumber.page.Page, ignored: set) -> str: # pylint: disable=undefined-variable # noqa
187
+ """Generate a page of text extracted from tags.
188
+
189
+ When extracting text from tags, newlines are not encoded and must
190
+ be derived. For each character on the page, the top and bottom
191
+ coordinates are compared to determine when a newline should be
192
+ inserted. If both the top and bottom of the current character
193
+ are greater than the previous character, a newline is inserted
194
+ into the text stream.
195
+
196
+ Args:
197
+ page (pdfplumber.page.Page): Page to be parsed.
198
+ ignored (set): A set containing the tags to be ignored.
199
+
200
+ Yields:
201
+ str: Each character on the page, providing its tag is not to
202
+ be ignored. Or, a newline character if the current
203
+ character's coordinates are greater than (lower on the page)
204
+ than the previous character.
205
+
206
+ """
207
+ if page.chars:
208
+ # Micro-optimisation: Push tag filtering down to the C-level.
209
+ chars = filter(lambda x: x['tag'] not in ignored, page.chars)
210
+ top, btm = 999, 999
211
+ for c in chars:
212
+ if top < c['top'] and btm < c['bottom']:
213
+ yield '\n'
214
+ yield c['text']
215
+ top, btm = c['top'], c['bottom']
216
+ yield ''
217
+
218
+ def _uses_marked_content(self) -> bool:
219
+ """Test wether the document can be parsed using tags.
220
+
221
+ Marked content allows us to parse the PDF using tags (rather than
222
+ OCR) which is more accurate not only in terms of character
223
+ recognition, but also with regard to the structure of the text on
224
+ a page.
225
+
226
+ :Logic:
227
+ If the document's catalog shows ``Marked: True``, then
228
+ ``True`` is returned immediately.
229
+
230
+ Otherwise, a second attempt is made which detects marked
231
+ content tags on the first three pages. If no tags are found,
232
+ a third attempt is made by searching the first 10 pages. If
233
+ tags are found during either of these attempts, ``True`` is
234
+ returned immediately.
235
+
236
+ Finally, if no marked content or tags were found, ``False``
237
+ is returned.
238
+
239
+ Returns:
240
+ bool: Returns True if the document can be parsed using marked
241
+ content tags, otherwise False.
242
+
243
+ """
244
+ # Use pdfminer.six to get the document's catalog.
245
+ if self.doc.parser.doc.catalog.get('MarkInfo', {}).get('Marked', False):
246
+ return True
247
+ # Check only first three pages for tags first, if found, get out.
248
+ # If not, retry with the first 10 pages.
249
+ for i in [3, 10]:
250
+ tags = set(c['tag'] for p in self.doc.parser.pages[:i] for c in p.chars)
251
+ if tags != {None}:
252
+ return True
253
+ return False
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module serves as the public interface for interacting
5
+ with PDF files and parsing their contents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Example: For example code usage, please refer to the
14
+ :class:`PDFParser` class docstring.
15
+
16
+ """
17
+ # pylint: disable=import-error
18
+ # pylint: disable=wrong-import-position
19
+
20
+ # Set sys.path for relative imports.
21
+ import os
22
+ import sys
23
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
24
+ # locals
25
+ from parsers._pdftableparser import _PDFTableParser
26
+ from parsers._pdftextparser import _PDFTextParser
27
+
28
+
29
class PDFParser(_PDFTableParser, _PDFTextParser):
    """Public PDF document parser.

    This class combines the private table- and text-parsing
    intermediate classes into the single user-facing interface.

    Args:
        path (str): Full path to the PDF document to be parsed.

    :Example:

        Extract text from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser(path='/path/to/myfile.pdf')
            >>> pdf.extract_text()

            # Access the content of page 1.
            >>> pg1 = pdf.doc.pages[1].content


        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser('/path/to/myfile.pdf')
            >>> pdf.extract_tables()

            # Access the first table on page 1.
            >>> tbl1 = pdf.doc.pages[1].tables[1]

    """

    def __init__(self, path: str):
        """PDF parser class initialiser."""
        # All setup is delegated to the parser base hierarchy.
        super().__init__(path=path)