docp 0.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,273 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing tables from a PDF
5
+ document.
6
+
7
+ :Platform: Linux
8
+ :Developer: J Berendt
9
+ :Email: jeremy.berendt@rolls-royce.com
10
+
11
+ Note: This module is *not* designed to be interacted with
12
+ directly, only via the appropriate interface class(es).
13
+
14
+ Rather, please create an instance of a PDF document parsing
15
+ object using the following:
16
+
17
+ - :class:`pdfparser.PDFParser`
18
+
19
+
20
+ """
21
+ # pylint: disable=import-error
22
+ # pylint: disable=protected-access
23
+ # pylint: disable=wrong-import-order
24
+
25
+ import io
26
+ import os
27
+ import pandas as pd
28
+ import shutil
29
+ # locals
30
+ from parsers._pdfbaseparser import _PDFBaseParser
31
+
32
# TODO: Move to a config file/class. (TOML?)
# Default pdfplumber table-extraction settings: detect both table axes
# from ruled lines, snapping near-aligned vertical lines within 12px.
_SETTINGS = {'vertical_strategy': 'lines',
             'horizontal_strategy':'lines',
             'snap_x_tolerance': 12}
36
+
37
+
38
+ # TODO: Revise the docstring.
39
class _PDFTableParser(_PDFBaseParser):
    """Private PDF document table parser intermediate class.

    Args:
        path (str): Full path to the PDF document.

    :Example:

        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> path = '/path/to/myfile.pdf'
            >>> pdf = PDFParser(path)
            >>> pdf.extract_tables()

            >>> tables = pdf.doc.tables

    """

    def extract_tables(self,
                       table_settings: dict = None,
                       as_dataframe: bool = False,
                       to_csv: bool = True,
                       verbose: bool = False):
        """Extract tables from the document.

        Before a table is extracted, a number of validation tests are
        performed to verify what has been identified as a 'table' is
        actually a table which might be useful to the user.

        Each 'valid' table is written as a CSV file on the user's
        desktop.

        Additionally, the extracted table data is stored to the class'
        :attr:`self.tables` attribute.

        Args:
            table_settings (dict, optional): Table settings to be used
                for the table extraction. Defaults to None, which is
                replaced by the module-level ``_SETTINGS`` value.
            as_dataframe (bool, optional): By default, the extracted
                tables are returned as a list of (lists of lists), for
                example: all_tables[table[rows[data]]]. However, if this
                argument is ``True``, the table data is returned as a
                list of ``pandas.DataFrame`` objects. In this case, the
                first row of the table is used as the header, and all
                remaining rows are treated as data. **Note:** This will
                *not* work properly for all tables. Defaults to False.
            to_csv (bool, optional): Dump extracted table data to a CSV
                file, one per table. Defaults to True.
            verbose (bool, optional): Display how many tables were
                extracted, and the path to their location.

        """
        # pylint: disable=invalid-name
        # pylint: disable=too-many-nested-blocks
        # pylint: disable=unnecessary-dunder-call
        if self._doc.tables:
            # Reinitialise the doc object and reopen the document so a
            # repeated call does not duplicate previously stored tables.
            self.__init__(path=self._path)
        c = 0
        if to_csv:
            self._create_table_directory_path()
        if table_settings is None:
            table_settings = _SETTINGS
        for p in self._doc._pdf.pages:
            tblno = 1
            # Drop 'tables' which are too small to be real tables.
            tables = self._filter_tables(tables=p.find_tables(), threshold=5000)
            for table in tables:
                pc = p.crop(table.bbox)
                data = pc.extract_table(table_settings=table_settings)
                # A useful table has more than one row, and more than
                # one column in every row.
                if all(len(row) > 1 for row in data) and len(data) > 1:
                    # Verify no table rows are found in the most common
                    # rows (header/footer).
                    if not self._table_header_footer(table=data):
                        if not as_dataframe:
                            self._doc._tables.append(data)
                        if to_csv or as_dataframe:
                            buffer = self._to_buffer(data=data)
                            if to_csv:
                                c += self._to_csv(buffer=buffer,
                                                  pageno=p.page_number,
                                                  tableno=tblno)
                            if as_dataframe:
                                self._to_df(buffer=buffer)
                            buffer.close()
                tblno += 1
        if verbose and to_csv:
            print('',
                  'Complete.',
                  f'{c} tables were extracted and stored at the path below.',
                  f'Path: {self._tbl_opath}',
                  sep='\n')

    def _create_table_directory_path(self):
        """Create the output directory for table data.

        The directory path is derived from the document's basename,
        lower-cased, with spaces and hyphens replaced by underscores,
        and rooted under ``~/Desktop/docutils/pdf_tables/``. The path is
        stored to :attr:`self._tbl_opath`. If the directory does not
        exist, it is created.

        """
        # Defined in parent class.
        # pylint: disable=attribute-defined-outside-init
        # Translate ' ' (32) and '-' (45) to '_' for a filesystem-friendly name.
        trans = {32: '_', 45: '_'}
        name = (os.path.splitext(os.path.basename(self._path))[0]
                .lower()
                .translate(trans))
        # NOTE(review): assumes a POSIX-style $HOME with a Desktop dir,
        # per the module's 'Linux' platform statement.
        path = os.path.join(os.environ['HOME'],
                            'Desktop',
                            'docutils',
                            'pdf_tables',
                            name)
        self._tbl_opath = path
        # exist_ok avoids the race between an exists() test and creation.
        os.makedirs(path, exist_ok=True)

    def _create_table_file_path(self, pageno: int, tblno: int) -> str:
        """Create the filename for the table.

        Args:
            pageno (int): Page from which the table was extracted.
            tblno (int): Number of the table on the page, starting at 1.

        Returns:
            str: Explicit path to the file to be written, of the form
            ``<output_dir>/pgNNN_tbNNN.csv``.

        """
        path = os.path.join(self._tbl_opath,
                            f'pg{pageno:03d}_tb{tblno:03d}.csv')
        return path

    @staticmethod
    def _filter_tables(tables: list, threshold: int = 5000) -> list:
        """Remove tables from the passed list which are deemed invalid.

        Args:
            tables (list): A list of tables as detected by the
                :meth:`Page.find_table()` method.
            threshold (int, optional): Minimum pixel area for a detected
                table to be returned. Defaults to 5000.

        :Rationale:
            An 'invalid' table is determined by the number of pixels
            which the table covered. Any table which is less than (N)
            pixels is likely a block of text which has been categorised
            as a 'table', but is not.

        Returns:
            list: A list of tables whose pixel area is greater than
            ``threshold``.

        """
        # bbox is (x0, y0, x1, y1); keep tables whose area exceeds the threshold.
        return [table
                for table in tables
                if ((table.bbox[2] - table.bbox[0])
                    * (table.bbox[3] - table.bbox[1])) > threshold]

    def _table_header_footer(self, table: list[list]) -> bool:
        """Verify a table is not a header or footer.

        Args:
            table (list[list]): Table (a list of lists) to be analysed.

        :Rationale:
            A table is determined to be a header or footer if any of the
            lines contained in the 'common lines list' are found in the
            table.

            If any of these lines are found, the table is determined to
            be a header/footer, True is returned.

        Returns:
            bool: False if the table is *not* a header/footer, otherwise
            True.

        """
        lines = self._scan_common()  # Only re-runs if not already run.
        # r: row; c: cell; l: line
        return any(l in c for l in lines for r in table for c in r if c)

    def _to_buffer(self, data: list[list]) -> io.StringIO:
        """Write the table data into a string buffer.

        Args:
            data (list[list]): The table data as a list of lists to be
                written to a buffer.

        Returns:
            io.StringIO: A string buffer as an ``io.StringIO`` object,
            rewound to position 0, ready for reading.

        """
        b = io.StringIO()
        for row in data:
            line = self._prepare_row(row=row)
            b.write(line)
            b.write('\n')
        b.seek(0)
        return b

    def _to_csv(self, buffer: io.StringIO, pageno: int, tableno: int) -> int:
        """Write a table (from the buffer) to CSV.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.
            pageno (int): Page number from the ``Page`` object.
            tableno (int): Number of the table on the page, based at 1.

        Returns:
            int: 1 if the file was written, otherwise 0. This is used by
            the caller to track the number of CSV files written.

        """
        # seek() returns the new absolute position; a populated buffer
        # therefore yields a truthy (non-zero) end offset.
        if not buffer.seek(0, os.SEEK_END):
            return 0
        path = self._create_table_file_path(pageno=pageno, tblno=tableno)
        with open(path, 'w', encoding='utf-8') as f:
            buffer.seek(0)
            shutil.copyfileobj(buffer, f)
        return 1

    def _to_df(self, buffer: io.StringIO):
        """Write a table (from the buffer) to a DataFrame.

        Once written, the DataFrame is appended to
        :attr:`self._doc._tables` list of tables.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.

        """
        # Only parse a populated buffer (see note in _to_csv).
        if buffer.seek(0, os.SEEK_END):
            buffer.seek(0)
            self._doc._tables.append(pd.read_csv(buffer))
@@ -0,0 +1,253 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing text from a PDF
5
+ document.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ Note: This module is *not* designed to be interacted with
12
+ directly, only via the appropriate interface class(es).
13
+
14
+ Rather, please create an instance of a PDF document parsing
15
+ object using the following:
16
+
17
+ - :class:`pdfparser.PDFParser`
18
+
19
+ Note: **Multi-processing:**
20
+ Text extraction through multi-processing has been tested and
21
+ is not feasible due to an error indicating
22
+ the ``pdfplumber.page.Page`` object can not be pickled. This
23
+ object was being passed into the extraction method as the
24
+ object contains the :func:`extract_text` function.
25
+
26
+ Additionally, multi-threading has also been tested and
27
+ it was determined to be too complex and inefficient. This was
28
+ tested using the ``concurrent.futures.ThreadPoolExecutor``
29
+ class and two documents, 14 and 92 pages; the timings are
30
+ shown below. The multi-threaded approach took longer to
31
+ process and added unnecessary complexity to the code base.
32
+ As a side-effect, the pages are processed and stored out of
33
+ order which would require a re-order, adding more complexity.
34
+
35
+ It has therefore been determined that this module will remain
36
+ single-threaded.
37
+
38
+ **Multi-Thread Timings**
39
+
40
+ **Single-threaded:**
41
+
42
+ - 14 page document: ~2 seconds
43
+ - 92 page document: ~32 seconds
44
+
45
+ **Multi-threaded:**
46
+
47
+ - 14 page document: ~2 seconds
48
+ - 92 page document: ~35 seconds
49
+
50
+ """
51
+ # pylint: disable=import-error
52
+
53
+ from __future__ import annotations
54
+ from unidecode import unidecode
55
+ # locals
56
+ from objects._pageobject import PageObject
57
+ from parsers._pdfbaseparser import _PDFBaseParser
58
+
59
+
60
+ class _PDFTextParser(_PDFBaseParser):
61
+ """Private PDF document text parser intermediate class.
62
+
63
+ Args:
64
+ path (str): Full path to the PDF document.
65
+
66
+ :Example:
67
+
68
+ Extract text from a PDF file::
69
+
70
+ >>> from docp import PDFParser
71
+
72
+ >>> pdf = PDFParser(path='/path/to/myfile.pdf')
73
+ >>> pdf.extract_text()
74
+
75
+ # Access the content of page 1.
76
+ >>> pg1 = pdf.doc.pages[1].content
77
+
78
+ """
79
+
80
+ def extract_text(self,
81
+ *,
82
+ remove_header: bool=False,
83
+ remove_footer: bool=False,
84
+ remove_newlines: bool=False,
85
+ ignore_tags: set=None,
86
+ convert_to_ascii: bool=True):
87
+ """Extract text from the document.
88
+
89
+ If the PDF document contains 'marked content' tags, these tags
90
+ are used to extract the text as this is a more accurate approach
91
+ and respects the structure of the page(s). Otherwise, a bounding
92
+ box method is used to extract the text. If instructed, the
93
+ header and/or footer regions can be excluded.
94
+
95
+ .. tip:
96
+ If a tag-based extract is used, the header/footer should be
97
+ automatically excluded as these will often have an 'Artifact'
98
+ tag, which is excluded by default, by passing
99
+ ``ignore_tags=None``.
100
+
101
+ To *keep* the header and footer, pass ``ignore_tags='na'``.
102
+
103
+ A list of pages, with extracted content can be accessed using
104
+ the :attr:`self.doc.pages` attribute.
105
+
106
+ Args:
107
+ remove_header (bool, optional): If True, the header is
108
+ cropped (skipped) from text extraction. This only applies
109
+ to the bounding box extraction method. Defaults to False.
110
+ remove_footer (bool, optional): If True, the footer is
111
+ cropped (skipped) from text extraction. This only applies
112
+ to the bounding box extraction method. Defaults to False.
113
+ remove_newlines (bool, optional): If True, the newline
114
+ characters are replaced with a space. Defaults to False.
115
+ ignore_tags (set, optional): If provided, these are the
116
+ PDF 'marked content' tags which will be ignored. Note
117
+ that the PDF document must contain tags, otherwise the
118
+ bounding box method is used and this argument is ignored.
119
+ Defaults to ``{'Artifact'}``, as these generally
120
+ relate to a header and/or footer. To include all tags,
121
+ (not skip any) pass this argument as ``'na'``.
122
+ convert_to_ascii (bool, optional): When a non-ASCII character
123
+ is found, an attempt is made to convert it to an
124
+ associated ASCII character. If a character cannot be
125
+ converted, it is replaced with a ``'?'``.
126
+ Defaults to True.
127
+
128
+ Returns:
129
+ None.
130
+
131
+ """
132
+ # pylint: disable=unnecessary-dunder-call
133
+ if len(self.doc.pages) > 1:
134
+ # Reinitialise the doc object and reopen the document.
135
+ self.__init__(path=self._path)
136
+ # If tags are found, these are used for text extraction. If tags
137
+ # are not found, a bounding box is used to remove the header and
138
+ # footer, if instructed.
139
+ if self._uses_marked_content():
140
+ match ignore_tags:
141
+ case None: ignore_tags = {'Artifact'}
142
+ case 'na': ignore_tags = set()
143
+ # Involves more processing, but also more accurate.
144
+ self._extract_text_using_tags(ignore_tags=ignore_tags, remove_newlines=remove_newlines)
145
+ else:
146
+ bbox = self._get_crop_coordinates(skip_header=remove_header, skip_footer=remove_footer)
147
+ self._extract_text_using_bbox(bbox=bbox, remove_newlines=remove_newlines)
148
+ if convert_to_ascii:
149
+ for page in self.doc.pages:
150
+ page.content = unidecode(string=page.content,
151
+ errors='replace',
152
+ replace_str='?')
153
+
154
+ def _extract_text_using_bbox(self, **kwargs):
155
+ """Extract text using a bbox for finding the header and footer.
156
+
157
+ :Keyword Arguments:
158
+ Those passed by the caller, :meth:`~extract_text`.
159
+
160
+ """
161
+ for page in self.doc.parser.pages:
162
+ text = page.within_bbox(bbox=kwargs['bbox']).extract_text().strip()
163
+ if kwargs['remove_newlines']:
164
+ text = text.replace('\n', ' ')
165
+ self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
166
+
167
+ def _extract_text_using_tags(self, **kwargs):
168
+ """Extract text using tags.
169
+
170
+ The tags defined by the ``ignore_tags`` are skipped.
171
+
172
+ :Keyword Arguments:
173
+ Those passed by the caller, :meth:`~extract_text`.
174
+
175
+ """
176
+ # pylint: disable=protected-access
177
+ ignored = kwargs['ignore_tags']
178
+ self.doc._tags = True # Set the doc's 'parsed_using_tags' flag.
179
+ for page in self.doc.parser.pages:
180
+ text = ''.join(self._text_from_tags(page=page, ignored=ignored))
181
+ if kwargs['remove_newlines']:
182
+ text = text.replace('\n', ' ')
183
+ self.doc.pages.append(PageObject(content=text, pageno=page.page_number, parser=page))
184
+
185
+ @staticmethod
186
+ def _text_from_tags(page: pdfplumber.page.Page, ignored: set) -> str: # pylint: disable=undefined-variable # noqa
187
+ """Generate a page of text extracted from tags.
188
+
189
+ When extracting text from tags, newlines are not encoded and must
190
+ be derived. For each character on the page, the top and bottom
191
+ coordinates are compared to determine when a newline should be
192
+ inserted. If both the top and bottom of the current character
193
+ are greater than the previous character, a newline is inserted
194
+ into the text stream.
195
+
196
+ Args:
197
+ page (pdfplumber.page.Page): Page to be parsed.
198
+ ignored (set): A set containing the tags to be ignored.
199
+
200
+ Yields:
201
+ str: Each character on the page, providing its tag is not to
202
+ be ignored. Or, a newline character if the current
203
+ character's coordinates are greater than (lower on the page)
204
+ than the previous character.
205
+
206
+ """
207
+ if page.chars:
208
+ # Micro-optimisation: Push tag filtering down to the C-level.
209
+ chars = filter(lambda x: x['tag'] not in ignored, page.chars)
210
+ top, btm = 999, 999
211
+ for c in chars:
212
+ if top < c['top'] and btm < c['bottom']:
213
+ yield '\n'
214
+ yield c['text']
215
+ top, btm = c['top'], c['bottom']
216
+ yield ''
217
+
218
+ def _uses_marked_content(self) -> bool:
219
+ """Test wether the document can be parsed using tags.
220
+
221
+ Marked content allows us to parse the PDF using tags (rather than
222
+ OCR) which is more accurate not only in terms of character
223
+ recognition, but also with regard to the structure of the text on
224
+ a page.
225
+
226
+ :Logic:
227
+ If the document's catalog shows ``Marked: True``, then
228
+ ``True`` is returned immediately.
229
+
230
+ Otherwise, a second attempt is made which detects marked
231
+ content tags on the first three pages. If no tags are found,
232
+ a third attempt is made by searching the first 10 pages. If
233
+ tags are found during either of these attempts, ``True`` is
234
+ returned immediately.
235
+
236
+ Finally, if no marked content or tags were found, ``False``
237
+ is returned.
238
+
239
+ Returns:
240
+ bool: Returns True if the document can be parsed using marked
241
+ content tags, otherwise False.
242
+
243
+ """
244
+ # Use pdfminer.six to get the document's catalog.
245
+ if self.doc.parser.doc.catalog.get('MarkInfo', {}).get('Marked', False):
246
+ return True
247
+ # Check only first three pages for tags first, if found, get out.
248
+ # If not, retry with the first 10 pages.
249
+ for i in [3, 10]:
250
+ tags = set(c['tag'] for p in self.doc.parser.pages[:i] for c in p.chars)
251
+ if tags != {None}:
252
+ return True
253
+ return False
@@ -0,0 +1,62 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module serves as the public interface for interacting
5
+ with PDF files and parsing their contents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ :Example: For example code usage, please refer to the
14
+ :class:`PDFParser` class docstring.
15
+
16
+ """
17
+ # pylint: disable=import-error
18
+ # pylint: disable=wrong-import-position
19
+
20
+ # Set sys.path for relative imports.
21
+ import os
22
+ import sys
23
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.realpath(__file__))))
24
+ # locals
25
+ from parsers._pdftableparser import _PDFTableParser
26
+ from parsers._pdftextparser import _PDFTextParser
27
+
28
+
29
class PDFParser(_PDFTableParser, _PDFTextParser):
    """PDF document parser.

    This is the public interface class: it combines the table and text
    extraction functionality provided by its (private) parent classes,
    :class:`_PDFTableParser` and :class:`_PDFTextParser`.

    Args:
        path (str): Full path to the PDF document to be parsed.

    :Example:

        Extract text from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser(path='/path/to/myfile.pdf')
            >>> pdf.extract_text()

            # Access the content of page 1.
            >>> pg1 = pdf.doc.pages[1].content


        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser('/path/to/myfile.pdf')
            >>> pdf.extract_tables()

            # Access the first table on page 1.
            >>> tbl1 = pdf.doc.pages[1].tables[1]

    """

    def __init__(self, path: str):
        """PDF parser class initialiser."""
        # No local state: all setup (opening the document, etc.) is
        # delegated up the MRO to the shared base parser class.
        super().__init__(path=path)