docp 0.0.0.dev1__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  2. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  3. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  4. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/8cfec92309f5626a223304af2423e332f6d31887.lock +0 -0
  5. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  6. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  7. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  8. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  9. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  10. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  11. docp/.cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  12. docp/__init__.py +35 -6
  13. docp/dbs/__init__.py +0 -0
  14. docp/dbs/chroma.py +197 -0
  15. docp/libs/_version.py +1 -0
  16. docp/libs/changelog.py +7 -0
  17. docp/libs/utilities.py +107 -0
  18. docp/loaders/__init__.py +38 -0
  19. docp/loaders/_chromabaseloader.py +338 -0
  20. docp/loaders/_chromabaseloader.py.bak +378 -0
  21. docp/loaders/_chromabasepdfloader.py +121 -0
  22. docp/loaders/_chromabasepptxloader.py +123 -0
  23. docp/loaders/chroma.py.bak +196 -0
  24. docp/loaders/chromapdfloader.py +199 -0
  25. docp/loaders/chromapptxloader.py +192 -0
  26. docp/loaders/lutilities.py +52 -0
  27. docp/objects/__init__.py +0 -0
  28. docp/objects/_docbaseobject.py +65 -0
  29. docp/objects/_imgobject.py +0 -0
  30. docp/objects/_pageobject.py +127 -0
  31. docp/objects/_slideobject.py +110 -0
  32. docp/objects/_tableobject.py +0 -0
  33. docp/objects/_textobject.py +64 -0
  34. docp/objects/pdfobject.py +61 -0
  35. docp/objects/pptxobject.py +46 -0
  36. docp/parsers/__init__.py +0 -0
  37. docp/parsers/_pdfbaseparser.py +236 -0
  38. docp/parsers/_pdftableparser.py +272 -0
  39. docp/parsers/_pdftextparser.py +263 -0
  40. docp/parsers/_pptxbaseparser.py +93 -0
  41. docp/parsers/_pptxtextparser.py +115 -0
  42. docp/parsers/pdfparser.py +62 -0
  43. docp/parsers/pptxparser.py +51 -0
  44. docp/parsers/putilities.py +48 -0
  45. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/LICENSE +622 -622
  46. docp-0.2.0.dist-info/METADATA +110 -0
  47. docp-0.2.0.dist-info/RECORD +49 -0
  48. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/WHEEL +1 -1
  49. docp/_version.py +0 -1
  50. docp-0.0.0.dev1.dist-info/METADATA +0 -55
  51. docp-0.0.0.dev1.dist-info/RECORD +0 -7
  52. {docp-0.0.0.dev1.dist-info → docp-0.2.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,236 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides generalised base functionality for
5
+ parsing PDF documents.
6
+
7
+ :Platform: Linux/Windows | Python 3.10+
8
+ :Developer: J Berendt
9
+ :Email: development@s3dev.uk
10
+
11
+ :Comments: n/a
12
+
13
+ .. attention::
14
+
15
+ This module is *not* designed to be interacted with
16
+ directly, only via the appropriate interface class(es).
17
+
18
+ Rather, please create an instance of a PDF document parsing
19
+ object using the following class:
20
+
21
+ - :class:`~docp.parsers.pdfparser.PDFParser`
22
+
23
+ """
24
+ # pylint: disable=import-error
25
+ # pylint: disable=protected-access
26
+ # pylint: disable=wrong-import-order
27
+
28
+ import os
29
+ import pdfplumber
30
+ from collections import Counter
31
+ from unidecode import unidecode
32
+ # locals
33
+ try:
34
+ from .libs.utilities import utilities
35
+ from .objects.pdfobject import DocPDF
36
+ except ImportError:
37
+ from libs.utilities import utilities
38
+ from objects.pdfobject import DocPDF
39
+
40
+
41
+ class _PDFBaseParser:
42
+ """Base class containing generalised PDF parsing functionality."""
43
+
44
+ def __init__(self, path: str):
45
+ """Private base parser class initialiser.
46
+
47
+ Args:
48
+ path (str): Full path to the document to be parsed.
49
+
50
+ """
51
+ self._path = path
52
+ self._doc = DocPDF()
53
+ self._tbl_opath = None
54
+ self._set_paths()
55
+ self._open()
56
+
57
+ def __del__(self):
58
+ """Class deconstructor.
59
+
60
+ :Tasks:
61
+ - Ensure the PDF document is closed.
62
+
63
+ """
64
+ if hasattr(self._doc, '_parser'):
65
+ self._doc._parser.close()
66
+
67
+ @property
68
+ def doc(self) -> DocPDF:
69
+ """Accessor to the document object."""
70
+ return self._doc
71
+
72
+ def _get_crop_coordinates(self,
73
+ skip_header: bool=False,
74
+ skip_footer: bool=False) -> tuple[float]:
75
+ """Determine the bounding box coordinates.
76
+
77
+ These coordinates are used for removing the header and/or footer.
78
+
79
+ Args:
80
+ skip_header (bool, optional): If True, set the coordinates
81
+ such that the header is skipped. Defaults to False.
82
+ skip_footer (bool, optional): If True, set the coordinates
83
+ such that the footer is skipped. Defaults to False.
84
+
85
+ :Logic:
86
+ When excluding a header and/or footer, the following page
87
+ numbers are used for header/footer *position* detection,
88
+ given the length of the document:
89
+
90
+ - Number of pages [1]: 1
91
+ - Number of pages [2,10]: 2
92
+ - Number of pages [11,]: 5
93
+
94
+ Returns:
95
+ tuple: A bounding box tuple of the following form, to be
96
+ passed directly into the :func:`Page.crop` method::
97
+
98
+ (x0, top, x1, bottom)
99
+
100
+ """
101
+ npages = self._doc.npages
102
+ match npages:
103
+ case 1: num = 1
104
+ case _ if npages in range(2, 11): num = 2
105
+ case _: num = 5
106
+ pg = self._doc.parser.pages[num - 1] # The parser does not have a page offset at [0].
107
+ # Default coordinates to the whole page.
108
+ coords = {'x0': 0, 'top': 0, 'x1': pg.width, 'bottom': pg.height}
109
+ # If the header and/or footer is to be skipped, find and iterate
110
+ # through the common lines and overwrite the coordinates as
111
+ # appropriate, given the key and the line's location on the page.
112
+ if skip_header or skip_footer:
113
+ lines = self._scan_common()
114
+ for line in lines:
115
+ s = pg.search(line)
116
+ if s:
117
+ for key in coords:
118
+ v = s[0][key]
119
+ match key:
120
+ case 'top' if v < pg.height/2 and skip_header:
121
+ coords[key] = max(coords[key], v+2)
122
+ case 'bottom' if v > pg.height/2 and skip_footer:
123
+ coords[key] = min(coords[key], v-2)
124
+ return tuple(coords.values())
125
+
126
+ def _open(self) -> None:
127
+ """Open the PDF document for reading.
128
+
129
+ Before opening the file, a test is performed to ensure the PDF
130
+ is valid. The file must:
131
+
132
+ - exist
133
+ - be a valid PDF file, per the file signature
134
+ - have a .pdf file extension
135
+
136
+ :Other Operations:
137
+
138
+ - Store the ``pdfplumber`` parser object returned from the
139
+ :func:`pdfplumber.open` function into the
140
+ :attr:`self._doc._parser` attribute.
141
+ - Store the number of pages into the
142
+ :attr:`self._doc._npages` attribute.
143
+ - Store the document's meta data into the
144
+ :attr:`self._doc._meta` attribute.
145
+
146
+ Raises:
147
+ TypeError: Raised if the file type criteria above are not
148
+ met.
149
+
150
+ """
151
+ if all((os.path.exists(self._doc._fpath),
152
+ utilities.ispdf(self._doc._fpath),
153
+ os.path.splitext(self._doc._fpath)[1].lower() == '.pdf')):
154
+ self._doc._parser = pdfplumber.open(self._doc._fpath)
155
+ self._doc._npages = len(self._doc._parser.pages)
156
+ self._doc._meta = self._doc._parser.metadata
157
+ else:
158
+ msg = f'{self._doc._fname} is not a valid PDF file.'
159
+ raise TypeError(msg)
160
+
161
+ @staticmethod
162
+ def _prepare_row(row: list) -> str:
163
+ """Prepare the table row for writing a table to to CSV.
164
+
165
+ Args:
166
+ row (list): A list of strings, constituting a table row.
167
+
168
+ :Processing Tasks:
169
+
170
+ For each element in the row:
171
+
172
+ - Remove any double quote characters (ASCII and Unicode).
173
+ - Replace any empty values with ``'None'``.
174
+ - If the element contains a comma, wrap the element in
175
+ double quotes.
176
+ - Attempt to convert any non-ASCII characters to an
177
+ associated ASCII character. If the replacement cannot
178
+ be made, the character is replaced with a ``'?'``.
179
+
180
+ Returns:
181
+ str: A processed comma-separated string, ready to be written
182
+ to a CSV file.
183
+
184
+ """
185
+ trans = {34: '', 8220: '', 8221: ''} # Remove double quotes in Unicode.
186
+ row = [e.translate(trans) if e else 'None' for e in row] # Cannot be a generator.
187
+ for idx, e in enumerate(row):
188
+ if ',' in e:
189
+ row[idx] = f'"{e}"' # Escape comma-separation by quoting.
190
+ line = unidecode(','.join(row).replace('\n', ' '), errors='replace', replace_str='?')
191
+ return line
192
+
193
+ def _scan_common(self) -> list[str]:
194
+ """Scan the PDF document to find the most common lines.
195
+
196
+ :Rationale:
197
+ Generally, the most common lines in a document will be the
198
+ header and footer, as these are expected to be repeated on
199
+ each page of the document.
200
+
201
+ 'Most common' is defined as line occurring on 90% of the
202
+ pages throughout the document. Therefore, only documents with
203
+ more than three pages are scanned. Otherwise, the 90% may
204
+ exclude relevant pieces of the document (as was discovered in
205
+ testing).
206
+
207
+ :Logic:
208
+ For documents with more than three pages, the entire PDF is
209
+ read through and each line extracted. The occurrence of each
210
+ line is counted, with the most common occurrences returned
211
+ to the caller.
212
+
213
+ The returned lines are to be passed into a page search to
214
+ determine the x/y coordinates of the header and footer.
215
+
216
+ Returns:
217
+ list: For documents with more than three pages, a list
218
+ containing the most common lines in the document. Otherwise,
219
+ an empty list if returned.
220
+
221
+ """
222
+ # Only scan if document has more than three pages.
223
+ if self._doc.npages < 4:
224
+ return []
225
+ if self._doc._common is None:
226
+ # Create a line generator for all pages.
227
+ lines = (l for p in self._doc.parser.pages for l in p.extract_text().split('\n'))
228
+ # Return the lines whose occurrence rate is 90% of document pages.
229
+ self._doc._common = [i[0] for i in Counter(lines).most_common()
230
+ if i[1] > self._doc.npages * 0.9]
231
+ return self._doc._common
232
+
233
+ def _set_paths(self) -> None:
234
+ """Set the document's file path attributes."""
235
+ self._doc._fpath = os.path.realpath(self._path)
236
+ self._doc._fname = os.path.basename(self._path)
@@ -0,0 +1,272 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ :Purpose: This module provides the logic for parsing tables from a PDF
5
+ document.
6
+
7
+ :Platform: Linux
8
+ :Developer: J Berendt
9
+ :Email: jeremy.berendt@rolls-royce.com
10
+
11
+ .. attention::
12
+
13
+ This module is *not* designed to be interacted with
14
+ directly, only via the appropriate interface class(es).
15
+
16
+ Rather, please create an instance of a PDF document parsing
17
+ object using the following:
18
+
19
+ - :class:`~docp.parsers.pdfparser.PDFParser`
20
+
21
+ """
22
+ # pylint: disable=import-error
23
+ # pylint: disable=protected-access
24
+ # pylint: disable=wrong-import-order
25
+
26
+ import io
27
+ import os
28
+ import pandas as pd
29
+ import shutil
30
+ # locals
31
+ from parsers._pdfbaseparser import _PDFBaseParser
32
+
33
+ # TODO: Move to a config file/class. (TOML?)
34
+ _SETTINGS = {'vertical_strategy': 'lines',
35
+ 'horizontal_strategy':'lines',
36
+ 'snap_x_tolerance': 12}
37
+
38
+
39
class _PDFTableParser(_PDFBaseParser):
    """Private PDF document table parser intermediate class.

    Args:
        path (str): Full path to the PDF document.

    :Example:

        Extract tables from a PDF file::

            >>> from docp import PDFParser

            >>> pdf = PDFParser(path='/path/to/myfile.pdf')
            >>> pdf.extract_tables()

            >>> tables = pdf.doc.tables

    """

    def extract_tables(self,
                       table_settings: dict=None,
                       as_dataframe: bool=False,
                       to_csv: bool=True,
                       verbose: bool=False):
        """Extract tables from the document.

        Before a table is extracted, a number of validation tests are
        performed to verify what has been identified as a 'table' is
        actually a table which might be useful to the user.

        Each 'valid' table is written as a CSV file on the user's
        desktop.

        Additionally, the extracted table data is stored to the class'
        :attr:`self.tables` attribute.

        Args:
            table_settings (dict, optional): Table settings to be used
                for the table extraction. Defaults to None, which is
                replaced by the value in the config.
            as_dataframe (bool, optional): By default, the extracted
                tables are returned as a list of (lists of lists), for
                example: all_tables[table[rows[data]]]. However, if this
                argument is ``True``, the table data is returned as a
                list of ``pandas.DataFrame`` objects. In this case, the
                first row of the table is used as the header, and all
                remaining rows are treated as data. **Note:** This will
                *not* work properly for all tables. Defaults to False.
            to_csv (bool, optional): Dump extracted table data to a CSV
                file, one per table. Defaults to True.
            verbose (bool, optional): Display how many tables were
                extracted, and the path to their location.

        """
        # pylint: disable=invalid-name
        # pylint: disable=too-many-nested-blocks
        # pylint: disable=unnecessary-dunder-call
        if self._doc.tables:
            # Reinitialise the doc object and reopen the document.
            self.__init__(path=self._path)
        c = 0
        if to_csv:
            self._create_table_directory_path()
        if table_settings is None:
            table_settings = _SETTINGS
        # Bug fix: the pdfplumber parser is exposed via the ``parser``
        # property (stored by the base class' _open method); there is no
        # ``_pdf`` attribute on the document object.
        for p in self._doc.parser.pages:
            tblno = 1
            tables = self._filter_tables(tables=p.find_tables(), threshold=5000)
            for table in tables:
                pc = p.crop(table.bbox)
                data = pc.extract_table(table_settings=table_settings)
                # extract_table may return None if nothing could be
                # extracted from the cropped region; guard before iterating.
                if data and len(data) > 1 and all(len(row) > 1 for row in data):
                    # Verify no table rows are found in the most common rows (header/footer).
                    if not self._table_header_footer(table=data):
                        if not as_dataframe:
                            self._doc._tables.append(data)
                        if to_csv or as_dataframe:
                            buffer = self._to_buffer(data=data)
                            if to_csv:
                                c += self._to_csv(buffer=buffer,
                                                  pageno=p.page_number,
                                                  tableno=tblno)
                            if as_dataframe:
                                self._to_df(buffer=buffer)
                            buffer.close()
                        tblno += 1
        if verbose and to_csv:
            print('',
                  'Complete.',
                  f'{c} tables were extracted and stored at the path below.',
                  f'Path: {self._tbl_opath}',
                  sep='\n')

    def _create_table_directory_path(self):
        """Create the output directory for table data.

        If the directory does not exist, it is created.

        """
        # Defined in parent class.
        # pylint: disable=attribute-defined-outside-init
        trans = {32: '_', 45: '_'}
        path = os.path.join(os.environ['HOME'],
                            'Desktop',
                            'docutils',
                            'pdf_tables',
                            (os.path.splitext(os.path.basename(self._path))[0]
                             .lower()
                             .translate(trans)))
        self._tbl_opath = path
        os.makedirs(path, exist_ok=True)

    def _create_table_file_path(self, pageno: int, tblno: int) -> str:
        """Create the filename for the table.

        Args:
            pageno (int): Page from which the table was extracted.
            tblno (int): Number of the table on the page, starting at 1.

        Returns:
            str: Explicit path to the file to be written.

        """
        path = os.path.join(self._tbl_opath,
                            f'pg{str(pageno).zfill(3)}_tb{str(tblno).zfill(3)}.csv')
        return path

    @staticmethod
    def _filter_tables(tables: list, threshold: int=5000) -> list:
        """Remove tables from the passed list which are deemed invalid.

        Args:
            tables (list): A list of tables as detected by the
                :meth:`Page.find_table()` method.
            threshold (int, optional): Minimum pixel area for a detected
                table to be returned. Defaults to 5000.

        :Rationale:
            An 'invalid' table is determined by the number of pixels
            which the table covered. Any table which is less than (N)
            pixels is likely a block of text which has been categorised
            as a 'table', but is not.

        Returns:
            list: A list of tables whose pixel area is greater than
            ``threshold``.

        """
        # pylint: disable=invalid-name
        t = []
        for table in tables:
            x0, y0, x1, y1 = table.bbox
            if (x1-x0) * (y1-y0) > threshold:
                t.append(table)
        return t

    def _table_header_footer(self, table: list[list]) -> bool:
        """Verify a table is not a header or footer.

        Args:
            table (list[list]): Table (a list of lists) to be analysed.

        :Rationale:
            A table is determined to be a header or footer if any of the
            lines contained in the 'common lines list' are found in the
            table.

            If any of these lines are found, the table is determined to
            be a header/footer, and True is returned.

        Returns:
            bool: False if the table is *not* a header/footer, otherwise
            True.

        """
        lines = self._scan_common()  # Only re-runs if not already run.
        # r: row; c: cell; l: line
        return any(l in c for l in lines for r in table for c in r if c)

    def _to_buffer(self, data: list[list]) -> io.StringIO:
        """Write the table data into a string buffer.

        Args:
            data (list[list]): The table data as a list of lists to be
                written to a buffer.

        Returns:
            io.StringIO: A string buffer as an ``io.StringIO`` object.

        """
        b = io.StringIO()
        for row in data:
            line = self._prepare_row(row=row)
            b.write(line)
            b.write('\n')
        b.seek(0)
        return b

    def _to_csv(self, buffer: io.StringIO, pageno: int, tableno: int) -> int:
        """Write a table (from the buffer) to CSV.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.
            pageno (int): Page number from the ``Page`` object.
            tableno (int): Number of the table on the page, based at 1.

        Returns:
            int: 1 if the file was written, otherwise 0. This is used by
            the caller to track the number of CSV files written.

        """
        if buffer.seek(0, os.SEEK_END):  # Test buffer is populated.
            path = self._create_table_file_path(pageno=pageno, tblno=tableno)
            with open(path, 'w', encoding='utf-8') as f:
                buffer.seek(0)
                shutil.copyfileobj(buffer, f)
            return 1
        return 0

    def _to_df(self, buffer: io.StringIO):
        """Write a table (from the buffer) to a DataFrame.

        Once written, the DataFrame is appended to the
        :attr:`self._doc._tables` list of tables.

        Args:
            buffer (io.StringIO): A pre-processed ``StringIO`` object
                containing table data to be written.

        """
        if buffer.seek(0, os.SEEK_END):
            buffer.seek(0)
            self._doc._tables.append(pd.read_csv(buffer))