docp-0.1.0b1-py3-none-any.whl
- docp/__init__.py +31 -0
- docp/_version.py +1 -0
- docp/dbs/__init__.py +0 -0
- docp/dbs/chroma.py +184 -0
- docp/loaders/__init__.py +0 -0
- docp/loaders/_chromabaseloader.py +362 -0
- docp/loaders/chroma.py +166 -0
- docp/objects/__init__.py +0 -0
- docp/objects/_docbaseobject.py +76 -0
- docp/objects/_pageobject.py +126 -0
- docp/objects/_tableobject.py +0 -0
- docp/objects/_textobject.py +0 -0
- docp/objects/pdfobject.py +39 -0
- docp/parsers/__init__.py +0 -0
- docp/parsers/_pdfbaseparser.py +210 -0
- docp/parsers/_pdftableparser.py +273 -0
- docp/parsers/_pdftextparser.py +253 -0
- docp/parsers/pdfparser.py +62 -0
- docp-0.1.0b1.dist-info/LICENSE +622 -0
- docp-0.1.0b1.dist-info/METADATA +55 -0
- docp-0.1.0b1.dist-info/RECORD +23 -0
- docp-0.1.0b1.dist-info/WHEEL +5 -0
- docp-0.1.0b1.dist-info/top_level.txt +1 -0
docp/loaders/chroma.py
ADDED
@@ -0,0 +1,166 @@

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the entry point for loading a document
            into a Chroma database.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

:Example:   For example code use, please refer to the
            :class:`ChromaLoader` class docstring.

# pylint: disable=import-error
# pylint: disable=wrong-import-position
"""

import os
import re
from glob import glob
# locals
try:
    from .loaders._chromabaseloader import _ChromaBaseLoader
except ImportError:
    from loaders._chromabaseloader import _ChromaBaseLoader


class ChromaLoader(_ChromaBaseLoader):
    """Chroma database document loader.

    Args:
        path (str): Full path to the file (or *directory*) to be parsed
            and loaded. Note: If this is a directory, a specific file
            extension can be passed into the :meth:`load` method using
            the ``ext`` argument.
        dbpath (str): Full path to the Chroma database *directory*.
        collection (str): Name of the Chroma database collection into
            which the data is to be loaded.
        load_keywords (bool, optional): Use the provided LLM
            (via the ``llm`` parameter) to read the document and infer
            keywords to be loaded into the ``<collection>-kwds``
            database, for keyword-driven document filtering.
            Note: This *requires* the ``llm`` parameter and is
            recommended only for GPU-bound processing. Defaults to False.
        llm (object, optional): An LLM *instance* which can be provided
            directly into the
            :func:`langchain.chains.RetrievalQA.from_chain_type` function
            for keyword inference. This is *required* for keyword
            loading. Defaults to None.
        offline (bool, optional): Remain offline and use the locally
            cached embedding function model. Defaults to False.

    .. important::

        The *deriving and loading of keywords* is only recommended for
        **GPU-bound processing**, as the LLM is invoked to infer the
        keywords for each given document.

        If called on a 'standard' PC, this will take a *long* time to
        complete, if it completes at all.

    :Example:

        Parse and load a *single* document into a Chroma database
        collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/file.pdf',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load()

        Parse and load a *directory* of PDF documents into a Chroma
        database collection::

            >>> from docp import ChromaLoader

            >>> l = ChromaLoader(path='/path/to/directory',
                                 dbpath='/path/to/chroma',
                                 collection='spam')
            >>> l.load(ext='pdf')

    """

    def __init__(self,
                 path: str,
                 dbpath: str,
                 collection: str,
                 *,
                 load_keywords: bool=False,
                 llm: object=None,
                 offline: bool=False):
        """Chroma database loader class initialiser."""
        super().__init__(dbpath=dbpath,
                         collection=collection,
                         load_keywords=load_keywords,
                         llm=llm,
                         offline=offline)
        self._path = path

    def load(self,
             *,
             ext: str='**',
             recursive: bool=True,
             remove_header: bool=True,
             remove_footer: bool=True,
             remove_newlines: bool=True,
             ignore_tags: set=None,
             convert_to_ascii: bool=True) -> None:
        """Load a document (or documents) into a Chroma database.

        Args:
            ext (str): If the ``path`` argument refers to a *directory*,
                a specific file extension can be specified here.
                For example::

                    ext = 'pdf'

                If anything other than ``'**'`` is provided, the first
                run of alpha-characters is parsed from the string and
                prefixed with ``*.``. Meaning, if ``'.pdf'`` is passed,
                the characters ``'pdf'`` are parsed and prefixed with
                ``*.`` to create ``'*.pdf'``. However, if
                ``'things.foo'`` is passed, the derived extension will
                be ``'*.things'``.
                Defaults to '**', for a recursive search.

            recursive (bool, optional): If True, subdirectories are
                searched. Defaults to True.
            remove_header (bool, optional): Attempt to remove the header
                from each page. Defaults to True.
            remove_footer (bool, optional): Attempt to remove the footer
                from each page. Defaults to True.
            remove_newlines (bool, optional): Replace newline characters
                with a space. Defaults to True, as this helps with
                document chunk splitting.
            ignore_tags (set, optional): If provided, these are the
                PDF 'marked content' tags which will be ignored. Note
                that the PDF document must contain tags, otherwise the
                bounding box method is used and this argument is ignored.
                Defaults to ``{'Artifact'}``, as these generally
                relate to a header and/or footer. To include all tags
                (not skip any), pass this argument as ``'na'``.
            convert_to_ascii (bool, optional): Convert all characters to
                ASCII. Defaults to True.

        """
        if os.path.isdir(self._path):
            if ext != '**':
                ext = f'*.{re.findall("[a-zA-Z]+", ext)[0]}'
            files = glob(os.path.join(self._path, ext), recursive=recursive)
            count = len(files)
            for idx, f in enumerate(files, 1):
                print(f'\nProcessing {idx} of {count}: {os.path.basename(f)}')
                self._load(path=f)
        else:
            print(f'Processing: {os.path.basename(self._path)} ...')
            self._load(path=self._path,
                       remove_header=remove_header,
                       remove_footer=remove_footer,
                       remove_newlines=remove_newlines,
                       ignore_tags=ignore_tags,
                       convert_to_ascii=convert_to_ascii)
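
Editor's note: the ``ext`` normalisation performed by :meth:`load` above can be reproduced in isolation. This is a minimal sketch under the assumption that the code above behaves as written; the ``normalise_ext`` helper name is illustrative only and is not part of docp::

    import re

    def normalise_ext(ext: str) -> str:
        # Mirror ChromaLoader.load: anything other than '**' keeps only the
        # first run of alphabetic characters, prefixed with '*.'.
        if ext == '**':
            return ext
        return f'*.{re.findall("[a-zA-Z]+", ext)[0]}'

    print(normalise_ext('.pdf'))        # *.pdf
    print(normalise_ext('pdf'))         # *.pdf
    print(normalise_ext('things.foo'))  # *.things  (only the first alpha run is kept)
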
docp/objects/__init__.py
ADDED
File without changes
docp/objects/_docbaseobject.py
ADDED
@@ -0,0 +1,76 @@

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the implementation for the
            document-type-specific base class.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""

from __future__ import annotations
try:
    from .objects._pageobject import PageObject
except ImportError:
    from objects._pageobject import PageObject


class _DocBase:
    """Private document base class.

    This class is *not* designed to be interacted with directly, but
    rather to be inherited by the document-type-specific document
    objects.

    """

    def __init__(self):
        """Base document object class initialiser."""
        self._common = None     # Used by the header/footer scanner.
        self._fname = None      # Filename (basename)
        self._fpath = None      # Full file path
        self._meta = None       # Metadata from the document parser
        self._npages = 0        # Number of pages in the document
        self._ntables = 0       # Number of tables extracted
        self._parser = None     # Underlying document parser functionality
        # List of PageObjects, offset by 1 to align the index with page numbers.
        self._pages = [PageObject(pageno=0)]

    @property
    def basename(self) -> str:
        """Accessor for the file's basename."""
        return self._fname

    @property
    def filepath(self) -> str:
        """Accessor for the explicit path to this file."""
        return self._fpath

    @property
    def metadata(self) -> dict | object:
        """The metadata as extracted from the document."""
        return self._meta

    @property
    def npages(self) -> int:
        """The number of pages successfully extracted from the source."""
        return self._npages

    @property
    def ntables(self) -> int:
        """The number of tables successfully extracted from the source."""
        return self._ntables

    @property
    def pages(self) -> list[PageObject]:  # noqa pylint: disable=undefined-variable
        """A list containing an object for each page in the document."""
        return self._pages

    @property
    def parser(self) -> object:
        """Accessor to the underlying document parser's functionality."""
        return self._parser
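
Editor's note: the placeholder element created in ``_DocBase.__init__`` offsets the ``pages`` list so that list indices line up with page numbers. A minimal sketch, assuming the ``PageObject`` class defined in ``docp/objects/_pageobject.py`` (shown next) is importable::

    pages = [PageObject(pageno=0)]                       # index-offset placeholder, as in _DocBase
    pages.append(PageObject(content='Spam', pageno=1))
    pages.append(PageObject(content='Eggs', pageno=2))

    # Because of the placeholder at index 0, the index matches the page number.
    print(pages[1].pageno, pages[2].pageno)              # 1 2
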
docp/objects/_pageobject.py
ADDED
@@ -0,0 +1,126 @@

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the ``page`` object implementation.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""

from __future__ import annotations


class PageObject:
    """This class provides the implementation for the ``PageObject``.

    For each page in a document, an instance of this class is created,
    populated and appended into the document's ``pages`` list attribute.

    Args:
        content (str, optional): Page content as a single string.
            Defaults to ''.
        pageno (int, optional): Page number. Defaults to 0.
        parser (object, optional): The underlying document parser object.
            Defaults to None.

    """

    __slots__ = ('_content', '_hastext', '_pageno', '_parser', '_tables')

    def __init__(self, content: str='', pageno: int=0, parser: object=None):
        """Page object class initialiser."""
        self._content = content
        self._pageno = pageno
        self._parser = parser
        self._hastext = bool(content)
        self._tables = []

    def __repr__(self) -> str:
        """Formatted representation of this object."""
        if self._pageno == 0:
            return f'<Page: {self._pageno}; <index offset>>'
        return f'<Page: {self._pageno}; Chars: {len(self._content)}>'

    def __str__(self) -> str:
        """Formatted string displayed when printing this object."""
        c = self._content[:25].replace('\n', ' ') + ' ...' if self._content else ''
        fmt = (f'Page no: {self._pageno}; '
               f'Content: "{c}"; '
               f'Chars: {len(self._content)}; '
               f'nTables: {len(self._tables)}; '
               f'Parser avail: {bool(self._parser)}')
        return fmt

    @property
    def content(self) -> str:
        """Accessor to the page's textual content."""
        return self._content

    @content.setter
    def content(self, value: str) -> None:
        """Setter for the ``content`` attribute.

        If the ``value`` argument is populated, the content is set and
        the ``hastext`` attribute is set to ``True``.

        """
        if value:
            self._content = value
            self._hastext = True

    @property
    def hastext(self) -> bool:
        """Flag indicating if the ``content`` attribute is populated."""
        return self._hastext

    @property
    def pageno(self) -> int:
        """Accessor to the page number.

        Note:
            This is the page number 1-n, concerning the page's *sequence
            in the overall document*. This is *not* guaranteed to be the
            page's number per the document's page labeling scheme.

        """
        return self._pageno

    @property
    def parser(self) -> object:
        """Accessor to the document parser's internal functionality.

        Note:
            The population of this property is determined by the
            document-type-specific ``docp`` parser. If the underlying
            parsing library has functionality worth preserving and making
            available to the user, it is stored to this property.
            Otherwise, this property will remain as ``None``.

        """
        return self._parser

    @property
    def tables(self) -> list:
        """Accessor to the page's tables, if parsed."""
        return self._tables

    def show(self) -> pdfplumber.display.PageImage:  # pylint: disable=undefined-variable # noqa
        """Display the page as an image.

        Additionally, the return value exposes access to the underlying
        ``pdfplumber`` debugging visualisation methods such as:

        - :func:`img.debug_tablefinder`
        - :func:`img.draw_*`
        - :func:`img.outline_chars`
        - :func:`img.outline_words`
        - :func:`img.reset`
        - etc.

        """
        return self.parser.to_image()
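
Editor's note: the behaviour documented above (the ``content`` setter flips ``hastext``; page 0 is the index-offset placeholder) can be illustrated with a small sketch, assuming ``PageObject`` as defined above is in scope::

    po = PageObject(pageno=3)
    print(po.hastext)           # False - created without content
    po.content = 'Some page text.'
    print(po.hastext)           # True - set by the content setter
    print(repr(po))             # <Page: 3; Chars: 15>
    print(repr(PageObject()))   # <Page: 0; <index offset>>
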
docp/objects/_tableobject.py
File without changes

docp/objects/_textobject.py
File without changes
docp/objects/pdfobject.py
ADDED
@@ -0,0 +1,39 @@

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides the 'PDF Document' object structure
            into which PDF documents are parsed for transport and
            onward use.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

:Comments:  n/a

"""
# pylint: disable=import-error

from objects._docbaseobject import _DocBase


class DocPDF(_DocBase):
    """Container class for storing data parsed from a PDF file."""

    def __init__(self):
        """PDF document object class initialiser."""
        super().__init__()
        self._tags = False

    @property
    def parsed_using_tags(self) -> bool:
        """Flag indicating if the document was parsed using tags.

        PDF documents can be created with 'marked content' tags. When
        a PDF document is parsed using tags, as this flag indicates, the
        parser respects columns and other page formatting schemes. If a
        multi-column page is parsed without tags, the parser reads
        straight across the line, thus corrupting the text.

        """
        return self._tags
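
Editor's note: a quick sketch of the container in use. The module above imports ``objects._docbaseobject`` non-relatively, so the import path shown here is an assumption that may depend on how docp is installed and how ``sys.path`` is configured::

    from docp.objects.pdfobject import DocPDF   # assumed import path

    doc = DocPDF()
    print(doc.npages)              # 0 - nothing has been parsed yet
    print(doc.parsed_using_tags)   # False - set by the parser when tags are used
    print(doc.pages)               # [<Page: 0; <index offset>>]
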
docp/parsers/__init__.py
ADDED
File without changes
docp/parsers/_pdfbaseparser.py
ADDED
@@ -0,0 +1,210 @@

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
:Purpose:   This module provides generalised base functionality for
            parsing PDF documents.

:Platform:  Linux/Windows | Python 3.10+
:Developer: J Berendt
:Email:     development@s3dev.uk

            Note: This module is *not* designed to be interacted with
            directly, only via the appropriate interface class(es).

            Rather, please create an instance of a PDF document parsing
            object using the following:

            - :class:`pdfparser.PDFParser`

"""
# pylint: disable=import-error
# pylint: disable=protected-access
# pylint: disable=wrong-import-order

import os
import pdfplumber
from collections import Counter
from unidecode import unidecode
# locals
from objects.pdfobject import DocPDF


class _PDFBaseParser:
    """Base class containing generalised PDF parsing functionality."""

    def __init__(self, path: str):
        """Private base parser class initialiser.

        Args:
            path (str): Full path to the document to be parsed.

        """
        self._path = path
        self._doc = DocPDF()
        self._tbl_opath = None
        self._set_paths()
        self._open()

    def __del__(self):
        """Class destructor.

        :Tasks:
            - Ensure the PDF document is closed.

        """
        if hasattr(self._doc, '_parser'):
            self._doc._parser.close()

    @property
    def doc(self) -> DocPDF:
        """Accessor to the document object."""
        return self._doc

    def _get_crop_coordinates(self,
                              skip_header: bool=False,
                              skip_footer: bool=False) -> tuple[float]:
        """Determine the bounding box coordinates.

        These coordinates are used for removing the header and/or footer.

        Args:
            skip_header (bool, optional): If True, set the coordinates
                such that the header is skipped. Defaults to False.
            skip_footer (bool, optional): If True, set the coordinates
                such that the footer is skipped. Defaults to False.

        :Logic:
            When excluding a header and/or footer, the following page
            numbers are used for header/footer *position* detection,
            given the length of the document:

            - Number of pages [1]: 1
            - Number of pages [2,10]: 2
            - Number of pages [11,]: 5

        Returns:
            tuple: A bounding box tuple of the following form, to be
            passed directly into the :func:`Page.crop` method::

                (x0, top, x1, bottom)

        """
        npages = self._doc.npages
        match npages:
            case 1: num = 1
            case _ if npages in range(2, 11): num = 2
            case _: num = 5
        pg = self._doc.parser.pages[num]  # The pages list has a page offset at [0].
        # Default coordinates to the whole page.
        coords = {'x0': 0, 'top': 0, 'x1': pg.width, 'bottom': pg.height}
        # If the header and/or footer is to be skipped, find and iterate
        # through the common lines and overwrite the coordinates as
        # appropriate, given the key and the line's location on the page.
        if skip_header or skip_footer:
            lines = self._scan_common()
            for line in lines:
                s = pg.search(line)
                if s:
                    for key in coords:
                        v = s[0][key]
                        match key:
                            case 'top' if v < pg.height/2 and skip_header:
                                coords[key] = max(coords[key], v+2)
                            case 'bottom' if v > pg.height/2 and skip_footer:
                                coords[key] = min(coords[key], v-2)
        return tuple(coords.values())

    def _open(self) -> None:
        """Open the PDF document for reading.

        :Other Operations:

            - Store the ``pdfplumber`` parser object returned from the
              :func:`pdfplumber.open` function into the
              :attr:`self._doc._parser` attribute.
            - Store the number of pages into the
              :attr:`self._doc._npages` attribute.
            - Store the document's metadata into the
              :attr:`self._doc._meta` attribute.

        """
        self._doc._parser = pdfplumber.open(self._doc._fpath)
        self._doc._npages = len(self._doc._parser.pages)
        self._doc._meta = self._doc._parser.metadata

    @staticmethod
    def _prepare_row(row: list) -> str:
        """Prepare the table row for writing to CSV.

        Args:
            row (list): A list of strings, constituting a table row.

        :Processing Tasks:

            For each element in the row:

            - Remove any double quote characters (ASCII and Unicode).
            - Replace any empty values with ``'None'``.
            - If the element contains a comma, wrap the element in
              double quotes.
            - Attempt to convert any non-ASCII characters to an
              associated ASCII character. If the replacement cannot
              be made, the character is replaced with a ``'?'``.

        Returns:
            str: A processed comma-separated string, ready to be written
            to a CSV file.

        """
        trans = {34: '', 8220: '', 8221: ''}  # Remove ASCII and Unicode double quotes.
        row = [e.translate(trans) if e else 'None' for e in row]  # Cannot be a generator.
        for idx, e in enumerate(row):
            if ',' in e:
                row[idx] = f'"{e}"'  # Escape comma-separation by quoting.
        line = unidecode(','.join(row).replace('\n', ' '), errors='replace', replace_str='?')
        return line

    def _scan_common(self) -> list[str]:
        """Scan the PDF document to find the most common lines.

        :Rationale:
            Generally, the most common lines in a document will be the
            header and footer, as these are expected to be repeated on
            each page of the document.

            'Most common' is defined as a line occurring on 90% of the
            pages throughout the document. Therefore, only documents with
            more than three pages are scanned. Otherwise, the 90% may
            exclude relevant pieces of the document (as was discovered in
            testing).

        :Logic:
            For documents with more than three pages, the entire PDF is
            read through and each line extracted. The occurrence of each
            line is counted, with the most common occurrences returned
            to the caller.

            The returned lines are to be passed into a page search to
            determine the x/y coordinates of the header and footer.

        Returns:
            list: For documents with more than three pages, a list
            containing the most common lines in the document. Otherwise,
            an empty list is returned.

        """
        # Only scan if the document has more than three pages.
        if self._doc.npages < 4:
            return []
        if self._doc.common is None:
            # Create a line generator for all pages.
            lines = (l for p in self._doc.parser.pages for l in p.extract_text().split('\n'))
            # Return the lines whose occurrence rate is 90% of document pages.
            self._doc._common = [i[0] for i in Counter(lines).most_common()
                                 if i[1] > self._doc.npages * 0.9]
        return self._doc.common

    def _set_paths(self) -> None:
        """Set the document's file path attributes."""
        self._doc._fpath = os.path.realpath(self._path)
        self._doc._fname = os.path.basename(self._path)
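
Editor's note: to make the row-preparation behaviour concrete, here is a standalone sketch of the same steps performed by ``_PDFBaseParser._prepare_row`` above. The ``prepare_row`` function name is illustrative only and is not part of docp::

    from unidecode import unidecode

    def prepare_row(row: list) -> str:
        # Strip ASCII/Unicode double quotes and substitute 'None' for empty values.
        trans = {34: '', 8220: '', 8221: ''}
        row = [e.translate(trans) if e else 'None' for e in row]
        # Quote any element containing a comma so the CSV columns stay aligned.
        for idx, e in enumerate(row):
            if ',' in e:
                row[idx] = f'"{e}"'
        # Transliterate non-ASCII characters, replacing unknowns with '?'.
        return unidecode(','.join(row).replace('\n', ' '), errors='replace', replace_str='?')

    print(prepare_row(['Café', None, 'a, b', 'said "hi"\nthere']))
    # Cafe,None,"a, b",said hi there
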