custom-layoutparser 9.9.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- custom_layoutparser-9.9.9.dist-info/METADATA +5 -0
- custom_layoutparser-9.9.9.dist-info/RECORD +36 -0
- custom_layoutparser-9.9.9.dist-info/WHEEL +5 -0
- custom_layoutparser-9.9.9.dist-info/top_level.txt +1 -0
- layoutparser/__init__.py +89 -0
- layoutparser/elements/__init__.py +25 -0
- layoutparser/elements/base.py +275 -0
- layoutparser/elements/errors.py +26 -0
- layoutparser/elements/layout.py +348 -0
- layoutparser/elements/layout_elements.py +1352 -0
- layoutparser/elements/utils.py +82 -0
- layoutparser/file_utils.py +235 -0
- layoutparser/io/__init__.py +2 -0
- layoutparser/io/basic.py +148 -0
- layoutparser/io/pdf.py +225 -0
- layoutparser/models/__init__.py +18 -0
- layoutparser/models/auto_layoutmodel.py +70 -0
- layoutparser/models/base_catalog.py +34 -0
- layoutparser/models/base_layoutmodel.py +88 -0
- layoutparser/models/detectron2/__init__.py +18 -0
- layoutparser/models/detectron2/catalog.py +142 -0
- layoutparser/models/detectron2/layoutmodel.py +168 -0
- layoutparser/models/effdet/__init__.py +16 -0
- layoutparser/models/effdet/catalog.py +70 -0
- layoutparser/models/effdet/layoutmodel.py +256 -0
- layoutparser/models/model_config.py +133 -0
- layoutparser/models/paddledetection/__init__.py +17 -0
- layoutparser/models/paddledetection/catalog.py +214 -0
- layoutparser/models/paddledetection/layoutmodel.py +297 -0
- layoutparser/ocr/__init__.py +16 -0
- layoutparser/ocr/base.py +41 -0
- layoutparser/ocr/gcv_agent.py +288 -0
- layoutparser/ocr/tesseract_agent.py +193 -0
- layoutparser/tools/__init__.py +5 -0
- layoutparser/tools/shape_operations.py +167 -0
- layoutparser/visualization.py +571 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
# Copyright 2021 The Layout Parser team. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import List, Union, Dict, Dict, Any, Optional, Tuple
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
from PIL import Image
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def cvt_coordinates_to_points(coords: Tuple[float, float, float, float]) -> np.ndarray:
    """Expand an axis-aligned box into its four corner points.

    Args:
        coords (Tuple[float, float, float, float]):
            The box given as ``(x_1, y_1, x_2, y_2)``.

    Returns:
        np.ndarray:
            A 4x2 array of ``[x, y]`` rows ordered top-left, top-right,
            bottom-right, bottom-left.
    """
    left, top, right, bottom = coords

    top_left = [left, top]
    top_right = [right, top]
    bottom_right = [right, bottom]
    bottom_left = [left, bottom]

    return np.array([top_left, top_right, bottom_right, bottom_left])
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def cvt_points_to_coordinates(points: np.ndarray) -> Tuple[float, float, float, float]:
    """Compute the axis-aligned bounding box of a set of points.

    Args:
        points (np.ndarray):
            A 2D array whose column 0 holds x values and column 1 holds
            y values.

    Returns:
        Tuple[float, float, float, float]:
            ``(x_1, y_1, x_2, y_2)``, i.e. the minimum x, minimum y,
            maximum x and maximum y found in ``points``.
    """
    xs, ys = points[:, 0], points[:, 1]
    return (xs.min(), ys.min(), xs.max(), ys.max())
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def perspective_transformation(
    M: np.ndarray, points: np.ndarray, is_inv: bool = False
) -> np.ndarray:
    """Apply a 3x3 projective transformation to a set of 2D points.

    Args:
        M (np.ndarray):
            The transformation matrix applied to homogeneous coordinates.
        points (np.ndarray):
            An Nx2 array of ``[x, y]`` points.
        is_inv (bool, optional):
            When True, the inverse of ``M`` is applied instead.
            Defaults to False.

    Returns:
        np.ndarray: The transformed points as an Nx2 array.
    """
    matrix = np.linalg.inv(M) if is_inv else M

    # Lift the points to homogeneous coordinates, one point per column (3xN).
    unit_column = np.ones((points.shape[0], 1))
    homogeneous = np.hstack([points, unit_column]).T

    projected = np.matmul(matrix, homogeneous)

    # Divide by the last (scale) row, then drop it to get back to Nx2.
    normalized = projected / projected[-1]
    return normalized.T[:, :2]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def vertice_in_polygon(vertice: np.ndarray, polygon_points: np.ndarray) -> bool:
    """Check whether a vertex lies inside (or on the border of) a convex polygon.

    The implementation is based on the algorithm from
    https://demonstrations.wolfram.com/AnEfficientTestForAPointToBeInAConvexPolygon/

    The test passes when the 2x2 determinant of every pair of consecutive
    polygon points (taken relative to ``vertice``) is non-negative. That
    holds for points ordered top-left, top-right, bottom-right, bottom-left
    in image coordinates (y growing downwards), which appears clockwise on
    screen. With the y axis pointing up, the same visual order would yield
    determinants <= 0 instead.

    Args:
        vertice (np.ndarray): The ``[x, y]`` point to test.
        polygon_points (np.ndarray): An Nx2 array of the polygon's vertices.

    Returns:
        bool: True if no consecutive pair produces a negative determinant.
    """
    # Shift the coordinate origin to the vertice.
    shifted = polygon_points - vertice

    # Repeat the first point at the end so the last edge closes the polygon.
    closed = np.concatenate((shifted, shifted[0:1, :]), axis=0)

    for start, end in zip(closed, closed[1:]):
        if not np.linalg.det([start, end]) >= 0:
            return False
    return True
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def polygon_area(xs: np.ndarray, ys: np.ndarray) -> float:
    """Calculate the area of a polygon using the
    `Shoelace Formula <https://en.wikipedia.org/wiki/Shoelace_formula>`_.

    Args:
        xs (`np.ndarray`): The x coordinates of the points
        ys (`np.ndarray`): The y coordinates of the points

    Returns:
        float: The (non-negative) area enclosed by the points.
    """

    # Refer to: https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
    # Pairing each coordinate with the previous point's other coordinate via
    # np.roll yields the two sums of the shoelace formula.
    x_times_prev_y = np.dot(xs, np.roll(ys, 1))
    y_times_prev_x = np.dot(ys, np.roll(xs, 1))

    return 0.5 * np.abs(x_times_prev_y - y_times_prev_x)
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
# Copyright 2021 The Layout Parser team. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
# Some code is adapted from
|
|
16
|
+
# https://github.com/huggingface/transformers/blob/master/src/transformers/file_utils.py
|
|
17
|
+
|
|
18
|
+
from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union
|
|
19
|
+
import sys
|
|
20
|
+
import os
|
|
21
|
+
import logging
|
|
22
|
+
import importlib.util
|
|
23
|
+
from types import ModuleType
|
|
24
|
+
|
|
25
|
+
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
|
|
26
|
+
|
|
27
|
+
# The package importlib_metadata is in a different place, depending on the python version.
|
|
28
|
+
if sys.version_info < (3, 8):
|
|
29
|
+
import importlib_metadata
|
|
30
|
+
else:
|
|
31
|
+
import importlib.metadata as importlib_metadata
|
|
32
|
+
|
|
33
|
+
###########################################
|
|
34
|
+
############ Layout Model Deps ############
|
|
35
|
+
###########################################
|
|
36
|
+
|
|
37
|
+
# Each backend is probed twice: `find_spec` checks that the import name can be
# located, and `importlib_metadata.version` checks that the distribution is
# actually installed. A backend counts as available only if both succeed.

_torch_available = importlib.util.find_spec("torch") is not None
try:
    _torch_version = importlib_metadata.version("torch")
    logger.debug(f"PyTorch version {_torch_version} available.")
except importlib_metadata.PackageNotFoundError:
    _torch_available = False

_detectron2_available = importlib.util.find_spec("detectron2") is not None
try:
    _detectron2_version = importlib_metadata.version("detectron2")
    logger.debug(f"Detectron2 version {_detectron2_version} available")
except importlib_metadata.PackageNotFoundError:
    _detectron2_available = False

_paddle_available = importlib.util.find_spec("paddle") is not None
try:
    # The name of the paddlepaddle library:
    # Install name: pip install paddlepaddle
    # Import name: import paddle
    _paddle_version = importlib_metadata.version("paddlepaddle")
    logger.debug(f"Paddle version {_paddle_version} available.")
except importlib_metadata.PackageNotFoundError:
    _paddle_available = False

_effdet_available = importlib.util.find_spec("effdet") is not None
try:
    _effdet_version = importlib_metadata.version("effdet")
    logger.debug(f"Effdet version {_effdet_version} available.")
except importlib_metadata.PackageNotFoundError:
    # Clear the availability flag (not the version variable) so that
    # `is_effdet_available()` reports the missing distribution.
    _effdet_available = False
|
|
67
|
+
|
|
68
|
+
###########################################
|
|
69
|
+
############## OCR Tool Deps ##############
|
|
70
|
+
###########################################
|
|
71
|
+
|
|
72
|
+
# pytesseract: the import name and the distribution name are the same.
_pytesseract_available = importlib.util.find_spec("pytesseract") is not None
try:
    _pytesseract_version = importlib_metadata.version("pytesseract")
except importlib_metadata.PackageNotFoundError:
    _pytesseract_available = False
else:
    logger.debug(f"Pytesseract version {_pytesseract_version} available.")

# Google Cloud Vision: looking up the dotted name "google.cloud.vision" raises
# ModuleNotFoundError when a parent package is missing, hence the outer guard.
try:
    _gcv_available = importlib.util.find_spec("google.cloud.vision") is not None
    try:
        # The distribution name differs from the import name.
        _gcv_version = importlib_metadata.version("google-cloud-vision")
    except importlib_metadata.PackageNotFoundError:
        _gcv_available = False
    else:
        logger.debug(f"Google Cloud Vision Utils version {_gcv_version} available.")
except ModuleNotFoundError:
    _gcv_available = False
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def is_torch_available() -> bool:
    """Return the module-level flag recording whether PyTorch was detected."""
    return _torch_available
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def is_torch_cuda_available():
    """Return whether PyTorch is available and reports a usable CUDA device.

    Returns False without importing torch when PyTorch was not detected.
    """
    if not is_torch_available():
        return False

    # Imported lazily so this module can be loaded without torch installed.
    import torch

    return torch.cuda.is_available()
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def is_detectron2_available() -> bool:
    """Return the module-level flag recording whether detectron2 was detected."""
    return _detectron2_available
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def is_paddle_available() -> bool:
    """Return the module-level flag recording whether PaddlePaddle was detected."""
    return _paddle_available
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def is_effdet_available() -> bool:
    """Return the module-level flag recording whether effdet was detected."""
    return _effdet_available
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def is_pytesseract_available() -> bool:
    """Return the module-level flag recording whether pytesseract was detected."""
    return _pytesseract_available
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def is_gcv_available() -> bool:
    """Return the module-level flag recording whether google-cloud-vision was detected."""
    return _gcv_available
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
PYTORCH_IMPORT_ERROR = """
|
|
126
|
+
{0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
|
|
127
|
+
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
|
|
128
|
+
"""
|
|
129
|
+
|
|
130
|
+
DETECTRON2_IMPORT_ERROR = """
|
|
131
|
+
{0} requires the detectron2 library but it was not found in your environment. Checkout the instructions on the
|
|
132
|
+
installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
|
|
133
|
+
that match your environment. Typically the following would work for MacOS or Linux CPU machines:
|
|
134
|
+
pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
|
|
135
|
+
"""
|
|
136
|
+
|
|
137
|
+
PADDLE_IMPORT_ERROR = """
|
|
138
|
+
{0} requires the PaddlePaddle library but it was not found in your environment. Checkout the instructions on the
|
|
139
|
+
installation page: https://github.com/PaddlePaddle/Paddle and follow the ones that match your environment.
|
|
140
|
+
"""
|
|
141
|
+
|
|
142
|
+
EFFDET_IMPORT_ERROR = """
|
|
143
|
+
{0} requires the effdet library but it was not found in your environment. You can install it with pip:
|
|
144
|
+
`pip install effdet`
|
|
145
|
+
"""
|
|
146
|
+
|
|
147
|
+
PYTESSERACT_IMPORT_ERROR = """
|
|
148
|
+
{0} requires the PyTesseract library but it was not found in your environment. You can install it with pip:
|
|
149
|
+
`pip install pytesseract`
|
|
150
|
+
"""
|
|
151
|
+
|
|
152
|
+
GCV_IMPORT_ERROR = """
|
|
153
|
+
{0} requires the Google Cloud Vision Python utils but it was not found in your environment. You can install it with pip:
|
|
154
|
+
`pip install google-cloud-vision==1`
|
|
155
|
+
"""
|
|
156
|
+
|
|
157
|
+
# Maps a backend key to a pair of (availability check, error message template).
# The templates take the name of the requesting object as `{0}`.
BACKENDS_MAPPING = {
    "torch": (is_torch_available, PYTORCH_IMPORT_ERROR),
    "detectron2": (is_detectron2_available, DETECTRON2_IMPORT_ERROR),
    "paddle": (is_paddle_available, PADDLE_IMPORT_ERROR),
    "effdet": (is_effdet_available, EFFDET_IMPORT_ERROR),
    "pytesseract": (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR),
    "google-cloud-vision": (is_gcv_available, GCV_IMPORT_ERROR),
}
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def requires_backends(obj, backends):
    """Raise an ImportError if any of the required backends is unavailable.

    Args:
        obj:
            The object (class, function or instance) that needs the backends.
            Its name is inserted into the error message.
        backends:
            A single backend key or a list/tuple of keys from
            `BACKENDS_MAPPING`.

    Raises:
        ImportError:
            If at least one backend is not available. The message contains
            the installation instructions of the missing backends only.
        KeyError:
            If a backend key is not present in `BACKENDS_MAPPING`.
    """
    if not isinstance(backends, (list, tuple)):
        backends = [backends]

    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__

    # Only collect messages for the backends that are actually missing, so the
    # user is not told to install something that is already present.
    checks = [BACKENDS_MAPPING[backend] for backend in backends]
    missing = [
        message.format(name) for is_available, message in checks if not is_available()
    ]
    if missing:
        raise ImportError("".join(missing))
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
class _LazyModule(ModuleType):
    """
    Module class that surfaces all objects but only performs associated imports when the objects are requested.

    `import_structure` maps a submodule name to the names of the objects that
    submodule provides. Accessing a submodule name or an object name on the
    lazy module imports the submodule on first use and caches the result as a
    regular attribute.
    """

    # Adapted from HuggingFace
    # https://github.com/huggingface/transformers/blob/c37573806ab3526dd805c49cbe2489ad4d68a9d7/src/transformers/file_utils.py#L1990

    def __init__(
        self, name, module_file, import_structure, module_spec=None, extra_objects=None
    ):
        super().__init__(name)
        self._modules = set(import_structure.keys())
        # Reverse lookup: object name -> submodule that defines it.
        self._class_to_module = {}
        for key, values in import_structure.items():
            for value in values:
                self._class_to_module[value] = key
        # Needed for autocompletion in an IDE
        # (flattened with a comprehension: linear time, and the values may be
        # any iterable of names rather than only lists)
        self.__all__ = list(import_structure.keys()) + [
            value for values in import_structure.values() for value in values
        ]
        self.__file__ = module_file
        self.__spec__ = module_spec
        self.__path__ = [os.path.dirname(module_file)]
        self._objects = {} if extra_objects is None else extra_objects
        self._name = name
        self._import_structure = import_structure

        # Following [PEP 366](https://www.python.org/dev/peps/pep-0366/)
        # The __package__ variable should be set
        # https://docs.python.org/3/reference/import.html#__package__
        self.__package__ = self.__name__

    # Needed for autocompletion in an IDE
    def __dir__(self):
        result = super().__dir__()
        # Names already resolved through __getattr__ are cached on the module
        # and therefore already listed; only add the ones still missing.
        seen = set(result)
        for attr in self.__all__:
            if attr not in seen:
                seen.add(attr)
                result.append(attr)
        return result

    def __getattr__(self, name: str) -> Any:
        # Only called when normal attribute lookup fails, i.e. for names that
        # have not been imported and cached yet.
        if name in self._objects:
            return self._objects[name]
        if name in self._modules:
            value = self._get_module(name)
        elif name in self._class_to_module:
            module = self._get_module(self._class_to_module[name])
            value = getattr(module, name)
        else:
            raise AttributeError(f"module {self.__name__} has no attribute {name}")

        # Cache the result so later lookups bypass __getattr__.
        setattr(self, name, value)
        return value

    def _get_module(self, module_name: str):
        # Relative import of the submodule inside this package.
        return importlib.import_module("." + module_name, self.__name__)

    def __reduce__(self):
        # Pickle support: rebuild the lazy module from its constructor arguments.
        return (self.__class__, (self._name, self.__file__, self._import_structure))
|
layoutparser/io/basic.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# Copyright 2021 The Layout Parser team. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import ast
|
|
16
|
+
import json
|
|
17
|
+
from typing import List, Union, Dict, Dict, Any
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
21
|
+
from ..elements import (
|
|
22
|
+
BaseLayoutElement,
|
|
23
|
+
TextBlock,
|
|
24
|
+
Layout,
|
|
25
|
+
BASECOORD_ELEMENT_NAMEMAP,
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def load_json(filename: str) -> Union[BaseLayoutElement, Layout]:
    """Load a JSON file and save it as a layout object with appropriate data types.

    Args:
        filename (str):
            The name of the JSON file.

    Returns:
        Union[BaseLayoutElement, Layout]:
            Based on the JSON file format, it will automatically parse
            the type of the data and load it accordingly.
    """
    # JSON text is UTF-8 by specification; do not rely on the platform's
    # default encoding, which differs between operating systems.
    with open(filename, "r", encoding="utf-8") as fp:
        res = json.load(fp)

    return load_dict(res)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def load_dict(data: Union[Dict, List[Dict]]) -> Union[BaseLayoutElement, Layout]:
    """Convert a dict, or a list of dicts, describing layout data into the
    matching BaseLayoutElement or Layout object.

    Args:
        data (Union[Dict, List]):
            Either a dict describing a single element, a dict with
            `page_data` and `blocks` keys describing a layout, or a list
            of element dicts.

    Raises:
        ValueError:
            If `data` is neither a dict nor a list.
        ValueError:
            If the `block_type` of an element is not one of the names
            defined in `BASECOORD_ELEMENT_NAMEMAP`.

    Returns:
        Union[BaseLayoutElement, Layout]:
            A Layout for list input or for a dict containing `page_data`;
            otherwise the single element built from the dict.
    """
    if isinstance(data, list):
        return Layout([load_dict(item) for item in data])

    if not isinstance(data, dict):
        raise ValueError("Invalid input JSON structure.")

    if "page_data" in data:
        # It is a layout instance: load the blocks, then attach the page data.
        blocks = load_dict(data["blocks"])._blocks
        return Layout(blocks, page_data=data["page_data"])

    block_type = data["block_type"]
    if block_type not in BASECOORD_ELEMENT_NAMEMAP:
        raise ValueError(f"Invalid block_type {block_type}")

    # Any TextBlock-specific feature in the dict marks it as a textblock.
    has_text_feature = any(feature in data for feature in TextBlock._features)
    element_cls = TextBlock if has_text_feature else BASECOORD_ELEMENT_NAMEMAP[block_type]
    return element_cls.from_dict(data)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def load_csv(filename: str, block_type: str = None) -> Layout:
    """Read a CSV file and build a Layout object from its rows.

    Args:
        filename (str):
            The name of the CSV file. A row of the table represents
            an individual layout element.

        block_type (str):
            The element type to use for all rows. It must be given when
            the CSV file has no block_type column, so that layout parser
            can determine the type of the layout elements.

    Returns:
        Layout:
            The parsed Layout object from the CSV file.
    """
    table = pd.read_csv(filename)
    return load_dataframe(table, block_type=block_type)
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def load_dataframe(df: pd.DataFrame, block_type: str = None) -> Layout:
    """Build a Layout object from a dataframe with one layout element per row.

    Args:
        df (pd.DataFrame):
            The table of layout elements. It is copied, so the caller's
            dataframe is not modified.

        block_type (str):
            The element type to use for all rows. It must be given when
            the dataframe has no block_type column, so that layout parser
            can determine the type of the layout elements.

    Raises:
        ValueError:
            If `block_type` is not given and the dataframe has no
            block_type column.

    Returns:
        Layout:
            The parsed Layout object from the dataframe.
    """
    frame = df.copy()

    # A "points" column read from text holds string literals; parse them back
    # into Python objects and leave missing values untouched.
    if "points" in frame.columns and frame["points"].dtype == object:
        frame["points"] = frame["points"].map(
            lambda cell: cell if pd.isna(cell) else ast.literal_eval(cell)
        )

    if block_type is not None:
        frame["block_type"] = block_type
    elif "block_type" not in frame.columns:
        raise ValueError("`block_type` not specified both in dataframe and arguments")

    # Automatically setting index for textblock
    has_text_feature = any(column in TextBlock._features for column in frame.columns)
    if has_text_feature and "id" not in frame.columns:
        frame["id"] = frame.index

    # One dict per row, with missing values dropped.
    records = frame.apply(lambda row: row.dropna().to_dict(), axis=1).to_list()
    return load_dict(records)
|
layoutparser/io/pdf.py
ADDED
|
@@ -0,0 +1,225 @@
|
|
|
1
|
+
# Copyright 2021 The Layout Parser team. All rights reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
from typing import List, Union, Optional, Dict, Tuple
|
|
16
|
+
|
|
17
|
+
import pdfplumber
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from ..elements import Layout
|
|
21
|
+
from .basic import load_dataframe
|
|
22
|
+
|
|
23
|
+
DEFAULT_PDF_DPI = 72
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def extract_words_for_page(
    page: pdfplumber.page.Page,
    x_tolerance=1.5,
    y_tolerance=2,
    keep_blank_chars=False,
    use_text_flow=True,
    horizontal_ltr=True,
    vertical_ttb=True,
    extra_attrs=None,
) -> Layout:
    """The helper function used for extracting words from a pdfplumber page
    object.

    Args:
        page (pdfplumber.page.Page): The page to extract the word tokens from.
        x_tolerance, y_tolerance, keep_blank_chars, use_text_flow,
        horizontal_ltr, vertical_ttb, extra_attrs:
            Forwarded unchanged to `page.extract_words`. When `extra_attrs`
            is None, `["fontname", "size"]` is used.

    Returns:
        Layout: a layout object representing all extracted pdf tokens on this page.
    """
    if extra_attrs is None:
        extra_attrs = ["fontname", "size"]

    tokens = page.extract_words(
        x_tolerance=x_tolerance,
        y_tolerance=y_tolerance,
        keep_blank_chars=keep_blank_chars,
        use_text_flow=use_text_flow,
        horizontal_ltr=horizontal_ltr,
        vertical_ttb=vertical_ttb,
        extra_attrs=extra_attrs,
    )

    df = pd.DataFrame(tokens)

    if len(df) == 0:
        return Layout()

    # Keep the token boxes inside the page. The bound is the exact page size:
    # truncating it to an integer would pull valid coordinates inward on pages
    # with fractional dimensions.
    page_width = float(page.width)
    page_height = float(page.height)
    df[["x0", "x1"]] = (
        df[["x0", "x1"]].clip(lower=0, upper=page_width).astype("float")
    )
    df[["top", "bottom"]] = (
        df[["top", "bottom"]].clip(lower=0, upper=page_height).astype("float")
    )

    page_tokens = load_dataframe(
        df.reset_index().rename(
            columns={
                "x0": "x_1",
                "x1": "x_2",
                "top": "y_1",
                "bottom": "y_2",
                "index": "id",
                "fontname": "type",  # also loading fontname as "type"
            }
        ),
        block_type="rectangle",
    )

    return page_tokens
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def load_pdf(
    filename: str,
    load_images: bool = False,
    x_tolerance: float = 1.5,
    y_tolerance: float = 2,
    keep_blank_chars: bool = False,
    use_text_flow: bool = True,
    horizontal_ltr: bool = True,
    vertical_ttb: bool = True,
    extra_attrs: Optional[List[str]] = None,
    dpi: int = DEFAULT_PDF_DPI,
) -> Union[List[Layout], Tuple[List[Layout], List["Image.Image"]]]:
    """Load all tokens for each page from a PDF file, and save them
    in a list of Layout objects with the original page order.

    Args:
        filename (str): The path to the PDF file.
        load_images (bool, optional):
            Whether to load a screenshot for each page of the PDF file.
            When set to true, the function will return both the layout and
            screenshot image for each page.
            Defaults to False.
        x_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the x_2 of one character and the x_1 of the next is less than
            or equal to x_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 1.5.
        y_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the y_2 of one character and the y_1 of the next is less than
            or equal to y_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 2.
        keep_blank_chars (bool, optional):
            When keep_blank_chars is set to True, blank characters are
            treated as part of a word, not as a space between words. See
            details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to False.
        use_text_flow (bool, optional):
            When use_text_flow is set to True, it will use the PDF's underlying
            flow of characters as a guide for ordering and segmenting the words,
            rather than presorting the characters by x/y position. (This mimics
            how dragging a cursor highlights text in a PDF; as with that, the
            order does not always appear to be logical.) See details in
            `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to True.
        horizontal_ltr (bool, optional):
            When horizontal_ltr is set to True, it means the doc should read
            text from left to right, vice versa.
            Defaults to True.
        vertical_ttb (bool, optional):
            When vertical_ttb is set to True, it means the doc should read
            text from top to bottom, vice versa.
            Defaults to True.
        extra_attrs (Optional[List[str]], optional):
            Passing a list of extra_attrs (e.g., ["fontname", "size"]) will
            restrict each word to characters that share exactly the same
            value for each of those `attributes extracted by pdfplumber
            <https://github.com/jsvine/pdfplumber/blob/develop/README.md#char-properties>`_,
            and the resulting word dicts will indicate those attributes.
            See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to `["fontname", "size"]`.
        dpi (int, optional):
            When loading images of the pdf, you can also specify the resolution
            (or `DPI, dots per inch <https://en.wikipedia.org/wiki/Dots_per_inch>`_)
            for rendering the images. Higher DPI values mean clearer images (also
            larger file sizes).
            Setting dpi will also automatically resize the extracted pdf_layout
            to match the sizes of the images. Therefore, when visualizing the
            pdf_layouts, it can be rendered appropriately.
            Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
            from the pdfplumber PDF parser.

    Returns:
        List[Layout]:
            When `load_images=False`, it will only load the pdf_tokens from
            the PDF file. Each element of the list denotes all the tokens appeared
            on a single page, and the list is ordered the same as the original PDF
            page order.
        Tuple[List[Layout], List["Image.Image"]]:
            When `load_images=True`, besides the `all_page_layout`, it will also
            return a list of page images.

    Examples::
        >>> import layoutparser as lp
        >>> pdf_layout = lp.load_pdf("path/to/pdf")
        >>> pdf_layout[0] # the layout for page 0
        >>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)
        >>> lp.draw_box(pdf_images[0], pdf_layout[0])
    """

    all_page_layout = []

    # Use the PDF as a context manager so the underlying file handle is
    # released once all pages have been processed, even if extraction fails.
    with pdfplumber.open(filename) as plumber_pdf_object:
        for page_id, cur_page in enumerate(plumber_pdf_object.pages):

            page_tokens = extract_words_for_page(
                cur_page,
                x_tolerance=x_tolerance,
                y_tolerance=y_tolerance,
                keep_blank_chars=keep_blank_chars,
                use_text_flow=use_text_flow,
                horizontal_ltr=horizontal_ltr,
                vertical_ttb=vertical_ttb,
                extra_attrs=extra_attrs,
            )

            # Adding metadata for the current page
            page_tokens.page_data["width"] = float(cur_page.width)
            page_tokens.page_data["height"] = float(cur_page.height)
            page_tokens.page_data["index"] = page_id

            all_page_layout.append(page_tokens)

    if not load_images:
        return all_page_layout

    # Imported lazily: pdf2image is only needed when page images are requested.
    import pdf2image

    pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)

    for page_id, page_image in enumerate(pdf_images):
        image_width, image_height = page_image.size
        page_layout = all_page_layout[page_id]
        layout_width = page_layout.page_data["width"]
        layout_height = page_layout.page_data["height"]
        if image_width != layout_width or image_height != layout_height:
            # Rescale the token boxes so they line up with the rendered image.
            scale_x = image_width / layout_width
            scale_y = image_height / layout_height
            page_layout = page_layout.scale((scale_x, scale_y))
            page_layout.page_data["width"] = image_width
            page_layout.page_data["height"] = image_height
            all_page_layout[page_id] = page_layout

    return all_page_layout, pdf_images
|