custom-layoutparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. custom_layoutparser-0.1.0.dist-info/METADATA +5 -0
  2. custom_layoutparser-0.1.0.dist-info/RECORD +36 -0
  3. custom_layoutparser-0.1.0.dist-info/WHEEL +5 -0
  4. custom_layoutparser-0.1.0.dist-info/top_level.txt +1 -0
  5. layoutparser/__init__.py +89 -0
  6. layoutparser/elements/__init__.py +25 -0
  7. layoutparser/elements/base.py +275 -0
  8. layoutparser/elements/errors.py +26 -0
  9. layoutparser/elements/layout.py +348 -0
  10. layoutparser/elements/layout_elements.py +1352 -0
  11. layoutparser/elements/utils.py +82 -0
  12. layoutparser/file_utils.py +235 -0
  13. layoutparser/io/__init__.py +2 -0
  14. layoutparser/io/basic.py +148 -0
  15. layoutparser/io/pdf.py +225 -0
  16. layoutparser/models/__init__.py +18 -0
  17. layoutparser/models/auto_layoutmodel.py +70 -0
  18. layoutparser/models/base_catalog.py +34 -0
  19. layoutparser/models/base_layoutmodel.py +88 -0
  20. layoutparser/models/detectron2/__init__.py +18 -0
  21. layoutparser/models/detectron2/catalog.py +142 -0
  22. layoutparser/models/detectron2/layoutmodel.py +168 -0
  23. layoutparser/models/effdet/__init__.py +16 -0
  24. layoutparser/models/effdet/catalog.py +88 -0
  25. layoutparser/models/effdet/layoutmodel.py +256 -0
  26. layoutparser/models/model_config.py +133 -0
  27. layoutparser/models/paddledetection/__init__.py +17 -0
  28. layoutparser/models/paddledetection/catalog.py +214 -0
  29. layoutparser/models/paddledetection/layoutmodel.py +297 -0
  30. layoutparser/ocr/__init__.py +16 -0
  31. layoutparser/ocr/base.py +41 -0
  32. layoutparser/ocr/gcv_agent.py +288 -0
  33. layoutparser/ocr/tesseract_agent.py +193 -0
  34. layoutparser/tools/__init__.py +5 -0
  35. layoutparser/tools/shape_operations.py +167 -0
  36. layoutparser/visualization.py +571 -0
@@ -0,0 +1,82 @@
1
+ # Copyright 2021 The Layout Parser team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Union, Dict, Dict, Any, Optional, Tuple
16
+
17
+ import numpy as np
18
+ from PIL import Image
19
+
20
+
21
def cvt_coordinates_to_points(coords: Tuple[float, float, float, float]) -> np.ndarray:
    """Expand an ``(x_1, y_1, x_2, y_2)`` box into its four corner points.

    Args:
        coords (Tuple[float, float, float, float]):
            The box coordinates in ``(x_1, y_1, x_2, y_2)`` order.

    Returns:
        np.ndarray:
            A 4x2 array with the corners ordered top-left, top-right,
            bottom-right, bottom-left.
    """
    left, top, right, bottom = coords
    corners = [
        (left, top),  # Top Left
        (right, top),  # Top Right
        (right, bottom),  # Bottom Right
        (left, bottom),  # Bottom Left
    ]
    return np.array(corners)
32
+
33
+
34
def cvt_points_to_coordinates(points: np.ndarray) -> Tuple[float, float, float, float]:
    """Compute the axis-aligned bounding box of a set of 2D points.

    Args:
        points (np.ndarray):
            An array indexed as ``points[:, 0]`` for x values and
            ``points[:, 1]`` for y values.

    Returns:
        Tuple[float, float, float, float]:
            ``(x_1, y_1, x_2, y_2)``, i.e. (min x, min y, max x, max y).
    """
    xs = points[:, 0]
    ys = points[:, 1]
    return (xs.min(), ys.min(), xs.max(), ys.max())
40
+
41
+
42
def perspective_transformation(
    M: np.ndarray, points: np.ndarray, is_inv: bool = False
) -> np.ndarray:
    """Apply a projective transformation matrix to a set of 2D points.

    Args:
        M (np.ndarray):
            The transformation matrix applied to the points in
            homogeneous coordinates.
        points (np.ndarray):
            The 2D points to transform, one point per row.
        is_inv (bool, optional):
            When True, the inverse of `M` is applied instead.
            Defaults to False.

    Returns:
        np.ndarray:
            The transformed points, one ``(x, y)`` pair per row.
    """
    matrix = np.linalg.inv(M) if is_inv else M

    # Lift the points to homogeneous coordinates: one column per point,
    # with a trailing row of ones.
    ones_column = np.ones((points.shape[0], 1))
    homogeneous = np.hstack([points, ones_column]).T

    projected = np.matmul(matrix, homogeneous)

    # Divide by the last (scale) row and drop it to get back to 2D.
    normalized = projected / projected[-1]
    return normalized.T[:, :2]
55
+
56
+
57
def vertice_in_polygon(vertice: np.ndarray, polygon_points: np.ndarray) -> bool:
    """Check whether a point lies inside (or on the border of) a convex polygon.

    The implementation is based on the algorithm from
    https://demonstrations.wolfram.com/AnEfficientTestForAPointToBeInAConvexPolygon/

    The point is accepted when, after shifting the origin to `vertice`,
    every pair of consecutive polygon vertices has a non-negative 2x2
    determinant. This holds, for example, for the top-left, top-right,
    bottom-right, bottom-left ordering (clockwise when the y axis points
    down, as in image coordinates). With the opposite vertex ordering the
    determinants would be <= 0 instead and this check would fail.

    Args:
        vertice (np.ndarray): The ``(x, y)`` point to test.
        polygon_points (np.ndarray): The polygon vertices, one per row.

    Returns:
        bool: True if all consecutive-vertex determinants are >= 0.
    """
    # Shift the coordinates origin to the vertice
    shifted = polygon_points - vertice
    # Repeat the first vertex at the end so the last edge closes the polygon
    closed = np.append(shifted, shifted[0:1, :], axis=0)

    for start, end in zip(closed[:-1], closed[1:]):
        if not (np.linalg.det([start, end]) >= 0):
            return False
    return True
67
+
68
+
69
def polygon_area(xs: np.ndarray, ys: np.ndarray) -> float:
    """Calculate the area of polygons using
    `Shoelace Formula <https://en.wikipedia.org/wiki/Shoelace_formula>`_.

    Args:
        xs (`np.ndarray`): The x coordinates of the points
        ys (`np.ndarray`): The y coordinates of the points

    Returns:
        float: The (non-negative) area enclosed by the points.
    """

    # Refer to: https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates
    # The formula is equivalent to the original one indicated in the wikipedia
    # page.
    forward_sum = np.dot(xs, np.roll(ys, 1))
    backward_sum = np.dot(ys, np.roll(xs, 1))
    signed_twice_area = forward_sum - backward_sum
    return 0.5 * np.abs(signed_twice_area)
@@ -0,0 +1,235 @@
1
+ # Copyright 2021 The Layout Parser team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ # Some of the code is adapted from
16
+ # https://github.com/huggingface/transformers/blob/master/src/transformers/file_utils.py
17
+
18
+ from typing import Any, BinaryIO, Dict, List, Optional, Tuple, Union
19
+ import sys
20
+ import os
21
+ import logging
22
+ import importlib.util
23
+ from types import ModuleType
24
+
25
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

# The package importlib_metadata is in a different place, depending on the python version.
if sys.version_info < (3, 8):
    import importlib_metadata
else:
    import importlib.metadata as importlib_metadata

###########################################
############ Layout Model Deps ############
###########################################

# Each backend below follows the same two-step check: the import name must be
# locatable with `find_spec`, AND the distribution metadata must be readable.
# If the metadata lookup fails, the availability flag is reset to False.

_torch_available = importlib.util.find_spec("torch") is not None
try:
    _torch_version = importlib_metadata.version("torch")
    logger.debug(f"PyTorch version {_torch_version} available.")
except importlib_metadata.PackageNotFoundError:
    _torch_available = False

_detectron2_available = importlib.util.find_spec("detectron2") is not None
try:
    _detectron2_version = importlib_metadata.version("detectron2")
    logger.debug(f"Detectron2 version {_detectron2_version} available")
except importlib_metadata.PackageNotFoundError:
    _detectron2_available = False

_paddle_available = importlib.util.find_spec("paddle") is not None
try:
    # The name of the paddlepaddle library:
    # Install name: pip install paddlepaddle
    # Import name: import paddle
    _paddle_version = importlib_metadata.version("paddlepaddle")
    logger.debug(f"Paddle version {_paddle_version} available.")
except importlib_metadata.PackageNotFoundError:
    _paddle_available = False

_effdet_available = importlib.util.find_spec("effdet") is not None
try:
    _effdet_version = importlib_metadata.version("effdet")
    logger.debug(f"Effdet version {_effdet_version} available.")
except importlib_metadata.PackageNotFoundError:
    # Reset the availability flag (not the version variable) so that
    # is_effdet_available() is consistent with the other backends.
    _effdet_available = False

###########################################
############## OCR Tool Deps ##############
###########################################

_pytesseract_available = importlib.util.find_spec("pytesseract") is not None
try:
    _pytesseract_version = importlib_metadata.version("pytesseract")
    logger.debug(f"Pytesseract version {_pytesseract_version} available.")
except importlib_metadata.PackageNotFoundError:
    _pytesseract_available = False

try:
    # find_spec on a dotted name imports the parent packages first and raises
    # ModuleNotFoundError when one of them is missing; hence the outer `try`.
    _gcv_available = importlib.util.find_spec("google.cloud.vision") is not None
    try:
        _gcv_version = importlib_metadata.version(
            "google-cloud-vision"
        )  # This is slightly different
        logger.debug(f"Google Cloud Vision Utils version {_gcv_version} available.")
    except importlib_metadata.PackageNotFoundError:
        _gcv_available = False
except ModuleNotFoundError:
    _gcv_available = False
90
+
91
+
92
def is_torch_available():
    """Return the module-level flag recording whether PyTorch was detected."""
    return _torch_available
94
+
95
+
96
def is_torch_cuda_available():
    """Return whether PyTorch is available and reports a usable CUDA device.

    `torch` is imported lazily so that this module can be loaded without it.
    """
    if not is_torch_available():
        return False

    import torch

    return torch.cuda.is_available()
103
+
104
+
105
def is_detectron2_available():
    """Return the module-level flag recording whether detectron2 was detected."""
    return _detectron2_available
107
+
108
+
109
def is_paddle_available():
    """Return the module-level flag recording whether PaddlePaddle was detected."""
    return _paddle_available
111
+
112
+
113
def is_effdet_available():
    """Return the module-level flag recording whether effdet was detected."""
    return _effdet_available
115
+
116
+
117
def is_pytesseract_available():
    """Return the module-level flag recording whether pytesseract was detected."""
    return _pytesseract_available
119
+
120
+
121
def is_gcv_available():
    """Return the module-level flag recording whether Google Cloud Vision was detected."""
    return _gcv_available
123
+
124
+
125
+ PYTORCH_IMPORT_ERROR = """
126
+ {0} requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
127
+ installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
128
+ """
129
+
130
+ DETECTRON2_IMPORT_ERROR = """
131
+ {0} requires the detectron2 library but it was not found in your environment. Checkout the instructions on the
132
+ installation page: https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md and follow the ones
133
+ that match your environment. Typically the following would work for MacOS or Linux CPU machines:
134
+ pip install 'git+https://github.com/facebookresearch/detectron2.git@v0.4#egg=detectron2'
135
+ """
136
+
137
+ PADDLE_IMPORT_ERROR = """
138
+ {0} requires the PaddlePaddle library but it was not found in your environment. Checkout the instructions on the
139
+ installation page: https://github.com/PaddlePaddle/Paddle and follow the ones that match your environment.
140
+ """
141
+
142
+ EFFDET_IMPORT_ERROR = """
143
+ {0} requires the effdet library but it was not found in your environment. You can install it with pip:
144
+ `pip install effdet`
145
+ """
146
+
147
+ PYTESSERACT_IMPORT_ERROR = """
148
+ {0} requires the PyTesseract library but it was not found in your environment. You can install it with pip:
149
+ `pip install pytesseract`
150
+ """
151
+
152
+ GCV_IMPORT_ERROR = """
153
+ {0} requires the Google Cloud Vision Python utils but it was not found in your environment. You can install it with pip:
154
+ `pip install google-cloud-vision==1`
155
+ """
156
+
157
# Maps a backend name to a pair of
# (availability checker, error message template taking the requester's name).
BACKENDS_MAPPING = {
    "torch": (is_torch_available, PYTORCH_IMPORT_ERROR),
    "detectron2": (is_detectron2_available, DETECTRON2_IMPORT_ERROR),
    "paddle": (is_paddle_available, PADDLE_IMPORT_ERROR),
    "effdet": (is_effdet_available, EFFDET_IMPORT_ERROR),
    "pytesseract": (is_pytesseract_available, PYTESSERACT_IMPORT_ERROR),
    "google-cloud-vision": (is_gcv_available, GCV_IMPORT_ERROR),
}
167
+
168
+
169
def requires_backends(obj, backends):
    """Raise an ImportError if any of the required backends is unavailable.

    Args:
        obj:
            The object (function, class or instance) that needs the backends.
            Its `__name__` (or its class name) is inserted in the error message.
        backends (str or list/tuple of str):
            One or more keys of `BACKENDS_MAPPING`.

    Raises:
        ImportError:
            If at least one backend is missing. The message contains the
            installation instructions of the missing backends only, so users
            are not told to install libraries they already have.
        KeyError:
            If a backend name is not a key of `BACKENDS_MAPPING`.
    """
    if not isinstance(backends, (list, tuple)):
        backends = [backends]

    name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__

    # Resolve every requested backend first so an unknown name still raises
    # KeyError, then keep the messages of the unavailable ones only.
    checks = [BACKENDS_MAPPING[backend] for backend in backends]
    missing = [message.format(name) for is_available, message in checks if not is_available()]
    if missing:
        raise ImportError("".join(missing))
178
+
179
+
180
+ class _LazyModule(ModuleType):
181
+ """
182
+ Module class that surfaces all objects but only performs associated imports when the objects are requested.
183
+ """
184
+
185
+ # Adapted from HuggingFace
186
+ # https://github.com/huggingface/transformers/blob/c37573806ab3526dd805c49cbe2489ad4d68a9d7/src/transformers/file_utils.py#L1990
187
+
188
+ def __init__(
189
+ self, name, module_file, import_structure, module_spec=None, extra_objects=None
190
+ ):
191
+ super().__init__(name)
192
+ self._modules = set(import_structure.keys())
193
+ self._class_to_module = {}
194
+ for key, values in import_structure.items():
195
+ for value in values:
196
+ self._class_to_module[value] = key
197
+ # Needed for autocompletion in an IDE
198
+ self.__all__ = list(import_structure.keys()) + sum(
199
+ import_structure.values(), []
200
+ )
201
+ self.__file__ = module_file
202
+ self.__spec__ = module_spec
203
+ self.__path__ = [os.path.dirname(module_file)]
204
+ self._objects = {} if extra_objects is None else extra_objects
205
+ self._name = name
206
+ self._import_structure = import_structure
207
+
208
+ # Following [PEP 366](https://www.python.org/dev/peps/pep-0366/)
209
+ # The __package__ variable should be set
210
+ # https://docs.python.org/3/reference/import.html#__package__
211
+ self.__package__ = self.__name__
212
+
213
+ # Needed for autocompletion in an IDE
214
+ def __dir__(self):
215
+ return super().__dir__() + self.__all__
216
+
217
+ def __getattr__(self, name: str) -> Any:
218
+ if name in self._objects:
219
+ return self._objects[name]
220
+ if name in self._modules:
221
+ value = self._get_module(name)
222
+ elif name in self._class_to_module.keys():
223
+ module = self._get_module(self._class_to_module[name])
224
+ value = getattr(module, name)
225
+ else:
226
+ raise AttributeError(f"module {self.__name__} has no attribute {name}")
227
+
228
+ setattr(self, name, value)
229
+ return value
230
+
231
+ def _get_module(self, module_name: str):
232
+ return importlib.import_module("." + module_name, self.__name__)
233
+
234
+ def __reduce__(self):
235
+ return (self.__class__, (self._name, self.__file__, self._import_structure))
@@ -0,0 +1,2 @@
1
+ from .basic import load_json, load_dict, load_csv, load_dataframe
2
+ from .pdf import load_pdf
@@ -0,0 +1,148 @@
1
+ # Copyright 2021 The Layout Parser team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import ast
16
+ import json
17
+ from typing import List, Union, Dict, Dict, Any
18
+
19
+ import pandas as pd
20
+
21
+ from ..elements import (
22
+ BaseLayoutElement,
23
+ TextBlock,
24
+ Layout,
25
+ BASECOORD_ELEMENT_NAMEMAP,
26
+ )
27
+
28
+
29
def load_json(filename: str) -> Union[BaseLayoutElement, Layout]:
    """Load a JSON file and save it as a layout object with appropriate data types.

    Args:
        filename (str):
            The name of the JSON file.

    Returns:
        Union[BaseLayoutElement, Layout]:
            Based on the JSON file format, it will automatically parse
            the type of the data and load it accordingly.
    """
    # JSON text is UTF-8 by specification; do not rely on the locale's
    # default encoding, which may differ across platforms.
    with open(filename, "r", encoding="utf-8") as fp:
        res = json.load(fp)

    return load_dict(res)
45
+
46
+
47
def load_dict(data: Union[Dict, List[Dict]]) -> Union[BaseLayoutElement, Layout]:
    """Load a dict or a list of dict representations of some layout data,
    automatically parse its type, and save it as any of BaseLayoutElement
    or Layout datatype.

    Args:
        data (Union[Dict, List]):
            A dict or a list of dict representations of the layout data

    Raises:
        ValueError:
            If the data format is incompatible with the layout-data-JSON format,
            raise a `ValueError`.
        ValueError:
            If any `block_type` name is not in the available list of layout element
            names defined in `BASECOORD_ELEMENT_NAMEMAP`, raise a `ValueError`.

    Returns:
        Union[BaseLayoutElement, Layout]:
            Based on the dict format, it will automatically parse the type of
            the data and load it accordingly.
    """
    if isinstance(data, list):
        # A list of element dicts becomes a Layout of the parsed elements
        return Layout([load_dict(item) for item in data])

    if not isinstance(data, dict):
        raise ValueError("Invalid input JSON structure.")

    if "page_data" in data:
        # It is a layout instance
        parsed_blocks = load_dict(data["blocks"])
        return Layout(parsed_blocks._blocks, page_data=data["page_data"])

    if data["block_type"] not in BASECOORD_ELEMENT_NAMEMAP:
        raise ValueError(f"Invalid block_type {data['block_type']}")

    # Any TextBlock-specific feature marks the dict as a textblock
    for feature in TextBlock._features:
        if feature in data:
            return TextBlock.from_dict(data)

    element_cls = BASECOORD_ELEMENT_NAMEMAP[data["block_type"]]
    return element_cls.from_dict(data)
90
+
91
+
92
def load_csv(filename: str, block_type: str = None) -> Layout:
    """Load the Layout object from the given CSV file.

    Args:
        filename (str):
            The name of the CSV file. A row of the table represents
            an individual layout element.

        block_type (str):
            If there's no block_type column in the CSV file,
            you must pass in a block_type variable such that layout parser
            can appropriately detect the type of the layout elements.

    Returns:
        Layout:
            The parsed Layout object from the CSV file.
    """
    table = pd.read_csv(filename)
    return load_dataframe(table, block_type=block_type)
111
+
112
+
113
def load_dataframe(df: pd.DataFrame, block_type: str = None) -> Layout:
    """Load the Layout object from the given dataframe.

    Args:
        df (pd.DataFrame):
            The table of layout elements, one element per row. The input
            is copied and never modified in place.

        block_type (str):
            If there's no block_type column in the dataframe,
            you must pass in a block_type variable such that layout parser
            can appropriately detect the type of the layout elements.
            When given, it overwrites the block_type of every row.

    Raises:
        ValueError:
            If `block_type` is neither passed in nor a column of `df`.

    Returns:
        Layout:
            The parsed Layout object from the dataframe.
    """
    frame = df.copy()

    # Stringified point lists are parsed back into Python literals;
    # missing values are kept as they are.
    if "points" in frame.columns and frame["points"].dtype == object:
        frame["points"] = frame["points"].map(
            lambda raw: raw if pd.isna(raw) else ast.literal_eval(raw)
        )

    if block_type is not None:
        frame["block_type"] = block_type
    elif "block_type" not in frame.columns:
        raise ValueError(
            "`block_type` not specified both in dataframe and arguments"
        )

    has_text_features = any(col in TextBlock._features for col in frame.columns)
    if has_text_features and "id" not in frame.columns:
        # Automatically setting index for textblock
        frame["id"] = frame.index

    # One dict per row, with the missing (NaN) fields dropped
    records = frame.apply(lambda row: row.dropna().to_dict(), axis=1).to_list()
    return load_dict(records)
layoutparser/io/pdf.py ADDED
@@ -0,0 +1,225 @@
1
+ # Copyright 2021 The Layout Parser team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from typing import List, Union, Optional, Dict, Tuple
16
+
17
+ import pdfplumber
18
+ import pandas as pd
19
+
20
+ from ..elements import Layout
21
+ from .basic import load_dataframe
22
+
23
+ DEFAULT_PDF_DPI = 72
24
+
25
+
26
def extract_words_for_page(
    page: pdfplumber.page.Page,
    x_tolerance=1.5,
    y_tolerance=2,
    keep_blank_chars=False,
    use_text_flow=True,
    horizontal_ltr=True,
    vertical_ttb=True,
    extra_attrs=None,
) -> Layout:
    """The helper function used for extracting words from a pdfplumber page
    object.

    All keyword arguments are forwarded to `page.extract_words`; when
    `extra_attrs` is None, `["fontname", "size"]` is used.

    Returns:
        Layout: a layout object representing all extracted pdf tokens on this page.
    """
    word_attrs = ["fontname", "size"] if extra_attrs is None else extra_attrs

    words = page.extract_words(
        x_tolerance=x_tolerance,
        y_tolerance=y_tolerance,
        keep_blank_chars=keep_blank_chars,
        use_text_flow=use_text_flow,
        horizontal_ltr=horizontal_ltr,
        vertical_ttb=vertical_ttb,
        extra_attrs=word_attrs,
    )

    df = pd.DataFrame(words)
    if len(df) == 0:
        return Layout()

    # Keep the token boxes within the page boundaries
    x_columns = ["x0", "x1"]
    y_columns = ["top", "bottom"]
    df[x_columns] = (
        df[x_columns].clip(lower=0, upper=int(page.width)).astype("float")
    )
    df[y_columns] = (
        df[y_columns].clip(lower=0, upper=int(page.height)).astype("float")
    )

    column_names = {
        "x0": "x_1",
        "x1": "x_2",
        "top": "y_1",
        "bottom": "y_2",
        "index": "id",
        "fontname": "type",  # also loading fontname as "type"
    }
    renamed = df.reset_index().rename(columns=column_names)

    return load_dataframe(renamed, block_type="rectangle")
82
+
83
+
84
def load_pdf(
    filename: str,
    load_images: bool = False,
    x_tolerance: float = 1.5,
    y_tolerance: float = 2,
    keep_blank_chars: bool = False,
    use_text_flow: bool = True,
    horizontal_ltr: bool = True,
    vertical_ttb: bool = True,
    extra_attrs: Optional[List[str]] = None,
    dpi: int = DEFAULT_PDF_DPI,
) -> Union[List[Layout], Tuple[List[Layout], List["Image.Image"]]]:
    """Load all tokens for each page from a PDF file, and save them
    in a list of Layout objects with the original page order.

    Args:
        filename (str): The path to the PDF file.
        load_images (bool, optional):
            Whether to load a screenshot for each page of the PDF file.
            When set to true, the function will return both the layout and
            screenshot image for each page.
            Defaults to False.
        x_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the x_2 of one character and the x_1 of the next is less than
            or equal to x_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 1.5.
        y_tolerance (float, optional):
            The threshold used for extracting "word tokens" from the pdf file.
            It will merge the pdf characters into a word token if the difference
            between the y_2 of one character and the y_1 of the next is less than
            or equal to y_tolerance. See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to 2.
        keep_blank_chars (bool, optional):
            When keep_blank_chars is set to True, blank characters are
            treated as part of a word, not as a space between words. See
            details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to False.
        use_text_flow (bool, optional):
            When use_text_flow is set to True, it will use the PDF's underlying
            flow of characters as a guide for ordering and segmenting the words,
            rather than presorting the characters by x/y position. (This mimics
            how dragging a cursor highlights text in a PDF; as with that, the
            order does not always appear to be logical.) See details in
            `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to True.
        horizontal_ltr (bool, optional):
            When horizontal_ltr is set to True, it means the doc should read
            text from left to right, and vice versa.
            Defaults to True.
        vertical_ttb (bool, optional):
            When vertical_ttb is set to True, it means the doc should read
            text from top to bottom, and vice versa.
            Defaults to True.
        extra_attrs (Optional[List[str]], optional):
            Passing a list of extra_attrs (e.g., ["fontname", "size"]) will
            restrict each word to characters that share exactly the same
            value for each of those `attributes extracted by pdfplumber
            <https://github.com/jsvine/pdfplumber/blob/develop/README.md#char-properties>`_,
            and the resulting word dicts will indicate those attributes.
            See details in `pdfplumber's documentation
            <https://github.com/jsvine/pdfplumber#the-pdfplumberpage-class>`_.
            Defaults to `["fontname", "size"]`.
        dpi (int, optional):
            When loading images of the pdf, you can also specify the resolution
            (or `DPI, dots per inch <https://en.wikipedia.org/wiki/Dots_per_inch>`_)
            for rendering the images. Higher DPI values mean clearer images (also
            larger file sizes).
            Setting dpi will also automatically resize the extracted pdf_layout
            to match the sizes of the images. Therefore, when visualizing the
            pdf_layouts, it can be rendered appropriately.
            Defaults to `DEFAULT_PDF_DPI=72`, which is also the default rendering dpi
            from the pdfplumber PDF parser.

    Returns:
        List[Layout]:
            When `load_images=False`, it will only load the pdf_tokens from
            the PDF file. Each element of the list denotes all the tokens that
            appear on a single page, and the list is ordered the same as the
            original PDF page order.
        Tuple[List[Layout], List["Image.Image"]]:
            When `load_images=True`, besides the `all_page_layout`, it will also
            return a list of page images.

    Examples::
        >>> import layoutparser as lp
        >>> pdf_layout = lp.load_pdf("path/to/pdf")
        >>> pdf_layout[0] # the layout for page 0
        >>> pdf_layout, pdf_images = lp.load_pdf("path/to/pdf", load_images=True)
        >>> lp.draw_box(pdf_images[0], pdf_layout[0])
    """

    all_page_layout = []

    # Use the PDF object as a context manager so that the underlying file
    # handle is released even if the extraction of a page fails.
    with pdfplumber.open(filename) as plumber_pdf_object:
        for page_id, cur_page in enumerate(plumber_pdf_object.pages):
            page_tokens = extract_words_for_page(
                cur_page,
                x_tolerance=x_tolerance,
                y_tolerance=y_tolerance,
                keep_blank_chars=keep_blank_chars,
                use_text_flow=use_text_flow,
                horizontal_ltr=horizontal_ltr,
                vertical_ttb=vertical_ttb,
                extra_attrs=extra_attrs,
            )

            # Adding metadata for the current page
            page_tokens.page_data["width"] = float(cur_page.width)
            page_tokens.page_data["height"] = float(cur_page.height)
            page_tokens.page_data["index"] = page_id

            all_page_layout.append(page_tokens)

    if not load_images:
        return all_page_layout

    # Imported lazily: pdf2image is only needed when page images are requested
    import pdf2image

    pdf_images = pdf2image.convert_from_path(filename, dpi=dpi)

    for page_id, page_image in enumerate(pdf_images):
        image_width, image_height = page_image.size
        page_layout = all_page_layout[page_id]
        layout_width = page_layout.page_data["width"]
        layout_height = page_layout.page_data["height"]
        if image_width != layout_width or image_height != layout_height:
            # Rescale the tokens so that they line up with the rendered image
            scale_x = image_width / layout_width
            scale_y = image_height / layout_height
            page_layout = page_layout.scale((scale_x, scale_y))
            page_layout.page_data["width"] = image_width
            page_layout.page_data["height"] = image_height
            all_page_layout[page_id] = page_layout

    return all_page_layout, pdf_images