PyPI - custom-layoutparser - Versions diffs - 0.1.0__py3-none-any.whl - Mend

custom-layoutparser 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

custom_layoutparser-0.1.0.dist-info/METADATA +5 -0
custom_layoutparser-0.1.0.dist-info/RECORD +36 -0
custom_layoutparser-0.1.0.dist-info/WHEEL +5 -0
custom_layoutparser-0.1.0.dist-info/top_level.txt +1 -0
layoutparser/__init__.py +89 -0
layoutparser/elements/__init__.py +25 -0
layoutparser/elements/base.py +275 -0
layoutparser/elements/errors.py +26 -0
layoutparser/elements/layout.py +348 -0
layoutparser/elements/layout_elements.py +1352 -0
layoutparser/elements/utils.py +82 -0
layoutparser/file_utils.py +235 -0
layoutparser/io/__init__.py +2 -0
layoutparser/io/basic.py +148 -0
layoutparser/io/pdf.py +225 -0
layoutparser/models/__init__.py +18 -0
layoutparser/models/auto_layoutmodel.py +70 -0
layoutparser/models/base_catalog.py +34 -0
layoutparser/models/base_layoutmodel.py +88 -0
layoutparser/models/detectron2/__init__.py +18 -0
layoutparser/models/detectron2/catalog.py +142 -0
layoutparser/models/detectron2/layoutmodel.py +168 -0
layoutparser/models/effdet/__init__.py +16 -0
layoutparser/models/effdet/catalog.py +88 -0
layoutparser/models/effdet/layoutmodel.py +256 -0
layoutparser/models/model_config.py +133 -0
layoutparser/models/paddledetection/__init__.py +17 -0
layoutparser/models/paddledetection/catalog.py +214 -0
layoutparser/models/paddledetection/layoutmodel.py +297 -0
layoutparser/ocr/__init__.py +16 -0
layoutparser/ocr/base.py +41 -0
layoutparser/ocr/gcv_agent.py +288 -0
layoutparser/ocr/tesseract_agent.py +193 -0
layoutparser/tools/__init__.py +5 -0
layoutparser/tools/shape_operations.py +167 -0
layoutparser/visualization.py +571 -0

layoutparser/models/effdet/layoutmodel.py ADDED Viewed

@@ -0,0 +1,256 @@
+# Copyright 2021 The Layout Parser team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Optional, Union, Dict, Any, Tuple
+from PIL import Image
+import numpy as np
+from .catalog import PathManager, LABEL_MAP_CATALOG, MODEL_CATALOG
+from ..base_layoutmodel import BaseLayoutModel
+from ...elements import Rectangle, TextBlock, Layout
+from ...file_utils import is_effdet_available, is_torch_cuda_available
+# torch and effdet are optional dependencies: they are imported only when
+# is_effdet_available() reports that effdet can be used.
+if is_effdet_available():
+    import torch
+    from effdet import create_model
+    from effdet.data.transforms import (
+        IMAGENET_DEFAULT_MEAN,
+        IMAGENET_DEFAULT_STD,
+        transforms_coco_eval,
+    )
+else:
+    # Copied from https://github.com/rwightman/efficientdet-pytorch/blob/c5b694aa34900fdee6653210d856ca8320bf7d4e/effdet/data/transforms.py#L13
+    # so that the default arguments of InputTransform below can still be
+    # evaluated at import time when effdet is not installed.
+    # NOTE: torch, create_model and transforms_coco_eval stay undefined in this
+    # branch, so the classes below cannot be instantiated without effdet.
+    IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
+    IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
+    # IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)
+    # IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)
+class InputTransform:
+    """Convert a PIL image into the normalized, batched tensor fed to the model.
+    Args:
+        image_size:
+            Target size forwarded unchanged to effdet's `transforms_coco_eval`.
+        mean (optional):
+            Per-channel mean on a 0-1 scale. Defaults to `IMAGENET_DEFAULT_MEAN`.
+        std (optional):
+            Per-channel standard deviation on a 0-1 scale. Defaults to
+            `IMAGENET_DEFAULT_STD`.
+    """
+    def __init__(
+        self,
+        image_size,
+        mean=IMAGENET_DEFAULT_MEAN,
+        std=IMAGENET_DEFAULT_STD,
+    ):
+        self.mean = mean
+        self.std = std
+        # NOTE(review): with use_prefetcher=True the effdet transform presumably
+        # leaves pixels un-normalized, which is why preprocess() normalizes
+        # explicitly -- confirm against the effdet version in use.
+        self.transform = transforms_coco_eval(
+            image_size,
+            interpolation="bilinear",
+            use_prefetcher=True,
+            fill_color="mean",
+            mean=self.mean,
+            std=self.std,
+        )
+        # Rescale the 0-1 statistics to the 0-255 pixel range and shape them as
+        # (1, 3, 1, 1) so they broadcast over a batched 3-channel image tensor.
+        self.mean_tensor = torch.tensor([x * 255 for x in mean]).view(1, 3, 1, 1)
+        self.std_tensor = torch.tensor([x * 255 for x in std]).view(1, 3, 1, 1)
+    def preprocess(self, image: "Image.Image") -> Tuple["torch.Tensor", Dict]:
+        """Transform `image` and return `(batch, image_info)`.
+        Args:
+            image: The PIL image to process; it is converted to RGB first.
+        Returns:
+            A tuple of the normalized float tensor with a leading batch
+            dimension of 1, and a dict whose values (starting from `img_size`
+            and whatever the transform adds) are tensors with a leading batch
+            dimension of 1.
+        """
+        image = image.convert("RGB")
+        image_info = {"img_size": image.size}
+        pixels, image_info = self.transform(image, image_info)
+        image_info = {
+            key: torch.tensor(val).unsqueeze(0) for key, val in image_info.items()
+        }
+        batch = torch.tensor(pixels).unsqueeze(0)
+        # In-place ops are safe here: `batch.float()` is a fresh tensor unless
+        # the data is already float, and `batch` is not shared with the caller.
+        batch = batch.float().sub_(self.mean_tensor).div_(self.std_tensor)
+        return batch, image_info
+class EfficientDetLayoutModel(BaseLayoutModel):
+    """Create an EfficientDet-based Layout Detection Model
+    Args:
+        config_path (:obj:`str`):
+            The path to the configuration file.
+        model_path (:obj:`str`, None):
+            The path to the saved weights of the model.
+            If set, overwrite the weights in the configuration file.
+            Defaults to `None`.
+        label_map (:obj:`dict`, optional):
+            The map from the model prediction (ids) to real
+            word labels (strings). If the config is from one of the supported
+            datasets, Layout Parser will automatically initialize the label_map.
+            Defaults to `None`.
+        enforce_cpu(:obj:`bool`, optional):
+            When set to `True`, it will enforce using cpu even if it is on a CUDA
+            available device. Takes precedence over `device`.
+        device (:obj:`str`, optional):
+            The torch device the model runs on. When left as `None`, "cuda" is
+            used if CUDA is available and "cpu" otherwise. When CUDA is not
+            available the value is always replaced by "cpu".
+        extra_config (:obj:`dict`, optional):
+            Extra configuration passed to the EfficientDet model
+            configuration. Currently supported arguments:
+                num_classes: specifying the number of classes for the models
+                output_confidence_threshold: minimum object prediction confidence to retain
+    Examples::
+        >>> import layoutparser as lp
+        >>> model = lp.EfficientDetLayoutModel("lp://PubLayNet/tf_efficientdet_d0/config")
+        >>> model.detect(image)
+    """
+    DEPENDENCIES = ["effdet"]
+    DETECTOR_NAME = "efficientdet"
+    MODEL_CATALOG = MODEL_CATALOG
+    DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD = 0.25
+    def __init__(
+        self,
+        config_path: str,
+        model_path: str = None,
+        label_map: Optional[Dict] = None,
+        extra_config: Optional[Dict] = None,
+        enforce_cpu: bool = False,
+        device: str = None,
+    ):
+        # `enforce_cpu` used to be accepted but ignored; honor it as documented.
+        if enforce_cpu or not is_torch_cuda_available():
+            device = "cpu"
+        elif device is None:
+            device = "cuda"
+        self.device = device
+        extra_config = extra_config if extra_config is not None else {}
+        self._initialize_model(config_path, model_path, label_map, extra_config)
+        self.output_confidence_threshold = extra_config.get(
+            "output_confidence_threshold", self.DEFAULT_OUTPUT_CONFIDENCE_THRESHOLD
+        )
+        self.preprocessor = InputTransform(self.config.image_size)
+    def _initialize_model(
+        self,
+        config_path: str,
+        model_path: Optional[str],
+        label_map: Optional[Dict],
+        extra_config: Optional[Dict],
+    ):
+        """Resolve the config, build the effdet model and store it on `self`.
+        Sets `self.model`, `self.config` and `self.label_map`.
+        Raises:
+            AssertionError: for a non-`lp://` config when `model_path` is missing,
+                or when neither `label_map` nor `extra_config["num_classes"]`
+                is given.
+        """
+        lp_prefix = "lp://"
+        config_path, model_path = self.config_parser(config_path, model_path)
+        if config_path.startswith(lp_prefix):
+            # If it's officially supported by layoutparser.
+            # Slice the prefix off: str.lstrip("lp://") strips a *character set*
+            # and would also eat leading 'l'/'p' letters of the next segment.
+            # The [1:3] slice skips the first segment -- presumably the backend
+            # name added by config_parser; verify against BaseLayoutModel.
+            dataset_name, model_name = config_path[len(lp_prefix) :].split("/")[1:3]
+            if label_map is None:
+                label_map = LABEL_MAP_CATALOG[dataset_name]
+            num_classes = len(label_map)
+        else:
+            assert (
+                model_path is not None
+            ), "When the specified model is not layoutparser-based, you need to specify the model_path"
+            assert (
+                label_map is not None or "num_classes" in extra_config
+            ), "When the specified model is not layoutparser-based, you need to specify the label_map or add num_classes in the extra_config"
+            model_name = config_path
+            num_classes = len(label_map) if label_map else extra_config["num_classes"]
+        # model_path might be an lp:// or https URL; map it to a local file.
+        model_path = PathManager.get_local_path(model_path)
+        self.model = create_model(
+            model_name,
+            num_classes=num_classes,
+            bench_task="predict",
+            pretrained=True,
+            checkpoint_path=model_path,
+        )
+        self.model.to(self.device)
+        self.model.eval()
+        self.config = self.model.config
+        self.label_map = label_map if label_map is not None else {}
+    def detect(self, image: Union["np.ndarray", "Image.Image"]):
+        """Detect the layout of `image` and return it as a `Layout`.
+        Args:
+            image: A PIL image, or a numpy array that is assumed to be in
+                BGR channel order (see `image_loader`).
+        """
+        image = self.image_loader(image)
+        model_inputs, image_info = self.preprocessor.preprocess(image)
+        model_outputs = self.model(
+            model_inputs.to(self.device),
+            {key: val.to(self.device) for key, val in image_info.items()},
+        )
+        layout = self.gather_output(model_outputs)
+        return layout
+    def gather_output(self, model_outputs: "torch.Tensor") -> Layout:
+        """Convert raw model predictions into a `Layout` of `TextBlock`s.
+        Each detection row is read as `[x_1, y_1, x_2, y_2, score, category]`.
+        Detections of a sample are consumed until the first one whose score is
+        below `self.output_confidence_threshold`.
+        """
+        model_outputs = model_outputs.cpu().detach()
+        box_predictions = Layout()
+        for index, sample in enumerate(model_outputs):
+            for det in sample:
+                score = float(det[4])
+                if (
+                    score < self.output_confidence_threshold
+                ):  # stop when below this threshold, scores in descending order
+                    break
+                pred_cat = int(det[5])
+                # Use the corner coordinates directly rather than converting to
+                # width/height in place and back, which mutated the model output
+                # and added floating point rounding error.
+                x_1, y_1, x_2, y_2 = det[0:4].tolist()
+                box_predictions.append(
+                    TextBlock(
+                        block=Rectangle(x_1, y_1, x_2, y_2),
+                        score=score,
+                        id=index,
+                        type=self.label_map.get(pred_cat, pred_cat),
+                    )
+                )
+        return box_predictions
+    def image_loader(self, image: Union["np.ndarray", "Image.Image"]):
+        """Return `image` as a PIL image; other input types pass through as is."""
+        # Convert cv2 Image Input
+        if isinstance(image, np.ndarray):
+            # In this case, we assume the image is loaded by cv2
+            # and the channel order is BGR
+            image = image[..., ::-1]
+            image = Image.fromarray(image, mode="RGB")
+        return image

layoutparser/models/model_config.py ADDED Viewed

@@ -0,0 +1,133 @@
+# Copyright 2021 The Layout Parser team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Inside layoutparser, we support the following formats for specifying layout model configs
+or weights:
+1. URL-based formats:
+    - A local path: ~/models/publaynet/path
+    - Link to the models: https://web/url/to/models
+2. LayoutParser Based Model/Config Path Formats:
+    - Full format: lp://<backend-name>/<dataset-name>/<model-architecture-name>
+    - Short format: lp://<dataset-name>/<model-architecture-name>
+    - Brief format: lp://<dataset-name>
+For each LayoutParser-based format, you could also add a `config` or `weight` identifier
+after them:
+    - Full format: lp://<backend-name>/<dataset-name>/<model-architecture-name>/<config, weight>
+    - Short format: lp://<dataset-name>/<model-architecture-name>/<config, weight>
+    - Brief format: lp://<dataset-name>/<config, weight>
+"""
+from dataclasses import dataclass
+# Scheme prefix that marks a config/weight path as a LayoutParser model path.
+LAYOUT_PARSER_MODEL_PREFIX = "lp://"
+# The only values accepted as the trailing identifier segment of such a path.
+ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES = ["config", "weight"]
+@dataclass
+class LayoutModelConfig:
+    """The parsed pieces of a LayoutParser (`lp://`) model path.
+    `identifier` must be one of `ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES`; this is
+    asserted right after construction.
+    """
+    backend_name: str
+    dataset_name: str
+    model_arch: str
+    identifier: str
+    def __post_init__(self):
+        assert self.identifier in ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES
+    @property
+    def full(self):
+        """`lp://<backend-name>/<dataset-name>/<model-arch>/<identifier>`."""
+        segments = (
+            self.backend_name,
+            self.dataset_name,
+            self.model_arch,
+            self.identifier,
+        )
+        return LAYOUT_PARSER_MODEL_PREFIX + "/".join(segments)
+    @property
+    def short(self):
+        """`lp://<dataset-name>/<model-arch>/<identifier>`."""
+        segments = (self.dataset_name, self.model_arch, self.identifier)
+        return LAYOUT_PARSER_MODEL_PREFIX + "/".join(segments)
+    @property
+    def brief(self):
+        """`lp://<dataset-name>/<model-arch>`, without the identifier."""
+        segments = (self.dataset_name, self.model_arch)
+        return LAYOUT_PARSER_MODEL_PREFIX + "/".join(segments)
+    def dual(self):
+        """Return a copy that carries the other allowed identifier."""
+        # First allowed name that differs from ours; the fallback mirrors what a
+        # plain loop would leave behind if no name differed.
+        counterpart = next(
+            (
+                name
+                for name in ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES
+                if name != self.identifier
+            ),
+            ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES[-1],
+        )
+        return type(self)(
+            backend_name=self.backend_name,
+            dataset_name=self.dataset_name,
+            model_arch=self.model_arch,
+            identifier=counterpart,
+        )
+def is_lp_layout_model_config_any_format(config: str) -> bool:
+    """Tell whether `config` has the shape of any LayoutParser model path.
+    It must start with `LAYOUT_PARSER_MODEL_PREFIX` and the remainder must
+    consist of one to four "/"-separated segments.
+    """
+    if not config.startswith(LAYOUT_PARSER_MODEL_PREFIX):
+        return False
+    remainder = config[len(LAYOUT_PARSER_MODEL_PREFIX) :]
+    # Splitting on "/" yields one more segment than there are separators.
+    segment_count = remainder.count("/") + 1
+    return 1 <= segment_count <= 4
+def add_identifier_for_config(config: str, identifier: str) -> str:
+    """Return `config` ending in exactly one `/<identifier>` segment.
+    Trailing slashes are dropped, and an identifier segment that is already
+    present is not duplicated.
+    Args:
+        config: The model path to extend, e.g. `lp://PubLayNet/tf_efficientdet_d0`.
+        identifier: The segment to append, e.g. `config` or `weight`.
+    """
+    suffix = f"/{identifier}"
+    base = config.rstrip("/")
+    # Remove the suffix as a whole string. str.rstrip(suffix) treats its
+    # argument as a set of characters and would also eat the tail of the
+    # preceding segment (e.g. ".../right" + "weight" became ".../r").
+    if base.endswith(suffix):
+        base = base[: -len(suffix)].rstrip("/")
+    return base + suffix
+def layout_model_config_parser(
+    config, backend_name=None, model_arch=None
+) -> LayoutModelConfig:
+    """Parse a LayoutParser model path into a `LayoutModelConfig`.
+    Args:
+        config: A path starting with `lp://` whose last segment is one of
+            `ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES`.
+        backend_name: Backend to assume when the path does not spell it out.
+            Required for three- and two-segment paths.
+        model_arch: Model architecture to assume when the path does not spell
+            it out. Required for two-segment paths and for three-segment paths
+            that start with `backend_name`.
+    Raises:
+        AssertionError: if the identifier segment is missing, or a required
+            `backend_name` / `model_arch` was not supplied.
+        ValueError: if the path has an unsupported number of segments, or is
+            of the form `lp://<backend-name>/<identifier>`.
+    """
+    assert config.split("/")[-1] in ALLOWED_LAYOUT_MODEL_IDENTIFIER_NAMES, (
+        f"The input config {config} does not contain identifier information. "
+        f"Consider running `config = add_identifier_for_config(config, identifier)` first."
+    )
+    parts = config[len(LAYOUT_PARSER_MODEL_PREFIX) :].split("/")
+    if len(parts) == 4:  # Full format
+        backend_name, dataset_name, model_arch, identifier = parts
+    elif len(parts) == 3:  # Short format
+        assert backend_name is not None
+        if parts[0] == backend_name:
+            # lp://<backend-name>/<dataset-name>/<identifier>
+            assert model_arch is not None
+            _, dataset_name, identifier = parts
+        else:
+            # lp://<dataset-name>/<model-arch>/<identifier>
+            dataset_name, model_arch, identifier = parts
+    elif len(parts) == 2:  # brief format
+        assert backend_name is not None
+        assert model_arch is not None
+        if parts[0] == backend_name:
+            # lp://<backend-name>/<identifier> carries no dataset name
+            raise ValueError(f"Invalid LP Model Config {config}")
+        # lp://<dataset-name>/<identifier>
+        dataset_name, identifier = parts
+    else:
+        raise ValueError(f"Invalid LP Model Config {config}")
+    return LayoutModelConfig(
+        backend_name=backend_name,
+        dataset_name=dataset_name,
+        model_arch=model_arch,
+        identifier=identifier,
+    )

layoutparser/models/paddledetection/__init__.py ADDED Viewed

@@ -0,0 +1,17 @@
+# Copyright 2021 The Layout Parser team and Paddle Detection model
+# contributors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from . import catalog as _UNUSED
+from .layoutmodel import PaddleDetectionLayoutModel

layoutparser/models/paddledetection/catalog.py ADDED Viewed

@@ -0,0 +1,214 @@
+# Copyright 2021 The Layout Parser team and Paddle Detection model
+# contributors. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import logging
+from typing import Any, Optional
+from urllib.parse import urlparse
+import tarfile
+import uuid
+from iopath.common.file_io import PathHandler
+from iopath.common.file_io import HTTPURLHandler
+from iopath.common.file_io import get_cache_dir, file_lock
+from iopath.common.download import download
+from ..base_catalog import PathManager
+# Download URL of the model tar file, keyed by dataset name and then by model
+# architecture name.
+MODEL_CATALOG = {
+    "PubLayNet": {
+        "ppyolov2_r50vd_dcn_365e": "https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_publaynet.tar",
+    },
+    "TableBank": {
+        "ppyolov2_r50vd_dcn_365e": "https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_word.tar",
+        # "ppyolov2_r50vd_dcn_365e_tableBank_latex": "https://paddle-model-ecology.bj.bcebos.com/model/layout-parser/ppyolov2_r50vd_dcn_365e_tableBank_latex.tar",
+        # TODO: Train a single tablebank model for paddlepaddle
+    },
+}
+# Label name for each predicted class id, keyed by dataset name.
+# fmt: off
+LABEL_MAP_CATALOG = {
+    "PubLayNet": {
+        0: "Text",
+        1: "Title",
+        2: "List",
+        3: "Table",
+        4: "Figure"},
+    "TableBank": {
+        0: "Table"
+    },
+}
+# fmt: on
+# Paddle models are packaged as tar files, and each model's tar file should
+# contain the files in the following list. The order matters to the extraction
+# code in this module: "inference.pdiparams" is a substring of
+# "inference.pdiparams.info", and the later entry wins on a substring match.
+_TAR_FILE_NAME_LIST = [
+    "inference.pdiparams",
+    "inference.pdiparams.info",
+    "inference.pdmodel",
+]
+def _get_untar_directory(tar_file: str) -> str:
+    """Return the folder a tar file is extracted to.
+    That is the tar file's path with its last extension removed, so the folder
+    sits next to the tar file.
+    """
+    parent_dir, tar_name = os.path.split(tar_file)
+    stem, _extension = os.path.splitext(tar_name)
+    return os.path.join(parent_dir, stem)
+def _untar_model_weights(model_tar):
+    """Extract the model files from `model_tar` and return the target folder.
+    Nothing is extracted when the folder already holds the first and the last
+    file of `_TAR_FILE_NAME_LIST`. Otherwise every archive member whose name
+    contains one of the listed file names is written flat into the folder
+    under that listed name; all other members are ignored.
+    Args:
+        model_tar: Path to the downloaded tar file.
+    Returns:
+        The folder given by `_get_untar_directory(model_tar)`.
+    """
+    model_dir = _get_untar_directory(model_tar)
+    required_names = (_TAR_FILE_NAME_LIST[0], _TAR_FILE_NAME_LIST[2])
+    if all(os.path.exists(os.path.join(model_dir, name)) for name in required_names):
+        return model_dir
+    # the path to save the decompressed file
+    os.makedirs(model_dir, exist_ok=True)
+    with tarfile.open(model_tar, "r") as tarobj:
+        for member in tarobj.getmembers():
+            filename = None
+            # No early exit on purpose: the last matching entry must win so
+            # that "inference.pdiparams.info" is not stored as
+            # "inference.pdiparams".
+            for tar_file_name in _TAR_FILE_NAME_LIST:
+                if tar_file_name in member.name:
+                    filename = tar_file_name
+            if filename is None:
+                continue
+            extracted = tarobj.extractfile(member)
+            if extracted is None:
+                # extractfile() returns None for members that are not regular
+                # files or links (e.g. a directory with a matching name).
+                continue
+            with extracted, open(os.path.join(model_dir, filename), "wb") as model_file:
+                model_file.write(extracted.read())
+    return model_dir
+def is_cached_folder_exists_and_valid(cached):
+    """Check that the folder extracted from `cached` holds every model file.
+    Returns `True` only if the folder exists and contains each name listed in
+    `_TAR_FILE_NAME_LIST`.
+    """
+    extracted_folder = _get_untar_directory(cached)
+    return os.path.exists(extracted_folder) and all(
+        os.path.exists(os.path.join(extracted_folder, required_name))
+        for required_name in _TAR_FILE_NAME_LIST
+    )
+class PaddleModelURLHandler(HTTPURLHandler):
+    """
+    Supports download and file check for Baidu Cloud links
+    """
+    # File names longer than this are shortened before being used in the cache.
+    MAX_FILENAME_LEN = 250
+    def _get_supported_prefixes(self):
+        return ["https://paddle-model-ecology.bj.bcebos.com"]
+    def _isfile(self, path):
+        # A URL counts as a file once it has been resolved by this handler.
+        return path in self.cache_map
+    def _get_local_path(
+        self,
+        path: str,
+        force: bool = False,
+        cache_dir: Optional[str] = None,
+        **kwargs: Any,
+    ) -> str:
+        """
+        As paddle model stores all files in tar files, we need to extract them
+        and get the newly extracted folder path. This function rewrites the base
+        function to support the following situations:
+        1. If the tar file is not downloaded, it will download the tar file,
+            extract it to the target folder, delete the downloaded tar file,
+            and return the folder path.
+        2. If the extracted target folder is present, and all the necessary model
+            files are present (specified in _TAR_FILE_NAME_LIST), it will
+            return the folder path.
+        3. If the tar file is downloaded, but the extracted target folder is not
+            present (or it doesn't contain the necessary files in _TAR_FILE_NAME_LIST),
+            it will extract the tar file to the target folder, delete the tar file,
+            and return the folder path.
+        A URL that does not end with ".tar" is downloaded and the path of the
+        downloaded file itself is returned.
+        """
+        self._check_kwargs(kwargs)
+        if (
+            force
+            or path not in self.cache_map
+            or not os.path.exists(self.cache_map[path])
+        ):
+            logger = logging.getLogger(__name__)
+            parsed_url = urlparse(path)
+            dirname = os.path.join(
+                get_cache_dir(cache_dir), os.path.dirname(parsed_url.path.lstrip("/"))
+            )
+            filename = path.split("/")[-1]
+            if len(filename) > self.MAX_FILENAME_LEN:
+                filename = filename[:100] + "_" + uuid.uuid4().hex
+            cached = os.path.join(dirname, filename)
+            if is_cached_folder_exists_and_valid(cached):
+                # When the cached folder exists and valid, we don't need to redownload
+                # the tar file.
+                self.cache_map[path] = _get_untar_directory(cached)
+            else:
+                with file_lock(cached):
+                    if not os.path.isfile(cached):
+                        logger.info("Downloading {} ...".format(path))
+                        cached = download(path, dirname, filename=filename)
+                    # Default for non-tar downloads. The result used to be
+                    # assigned only in the ".tar" branch, which raised a
+                    # NameError below for any other URL.
+                    local_path = cached
+                    if path.endswith(".tar"):
+                        local_path = _untar_model_weights(cached)
+                        try:
+                            os.remove(cached)  # remove the redundant tar file
+                            # TODO: remove the .lock file .
+                        except OSError:
+                            # Best effort only: a leftover tar file is harmless.
+                            logger.warning(
+                                f"Not able to remove the cached tar file {cached}"
+                            )
+                logger.info("URL {} cached in {}".format(path, local_path))
+                self.cache_map[path] = local_path
+        return self.cache_map[path]
+class LayoutParserPaddleModelHandler(PathHandler):
+    """
+    Resolve anything that's in LayoutParser model zoo.
+    """
+    PREFIX = "lp://paddledetection/"
+    def _get_supported_prefixes(self):
+        return [self.PREFIX]
+    def _get_local_path(self, path, **kwargs):
+        """Look the path up in `MODEL_CATALOG` and resolve the resulting URL.
+        After the prefix the path reads `<dataset-name>/<model-name>/<data-type>`;
+        only the `weight` data type is supported.
+        """
+        relative_name = path[len(self.PREFIX) :]
+        dataset_name, *arch_segments, data_type = relative_name.split("/")
+        if data_type != "weight":
+            raise ValueError(f"Unknown data_type {data_type}")
+        model_url = MODEL_CATALOG[dataset_name]["/".join(arch_segments)]
+        return PathManager.get_local_path(model_url, **kwargs)
+    def _open(self, path, mode="r", **kwargs):
+        local_path = self._get_local_path(path)
+        return PathManager.open(local_path, mode, **kwargs)
+# Import-time side effect: register both handlers on the shared PathManager so
+# that the Baidu Cloud URLs and lp://paddledetection/ paths are resolved by the
+# classes defined in this module.
+PathManager.register_handler(PaddleModelURLHandler())
+PathManager.register_handler(LayoutParserPaddleModelHandler())