deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/extern/hflm.py (new file)

@@ -0,0 +1,225 @@
+ # -*- coding: utf-8 -*-
+ # File: hfml.py
+
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """
+ Wrapper for the Hugging Face Language Model for sequence and token classification
+ """
+ from __future__ import annotations
+
+ from abc import ABC
+ from copy import copy
+ from pathlib import Path
+ from typing import Any, List, Literal, Mapping, Optional, Tuple, Union
+
+ from lazy_imports import try_import
+
+ from ..utils.detection_types import JsonDict, Requirement
+ from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
+ from ..utils.settings import TypeOrStr
+ from .base import LMSequenceClassifier, SequenceClassResult
+ from .hflayoutlm import get_tokenizer_from_model_class
+ from .pt.ptutils import get_torch_device
+
+ with try_import() as pt_import_guard:
+     import torch
+     import torch.nn.functional as F
+
+ with try_import() as tr_import_guard:
+     from transformers import PretrainedConfig, XLMRobertaForSequenceClassification
+
+
+ def predict_sequence_classes(
+     input_ids: torch.Tensor,
+     attention_mask: torch.Tensor,
+     token_type_ids: torch.Tensor,
+     model: Union[XLMRobertaForSequenceClassification],
+ ) -> SequenceClassResult:
+     """
+     :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
+     :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
+     :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
+     :param model: layoutlm model for sequence classification
+     :return: SequenceClassResult
+     """
+
+     outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+
+     score = torch.max(F.softmax(outputs.logits)).tolist()
+     sequence_class_predictions = outputs.logits.argmax(-1).squeeze().tolist()
+
+     return SequenceClassResult(class_id=sequence_class_predictions, score=float(score)) # type: ignore
+
+
+ class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
+     """
+     Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
+     """
+
+     model: Union[XLMRobertaForSequenceClassification]
+
+     def __init__(
+         self,
+         path_config_json: str,
+         path_weights: str,
+         categories: Mapping[str, TypeOrStr],
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = False,
+     ):
+         self.path_config = path_config_json
+         self.path_weights = path_weights
+         self.categories = copy(categories) # type: ignore
+
+         self.device = get_torch_device(device)
+         self.model.to(self.device)
+         self.model.config.tokenizer_class = self.get_tokenizer_class_name(use_xlm_tokenizer)
+
+     @classmethod
+     def get_requirements(cls) -> List[Requirement]:
+         return [get_pytorch_requirement(), get_transformers_requirement()]
+
+     def clone(self) -> HFLmSequenceClassifierBase:
+         return self.__class__(self.path_config, self.path_weights, self.categories, self.device)
+
+     def _validate_encodings(
+         self, **encodings: Union[List[List[str]], torch.Tensor]
+     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+         input_ids = encodings.get("input_ids")
+         attention_mask = encodings.get("attention_mask")
+         token_type_ids = encodings.get("token_type_ids")
+
+         if isinstance(input_ids, torch.Tensor):
+             input_ids = input_ids.to(self.device)
+         else:
+             raise ValueError(f"input_ids must be list but is {type(input_ids)}")
+         if isinstance(attention_mask, torch.Tensor):
+             attention_mask = attention_mask.to(self.device)
+         else:
+             raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
+         if isinstance(token_type_ids, torch.Tensor):
+             token_type_ids = token_type_ids.to(self.device)
+         else:
+             raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
+
+         input_ids = input_ids.to(self.device)
+         attention_mask = attention_mask.to(self.device)
+         token_type_ids = token_type_ids.to(self.device)
+         return input_ids, attention_mask, token_type_ids
+
+     @staticmethod
+     def get_name(path_weights: str, architecture: str) -> str:
+         """Returns the name of the model"""
+         return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+
+     def get_tokenizer_class_name(self, use_xlm_tokenizer: bool) -> str:
+         """A refinement for adding the tokenizer class name to the model configs.
+
+         :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+         """
+         tokenizer = get_tokenizer_from_model_class(self.model.__class__.__name__, use_xlm_tokenizer)
+         return tokenizer.__class__.__name__
+
+     @staticmethod
+     def image_to_raw_features_mapping() -> str:
+         """Returns the mapping function to convert images into raw features."""
+         return "image_to_raw_lm_features"
+
+     @staticmethod
+     def image_to_features_mapping() -> str:
+         """Returns the mapping function to convert images into features."""
+         return "image_to_lm_features"
+
+
+ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
+     """
+     A wrapper class for `transformers.XLMRobertaForSequenceClassification` and similar models to use within a pipeline
+     component. Check <https://huggingface.co/docs/transformers/model_doc/xlm-roberta> for documentation of the
+     model itself.
+     Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
+     classification and other things please use another model of the family.
+
+     **Example**
+
+         # setting up compulsory ocr service
+         tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+         tess = TesseractOcrDetector(tesseract_config_path)
+         ocr_service = TextExtractionService(tess)
+
+         # hf tokenizer and token classifier
+         tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
+         roberta = HFLmSequenceClassifier("path/to/config.json","path/to/model.bin",
+                                          categories=["handwritten", "presentation", "resume"])
+
+         # token classification service
+         roberta_service = LMSequenceClassifierService(tokenizer,roberta)
+
+         pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
+
+         path = "path/to/some/form"
+         df = pipe.analyze(path=path)
+
+         for dp in df:
+             ...
+     """
+
+     def __init__(
+         self,
+         path_config_json: str,
+         path_weights: str,
+         categories: Mapping[str, TypeOrStr],
+         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+         use_xlm_tokenizer: bool = True,
+     ):
+         self.name = self.get_name(path_weights, "bert-like")
+         self.model_id = self.get_model_id()
+         self.model = self.get_wrapped_model(path_config_json, path_weights)
+         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
+
+     def predict(self, **encodings: Union[List[List[str]], torch.Tensor]) -> SequenceClassResult:
+         input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)
+
+         result = predict_sequence_classes(
+             input_ids,
+             attention_mask,
+             token_type_ids,
+             self.model,
+         )
+
+         result.class_id += 1
+         result.class_name = self.categories[str(result.class_id)]
+         return result
+
+     @staticmethod
+     def get_wrapped_model(path_config_json: str, path_weights: str) -> Any:
+         """
+         Get the inner (wrapped) model.
+
+         :param path_config_json: path to .json config file
+         :param path_weights: path to model artifact
+         :return: 'nn.Module'
+         """
+         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+         return XLMRobertaForSequenceClassification.from_pretrained(
+             pretrained_model_name_or_path=path_weights, config=config
+         )
+
+     @staticmethod
+     def default_kwargs_for_input_mapping() -> JsonDict:
+         """
+         Add some default arguments that might be necessary when preparing a sample. Overwrite this method
+         for some custom setting.
+         """
+         return {}
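
The heart of the new module is `predict_sequence_classes`, which reduces the classifier head's logits to a single label id and confidence. A minimal sketch of that reduction with a stand-in logits tensor (not part of the diff; `dim=-1` is written out here, where the code above leaves softmax's dimension implicit):

    import torch
    import torch.nn.functional as F

    # Stand-in for model(...).logits with shape (1, num_classes)
    logits = torch.tensor([[1.2, 0.3, -0.5]])

    score = torch.max(F.softmax(logits, dim=-1)).item()  # probability of the best class
    class_id = int(logits.argmax(-1).squeeze())          # 0-based index of the best class

    # deepdoctection category ids are 1-based strings, which is why
    # HFLmSequenceClassifier.predict shifts the id before looking up the class name.
    category_key = str(class_id + 1)
    print(category_key, round(score, 3))  # "1" 0.629
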
deepdoctection/extern/model.py

@@ -185,25 +185,6 @@ class ModelCatalog:
          dl_library="TF",
          model_wrapper="TPFrcnnDetector",
      ),
-     "layout/d2_model-800000-layout.pkl": ModelProfile(
-         name="layout/d2_model-800000-layout.pkl",
-         description="Detectron2 layout detection model trained on Publaynet",
-         config="dd/d2/layout/CASCADE_RCNN_R_50_FPN_GN.yaml",
-         size=[274568239],
-         tp_model=False,
-         hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_publaynet_inference_only",
-         hf_model_name="d2_model-800000-layout.pkl",
-         hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-         categories={
-             "1": LayoutType.text,
-             "2": LayoutType.title,
-             "3": LayoutType.list,
-             "4": LayoutType.table,
-             "5": LayoutType.figure,
-         },
-         dl_library="PT",
-         model_wrapper="D2FrcnnDetector",
-     ),
      "layout/d2_model_0829999_layout_inf_only.pt": ModelProfile(
          name="layout/d2_model_0829999_layout_inf_only.pt",
          description="Detectron2 layout detection model trained on Publaynet",
@@ -261,19 +242,6 @@ class ModelCatalog:
          dl_library="PT",
          model_wrapper="D2FrcnnTracingDetector",
      ),
-     "cell/d2_model-1800000-cell.pkl": ModelProfile(
-         name="cell/d2_model-1800000-cell.pkl",
-         description="Detectron2 cell detection inference only model trained on Pubtabnet",
-         config="dd/d2/cell/CASCADE_RCNN_R_50_FPN_GN.yaml",
-         size=[274519039],
-         tp_model=False,
-         hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_c_inference_only",
-         hf_model_name="d2_model-1800000-cell.pkl",
-         hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-         categories={"1": LayoutType.cell},
-         dl_library="PT",
-         model_wrapper="D2FrcnnDetector",
-     ),
      "cell/d2_model_1849999_cell_inf_only.pt": ModelProfile(
          name="cell/d2_model_1849999_cell_inf_only.pt",
          description="Detectron2 cell detection inference only model trained on Pubtabnet",
@@ -313,19 +281,6 @@
          dl_library="PT",
          model_wrapper="D2FrcnnDetector",
      ),
-     "item/d2_model-1620000-item.pkl": ModelProfile(
-         name="item/d2_model-1620000-item.pkl",
-         description="Detectron2 item detection inference only model trained on Pubtabnet",
-         config="dd/d2/item/CASCADE_RCNN_R_50_FPN_GN.yaml",
-         size=[274531339],
-         tp_model=False,
-         hf_repo_id="deepdoctection/d2_casc_rcnn_X_32xd4_50_FPN_GN_2FC_pubtabnet_rc_inference_only",
-         hf_model_name="d2_model-1620000-item.pkl",
-         hf_config_file=["Base-RCNN-FPN.yaml", "CASCADE_RCNN_R_50_FPN_GN.yaml"],
-         categories={"1": LayoutType.row, "2": LayoutType.column},
-         dl_library="PT",
-         model_wrapper="D2FrcnnDetector",
-     ),
      "item/d2_model_1639999_item.pth": ModelProfile(
          name="item/d2_model_1639999_item.pth",
          description="Detectron2 item detection model trained on Pubtabnet",
@@ -365,6 +320,45 @@
          dl_library="PT",
          model_wrapper="D2FrcnnTracingDetector",
      ),
+     "nielsr/lilt-xlm-roberta-base/pytorch_model.bin": ModelProfile(
+         name="nielsr/lilt-xlm-roberta-base/pytorch_model.bin",
+         description="LiLT build with a RobertaXLM base model",
+         config="nielsr/lilt-xlm-roberta-base/config.json",
+         size=[1136743583],
+         tp_model=False,
+         hf_repo_id="nielsr/lilt-xlm-roberta-base",
+         hf_model_name="pytorch_model.bin",
+         hf_config_file=["config.json"],
+         dl_library="PT",
+     ),
+     "SCUT-DLVCLab/lilt-infoxlm-base/pytorch_model.bin": ModelProfile(
+         name="SCUT-DLVCLab/lilt-infoxlm-base/pytorch_model.bin",
+         description="Language-Independent Layout Transformer - InfoXLM model by stitching a pre-trained InfoXLM"
+         " and a pre-trained Language-Independent Layout Transformer (LiLT) together. It was introduced"
+         " in the paper LiLT: A Simple yet Effective Language-Independent Layout Transformer for"
+         " Structured Document Understanding by Wang et al. and first released in this repository.",
+         config="SCUT-DLVCLab/lilt-infoxlm-base/config.json",
+         size=[1136743583],
+         tp_model=False,
+         hf_repo_id="SCUT-DLVCLab/lilt-infoxlm-base",
+         hf_model_name="pytorch_model.bin",
+         hf_config_file=["config.json"],
+         dl_library="PT",
+     ),
+     "SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin": ModelProfile(
+         name="SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin",
+         description="Language-Independent Layout Transformer - RoBERTa model by stitching a pre-trained RoBERTa"
+         " (English) and a pre-trained Language-Independent Layout Transformer (LiLT) together. It was"
+         " introduced in the paper LiLT: A Simple yet Effective Language-Independent Layout Transformer"
+         " for Structured Document Understanding by Wang et al. and first released in this repository.",
+         config="SCUT-DLVCLab/lilt-roberta-en-base/config.json",
+         size=[523151519],
+         tp_model=False,
+         hf_repo_id="SCUT-DLVCLab/lilt-roberta-en-base",
+         hf_model_name="pytorch_model.bin",
+         hf_config_file=["config.json"],
+         dl_library="PT",
+     ),
      "microsoft/layoutlm-base-uncased/pytorch_model.bin": ModelProfile(
          name="microsoft/layoutlm-base-uncased/pytorch_model.bin",
          description="LayoutLM is a simple but effective pre-training method of text and layout for document image"
@@ -535,6 +529,19 @@
          model_wrapper="DoctrTextRecognizer",
          architecture="crnn_vgg16_bn",
      ),
+     "FacebookAI/xlm-roberta-base": ModelProfile(
+         name="FacebookAI/xlm-roberta-base/pytorch_model.bin",
+         description="XLM-RoBERTa model pre-trained on 2.5TB of filtered CommonCrawl data containing 100 languages."
+         " It was introduced in the paper Unsupervised Cross-lingual Representation Learning at Scale"
+         " by Conneau et al. and first released in this repository.",
+         size=[1115590446],
+         tp_model=False,
+         config="FacebookAI/xlm-roberta-base/config.json",
+         hf_repo_id="FacebookAI/xlm-roberta-base",
+         hf_model_name="pytorch_model.bin",
+         hf_config_file=["config.json"],
+         dl_library="PT",
+     ),
      "fasttext/lid.176.bin": ModelProfile(
          name="fasttext/lid.176.bin",
          description="Fasttext language detection model",
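
Both the LiLT profiles and the XLM-R profile above are addressed through the catalog by their full artifact name. A hedged usage sketch (`ModelCatalog.get_profile` is existing deepdoctection API; the output comments mirror the profile fields above):

    from deepdoctection.extern.model import ModelCatalog

    profile = ModelCatalog.get_profile("SCUT-DLVCLab/lilt-roberta-en-base/pytorch_model.bin")
    print(profile.hf_repo_id)   # SCUT-DLVCLab/lilt-roberta-en-base
    print(profile.dl_library)   # PT
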
@@ -980,9 +987,11 @@ class ModelDownloadManager:
          else:
              file_names.append(model_name)
          if profile.hf_repo_id:
-             ModelDownloadManager.load_model_from_hf_hub(profile, absolute_path_weights, file_names)
+             if not os.path.isfile(absolute_path_weights):
+                 ModelDownloadManager.load_model_from_hf_hub(profile, absolute_path_weights, file_names)
              absolute_path_configs = ModelCatalog.get_full_path_configs(name)
-             ModelDownloadManager.load_configs_from_hf_hub(profile, absolute_path_configs)
+             if not os.path.isfile(absolute_path_configs):
+                 ModelDownloadManager.load_configs_from_hf_hub(profile, absolute_path_configs)
          else:
              ModelDownloadManager._load_from_gd(profile, absolute_path_weights, file_names)
  
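
With the two `os.path.isfile` guards, fetching a catalogued model becomes idempotent. A sketch of the resulting behaviour, assuming `maybe_download_weights_and_configs` remains the public entry point into this code path (as in current deepdoctection releases):

    from deepdoctection.extern.model import ModelDownloadManager

    # The first call downloads weights and configs from the Hugging Face hub;
    # the second finds both files on disk and skips the hub entirely.
    path = ModelDownloadManager.maybe_download_weights_and_configs("nielsr/lilt-xlm-roberta-base/pytorch_model.bin")
    path = ModelDownloadManager.maybe_download_weights_and_configs("nielsr/lilt-xlm-roberta-base/pytorch_model.bin")
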
deepdoctection/extern/pdftext.py

@@ -21,13 +21,15 @@ PDFPlumber text extraction engine
  
  from typing import Dict, List, Tuple
  
+ from lazy_imports import try_import
+
  from ..utils.context import save_tmp_file
  from ..utils.detection_types import Requirement
- from ..utils.file_utils import get_pdfplumber_requirement, pdfplumber_available
+ from ..utils.file_utils import get_pdfplumber_requirement
  from ..utils.settings import LayoutType, ObjectTypes
  from .base import DetectionResult, PdfMiner
  
- if pdfplumber_available():
+ with try_import() as import_guard:
      from pdfplumber.pdf import PDF
  
  
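
This release swaps `pdfplumber_available()`-style checks for `lazy_imports.try_import` across the code base. The pattern in miniature, hedging on the guard API (the lazy-imports package documents a `check()` method that re-raises a captured ImportError):

    from lazy_imports import try_import

    with try_import() as import_guard:
        import pdfplumber  # optional dependency: a failed import is recorded, not raised

    try:
        import_guard.check()  # re-raises the ImportError if pdfplumber is missing
        print("pdfplumber available")
    except ImportError:
        print("pdfplumber missing; PdfPlumberTextDetector cannot be used")
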
@@ -64,9 +66,12 @@ class PdfPlumberTextDetector(PdfMiner):
  
      """
  
-     def __init__(self) -> None:
-         self.name = "pdfplumber"
+     def __init__(self, x_tolerance: int = 3, y_tolerance: int = 3) -> None:
+         self.name = "Pdfplumber"
+         self.model_id = self.get_model_id()
          self.categories = {"1": LayoutType.word}
+         self.x_tolerance = x_tolerance
+         self.y_tolerance = y_tolerance
  
      def predict(self, pdf_bytes: bytes) -> List[DetectionResult]:
          """
@@ -81,7 +86,7 @@
              _pdf = PDF(fin)
              self._page = _pdf.pages[0]
          self._pdf_bytes = pdf_bytes
-         words = self._page.extract_words()
+         words = self._page.extract_words(x_tolerance=self.x_tolerance, y_tolerance=self.y_tolerance)
          detect_results = list(map(_to_detect_result, words))
          return detect_results
  
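
A short usage sketch for the new tolerances: pdfplumber merges characters whose gaps fall within `x_tolerance`/`y_tolerance` into one word, so raising them yields fewer, longer words. The file path is illustrative, and `predict` expects the bytes of a single-page PDF:

    from deepdoctection.extern.pdftext import PdfPlumberTextDetector

    pdf_miner = PdfPlumberTextDetector(x_tolerance=5, y_tolerance=3)  # looser horizontal grouping

    with open("page.pdf", "rb") as f:  # hypothetical single-page PDF
        detect_results = pdf_miner.predict(f.read())
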
deepdoctection/extern/pt/__init__.py

@@ -19,7 +19,5 @@
  Init file for pytorch compatibility package
  """
  
+ from .nms import *
  from .ptutils import *
- 
- if pytorch_available():
-     from .nms import *
deepdoctection/extern/pt/nms.py

@@ -18,9 +18,13 @@
  """
  Module for custom NMS functions.
  """
+ from __future__ import annotations
  
- import torch
- from torchvision.ops import boxes as box_ops # type: ignore
+ from lazy_imports import try_import
+
+ with try_import() as import_guard:
+     import torch
+     from torchvision.ops import boxes as box_ops # type: ignore
  
  
  # Copy & paste from https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/nms.py
deepdoctection/extern/pt/ptutils.py

@@ -18,31 +18,40 @@
  """
  Torch related utils
  """
+ from __future__ import annotations
  
+ import os
+ from typing import Optional, Union
  
- from ...utils.file_utils import pytorch_available
+ from lazy_imports import try_import
  
+ with try_import() as import_guard:
+     import torch
  
- def set_torch_auto_device() -> "torch.device": # type: ignore
-     """
-     Returns cuda device if available, otherwise cpu
+
+ def get_torch_device(device: Optional[Union[str, torch.device]] = None) -> torch.device:
      """
-     if pytorch_available():
-         from torch import cuda, device # pylint: disable=C0415
+     Selecting a device on which to load a model. The selection follows a cascade of priorities:
  
-         return device("cuda" if cuda.is_available() else "cpu")
-     raise ModuleNotFoundError("Pytorch must be installed")
+     - If a device string is provided, it is used.
+     - If the environment variable "USE_CUDA" is set, a GPU is used. If more GPUs are available, it will use all of them
+       unless something else is specified by CUDA_VISIBLE_DEVICES:
  
+       https://stackoverflow.com/questions/54216920/how-to-use-multiple-gpus-in-pytorch
  
- def get_num_gpu() -> int:
-     """
-     Returns number of CUDA devices if pytorch is available
+     - If an MPS device is available, it is used.
+     - Otherwise, the CPU is used.
  
-     :return:
+     :param device: Device either as string or torch.device
+     :return: Tensorflow device
      """
-
-     if pytorch_available():
-         from torch import cuda # pylint: disable=C0415
-
-         return cuda.device_count()
-     raise ModuleNotFoundError("Pytorch must be installed")
+     if device is not None:
+         if isinstance(device, torch.device):
+             return device
+         if isinstance(device, str):
+             return torch.device(device)
+     if os.environ.get("USE_CUDA"):
+         return torch.device("cuda")
+     if os.environ.get("USE_MPS"):
+         return torch.device("mps")
+     return torch.device("cpu")
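
The cascade in `get_torch_device`, exercised end to end in a sketch that mirrors the code above (explicit argument first, then the `USE_CUDA`/`USE_MPS` environment variables, then CPU):

    import os

    import torch

    from deepdoctection.extern.pt.ptutils import get_torch_device

    print(get_torch_device("cuda:0"))             # explicit string wins -> device(type='cuda', index=0)
    print(get_torch_device(torch.device("cpu")))  # a torch.device passes through unchanged

    os.environ["USE_CUDA"] = "1"
    print(get_torch_device())                     # env hint -> device(type='cuda')

    del os.environ["USE_CUDA"]
    print(get_torch_device())                     # nothing set -> device(type='cpu')
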