deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/train/d2_frcnn_train.py

@@ -18,19 +18,12 @@
 """
 Module for training Detectron2 `GeneralizedRCNN`
 """
-
+from __future__ import annotations
 
 import copy
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Type, Union
 
-from detectron2.config import CfgNode, get_cfg
-from detectron2.data import DatasetMapper, build_detection_train_loader
-from detectron2.data.transforms import RandomFlip, ResizeShortestEdge
-from detectron2.engine import DefaultTrainer, HookBase, default_writers, hooks
-from detectron2.utils import comm
-from detectron2.utils.events import EventWriter, get_event_storage
-from fvcore.nn.precise_bn import get_bn_modules  # type: ignore
-from torch.utils.data import DataLoader, IterableDataset
+from lazy_imports import try_import
 
 from ..datasets.adapter import DatasetAdapter
 from ..datasets.base import DatasetBase
@@ -39,15 +32,28 @@ from ..eval.base import MetricBase
 from ..eval.eval import Evaluator
 from ..eval.registry import metric_registry
 from ..extern.d2detect import D2FrcnnDetector
-from ..extern.pt.ptutils import get_num_gpu
 from ..mapper.d2struct import image_to_d2_frcnn_training
 from ..pipe.base import PredictorPipelineComponent
 from ..pipe.registry import pipeline_component_registry
+from ..utils.error import DependencyError
 from ..utils.file_utils import get_wandb_requirement, wandb_available
 from ..utils.logger import LoggingRecord, logger
 from ..utils.utils import string_to_dict
 
-if wandb_available():
+with try_import() as d2_import_guard:
+    from detectron2.config import CfgNode, get_cfg
+    from detectron2.data import DatasetMapper, build_detection_train_loader
+    from detectron2.data.transforms import RandomFlip, ResizeShortestEdge
+    from detectron2.engine import DefaultTrainer, HookBase, default_writers, hooks
+    from detectron2.utils import comm
+    from detectron2.utils.events import EventWriter, get_event_storage
+    from fvcore.nn.precise_bn import get_bn_modules  # type: ignore
+
+with try_import() as pt_import_guard:
+    from torch import cuda
+    from torch.utils.data import DataLoader, IterableDataset
+
+with try_import() as wb_import_guard:
     import wandb
 
 
@@ -111,7 +117,7 @@ class WandbWriter(EventWriter):
         config = {}
         self._window_size = window_size
         self._run = wandb.init(project=project, config=config, **kwargs) if not wandb.run else wandb.run
-        self._run._label(repo=repo)  # type:ignore
+        self._run._label(repo=repo)
 
     def write(self) -> None:
         storage = get_event_storage()
@@ -120,10 +126,10 @@ class WandbWriter(EventWriter):
         for key, (val, _) in storage.latest_with_smoothing_hint(self._window_size).items():
             log_dict[key] = val
 
-        self._run.log(log_dict)  # type:ignore
+        self._run.log(log_dict)
 
     def close(self) -> None:
-        self._run.finish()  # type:ignore
+        self._run.finish()
 
 
 class D2Trainer(DefaultTrainer):
@@ -153,16 +159,18 @@ class D2Trainer(DefaultTrainer):
         ret = [
             hooks.IterationTimer(),
             hooks.LRScheduler(),
-            hooks.PreciseBN(
-                # Run at the same freq as (but before) evaluation.
-                cfg.TEST.EVAL_PERIOD,
-                self.model,  # pylint: disable=E1101
-                # Build a new data loader to not affect training
-                self.build_train_loader(cfg),
-                cfg.TEST.PRECISE_BN.NUM_ITER,
-            )
-            if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)  # pylint: disable=E1101
-            else None,
+            (
+                hooks.PreciseBN(
+                    # Run at the same freq as (but before) evaluation.
+                    cfg.TEST.EVAL_PERIOD,
+                    self.model,  # pylint: disable=E1101
+                    # Build a new data loader to not affect training
+                    self.build_train_loader(cfg),
+                    cfg.TEST.PRECISE_BN.NUM_ITER,
+                )
+                if cfg.TEST.PRECISE_BN.ENABLED and get_bn_modules(self.model)  # pylint: disable=E1101
+                else None
+            ),
         ]
 
         # Do PreciseBN before checkpointer, because it updates the model and need to
@@ -201,7 +209,7 @@ class D2Trainer(DefaultTrainer):
         if self.cfg.WANDB.USE_WANDB:
             _, _wandb_available, err_msg = get_wandb_requirement()
             if not _wandb_available:
-                raise ImportError(err_msg)
+                raise DependencyError(err_msg)
             if self.cfg.WANDB.PROJECT is None:
                 raise ValueError("When using W&B, you must specify a project, i.e. WANDB.PROJECT")
             writers_list.append(WandbWriter(self.cfg.WANDB.PROJECT, self.cfg.WANDB.REPO, self.cfg))
@@ -256,7 +264,7 @@ class D2Trainer(DefaultTrainer):
             dataset_val,
             pipeline_component,
             metric,
-            num_threads=get_num_gpu() * 2,
+            num_threads=cuda.device_count() * 2,
             run=run,
         )
         if build_val_dict:
@@ -269,7 +277,7 @@ class D2Trainer(DefaultTrainer):
 
     @classmethod
     def build_evaluator(cls, cfg, dataset_name):  # type: ignore
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 def train_d2_faster_rcnn(
@@ -332,7 +340,7 @@ def train_d2_faster_rcnn(
     :param pipeline_component_name: A pipeline component name to use for validation.
     """
 
-    assert get_num_gpu() > 0, "Has to train with GPU!"
+    assert cuda.device_count() > 0, "Has to train with GPU!"
 
     build_train_dict: Dict[str, str] = {}
     if build_train_config is not None:
deepdoctection/train/hf_detr_train.py

@@ -19,20 +19,12 @@
 Module for training Hugging Face Detr implementation. Note, that this scripts only trans Tabletransformer like Detr
 models that are a slightly different from the plain Detr model that are provided by the transformer library.
 """
+from __future__ import annotations
 
 import copy
 from typing import Any, Dict, List, Optional, Sequence, Type, Union
 
-from torch.nn import Module
-from torch.utils.data import Dataset
-from transformers import (
-    AutoFeatureExtractor,
-    IntervalStrategy,
-    PretrainedConfig,
-    PreTrainedModel,
-    TableTransformerForObjectDetection,
-)
-from transformers.trainer import Trainer, TrainingArguments
+from lazy_imports import try_import
 
 from ..datasets.adapter import DatasetAdapter
 from ..datasets.base import DatasetBase
@@ -47,6 +39,21 @@ from ..pipe.registry import pipeline_component_registry
 from ..utils.logger import LoggingRecord, logger
 from ..utils.utils import string_to_dict
 
+with try_import() as pt_import_guard:
+    from torch import nn
+    from torch.utils.data import Dataset
+
+with try_import() as hf_import_guard:
+    from transformers import (
+        AutoFeatureExtractor,
+        IntervalStrategy,
+        PretrainedConfig,
+        PreTrainedModel,
+        TableTransformerForObjectDetection,
+        Trainer,
+        TrainingArguments,
+    )
+
 
 class DetrDerivedTrainer(Trainer):
     """
@@ -61,7 +68,7 @@ class DetrDerivedTrainer(Trainer):
 
     def __init__(
         self,
-        model: Union[PreTrainedModel, Module],
+        model: Union[PreTrainedModel, nn.Module],
         args: TrainingArguments,
         data_collator: DetrDataCollator,
         train_dataset: Dataset[Any],
@@ -97,9 +104,9 @@
 
     def evaluate(
         self,
-        eval_dataset: Optional[Dataset[Any]] = None,
-        ignore_keys: Optional[List[str]] = None,
-        metric_key_prefix: str = "eval",
+        eval_dataset: Optional[Dataset[Any]] = None,  # pylint: disable=W0613
+        ignore_keys: Optional[List[str]] = None,  # pylint: disable=W0613
+        metric_key_prefix: str = "eval",  # pylint: disable=W0613
     ) -> Dict[str, float]:
         """
         Overwritten method from `Trainer`. Arguments will not be used.
@@ -193,9 +200,11 @@ def train_hf_detr(
         "remove_unused_columns": False,
         "per_device_train_batch_size": 2,
         "max_steps": number_samples,
-        "evaluation_strategy": "steps"
-        if (dataset_val is not None and metric is not None and pipeline_component_name is not None)
-        else "no",
+        "evaluation_strategy": (
+            "steps"
+            if (dataset_val is not None and metric is not None and pipeline_component_name is not None)
+            else "no"
+        ),
         "eval_steps": 5000,
     }
 
deepdoctection/train/hf_layoutlm_train.py

@@ -18,32 +18,15 @@
 """
 Module for training Huggingface implementation of LayoutLm
 """
+from __future__ import annotations
 
 import copy
 import json
 import os
 import pprint
-from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
-
-from torch.nn import Module
-from torch.utils.data import Dataset
-from transformers import (
-    IntervalStrategy,
-    LayoutLMForSequenceClassification,
-    LayoutLMForTokenClassification,
-    LayoutLMTokenizerFast,
-    LayoutLMv2Config,
-    LayoutLMv2ForSequenceClassification,
-    LayoutLMv2ForTokenClassification,
-    LayoutLMv3Config,
-    LayoutLMv3ForSequenceClassification,
-    LayoutLMv3ForTokenClassification,
-    PretrainedConfig,
-    PreTrainedModel,
-    RobertaTokenizerFast,
-    XLMRobertaTokenizerFast,
-)
-from transformers.trainer import Trainer, TrainingArguments
+from typing import Any, Dict, List, Optional, Sequence, Tuple, Type, Union
+
+from lazy_imports import try_import
 
 from ..datasets.adapter import DatasetAdapter
 from ..datasets.base import DatasetBase
@@ -57,78 +40,108 @@ from ..extern.hflayoutlm import (
     HFLayoutLmv2TokenClassifier,
     HFLayoutLmv3SequenceClassifier,
     HFLayoutLmv3TokenClassifier,
+    HFLiltSequenceClassifier,
+    HFLiltTokenClassifier,
+    get_tokenizer_from_model_class,
 )
-from ..mapper.laylmstruct import LayoutLMDataCollator, image_to_raw_layoutlm_features
+from ..extern.hflm import HFLmSequenceClassifier
+from ..extern.pt.ptutils import get_torch_device
+from ..mapper.laylmstruct import LayoutLMDataCollator, image_to_raw_layoutlm_features, image_to_raw_lm_features
 from ..pipe.base import LanguageModelPipelineComponent
-from ..pipe.lm import get_tokenizer_from_architecture
 from ..pipe.registry import pipeline_component_registry
-from ..utils.env_info import get_device
+from ..utils.error import DependencyError
 from ..utils.file_utils import wandb_available
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import DatasetType, LayoutType, ObjectTypes, WordType
+from ..utils.settings import DatasetType, LayoutType, WordType
 from ..utils.utils import string_to_dict
 
-if wandb_available():
-    import wandb
-
-_ARCHITECTURES_TO_MODEL_CLASS = {
-    "LayoutLMForTokenClassification": (LayoutLMForTokenClassification, HFLayoutLmTokenClassifier, PretrainedConfig),
-    "LayoutLMForSequenceClassification": (
-        LayoutLMForSequenceClassification,
-        HFLayoutLmSequenceClassifier,
-        PretrainedConfig,
-    ),
-    "LayoutLMv2ForTokenClassification": (
-        LayoutLMv2ForTokenClassification,
-        HFLayoutLmv2TokenClassifier,
-        LayoutLMv2Config,
-    ),
-    "LayoutLMv2ForSequenceClassification": (
-        LayoutLMv2ForSequenceClassification,
-        HFLayoutLmv2SequenceClassifier,
-        LayoutLMv2Config,
-    ),
-}
-
+with try_import() as pt_import_guard:
+    from torch import nn
+    from torch.utils.data import Dataset
 
-_MODEL_TYPE_AND_TASK_TO_MODEL_CLASS: Mapping[Tuple[str, ObjectTypes], Any] = {
-    ("layoutlm", DatasetType.sequence_classification): (
+with try_import() as tr_import_guard:
+    from transformers import (
+        IntervalStrategy,
         LayoutLMForSequenceClassification,
-        HFLayoutLmSequenceClassifier,
-        PretrainedConfig,
-    ),
-    ("layoutlm", DatasetType.token_classification): (
         LayoutLMForTokenClassification,
-        HFLayoutLmTokenClassifier,
-        PretrainedConfig,
-    ),
-    ("layoutlmv2", DatasetType.sequence_classification): (
-        LayoutLMv2ForSequenceClassification,
-        HFLayoutLmv2SequenceClassifier,
         LayoutLMv2Config,
-    ),
-    ("layoutlmv2", DatasetType.token_classification): (
+        LayoutLMv2ForSequenceClassification,
         LayoutLMv2ForTokenClassification,
-        HFLayoutLmv2TokenClassifier,
-        LayoutLMv2Config,
-    ),
-    ("layoutlmv3", DatasetType.sequence_classification): (
-        LayoutLMv3ForSequenceClassification,
-        HFLayoutLmv3SequenceClassifier,
         LayoutLMv3Config,
-    ),
-    ("layoutlmv3", DatasetType.token_classification): (
+        LayoutLMv3ForSequenceClassification,
         LayoutLMv3ForTokenClassification,
-        HFLayoutLmv3TokenClassifier,
-        LayoutLMv3Config,
-    ),
-}
-_MODEL_TYPE_TO_TOKENIZER = {
-    ("layoutlm", False): LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased"),
-    ("layoutlmv2", False): LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased"),
-    ("layoutlmv2", True): XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base", add_prefix_space=True),
-    ("layoutlmv3", False): RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True),
-}
+        LiltForSequenceClassification,
+        LiltForTokenClassification,
+        PretrainedConfig,
+        PreTrainedModel,
+        XLMRobertaForSequenceClassification,
+    )
+    from transformers.trainer import Trainer, TrainingArguments
+
+with try_import() as wb_import_guard:
+    import wandb
+
+
+def get_model_architectures_and_configs(model_type: str, dataset_type: DatasetType) -> Tuple[Any, Any, Any]:
+    """
+    Get the model architecture, model wrapper and config class for a given model type and dataset type.
+
+    :param model_type: The model type
+    :param dataset_type: The dataset type
+    :return: Tuple of model architecture, model wrapper and config class
+    """
+    return {
+        ("layoutlm", DatasetType.sequence_classification): (
+            LayoutLMForSequenceClassification,
+            HFLayoutLmSequenceClassifier,
+            PretrainedConfig,
+        ),
+        ("layoutlm", DatasetType.token_classification): (
+            LayoutLMForTokenClassification,
+            HFLayoutLmTokenClassifier,
+            PretrainedConfig,
+        ),
+        ("layoutlmv2", DatasetType.sequence_classification): (
+            LayoutLMv2ForSequenceClassification,
+            HFLayoutLmv2SequenceClassifier,
+            LayoutLMv2Config,
+        ),
+        ("layoutlmv2", DatasetType.token_classification): (
+            LayoutLMv2ForTokenClassification,
+            HFLayoutLmv2TokenClassifier,
+            LayoutLMv2Config,
+        ),
+        ("layoutlmv3", DatasetType.sequence_classification): (
+            LayoutLMv3ForSequenceClassification,
+            HFLayoutLmv3SequenceClassifier,
+            LayoutLMv3Config,
+        ),
+        ("layoutlmv3", DatasetType.token_classification): (
+            LayoutLMv3ForTokenClassification,
+            HFLayoutLmv3TokenClassifier,
+            LayoutLMv3Config,
+        ),
+        ("lilt", DatasetType.token_classification): (
+            LiltForTokenClassification,
+            HFLiltTokenClassifier,
+            PretrainedConfig,
+        ),
+        ("lilt", DatasetType.sequence_classification): (
+            LiltForSequenceClassification,
+            HFLiltSequenceClassifier,
+            PretrainedConfig,
+        ),
+        ("xlm-roberta", DatasetType.sequence_classification): (
+            XLMRobertaForSequenceClassification,
+            HFLmSequenceClassifier,
+            PretrainedConfig,
+        ),
+    }[(model_type, dataset_type)]
+
+
+def maybe_remove_bounding_box_features(model_type: str) -> bool:
+    """Listing of models that do not need bounding box features."""
+    return {"xlm-roberta": True}.get(model_type, False)
 
 
 class LayoutLMTrainer(Trainer):
@@ -144,7 +157,7 @@ class LayoutLMTrainer(Trainer):
 
     def __init__(
         self,
-        model: Union[PreTrainedModel, Module],
+        model: Union[PreTrainedModel, nn.Module],
         args: TrainingArguments,
         data_collator: LayoutLMDataCollator,
         train_dataset: Dataset[Any],
@@ -158,7 +171,7 @@ class LayoutLMTrainer(Trainer):
         dataset_val: DatasetBase,
         pipeline_component: LanguageModelPipelineComponent,
         metric: Union[Type[ClassificationMetric], ClassificationMetric],
-        run: Optional["wandb.sdk.wandb_run.Run"] = None,
+        run: Optional[wandb.sdk.wandb_run.Run] = None,
         **build_eval_kwargs: Union[str, int],
     ) -> None:
         """
@@ -180,15 +193,17 @@ class LayoutLMTrainer(Trainer):
 
     def evaluate(
         self,
-        eval_dataset: Optional[Dataset[Any]] = None,
-        ignore_keys: Optional[List[str]] = None,
-        metric_key_prefix: str = "eval",
+        eval_dataset: Optional[Dataset[Any]] = None,  # pylint: disable=W0613
+        ignore_keys: Optional[List[str]] = None,  # pylint: disable=W0613
+        metric_key_prefix: str = "eval",  # pylint: disable=W0613
     ) -> Dict[str, float]:
         """
         Overwritten method from `Trainer`. Arguments will not be used.
         """
-        assert self.evaluator is not None
-        assert self.evaluator.pipe_component is not None
+        if self.evaluator is None:
+            raise ValueError("Evaluator not set up. Please use `setup_evaluator` before running evaluation")
+        if self.evaluator.pipe_component is None:
+            raise ValueError("Pipeline component not set up. Please use `setup_evaluator` before running evaluation")
 
         # memory metrics - must set up as early as possible
         self._memory_tracker.start()
@@ -205,26 +220,27 @@
 
 
 def _get_model_class_and_tokenizer(
-    path_config_json: str, dataset_type: ObjectTypes, use_xlm_tokenizer: bool
-) -> Tuple[Any, Any, Any, Any]:
+    path_config_json: str, dataset_type: DatasetType, use_xlm_tokenizer: bool
+) -> Tuple[Any, Any, Any, Any, Any]:
     with open(path_config_json, "r", encoding="UTF-8") as file:
         config_json = json.load(file)
 
-    model_type = config_json.get("model_type")
-
-    if architectures := config_json.get("architectures"):
-        model_cls, model_wrapper_cls, config_cls = _ARCHITECTURES_TO_MODEL_CLASS[architectures[0]]
-        tokenizer_fast = get_tokenizer_from_architecture(architectures[0], use_xlm_tokenizer)
-    elif model_type:
-        model_cls, model_wrapper_cls, config_cls = _MODEL_TYPE_AND_TASK_TO_MODEL_CLASS[(model_type, dataset_type)]
-        tokenizer_fast = _MODEL_TYPE_TO_TOKENIZER[(model_type, use_xlm_tokenizer)]
+    if model_type := config_json.get("model_type"):
+        model_cls, model_wrapper_cls, config_cls = get_model_architectures_and_configs(model_type, dataset_type)
+        remove_box_features = maybe_remove_bounding_box_features(model_type)
     else:
-        raise KeyError("model_type and architectures not available in configs")
+        raise KeyError("model_type not available in configs. It seems that the config is not valid")
+
+    tokenizer_fast = get_tokenizer_from_model_class(model_cls.__name__, use_xlm_tokenizer)
+    return config_cls, model_cls, model_wrapper_cls, tokenizer_fast, remove_box_features
 
-    if not model_cls:
-        raise ValueError("model not eligible to run with this framework")
 
-    return config_cls, model_cls, model_wrapper_cls, tokenizer_fast
+def get_image_to_raw_features_mapping(input_str: str) -> Any:
+    """Replacing eval functions"""
+    return {
+        "image_to_raw_layoutlm_features": image_to_raw_layoutlm_features,
+        "image_to_raw_lm_features": image_to_raw_lm_features,
+    }[input_str]
 
 
 def train_hf_layoutlm(
@@ -347,19 +363,21 @@ def train_hf_layoutlm(
             name_as_key=True,
         )[LayoutType.word][WordType.token_class]
     else:
-        raise ValueError("Dataset type not supported for training")
+        raise UserWarning("Dataset type not supported for training")
 
-    config_cls, model_cls, model_wrapper_cls, tokenizer_fast = _get_model_class_and_tokenizer(
+    config_cls, model_cls, model_wrapper_cls, tokenizer_fast, remove_box_features = _get_model_class_and_tokenizer(
         path_config_json, dataset_type, use_xlm_tokenizer
     )
-    image_to_raw_layoutlm_kwargs = {"dataset_type": dataset_type, "use_token_tag": use_token_tag}
+    image_to_raw_features_func = get_image_to_raw_features_mapping(model_wrapper_cls.image_to_raw_features_mapping())
+    image_to_raw_features_kwargs = {"dataset_type": dataset_type, "use_token_tag": use_token_tag}
     if segment_positions:
-        image_to_raw_layoutlm_kwargs["segment_positions"] = segment_positions  # type: ignore
-    image_to_raw_layoutlm_kwargs.update(model_wrapper_cls.default_kwargs_for_input_mapping())
+        image_to_raw_features_kwargs["segment_positions"] = segment_positions  # type: ignore
+    image_to_raw_features_kwargs.update(model_wrapper_cls.default_kwargs_for_input_mapping())
+
     dataset = DatasetAdapter(
         dataset_train,
         True,
-        image_to_raw_layoutlm_features(**image_to_raw_layoutlm_kwargs),
+        image_to_raw_features_func(**image_to_raw_features_kwargs),
         use_token_tag,
         **build_train_dict,
     )
@@ -374,9 +392,11 @@ def train_hf_layoutlm(
         "remove_unused_columns": False,
         "per_device_train_batch_size": 8,
         "max_steps": number_samples,
-        "evaluation_strategy": "steps"
-        if (dataset_val is not None and metric is not None and pipeline_component_name is not None)
-        else "no",
+        "evaluation_strategy": (
+            "steps"
+            if (dataset_val is not None and metric is not None and pipeline_component_name is not None)
+            else "no"
+        ),
         "eval_steps": 100,
         "use_wandb": False,
         "wandb_project": None,
@@ -416,7 +436,7 @@ def train_hf_layoutlm(
     run = None
     if use_wandb:
        if not wandb_available():
-            raise ModuleNotFoundError("WandB must be installed separately")
+            raise DependencyError("WandB must be installed separately")
        run = wandb.init(project=wandb_project, config=conf_dict)  # type: ignore
        run._label(repo=wandb_repo)  # type: ignore # pylint: disable=W0212
     else:
@@ -448,6 +468,7 @@ def train_hf_layoutlm(
         return_tensors="pt",
         sliding_window_stride=sliding_window_stride,  # type: ignore
         max_batch_size=max_batch_size,  # type: ignore
+        remove_bounding_box_features=remove_box_features,
     )
     trainer = LayoutLMTrainer(model, arguments, data_collator, dataset)
 
@@ -470,7 +491,8 @@ def train_hf_layoutlm(
         path_config_json=path_config_json,
         path_weights=path_weights,
         categories=categories,
-        device=get_device(),
+        device=get_torch_device(),
+        use_xlm_tokenizer=use_xlm_tokenizer,
     )
     pipeline_component_cls = pipeline_component_registry.get(pipeline_component_name)
     if dataset_type == DatasetType.sequence_classification:
deepdoctection/train/tp_frcnn_train.py

@@ -22,25 +22,7 @@ Module for training Tensorpack `GeneralizedRCNN`
 import os
 from typing import Dict, List, Optional, Sequence, Type, Union
 
-# pylint: disable=import-error
-from tensorpack.callbacks import (
-    EstimatedTimeLeft,
-    GPUMemoryTracker,
-    GPUUtilizationTracker,
-    HostMemoryTracker,
-    ModelSaver,
-    PeriodicCallback,
-    ScheduledHyperParamSetter,
-    SessionRunTimeout,
-    ThroughputTracker,
-)
-
-# todo: check how dataflow import is directly possible without having AssertionError
-from tensorpack.dataflow import ProxyDataFlow, imgaug
-from tensorpack.input_source import QueueInput
-from tensorpack.tfutils import SmartInit
-from tensorpack.train import SyncMultiGPUTrainerReplicated, TrainConfig, launch_train_with_config
-from tensorpack.utils import logger
+from lazy_imports import try_import
 
 from ..dataflow.base import DataFlow
 from ..dataflow.common import MapData
@@ -68,6 +50,26 @@ from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.tqdm import get_tqdm
 from ..utils.utils import string_to_dict
 
+with try_import() as tp_import_guard:
+    # todo: check how dataflow import is directly possible without having an AssertionError
+    # pylint: disable=import-error
+    from tensorpack.callbacks import (
+        EstimatedTimeLeft,
+        GPUMemoryTracker,
+        GPUUtilizationTracker,
+        HostMemoryTracker,
+        ModelSaver,
+        PeriodicCallback,
+        ScheduledHyperParamSetter,
+        SessionRunTimeout,
+        ThroughputTracker,
+    )
+    from tensorpack.dataflow import ProxyDataFlow, imgaug
+    from tensorpack.input_source import QueueInput
+    from tensorpack.tfutils import SmartInit
+    from tensorpack.train import SyncMultiGPUTrainerReplicated, TrainConfig, launch_train_with_config
+    from tensorpack.utils import logger
+
 __all__ = ["train_faster_rcnn"]
 
 
deepdoctection/utils/__init__.py

@@ -6,7 +6,10 @@ Init file for utils package
 """
 from typing import Optional, Tuple, Union, no_type_check
 
+from .concurrency import *
 from .context import *
+from .env_info import *
+from .error import *
 from .file_utils import *
 from .fs import *
 from .identifier import *
deepdoctection/utils/concurrency.py

@@ -109,7 +109,7 @@ def enable_death_signal(_warn: bool = True) -> None:
             prctl, "set_pdeathsig"
         ), "prctl.set_pdeathsig does not exist! Note that you need to install 'python-prctl' instead of 'prctl'."
         # is SIGHUP a good choice?
-        prctl.set_pdeathsig(signal.SIGHUP)
+        prctl.set_pdeathsig(signal.SIGHUP)  # pylint: disable=E1101
 
 
 # taken from https://github.com/tensorpack/dataflow/blob/master/dataflow/utils/concurrency.py
deepdoctection/utils/context.py

@@ -61,7 +61,7 @@ def timeout_manager(proc, seconds: Optional[int] = None) -> Iterator[str]:  # ty
         proc.terminate()
         proc.kill()
         proc.returncode = -1
-        raise RuntimeError("Tesseract process timeout")  # pylint: disable=W0707
+        raise RuntimeError(f"timeout for process id: {proc.pid}")  # pylint: disable=W0707
     finally:
         if proc.stdin is not None:
             proc.stdin.close()
@@ -88,7 +88,7 @@ def save_tmp_file(image: Union[str, ImageType, bytes], prefix: str) -> Iterator[
         yield file.name, path.realpath(path.normpath(path.normcase(image)))
         return
     if isinstance(image, (np.ndarray, np.generic)):
-        input_file_name = file.name + ".PNG"
+        input_file_name = file.name + "_input.PNG"
         viz_handler.write_image(input_file_name, image)
         yield file.name, input_file_name
     if isinstance(image, bytes):
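
Note on the recurring change in the hunks above: the training modules stop importing their deep-learning backends (detectron2, torch, transformers, tensorpack, wandb) at module level and instead wrap them in `lazy_imports.try_import` guards, while hard `ImportError`/`ModuleNotFoundError` raises become the package's new `DependencyError`. A minimal sketch of the guard pattern, assuming the `lazy_imports` package and an optional `torch` install; the `gpu_count` helper is illustrative, and the `check()` call reflects one reading of the lazy_imports guard API rather than anything shown in this diff:

from lazy_imports import try_import

# The context manager swallows a failed import and records the error
# instead of raising while the module is being loaded.
with try_import() as pt_import_guard:
    import torch  # optional dependency, may be absent

def gpu_count() -> int:
    # Re-raise the recorded ImportError only when the optional
    # dependency is actually needed at call time.
    pt_import_guard.check()
    return torch.cuda.device_count()

The effect is that a module such as d2_frcnn_train.py can be imported even when a particular backend is not installed, and the missing dependency only surfaces when code that needs it runs.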