deepdoctection 0.31-py3-none-any.whl → 0.33-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection has been flagged as potentially problematic.

Files changed (131)
  1. deepdoctection/__init__.py +16 -29
  2. deepdoctection/analyzer/dd.py +70 -59
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/common.py +9 -5
  5. deepdoctection/dataflow/custom.py +5 -5
  6. deepdoctection/dataflow/custom_serialize.py +75 -18
  7. deepdoctection/dataflow/parallel_map.py +3 -3
  8. deepdoctection/dataflow/serialize.py +4 -4
  9. deepdoctection/dataflow/stats.py +3 -3
  10. deepdoctection/datapoint/annotation.py +41 -56
  11. deepdoctection/datapoint/box.py +9 -8
  12. deepdoctection/datapoint/convert.py +6 -6
  13. deepdoctection/datapoint/image.py +56 -44
  14. deepdoctection/datapoint/view.py +245 -150
  15. deepdoctection/datasets/__init__.py +1 -4
  16. deepdoctection/datasets/adapter.py +35 -26
  17. deepdoctection/datasets/base.py +14 -12
  18. deepdoctection/datasets/dataflow_builder.py +3 -3
  19. deepdoctection/datasets/info.py +24 -26
  20. deepdoctection/datasets/instances/doclaynet.py +51 -51
  21. deepdoctection/datasets/instances/fintabnet.py +46 -46
  22. deepdoctection/datasets/instances/funsd.py +25 -24
  23. deepdoctection/datasets/instances/iiitar13k.py +13 -10
  24. deepdoctection/datasets/instances/layouttest.py +4 -3
  25. deepdoctection/datasets/instances/publaynet.py +5 -5
  26. deepdoctection/datasets/instances/pubtables1m.py +24 -21
  27. deepdoctection/datasets/instances/pubtabnet.py +32 -30
  28. deepdoctection/datasets/instances/rvlcdip.py +30 -30
  29. deepdoctection/datasets/instances/xfund.py +26 -26
  30. deepdoctection/datasets/save.py +6 -6
  31. deepdoctection/eval/__init__.py +1 -4
  32. deepdoctection/eval/accmetric.py +32 -33
  33. deepdoctection/eval/base.py +8 -9
  34. deepdoctection/eval/cocometric.py +15 -13
  35. deepdoctection/eval/eval.py +41 -37
  36. deepdoctection/eval/tedsmetric.py +30 -23
  37. deepdoctection/eval/tp_eval_callback.py +16 -19
  38. deepdoctection/extern/__init__.py +2 -7
  39. deepdoctection/extern/base.py +339 -134
  40. deepdoctection/extern/d2detect.py +85 -113
  41. deepdoctection/extern/deskew.py +14 -11
  42. deepdoctection/extern/doctrocr.py +141 -130
  43. deepdoctection/extern/fastlang.py +27 -18
  44. deepdoctection/extern/hfdetr.py +71 -62
  45. deepdoctection/extern/hflayoutlm.py +504 -211
  46. deepdoctection/extern/hflm.py +230 -0
  47. deepdoctection/extern/model.py +488 -302
  48. deepdoctection/extern/pdftext.py +23 -19
  49. deepdoctection/extern/pt/__init__.py +1 -3
  50. deepdoctection/extern/pt/nms.py +6 -2
  51. deepdoctection/extern/pt/ptutils.py +29 -19
  52. deepdoctection/extern/tessocr.py +39 -38
  53. deepdoctection/extern/texocr.py +18 -18
  54. deepdoctection/extern/tp/tfutils.py +57 -9
  55. deepdoctection/extern/tp/tpcompat.py +21 -14
  56. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
  60. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  61. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
  62. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
  67. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
  68. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  69. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  70. deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
  71. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  72. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  73. deepdoctection/extern/tpdetect.py +45 -53
  74. deepdoctection/mapper/__init__.py +3 -8
  75. deepdoctection/mapper/cats.py +27 -29
  76. deepdoctection/mapper/cocostruct.py +10 -10
  77. deepdoctection/mapper/d2struct.py +27 -26
  78. deepdoctection/mapper/hfstruct.py +13 -8
  79. deepdoctection/mapper/laylmstruct.py +178 -37
  80. deepdoctection/mapper/maputils.py +12 -11
  81. deepdoctection/mapper/match.py +2 -2
  82. deepdoctection/mapper/misc.py +11 -9
  83. deepdoctection/mapper/pascalstruct.py +4 -4
  84. deepdoctection/mapper/prodigystruct.py +5 -5
  85. deepdoctection/mapper/pubstruct.py +84 -92
  86. deepdoctection/mapper/tpstruct.py +5 -5
  87. deepdoctection/mapper/xfundstruct.py +33 -33
  88. deepdoctection/pipe/__init__.py +1 -1
  89. deepdoctection/pipe/anngen.py +12 -14
  90. deepdoctection/pipe/base.py +52 -106
  91. deepdoctection/pipe/common.py +72 -59
  92. deepdoctection/pipe/concurrency.py +16 -11
  93. deepdoctection/pipe/doctectionpipe.py +24 -21
  94. deepdoctection/pipe/language.py +20 -25
  95. deepdoctection/pipe/layout.py +20 -16
  96. deepdoctection/pipe/lm.py +75 -105
  97. deepdoctection/pipe/order.py +194 -89
  98. deepdoctection/pipe/refine.py +111 -124
  99. deepdoctection/pipe/segment.py +156 -161
  100. deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
  101. deepdoctection/pipe/text.py +37 -36
  102. deepdoctection/pipe/transform.py +19 -16
  103. deepdoctection/train/__init__.py +6 -12
  104. deepdoctection/train/d2_frcnn_train.py +48 -41
  105. deepdoctection/train/hf_detr_train.py +41 -30
  106. deepdoctection/train/hf_layoutlm_train.py +153 -135
  107. deepdoctection/train/tp_frcnn_train.py +32 -31
  108. deepdoctection/utils/concurrency.py +1 -1
  109. deepdoctection/utils/context.py +13 -6
  110. deepdoctection/utils/develop.py +4 -4
  111. deepdoctection/utils/env_info.py +87 -125
  112. deepdoctection/utils/file_utils.py +6 -11
  113. deepdoctection/utils/fs.py +22 -18
  114. deepdoctection/utils/identifier.py +2 -2
  115. deepdoctection/utils/logger.py +16 -15
  116. deepdoctection/utils/metacfg.py +7 -7
  117. deepdoctection/utils/mocks.py +93 -0
  118. deepdoctection/utils/pdf_utils.py +11 -11
  119. deepdoctection/utils/settings.py +185 -181
  120. deepdoctection/utils/tqdm.py +1 -1
  121. deepdoctection/utils/transform.py +14 -9
  122. deepdoctection/utils/types.py +104 -0
  123. deepdoctection/utils/utils.py +7 -7
  124. deepdoctection/utils/viz.py +74 -72
  125. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
  126. deepdoctection-0.33.dist-info/RECORD +146 -0
  127. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
  128. deepdoctection/utils/detection_types.py +0 -68
  129. deepdoctection-0.31.dist-info/RECORD +0 -144
  130. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
  131. {deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0
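
Two changes recur through nearly every file above: the module deepdoctection/utils/detection_types.py (entry 128) is removed in favor of the new deepdoctection/utils/types.py (entry 122), and category ids are handled as int rather than str throughout. A minimal sketch of what this means for downstream code; the example category values are invented:

# deepdoctection <= 0.31 (module removed in this release):
#   from deepdoctection.utils.detection_types import JsonDict
# deepdoctection >= 0.33:
from deepdoctection.utils.types import JsonDict

# Category ids are plain ints now, so int-keyed mappings replace str-keyed ones.
categories: dict[int, str] = {1: "text", 2: "title"}  # hypothetical mapping
record: JsonDict = {"category_id": 1, "category_name": categories[1]}
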
--- a/deepdoctection/mapper/d2struct.py
+++ b/deepdoctection/mapper/d2struct.py
@@ -19,26 +19,28 @@
 Module for mapping annotations into standard Detectron2 dataset dict. Also providing some tools for W&B mapping and
 visualising
 """
-
+from __future__ import annotations

 import os.path
-from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Union
+from typing import Mapping, Optional, Sequence, Union

 import numpy as np
-import torch
+from lazy_imports import try_import

-from ..datapoint.annotation import ImageAnnotation
+from ..datapoint.annotation import DEFAULT_CATEGORY_ID, ImageAnnotation
 from ..datapoint.image import Image
 from ..extern.pt.nms import batched_nms
 from ..mapper.maputils import curry
-from ..utils.detection_types import JsonDict
-from ..utils.file_utils import detectron2_available, wandb_available
-from ..utils.settings import ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DefaultType, ObjectTypes, TypeOrStr, get_type
+from ..utils.types import Detectron2Dict
+
+with try_import() as pt_import_guard:
+    import torch

-if detectron2_available():
+with try_import() as d2_import_guard:
     from detectron2.structures import BoxMode

-if wandb_available():
+with try_import() as wb_import_guard:
     from wandb import Classes
     from wandb import Image as Wbimage

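Note: the import pattern above, which recurs in every touched module, replaces deepdoctection's own availability checks (detectron2_available(), wandb_available()) with the lazy-imports package. try_import is a context manager that records an ImportError instead of raising it, so the module stays importable without its optional dependencies. A self-contained sketch of the pattern; the check() call follows the lazy-imports README, so treat the exact guard API as an assumption:

from lazy_imports import try_import

# Any ImportError raised inside the block is swallowed and stored on the guard.
with try_import() as d2_import_guard:
    from detectron2.structures import BoxMode

def absolute_box_mode():
    # Re-raises the stored ImportError (with an install hint) only when the
    # optional dependency is actually used.
    d2_import_guard.check()
    return BoxMode.XYXY_ABS
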
@@ -47,8 +49,8 @@ if wandb_available():
 def image_to_d2_frcnn_training(
     dp: Image,
     add_mask: bool = False,
-    category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
-) -> Optional[JsonDict]:
+    category_names: Optional[Union[TypeOrStr, Sequence[TypeOrStr]]] = None,
+) -> Optional[Detectron2Dict]:
     """
     Maps an image to a standard dataset dict as described in
     <https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html>. It further checks if the image is physically
@@ -64,7 +66,7 @@ def image_to_d2_frcnn_training(
     if not os.path.isfile(dp.location) and dp.image is None:
         return None

-    output: JsonDict = {"file_name": str(dp.location)}
+    output: Detectron2Dict = {"file_name": str(dp.location)}

     if dp.image is not None:
         output["image"] = dp.image.astype("float32")
@@ -85,10 +87,10 @@
             box = box.transform(dp.width, dp.height, absolute_coords=True)

         # Detectron2 does not fully support BoxMode.XYXY_REL
-        mapped_ann: Dict[str, Union[str, int, List[float]]] = {
+        mapped_ann: dict[str, Union[str, int, list[float]]] = {
             "bbox_mode": BoxMode.XYXY_ABS,
             "bbox": box.to_list(mode="xyxy"),
-            "category_id": int(ann.category_id) - 1,
+            "category_id": ann.category_id - 1,
         }
         annotations.append(mapped_ann)

@@ -147,23 +149,23 @@ def pt_nms_image_annotations(

 def _get_category_attributes(
     ann: ImageAnnotation, cat_to_sub_cat: Optional[Mapping[ObjectTypes, ObjectTypes]] = None
-) -> Tuple[str, str, Optional[float]]:
+) -> tuple[ObjectTypes, int, Optional[float]]:
     if cat_to_sub_cat:
         sub_cat_key = cat_to_sub_cat.get(get_type(ann.category_name))
         if sub_cat_key in ann.sub_categories:
             sub_cat = ann.get_sub_category(sub_cat_key)
-            return sub_cat.category_name, sub_cat.category_id, sub_cat.score
-        return "", "", 0.0
-    return ann.category_name, ann.category_id, ann.score
+            return get_type(sub_cat.category_name), sub_cat.category_id, sub_cat.score
+        return DefaultType.DEFAULT_TYPE, DEFAULT_CATEGORY_ID, 0.0
+    return get_type(ann.category_name), ann.category_id, ann.score


 @curry
 def to_wandb_image(
     dp: Image,
-    categories: Mapping[str, TypeOrStr],
-    sub_categories: Optional[Mapping[str, TypeOrStr]] = None,
+    categories: Mapping[int, TypeOrStr],
+    sub_categories: Optional[Mapping[int, TypeOrStr]] = None,
     cat_to_sub_cat: Optional[Mapping[ObjectTypes, ObjectTypes]] = None,
-) -> Tuple[str, "Wbimage"]:
+) -> tuple[str, Wbimage]:
     """
     Converting a deepdoctection image into a wandb image

@@ -183,11 +185,10 @@ def to_wandb_image(
     anns = dp.get_annotation(category_names=list(categories.values()))

     if sub_categories:
-        class_labels = {int(key): val for key, val in sub_categories.items()}
-        class_set = Classes([{"name": val, "id": int(key)} for key, val in sub_categories.items()])
+        class_labels = dict(sub_categories.items())
+        class_set = Classes([{"name": val, "id": key} for key, val in sub_categories.items()])
     else:
-        class_labels = {int(key): val for key, val in categories.items()}
-        class_set = Classes([{"name": val, "id": int(key)} for key, val in categories.items()])
+        class_set = Classes([{"name": val, "id": key} for key, val in categories.items()])

     for ann in anns:
         bounding_box = ann.get_bounding_box(dp.image_id)
@@ -198,7 +199,7 @@
         box = {
             "position": {"middle": bounding_box.center, "width": bounding_box.width, "height": bounding_box.height},
             "domain": "pixel",
-            "class_id": int(category_id),
+            "class_id": category_id,
             "box_caption": category_name,
         }
         if score:
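
Note: to_wandb_image now takes categories keyed by int, so the int(key) casts disappear and ids are passed straight through to W&B. A short sketch of building a wandb.Classes set from such a mapping; the labels are invented:

from wandb import Classes

categories = {1: "table", 2: "figure"}  # int-keyed, as the new signature expects

# Mirrors the updated call in to_wandb_image: ids are used as-is, no int() casts.
class_set = Classes([{"name": val, "id": key} for key, val in categories.items()])
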
--- a/deepdoctection/mapper/hfstruct.py
+++ b/deepdoctection/mapper/hfstruct.py
@@ -19,26 +19,31 @@
 Module for mapping annotations into standard Huggingface Detr input structure for training
 """

+from __future__ import annotations
+
 import os
 from dataclasses import dataclass, field
-from typing import Dict, List, Literal, Optional, Sequence, Union
+from typing import Literal, Optional, Sequence, Union

 import numpy as np
-from transformers import BatchFeature, DetrFeatureExtractor
+from lazy_imports import try_import

 from ..datapoint.image import Image
 from ..mapper.maputils import curry
 from ..mapper.misc import get_load_image_func
-from ..utils.detection_types import JsonDict
-from ..utils.settings import ObjectTypes
+from ..utils.settings import TypeOrStr
 from ..utils.transform import PadTransform
+from ..utils.types import JsonDict
+
+with try_import() as tr_import_guard:
+    from transformers import BatchFeature, DetrFeatureExtractor


 @curry
 def image_to_hf_detr_training(
     dp: Image,
     add_mask: bool = False,
-    category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
+    category_names: Optional[Union[TypeOrStr, Sequence[Union[TypeOrStr]]]] = None,
 ) -> Optional[JsonDict]:
     """
     Maps an image to a detr input datapoint dict, that, after collating can be used for training.
@@ -71,11 +76,11 @@ image_to_hf_detr_training
     for ann in anns:
         box = ann.get_bounding_box(dp.image_id)

-        mapped_ann: Dict[str, Union[str, int, float, List[float]]] = {
+        mapped_ann: dict[str, Union[str, int, float, list[float]]] = {
             "id": "".join([c for c in ann.annotation_id if c.isdigit()])[:8],
             "image_id": "".join([c for c in dp.image_id if c.isdigit()])[:8],
             "bbox": box.to_list(mode="xywh"),
-            "category_id": int(ann.category_id) - 1,
+            "category_id": ann.category_id - 1,
             "area": box.area,
         }
         annotations.append(mapped_ann)
@@ -103,7 +108,7 @@ class DetrDataCollator:
     padder: Optional[PadTransform] = None
     return_tensors: Optional[Literal["pt"]] = field(default="pt")

-    def __call__(self, raw_features: List[JsonDict]) -> BatchFeature:
+    def __call__(self, raw_features: list[JsonDict]) -> BatchFeature:
         """
         Creating BatchFeature from a list of dict of raw features.

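Note: in both training mappers the label shift is now written ann.category_id - 1 without an int(...) cast, which works because 0.33 stores category ids as ints. deepdoctection keeps ids 1-based while Detectron2 and Hugging Face Detr expect 0-based contiguous labels; a toy illustration with invented values:

# deepdoctection category ids start at 1; detection frameworks want 0-based ids.
dd_categories = {1: "table", 2: "row", 3: "column"}  # hypothetical mapping

def to_framework_id(category_id: int) -> int:
    # The same shift as in image_to_d2_frcnn_training / image_to_hf_detr_training.
    return category_id - 1

assert [to_framework_id(i) for i in dd_categories] == [0, 1, 2]
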
--- a/deepdoctection/mapper/laylmstruct.py
+++ b/deepdoctection/mapper/laylmstruct.py
@@ -20,32 +20,30 @@ Module for mapping annotations from image to layout lm input structure. Heavily
 <https://github.com/NielsRogge/Transformers-Tutorials>
 """

+from __future__ import annotations
+
 import random
 from dataclasses import dataclass, field
-from typing import Any, Callable, Dict, List, Literal, NewType, Optional, Sequence, Union
+from typing import Any, Callable, Literal, NewType, Optional, Sequence, Union

 import numpy as np
 import numpy.typing as npt
+from lazy_imports import try_import

 from ..datapoint.annotation import ContainerAnnotation
 from ..datapoint.convert import box_to_point4, point4_to_box
 from ..datapoint.image import Image
-from ..utils.detection_types import JsonDict
-from ..utils.file_utils import pytorch_available, transformers_available
+from ..datapoint.view import Page
 from ..utils.settings import DatasetType, LayoutType, PageType, Relationships, WordType
 from ..utils.transform import ResizeTransform, normalize_image
+from ..utils.types import JsonDict
 from .maputils import curry

-if pytorch_available():
+with try_import() as import_guard:
     import torch

-if transformers_available():
-    from transformers import (  # pylint: disable=W0611
-        BatchEncoding,
-        PreTrainedTokenizerFast,
-        RobertaTokenizerFast,
-        XLMRobertaTokenizerFast,
-    )
+with try_import() as tr_import_guard:
+    from transformers import BatchEncoding, PreTrainedTokenizerFast  # pylint: disable=W0611

 __all__ = [
     "image_to_raw_layoutlm_features",
@@ -54,19 +52,24 @@ __all__ = [
     "image_to_layoutlm_features",
     "DataCollator",
     "LayoutLMFeatures",
+    "image_to_raw_lm_features",
+    "image_to_lm_features",
 ]

 RawLayoutLMFeatures = NewType("RawLayoutLMFeatures", JsonDict)
+RawLMFeatures = NewType("RawLMFeatures", JsonDict)
 LayoutLMFeatures = NewType("LayoutLMFeatures", JsonDict)
+LMFeatures = NewType("LMFeatures", JsonDict)
 InputDataClass = NewType("InputDataClass", JsonDict)

+
 """
 <https://github.com/huggingface/transformers/src/transformers/data/data_collator.py>
 A DataCollator is a function that takes a list of samples from a Dataset and collate them into a batch, as a dictionary
 of PyTorch/TensorFlow tensors or NumPy arrays.
 """

-DataCollator = NewType("DataCollator", Callable[[List[InputDataClass]], Dict[str, Any]])  # type: ignore
+DataCollator = NewType("DataCollator", Callable[[list[InputDataClass]], dict[str, Any]])  # type: ignore

 _CLS_BOX = [0.0, 0.0, 1000.0, 1000.0]
 _SEP_BOX = [1000.0, 1000.0, 1000.0, 1000.0]
@@ -122,9 +125,9 @@ def image_to_raw_layoutlm_features(
     all_ann_ids = []
     all_words = []
     all_boxes = []
-    all_labels: List[int] = []
+    all_labels: list[int] = []

-    anns = dp.get_annotation_iter(category_names=LayoutType.word)
+    anns = dp.get_annotation_iter(category_names=LayoutType.WORD)

     word_id_to_segment_box = {}
     if segment_positions:
@@ -136,12 +139,12 @@
             if not bounding_box.absolute_coords:
                 bounding_box = bounding_box.transform(dp.width, dp.height, absolute_coords=True)
             word_id_to_segment_box.update(
-                {word_ann: bounding_box for word_ann in segm_ann.get_relationship(Relationships.child)}
+                {word_ann: bounding_box for word_ann in segm_ann.get_relationship(Relationships.CHILD)}
             )

     for ann in anns:
         all_ann_ids.append(ann.annotation_id)
-        char_cat = ann.get_sub_category(WordType.characters)
+        char_cat = ann.get_sub_category(WordType.CHARACTERS)
         if not isinstance(char_cat, ContainerAnnotation):
             raise TypeError(f"char_cat must be of type ContainerAnnotation but is of type {type(char_cat)}")
         word = char_cat.value
@@ -155,15 +158,15 @@
         all_boxes.append(word_id_to_segment_box.get(ann.annotation_id, box).to_list(mode="xyxy"))

         if (
-            WordType.token_tag in ann.sub_categories or WordType.token_class in ann.sub_categories
-        ) and dataset_type == DatasetType.token_classification:
+            WordType.TOKEN_TAG in ann.sub_categories or WordType.TOKEN_CLASS in ann.sub_categories
+        ) and dataset_type == DatasetType.TOKEN_CLASSIFICATION:
             if use_token_tag:
-                all_labels.append(int(ann.get_sub_category(WordType.token_tag).category_id) - 1)
+                all_labels.append(ann.get_sub_category(WordType.TOKEN_TAG).category_id - 1)
             else:
-                all_labels.append(int(ann.get_sub_category(WordType.token_class).category_id) - 1)
+                all_labels.append(ann.get_sub_category(WordType.TOKEN_CLASS).category_id - 1)

-    if dp.summary is not None and dataset_type == DatasetType.sequence_classification:
-        all_labels.append(int(dp.summary.get_sub_category(PageType.document_type).category_id) - 1)
+    if dataset_type == DatasetType.SEQUENCE_CLASSIFICATION:
+        all_labels.append(dp.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id - 1)

     boxes = np.asarray(all_boxes, dtype="float32")
     if boxes.ndim == 1:
@@ -208,7 +211,7 @@
     return raw_features


-def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
+def layoutlm_features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """
     Converting list of floats to pytorch tensors
     :param features: LayoutLMFeatures
@@ -216,7 +219,8 @@ def features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """

     _image_key = "pixel_values" if "pixel_values" in features else "image"
-    features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
+    if "bbox" in features:
+        features["bbox"] = torch.tensor(features["bbox"], dtype=torch.long)
     if "labels" in features:
         features["labels"] = torch.tensor(features["labels"], dtype=torch.long)
     if _image_key in features:
@@ -230,12 +234,12 @@


 def _tokenize_with_sliding_window(
-    raw_features: List[RawLayoutLMFeatures],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: list[Union[RawLayoutLMFeatures, RawLMFeatures]],
+    tokenizer: PreTrainedTokenizerFast,
     sliding_window_stride: int,
     max_batch_size: int,
     return_tensors: Optional[Literal["pt"]] = None,
-) -> Union[JsonDict, "BatchEncoding"]:
+) -> Union[JsonDict, BatchEncoding]:
     """
     Runs a tokenizer: If there are no overflowing tokens, the tokenizer output will be returned as it is.
     If there are overflowing tokens, sliding windows have to be built. As it is easier to prepare the sliding windows
@@ -381,7 +385,7 @@
             )
         )

-    slided_tokenized_inputs: Dict[str, Union[List[Union[str, int]], torch.Tensor]] = {}
+    slided_tokenized_inputs: dict[str, Union[list[Union[str, int]], torch.Tensor]] = {}
     if return_tensors == "pt":
         slided_tokenized_inputs["overflow_to_sample_mapping"] = torch.tensor(overflow_to_sample_mapping)
         slided_tokenized_inputs["input_ids"] = torch.tensor(all_input_ids)
@@ -398,8 +402,8 @@


 def raw_features_to_layoutlm_features(
-    raw_features: Union[RawLayoutLMFeatures, List[RawLayoutLMFeatures]],
-    tokenizer: "PreTrainedTokenizerFast",
+    raw_features: Union[RawLayoutLMFeatures, RawLMFeatures, list[Union[RawLayoutLMFeatures, RawLMFeatures]]],
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
@@ -407,6 +411,7 @@
     remove_columns_for_training: bool = False,
     sliding_window_stride: int = 0,
     max_batch_size: int = 0,
+    remove_bounding_boxes: bool = False,
 ) -> LayoutLMFeatures:
     """
     Mapping raw features to tokenized input sequences for LayoutLM models.
@@ -442,11 +447,11 @@
         raw_features = [raw_features]

     _has_token_labels = (
-        raw_features[0]["dataset_type"] == DatasetType.token_classification
+        raw_features[0]["dataset_type"] == DatasetType.TOKEN_CLASSIFICATION
         and raw_features[0].get("labels") is not None
     )
     _has_sequence_labels = (
-        raw_features[0]["dataset_type"] == DatasetType.sequence_classification
+        raw_features[0]["dataset_type"] == DatasetType.SEQUENCE_CLASSIFICATION
         and raw_features[0].get("labels") is not None
     )
     _has_labels = bool(_has_token_labels or _has_sequence_labels)
@@ -563,8 +568,11 @@
         input_dict.pop("ann_ids")
         input_dict.pop("tokens")

+    if remove_bounding_boxes:
+        input_dict.pop("bbox")
+
     if return_tensors == "pt":
-        return features_to_pt_tensors(LayoutLMFeatures(input_dict))
+        return layoutlm_features_to_pt_tensors(LayoutLMFeatures(input_dict))
     return LayoutLMFeatures(input_dict)


@@ -595,13 +603,14 @@ class LayoutLMDataCollator:
                                   with windows shifted `sliding_window_stride` to the right.
     """

-    tokenizer: "PreTrainedTokenizerFast"
+    tokenizer: PreTrainedTokenizerFast
     padding: Literal["max_length", "do_not_pad", "longest"] = field(default="max_length")
     truncation: bool = field(default=True)
     return_overflowing_tokens: bool = field(default=False)
     return_tensors: Optional[Literal["pt"]] = field(default=None)
     sliding_window_stride: int = field(default=0)
     max_batch_size: int = field(default=0)
+    remove_bounding_box_features: bool = field(default=False)

     def __post_init__(self) -> None:
         assert isinstance(self.tokenizer, PreTrainedTokenizerFast), "Tokenizer must be a fast tokenizer"
@@ -611,7 +620,7 @@
         if self.return_overflowing_tokens:
             assert self.truncation, self.truncation

-    def __call__(self, raw_features: Union[RawLayoutLMFeatures, List[RawLayoutLMFeatures]]) -> LayoutLMFeatures:
+    def __call__(self, raw_features: Union[RawLayoutLMFeatures, list[RawLayoutLMFeatures]]) -> LayoutLMFeatures:
         """
         Calling the DataCollator to form model inputs for training and inference. Takes a single raw
         :param raw_features: A dictionary with the following arguments: `image_id, width, height, ann_ids, words,
@@ -620,7 +629,7 @@
                              token_type_ids, attention_masks, boxes, labels`.
         """
         return raw_features_to_layoutlm_features(
-            raw_features,
+            raw_features,  # type: ignore
             self.tokenizer,
             self.padding,
             self.truncation,
@@ -629,13 +638,14 @@
             True,
             self.sliding_window_stride,
             self.max_batch_size,
+            self.remove_bounding_box_features,
         )


 @curry
 def image_to_layoutlm_features(
     dp: Image,
-    tokenizer: "PreTrainedTokenizerFast",
+    tokenizer: PreTrainedTokenizerFast,
     padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
     truncation: bool = True,
     return_overflowing_tokens: bool = False,
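
Note: LayoutLMDataCollator gains a remove_bounding_box_features field, threaded through to the new remove_bounding_boxes argument of raw_features_to_layoutlm_features, so that purely text-based models can drop the dummy boxes before batching. A construction sketch under that 0.33 signature; the tokenizer checkpoint is only an example:

from transformers import AutoTokenizer

from deepdoctection.mapper.laylmstruct import LayoutLMDataCollator

# Any fast tokenizer passes the collator's PreTrainedTokenizerFast assertion.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# remove_bounding_box_features=True pops the "bbox" entry before batching,
# matching the new remove_bounding_boxes branch shown above.
collator = LayoutLMDataCollator(tokenizer, return_tensors="pt", remove_bounding_box_features=True)
# batch = collator(raw_features)  # raw_features: RawLMFeatures or a list of them
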
@@ -724,3 +734,134 @@ def image_to_layoutlm_features(
         sliding_window_stride=sliding_window_stride,
     )
     return features
+
+
+@curry
+def image_to_raw_lm_features(
+    dp: Image,
+    dataset_type: Optional[Literal["sequence_classification", "token_classification"]] = None,
+    use_token_tag: bool = True,
+    text_container: Optional[LayoutType] = LayoutType.WORD,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[RawLMFeatures]:
+    """
+    Mapping a datapoint into an intermediate format for bert-like models. Features will be provided into a dict and
+    this mapping can be used for sequence or token classification as well as for inference. To generate input features
+    for the model please `use raw_features_to_layoutlm_features`.
+
+
+    :param dp: Image
+    :param dataset_type: Either SEQUENCE_CLASSIFICATION or TOKEN_CLASSIFICATION. When using a built-in dataset use
+    :param use_token_tag: Will only be used for dataset_type="token_classification". If use_token_tag=True, will use
+                          labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
+                          `WordType.token_class`.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: dictionary with the following arguments:
+             'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+    """
+
+    raw_features: RawLMFeatures = RawLMFeatures({})
+
+    page = Page.from_image(dp, text_container, floating_text_block_categories, include_residual_text_container)
+
+    text_ = page.text_
+
+    # pylint: disable=E1137  #3162
+    raw_features["image_id"] = page.image_id
+    raw_features["width"] = page.width
+    raw_features["height"] = page.height
+    raw_features["ann_ids"] = text_["ann_ids"]
+    raw_features["words"] = text_["words"]
+    # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
+    # raw_features_to_layoutlm_features
+    raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+    raw_features["dataset_type"] = dataset_type
+
+    if use_token_tag and text_["token_tags"]:
+        raw_features["labels"] = text_["token_tags"]
+    elif text_["token_classes"]:
+        raw_features["labels"] = text_["token_classes"]
+    elif page.document_type is not None:
+        document_type_id = page.image_orig.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id - 1
+        raw_features["labels"] = [document_type_id]
+
+    raw_features["dataset_type"] = dataset_type
+    # pylint: enable=E1137
+    return raw_features
+
+
+@curry
+def image_to_lm_features(
+    dp: Image,
+    tokenizer: PreTrainedTokenizerFast,
+    padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
+    truncation: bool = True,
+    return_overflowing_tokens: bool = False,
+    return_tensors: Optional[Literal["pt"]] = "pt",
+    sliding_window_stride: int = 0,
+    text_container: Optional[LayoutType] = LayoutType.WORD,
+    floating_text_block_categories: Optional[Sequence[LayoutType]] = None,
+    include_residual_text_container: bool = False,
+) -> Optional[LayoutLMFeatures]:
+    """
+    Mapping function to generate layoutlm features from `Image` to be used for inference in a pipeline component.
+    `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
+    with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
+    used internally in `LMTokenClassifierService`.
+
+        tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
+                                             categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+
+        layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
+
+    :param dp: Image datapoint
+    :param tokenizer: Tokenizer compatible with the language model
+    :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
+                    `do_not_pad`.
+    :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
+                       maximum acceptable input length for the model if that argument is not provided. This will
+                       truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                       sequences (or a batch of pairs) is provided.
+                       If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
+                       model maximum admissible input size).
+    :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
+                                      can be returned as an additional batch element. Not that in this case, the number
+                                      of input batch samples will be smaller than the output batch samples.
+    :param return_tensors: Output tensor features. Either 'pt' for PyTorch models or None, if features should be
+                           returned in list objects.
+    :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length a sliding
+                                  windows will be created with each window having max_length sequence input. When using
+                                  `sliding_window_stride=0` no strides will be created, otherwise it will create slides
+                                  with windows shifted `sliding_window_stride` to the right.
+    :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
+    :param floating_text_block_categories: A list of top level layout objects
+    :param include_residual_text_container: This will regard synthetic text line annotations as floating text
+                                            blocks and therefore incorporate all image annotations of category
+                                            `word` when building text strings.
+    :return: A dict of lm features
+    """
+    raw_features = image_to_raw_lm_features(  # pylint: disable=E1102
+        dataset_type=None,
+        use_token_tag=True,
+        text_container=text_container,
+        floating_text_block_categories=floating_text_block_categories,
+        include_residual_text_container=include_residual_text_container,
+    )(dp)
+    if raw_features is None:
+        return None
+    features = raw_features_to_layoutlm_features(
+        raw_features,
+        tokenizer,
+        padding,
+        truncation,
+        return_overflowing_tokens,
+        return_tensors=return_tensors,
+        sliding_window_stride=sliding_window_stride,
+    )
+    return features
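
Note: image_to_raw_lm_features fills bbox with the constant _CLS_BOX precisely so the existing raw_features_to_layoutlm_features machinery can be reused for box-free, bert-like models. Like the other mappers it is curried: configure once, then apply per datapoint. A rough usage sketch; the checkpoint name is an example:

from transformers import AutoTokenizer

from deepdoctection.mapper.laylmstruct import image_to_lm_features

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)

# The mapper is curried: fix the configuration first, then feed Image datapoints,
# e.g. from inside an LMTokenClassifierService-style pipeline component.
mapper = image_to_lm_features(tokenizer=tokenizer, return_tensors="pt")
# features = mapper(dp)  # dp: deepdoctection Image -> tokenized features
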
--- a/deepdoctection/mapper/maputils.py
+++ b/deepdoctection/mapper/maputils.py
@@ -18,20 +18,22 @@
 """
 Utility functions related to mapping tasks
 """
+from __future__ import annotations
+
 import functools
 import itertools
 import traceback
 from types import TracebackType
-from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union
+from typing import Any, Callable, Mapping, Optional, Sequence, Union

 import numpy as np
 from tabulate import tabulate
 from termcolor import colored

-from ..utils.detection_types import DP, BaseExceptionType, S, T
 from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import ObjectTypes
+from ..utils.types import DP, BaseExceptionType, S, T

 __all__ = ["MappingContextManager", "DefaultMapper", "maybe_get_fake_score", "LabelSummarizer", "curry"]

@@ -43,7 +45,7 @@ class MappingContextManager:
     """

     def __init__(
-        self, dp_name: Optional[str] = None, filter_level: str = "image", **kwargs: Dict[str, Optional[str]]
+        self, dp_name: Optional[str] = None, filter_level: str = "image", **kwargs: dict[str, Optional[str]]
     ) -> None:
         """
         :param dp_name: A name for the datapoint to be mapped
@@ -55,7 +57,7 @@
         self.context_error = True
         self.kwargs = kwargs

-    def __enter__(self) -> "MappingContextManager":
+    def __enter__(self) -> MappingContextManager:
         """
         context enter
         """
@@ -79,6 +81,7 @@
             AssertionError,
             TypeError,
             FileNotFoundError,
+            AttributeError,
             BoundingBoxError,
             AnnotationError,
             ImageError,
@@ -190,7 +193,7 @@ class LabelSummarizer:

     """

-    def __init__(self, categories: Mapping[str, ObjectTypes]) -> None:
+    def __init__(self, categories: Mapping[int, ObjectTypes]) -> None:
         """
         :param categories: A dict of categories as given as in categories.get_categories().
         """
@@ -208,11 +211,11 @@
         np_item = np.asarray(item, dtype="int8")
         self.summary += np.histogram(np_item, bins=self.hist_bins)[0]

-    def get_summary(self) -> Dict[str, np.int32]:
+    def get_summary(self) -> dict[int, int]:
         """
         Get a dictionary with category ids and the number dumped
         """
-        return dict(list(zip(self.categories.keys(), self.summary.astype(np.int32))))
+        return dict(list(zip(self.categories.keys(), self.summary.tolist())))

     def print_summary_histogram(self, dd_logic: bool = True) -> None:
         """
@@ -221,11 +224,9 @@
         :param dd_logic: Follow dd category convention when printing histogram (last background bucket omitted).
         """
         if dd_logic:
-            data = list(itertools.chain(*[[self.categories[str(i)].value, v] for i, v in enumerate(self.summary, 1)]))
+            data = list(itertools.chain(*[[self.categories[i].value, v] for i, v in enumerate(self.summary, 1)]))
         else:
-            data = list(
-                itertools.chain(*[[self.categories[str(i + 1)].value, v] for i, v in enumerate(self.summary[:-1])])
-            )
+            data = list(itertools.chain(*[[self.categories[i + 1].value, v] for i, v in enumerate(self.summary[:-1])]))
         num_columns = min(6, len(data))
         total_img_anns = sum(data[1::2])
         data.extend([None] * ((num_columns - len(data) % num_columns) % num_columns))
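
Note: with Mapping[int, ObjectTypes] keys and a plain-int summary, LabelSummarizer no longer round-trips category ids through str. A small usage sketch under the 0.33 signature; the chosen layout types and the dump call are assumptions inferred from the methods visible above:

from deepdoctection.mapper.maputils import LabelSummarizer
from deepdoctection.utils.settings import LayoutType

categories = {1: LayoutType.TEXT, 2: LayoutType.TITLE}  # int keys, example values
summarizer = LabelSummarizer(categories)

summarizer.dump([1, 1, 2])  # three annotations: two text, one title
print(summarizer.get_summary())  # expected: {1: 2, 2: 1} with plain int counts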