PyPI - deepdoctection - Versions diffs - 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl - Mend

deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (124)

deepdoctection/__init__.py +2 -1
deepdoctection/analyzer/__init__.py +2 -1
deepdoctection/analyzer/config.py +904 -0
deepdoctection/analyzer/dd.py +36 -62
deepdoctection/analyzer/factory.py +311 -141
deepdoctection/configs/conf_dd_one.yaml +100 -44
deepdoctection/configs/profiles.jsonl +32 -0
deepdoctection/dataflow/__init__.py +9 -6
deepdoctection/dataflow/base.py +33 -15
deepdoctection/dataflow/common.py +96 -75
deepdoctection/dataflow/custom.py +36 -29
deepdoctection/dataflow/custom_serialize.py +135 -91
deepdoctection/dataflow/parallel_map.py +33 -31
deepdoctection/dataflow/serialize.py +15 -10
deepdoctection/dataflow/stats.py +41 -28
deepdoctection/datapoint/__init__.py +4 -6
deepdoctection/datapoint/annotation.py +104 -66
deepdoctection/datapoint/box.py +190 -130
deepdoctection/datapoint/convert.py +66 -39
deepdoctection/datapoint/image.py +151 -95
deepdoctection/datapoint/view.py +383 -236
deepdoctection/datasets/__init__.py +2 -6
deepdoctection/datasets/adapter.py +11 -11
deepdoctection/datasets/base.py +118 -81
deepdoctection/datasets/dataflow_builder.py +18 -12
deepdoctection/datasets/info.py +76 -57
deepdoctection/datasets/instances/__init__.py +6 -2
deepdoctection/datasets/instances/doclaynet.py +17 -14
deepdoctection/datasets/instances/fintabnet.py +16 -22
deepdoctection/datasets/instances/funsd.py +11 -6
deepdoctection/datasets/instances/iiitar13k.py +9 -9
deepdoctection/datasets/instances/layouttest.py +9 -9
deepdoctection/datasets/instances/publaynet.py +9 -9
deepdoctection/datasets/instances/pubtables1m.py +13 -13
deepdoctection/datasets/instances/pubtabnet.py +13 -15
deepdoctection/datasets/instances/rvlcdip.py +8 -8
deepdoctection/datasets/instances/xfund.py +11 -9
deepdoctection/datasets/registry.py +18 -11
deepdoctection/datasets/save.py +12 -11
deepdoctection/eval/__init__.py +3 -2
deepdoctection/eval/accmetric.py +72 -52
deepdoctection/eval/base.py +29 -10
deepdoctection/eval/cocometric.py +14 -12
deepdoctection/eval/eval.py +56 -41
deepdoctection/eval/registry.py +6 -3
deepdoctection/eval/tedsmetric.py +24 -9
deepdoctection/eval/tp_eval_callback.py +13 -12
deepdoctection/extern/__init__.py +1 -1
deepdoctection/extern/base.py +176 -97
deepdoctection/extern/d2detect.py +127 -92
deepdoctection/extern/deskew.py +19 -10
deepdoctection/extern/doctrocr.py +157 -106
deepdoctection/extern/fastlang.py +25 -17
deepdoctection/extern/hfdetr.py +137 -60
deepdoctection/extern/hflayoutlm.py +329 -248
deepdoctection/extern/hflm.py +67 -33
deepdoctection/extern/model.py +108 -762
deepdoctection/extern/pdftext.py +37 -12
deepdoctection/extern/pt/nms.py +15 -1
deepdoctection/extern/pt/ptutils.py +13 -9
deepdoctection/extern/tessocr.py +87 -54
deepdoctection/extern/texocr.py +29 -14
deepdoctection/extern/tp/tfutils.py +36 -8
deepdoctection/extern/tp/tpcompat.py +54 -16
deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
deepdoctection/extern/tpdetect.py +4 -2
deepdoctection/mapper/__init__.py +1 -1
deepdoctection/mapper/cats.py +117 -76
deepdoctection/mapper/cocostruct.py +35 -17
deepdoctection/mapper/d2struct.py +56 -29
deepdoctection/mapper/hfstruct.py +32 -19
deepdoctection/mapper/laylmstruct.py +221 -185
deepdoctection/mapper/maputils.py +71 -35
deepdoctection/mapper/match.py +76 -62
deepdoctection/mapper/misc.py +68 -44
deepdoctection/mapper/pascalstruct.py +13 -12
deepdoctection/mapper/prodigystruct.py +33 -19
deepdoctection/mapper/pubstruct.py +42 -32
deepdoctection/mapper/tpstruct.py +39 -19
deepdoctection/mapper/xfundstruct.py +20 -13
deepdoctection/pipe/__init__.py +1 -2
deepdoctection/pipe/anngen.py +104 -62
deepdoctection/pipe/base.py +226 -107
deepdoctection/pipe/common.py +206 -123
deepdoctection/pipe/concurrency.py +74 -47
deepdoctection/pipe/doctectionpipe.py +108 -47
deepdoctection/pipe/language.py +41 -24
deepdoctection/pipe/layout.py +45 -18
deepdoctection/pipe/lm.py +146 -78
deepdoctection/pipe/order.py +196 -113
deepdoctection/pipe/refine.py +111 -63
deepdoctection/pipe/registry.py +1 -1
deepdoctection/pipe/segment.py +213 -142
deepdoctection/pipe/sub_layout.py +76 -46
deepdoctection/pipe/text.py +52 -33
deepdoctection/pipe/transform.py +8 -6
deepdoctection/train/d2_frcnn_train.py +87 -69
deepdoctection/train/hf_detr_train.py +72 -40
deepdoctection/train/hf_layoutlm_train.py +85 -46
deepdoctection/train/tp_frcnn_train.py +56 -28
deepdoctection/utils/concurrency.py +59 -16
deepdoctection/utils/context.py +40 -19
deepdoctection/utils/develop.py +25 -17
deepdoctection/utils/env_info.py +85 -36
deepdoctection/utils/error.py +16 -10
deepdoctection/utils/file_utils.py +246 -62
deepdoctection/utils/fs.py +162 -43
deepdoctection/utils/identifier.py +29 -16
deepdoctection/utils/logger.py +49 -32
deepdoctection/utils/metacfg.py +83 -21
deepdoctection/utils/pdf_utils.py +119 -62
deepdoctection/utils/settings.py +24 -10
deepdoctection/utils/tqdm.py +10 -5
deepdoctection/utils/transform.py +182 -46
deepdoctection/utils/utils.py +61 -28
deepdoctection/utils/viz.py +150 -104
deepdoctection-0.43.dist-info/METADATA +376 -0
deepdoctection-0.43.dist-info/RECORD +149 -0
{deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
deepdoctection/analyzer/_config.py +0 -146
deepdoctection-0.42.0.dist-info/METADATA +0 -431
deepdoctection-0.42.0.dist-info/RECORD +0 -148
{deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
{deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0

deepdoctection/mapper/d2struct.py CHANGED Viewed

@@ -16,8 +16,9 @@
 # limitations under the License.
 """
-Module for mapping annotations into standard Detectron2 dataset dict. Also providing some tools for W&B mapping and
-visualising
+Module for mapping annotations into standard Detectron2 dataset dict.
+Also providing some tools for W&B mapping and visualisation.
 """
 from __future__ import annotations
@@ -57,11 +58,14 @@ def image_to_d2_frcnn_training(
     available, for otherwise the annotation will be filtered.
     Note, that the returned dict will not suffice for training as gt for RPN and anchors still need to be created.
-    :param dp: Image
-    :param add_mask: True is not implemented (yet).
-    :param category_names: A list of category names for training a model. Pass nothing to train with all annotations
-    :return: Dict with 'image', 'width', 'height', 'image_id', 'annotations' where 'annotations' is a list of dict
-             with 'bbox_mode' (D2 internal bounding box description), 'bbox' and 'category_id'.
+    Args:
+        dp: Image
+        add_mask: `True` is not implemented (yet).
+        category_names: A list of category names for training a model. Pass nothing to train with all annotations
+    Returns:
+        Dict with 'image', 'width', 'height', 'image_id', 'annotations' where 'annotations' is a list of dict
+        with 'bbox_mode' (D2 internal bounding box description), 'bbox' and 'category_id'.
     """
     if not os.path.isfile(dp.location) and dp.image is None:
         return None
@@ -110,11 +114,14 @@ def pt_nms_image_annotations_depr(
     annotation, e.g. given by name or returned through different predictors. This is the pt version, for tf check
     `mapper.tpstruct`
-    :param anns: A sequence of ImageAnnotations. All annotations will be treated as if they belong to one category
-    :param threshold: NMS threshold
-    :param image_id: id in order to get the embedding bounding box
-    :param prio: If an annotation has prio, it will overwrite its given score to 1 so that it will never be suppressed
-    :return: A list of annotation_ids that belong to the given input sequence and that survive the NMS process
+    Args:
+        anns: A sequence of ImageAnnotations. All annotations will be treated as if they belong to one category
+        threshold: NMS threshold
+        image_id: id in order to get the embedding bounding box
+        prio: If an annotation has prio, it will overwrite its given score to 1 so that it will never be suppressed
+    Returns:
+        A list of `annotation_id`s that belong to the given input sequence and that survive the NMS process
     """
     if len(anns) == 1:
         return [anns[0].annotation_id]
@@ -151,15 +158,18 @@ def pt_nms_image_annotations(
     anns: Sequence[ImageAnnotation], threshold: float, image_id: Optional[str] = None, prio: str = ""
 ) -> Sequence[str]:
     """
-    Processing given image annotations through NMS. This is useful, if you want to supress some specific image
-    annotation, e.g. given by name or returned through different predictors. This is the pt version, for tf check
+    Processes given image annotations through NMS (Non-Maximum Suppression). Useful for suppressing specific image
+    annotations, e.g., given by name or returned through different predictors. This is the pt version, for tf check
     `mapper.tpstruct`
-    :param anns: A sequence of ImageAnnotations. All annotations will be treated as if they belong to one category
-    :param threshold: NMS threshold
-    :param image_id: id in order to get the embedding bounding box
-    :param prio: If an annotation has prio, it will overwrite its given score to 1 so that it will never be suppressed
-    :return: A list of annotation_ids that belong to the given input sequence and that survive the NMS process
+    Args:
+        anns: A sequence of `ImageAnnotation`. All annotations will be treated as if they belong to one category.
+        threshold: NMS threshold.
+        image_id: ID to get the embedding bounding box.
+        prio: If an annotation has priority, its score will be set to 1 so that it will never be suppressed.
+    Returns:
+        A list of `annotation_id` that belong to the given input sequence and that survive the NMS process.
     """
     if len(anns) == 1:
         return [anns[0].annotation_id]
@@ -213,6 +223,16 @@ def pt_nms_image_annotations(
 def _get_category_attributes(
     ann: ImageAnnotation, cat_to_sub_cat: Optional[Mapping[ObjectTypes, ObjectTypes]] = None
 ) -> tuple[ObjectTypes, int, Optional[float]]:
+    """
+    Gets the category attributes for an annotation, optionally using a mapping from category to sub-category.
+    Args:
+        ann: `ImageAnnotation`
+        cat_to_sub_cat: Optional mapping from `ObjectTypes` to `ObjectTypes`.
+    Returns:
+        Tuple of `ObjectTypes`, `category_id`, and `score`.
+    """
     if cat_to_sub_cat:
         sub_cat_key = cat_to_sub_cat.get(get_type(ann.category_name))
         if sub_cat_key in ann.sub_categories:
@@ -230,16 +250,23 @@ def to_wandb_image(
     cat_to_sub_cat: Optional[Mapping[ObjectTypes, ObjectTypes]] = None,
 ) -> tuple[str, Wbimage]:
     """
-    Converting a deepdoctection image into a wandb image
-    :param dp: deepdoctection image
-    :param categories: dict of categories. The categories refer to categories of `ImageAnnotation`s.
-    :param sub_categories:  dict of sub categories. If provided, these categories will define the classes for the table
-    :param cat_to_sub_cat: dict of category to sub category keys. Suppose your category `foo` has a sub category defined
-                           by the key `sub_foo`. The range sub category values must then be given by `sub_categories`
-                           and to extract the sub category values one must pass `{"foo": "sub_foo"}
-    :return: a W&B image
+    Converts a deepdoctection `Image` into a `W&B` image.
+    Args:
+        dp: deepdoctection `Image`
+        categories: Dict of categories. The categories refer to categories of `ImageAnnotation`.
+        sub_categories: Dict of `sub_categories`. If provided, these categories will define the classes for the table.
+        cat_to_sub_cat: Dict of category to sub_category keys. Suppose your category `foo` has a sub-category defined
+                        by the key `sub_foo`. The range of sub-category values must then be given by `sub_categories`,
+                        and to extract the sub-category values, one must pass `{"foo": "sub_foo"}`.
+    Returns:
+        Tuple of `image_id` and a W&B image.
+    Example:
+        ```python
+        to_wandb_image(dp, categories)
+        ```
     """
     if dp.image is None:
         raise ValueError("Cannot convert to W&B image type when Image.image is None")

deepdoctection/mapper/hfstruct.py CHANGED Viewed

@@ -46,13 +46,19 @@ def image_to_hf_detr_training(
     category_names: Optional[Union[TypeOrStr, Sequence[Union[TypeOrStr]]]] = None,
 ) -> Optional[JsonDict]:
     """
-    Maps an image to a detr input datapoint dict, that, after collating can be used for training.
+    Maps an `image` to a detr input datapoint `dict`, that, after collating, can be used for training.
-    :param dp: Image
-    :param add_mask: True is not implemented (yet).
-    :param category_names: A list of category names for training a model. Pass nothing to train with all annotations
-    :return: Dict with 'image', 'width', 'height', 'image_id', 'annotations' where 'annotations' is a list of dict
-             with 'boxes' and 'class_labels'.
+    Args:
+        dp: `Image`
+        add_mask: `True` is not implemented (yet).
+        category_names: A list of `category_name`s for training a model. Pass nothing to train with all annotations.
+    Returns:
+        Dict with `image`, `width`, `height`, `image_id`, `annotations` where `annotations` is a list of dicts with
+        `boxes` and `class_labels`.
+    Note:
+        If `add_mask` is True, segmentation in `deepdoctection` is not supported.
     """
     if not os.path.isfile(dp.location) and dp.image is None:
@@ -96,12 +102,13 @@ def image_to_hf_detr_training(
 @dataclass
 class DetrDataCollator:
     """
-    Data collator that will prepare a list of raw features to a BatchFeature that can be used
-    to train a Detr or Tabletransformer model.
+    Data collator that will prepare a list of raw features to a `BatchFeature` that can be used to train a Detr or
+    Tabletransformer model.
-    :param feature_extractor:  DetrFeatureExtractor
-    :param padder: An optional PadTransform instance
-    :param return_tensors: "pt" or None
+    Args:
+        feature_extractor: `DetrFeatureExtractor`
+        padder: An optional `PadTransform` instance.
+        return_tensors: "pt" or None.
     """
     feature_extractor: DetrFeatureExtractor  # TODO: Replace deprecated DetrFeatureExtractor with DetrImageProcessor
@@ -110,12 +117,15 @@ class DetrDataCollator:
     def __call__(self, raw_features: list[JsonDict]) -> BatchFeature:
         """
-        Creating BatchFeature from a list of dict of raw features.
+        Creating `BatchFeature` from a list of dict of raw features.
-        :param raw_features: A list of dict with keys: 'image' or 'file_name', "width', "height' and 'annotations'.
-                             'annotations' mus be a list of dict as well, where each dict element must contain
-                             annotation information following COCO standard.
-        :return: BatchFeature
+        Args:
+            raw_features: A list of dicts with keys: `image` or `file_name`, `width`, `height`, and `annotations`.
+                          `annotations` must be a list of dicts as well, where each dict element must contain
+                           annotation information following `COCO` standard.
+        Returns:
+            `BatchFeature`
         """
         images_input = []
@@ -136,10 +146,13 @@ class DetrDataCollator:
     def maybe_pad_image_and_transform(self, feature: JsonDict) -> JsonDict:
         """
-        Pads an 'image' and transforming bounding boxes from annotations.
+        Pads an `image` and transforms bounding boxes from `annotations`.
+        Args:
+            feature: A dict of `raw_features`.
-        :param feature: A dict of raw_features
-        :return: Same as input
+        Returns:
+            Same as input.
         """
         if self.padder is None:
             return feature

deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl