deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of deepdoctection has been flagged as potentially problematic.
Files changed (124)
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +919 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +162 -108
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +205 -119
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +26 -17
  104. deepdoctection/utils/env_info.py +86 -37
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -71
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.1.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.1.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/info.py

@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-Module for storing dataset info (e.g. general meta data or categories)
+General meta-data or categories
 """

 from copy import copy

@@ -53,10 +53,13 @@ def _get_dict(
     """
     Converts a list into a dict, where keys/values are the list indices.

-    :param l: A list of categories
-    :param name_as_key: Whether to return the dict with category names as key (True)
-    :param starts_with: index count start
-    :return: A dictionary of list indices/list elements.
+    Args:
+        l: A list of categories
+        name_as_key: Whether to return the dict with category names as key (`True`)
+        starts_with: index count start
+
+    Returns:
+        A dictionary of list indices/list elements.
     """
     if name_as_key:
         return {v: k for k, v in enumerate(l, starts_with)}
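The hunk also shows the `name_as_key=True` branch of the helper. A standalone paraphrase of the mapping it documents (not the library's private function; names are illustrative):

```python
# Paraphrase of the helper documented above: with name_as_key=True the category
# names become keys, otherwise the running indices do.
def index_categories(categories, name_as_key=True, starts_with=1):
    if name_as_key:
        return {name: idx for idx, name in enumerate(categories, starts_with)}
    return dict(enumerate(categories, starts_with))

assert index_categories(["text", "title"]) == {"text": 1, "title": 2}
assert index_categories(["text", "title"], name_as_key=False) == {1: "text", 2: "title"}
```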
@@ -66,22 +69,17 @@ def _get_dict(
 @dataclass
 class DatasetInfo:
     """
-    DatasetInfo is a simple dataclass that stores some meta-data information about a dataset.
-
-    `name`: Name of the dataset. Using the name you can retrieve the dataset from the
-    `registry.DatasetRegistry`.
-
-    `description`: Short description of the dataset.
-
-    `license`: License to the dataset.
-
-    `url`: url, where the dataset can be downloaded from.
-
-    `splits`: A dict of splits. The value must store the relative path, where the split can be found.
-
-    `type`: The type describes whether this is a dataset for object detection (pass 'OBJECT_DETECTION'),
-    sequence classification (pass 'SEQUENCE_CLASSIFICATION') or token classification ('TOKEN_CLASSIFICATION').
-    Optionally, pass `None`.
+    `DatasetInfo` is a simple dataclass that stores some meta-data information about a dataset.
+
+    Attributes:
+        name: Name of the dataset. Using the name you can retrieve the dataset from the `registry.DatasetRegistry`.
+        description: Short description of the dataset.
+        license: License to the dataset.
+        url: url, where the dataset can be downloaded from.
+        splits: A `dict` of splits. The value must store the relative path, where the split can be found.
+        type: The type describes whether this is a dataset for object detection (pass 'OBJECT_DETECTION'),
+            sequence classification (pass 'SEQUENCE_CLASSIFICATION') or token classification ('TOKEN_CLASSIFICATION').
+            Optionally, pass `None`.
     """

     name: str

@@ -96,8 +94,11 @@ class DatasetInfo:
         """
         Get the split directory by its key (if it exists).

-        :param key: The key to a split (i.e. "train", "val", "test")
-        :return: The local directory path to the split. An empty string if the key doesn't exist.
+        Args:
+            key: The key to a split (i.e. `train`, `val`, `test`)
+
+        Returns:
+            The local directory path to the split. An empty string if the key doesn't exist.
         """

         return self.splits[key]
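The two hunks above describe the `DatasetInfo` attributes and its split lookup. A minimal sketch, assuming the split lookup is exposed as `get_split` (the method name is not shown in the hunk) and that field defaults match the docstring; the dataset name, URL and split paths are placeholders:

```python
from deepdoctection.datasets.info import DatasetInfo

# Hypothetical meta-data for an in-house dataset.
info = DatasetInfo(
    name="my_dataset",
    description="A small in-house layout dataset",
    license="CC BY 4.0",
    url="https://example.com/my_dataset.zip",
    splits={"train": "train", "val": "val"},
    type="OBJECT_DETECTION",
)

print(info.get_split("train"))  # per the docstring: the split path, or "" for unknown keys
```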
@@ -112,22 +113,26 @@ class DatasetCategories:
     for the index/category name relationship and guarantees that a sequence of natural numbers for the categories
     is always returned as the category-id even after replacing and/or filtering.

-    `init_categories`: A list of category names. The list must include all categories that can occur within the
-    annotations.
-
-    `init_sub_categories`: A dict of categories/sub-categories. Each sub-category that can appear in the
-    annotations in combination with a category must be listed.
+    Attributes:
+        init_categories: A list of `category_name`s. The list must include all categories that can occur within the
+            annotations.
+        init_sub_categories: A dict of categories/sub-categories. Each sub-category that can appear in the
+            annotations in combination with a category must be listed.

-    **Example:**
+    Example:

         An annotation file hast the category/sub-category combinations for three datapoints:

-        (cat1,s1),(cat1,s2), (cat2,s2).
+        ```python
+        (cat1,s1),(cat1,s2), (cat2,s2).
+        ```

         You must list `init_categories`, `init_sub_categories` as follows:

-        init_categories = [cat1,cat2]
-        init_sub_categories = {cat1: [s1,s2],cat2: [s2]}
+        ```python
+        init_categories = [cat1,cat2]
+        init_sub_categories = {cat1: [s1,s2],cat2: [s2]}
+        ```

     Use `filter_categories` or `set_cat_to_sub_cat` to filter or swap categories with sub-categories.
     """

@@ -173,14 +178,17 @@ class DatasetCategories:
         categories of replaced categories with sub categories. However, you must correctly pass arguments to return the
         state you want.

-        :param as_dict: Will pass a dict if set to 'True' otherwise a list.
-        :param name_as_key: Categories are stored as key/value pair in a dict with integers as keys. name_as_key set to
-        "False" will swap keys and values.
-        :param init: If set to "True" it will return the list/dict of categories as initially provided. Manipulations
-        due to replacing/filtering will not be regarded.
-        :param filtered: If set to "True" will return an unfiltered list of all categories. If a replacing has been
-        invoked selected sub categories will be returned.
-        :return: A dict of index/category names (or the other way around) or a list of category names.
+        Args:
+            as_dict: Will pass a dict if set to 'True' otherwise a list.
+            name_as_key: Categories are stored as key/value pair in a dict with integers as keys. `name_as_key` set to
+                `False` will swap keys and values.
+            init: If set to `True` it will return the list/dict of categories as initially provided. Manipulations
+                due to replacing/filtering will not be regarded.
+            filtered: If set to `True` will return an unfiltered list of all categories. If a replacing has been
+                invoked selected sub categories will be returned.
+
+        Returns:
+            A dict of index/category names (or the other way around) or a list of category names.
         """
         if init:
             if as_dict:
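The `Args` block above describes how `as_dict`, `name_as_key`, `init` and `filtered` interact. A small sketch against a registered dataset, assuming the method shown here is `DatasetCategories.get_categories` and that the builder exposes its categories as `dataflow.categories` (inspecting categories needs no dataset download):

```python
from deepdoctection.datasets import get_dataset

publaynet = get_dataset("publaynet")      # registered name, see the file list above
cats = publaynet.dataflow.categories      # a DatasetCategories instance

print(cats.get_categories(as_dict=True, name_as_key=False))  # {1: <category>, 2: ...}
print(cats.get_categories(as_dict=True, name_as_key=True))   # {<category>: 1, ...}
print(cats.get_categories(as_dict=False))                    # plain list of category names
```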
@@ -209,14 +217,17 @@ class DatasetCategories:
         """
         Returns a dict of list with a category name and their sub categories.

-        :param categories: A single category or list of category names
-        :param sub_categories: A mapping of categories to sub category keys on which the result should be filtered. Only
+        Args:
+            categories: A single category or list of category names
+            sub_categories: A mapping of categories to sub category keys on which the result should be filtered. Only
                 relevant, if `keys=False`
-        :param keys: Will only pass keys if set to `True`.
-        :param values_as_dict: Will generate a dict with indices and sub category value names if set to `True`.
-        :param name_as_key: sub category values are stored as key/value pair in a dict with integers as keys.
+            keys: Will only pass keys if set to `True`.
+            values_as_dict: Will generate a dict with indices and sub category value names if set to `True`.
+            name_as_key: sub category values are stored as key/value pair in a dict with integers as keys.
                 name_as_key set to `False` will swap keys and values.
-        :return: Dict with all selected categories.
+
+        Returns:
+            Dict with all selected categories.
         """
         _categories: Sequence[ObjectTypes]
         if isinstance(categories, (ObjectTypes, str)):

@@ -293,14 +304,16 @@ class DatasetCategories:
         This method can only be called once per object. Re-setting or further replacing of categories would make the
         code messy and is therefore not allowed.

-        **Example:**
-
-            cat_to_sub_cat={cat1: sub_cat1}
+        Example:
+            ```python
+            cat_to_sub_cat={cat1: sub_cat1}
+            ```

         will replace cat1 with sub_cat1 as category. This will also be respected when returning datapoints.

-        :param cat_to_sub_cat: A dict of pairs of category/sub-category. Note that the combination must be available
-        according to the initial settings.
+        Args:
+            cat_to_sub_cat: A dict of pairs of category/sub-category. Note that the combination must be available
+                according to the initial settings.
         """

         _cat_to_sub_cat = {get_type(key): get_type(value) for key, value in cat_to_sub_cat.items()}
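A sketch of the swap described above. The dataset and the category/sub-category pair are hypothetical; the pair must be one of the combinations declared in the dataset's `init_sub_categories`:

```python
from deepdoctection.datasets import get_dataset

dataset = get_dataset("pubtabnet")
cats = dataset.dataflow.categories

# Hypothetical pair: replace a category by one of its registered sub categories.
# Check the dataset's init_sub_categories for the combinations it actually supports.
cats.set_cat_to_sub_cat({"cell": "head"})

# From now on, get_categories() and streamed datapoints report the sub category.
print(cats.get_categories(as_dict=True))
```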
@@ -327,7 +340,8 @@ class DatasetCategories:
         Filter categories of a dataset. This will keep all the categories chosen and remove all others.
         This method can only be called once per object.

-        :param categories: A single category name or a list of category names.
+        Args:
+            categories: A single `category_name` or a list of `category_name`s.
         """

         if not self._allow_update:

@@ -344,13 +358,14 @@ class DatasetCategories:
     @property
     def cat_to_sub_cat(self) -> Optional[Mapping[ObjectTypes, ObjectTypes]]:
         """
-        cat_to_sub_cat
+        `cat_to_sub_cat`
         """
         return self._cat_to_sub_cat

     def is_cat_to_sub_cat(self) -> bool:
         """
-        returns `True` if a category is replaced with sub categories
+        Returns:
+            `True` if a category is replaced with sub categories
         """
         if self._cat_to_sub_cat is not None:
             return True

@@ -358,7 +373,8 @@ class DatasetCategories:

     def is_filtered(self) -> bool:
         """
-        return `True` if categories are filtered
+        Returns:
+            `True` if categories are filtered
         """
         if hasattr(self, "_categories_filter_update"):
             return True

@@ -379,8 +395,11 @@ def get_merged_categories(*categories: DatasetCategories) -> DatasetCategories:
     as well but no sub category than the merged dataset will have no sub categories at all. Whereas in a similar setting
     dataset B has sub category `foo`:`bak`, then `bak` will be an optional sub category for the merged dataset as well.

-    :param categories: A tuple/list of dataset categories
-    :return: An instance of `DatasetCategories` to be used as `DatasetCategories` for merged datasets
+    Args:
+        categories: A tuple/list of dataset categories
+
+    Returns:
+        An instance of `DatasetCategories` to be used as `DatasetCategories` for merged datasets
     """

     # working with lists is not possible as the order of categories is important here
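A short sketch of merging the category setups of two registered datasets, assuming `get_merged_categories` keeps its location in `deepdoctection.datasets.info`:

```python
from deepdoctection.datasets import get_dataset
from deepdoctection.datasets.info import get_merged_categories

publaynet = get_dataset("publaynet")
doclaynet = get_dataset("doclaynet")

# Merge the category objects; per the docstring, a sub category survives only if
# every dataset in the merge provides it.
merged = get_merged_categories(
    publaynet.dataflow.categories,
    doclaynet.dataflow.categories,
)
print(merged.get_categories(as_dict=True))
```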
deepdoctection/datasets/instances/__init__.py

@@ -16,7 +16,9 @@
 # limitations under the License.

 """
-Init file for instances package. Place all datasets in a directory using the structure
+Dataset samples for pre-training and fine-tuning models
+
+Place all datasets in a **deep**doctection's cache

 deepdoctection
 ├── datasets

@@ -24,9 +26,11 @@ Init file for instances package. Place all datasets in a directory using the str
 │ ├── dataset_2
 │ ├── dataset_3

-If not sure use
+If not sure:

+```python
 print(dataset_instance.dataflow.get_workdir())
+```
 """

 from .doclaynet import *
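A complete version of the snippet in the docstring above, using the `get_dataset` registry helper to print where a built-in dataset expects its files:

```python
from deepdoctection.datasets import get_dataset

publaynet = get_dataset("publaynet")
# Prints the directory inside the deepdoctection cache where the files must be placed.
print(publaynet.dataflow.get_workdir())
```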
deepdoctection/datasets/instances/doclaynet.py

@@ -25,6 +25,7 @@ Module for DocLayNet dataset. Place the dataset as follows
 ├── PNG
 │ ├── 0a0d43e301facee9e99cc33b9b16e732dd207135f4027e75f6aea2bf117535a2.png
 """
+
 from __future__ import annotations

 import os

@@ -101,7 +102,7 @@ _SUB_CATEGORIES: Mapping[ObjectTypes, Mapping[ObjectTypes, Sequence[ObjectTypes]
 @dataset_registry.register("doclaynet")
 class DocLayNet(DatasetBase):
     """
-    DocLayNetSeq
+    `DocLayNet`
     """

     @classmethod

@@ -125,7 +126,7 @@ class DocLayNet(DatasetBase):

 class DocLayNetBuilder(DataFlowBaseBuilder):
     """
-    DocLayNetBuilder dataflow builder
+    `DocLayNetBuilder` dataflow builder
     """

     def build(self, **kwargs: Union[str, int]) -> DataFlow:

@@ -133,15 +134,14 @@ class DocLayNetBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: None
-
-        `load_image:` Will load the image for each datapoint. Default: False
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
+        Args:
+            kwargs: (split) Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions

-        :return: dataflow
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "val"))
         max_datapoints = kwargs.get("max_datapoints")
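A minimal sketch of streaming a few datapoints from the builder documented above, assuming the DocLayNet files are already placed in the cache directory reported by `get_workdir()`:

```python
from deepdoctection.datasets import get_dataset

doclaynet = get_dataset("doclaynet")
df = doclaynet.dataflow.build(split="val", max_datapoints=10, load_image=True)

df.reset_state()                 # dataflows must be reset before iteration
for dp in df:
    # dp is an Image datapoint carrying the layout annotations
    print(dp.file_name, len(dp.get_annotation()))
```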
@@ -233,11 +233,14 @@ class DocLayNetSeqBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-        `load_image:` Will load the image for each datapoint. Default: `False`
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`

-        :return: dataflow
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "val"))
         max_datapoints = kwargs.get("max_datapoints")
deepdoctection/datasets/instances/fintabnet.py

@@ -157,28 +157,22 @@ class FintabnetBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
-
-        `build_mode:` Returns the full image or crops a table according to the table bounding box. Pass `table`
-        if you only want the cropped table. Default: ""
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `rows_and_cols:` Will add a 'item' image annotations that either represent a row or a column of a table.
-            Note, that the type of the item (i.e. being a row or a column) can be inferred from the
-            sub category added. Note further, that "item" are not originally part of the annotations
-            and are inferred from cell positions and their associated table semantic. Default: `True`
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `use_multi_proc:` As the original files are stored as pdf conversion into a numpy array is time-consuming.
-            When setting use_multi_proc to True is will use several processes depending on the number
-            of CPUs available.
-
-        `use_multi_proc_strict:` Will use strict mode in multiprocessing.
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (build_mode) Returns the full image or crops a table according to the table bounding box. Pass `table`
+                    if you only want the cropped table. Default: `""`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (rows_and_cols) Will add 'item' image annotations that either represent a row or a column of a table.
+                    Default: `True`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (use_multi_proc) Uses multiple processes for PDF conversion. Default: `True`
+                (use_multi_proc_strict) Uses strict mode in multiprocessing. Default: `False`
+                (fake_score) Adds a fake score so that annotations look like predictions. Default: `False`
+                (pubtables_like) Treats the dataset as PubTables-like. Default: `False`
+
+        Returns:
+            Dataflow
         """

         split = str(kwargs.get("split", "val"))
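A sketch of cropping tables from Fintabnet, assuming the files are in the cache; the argument names follow the kwargs documented above:

```python
from deepdoctection.datasets import get_dataset

fintabnet = get_dataset("fintabnet")
df = fintabnet.dataflow.build(
    split="val",
    build_mode="table",     # crop each table to its bounding box
    rows_and_cols=True,     # add synthetic row/column "item" annotations
    max_datapoints=5,
    load_image=True,
)
df.reset_state()
for dp in df:
    print(dp.file_name)
```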
deepdoctection/datasets/instances/funsd.py

@@ -54,8 +54,10 @@ def load_file(path_ann: PathLikeOrStr) -> FunsdDict:
     """
     Loading json file

-    :param path_ann: path
-    :return: dict
+    Args:
+        path_ann: path
+    Returns:
+        dict
     """
     anns = load_json(path_ann)
     path, file_name = os.path.split(path_ann)

@@ -144,11 +146,14 @@ class FunsdBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. "train" and "test" is available
-        `load_image:` Will load the image for each datapoint. Default: `False`
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train` or `test`. Default: `test`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`

-        :return: Dataflow
+        Returns:
+            Dataflow
         """

         split = str(kwargs.get("split", "test"))
deepdoctection/datasets/instances/iiitar13k.py

@@ -124,15 +124,15 @@ class IIITar13KBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the return
         values of the dataflow:

-        `split:` Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`
+
+        Returns:
+            Dataflow
         """

         if not lxml_available():
deepdoctection/datasets/instances/layouttest.py

@@ -66,7 +66,7 @@ _INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutTy
 @dataset_registry.register("testlayout")
 class LayoutTest(_BuiltInDataset):
     """
-    LayoutTest
+    `LayoutTest`
     """

     _name = _NAME

@@ -99,15 +99,15 @@ class LayoutTestBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Only "test" is for this small sample available
+        Args:
+            kwargs:
+                (split) Split of the dataset. Only `test` is available for this small sample. Default: `test`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`

-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: Dataflow
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "test"))
         max_datapoints = kwargs.get("max_datapoints")
deepdoctection/datasets/instances/publaynet.py

@@ -73,7 +73,7 @@ _INIT_CATEGORIES = [LayoutType.TEXT, LayoutType.TITLE, LayoutType.LIST, LayoutTy
 @dataset_registry.register("publaynet")
 class Publaynet(_BuiltInDataset):
     """
-    Publaynet
+    `Publaynet`
     """

     _name = _NAME

@@ -107,15 +107,15 @@ class PublaynetBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`,`val` or `test`. Default: `val`
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`

-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "val"))
         max_datapoints = kwargs.get("max_datapoints")
deepdoctection/datasets/instances/pubtables1m.py

@@ -119,23 +119,23 @@ class Pubtables1MDet(_BuiltInDataset):

 class Pubtables1MBuilder(DataFlowBaseBuilder):
     """
-    Pubtables1M dataflow builder
+    `Pubtables1M` dataflow builder
     """

     def build(self, **kwargs: Union[str, int]) -> DataFlow:
         """
-        Returns a dataflow from which you can stream datapoints of images. The following arguments affect the return
-        values of the dataflow:
-
-        `split:` Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
+        of the dataflow:
+
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`
+
+        Returns:
+            Dataflow
         """

         if not lxml_available():
deepdoctection/datasets/instances/pubtabnet.py

@@ -109,7 +109,7 @@ _SUB_CATEGORIES = {
 @dataset_registry.register("pubtabnet")
 class Pubtabnet(_BuiltInDataset):
     """
-    Pubtabnet
+    `Pubtabnet`
     """

     _name = _NAME

@@ -143,20 +143,18 @@ class PubtabnetBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `rows_and_cols:` Will add a 'item' image annotations that either represent a row or a column of a table.
-            Note, that the type of the item (i.e. being a row or a column) can be inferred from the
-            sub category added. Note further, that 'ITEM' are not originally part of the annotations
-            and are inferred from cell positions and their associated table semantic. Default: `True`
-
-        `fake_score:` Will add a fake score so that annotations look like predictions
-
-        :return: dataflow
+        Args:
+            kwargs:
+                (split) Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (rows_and_cols) Will add 'item' image annotations that represent rows or columns of a
+                    table. Default: `True`
+                (fake_score) Will add a fake score so that annotations look like predictions. Default: `False`
+                (dd_pipe_like) If `True`, sets `load_image` to `True`. Default: `False`
+
+        Returns:
+            Dataflow
         """
         split = str(kwargs.get("split", "val"))
         if split == "val":
deepdoctection/datasets/instances/rvlcdip.py

@@ -123,16 +123,16 @@ class RvlcdipBuilder(DataFlowBaseBuilder):

     def build(self, **kwargs: Union[str, int]) -> DataFlow:
         """
-        Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
-        of the dataflow:
+        Returns a dataflow from which you can stream datapoints of images.

-        `split:` Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+        Args:
+            kwargs:
+                split (str): Split of the dataset. Can be `train`, `val` or `test`. Default: `val`
+                max_datapoints (int): Will stop iterating after max_datapoints. Default: `None`
+                load_image (bool): Will load the image for each datapoint. Default: `False`

-        max_datapoints: Will stop iterating after max_datapoints. Default: `None`
-
-        load_image: Will load the image for each datapoint. Default: `False`
-
-        :return: dataflow
+        Returns:
+            Dataflow
         """

         split = str(kwargs.get("split", "val"))
deepdoctection/datasets/instances/xfund.py

@@ -131,15 +131,17 @@ class XfundBuilder(DataFlowBaseBuilder):
         Returns a dataflow from which you can stream datapoints of images. The following arguments affect the returns
         of the dataflow:

-        `split:` Split of the dataset. `train` and `val` is available
-
-        `load_image:` Will load the image for each datapoint. Default: `False`
-
-        `max_datapoints:` Will stop iterating after max_datapoints. Default: `None`
-
-        `languages:` Will select only samples of selected languages. Available languages: `de`, `es`, `fr`, `it`, `ja` ,
-        `pt`, `zh`. If default will take any language.
-        :return: Dataflow
+        Args:
+            kwargs:
+                (split) Split of the dataset. `train` and `val` are available. Default: `val`
+                (load_image) Will load the image for each datapoint. Default: `False`
+                (max_datapoints) Will stop iterating after `max_datapoints`. Default: `None`
+                (languages) Will select only samples of selected languages. Available languages:
+                    `de`, `es`, `fr`, `it`, `ja`, `pt`, `zh`. If `None`, all
+                    languages are taken. Default: `None`
+
+        Returns:
+            Dataflow
         """

         split = str(kwargs.get("split", "val"))
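A sketch of restricting XFUND to a single language, assuming the files are in the cache; the `languages` value follows the codes listed in the docstring above:

```python
from deepdoctection.datasets import get_dataset

xfund = get_dataset("xfund")
# "de" is one of the documented language codes; a list of codes may also be accepted.
df = xfund.dataflow.build(split="val", load_image=True, languages="de")
df.reset_state()
for dp in df:
    print(dp.file_name)
```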