deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +919 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +162 -108
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +205 -119
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +26 -17
- deepdoctection/utils/env_info.py +86 -37
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -71
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.1.dist-info/METADATA +376 -0
- deepdoctection-0.43.1.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
deepdoctection/datasets/__init__.py CHANGED

@@ -16,14 +16,10 @@
 # limitations under the License.

 """
-
+# Dataset concept: Building, training and evaluating datasets

-<https://huggingface.co/docs/datasets/>
+Simple framework inspired by <https://huggingface.co/docs/datasets/> for creating datasets.

-for creating datasets.
-
-Create an info card, a DataFlowBaseBuilder derived instance, possibly a category card and a
-DatasetBase derived instance to create a data set.
 """

 from .adapter import *
deepdoctection/datasets/adapter.py CHANGED

@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-
+Wrapping datasets into a PyTorch dataset framework
 """


@@ -46,9 +46,8 @@ class DatasetAdapter(IterableDataset): # type: ignore
 pytorch frameworks (e.g. Detectron2). It wraps the dataset and defines the compulsory
 `__iter__` using `dataflow.build` .

-DatasetAdapter is meant for training and will therefore produce an infinite number of datapoints
+`DatasetAdapter` is meant for training and will therefore produce an infinite number of datapoints
 by shuffling and restart iteration once the previous dataflow is exhausted.
-
 """

 def __init__(

@@ -61,14 +60,15 @@ class DatasetAdapter(IterableDataset): # type: ignore
 **build_kwargs: str,
 ) -> None:
 """
-:
-
-
-
-
-
-
-
+Args:
+name_or_dataset: Registered name of the dataset or an instance.
+cache_dataset: If set to `True`, it will cache the dataset (without loading images). If possible,
+some statistics, e.g. number of specific labels will be printed.
+image_to_framework_func: A mapping function that converts image datapoints into the framework format
+use_token_tag: Will only be used for dataset_type="token_classification". If `use_token_tag=True`, will use
+labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
+`WordType.token_class`.
+build_kwargs: optional parameters for defining the dataflow.
 """
 if number_repetitions == -1 and not cache_dataset:
 raise ValueError(
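The new `Args:` block above documents the `DatasetAdapter` constructor. A minimal usage sketch follows; the dataset name `publaynet`, the toy mapping function and the string-typed build kwargs are illustrative assumptions, not taken from this diff:

```python
# Hedged sketch, not library code: only the parameter names come from the docstring above.
from deepdoctection.datasets.adapter import DatasetAdapter

def to_my_framework(dp):
    # Hypothetical mapper: convert a deepdoctection `Image` datapoint into whatever
    # structure the training framework expects.
    return {"file_name": dp.file_name, "image_id": dp.image_id}

adapter = DatasetAdapter(
    name_or_dataset="publaynet",          # registered dataset name or a DatasetBase instance
    cache_dataset=True,                   # cache datapoints (without images) and print label statistics
    image_to_framework_func=to_my_framework,
    number_repetitions=-1,                # restart the exhausted dataflow, i.e. iterate indefinitely
    split="train",                        # forwarded to `dataflow.build(...)` via build_kwargs
    max_datapoints="500",
)

for datapoint in adapter:                 # an IterableDataset, so it also plugs into a torch DataLoader
    break
```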
deepdoctection/datasets/base.py CHANGED

@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-
+DatasetBase, MergeDatasets and CustomDataset
 """
 from __future__ import annotations


@@ -42,9 +42,9 @@ from .info import DatasetCategories, DatasetInfo, get_merged_categories

 class DatasetBase(ABC):
 """
-Base class for a dataset. Requires to
-yourself. These methods must return a DatasetCategories
-together give a complete description of the dataset. Compare some specific dataset cards in the
+Base class for a dataset. Requires to implement `_categories`, `_info` and `_builder` by
+yourself. These methods must return a `DatasetCategories`, a `DatasetInfo` and a `DataFlow_Builder` instance, which
+together give a complete description of the dataset. Compare some specific dataset cards in the `instance`.
 """

 def __init__(self) -> None:
@@ -65,21 +65,21 @@ class DatasetBase(ABC):
 @property
 def dataset_info(self) -> DatasetInfo:
 """
-dataset_info
+`dataset_info`
 """
 return self._dataset_info

 @property
 def dataflow(self) -> DataFlowBaseBuilder:
 """
-dataflow
+`dataflow`
 """
 return self._dataflow_builder

 @abstractmethod
 def _categories(self) -> DatasetCategories:
 """
-Construct the DatasetCategory object.
+Construct the `DatasetCategory` object.
 """

 raise NotImplementedError()

@@ -88,7 +88,7 @@ class DatasetBase(ABC):
 @abstractmethod
 def _info(cls) -> DatasetInfo:
 """
-Construct the DatasetInfo object.
+Construct the `DatasetInfo` object.
 """

 raise NotImplementedError()

@@ -96,7 +96,7 @@ class DatasetBase(ABC):
 @abstractmethod
 def _builder(self) -> DataFlowBaseBuilder:
 """
-Construct the DataFlowBaseBuilder object. It needs to be implemented in the derived class.
+Construct the `DataFlowBaseBuilder` object. It needs to be implemented in the derived class.
 """

 raise NotImplementedError()

@@ -113,7 +113,7 @@ class DatasetBase(ABC):
 @staticmethod
 def is_built_in() -> bool:
 """
-Returns flag to indicate if dataset is custom or built
+Returns flag to indicate if dataset is custom or built-in.
 """
 return False

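The hunks above only add backticks, but they touch the two public accessors of `DatasetBase`. As a reminder of how they are typically consumed (the registered dataset name and the chosen build arguments are assumptions for illustration):

```python
import deepdoctection as dd

publaynet = dd.get_dataset("publaynet")   # a registered, built-in DatasetBase instance

info = publaynet.dataset_info             # DatasetInfo via the `dataset_info` property
builder = publaynet.dataflow              # DataFlowBaseBuilder via the `dataflow` property

df = builder.build(split="train", max_datapoints=100)
df.reset_state()                          # dataflows have to be reset before iterating
for dp in df:                             # dp is a deepdoctection `Image` datapoint
    break
```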
@@ -140,9 +140,10 @@ class SplitDataFlow(DataFlowBaseBuilder):

 def __init__(self, train: list[Image], val: list[Image], test: Optional[list[Image]]):
 """
-:
-
-
+Args:
+train: Cached `train` split
+val: Cached `val` split
+test: Cached `test` split
 """
 super().__init__(location="")
 self.split_cache: dict[str, list[Image]]

@@ -154,8 +155,12 @@ class SplitDataFlow(DataFlowBaseBuilder):
 def build(self, **kwargs: Union[str, int]) -> DataFlow:
 """
 Dataflow builder for merged split datasets
-
-:
+
+Args:
+kwargs: Only split and max_datapoints arguments will be considered.
+
+Returns:
+Dataflow
 """

 split = kwargs.get("split", "train")
@@ -175,44 +180,49 @@ class MergeDataset(DatasetBase):
 guarantee flexibility it is possible to pass customized dataflows explicitly to maybe reduce the dataflow size from
 one dataset or to use different splits from different datasets.

-
-
-
+Note:
+When yielding datapoints from `build` dataflows, note that one dataset will pass all its samples successively
+which might reduce randomness for training. Buffering all datasets (without loading heavy components like
+images) is therefore possible and the merged dataset can be shuffled.

-
-
+When the datasets that are buffered are split functionality one can divide the buffered samples into an `train`,
+`val` and `test` set.

 While the selection of categories is given by the union of all categories of all datasets, sub categories need to
 be handled with care: Only sub categories for one specific category are available provided that every dataset has
 this sub category available for this specific category. The range of sub category values again is defined as the
 range of all values from all datasets.

-
-
-dataset_1 = get_dataset("dataset_1")
-dataset_2 = get_dataset("dataset_2")
+Example:

-
-
-
+```python
+dataset_1 = get_dataset("dataset_1")
+dataset_2 = get_dataset("dataset_2")

+union_dataset = MergeDataset(dataset_1,dataset_2)
+union_dataset.buffer_datasets(split="train") # will cache the train split of dataset_1 and dataset_2
+merge.split_datasets(ratio=0.1, add_test=False) # will create a new split of the union.
+```

-
+Example:

-
-
+```python
+dataset_1 = get_dataset("dataset_1")
+dataset_2 = get_dataset("dataset_2")

-
-
+df_1 = dataset_1.dataflow.build(max_datapoints=20) # handle separate dataflow configs ...
+df_2 = dataset_1.dataflow.build(max_datapoints=30)

-
-
-
+union_dataset = MergeDataset(dataset_1,dataset_2)
+union_dataset.explicit_dataflows(df_1,df_2) # ... and pass them explicitly. Filtering is another
+# possibility
+```
 """

 def __init__(self, *datasets: DatasetBase):
 """
-:
+Args:
+datasets: An arbitrary number of datasets
 """
 self.datasets = datasets
 self.dataflows: Optional[tuple[DataFlow, ...]] = None
@@ -244,12 +254,17 @@ class MergeDataset(DatasetBase):
 def build(self, **kwargs: Union[str, int]) -> DataFlow:
 """
 Building the dataflow of merged datasets. No argument will affect the stream if the dataflows have
-been explicitly passed. Otherwise, all kwargs will be passed to all dataflows.
-
-
+been explicitly passed. Otherwise, all kwargs will be passed to all dataflows.
+
+Note:
+Note that each dataflow will iterate until it is exhausted. To guarantee randomness across
+different datasets cache all datapoints and shuffle them afterwards (e.g. use `buffer_dataset()`).
+
+Args:
+kwargs: arguments for `build()`

-:
-
+Return:
+`Dataflow`
 """
 df_list = []
 if self.dataflows is not None:

@@ -272,7 +287,8 @@ class MergeDataset(DatasetBase):
 Pass explicit dataflows for each dataset. Using several dataflow configurations for one dataset is possible as
 well. However, the number of dataflow must exceed the number of merged datasets.

-:
+Args:
+dataflows args: An arbitrary number of dataflows
 """
 self.dataflows = dataflows
 if len(self.datasets) > len(self.dataflows):

@@ -286,19 +302,23 @@ class MergeDataset(DatasetBase):
 """
 Buffer datasets with given configs. If dataflows are passed explicitly it will cache their streamed output.

-:
-
+Args:
+kwargs: arguments for `build()`
+
+Returns:
+Dataflow
 """
 df = self.dataflow.build(**kwargs)
 self.datapoint_list = CacheData(df, shuffle=True).get_cache()

 def split_datasets(self, ratio: float = 0.1, add_test: bool = True) -> None:
 """
-Split cached datasets into train
+Split cached datasets into `train`/`val`(/`test`).

-:
-
-
+Args:
+ratio: 1-ratio will be assigned to the train split. The remaining bit will be assigned to val and test
+split.
+add_test: Add a test split
 """
 assert self.datapoint_list is not None, "Datasets need to be buffered before splitting"
 number_datapoints = len(self.datapoint_list)

@@ -332,7 +352,8 @@ class MergeDataset(DatasetBase):
 To reproduce a dataset split at a later stage, get a summary of the by having a dict of list with split and
 the image ids contained in the split.

-:
+Returns:
+A dict with keys `train`, `val` and `test`: `{"train": ['ab','ac'],"val":['bc','bd']}`
 """
 if isinstance(self._dataflow_builder, SplitDataFlow):
 return {

@@ -345,8 +366,11 @@ class MergeDataset(DatasetBase):
 self, split_dict: Mapping[str, Sequence[str]], **dataflow_build_kwargs: Union[str, int]
 ) -> None:
 """
-Reproducing a dataset split from a dataset or a dataflow by a dict of list of
+Reproducing a dataset split from a dataset or a dataflow by a dict of list of `image_id`s.
+
+Example:

+```python
 merge = dd.MergeDataset(doclaynet)
 merge.explicit_dataflows(df_doc)
 merge.buffer_datasets()

@@ -357,8 +381,10 @@ class MergeDataset(DatasetBase):
 df_doc_2 = doclaynet.dataflow.build(split="train", max_datapoints=4000)
 merge_2.explicit_dataflows(df_doc_2)
 merge_2.create_split_by_id(out) # merge_2 now has the same split as merge
+```

-:
+Args:
+split_dict: e.g. `{"train":['ab','ac',...],"val":['bc'],"test":[]}`
 """

 if set(split_dict.keys()) != {"train", "val", "test"}:
@@ -399,33 +425,41 @@ class CustomDataset(DatasetBase):
 description: Optional[str] = None,
 ):
 """
-
+
+Args:
+name: Name of the dataset. It will not be used in the code, however it might be helpful, if several
 custom datasets are in use.
-
+dataset_type: Datasets need to be characterized by one of the `enum` members `DatasetType` that describe
 the machine learning task the dataset is built for. You can get all registered types with

-
-
+```python
+types = dd.object_types_registry.get("DatasetType")
+print({t for t in types})
+```

-
+location: Datasets should be stored a sub folder of name `location` in the local cache
 `get_dataset_dir_path()`. There are good reasons to use `name`.
-
-
-
-
-
-
-
+init_categories: A list of all available categories in this dataset. You must use a list as the order
+of the categories must always be preserved: they determine the category id that in turn
+will be used for model training.
+dataflow_builder: A subclass of `DataFlowBaseBuilder`. Do not instantiate the class by yourself.
+init_sub_categories: A dict mapping main categories to sub categories, if there are any available.
+Suppose an object `LayoutType.cell` has two additional information in the annotation
+file: `CellType.header, CellType.body`. You can then write:

-
+```python
+{LayoutType.cell: {CellType.header: [CellType.header, CellType.body]}
+```

-
-
-
-
+This setting assumes that later in the mapping the `ImageAnnotation` with
+`category_name=LayoutType.cell` will have a sub category of key `CellType.header`
+and one of the two values `CellType.header, CellType.body`.
+annotation_files: A mapping to one or more annotation files, e.g.

-
-
+```python
+annotation_file = {"train": "train_file.json", "test": "test_file.json"}
+```
+description: A description of the dataset.
 """

 self.name = name
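The expanded `Args:` section above spells out the whole `CustomDataset` constructor. A hedged construction sketch: the category members are spelled as in the docstring above (adjust them to your installed version), `MyDataflowBuilder` is the hypothetical builder sketched after the `dataflow_builder.py` hunks further below, and the file names are placeholders:

```python
import deepdoctection as dd

# Discover the registered dataset types first (snippet from the docstring above).
types = dd.object_types_registry.get("DatasetType")
print({t for t in types})

my_dataset = dd.CustomDataset(
    name="my_tables",
    dataset_type=dd.DatasetType.object_detection,  # one of the members printed above; spelling is an assumption
    location="my_tables",                          # sub folder of the local dataset cache
    init_categories=[dd.LayoutType.table, dd.LayoutType.cell],  # order fixes the category ids
    dataflow_builder=MyDataflowBuilder,            # pass the class, not an instance
    annotation_files={"train": "train_file.json", "test": "test_file.json"},
)

df = my_dataset.dataflow.build(split="train")
```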
@@ -467,13 +501,16 @@ class CustomDataset(DatasetBase):
 """
 This static method creates a CustomDataset instance from a dataset card.

-A dataset card is a JSON file that contains metadata about the dataset such as its name
-initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is
-that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.
+A dataset card is a `JSON` file that contains metadata about the dataset such as its `name`, `dataset_type`,
+`location`, initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is
+a class that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.

-:
-
-
+Args:
+file_path: The path to the dataset card (`JSON` file).
+dataflow_builder: The class used to build the dataflow for the dataset.
+
+Returns:
+A CustomDataset instance created from the dataset card.
 """

 with open(file_path, "r", encoding="UTF-8") as file:

@@ -496,9 +533,8 @@ class CustomDataset(DatasetBase):

 def as_dict(self) -> Mapping[str, Any]:
 """
-Return
-
-:return: A dictionary containing the meta-data of the dataset.
+Return:
+The meta-data of the dataset as a dictionary.
 """
 return {
 "name": self.name,

@@ -519,9 +555,10 @@ class CustomDataset(DatasetBase):

 def save_dataset_card(self, file_path: str) -> None:
 """
-Save the dataset card to a JSON file.
+Save the dataset card to a `JSON` file.

-:
+Args:
+file_path: file_path
 """
 with open(file_path, "w", encoding="UTF-8") as file:
 json.dump(self.as_dict(), file, indent=4)
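Taken together, `save_dataset_card` and `from_dataset_card` give a serialize/restore round trip for such a dataset definition. A short sketch, continuing the placeholder names from the `CustomDataset` example above:

```python
# Persist the dataset definition (writes `as_dict()` as JSON) ...
my_dataset.save_dataset_card("my_tables_card.json")

# ... and rebuild it later. The dataflow builder class is not serialized,
# so it has to be supplied again.
restored = dd.CustomDataset.from_dataset_card(
    file_path="my_tables_card.json",
    dataflow_builder=MyDataflowBuilder,
)
```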
deepdoctection/datasets/dataflow_builder.py CHANGED

@@ -16,7 +16,7 @@
 # limitations under the License.

 """
-Module for DataFlowBaseBuilder class.
+Module for `DataFlowBaseBuilder` class.
 """

 from abc import ABC, abstractmethod

@@ -48,8 +48,9 @@ class DataFlowBaseBuilder(ABC):
 annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
 ):
 """
-:
-
+Args:
+location: Relative path of the physical dataset.
+annotation_files: Dict of annotation files e.g. depending on the split.
 """
 self.location = location
 if annotation_files is None:

@@ -61,7 +62,7 @@ class DataFlowBaseBuilder(ABC):
 @property
 def categories(self) -> DatasetCategories:
 """
-categories
+`categories`
 """
 if self._categories is not None:
 return self._categories

@@ -70,27 +71,28 @@ class DataFlowBaseBuilder(ABC):
 @categories.setter
 def categories(self, categories: DatasetCategories) -> None:
 """
-categories setter
+`categories` setter
 """
 self._categories = categories

 def get_split(self, key: str) -> str:
 """
-
+Args:
+key: split value
 """
 return self._splits[key]

 @property
 def splits(self) -> Mapping[str, str]:
 """
-splits
+`splits`
 """
 return self._splits

 @splits.setter
 def splits(self, splits: Mapping[str, str]) -> None:
 """
-
+`splits` setter
 """
 self._splits = splits


@@ -98,7 +100,8 @@ class DataFlowBaseBuilder(ABC):
 """
 Get the absolute path to the locally physically stored dataset.

-:
+Returns:
+local workdir
 """
 return Path(get_dataset_dir_path()) / self.location


@@ -107,13 +110,16 @@ class DataFlowBaseBuilder(ABC):
 """
 Consult the docstring w.r.t `DataFlowBaseBuilder`.

-:
-
+Args:
+kwargs: A custom set of arguments/values
+
+Returns:
+dataflow
 """
 raise NotImplementedError()

 def get_annotation_file(self, split: str) -> str:
-"""Get single annotation file
+"""Get single annotation file"""
 split_file = self.annotation_files[split]
 if isinstance(split_file, str):
 return split_file