deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (124)
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +919 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +162 -108
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +205 -119
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +26 -17
  104. deepdoctection/utils/env_info.py +86 -37
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -71
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.1.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.1.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
@@ -16,7 +16,7 @@
  # limitations under the License.

  """
- Methods that convert incoming data to dataflows.
+ Classes to load data and produce dataflows
  """

  from __future__ import annotations
@@ -63,13 +63,14 @@ class FileClosingIterator:
  reading the data from it. It is used in the context of reading data from a file
  in a streaming manner, where the data is not loaded into memory all at once.

- **Example:**
-
+ Example:
+ ```python
  file = open(path, "r")
  iterator = Reader(file)
  closing_iterator = FileClosingIterator(file, iter(iterator))

- df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints) # set up a dataflow
+ df = CustomDataFromIterable(closing_iterator, max_datapoints=max_datapoints)
+ ```

  """

@@ -77,8 +78,9 @@ class FileClosingIterator:
  """
  Initializes the FileClosingIterator with a file object and its iterator.

- :param file_obj (TextIO): The file object to read data from.
- :param iterator (Iterator): The actual iterator of the file object.
+ Args:
+ file_obj: The file object to read data from.
+ iterator: The actual iterator of the file object.
  """
  self.file_obj = file_obj
  self.iterator = iterator
@@ -87,7 +89,8 @@ class FileClosingIterator:
  """
  Returns the iterator object itself.

- :return: FileClosingIterator: The instance of the class itself.
+ Returns:
+ FileClosingIterator: The instance of the class itself.
  """
  return self

@@ -96,7 +99,8 @@ class FileClosingIterator:
  Returns the next item from the file object's iterator.
  Closes the file object if the iteration is finished.

- :return: The next item from the file object's iterator.
+ Returns:
+ The next item from the file object's iterator.

  Raises:
  StopIteration: If there are no more items to return.
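The hunks above only reformat the `FileClosingIterator` docstrings, but the behaviour they describe is worth spelling out: the wrapper forwards `__next__` to the underlying iterator and closes the file object once the stream is exhausted. A minimal, self-contained sketch of that pattern (an illustration, not the library's actual implementation):

```python
import io
from typing import IO, Any, Iterator


class ClosingIterator:
    """Illustrative stand-in for FileClosingIterator: yields from `iterator`
    and closes `file_obj` as soon as the iteration raises StopIteration."""

    def __init__(self, file_obj: IO[str], iterator: Iterator[Any]) -> None:
        self.file_obj = file_obj
        self.iterator = iterator

    def __iter__(self) -> "ClosingIterator":
        return self

    def __next__(self) -> Any:
        try:
            return next(self.iterator)
        except StopIteration:
            # Close the handle exactly when the stream is exhausted, so callers
            # can consume the file lazily without leaking file handles.
            if not self.file_obj.closed:
                self.file_obj.close()
            raise


file = io.StringIO('{"a": 1}\n{"a": 2}\n')  # stands in for open(path, "r")
for line in ClosingIterator(file, iter(file)):
    print(line.strip())
assert file.closed
```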
@@ -110,23 +114,27 @@ class FileClosingIterator:

  class SerializerJsonlines:
  """
- Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of JSON objects to a .jsonl file.
+ Serialize a dataflow from a jsonlines file. Alternatively, save a dataflow of `JSON` objects to a `.jsonl` file.

- **Example:**
+ Example:
+ ```python
+ df = SerializerJsonlines.load("path/to/file.jsonl")
+ df.reset_state()

- df = SerializerJsonlines.load("path/to/file.jsonl")
- df.reset_state()
- for dp in df:
- ... # is a dict
+ for dp in df:
+ ... # is a dict
+ ```
  """

  @staticmethod
  def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromIterable:
  """
- :param path: a path to a .jsonl file.
- :param max_datapoints: Will stop the iteration once max_datapoints have been streamed
+ Args:
+ path: a path to a .jsonl file.
+ max_datapoints: Will stop the iteration once max_datapoints have been streamed

- :return: dataflow to iterate from
+ Returns:
+ Dataflow to iterate from
  """
  file = open(path, "r") # pylint: disable=W1514,R1732
  iterator = Reader(file)
@@ -136,14 +144,15 @@ class SerializerJsonlines:
  @staticmethod
  def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
  """
- Writes a dataflow iteratively to a .jsonl file. Every datapoint must be a dict where all items are serializable.
- As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
- unexpectedly large file
+ Writes a dataflow iteratively to a `.jsonl` file. Every datapoint must be a dict where all items are
+ serializable. As the length of the dataflow cannot be determined in every case max_datapoint prevents
+ generating an unexpectedly large file

- :param df: The dataflow to write from.
- :param path: The path, the .jsonl file to write to.
- :param file_name: name of the target file.
- :param max_datapoints: maximum number of datapoint to consider writing to a file.
+ Args:
+ df: The dataflow to write from.
+ path: The path, the .jsonl file to write to.
+ file_name: name of the target file.
+ max_datapoints: maximum number of datapoint to consider writing to a file.
  """

  if not os.path.isdir(path):
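Putting the `load` and `save` signatures above together gives a simple round trip. A sketch under the assumption that the class lives in `deepdoctection.dataflow.custom_serialize` (as the file list suggests), with placeholder paths:

```python
from deepdoctection.dataflow.custom_serialize import SerializerJsonlines

# Stream at most 1000 dicts from a .jsonl file.
df = SerializerJsonlines.load("path/to/file.jsonl", max_datapoints=1000)
df.reset_state()
for dp in df:
    print(dp)  # each datapoint is a dict

# Write a dataflow back out; `path` must be an existing directory and
# max_datapoints caps the output when the dataflow length is unknown.
df = SerializerJsonlines.load("path/to/file.jsonl")
SerializerJsonlines.save(df, "path/to/out_dir", "copy.jsonl", max_datapoints=1000)
```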
@@ -166,27 +175,30 @@ class SerializerJsonlines:
  class SerializerTabsepFiles:
  """
  Serialize a dataflow from a tab separated text file. Alternatively, save a dataflow of plain text
- to a .txt file.
+ to a `.txt` file.

- **Example**:
-
- df = SerializerTabsepFiles.load("path/to/file.txt")
+ Example:
+ ```python
+ df = SerializerTabsepFiles.load("path/to/file.txt")

  will yield each text line of the file.
+ ```
  """

  @staticmethod
- def load(path: PathLikeOrStr, max_datapoins: Optional[int] = None) -> CustomDataFromList:
+ def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> CustomDataFromList:
  """
- :param path: a path to a .txt file.
- :param max_datapoins: Will stop the iteration once max_datapoints have been streamed
+ Args:
+ path: a path to a .txt file.
+ max_datapoints: Will stop the iteration once max_datapoints have been streamed

- :return: dataflow to iterate from
+ Returns:
+ Dataflow to iterate from
  """

  with open(path, "r", encoding="UTF-8") as file:
  file_list = file.readlines()
- return CustomDataFromList(file_list, max_datapoints=max_datapoins)
+ return CustomDataFromList(file_list, max_datapoints=max_datapoints)

  @staticmethod
  def save(df: DataFlow, path: PathLikeOrStr, file_name: str, max_datapoints: Optional[int] = None) -> None:
@@ -195,10 +207,11 @@ class SerializerTabsepFiles:
  As the length of the dataflow cannot be determined in every case max_datapoint prevents generating an
  unexpectedly large file

- :param df: The dataflow to write from.
- :param path: The path, the .txt file to write to.
- :param file_name: name of the target file.
- :param max_datapoints: maximum number of datapoint to consider writing to a file.
+ Args:
+ df: The dataflow to write from.
+ path: The path, the .txt file to write to.
+ file_name: Name of the target file.
+ max_datapoints: Maximum number of datapoint to consider writing to a file.
  """

  if not os.path.isdir(path):
@@ -220,6 +233,13 @@ class SerializerFiles:
  """
  Serialize files from a directory and all subdirectories. Only one file type can be serialized. Once specified, all
  other types will be filtered out.
+
+ Example:
+ ```python
+ df = SerializerFiles.load("path/to/dir",file_type=".pdf")
+
+ will yield absolute paths to all `.pdf` files in the directory and all subdirectories.
+ ```
  """

  @staticmethod
@@ -234,12 +254,15 @@ class SerializerFiles:
  Generates a dataflow where a datapoint consists of a string of names of files with respect to some file type.
  If you want to load the files you need to do this in a following step by yourself.

- :param path: A path to some base directory. Will inspect all subdirectories, as well
- :param file_type: A file type (suffix) to look out for (single str or list of stings)
- :param max_datapoints: Stop iteration after passing max_datapoints
- :param shuffle: Shuffle the files, so that the order of appearance in dataflow is random.
- :param sort: If set to "True" it will sort all selected files by its string
- :return: dataflow to iterate from
+ Args:
+ path: A path to some base directory. Will inspect all subdirectories, as well
+ file_type: A file type (suffix) to look out for (single str or list of stings)
+ max_datapoints: Stop iteration after passing max_datapoints
+ shuffle: Shuffle the files, so that the order of appearance in dataflow is random.
+ sort: If set to `True` it will sort all selected files by its string
+
+ Returns:
+ Dataflow to iterate from
  """
  df: DataFlow
  df1: DataFlow
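Based on the `Args` shown above, a hedged sketch of collecting every PDF path below a directory (same import-path assumption as before, placeholder directory):

```python
from deepdoctection.dataflow.custom_serialize import SerializerFiles

# Yields path strings for all .pdf files in the directory and its subdirectories,
# sorted by name instead of shuffled.
df = SerializerFiles.load("path/to/dir", file_type=".pdf", shuffle=False, sort=True)
df.reset_state()
pdf_paths = list(df)
```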
@@ -286,10 +309,11 @@ class SerializerFiles:

  class CocoParser:
  """
- A simplified version of the Microsoft COCO helper class for reading annotations. It currently supports only
+ A simplified version of the COCO helper class for reading annotations. It currently supports only
  bounding box annotations

- :param annotation_file: location of annotation file
+ Args:
+ annotation_file: Location of annotation file
  """

  def __init__(self, annotation_file: Optional[PathLikeOrStr] = None) -> None:
@@ -358,14 +382,16 @@ class CocoParser:
  is_crowd: Optional[bool] = None,
  ) -> Sequence[int]:
  """
- Get ann ids that satisfy given filter conditions. default skips that filter
+ Get annotation ids that satisfy given filter conditions. default skips that filter

- :param img_ids: get anns for given imgs
- :param cat_ids: get anns for given cats
- :param area_range: get anns for given area range (e.g. [0 inf])
- :param is_crowd: get anns for given crowd label (False or True)
+ Args:
+ img_ids: get anns for given imgs
+ cat_ids: get anns for given cats
+ area_range: get anns for given area range (e.g. [0 inf])
+ is_crowd: get anns for given crowd label (False or True)

- :return: ids: integer array of ann ids
+ Returns:
+ ids: integer array of ann ids
  """

  if img_ids is None:
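The `CocoParser` accessors mirror the `pycocotools` `COCO` API. A sketch of chaining them to pull bounding boxes for a couple of categories (placeholder file and category ids; import path assumed as above):

```python
from deepdoctection.dataflow.custom_serialize import CocoParser

parser = CocoParser("path/to/annotations.json")

# Images that contain all the requested categories ...
img_ids = parser.get_img_ids(cat_ids=[1, 2])

# ... and their box annotations, excluding crowd regions.
ann_ids = parser.get_ann_ids(img_ids=img_ids, cat_ids=[1, 2], is_crowd=False)
for ann in parser.load_anns(ann_ids):
    print(ann["bbox"])
```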
@@ -403,13 +429,15 @@ class CocoParser:
  category_ids: Optional[Union[int, Sequence[int]]] = None,
  ) -> Sequence[int]:
  """
- Filtering parameters. default skips that filter.
+ Filtering parameters. Default does not filter anything.

- :param category_names: get cats for given cat names
- :param super_category_names: get cats for given super category names
- :param category_ids: get cats for given cat ids
+ Args:
+ category_names: get cats for given cat names
+ super_category_names: get cats for given super category names
+ category_ids: get cats for given cat ids

- :return: ids: integer array of cat ids
+ Returns:
+ ids: integer array of cat ids
  """

  if category_names is None:
@@ -441,12 +469,14 @@ class CocoParser:
  self, img_ids: Optional[Union[int, Sequence[int]]] = None, cat_ids: Optional[Union[int, Sequence[int]]] = None
  ) -> Sequence[int]:
  """
- Get img ids that satisfy given filter conditions.
+ Get image ids that satisfy given filter conditions.

- :param img_ids: get imgs for given ids
- :param cat_ids: get imgs with all given cats
+ Args:
+ img_ids: get imgs for given ids
+ cat_ids: get imgs with all given cats

- :return: ids: integer array of img ids
+ Returns:
+ ids: integer array of img ids
  """

  if img_ids is None:
@@ -472,9 +502,11 @@ class CocoParser:
  """
  Load anns with the specified ids.

- :param ids: integer ids specifying anns
+ Args:
+ ids: integer ids specifying anns

- :return: anns: loaded ann objects
+ Returns:
+ anns: loaded ann objects
  """
  if ids is None:
  ids = []
@@ -486,9 +518,11 @@ class CocoParser:
  """
  Load cats with the specified ids.

- :param ids: integer ids specifying cats
+ Args:
+ ids: integer ids specifying cats

- :return: cats: loaded cat objects
+ Returns:
+ cats: loaded cat objects
  """
  if ids is None:
  ids = []
@@ -500,9 +534,11 @@ class CocoParser:
  """
  Load anns with the specified ids.

- :param ids: integer ids specifying img
+ Args:
+ ids: integer ids specifying img

- :return: imgs: loaded img objects
+ Returns:
+ imgs: loaded img objects
  """
  if ids is None:
  ids = []
@@ -513,31 +549,34 @@ class CocoParser:

  class SerializerCoco:
  """
- Class for serializing annotation files in Coco format. Coco comes in JSON format which is a priori not
+ Class for serializing annotation files in COCO format. COCO comes in `JSON` format which is a priori not
  serialized. This class implements only the very basic methods to generate a dataflow. It wraps the coco class
- from pycocotools and assembles annotations that belong to the image. Note, that the conversion into the core
- `Image` has to be done by yourself.
+ from `pycocotools` and assembles annotations that belong to the image.
+
+ Note:
+ Conversion into the core `Image` has to be done by yourself.
+
+ Example:
+ ```python
+ df = SerializerCoco.load("path/to/annotations.json")
+ df.reset_state()
+ for dp in df:
+ # {'image':{'id',...},'annotations':[{'id':…,'bbox':...}]}
+ ```
+
  """

  @staticmethod
  def load(path: PathLikeOrStr, max_datapoints: Optional[int] = None) -> DataFlow:
  """
- Loads a .json file and generates a dataflow.
-
- **Example:**
-
- {'images':[img1,img2,...], 'annotations':[ann1,ann2,...],...}
-
- it will generate a dataflow with datapoints
+ Loads a `.json` file and generates a dataflow.

+ Args:
+ max_datapoints: Will stop the iteration once max_datapoints have been streamed.
+ path: a path to a .json file.

- {'image':{'id',...},'annotations':[{'id':…,'bbox':...}]}
-
- for each image id. We use the type hint CocoDatapointDict to describe this dictionary
-
- :param max_datapoints: Will stop the iteration once max_datapoints have been streamed.
- :param path: a path to a .json file.
- :return: dataflow to iterate from
+ Returns:
+ dataflow to iterate from
  """
  if not os.path.isfile(path):
  raise FileNotFoundError(path)
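The new docstring example already shows the datapoint layout; a slightly fuller sketch that groups boxes per image (same import-path assumption and placeholder path as the previous snippets):

```python
from deepdoctection.dataflow.custom_serialize import SerializerCoco

df = SerializerCoco.load("path/to/annotations.json", max_datapoints=100)
df.reset_state()
for dp in df:
    # One datapoint per image: the image record plus every annotation pointing to it.
    boxes = [ann["bbox"] for ann in dp["annotations"]]
    print(dp["image"]["id"], len(boxes))
```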
@@ -570,13 +609,15 @@ class SerializerPdfDoc:
  """
  Serialize a pdf document with an arbitrary number of pages.

- **Example:**
-
- df = SerializerPdfDoc.load("path/to/document.pdf")
+ Example:
+ ```python
+ df = SerializerPdfDoc.load("path/to/document.pdf")

  will yield datapoints:

- {"path": "path/to/document.pdf", "file_name" document_page_1.pdf, "pdf_bytes": b"some-bytes"}
+ {"path": "path/to/document.pdf", "file_name" document_page_1.pdf, "pdf_bytes": b"some-bytes"}
+ ```
+
  """

  @staticmethod
@@ -584,10 +625,13 @@ class SerializerPdfDoc:
  """
  Loads the document page wise and returns a dataflow accordingly.

- :param path: Path to the pdf document.
- :param max_datapoints: The maximum number of pages to stream.
- :return: A dict with structure {"path":... ,"file_name": ..., "pdf_bytes": ...}. The file name is a
- concatenation of the physical file name and the current page number.
+ Args:
+ path: Path to the pdf document.
+ max_datapoints: The maximum number of pages to stream.
+
+ Returns:
+ A dict with structure `{"path":... ,"file_name": ..., "pdf_bytes": ...}`. The file name is a
+ concatenation of the physical file name and the current page number.
  """

  file_name = os.path.split(path)[1]
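A short usage sketch for the page-wise PDF serializer described above (placeholder path; import path assumed as in the previous snippets):

```python
from deepdoctection.dataflow.custom_serialize import SerializerPdfDoc

# One datapoint per page: {"path": ..., "file_name": ..., "pdf_bytes": ...}
df = SerializerPdfDoc.load("path/to/document.pdf", max_datapoints=10)
df.reset_state()
for dp in df:
    print(dp["file_name"], len(dp["pdf_bytes"]))  # e.g. "document_page_1.pdf", number of bytes
```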
@@ -9,7 +9,6 @@ Replaces relevant parts of the Dataflow package. Most of the functions have been

  <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/parallel.py>
  <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/parallel_map.py>
-
  """

  import atexit
@@ -163,18 +162,19 @@ class MultiThreadMapData(_ParallelMapData):
  The semantics of this class is **identical** to `MapData` except for the ordering.
  Threads run in parallel and can take different time to run the
  mapping function. Therefore, the order of datapoints won't be preserved.
- When ``strict=True``, ``MultiThreadMapData(df, ...)``
- is guaranteed to produce the exact set of data as ``MapData(df, ...)``,
- if both are iterated until ``StopIteration``. But the produced data will have different ordering.
- The behavior of strict mode is undefined if the given dataflow ``df`` is infinite.
- When ``strict=False``, the data that's produced by ``MultiThreadMapData(df, ...)``
- is a reordering of the data produced by ``RepeatedData(MapData(df, ...), -1)``.
- In other words, first pass of ``MultiThreadMapData.__iter__`` may contain
- datapoints from the second pass of ``df.__iter__``.
+ When `strict=True`, `MultiThreadMapData(df, ...)`
+ is guaranteed to produce the exact set of data as `MapData(df, ...)`,
+ if both are iterated until `StopIteration`. But the produced data will have different ordering.
+ The behavior of strict mode is undefined if the given dataflow `df` is infinite.
+ When `strict=False`, the data that's produced by `MultiThreadMapData(df, ...)`
+ is a re-ordering of the data produced by `RepeatedData(MapData(df, ...), -1)`.
+ In other words, first pass of `MultiThreadMapData.__iter__` may contain
+ datapoints from the second pass of `df.__iter__`.
+
  Note:
  1. You should avoid starting many threads in your main process to reduce GIL contention.
  The threads will only start in the process which calls `reset_state()`.
- Therefore you can use ``MultiProcessRunnerZMQ(MultiThreadMapData(...), 1)``
+ Therefore you can use `MultiProcessRunnerZMQ(MultiThreadMapData(...), 1)`
  to reduce GIL contention.
  """

@@ -215,12 +215,13 @@ class MultiThreadMapData(_ParallelMapData):
  strict: bool = False,
  ):
  """
- :param df: the dataflow to map
- :param num_thread: number of threads to use
- :param map_func: datapoint -> datapoint | None. Return None to
- discard/skip the datapoint.
- :param buffer_size: number of datapoints in the buffer
- :param strict: use "strict mode", see notes above.
+ Args:
+ df: the dataflow to map
+ num_thread: number of threads to use
+ map_func: datapoint -> datapoint | None. Return None to
+ discard/skip the datapoint.
+ buffer_size: number of datapoints in the buffer
+ strict: use "strict mode", see notes above.
  """
  if strict:
  # In strict mode, buffer size cannot be larger than the total number of datapoints
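The constructor arguments above are enough for a small usage sketch. Assuming `DataFromList` and `MultiThreadMapData` are importable from the modules named in the file list, a thread-parallel map in strict mode looks like this:

```python
from deepdoctection.dataflow.parallel_map import MultiThreadMapData
from deepdoctection.dataflow.serialize import DataFromList


def square(dp: int) -> int:
    # Per the docstring, returning None here would discard the datapoint.
    return dp * dp


df = DataFromList(list(range(100)), shuffle=False)
# strict=True: same set of outputs as MapData, but in arbitrary order.
df = MultiThreadMapData(df, num_thread=4, map_func=square, buffer_size=20, strict=True)
df.reset_state()
results = sorted(df)
```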
@@ -290,7 +291,7 @@ class _MultiProcessZMQDataFlow(DataFlow, ABC):

  def reset_state(self) -> Any:
  """
- All forked dataflows should only be reset **once and only once** in spawned processes.
+ All forked dataflows should only be reset once and only once in spawned processes.
  Subclasses should call this method with super.
  """
  assert not self._reset_done, "reset_state() was called twice! This violates the API of DataFlow!"
@@ -338,17 +339,17 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
  """
  Same as `MapData`, but start processes to run the mapping function,
  and communicate with ZeroMQ pipe.
- The semantics of this class is **identical** to `MapData` except for the ordering.
+ The semantics of this class is identical to `MapData` except for the ordering.
  Processes run in parallel and can take different time to run the
  mapping function. Therefore, the order of datapoints won't be preserved.
- When ``strict=True``, ``MultiProcessMapData(df, ...)``
- is guaranteed to produce the exact set of data as ``MapData(df, ...)``,
- if both are iterated until ``StopIteration``. But the produced data will have different ordering.
- The behavior of strict mode is undefined if the given dataflow ``df`` is infinite.
- When ``strict=False``, the data that's produced by ``MultiProcessMapData(df, ...)``
- is a reordering of the data produced by ``RepeatedData(MapData(df, ...), -1)``.
- In other words, first pass of ``MultiProcessMapData.__iter__`` may contain
- datapoints from the second pass of ``df.__iter__``.
+ When `strict=True`, `MultiProcessMapData(df, ...)`
+ is guaranteed to produce the exact set of data as `MapData(df, ...)`,
+ if both are iterated until `StopIteration`. But the produced data will have different ordering.
+ The behavior of strict mode is undefined if the given dataflow `df` is infinite.
+ When `strict=False`, the data that's produced by `MultiProcessMapData(df, ...)`
+ is a reordering of the data produced by `RepeatedData(MapData(df, ...), -1)`.
+ In other words, first pass of `MultiProcessMapData.__iter__` may contain
+ datapoints from the second pass of `df.__iter__`.
  """

  class _Worker(mp.Process):
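The process-based variant has the same semantics; the map function runs in worker processes, so it must be picklable and the script needs a `__main__` guard. A sketch under the same import-path assumption:

```python
from deepdoctection.dataflow.parallel_map import MultiProcessMapData
from deepdoctection.dataflow.serialize import DataFromList


def heavy_transform(dp: int) -> int:
    return dp * dp  # placeholder for an expensive, picklable mapping


if __name__ == "__main__":
    df = DataFromList(list(range(1000)), shuffle=False)
    # Worker processes send results back over a ZeroMQ pipe; with strict=True
    # the output is the same set of datapoints as MapData, only reordered.
    df = MultiProcessMapData(df, num_proc=2, map_func=heavy_transform, buffer_size=100, strict=True)
    df.reset_state()
    print(sum(df))
```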
@@ -384,11 +385,12 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
  strict: bool = False,
  ) -> None:
  """
- :param df: the dataflow to map
- :param num_proc: number of threads to use
- :param map_func: datapoint -> datapoint | None. Return None to
- :param buffer_size: number of datapoints in the buffer
- :param strict: use "strict mode", see notes above.
+ Args:
+ df: the dataflow to map
+ num_proc: number of threads to use
+ map_func: datapoint -> datapoint | None. Return None to
+ buffer_size: number of datapoints in the buffer
+ strict: use "strict mode", see notes above.
  """
  if strict:
  # In strict mode, buffer size cannot be larger than the total number of datapoints
@@ -25,8 +25,9 @@ class DataFromList(RNGDataFlow):

  def __init__(self, lst: list[Any], shuffle: bool = True) -> None:
  """
- :param lst: input list. Each element is a datapoint.
- :param shuffle: shuffle data.
+ Args:
+ lst: input list. Each element is a datapoint.
+ shuffle: shuffle data.
  """
  super().__init__()
  self.lst = lst
@@ -53,7 +54,8 @@ class DataFromIterable(DataFlow):

  def __init__(self, iterable: Iterable[Any]) -> None:
  """
- :param iterable: an iterable object
+ Args:
+ iterable: an iterable object
  """
  self._itr = iterable
  self._len: Optional[int] = None
@@ -86,12 +88,13 @@ class FakeData(RNGDataFlow):
  domain: tuple[Union[float, int], Union[float, int]] = (0, 1),
  ):
  """
- :param shapes: a list of lists/tuples. Shapes of each component.
- :param size: size of this DataFlow.
- :param random: whether to randomly generate data every iteration.
+ Args:
+ shapes: a list of lists/tuples. Shapes of each component.
+ size: size of this DataFlow.
+ random: whether to randomly generate data every iteration.
  Note that merely generating the data could sometimes be time-consuming!
- :param dtype: data type as string, or a list of data types.
- :param domain: (min, max) tuple, or a list of such tuples
+ dtype: data type as string, or a list of data types.
+ domain: (min, max) tuple, or a list of such tuples
  """

  super().__init__()
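From the parameters documented above, a hedged sketch of generating fake, image-like datapoints (that the components come back as `numpy` arrays is an assumption carried over from the tensorpack original this module replaces):

```python
from deepdoctection.dataflow.serialize import FakeData

# Two components per datapoint: a fake 256x256 RGB image and a 10-dim label vector.
df = FakeData(shapes=[[256, 256, 3], [10]], size=8, random=False, dtype="uint8", domain=(0, 255))
df.reset_state()
for image, label in df:
    print(image.shape, label.shape)
```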
@@ -136,13 +139,15 @@ class PickleSerializer:
  @staticmethod
  def dumps(obj: Any) -> bytes:
  """
- :param obj: bytes
+ Args:
+ obj: bytes
  """
  return pickle.dumps(obj, protocol=-1)

  @staticmethod
  def loads(buf: Any) -> Any:
  """
- :param buf: bytes
+ Args:
+ buf: bytes
  """
  return pickle.loads(buf)
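Despite the `obj: bytes` wording in the docstring, `dumps` pickles any picklable object (it simply calls `pickle.dumps(obj, protocol=-1)`). A tiny round trip, assuming the class sits in `deepdoctection.dataflow.serialize` as the file list suggests:

```python
from deepdoctection.dataflow.serialize import PickleSerializer

payload = {"file_name": "document_page_1.pdf", "boxes": [[10, 10, 100, 40]]}
buf = PickleSerializer.dumps(payload)      # bytes, highest pickle protocol
assert PickleSerializer.loads(buf) == payload
```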