deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.0.dist-info/METADATA +0 -431
- deepdoctection-0.42.0.dist-info/RECORD +0 -148
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
deepdoctection/dataflow/common.py

@@ -6,10 +6,9 @@


 """
-Some
-
-<https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
+Some DataFlows for transforming and processing datapoints
 """
+
 import itertools
 from copy import copy
 from typing import Any, Callable, Iterator, Union
@@ -25,9 +24,10 @@ class TestDataSpeed(ProxyDataFlow):

     def __init__(self, df: DataFlow, size: int = 5000, warmup: int = 0) -> None:
         """
-        :
-
-
+        Args:
+            df: The DataFlow to test.
+            size: Number of datapoints to fetch.
+            warmup: Warmup iterations.
         """
         super().__init__(df)
         self.test_size = int(size)
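For context, a small benchmark sketch of the `TestDataSpeed` constructor documented above. It assumes the class keeps the `start()` helper from the vendored tensorpack implementation and that both classes are re-exported from `deepdoctection.dataflow`:

```python
from deepdoctection.dataflow import DataFromList, TestDataSpeed

# Toy dataflow with 10000 single-component datapoints.
df = DataFromList([[i] for i in range(10000)], shuffle=False)

# Iterates over 5000 datapoints after 100 warmup steps and reports throughput.
TestDataSpeed(df, size=5000, warmup=100).start()  # start() assumed from tensorpack
```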
@@ -63,16 +63,16 @@ class TestDataSpeed(ProxyDataFlow):

 class FlattenData(ProxyDataFlow):
     """
-
-
-    **Example:**
-
-        dp_1 = ['a','b']
-        dp_2 = ['c','d']
+    FlattenData flattens an iterator within a datapoint. Will flatten the datapoint if it is a list or a tuple.

-
+    Example:
+        ```python
+        dp_1 = ['a','b']
+        dp_2 = ['c','d']

-
+        yields:
+        ['a'], ['b'], ['c'], ['d']
+        ```
     """

     def __iter__(self) -> Any:
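A runnable version of the `FlattenData` docstring example, assuming the class is still re-exported from `deepdoctection.dataflow` as in 0.42:

```python
from deepdoctection.dataflow import DataFromList, FlattenData

df = DataFromList([["a", "b"], ["c", "d"]], shuffle=False)
df = FlattenData(df)
df.reset_state()  # dataflows must be reset before the first iteration
print(list(df))   # [['a'], ['b'], ['c'], ['d']], per the docstring above
```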
@@ -84,23 +84,25 @@ class FlattenData(ProxyDataFlow):

 class MapData(ProxyDataFlow):
     """
-
-    Note:
-        1. Please make sure func doesn't modify its arguments in place,
-           unless you're certain it's safe.
-        2. If you discard some datapoints, `len(MapData(ds))` will be incorrect.
+    MapData applies a mapper/filter on the datapoints of a DataFlow.

-
+    Notes:
+        1. Please ensure that `func` does not modify its arguments in-place unless it is safe.
+        2. If some datapoints are discarded, `len(MapData(ds))` will be incorrect.

-
-
+    Example:
+        ```python
+        df = ...  # a DataFlow where each datapoint is [img, label]
+        ds = MapData(ds, lambda dp: [dp[0] * 255, dp[1]])
+        ```
     """

     def __init__(self, df: DataFlow, func: Callable[[Any], Any]) -> None:
         """
-        :
-
-
+        Args:
+            df: input DataFlow
+            func: takes a datapoint and returns a new
+                datapoint. Return None to discard/skip this datapoint.
         """
         super().__init__(df)
         self.func = func
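A short sketch of the documented `MapData` semantics, including the return-`None`-to-drop convention that makes `len()` unreliable (import path assumed as above):

```python
from deepdoctection.dataflow import DataFromList, MapData

df = DataFromList([[1, "odd"], [2, "even"], [3, "odd"]], shuffle=False)
# Returning None drops the datapoint entirely.
df = MapData(df, lambda dp: [dp[0] * 10, dp[1]] if dp[0] % 2 else None)
df.reset_state()
print(list(df))  # [[10, 'odd'], [30, 'odd']]
```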
@@ -114,27 +116,27 @@ class MapData(ProxyDataFlow):

 class MapDataComponent(MapData):
     """
-
-
-
-    1. This
-
-
-
-
-
-
-
-
-        ds = MapDataComponent(ds, lambda img: img * 255, 0)  # map the 0th component
+    MapDataComponent applies a mapper/filter on a component of a datapoint.
+
+    Notes:
+        1. This DataFlow itself does not modify the datapoints. Please ensure that `func` does not modify its arguments
+           in-place unless it is safe.
+        2. If some datapoints are discarded, `len(MapDataComponent(ds, ..))` will be incorrect.
+
+    Example:
+        ```python
+        df = ...  # a DataFlow where each datapoint is [img, label]
+        ds = MapDataComponent(ds, lambda img: img * 255, 0)  # maps the 0th component
+        ```
     """

     def __init__(self, df: DataFlow, func: Callable[[Any], Any], index: Union[int, str] = 0) -> None:
         """
-        :
-
+        Args:
+            df: input DataFlow which produces either list or dict.
+            func (TYPE -> TYPE|None): takes ``dp[index]``, returns a new value for ``dp[index]``.
                 Return None to discard/skip this datapoint.
-
+            index: index or key of the component.
         """
         self._index = index
         self._func = func
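A minimal sketch of `MapDataComponent` on list datapoints, again assuming the `deepdoctection.dataflow` re-exports:

```python
from deepdoctection.dataflow import DataFromList, MapDataComponent

df = DataFromList([[1, "a"], [2, "b"]], shuffle=False)
df = MapDataComponent(df, lambda x: x * 255, 0)  # maps only dp[0]
df.reset_state()
print(list(df))  # [[255, 'a'], [510, 'b']]
```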
@@ -152,16 +154,21 @@ class MapDataComponent(MapData):


 class RepeatedData(ProxyDataFlow):
-    """
-
-
+    """
+    RepeatedData takes datapoints from another DataFlow and produces them until they are exhausted for a certain number
+    of repetitions.
+
+    Example:
+        ```python
+        dp1, dp2, .... dpn, dp1, dp2, ....dpn
+        ```
     """

     def __init__(self, df: DataFlow, num: int) -> None:
         """
-        :
-
-
+        Args:
+            df: Input DataFlow.
+            num: Number of repetitions of the DataFlow. Set `-1` to repeat the DataFlow infinitely.
         """
         self.num = num
         if self.num != -1:
@@ -173,7 +180,7 @@ class RepeatedData(ProxyDataFlow):
     def __len__(self) -> int:
         """
         Raises:
-
+            ValueError: when num == -1.
         """
         if self.num == -1:
             raise NotImplementedError("__len__() is unavailable for infinite dataflow")
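A sketch of both `RepeatedData` modes documented above, with the same import-path assumption:

```python
from deepdoctection.dataflow import DataFromList, RepeatedData

df = RepeatedData(DataFromList([[1], [2]], shuffle=False), num=2)
df.reset_state()
print(list(df))  # [[1], [2], [1], [2]]

endless = RepeatedData(DataFromList([[1]], shuffle=False), num=-1)
# len(endless) raises, as documented above: an infinite dataflow has no length.
```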
@@ -190,20 +197,23 @@ class RepeatedData(ProxyDataFlow):

 class ConcatData(DataFlow):
     """
-
-
-
+    ConcatData concatenates multiple DataFlows. Produces datapoints from each DataFlow and starts the next when one
+    DataFlow is exhausted. Use this DataFlow to process multiple .pdf files in one step.
+
+    Example:
+        ```python
+        df_1 = analyzer.analyze(path="path/to/pdf_1.pdf")
+        df_2 = analyzer.analyze(path="path/to/pdf_2.pdf")
+        df = ConcatData([df_1, df_2])
+        ```

-    **Example:**

-        df_1 = analyzer.analyze(path=path/to/pdf_1.pdf")
-        df_2 = analyzer.analyze(path=path/to/pdf_2.pdf")
-        df = ConcatData([df_1,df_2])
     """

     def __init__(self, df_lists: list[DataFlow]) -> None:
         """
-        :
+        Args:
+            df_lists: A list of DataFlows.
         """
         self.df_lists = df_lists

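A self-contained sketch of `ConcatData` using in-memory dataflows instead of the analyzer (import path assumed as above):

```python
from deepdoctection.dataflow import ConcatData, DataFromList

df_1 = DataFromList([["page_1"]], shuffle=False)
df_2 = DataFromList([["page_2"], ["page_3"]], shuffle=False)
df = ConcatData([df_1, df_2])
df.reset_state()
print(list(df))  # [['page_1'], ['page_2'], ['page_3']]
```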
@@ -221,28 +231,31 @@ class ConcatData(DataFlow):

 class JoinData(DataFlow):
     """
-
-
-
-    **Example:**
+    JoinData joins the components from each DataFlow. See below for its behavior. It is not possible to join a DataFlow
+    that produces lists with one that produces dictionaries.

+    Example:
+        ```python
         df1 produces: [[c1], [c2]]
         df2 produces: [[c3], [c4]]
         joined: [[c1, c3], [c2, c4]]

-        df1 produces: {"a":c1, "b":c2}
-        df2 produces: {"c":c3}
-        joined: {"a":c1, "b":c2, "c":c3}
+        df1 produces: {"a": c1, "b": c2}
+        df2 produces: {"c": c3}
+        joined: {"a": c1, "b": c2, "c": c3}
+        ```
+
+    `JoinData` stops once the first DataFlow raises a `StopIteration`.
+

-    `JoinData` will stop once the first Dataflow throws a StopIteration
     """

     def __init__(self, df_lists: list[DataFlow]) -> None:
         """
-        :
-
-
-
+        Args:
+            df_lists: A list of DataFlows. If these DataFlows have different sizes, `JoinData` stops when one of them is
+                exhausted. The list can contain the same DataFlow instance multiple times, but note that in this
+                case `__iter__` will also be called multiple times.
         """
         self.df_lists = df_lists

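A sketch of the `JoinData` behavior for list datapoints of different sizes, per the docstring above (import path assumed):

```python
from deepdoctection.dataflow import DataFromList, JoinData

df_1 = DataFromList([["c1"], ["c2"], ["c3"]], shuffle=False)
df_2 = DataFromList([["c4"], ["c5"]], shuffle=False)
df = JoinData([df_1, df_2])
df.reset_state()
# Stops as soon as the shorter dataflow is exhausted.
print(list(df))  # [['c1', 'c4'], ['c2', 'c5']]
```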
@@ -275,18 +288,26 @@ class JoinData(DataFlow):

 class BatchData(ProxyDataFlow):
     """
-
+    BatchData stacks datapoints into batches. It produces datapoints with the same number of components as `df`, but
     each datapoint is now a list of datapoints.
+
+    Example:
+        ```python
+        df produces: [[c1], [c2], [c3], [c4]]
+        batch_size = 2
+        yields: [[c1, c2], [c3, c4]]
+        ```
+
     """

     def __init__(self, df: DataFlow, batch_size: int, remainder: bool = False) -> None:
         """
-        :
-
-
-
-
-
+        Args:
+            df: A DataFlow.
+            batch_size: Batch size.
+            remainder: If the remaining datapoints in `df` are not enough to form a batch, whether to produce the
+                remaining data as a smaller batch. If set to `False`, all produced datapoints are guaranteed to
+                have the same batch size. If set to `True`, `len(ds)` must be accurate.
         """
         super().__init__(df)
         if not remainder:
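A sketch of `BatchData` with the `remainder` flag described above; the exact nesting of the output follows the docstring example, so it is only printed here rather than asserted (import path assumed):

```python
from deepdoctection.dataflow import BatchData, DataFromList

df = DataFromList([["c1"], ["c2"], ["c3"], ["c4"], ["c5"]], shuffle=False)
df = BatchData(df, batch_size=2, remainder=True)  # remainder=True emits a final short batch
df.reset_state()
for batch in df:
    print(batch)  # two full batches of size 2, then a remainder batch containing only 'c5'
```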
deepdoctection/dataflow/custom.py

@@ -16,8 +16,7 @@
 # limitations under the License.

 """
-
-from
+Some custom dataflow classes. Some ideas have been taken from

 <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
 """
@@ -40,18 +39,22 @@ class CacheData(ProxyDataFlow):
     Completely cache the first pass of a DataFlow in memory,
     and produce from the cache thereafter.

-
-
+    Note:
+        The user should not stop the iterator before it has reached the end.
+        Otherwise, the cache may be incomplete.

-
+    Example:
+        ```python
+        df_list = CacheData(df).get_cache()  # Buffers the whole dataflow and return a list of all datapoints
+        ```

-        df_list = CacheData(df).get_cache()  # buffers the whole dataflow and return a list of all datapoints
     """

     def __init__(self, df: DataFlow, shuffle: bool = False) -> None:
         """
-        :
-
+        Args:
+            df: input DataFlow.
+            shuffle: whether to shuffle the cache before yielding from it.
         """
         self.shuffle = shuffle
         self.buffer: list[Any] = []
@@ -80,9 +83,10 @@ class CacheData(ProxyDataFlow):

     def get_cache(self) -> list[Any]:
         """
-
+        Get the cache of the whole dataflow as a list.

-        :
+        Returns:
+            list of datapoints
         """
         self.reset_state()
         with get_tqdm() as status_bar:
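A minimal sketch of `CacheData.get_cache()`, which per the hunk above resets the dataflow itself before buffering (import path assumed):

```python
from deepdoctection.dataflow import CacheData, DataFromList

df = DataFromList([[1], [2], [3]], shuffle=False)
cache = CacheData(df)
datapoints = cache.get_cache()  # one full pass over df, returned as a list
print(datapoints)  # [[1], [2], [3]]
```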
@@ -95,21 +99,22 @@ class CacheData(ProxyDataFlow):

 class CustomDataFromList(DataFromList):
     """
-    Wraps a list of datapoints to a dataflow. Compared to `Tensorpack.DataFlow.DataFromList`
-    can specify a number of datapoints after that the iteration stops.
-    filters on that list.
+    Wraps a list of datapoints to a dataflow. Compared to `Tensorpack.DataFlow.DataFromList`
+    implementation you can specify a number of datapoints after that the iteration stops.
+    You can also pass a re-balance function that filters on that list.

-
+    Example:

-
-
+        ```python
+        def filter_first(lst):
+            return lst.pop(0)

-
-
+        df = CustomDataFromList(lst=[["a","b"],["c","d"]], rebalance_func=filter_first)
+        df.reset_state()

     will yield:
-
     ["c","d"]
+        ```

     """

@@ -121,13 +126,14 @@ class CustomDataFromList(DataFromList):
         rebalance_func: Optional[Callable[[list[Any]], list[Any]]] = None,
     ):
         """
-        :
-
-
-
-
-
-
+        Args:
+            lst: The input list. Each element represents a datapoint.
+            shuffle: Whether to shuffle the list before streaming.
+            max_datapoints: The maximum number of datapoints to return before stopping the iteration.
+                If None it streams the whole dataflow.
+            rebalance_func: A func that inputs a list and outputs a list. Useful, if you want to filter the passed
+                list and re-balance the sample. Only the output list of the re-balancing function will be
+                considered.
         """
         super().__init__(lst, shuffle)
         self.max_datapoints = max_datapoints
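Complementing the `rebalance_func` example in the docstring above, a sketch of the `max_datapoints` cutoff (import path assumed):

```python
from deepdoctection.dataflow import CustomDataFromList

df = CustomDataFromList(lst=[["a"], ["b"], ["c"]], max_datapoints=2)
df.reset_state()
print(list(df))  # [['a'], ['b']]: iteration stops after max_datapoints
```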
@@ -176,9 +182,10 @@ class CustomDataFromIterable(DataFromIterable):

     def __init__(self, iterable: Iterable[Any], max_datapoints: Optional[int] = None):
         """
-        :
-
-
+        Args:
+            iterable: An iterable object
+            max_datapoints: The maximum number of datapoints to stream. If None it iterates through the whole
+                dataflow.
         """
         super().__init__(iterable)
         self.max_datapoints = max_datapoints
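A sketch of `CustomDataFromIterable` bounding an unbounded source, which is the use case the `max_datapoints` argument above enables (import path assumed):

```python
from itertools import count

from deepdoctection.dataflow import CustomDataFromIterable

# Caps an otherwise endless generator at three datapoints.
df = CustomDataFromIterable(([i] for i in count()), max_datapoints=3)
df.reset_state()
print(list(df))  # [[0], [1], [2]]
```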