deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py

@@ -19,15 +19,13 @@ import os
 import sys
 from typing import TYPE_CHECKING
 
-from packaging import version
-
-from .utils.env_info import auto_select_lib_and_device
+from .utils.env_info import collect_env_info
 from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
-from .utils.logger import logger
+from .utils.logger import LoggingRecord, logger
 
 # pylint: enable=wrong-import-position
 
-__version__ = 0.30
+__version__ = 0.32
 
 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -179,8 +177,10 @@ _IMPORT_STRUCTURE = {
         "Jdeskewer",
         "DoctrTextlineDetector",
         "DoctrTextRecognizer",
+        "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
+        "get_tokenizer_from_architecture",
         "HFLayoutLmTokenClassifierBase",
         "HFLayoutLmTokenClassifier",
         "HFLayoutLmv2TokenClassifier",
@@ -188,12 +188,16 @@ _IMPORT_STRUCTURE = {
         "HFLayoutLmSequenceClassifier",
         "HFLayoutLmv2SequenceClassifier",
         "HFLayoutLmv3SequenceClassifier",
+        "HFLiltTokenClassifier",
+        "HFLiltSequenceClassifier",
+        "HFLmSequenceClassifier",
         "ModelProfile",
         "ModelCatalog",
         "print_model_infos",
         "ModelDownloadManager",
         "PdfPlumberTextDetector",
         "TesseractOcrDetector",
+        "TesseractRotationTransformer",
         "TextractOcrDetector",
         "TPFrcnnDetector",
     ],
@@ -266,11 +270,11 @@ _IMPORT_STRUCTURE = {
         "DoctectionPipe",
         "LanguageDetectionService",
         "ImageLayoutService",
-        "get_tokenizer_from_architecture",
         "LMTokenClassifierService",
         "LMSequenceClassifierService",
         "OrderGenerator",
         "TextLineGenerator",
+        "TextLineService",
         "TextOrderService",
         "TableSegmentationRefinementService",
         "generate_html_string",
@@ -279,7 +283,7 @@ _IMPORT_STRUCTURE = {
         "PubtablesSegmentationService",
         "SegmentationResult",
         "TextExtractionService",
-        "SimpleTransformPipelineComponent",
+        "SimpleTransformService",
     ],
     "train": [
         "D2Trainer",
@@ -295,14 +299,13 @@ _IMPORT_STRUCTURE = {
         "save_tmp_file",
         "timed_operation",
         "collect_env_info",
-        "get_device",
-        "auto_select_lib_and_device",
         "auto_select_viz_library",
         "get_tensorflow_requirement",
         "tf_addons_available",
         "get_tf_addons_requirements",
         "tensorpack_available",
         "get_tensorpack_requirement",
+        "pytorch_available",
         "get_pytorch_requirement",
         "lxml_available",
         "get_lxml_requirement",
@@ -416,25 +419,31 @@ _IMPORT_STRUCTURE = {
     ],
 }
 
+# Setting some environment variables so that standard functions can be invoked with available hardware
+env_info = collect_env_info()
+logger.debug(LoggingRecord(msg=env_info))
 
-# disable TF warnings for versions > 2.4.1
-if tf_available():
-    if version.parse(get_tf_version()) > version.parse("2.4.1"):
-        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
-    try:
-        import tensorflow.python.util.deprecation as deprecation  # type: ignore # pylint: disable=E0401,R0402
-
-        deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-    except Exception:  # pylint: disable=W0703
-        try:
-            from tensorflow.python.util import deprecation  # type: ignore # pylint: disable=E0401
-
-            deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-        except Exception:  # pylint: disable=W0703
-            pass
+if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
+    os.environ["DD_USE_TORCH"] = "1"
+    os.environ["USE_TORCH"] = "1"
+if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
+    os.environ["DD_USE_TF"] = "1"
+    os.environ["USE_TF"] = "1"
+if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
+    logger.warning(
+        "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
+        "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
+    )
+    os.environ.pop("DD_USE_TF")
+    os.environ.pop("USE_TF")
 
-# Setting some environment variables so that standard functions can be invoked with available hardware
-auto_select_lib_and_device()
+if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
+    logger.warning(
+        LoggingRecord(
+            msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
+            "model from the library."
+        )
+    )
 
 
 # Direct imports for type-checking
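
Note: the block above replaces the old TF warning suppression with explicit backend selection. `collect_env_info()` records which frameworks are installed, and `DD_USE_TORCH` / `DD_USE_TF` pick the backend, with PyTorch winning ties. A minimal sketch of pinning the backend yourself (the variable names come straight from the diff; the import-time logic only fills them in when they are unset):

    import os

    # Pin the backend before importing deepdoctection; any non-empty string is truthy
    # for the os.environ.get checks shown above.
    os.environ["DD_USE_TORCH"] = "1"   # or: os.environ["DD_USE_TF"] = "1"

    import deepdoctection as dd  # noqa: E402  import after the env var is set
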
@@ -442,10 +451,10 @@ if TYPE_CHECKING:
     from .analyzer import *
     from .dataflow import *
     from .datapoint import *
-    from .datasets import *
+    from .datasets import *  # type: ignore
     from .eval import *
-    from .extern import *
-    from .mapper import *
+    from .extern import *  # type: ignore
+    from .mapper import *  # type: ignore
     from .pipe import *
     from .train import *
     from .utils import *

deepdoctection/analyzer/dd.py

@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
 -user factory with a reduced config setting
 """
 
-import ast
 import os
 from os import environ
 from shutil import copyfile
 from typing import List, Optional, Union
 
+from lazy_imports import try_import
+
 from ..extern.base import ObjectDetector
+from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
 from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
+from ..extern.hfdetr import HFDetrDerivedDetector
 from ..extern.model import ModelCatalog, ModelDownloadManager
 from ..extern.pdftext import PdfPlumberTextDetector
+from ..extern.pt.ptutils import get_torch_device
 from ..extern.tessocr import TesseractOcrDetector
 from ..extern.texocr import TextractOcrDetector
+from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
+from ..extern.tpdetect import TPFrcnnDetector
 from ..pipe.base import PipelineComponent
-from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
 from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService
 from ..pipe.order import TextOrderService
 from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
+from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
 from ..utils.detection_types import Pathlike
-from ..utils.env_info import get_device
-from ..utils.file_utils import (
-    boto3_available,
-    detectron2_available,
-    pytorch_available,
-    tensorpack_available,
-    tf_available,
-)
+from ..utils.error import DependencyError
+from ..utils.file_utils import detectron2_available, tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
 from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
 
-if tf_available() and tensorpack_available():
-    from ..extern.tp.tfutils import disable_tp_layer_logging
-    from ..extern.tpdetect import TPFrcnnDetector
-
-if pytorch_available():
-    from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
-    from ..extern.hfdetr import HFDetrDerivedDetector
-
-if boto3_available():
+with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
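Note: the `tf_available()` / `pytorch_available()` / `boto3_available()` guards give way to the `lazy_imports` package: imports inside a `with try_import() as guard:` block no longer raise at module load. A sketch of the pattern, assuming the standard lazy-imports API in which `check()` re-raises the deferred ImportError (the helper function below is hypothetical, only the `image_guard` name appears in the diff):

    from lazy_imports import try_import

    with try_import() as boto3_guard:
        from botocore.config import Config  # swallowed if botocore is not installed

    def make_textract_config():
        # hypothetical helper: fail only when the optional dependency is actually needed
        boto3_guard.check()  # re-raises the deferred ImportError, if any
        return Config(retries={"max_attempts": 10})
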
@@ -113,11 +105,12 @@ def config_sanity_checks(cfg: AttrDict) -> None:
     """Some config sanity checks"""
     if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
         raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
-    if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
-        raise ValueError(
-            "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True and set the other two "
-            "to False. Only one OCR system can be activated."
-        )
+    if cfg.USE_OCR:
+        if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
+            raise ValueError(
+                "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
+                "and set the other two to False. Only one OCR system can be activated."
+            )
 
 
 def build_detector(
@@ -343,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(table_segmentation)
 
     if cfg.USE_TABLE_REFINEMENT:
-        table_segmentation_refinement = TableSegmentationRefinementService()
+        table_segmentation_refinement = TableSegmentationRefinementService(
+            [LayoutType.table, LayoutType.table_rotated],
+            [
+                LayoutType.cell,
+                CellType.column_header,
+                CellType.projected_row_header,
+                CellType.spanning,
+                CellType.row_header,
+            ],
+        )
         pipe_component_list.append(table_segmentation_refinement)
 
     if cfg.USE_PDF_MINER:
-        pdf_text = PdfPlumberTextDetector()
+        pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
         d_text = TextExtractionService(pdf_text)
         pipe_component_list.append(d_text)
 
@@ -400,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
 
 
 def get_dd_analyzer(
-    reset_config_file: bool = False,
+    reset_config_file: bool = True,
     config_overwrite: Optional[List[str]] = None,
    path_config_file: Optional[Pathlike] = None,
 ) -> DoctectionPipe:
@@ -429,8 +431,13 @@ def get_dd_analyzer(
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
-    device = get_device(False)
+    lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
+    if lib == "TF":
+        device = get_tf_device()
+    elif lib == "PT":
+        device = get_torch_device()
+    else:
+        raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
         get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
     )
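
Note: with `get_device(False)` gone, the device now follows the chosen backend via `get_tf_device()` or `get_torch_device()`, and `reset_config_file` defaults to `True`, so the cached `conf_dd_one.yaml` is refreshed on every call. A usage sketch, assuming the `dotted.key=value` convention for `config_overwrite` (the document path is a placeholder):

    import deepdoctection as dd

    # reset_config_file=True (the new default) re-copies the packaged config into the cache dir
    analyzer = dd.get_dd_analyzer(
        config_overwrite=["OCR.USE_TESSERACT=False", "OCR.USE_DOCTR=True"],
    )
    df = analyzer.analyze(path="path/to/doc.pdf")  # returns a dataflow over pages
    df.reset_state()
    for page in df:
        print(page.text)
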
deepdoctection/configs/conf_dd_one.yaml

@@ -1,38 +1,38 @@
 USE_LAYOUT: True
 USE_TABLE_SEGMENTATION: True
 TF:
-  LAYOUT:
-    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
-    FILTER:
-  CELL:
-    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
-    FILTER:
-  ITEM:
-    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
-    FILTER:
+  LAYOUT:
+    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
+    FILTER:
+  CELL:
+    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
+    FILTER:
+  ITEM:
+    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
+    FILTER:
 PT:
-  LAYOUT:
-    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
-    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
-    FILTER:
-    PAD:
-      TOP: 60
-      RIGHT: 60
-      BOTTOM: 60
-      LEFT: 60
-  ITEM:
-    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
-    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
-    FILTER:
-    PAD:
-      TOP: 60
-      RIGHT: 60
-      BOTTOM: 60
-      LEFT: 60
-  CELL:
-    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
-    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
-    FILTER:
+  LAYOUT:
+    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
+    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  ITEM:
+    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
+    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  CELL:
+    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
+    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
+    FILTER:
 LAYOUT_NMS_PAIRS:
   COMBINATIONS:
   THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
   STRETCH_RULE: equal
 USE_TABLE_REFINEMENT: True
 USE_PDF_MINER: False
+PDF_MINER:
+  X_TOLERANCE: 3
+  Y_TOLERANCE: 3
 USE_OCR: True
 OCR:
   USE_TESSERACT: True
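
Note: the new PDF_MINER block feeds `PdfPlumberTextDetector(x_tolerance=..., y_tolerance=...)` in `build_analyzer` above. A sketch of overriding the tolerances per run instead of editing the YAML (the key names mirror the config shown here):

    import deepdoctection as dd

    # pdfplumber tolerances control how close glyphs must be to merge into one word/line
    analyzer = dd.get_dd_analyzer(
        config_overwrite=[
            "USE_PDF_MINER=True",
            "USE_OCR=False",  # USE_PDF_MINER together with DocTR OCR is rejected by config_sanity_checks
            "PDF_MINER.X_TOLERANCE=5",
            "PDF_MINER.Y_TOLERANCE=5",
        ]
    )
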
deepdoctection/dataflow/base.py

@@ -17,25 +17,6 @@ from typing import Any, Iterator, no_type_check
 from ..utils.utils import get_rng
 
 
-class DataFlowTerminated(BaseException):
-    """
-    An exception indicating that the DataFlow is unable to produce any more
-    data, i.e. something wrong happened so that calling `__iter__`
-    cannot give a valid iterator anymore.
-    In most DataFlow this will never be raised.
-    """
-
-
-class DataFlowResetStateNotCalled(BaseException):
-    """
-    An exception indicating that `reset_state()` has not been called before starting
-    iteration.
-    """
-
-    def __init__(self) -> None:
-        super().__init__("Iterating a dataflow requires .reset_state() to be called first")
-
-
 class DataFlowReentrantGuard:
     """
     A tool to enforce non-reentrancy.
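
Note: the two exception classes are not dropped but consolidated, renamed with an `-Error` suffix, into the new `deepdoctection/utils/error.py` (+84 lines), from which the dataflow modules now import them. A plausible sketch of the relocated definitions; only the names, the reset-state message, and the old `BaseException` base are confirmed by this diff, the rest is an assumption:

    # deepdoctection/utils/error.py (sketch, not the verbatim module)

    class DataFlowTerminatedError(BaseException):
        """The DataFlow cannot produce any more data: `__iter__` no longer yields a valid iterator."""


    class DataFlowResetStateNotCalledError(BaseException):
        """`reset_state()` was not called before iteration started."""

        def __init__(self) -> None:
            super().__init__("Iterating a dataflow requires .reset_state() to be called first")


    class FileExtensionError(ValueError):  # assumption: base class is not visible in this diff
        """A file name carries an unexpected extension."""


    class DependencyError(Exception):  # assumption: base class is not visible in this diff
        """A required library or backend is not installed."""
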
deepdoctection/dataflow/custom.py

@@ -25,10 +25,11 @@ from typing import Any, Callable, Iterable, Iterator, List, Optional
 
 import numpy as np
 
+from ..utils.error import DataFlowResetStateNotCalledError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.tqdm import get_tqdm
 from ..utils.utils import get_rng
-from .base import DataFlow, DataFlowReentrantGuard, DataFlowResetStateNotCalled, ProxyDataFlow
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .serialize import DataFromIterable, DataFromList
 
 __all__ = ["CacheData", "CustomDataFromList", "CustomDataFromIterable"]
@@ -65,7 +66,7 @@ class CacheData(ProxyDataFlow):
 
     def __iter__(self) -> Iterator[Any]:
         if self._guard is None:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
 
         with self._guard:
             if self.buffer:
@@ -139,7 +140,7 @@ class CustomDataFromList(DataFromList):
 
     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
         if self.rebalance_func is not None:
             lst_tmp = self.rebalance_func(self.lst)
             logger.info(LoggingRecord(f"CustomDataFromList: subset size after re-balancing: {len(lst_tmp)}"))

deepdoctection/dataflow/custom_serialize.py

@@ -27,13 +27,16 @@ from pathlib import Path
 from typing import DefaultDict, Dict, List, Optional, Sequence, Union
 
 from jsonlines import Reader, Writer
+from tabulate import tabulate
+from termcolor import colored
 
 from ..utils.context import timed_operation
 from ..utils.detection_types import JsonDict, Pathlike
+from ..utils.error import FileExtensionError
 from ..utils.identifier import get_uuid_from_str
 from ..utils.pdf_utils import PDFStreamer
 from ..utils.tqdm import get_tqdm
-from ..utils.utils import FileExtensionError, is_file_extension
+from ..utils.utils import is_file_extension
 from .base import DataFlow
 from .common import FlattenData, JoinData, MapData
 from .custom import CacheData, CustomDataFromIterable, CustomDataFromList
@@ -223,7 +226,7 @@ class SerializerFiles:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class CocoParser:
@@ -283,8 +286,14 @@ class CocoParser:
         """
         Print information about the annotation file.
         """
+        rows = []
         for key, value in self.dataset["info"].items():
-            print(f"{key}: {value}")
+            row = [key, value]
+            rows.append(row)
+
+        header = ["key", "value"]
+        table = tabulate(rows, headers=header, tablefmt="fancy_grid", stralign="left", numalign="left")
+        print(colored(table, "cyan"))
 
     def get_ann_ids(
         self,
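
Note: `CocoParser.info()` switches from plain `key: value` lines to a colored `fancy_grid` table. A self-contained sketch of the same rendering (the sample `info` dict is invented for illustration):

    from tabulate import tabulate
    from termcolor import colored

    info = {"description": "sample COCO file", "version": "1.0", "year": 2023}  # invented sample
    rows = [[key, value] for key, value in info.items()]
    table = tabulate(rows, headers=["key", "value"], tablefmt="fancy_grid", stralign="left", numalign="left")
    print(colored(table, "cyan"))  # same call chain as the new info() body
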
@@ -499,7 +508,7 @@ class SerializerCoco:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
 
 class SerializerPdfDoc:
@@ -547,7 +556,7 @@ class SerializerPdfDoc:
         """
         Not implemented
         """
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @staticmethod
     def split(path: Pathlike, path_target: Optional[Pathlike] = None, max_datapoint: Optional[int] = None) -> None:

deepdoctection/dataflow/parallel_map.py

@@ -28,8 +28,9 @@ from typing import Any, Callable, Iterator, List, no_type_check
 import zmq
 
 from ..utils.concurrency import StoppableThread, enable_death_signal, start_proc_mask_signal
+from ..utils.error import DataFlowTerminatedError
 from ..utils.logger import LoggingRecord, logger
-from .base import DataFlow, DataFlowReentrantGuard, DataFlowTerminated, ProxyDataFlow
+from .base import DataFlow, DataFlowReentrantGuard, ProxyDataFlow
 from .common import RepeatedData
 from .serialize import PickleSerializer
@@ -49,14 +50,14 @@ def _zmq_catch_error(name):
         yield
     except zmq.ContextTerminated as exc:
         logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Context terminated."))
-        raise DataFlowTerminated() from exc
+        raise DataFlowTerminatedError() from exc
     except zmq.ZMQError as exc:
         if exc.errno == errno.ENOTSOCK:  # socket closed
             logger.info(LoggingRecord(f"_zmq_catch_error: [{name}] Socket closed."))
-            raise DataFlowTerminated() from exc
-        raise ValueError from exc
+            raise DataFlowTerminatedError() from exc
+        raise ValueError() from exc
     except Exception as exc:
-        raise ValueError from exc
+        raise ValueError() from exc
 
 
 @no_type_check
@@ -78,8 +79,8 @@ def _get_pipe_name(name):
 class _ParallelMapData(ProxyDataFlow, ABC):
     def __init__(self, df: DataFlow, buffer_size: int, strict: bool = False) -> None:
         super().__init__(df)
-        if not buffer_size:
-            raise ValueError("buffer_size must be a positive number")
+        if buffer_size <= 0:
+            raise ValueError(f"buffer_size must be a positive number, got {buffer_size}")
         self._buffer_size = buffer_size
         self._buffer_occupancy = 0  # actual #elements in buffer, only useful in strict mode
         self._strict = strict
@@ -95,12 +96,12 @@ class _ParallelMapData(ProxyDataFlow, ABC):
     @no_type_check
     @abstractmethod
     def _recv(self):
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @no_type_check
     @abstractmethod
     def _send(self, dp: Any):
-        raise NotImplementedError
+        raise NotImplementedError()
 
     @no_type_check
     def _recv_filter_none(self):
@@ -398,8 +399,8 @@ class MultiProcessMapData(_ParallelMapData, _MultiProcessZMQDataFlow):
 
         _ParallelMapData.__init__(self, df, buffer_size, strict)
         _MultiProcessZMQDataFlow.__init__(self)
-        if not num_proc:
-            raise ValueError("num_proc must be a positive number")
+        if num_proc <= 0:
+            raise ValueError(f"num_proc must be a positive number, got {num_proc}")
         self.num_proc = num_proc
         self.map_func = map_func
         self._strict = strict

deepdoctection/dataflow/serialize.py

@@ -16,7 +16,8 @@ from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union
 
 import numpy as np
 
-from .base import DataFlow, DataFlowResetStateNotCalled, RNGDataFlow
+from ..utils.error import DataFlowResetStateNotCalledError
+from .base import DataFlow, RNGDataFlow
 
 
 class DataFromList(RNGDataFlow):
@@ -44,7 +45,7 @@ class DataFromList(RNGDataFlow):
             for k in idxs:
                 yield self.lst[k]
         else:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
 
 
 class DataFromIterable(DataFlow):
@@ -63,7 +64,7 @@ class DataFromIterable(DataFlow):
 
     def __len__(self) -> int:
         if self._len is None:
-            raise NotImplementedError
+            raise NotImplementedError()
         return self._len
 
     def __iter__(self) -> Iterator[Any]:
@@ -107,7 +108,7 @@ class FakeData(RNGDataFlow):
 
     def __iter__(self) -> Iterator[Any]:
         if self.rng is None:
-            raise DataFlowResetStateNotCalled()
+            raise DataFlowResetStateNotCalledError()
         if self.random:
            for _ in range(self._size):
                val = []
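
Note: all the `DataFlowResetStateNotCalledError` raises enforce the same contract stated in the removed docstring: call `reset_state()` before iterating. A minimal sketch (assuming `DataFromList` is re-exported from `deepdoctection.dataflow`):

    from deepdoctection.dataflow import DataFromList  # assumption: re-exported at this path

    df = DataFromList([{"id": 1}, {"id": 2}])
    df.reset_state()  # seeds self.rng; skipping this raises DataFlowResetStateNotCalledError
    for dp in df:
        print(dp)
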