PyPI - deepdoctection - Versions diffs - 0.31__py3-none-any.whl → 0.33__py3-none-any.whl - Mend

deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (131) hide show

deepdoctection/__init__.py +16 -29
deepdoctection/analyzer/dd.py +70 -59
deepdoctection/configs/conf_dd_one.yaml +34 -31
deepdoctection/dataflow/common.py +9 -5
deepdoctection/dataflow/custom.py +5 -5
deepdoctection/dataflow/custom_serialize.py +75 -18
deepdoctection/dataflow/parallel_map.py +3 -3
deepdoctection/dataflow/serialize.py +4 -4
deepdoctection/dataflow/stats.py +3 -3
deepdoctection/datapoint/annotation.py +41 -56
deepdoctection/datapoint/box.py +9 -8
deepdoctection/datapoint/convert.py +6 -6
deepdoctection/datapoint/image.py +56 -44
deepdoctection/datapoint/view.py +245 -150
deepdoctection/datasets/__init__.py +1 -4
deepdoctection/datasets/adapter.py +35 -26
deepdoctection/datasets/base.py +14 -12
deepdoctection/datasets/dataflow_builder.py +3 -3
deepdoctection/datasets/info.py +24 -26
deepdoctection/datasets/instances/doclaynet.py +51 -51
deepdoctection/datasets/instances/fintabnet.py +46 -46
deepdoctection/datasets/instances/funsd.py +25 -24
deepdoctection/datasets/instances/iiitar13k.py +13 -10
deepdoctection/datasets/instances/layouttest.py +4 -3
deepdoctection/datasets/instances/publaynet.py +5 -5
deepdoctection/datasets/instances/pubtables1m.py +24 -21
deepdoctection/datasets/instances/pubtabnet.py +32 -30
deepdoctection/datasets/instances/rvlcdip.py +30 -30
deepdoctection/datasets/instances/xfund.py +26 -26
deepdoctection/datasets/save.py +6 -6
deepdoctection/eval/__init__.py +1 -4
deepdoctection/eval/accmetric.py +32 -33
deepdoctection/eval/base.py +8 -9
deepdoctection/eval/cocometric.py +15 -13
deepdoctection/eval/eval.py +41 -37
deepdoctection/eval/tedsmetric.py +30 -23
deepdoctection/eval/tp_eval_callback.py +16 -19
deepdoctection/extern/__init__.py +2 -7
deepdoctection/extern/base.py +339 -134
deepdoctection/extern/d2detect.py +85 -113
deepdoctection/extern/deskew.py +14 -11
deepdoctection/extern/doctrocr.py +141 -130
deepdoctection/extern/fastlang.py +27 -18
deepdoctection/extern/hfdetr.py +71 -62
deepdoctection/extern/hflayoutlm.py +504 -211
deepdoctection/extern/hflm.py +230 -0
deepdoctection/extern/model.py +488 -302
deepdoctection/extern/pdftext.py +23 -19
deepdoctection/extern/pt/__init__.py +1 -3
deepdoctection/extern/pt/nms.py +6 -2
deepdoctection/extern/pt/ptutils.py +29 -19
deepdoctection/extern/tessocr.py +39 -38
deepdoctection/extern/texocr.py +18 -18
deepdoctection/extern/tp/tfutils.py +57 -9
deepdoctection/extern/tp/tpcompat.py +21 -14
deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
deepdoctection/extern/tpdetect.py +45 -53
deepdoctection/mapper/__init__.py +3 -8
deepdoctection/mapper/cats.py +27 -29
deepdoctection/mapper/cocostruct.py +10 -10
deepdoctection/mapper/d2struct.py +27 -26
deepdoctection/mapper/hfstruct.py +13 -8
deepdoctection/mapper/laylmstruct.py +178 -37
deepdoctection/mapper/maputils.py +12 -11
deepdoctection/mapper/match.py +2 -2
deepdoctection/mapper/misc.py +11 -9
deepdoctection/mapper/pascalstruct.py +4 -4
deepdoctection/mapper/prodigystruct.py +5 -5
deepdoctection/mapper/pubstruct.py +84 -92
deepdoctection/mapper/tpstruct.py +5 -5
deepdoctection/mapper/xfundstruct.py +33 -33
deepdoctection/pipe/__init__.py +1 -1
deepdoctection/pipe/anngen.py +12 -14
deepdoctection/pipe/base.py +52 -106
deepdoctection/pipe/common.py +72 -59
deepdoctection/pipe/concurrency.py +16 -11
deepdoctection/pipe/doctectionpipe.py +24 -21
deepdoctection/pipe/language.py +20 -25
deepdoctection/pipe/layout.py +20 -16
deepdoctection/pipe/lm.py +75 -105
deepdoctection/pipe/order.py +194 -89
deepdoctection/pipe/refine.py +111 -124
deepdoctection/pipe/segment.py +156 -161
deepdoctection/pipe/{cell.py → sub_layout.py} +50 -40
deepdoctection/pipe/text.py +37 -36
deepdoctection/pipe/transform.py +19 -16
deepdoctection/train/__init__.py +6 -12
deepdoctection/train/d2_frcnn_train.py +48 -41
deepdoctection/train/hf_detr_train.py +41 -30
deepdoctection/train/hf_layoutlm_train.py +153 -135
deepdoctection/train/tp_frcnn_train.py +32 -31
deepdoctection/utils/concurrency.py +1 -1
deepdoctection/utils/context.py +13 -6
deepdoctection/utils/develop.py +4 -4
deepdoctection/utils/env_info.py +87 -125
deepdoctection/utils/file_utils.py +6 -11
deepdoctection/utils/fs.py +22 -18
deepdoctection/utils/identifier.py +2 -2
deepdoctection/utils/logger.py +16 -15
deepdoctection/utils/metacfg.py +7 -7
deepdoctection/utils/mocks.py +93 -0
deepdoctection/utils/pdf_utils.py +11 -11
deepdoctection/utils/settings.py +185 -181
deepdoctection/utils/tqdm.py +1 -1
deepdoctection/utils/transform.py +14 -9
deepdoctection/utils/types.py +104 -0
deepdoctection/utils/utils.py +7 -7
deepdoctection/utils/viz.py +74 -72
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/METADATA +30 -21
deepdoctection-0.33.dist-info/RECORD +146 -0
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/WHEEL +1 -1
deepdoctection/utils/detection_types.py +0 -68
deepdoctection-0.31.dist-info/RECORD +0 -144
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/LICENSE +0 -0
{deepdoctection-0.31.dist-info → deepdoctection-0.33.dist-info}/top_level.txt +0 -0

deepdoctection/__init__.py CHANGED Viewed

@@ -19,15 +19,13 @@ import os
 import sys
 from typing import TYPE_CHECKING
-from packaging import version
-from .utils.env_info import auto_select_lib_and_device
+from .utils.env_info import collect_env_info
 from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
-from .utils.logger import logger
+from .utils.logger import LoggingRecord, logger
 # pylint: enable=wrong-import-position
-__version__ = 0.31
+__version__ = 0.33
 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -162,6 +160,8 @@ _IMPORT_STRUCTURE = {
         "EvalCallback",
     ],
     "extern": [
+        "ModelCategories",
+        "NerModelCategories",
         "PredictorBase",
         "DetectionResult",
         "ObjectDetector",
@@ -182,6 +182,7 @@ _IMPORT_STRUCTURE = {
         "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
+        "get_tokenizer_from_architecture",
         "HFLayoutLmTokenClassifierBase",
         "HFLayoutLmTokenClassifier",
         "HFLayoutLmv2TokenClassifier",
@@ -189,6 +190,9 @@ _IMPORT_STRUCTURE = {
         "HFLayoutLmSequenceClassifier",
         "HFLayoutLmv2SequenceClassifier",
         "HFLayoutLmv3SequenceClassifier",
+        "HFLiltTokenClassifier",
+        "HFLiltSequenceClassifier",
+        "HFLmSequenceClassifier",
         "ModelProfile",
         "ModelCatalog",
         "print_model_infos",
@@ -268,11 +272,11 @@ _IMPORT_STRUCTURE = {
         "DoctectionPipe",
         "LanguageDetectionService",
         "ImageLayoutService",
-        "get_tokenizer_from_architecture",
         "LMTokenClassifierService",
         "LMSequenceClassifierService",
         "OrderGenerator",
         "TextLineGenerator",
+        "TextLineService",
         "TextOrderService",
         "TableSegmentationRefinementService",
         "generate_html_string",
@@ -297,14 +301,13 @@ _IMPORT_STRUCTURE = {
         "save_tmp_file",
         "timed_operation",
         "collect_env_info",
-        "get_device",
-        "auto_select_lib_and_device",
         "auto_select_viz_library",
         "get_tensorflow_requirement",
         "tf_addons_available",
         "get_tf_addons_requirements",
         "tensorpack_available",
         "get_tensorpack_requirement",
+        "pytorch_available",
         "get_pytorch_requirement",
         "lxml_available",
         "get_lxml_requirement",
@@ -418,25 +421,9 @@ _IMPORT_STRUCTURE = {
     ],
 }
-# disable TF warnings for versions > 2.4.1
-if tf_available():
-    if version.parse(get_tf_version()) > version.parse("2.4.1"):
-        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
-    try:
-        import tensorflow.python.util.deprecation as deprecation  # type: ignore # pylint: disable=E0401,R0402
-        deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-    except Exception:  # pylint: disable=W0703
-        try:
-            from tensorflow.python.util import deprecation  # type: ignore # pylint: disable=E0401
-            deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-        except Exception:  # pylint: disable=W0703
-            pass
 # Setting some environment variables so that standard functions can be invoked with available hardware
-auto_select_lib_and_device()
+env_info = collect_env_info()
+logger.debug(LoggingRecord(msg=env_info))
 # Direct imports for type-checking
@@ -444,10 +431,10 @@ if TYPE_CHECKING:
     from .analyzer import *
     from .dataflow import *
     from .datapoint import *
-    from .datasets import *
+    from .datasets import *  # type: ignore
     from .eval import *
-    from .extern import *
-    from .mapper import *
+    from .extern import *  # type: ignore
+    from .mapper import *  # type: ignore
     from .pipe import *
     from .train import *
     from .utils import *

deepdoctection/analyzer/dd.py CHANGED Viewed

@@ -23,51 +23,46 @@ Module for **deep**doctection analyzer.
 -user factory with a reduced config setting
 """
-import ast
+from __future__ import annotations
 import os
 from os import environ
 from shutil import copyfile
-from typing import List, Optional, Union
+from typing import Optional, Union
+from lazy_imports import try_import
 from ..extern.base import ObjectDetector
+from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
 from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
+from ..extern.hfdetr import HFDetrDerivedDetector
 from ..extern.model import ModelCatalog, ModelDownloadManager
 from ..extern.pdftext import PdfPlumberTextDetector
+from ..extern.pt.ptutils import get_torch_device
 from ..extern.tessocr import TesseractOcrDetector
 from ..extern.texocr import TextractOcrDetector
+from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
+from ..extern.tpdetect import TPFrcnnDetector
 from ..pipe.base import PipelineComponent
-from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
 from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService
 from ..pipe.order import TextOrderService
 from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
+from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
-from ..utils.detection_types import Pathlike
-from ..utils.env_info import get_device
-from ..utils.file_utils import (
-    boto3_available,
-    detectron2_available,
-    pytorch_available,
-    tensorpack_available,
-    tf_available,
-)
+from ..utils.env_info import ENV_VARS_TRUE
+from ..utils.error import DependencyError
+from ..utils.file_utils import detectron2_available, tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
 from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform
+from ..utils.types import PathLikeOrStr
-if tf_available() and tensorpack_available():
-    from ..extern.tp.tfutils import disable_tp_layer_logging
-    from ..extern.tpdetect import TPFrcnnDetector
-if pytorch_available():
-    from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
-    from ..extern.hfdetr import HFDetrDerivedDetector
-if boto3_available():
+with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
@@ -89,7 +84,7 @@ _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
 def maybe_copy_config_to_cache(
-    package_path: Pathlike, configs_dir_path: Pathlike, file_name: str, force_copy: bool = True
+    package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
 ) -> str:
     """
     Initial copying of various files
@@ -123,7 +118,7 @@ def config_sanity_checks(cfg: AttrDict) -> None:
 def build_detector(
     cfg: AttrDict, mode: str
-) -> Union["D2FrcnnDetector", "TPFrcnnDetector", "HFDetrDerivedDetector", "D2FrcnnTracingDetector"]:
+) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
     """Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
     the config
@@ -141,8 +136,8 @@ def build_detector(
     config_path = ModelCatalog.get_full_path_configs(weights)
     weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
     profile = ModelCatalog.get_profile(weights)
-    categories = profile.categories
-    assert categories is not None
+    categories = profile.categories if profile.categories is not None else {}
     if profile.model_wrapper in ("TPFrcnnDetector",):
         return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
     if profile.model_wrapper in ("D2FrcnnDetector",):
@@ -210,11 +205,13 @@ def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str)
     padder = None
     if mode == "ITEM":
         if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
-            exclude_category_ids.extend(["1", "3", "4", "5", "6"])
+            exclude_category_ids.extend([1, 3, 4, 5, 6])
             padder = build_padder(cfg, mode)
-    detect_result_generator = DetectResultGenerator(detector.categories, exclude_category_ids=exclude_category_ids)
+    detect_result_generator = DetectResultGenerator(
+        categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
+    )
     return SubImageLayoutService(
-        detector, [LayoutType.table, LayoutType.table_rotated], None, detect_result_generator, padder
+        detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
     )
@@ -241,9 +238,9 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
         )
     if cfg.OCR.USE_TEXTRACT:
         credentials_kwargs = {
-            "aws_access_key_id": environ.get("ACCESS_KEY"),
-            "aws_secret_access_key": environ.get("SECRET_KEY"),
-            "config": Config(region_name=environ.get("REGION")),
+            "aws_access_key_id": environ.get("ACCESS_KEY", None),
+            "aws_secret_access_key": environ.get("SECRET_KEY", None),
+            "config": Config(region_name=environ.get("REGION", None)),
         }
         return TextractOcrDetector(**credentials_kwargs)
     raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
@@ -268,7 +265,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
     :param cfg: A configuration
     :return: Analyzer pipeline
     """
-    pipe_component_list: List[PipelineComponent] = []
+    pipe_component_list: list[PipelineComponent] = []
     if cfg.USE_LAYOUT:
         d_layout = build_detector(cfg, "LAYOUT")
@@ -308,22 +305,22 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
                 cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
                 cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
                 cfg.SEGMENTATION.CELL_CATEGORY_ID,
-                LayoutType.table,
+                LayoutType.TABLE,
                 [
-                    CellType.spanning,
-                    CellType.row_header,
-                    CellType.column_header,
-                    CellType.projected_row_header,
-                    LayoutType.cell,
+                    CellType.SPANNING,
+                    CellType.ROW_HEADER,
+                    CellType.COLUMN_HEADER,
+                    CellType.PROJECTED_ROW_HEADER,
+                    LayoutType.CELL,
                 ],
                 [
-                    CellType.spanning,
-                    CellType.row_header,
-                    CellType.column_header,
-                    CellType.projected_row_header,
+                    CellType.SPANNING,
+                    CellType.ROW_HEADER,
+                    CellType.COLUMN_HEADER,
+                    CellType.PROJECTED_ROW_HEADER,
                 ],
-                [LayoutType.row, LayoutType.column],
-                [CellType.row_number, CellType.column_number],
+                [LayoutType.ROW, LayoutType.COLUMN],
+                [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
                 stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
             )
             pipe_component_list.append(pubtables)
@@ -335,20 +332,29 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
                 cfg.SEGMENTATION.FULL_TABLE_TILING,
                 cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
                 cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
-                LayoutType.table,
-                [CellType.header, CellType.body, LayoutType.cell],
-                [LayoutType.row, LayoutType.column],
-                [CellType.row_number, CellType.column_number],
+                LayoutType.TABLE,
+                [CellType.HEADER, CellType.BODY, LayoutType.CELL],
+                [LayoutType.ROW, LayoutType.COLUMN],
+                [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
                 cfg.SEGMENTATION.STRETCH_RULE,
             )
             pipe_component_list.append(table_segmentation)
             if cfg.USE_TABLE_REFINEMENT:
-                table_segmentation_refinement = TableSegmentationRefinementService()
+                table_segmentation_refinement = TableSegmentationRefinementService(
+                    [LayoutType.TABLE, LayoutType.TABLE_ROTATED],
+                    [
+                        LayoutType.CELL,
+                        CellType.COLUMN_HEADER,
+                        CellType.PROJECTED_ROW_HEADER,
+                        CellType.SPANNING,
+                        CellType.ROW_HEADER,
+                    ],
+                )
                 pipe_component_list.append(table_segmentation_refinement)
     if cfg.USE_PDF_MINER:
-        pdf_text = PdfPlumberTextDetector()
+        pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
         d_text = TextExtractionService(pdf_text)
         pipe_component_list.append(d_text)
@@ -362,7 +368,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         ocr = build_ocr(cfg)
         skip_if_text_extracted = cfg.USE_PDF_MINER
-        extract_from_roi = LayoutType.word if cfg.OCR.USE_DOCTR else None
+        extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
         text = TextExtractionService(
             ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
         )
@@ -371,7 +377,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
     if cfg.USE_PDF_MINER or cfg.USE_OCR:
         match = MatchingService(
             parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
-            child_categories=LayoutType.word,
+            child_categories=LayoutType.WORD,
             matching_rule=cfg.WORD_MATCHING.RULE,
             threshold=cfg.WORD_MATCHING.THRESHOLD,
             max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
@@ -379,7 +385,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(match)
         order = TextOrderService(
-            text_container=LayoutType.word,
+            text_container=LayoutType.WORD,
             text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
             floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
             include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
@@ -391,7 +397,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(order)
     page_parsing_service = PageParsingService(
-        text_container=LayoutType.word,
+        text_container=LayoutType.WORD,
         floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
         include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
     )
@@ -401,9 +407,9 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
 def get_dd_analyzer(
-    reset_config_file: bool = False,
-    config_overwrite: Optional[List[str]] = None,
-    path_config_file: Optional[Pathlike] = None,
+    reset_config_file: bool = True,
+    config_overwrite: Optional[list[str]] = None,
+    path_config_file: Optional[PathLikeOrStr] = None,
 ) -> DoctectionPipe:
     """
     Factory function for creating the built-in **deep**doctection analyzer.
@@ -430,8 +436,13 @@ def get_dd_analyzer(
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
-    device = get_device(False)
+    lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
+    if lib == "TF":
+        device = get_tf_device()
+    elif lib == "PT":
+        device = get_torch_device()
+    else:
+        raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
         get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
     )

deepdoctection/configs/conf_dd_one.yaml CHANGED Viewed

@@ -1,38 +1,38 @@
 USE_LAYOUT: True
 USE_TABLE_SEGMENTATION: True
 TF:
-   LAYOUT:
-      WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
-      FILTER:
-   CELL:
-      WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
-      FILTER:
-   ITEM:
-      WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
-      FILTER:
+  LAYOUT:
+    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
+    FILTER:
+  CELL:
+    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
+    FILTER:
+  ITEM:
+    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
+    FILTER:
 PT:
-   LAYOUT:
-      WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
-      WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
-      FILTER:
-      PAD:
-        TOP: 60
-        RIGHT: 60
-        BOTTOM: 60
-        LEFT: 60
-   ITEM:
-     WEIGHTS: item/d2_model_1639999_item_inf_only.pt
-     WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
-     FILTER:
-     PAD:
-        TOP: 60
-        RIGHT: 60
-        BOTTOM: 60
-        LEFT: 60
-   CELL:
-      WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
-      WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
-      FILTER:
+  LAYOUT:
+    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
+    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  ITEM:
+    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
+    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  CELL:
+    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
+    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
+    FILTER:
 LAYOUT_NMS_PAIRS:
   COMBINATIONS:
   THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
   STRETCH_RULE: equal
 USE_TABLE_REFINEMENT: True
 USE_PDF_MINER: False
+PDF_MINER:
+  X_TOLERANCE: 3
+  Y_TOLERANCE: 3
 USE_OCR: True
 OCR:
   USE_TESSERACT: True

deepdoctection/dataflow/common.py CHANGED Viewed

@@ -12,7 +12,7 @@ Some DataFlow classes for transforming and processing datapoints. Many classes h
 """
 import itertools
 from copy import copy
-from typing import Any, Callable, Iterator, List, Union
+from typing import Any, Callable, Iterator, Union
 import tqdm
@@ -164,6 +164,10 @@ class RepeatedData(ProxyDataFlow):
                 Set to -1 to repeat ``ds`` infinite times.
         """
         self.num = num
+        if self.num != -1:
+            self.dfs = itertools.tee(df, self.num)
+        else:
+            self.dfs = ()
         super().__init__(df)
     def __len__(self) -> int:
@@ -180,8 +184,8 @@ class RepeatedData(ProxyDataFlow):
             while True:
                 yield from self.df
         else:
-            for _ in range(self.num):
-                yield from self.df
+            for df in self.dfs:
+                yield from df
 class ConcatData(DataFlow):
@@ -197,7 +201,7 @@ class ConcatData(DataFlow):
            df = ConcatData([df_1,df_2])
     """
-    def __init__(self, df_lists: List[DataFlow]) -> None:
+    def __init__(self, df_lists: list[DataFlow]) -> None:
         """
         :param df_lists: a list of DataFlow.
         """
@@ -233,7 +237,7 @@ class JoinData(DataFlow):
     `JoinData` will stop once the first Dataflow throws a StopIteration
     """
-    def __init__(self, df_lists: List[DataFlow]) -> None:
+    def __init__(self, df_lists: list[DataFlow]) -> None:
         """
         :param df_lists: a list of DataFlow. When these dataflows have different sizes, JoinData will stop when any
                         of them is exhausted.

deepdoctection/dataflow/custom.py CHANGED Viewed

@@ -21,7 +21,7 @@ from
 <https://github.com/tensorpack/dataflow/blob/master/dataflow/dataflow/common.py>
 """
-from typing import Any, Callable, Iterable, Iterator, List, Optional
+from typing import Any, Callable, Iterable, Iterator, Optional
 import numpy as np
@@ -54,7 +54,7 @@ class CacheData(ProxyDataFlow):
         :param shuffle: whether to shuffle the cache before yielding from it.
         """
         self.shuffle = shuffle
-        self.buffer: List[Any] = []
+        self.buffer: list[Any] = []
         self._guard: Optional[DataFlowReentrantGuard] = None
         self.rng = get_rng(self)
         super().__init__(df)
@@ -78,7 +78,7 @@ class CacheData(ProxyDataFlow):
                     yield dp
                     self.buffer.append(dp)
-    def get_cache(self) -> List[Any]:
+    def get_cache(self) -> list[Any]:
         """
         get the cache of the whole dataflow as a list
@@ -115,10 +115,10 @@ class CustomDataFromList(DataFromList):
     def __init__(
         self,
-        lst: List[Any],
+        lst: list[Any],
         shuffle: bool = False,
         max_datapoints: Optional[int] = None,
-        rebalance_func: Optional[Callable[[List[Any]], List[Any]]] = None,
+        rebalance_func: Optional[Callable[[list[Any]], list[Any]]] = None,
     ):
         """
         :param lst: the input list. Each element represents a datapoint.

deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.31__py3-none-any.whl → 0.33__py3-none-any.whl