deepdoctection-0.31-py3-none-any.whl → deepdoctection-0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of deepdoctection might be problematic.
Files changed (91)
  1. deepdoctection/__init__.py +35 -28
  2. deepdoctection/analyzer/dd.py +30 -24
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/datapoint/annotation.py +2 -1
  5. deepdoctection/datapoint/box.py +2 -1
  6. deepdoctection/datapoint/image.py +13 -7
  7. deepdoctection/datapoint/view.py +95 -24
  8. deepdoctection/datasets/__init__.py +1 -4
  9. deepdoctection/datasets/adapter.py +5 -2
  10. deepdoctection/datasets/base.py +5 -3
  11. deepdoctection/datasets/info.py +2 -2
  12. deepdoctection/datasets/instances/doclaynet.py +3 -2
  13. deepdoctection/datasets/instances/fintabnet.py +2 -1
  14. deepdoctection/datasets/instances/funsd.py +2 -1
  15. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  16. deepdoctection/datasets/instances/layouttest.py +2 -1
  17. deepdoctection/datasets/instances/publaynet.py +2 -2
  18. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  19. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  20. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  21. deepdoctection/datasets/instances/xfund.py +2 -1
  22. deepdoctection/eval/__init__.py +1 -4
  23. deepdoctection/eval/cocometric.py +2 -1
  24. deepdoctection/eval/eval.py +17 -13
  25. deepdoctection/eval/tedsmetric.py +14 -11
  26. deepdoctection/eval/tp_eval_callback.py +9 -3
  27. deepdoctection/extern/__init__.py +2 -7
  28. deepdoctection/extern/d2detect.py +24 -32
  29. deepdoctection/extern/deskew.py +4 -2
  30. deepdoctection/extern/doctrocr.py +75 -81
  31. deepdoctection/extern/fastlang.py +4 -2
  32. deepdoctection/extern/hfdetr.py +22 -28
  33. deepdoctection/extern/hflayoutlm.py +335 -103
  34. deepdoctection/extern/hflm.py +225 -0
  35. deepdoctection/extern/model.py +56 -47
  36. deepdoctection/extern/pdftext.py +8 -4
  37. deepdoctection/extern/pt/__init__.py +1 -3
  38. deepdoctection/extern/pt/nms.py +6 -2
  39. deepdoctection/extern/pt/ptutils.py +27 -19
  40. deepdoctection/extern/texocr.py +4 -2
  41. deepdoctection/extern/tp/tfutils.py +43 -9
  42. deepdoctection/extern/tp/tpcompat.py +10 -7
  43. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  44. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  45. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  46. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  47. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  48. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  49. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  50. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  51. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  52. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  53. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  54. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  55. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  56. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  57. deepdoctection/extern/tp/tpfrcnn/preproc.py +7 -3
  58. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  60. deepdoctection/extern/tpdetect.py +5 -8
  61. deepdoctection/mapper/__init__.py +3 -8
  62. deepdoctection/mapper/d2struct.py +8 -6
  63. deepdoctection/mapper/hfstruct.py +6 -1
  64. deepdoctection/mapper/laylmstruct.py +163 -20
  65. deepdoctection/mapper/maputils.py +3 -1
  66. deepdoctection/mapper/misc.py +6 -3
  67. deepdoctection/mapper/tpstruct.py +2 -2
  68. deepdoctection/pipe/__init__.py +1 -1
  69. deepdoctection/pipe/common.py +11 -9
  70. deepdoctection/pipe/concurrency.py +2 -1
  71. deepdoctection/pipe/layout.py +3 -1
  72. deepdoctection/pipe/lm.py +32 -64
  73. deepdoctection/pipe/order.py +142 -35
  74. deepdoctection/pipe/refine.py +8 -14
  75. deepdoctection/pipe/{cell.py → sub_layout.py} +1 -1
  76. deepdoctection/train/__init__.py +6 -12
  77. deepdoctection/train/d2_frcnn_train.py +21 -16
  78. deepdoctection/train/hf_detr_train.py +18 -11
  79. deepdoctection/train/hf_layoutlm_train.py +118 -101
  80. deepdoctection/train/tp_frcnn_train.py +21 -19
  81. deepdoctection/utils/env_info.py +41 -117
  82. deepdoctection/utils/logger.py +1 -0
  83. deepdoctection/utils/mocks.py +93 -0
  84. deepdoctection/utils/settings.py +1 -0
  85. deepdoctection/utils/viz.py +4 -3
  86. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/METADATA +27 -18
  87. deepdoctection-0.32.dist-info/RECORD +146 -0
  88. deepdoctection-0.31.dist-info/RECORD +0 -144
  89. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  90. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/WHEEL +0 -0
  91. {deepdoctection-0.31.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0
deepdoctection/__init__.py
@@ -19,15 +19,13 @@ import os
 import sys
 from typing import TYPE_CHECKING

-from packaging import version
-
-from .utils.env_info import auto_select_lib_and_device
+from .utils.env_info import collect_env_info
 from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
-from .utils.logger import logger
+from .utils.logger import LoggingRecord, logger

 # pylint: enable=wrong-import-position

-__version__ = 0.31
+__version__ = 0.32

 _IMPORT_STRUCTURE = {
     "analyzer": [
@@ -182,6 +180,7 @@ _IMPORT_STRUCTURE = {
         "DocTrRotationTransformer",
         "FasttextLangDetector",
         "HFDetrDerivedDetector",
+        "get_tokenizer_from_architecture",
         "HFLayoutLmTokenClassifierBase",
         "HFLayoutLmTokenClassifier",
         "HFLayoutLmv2TokenClassifier",
@@ -189,6 +188,9 @@ _IMPORT_STRUCTURE = {
         "HFLayoutLmSequenceClassifier",
         "HFLayoutLmv2SequenceClassifier",
         "HFLayoutLmv3SequenceClassifier",
+        "HFLiltTokenClassifier",
+        "HFLiltSequenceClassifier",
+        "HFLmSequenceClassifier",
         "ModelProfile",
         "ModelCatalog",
         "print_model_infos",
@@ -268,11 +270,11 @@ _IMPORT_STRUCTURE = {
         "DoctectionPipe",
         "LanguageDetectionService",
         "ImageLayoutService",
-        "get_tokenizer_from_architecture",
         "LMTokenClassifierService",
         "LMSequenceClassifierService",
         "OrderGenerator",
         "TextLineGenerator",
+        "TextLineService",
         "TextOrderService",
         "TableSegmentationRefinementService",
         "generate_html_string",
@@ -297,14 +299,13 @@ _IMPORT_STRUCTURE = {
         "save_tmp_file",
         "timed_operation",
         "collect_env_info",
-        "get_device",
-        "auto_select_lib_and_device",
         "auto_select_viz_library",
         "get_tensorflow_requirement",
         "tf_addons_available",
         "get_tf_addons_requirements",
         "tensorpack_available",
         "get_tensorpack_requirement",
+        "pytorch_available",
         "get_pytorch_requirement",
         "lxml_available",
         "get_lxml_requirement",
@@ -418,25 +419,31 @@ _IMPORT_STRUCTURE = {
     ],
 }

+# Setting some environment variables so that standard functions can be invoked with available hardware
+env_info = collect_env_info()
+logger.debug(LoggingRecord(msg=env_info))

-# disable TF warnings for versions > 2.4.1
-if tf_available():
-    if version.parse(get_tf_version()) > version.parse("2.4.1"):
-        os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
-    try:
-        import tensorflow.python.util.deprecation as deprecation  # type: ignore # pylint: disable=E0401,R0402
-
-        deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-    except Exception:  # pylint: disable=W0703
-        try:
-            from tensorflow.python.util import deprecation  # type: ignore # pylint: disable=E0401
-
-            deprecation._PRINT_DEPRECATION_WARNINGS = False  # pylint: disable=W0212
-        except Exception:  # pylint: disable=W0703
-            pass
+if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
+    os.environ["DD_USE_TORCH"] = "1"
+    os.environ["USE_TORCH"] = "1"
+if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
+    os.environ["DD_USE_TF"] = "1"
+    os.environ["USE_TF"] = "1"
+if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
+    logger.warning(
+        "Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
+        "behaviour, set DD_USE_TORCH to None before importing deepdoctection."
+    )
+    os.environ.pop("DD_USE_TF")
+    os.environ.pop("USE_TF")

-# Setting some environment variables so that standard functions can be invoked with available hardware
-auto_select_lib_and_device()
+if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
+    logger.warning(
+        LoggingRecord(
+            msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
+            "model from the library."
+        )
+    )


 # Direct imports for type-checking
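The two warning branches above make the new selection logic traceable at runtime. A minimal inspection sketch (assuming, as the code above suggests, that collect_env_info() records PYTORCH_AVAILABLE and TENSORFLOW_AVAILABLE in the environment):

```python
import os

import deepdoctection  # noqa: F401  # importing runs the selection logic above

# Print the variables the selection logic reads and writes.
for var in ("PYTORCH_AVAILABLE", "TENSORFLOW_AVAILABLE", "DD_USE_TORCH", "DD_USE_TF"):
    print(var, "=", os.environ.get(var))
```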
@@ -444,10 +451,10 @@ if TYPE_CHECKING:
     from .analyzer import *
     from .dataflow import *
     from .datapoint import *
-    from .datasets import *
+    from .datasets import *  # type: ignore
     from .eval import *
-    from .extern import *
-    from .mapper import *
+    from .extern import *  # type: ignore
+    from .mapper import *  # type: ignore
     from .pipe import *
     from .train import *
     from .utils import *
deepdoctection/analyzer/dd.py
@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
 -user factory with a reduced config setting
 """

-import ast
 import os
 from os import environ
 from shutil import copyfile
 from typing import List, Optional, Union

+from lazy_imports import try_import
+
 from ..extern.base import ObjectDetector
+from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
 from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
+from ..extern.hfdetr import HFDetrDerivedDetector
 from ..extern.model import ModelCatalog, ModelDownloadManager
 from ..extern.pdftext import PdfPlumberTextDetector
+from ..extern.pt.ptutils import get_torch_device
 from ..extern.tessocr import TesseractOcrDetector
 from ..extern.texocr import TextractOcrDetector
+from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
+from ..extern.tpdetect import TPFrcnnDetector
 from ..pipe.base import PipelineComponent
-from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
 from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService
 from ..pipe.order import TextOrderService
 from ..pipe.refine import TableSegmentationRefinementService
 from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
+from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
 from ..pipe.text import TextExtractionService
 from ..utils.detection_types import Pathlike
-from ..utils.env_info import get_device
-from ..utils.file_utils import (
-    boto3_available,
-    detectron2_available,
-    pytorch_available,
-    tensorpack_available,
-    tf_available,
-)
+from ..utils.error import DependencyError
+from ..utils.file_utils import detectron2_available, tensorpack_available
 from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
 from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
 from ..utils.settings import CellType, LayoutType
 from ..utils.transform import PadTransform

-if tf_available() and tensorpack_available():
-    from ..extern.tp.tfutils import disable_tp_layer_logging
-    from ..extern.tpdetect import TPFrcnnDetector
-
-if pytorch_available():
-    from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
-    from ..extern.hfdetr import HFDetrDerivedDetector
-
-if boto3_available():
+with try_import() as image_guard:
     from botocore.config import Config  # type: ignore
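The conditional imports guarded by *_available() checks are replaced throughout the release by lazy_imports.try_import, whose guard records whether the import succeeded. A self-contained sketch of the pattern (the package name is a placeholder):

```python
from lazy_imports import try_import

with try_import() as guard:
    import some_optional_backend  # placeholder for an optional dependency

# The guard swallows the ImportError; availability can be queried later,
# when the functionality is actually requested rather than at module load.
if guard.is_successful():
    print("optional backend available")
else:
    print("optional backend missing; running in degraded mode")
```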
@@ -344,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(table_segmentation)

     if cfg.USE_TABLE_REFINEMENT:
-        table_segmentation_refinement = TableSegmentationRefinementService()
+        table_segmentation_refinement = TableSegmentationRefinementService(
+            [LayoutType.table, LayoutType.table_rotated],
+            [
+                LayoutType.cell,
+                CellType.column_header,
+                CellType.projected_row_header,
+                CellType.spanning,
+                CellType.row_header,
+            ],
+        )
         pipe_component_list.append(table_segmentation_refinement)

     if cfg.USE_PDF_MINER:
-        pdf_text = PdfPlumberTextDetector()
+        pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
         d_text = TextExtractionService(pdf_text)
         pipe_component_list.append(d_text)

@@ -401,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:


 def get_dd_analyzer(
-    reset_config_file: bool = False,
+    reset_config_file: bool = True,
     config_overwrite: Optional[List[str]] = None,
     path_config_file: Optional[Pathlike] = None,
 ) -> DoctectionPipe:
@@ -430,8 +431,13 @@
     :return: A DoctectionPipe instance with given configs
     """
     config_overwrite = [] if config_overwrite is None else config_overwrite
-    lib = "TF" if ast.literal_eval(os.environ.get("USE_TENSORFLOW", "False")) else "PT"
-    device = get_device(False)
+    lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
+    if lib == "TF":
+        device = get_tf_device()
+    elif lib == "PT":
+        device = get_torch_device()
+    else:
+        raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
         get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
     )
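Device resolution now happens through the framework-specific helpers imported at the top of the module; the removed utils.env_info.get_device is gone. A sketch mirroring the branch above, with module paths taken from the import hunk:

```python
import os

# DD_USE_TF selects the TensorFlow helper, otherwise PyTorch is used.
# Both helpers are called without arguments, exactly as in the diff above.
if os.environ.get("DD_USE_TF"):
    from deepdoctection.extern.tp.tfutils import get_tf_device

    device = get_tf_device()
else:
    from deepdoctection.extern.pt.ptutils import get_torch_device

    device = get_torch_device()
print(device)
```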
deepdoctection/configs/conf_dd_one.yaml
@@ -1,38 +1,38 @@
 USE_LAYOUT: True
 USE_TABLE_SEGMENTATION: True
 TF:
-  LAYOUT:
-    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
-    FILTER:
-  CELL:
-    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
-    FILTER:
-  ITEM:
-    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
-    FILTER:
+  LAYOUT:
+    WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
+    FILTER:
+  CELL:
+    WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
+    FILTER:
+  ITEM:
+    WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
+    FILTER:
 PT:
-  LAYOUT:
-    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
-    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
-    FILTER:
-    PAD:
-      TOP: 60
-      RIGHT: 60
-      BOTTOM: 60
-      LEFT: 60
-  ITEM:
-    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
-    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
-    FILTER:
-    PAD:
-      TOP: 60
-      RIGHT: 60
-      BOTTOM: 60
-      LEFT: 60
-  CELL:
-    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
-    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
-    FILTER:
+  LAYOUT:
+    WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
+    WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  ITEM:
+    WEIGHTS: item/d2_model_1639999_item_inf_only.pt
+    WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
+    FILTER:
+    PAD:
+      TOP: 60
+      RIGHT: 60
+      BOTTOM: 60
+      LEFT: 60
+  CELL:
+    WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
+    WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
+    FILTER:
 LAYOUT_NMS_PAIRS:
   COMBINATIONS:
   THRESHOLDS:
@@ -48,6 +48,9 @@ SEGMENTATION:
   STRETCH_RULE: equal
 USE_TABLE_REFINEMENT: True
 USE_PDF_MINER: False
+PDF_MINER:
+  X_TOLERANCE: 3
+  Y_TOLERANCE: 3
 USE_OCR: True
 OCR:
   USE_TESSERACT: True
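The new PDF_MINER block means the pdfplumber tolerances can be tuned per run through get_dd_analyzer's config_overwrite mechanism instead of editing the cached YAML. A hedged usage sketch, assuming the usual "KEY=value" overwrite convention with dotted paths for nested keys:

```python
import deepdoctection as dd

# Each string overrides one entry of conf_dd_one.yaml shown above.
analyzer = dd.get_dd_analyzer(
    config_overwrite=[
        "USE_PDF_MINER=True",
        "PDF_MINER.X_TOLERANCE=5",
        "PDF_MINER.Y_TOLERANCE=5",
    ]
)
```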
deepdoctection/datapoint/annotation.py
@@ -504,5 +504,6 @@ class ContainerAnnotation(CategoryAnnotation):
     @classmethod
     def from_dict(cls, **kwargs: JsonDict) -> "SummaryAnnotation":
         container_ann = ann_from_dict(cls, **kwargs)
-        container_ann.value = kwargs.get("value")
+        value = kwargs.get("value", "")
+        container_ann.value = value if isinstance(value, str) else list(value)
         return container_ann
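In isolation, the new normalization behaves as follows (a minimal sketch of the two branches, not deepdoctection code):

```python
def normalize(value=""):
    # Strings pass through untouched; any other sequence (e.g. a tuple that
    # came back from deserialization) is materialized as a list; a missing
    # value defaults to the empty string.
    return value if isinstance(value, str) else list(value)

assert normalize("header") == "header"
assert normalize(("a", "b")) == ["a", "b"]
assert normalize() == ""
```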
deepdoctection/datapoint/box.py
@@ -25,6 +25,7 @@ from typing import List, Optional, Sequence, no_type_check

 import numpy as np
 import numpy.typing as npt
+from lazy_imports import try_import
 from numpy import float32

 from ..utils.detection_types import ImageType
@@ -32,7 +33,7 @@ from ..utils.error import BoundingBoxError
 from ..utils.file_utils import cocotools_available
 from ..utils.logger import LoggingRecord, logger

-if cocotools_available():
+with try_import() as import_guard:
     import pycocotools.mask as coco_mask

deepdoctection/datapoint/image.py
@@ -18,6 +18,8 @@
 """
 Dataclass Image
 """
+from __future__ import annotations
+
 import json
 from dataclasses import dataclass, field
 from os import environ
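The new `from __future__ import annotations` is what lets the quoted forward references disappear in the hunks below: annotations are stored as strings and evaluated lazily, so a class can name itself. A minimal illustration:

```python
from __future__ import annotations


class Image:
    # Without the future import this return annotation would need quotes,
    # because the class name is not bound yet while the body executes.
    def clone(self) -> Image:
        return Image()
```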
@@ -202,7 +204,7 @@ class Image:
         self._bbox = None
         self.embeddings.pop(self.image_id)

-    def get_image(self) -> "_Img":  # type: ignore
+    def get_image(self) -> _Img:  # type: ignore # pylint: disable=E0602
         """
         Get the image either in base64 string representation or as np.array.

@@ -531,16 +533,20 @@ class Image:
         )
         ann.image.dump(sub_image)

-    def remove_image_from_lower_hierachy(self) -> None:
+    def remove_image_from_lower_hierachy(self, pixel_values_only: bool = False) -> None:
         """Will remove all images from image annotations."""
         for ann in self.annotations:
-            absolute_bounding_box = ann.get_bounding_box(self.image_id)
-            ann.bounding_box = absolute_bounding_box
-            ann.image = None
+            if pixel_values_only:
+                if ann.image is not None:
+                    ann.image.clear_image()
+            else:
+                absolute_bounding_box = ann.get_bounding_box(self.image_id)
+                ann.bounding_box = absolute_bounding_box
+                ann.image = None

     @classmethod
     @no_type_check
-    def from_dict(cls, **kwargs) -> "Image":
+    def from_dict(cls, **kwargs) -> Image:
         """
         Create `Image` instance from dict.

@@ -571,7 +577,7 @@ class Image:

     @classmethod
     @no_type_check
-    def from_file(cls, file_path: str) -> "Image":
+    def from_file(cls, file_path: str) -> Image:
         """
         Create `Image` instance from .json file.

deepdoctection/datapoint/view.py
@@ -19,6 +19,7 @@
 Subclasses for ImageAnnotation and Image objects with various properties. These classes
 simplify consumption
 """
+from __future__ import annotations

 from copy import copy
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, no_type_check
@@ -64,7 +65,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         base_page: `Page` class instantiated by the lowest hierarchy `Image`
     """

-    base_page: "Page"
+    base_page: Page

     @property
     def bbox(self) -> List[float]:
@@ -148,7 +149,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         return attribute_names

     @classmethod
-    def from_dict(cls, **kwargs: JsonDict) -> "ImageAnnotationBaseView":
+    def from_dict(cls, **kwargs: JsonDict) -> ImageAnnotationBaseView:
         """
         Identical to its base class method for having correct return types. If the base class changes, please
         change this method as well.
@@ -205,15 +206,38 @@ class Layout(ImageAnnotationBaseView):
         return words_with_reading_order

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         """Returns a dict `{"text": text string,
         "text_list": list of single words,
         "annotation_ids": word annotation ids`"""
         words = self.get_ordered_words()
+        characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
+            *[
+                (
+                    word.characters,
+                    word.annotation_id,
+                    word.token_class,
+                    word.token_tag,
+                    (
+                        word.get_sub_category(WordType.token_class).category_id
+                        if WordType.token_class in word.sub_categories
+                        else None
+                    ),
+                    (word.get_sub_category(WordType.token_tag).category_id)
+                    if WordType.token_tag in word.sub_categories
+                    else None,
+                )
+                for word in words
+            ]
+        )
         return {
-            "text": " ".join([word.characters for word in words]),  # type: ignore
-            "text_list": [word.characters for word in words],  # type: ignore
-            "annotation_ids": [word.annotation_id for word in words],
+            "text": " ".join(characters),
+            "words": characters,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_classes_ids,
+            "token_tag_ids": token_tag_ids,
         }

     def get_attribute_names(self) -> Set[str]:
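The enriched payload can be consumed positionally, since all sequences are parallel per word. A sketch (assuming a parsed deepdoctection Layout, Table or Page view is at hand):

```python
def dump_tokens(view) -> None:
    # `view` is assumed to expose the text_ payload with the keys introduced
    # above; all lists are word-aligned.
    payload = view.text_
    print(payload["text"])
    for word, ann_id, token_class, token_tag in zip(
        payload["words"], payload["ann_ids"], payload["token_classes"], payload["token_tags"]
    ):
        print(f"{word!r:20} {ann_id} {token_class} {token_tag}")
```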
@@ -331,19 +355,33 @@ class Table(Layout):
         return super().text

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         cells = self.cells
         if not cells:
             return super().text_
-        text_list: List[str] = []
-        annotation_id_list: List[str] = []
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
         for cell in cells:
-            text_list.extend(cell.text_["text_list"])  # type: ignore
-            annotation_id_list.extend(cell.text_["annotation_ids"])  # type: ignore
+            text.extend(cell.text_["text"])  # type: ignore
+            words.extend(cell.text_["words"])  # type: ignore
+            ann_ids.extend(cell.text_["ann_ids"])  # type: ignore
+            token_classes.extend(cell.text_["token_classes"])  # type: ignore
+            token_tags.extend(cell.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(cell.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(cell.text_["token_tag_ids"])  # type: ignore
         return {
-            "text": " ".join([cell.text for cell in cells]),  # type: ignore
-            "text_list": text_list,
-            "annotation_ids": annotation_id_list,
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
         }

     @property
@@ -452,6 +490,7 @@ class Page(Image):
         "document_id",
         "page_number",
     }
+    include_residual_text_container: bool = True

     def get_annotation(  # type: ignore
         self,
@@ -556,8 +595,8 @@ class Page(Image):
         text_container: Optional[ObjectTypes] = None,
         floating_text_block_categories: Optional[Sequence[ObjectTypes]] = None,
         include_residual_text_container: bool = True,
-        base_page: Optional["Page"] = None,
-    ) -> "Page":
+        base_page: Optional[Page] = None,
+    ) -> Page:
         """
         Factory function for generating a `Page` instance from `image_orig` .

@@ -615,6 +654,7 @@ class Page(Image):
         page.summary = SummaryAnnotation.from_dict(**summary_dict)
         page.floating_text_block_categories = floating_text_block_categories  # type: ignore
         page.text_container = text_container  # type: ignore
+        page.include_residual_text_container = include_residual_text_container
         return page

     def _order(self, block: str) -> List[ImageAnnotationBaseView]:
@@ -628,7 +668,7 @@ class Page(Image):
         break_str = "\n" if line_break else " "
         for block in block_with_order:
             text += f"{block.text}{break_str}"
-        return text
+        return text[:-1]

     @property
     def text(self) -> str:
@@ -638,17 +678,35 @@
         return self._make_text()

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         """Returns a dict `{"text": text string,
         "text_list": list of single words,
         "annotation_ids": word annotation ids`"""
         block_with_order = self._order("layouts")
-        text_list: List[str] = []
-        annotation_id_list: List[str] = []
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
         for block in block_with_order:
-            text_list.extend(block.text_["text_list"])  # type: ignore
-            annotation_id_list.extend(block.text_["annotation_ids"])  # type: ignore
-        return {"text": self.text, "text_list": text_list, "annotation_ids": annotation_id_list}
+            text.append(block.text_["text"])  # type: ignore
+            words.extend(block.text_["words"])  # type: ignore
+            ann_ids.extend(block.text_["ann_ids"])  # type: ignore
+            token_classes.extend(block.text_["token_classes"])  # type: ignore
+            token_tags.extend(block.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(block.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(block.text_["token_tag_ids"])  # type: ignore
+        return {
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
+        }

     def get_layout_context(self, annotation_id: str, context_size: int = 3) -> List[ImageAnnotationBaseView]:
         """For a given `annotation_id` get a list of `ImageAnnotation` that are nearby in terms of reading order.
@@ -759,6 +817,11 @@
         box_stack = []
         cells_found = False

+        if self.image is None and interactive:
+            logger.warning(
+                LoggingRecord("No image provided. Cannot display image in interactive mode", {"page_id": self.image_id})
+            )
+
         if debug_kwargs:
             anns = self.get_annotation(category_names=list(debug_kwargs.keys()))
             for ann in anns:
906
969
  text_container: Optional[ObjectTypes] = None,
907
970
  floating_text_block_categories: Optional[List[ObjectTypes]] = None,
908
971
  include_residual_text_container: bool = True,
909
- ) -> "Page":
972
+ ) -> Page:
910
973
  """Reading JSON file and building a `Page` object with given config.
911
974
  :param file_path: Path to file
912
975
  :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
@@ -929,3 +992,11 @@ class Page(Image):
929
992
  for word in all_words
930
993
  if word.token_tag not in (TokenClasses.other, None)
931
994
  ]
995
+
996
+ def __copy__(self) -> Page:
997
+ return self.__class__.from_image(
998
+ self.image_orig,
999
+ self.text_container,
1000
+ self.floating_text_block_categories,
1001
+ self.include_residual_text_container,
1002
+ )
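With the hook in place, copy.copy rebuilds a Page through from_image with the page's own parsing configuration instead of performing a shallow field copy. A small sketch:

```python
from copy import copy

from deepdoctection.datapoint.view import Page


def clone_page(page: Page) -> Page:
    # Dispatches to Page.__copy__ above, re-running from_image with the page's
    # text_container, floating block categories and residual-container setting.
    return copy(page)
```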
deepdoctection/datasets/__init__.py
@@ -26,13 +26,10 @@ Create an info card, a DataFlowBaseBuilder derived instance, possibly a category
 DatasetBase derived instance to create a data set.
 """

-from ..utils.file_utils import pytorch_available
+from .adapter import *
 from .base import *
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import *
 from .instances import *
 from .registry import *
 from .save import *
-
-if pytorch_available():
-    from .adapter import *
deepdoctection/datasets/adapter.py
@@ -22,19 +22,22 @@ Module for wrapping datasets into a pytorch dataset framework.

 from typing import Any, Callable, Iterator, Mapping, Optional, Union

+from lazy_imports import try_import
+
 from ..dataflow import CacheData, CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
 from ..utils.detection_types import DP, JsonDict
-from ..utils.file_utils import pytorch_available
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
 from .registry import get_dataset

-if pytorch_available():
+with try_import() as import_guard:
     from torch.utils.data import IterableDataset
+if not import_guard.is_successful():
+    from ..utils.mocks import IterableDataset  # type: ignore


 class DatasetAdapter(IterableDataset):  # type: ignore
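Together with the new utils/mocks.py this forms the general fallback pattern of the release: import the real dependency lazily and substitute an inert stand-in when it is missing, so modules import cleanly without the optional framework. Sketched in isolation:

```python
from lazy_imports import try_import

with try_import() as import_guard:
    from torch.utils.data import IterableDataset

if not import_guard.is_successful():
    class IterableDataset:  # type: ignore
        """Inert stand-in, mirroring deepdoctection.utils.mocks, so that
        subclass definitions below still evaluate without torch."""


class DatasetLike(IterableDataset):  # defined with or without torch installed
    pass
```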