deepdoctection 0.30-py3-none-any.whl → 0.32-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (120)
  1. deepdoctection/__init__.py +38 -29
  2. deepdoctection/analyzer/dd.py +36 -29
  3. deepdoctection/configs/conf_dd_one.yaml +34 -31
  4. deepdoctection/dataflow/base.py +0 -19
  5. deepdoctection/dataflow/custom.py +4 -3
  6. deepdoctection/dataflow/custom_serialize.py +14 -5
  7. deepdoctection/dataflow/parallel_map.py +12 -11
  8. deepdoctection/dataflow/serialize.py +5 -4
  9. deepdoctection/datapoint/annotation.py +35 -13
  10. deepdoctection/datapoint/box.py +3 -5
  11. deepdoctection/datapoint/convert.py +3 -1
  12. deepdoctection/datapoint/image.py +79 -36
  13. deepdoctection/datapoint/view.py +152 -49
  14. deepdoctection/datasets/__init__.py +1 -4
  15. deepdoctection/datasets/adapter.py +6 -3
  16. deepdoctection/datasets/base.py +86 -11
  17. deepdoctection/datasets/dataflow_builder.py +1 -1
  18. deepdoctection/datasets/info.py +4 -4
  19. deepdoctection/datasets/instances/doclaynet.py +3 -2
  20. deepdoctection/datasets/instances/fintabnet.py +2 -1
  21. deepdoctection/datasets/instances/funsd.py +2 -1
  22. deepdoctection/datasets/instances/iiitar13k.py +5 -2
  23. deepdoctection/datasets/instances/layouttest.py +4 -8
  24. deepdoctection/datasets/instances/publaynet.py +2 -2
  25. deepdoctection/datasets/instances/pubtables1m.py +6 -3
  26. deepdoctection/datasets/instances/pubtabnet.py +2 -1
  27. deepdoctection/datasets/instances/rvlcdip.py +2 -1
  28. deepdoctection/datasets/instances/xfund.py +2 -1
  29. deepdoctection/eval/__init__.py +1 -4
  30. deepdoctection/eval/accmetric.py +1 -1
  31. deepdoctection/eval/base.py +5 -4
  32. deepdoctection/eval/cocometric.py +2 -1
  33. deepdoctection/eval/eval.py +19 -15
  34. deepdoctection/eval/tedsmetric.py +14 -11
  35. deepdoctection/eval/tp_eval_callback.py +14 -7
  36. deepdoctection/extern/__init__.py +2 -7
  37. deepdoctection/extern/base.py +39 -13
  38. deepdoctection/extern/d2detect.py +182 -90
  39. deepdoctection/extern/deskew.py +36 -9
  40. deepdoctection/extern/doctrocr.py +265 -83
  41. deepdoctection/extern/fastlang.py +49 -9
  42. deepdoctection/extern/hfdetr.py +106 -55
  43. deepdoctection/extern/hflayoutlm.py +441 -122
  44. deepdoctection/extern/hflm.py +225 -0
  45. deepdoctection/extern/model.py +56 -47
  46. deepdoctection/extern/pdftext.py +10 -5
  47. deepdoctection/extern/pt/__init__.py +1 -3
  48. deepdoctection/extern/pt/nms.py +6 -2
  49. deepdoctection/extern/pt/ptutils.py +27 -18
  50. deepdoctection/extern/tessocr.py +134 -22
  51. deepdoctection/extern/texocr.py +6 -2
  52. deepdoctection/extern/tp/tfutils.py +43 -9
  53. deepdoctection/extern/tp/tpcompat.py +14 -11
  54. deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
  55. deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
  56. deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
  57. deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
  58. deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
  59. deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
  60. deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
  61. deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
  62. deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
  63. deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
  64. deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
  65. deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
  66. deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
  67. deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
  68. deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
  69. deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +20 -0
  70. deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
  71. deepdoctection/extern/tpdetect.py +54 -30
  72. deepdoctection/mapper/__init__.py +3 -8
  73. deepdoctection/mapper/d2struct.py +9 -7
  74. deepdoctection/mapper/hfstruct.py +7 -2
  75. deepdoctection/mapper/laylmstruct.py +164 -21
  76. deepdoctection/mapper/maputils.py +16 -3
  77. deepdoctection/mapper/misc.py +6 -3
  78. deepdoctection/mapper/prodigystruct.py +1 -1
  79. deepdoctection/mapper/pubstruct.py +10 -10
  80. deepdoctection/mapper/tpstruct.py +3 -3
  81. deepdoctection/pipe/__init__.py +1 -1
  82. deepdoctection/pipe/anngen.py +35 -8
  83. deepdoctection/pipe/base.py +53 -19
  84. deepdoctection/pipe/common.py +23 -13
  85. deepdoctection/pipe/concurrency.py +2 -1
  86. deepdoctection/pipe/doctectionpipe.py +2 -2
  87. deepdoctection/pipe/language.py +3 -2
  88. deepdoctection/pipe/layout.py +6 -3
  89. deepdoctection/pipe/lm.py +34 -66
  90. deepdoctection/pipe/order.py +142 -35
  91. deepdoctection/pipe/refine.py +26 -24
  92. deepdoctection/pipe/segment.py +21 -16
  93. deepdoctection/pipe/{cell.py → sub_layout.py} +30 -9
  94. deepdoctection/pipe/text.py +14 -8
  95. deepdoctection/pipe/transform.py +16 -9
  96. deepdoctection/train/__init__.py +6 -12
  97. deepdoctection/train/d2_frcnn_train.py +36 -28
  98. deepdoctection/train/hf_detr_train.py +26 -17
  99. deepdoctection/train/hf_layoutlm_train.py +133 -111
  100. deepdoctection/train/tp_frcnn_train.py +21 -19
  101. deepdoctection/utils/__init__.py +3 -0
  102. deepdoctection/utils/concurrency.py +1 -1
  103. deepdoctection/utils/context.py +2 -2
  104. deepdoctection/utils/env_info.py +41 -84
  105. deepdoctection/utils/error.py +84 -0
  106. deepdoctection/utils/file_utils.py +4 -15
  107. deepdoctection/utils/fs.py +7 -7
  108. deepdoctection/utils/logger.py +1 -0
  109. deepdoctection/utils/mocks.py +93 -0
  110. deepdoctection/utils/pdf_utils.py +5 -4
  111. deepdoctection/utils/settings.py +6 -1
  112. deepdoctection/utils/transform.py +1 -1
  113. deepdoctection/utils/utils.py +0 -6
  114. deepdoctection/utils/viz.py +48 -5
  115. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/METADATA +57 -73
  116. deepdoctection-0.32.dist-info/RECORD +146 -0
  117. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/WHEEL +1 -1
  118. deepdoctection-0.30.dist-info/RECORD +0 -143
  119. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/LICENSE +0 -0
  120. {deepdoctection-0.30.dist-info → deepdoctection-0.32.dist-info}/top_level.txt +0 -0

deepdoctection/datapoint/view.py

@@ -19,6 +19,7 @@
 Subclasses for ImageAnnotation and Image objects with various properties. These classes
 simplify consumption
 """
+from __future__ import annotations

 from copy import copy
 from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, no_type_check
@@ -26,6 +27,7 @@ from typing import Any, Dict, List, Mapping, Optional, Sequence, Set, Tuple, Type, Union, no_type_check
 import numpy as np

 from ..utils.detection_types import ImageType, JsonDict, Pathlike
+from ..utils.error import AnnotationError, ImageError
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import (
     CellType,
@@ -63,7 +65,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         base_page: `Page` class instantiated by the lowest hierarchy `Image`
     """

-    base_page: "Page"
+    base_page: Page

     @property
     def bbox(self) -> List[float]:
@@ -96,7 +98,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
                 interactive_imshow(np_image)
                 return None
             return np_image
-        raise ValueError(f"base_page.image is None for {self.annotation_id}")
+        raise AnnotationError(f"base_page.image is None for {self.annotation_id}")

     def __getattr__(self, item: str) -> Optional[Union[str, int, List[str]]]:
         """
@@ -115,7 +117,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         :return: value according to the logic described above
         """
         if item not in self.get_attribute_names():
-            raise AttributeError(f"Attribute {item} is not supported for {type(self)}")
+            raise AnnotationError(f"Attribute {item} is not supported for {type(self)}")
         if item in self.sub_categories:
             sub_cat = self.get_sub_category(get_type(item))
             if item != sub_cat.category_name:
@@ -147,7 +149,7 @@ class ImageAnnotationBaseView(ImageAnnotation):
         return attribute_names

     @classmethod
-    def from_dict(cls, **kwargs: JsonDict) -> "ImageAnnotationBaseView":
+    def from_dict(cls, **kwargs: JsonDict) -> ImageAnnotationBaseView:
         """
         Identical to its base class method for having correct return types. If the base class changes, please
         change this method as well.
@@ -204,15 +206,38 @@ class Layout(ImageAnnotationBaseView):
         return words_with_reading_order

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         """Returns a dict `{"text": text string,
         "text_list": list of single words,
         "annotation_ids": word annotation ids`"""
         words = self.get_ordered_words()
+        characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
+            *[
+                (
+                    word.characters,
+                    word.annotation_id,
+                    word.token_class,
+                    word.token_tag,
+                    (
+                        word.get_sub_category(WordType.token_class).category_id
+                        if WordType.token_class in word.sub_categories
+                        else None
+                    ),
+                    (word.get_sub_category(WordType.token_tag).category_id)
+                    if WordType.token_tag in word.sub_categories
+                    else None,
+                )
+                for word in words
+            ]
+        )
         return {
-            "text": " ".join([word.characters for word in words]),  # type: ignore
-            "text_list": [word.characters for word in words],  # type: ignore
-            "annotation_ids": [word.annotation_id for word in words],
+            "text": " ".join(characters),
+            "words": characters,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_classes_ids,
+            "token_tag_ids": token_tag_ids,
         }

     def get_attribute_names(self) -> Set[str]:
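
Migration note on the `text_` hunk above: the payload keys `text_list` and `annotation_ids` are renamed to `words` and `ann_ids`, and four token-level keys are added, so 0.30-era consumers of `text_` will break. A minimal sketch of reading the new payload, assuming the default analyzer and a placeholder input path:

import deepdoctection as dd

analyzer = dd.get_dd_analyzer()
df = analyzer.analyze(path="sample.pdf")  # placeholder path
df.reset_state()

for page in df:
    for layout in page.layouts:
        payload = layout.text_
        # 0.32 keys: text, words, ann_ids, token_classes, token_tags,
        # token_class_ids, token_tag_ids
        for word, ann_id in zip(payload["words"], payload["ann_ids"]):
            print(word, ann_id)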

@@ -326,23 +351,37 @@ class Table(Layout):
     def text(self) -> str:
         try:
             return str(self)
-        except TypeError:
+        except (TypeError, AnnotationError):
             return super().text

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         cells = self.cells
         if not cells:
             return super().text_
-        text_list: List[str] = []
-        annotation_id_list: List[str] = []
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
         for cell in cells:
-            text_list.extend(cell.text_["text_list"])  # type: ignore
-            annotation_id_list.extend(cell.text_["annotation_ids"])  # type: ignore
+            text.extend(cell.text_["text"])  # type: ignore
+            words.extend(cell.text_["words"])  # type: ignore
+            ann_ids.extend(cell.text_["ann_ids"])  # type: ignore
+            token_classes.extend(cell.text_["token_classes"])  # type: ignore
+            token_tags.extend(cell.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(cell.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(cell.text_["token_tag_ids"])  # type: ignore
         return {
-            "text": " ".join([cell.text for cell in cells]),  # type: ignore
-            "text_list": text_list,
-            "annotation_ids": annotation_id_list,
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
         }

     @property
@@ -368,7 +407,7 @@ class Table(Layout):
             for cell in cells:
                 all_words.extend(cell.get_ordered_words())  # type: ignore
             return all_words
-        except TypeError:
+        except (TypeError, AnnotationError):
             return super().get_ordered_words()


@@ -451,41 +490,73 @@ class Page(Image):
         "document_id",
         "page_number",
     }
+    include_residual_text_container: bool = True

-    @no_type_check
-    def get_annotation(
+    def get_annotation(  # type: ignore
         self,
         category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
         annotation_ids: Optional[Union[str, Sequence[str]]] = None,
-        annotation_types: Optional[Union[str, Sequence[str]]] = None,
+        service_id: Optional[Union[str, Sequence[str]]] = None,
+        model_id: Optional[Union[str, Sequence[str]]] = None,
+        session_ids: Optional[Union[str, Sequence[str]]] = None,
+        ignore_inactive: bool = True,
     ) -> List[ImageAnnotationBaseView]:
         """
+        Selection of annotations from the annotation container. Filter conditions can be defined by specifying
+        the annotation_id or the category name. (Since only image annotations are currently allowed in the container,
+        annotation_type is a redundant filter condition.) Only annotations that have active = 'True' are
+        returned. If more than one condition is provided, only annotations will be returned that satisfy all conditions.
+        If no condition is provided, it will return all active annotations.
+
         Identical to its base class method for having correct return types. If the base class changes, please
         change this method as well.
+
+        :param category_names: A single name or list of names
+        :param annotation_ids: A single id or list of ids
+        :param service_id: A single service name or list of service names
+        :param model_id: A single model name or list of model names
+        :param session_ids: A single session id or list of session ids
+        :param ignore_inactive: If set to `True` only active annotations are returned.
+
+        :return: A (possibly empty) list of Annotations
         """
-        cat_names = [category_names] if isinstance(category_names, (ObjectTypes, str)) else category_names
-        if cat_names is not None:
-            cat_names = [get_type(cat_name) for cat_name in cat_names]
-        ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
-        ann_types = [annotation_types] if isinstance(annotation_types, str) else annotation_types

-        anns = filter(lambda x: x.active, self.annotations)
+        if category_names is not None:
+            category_names = (
+                [get_type(cat_name) for cat_name in category_names]
+                if isinstance(category_names, list)
+                else [get_type(category_names)]  # type:ignore
+            )
+        ann_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
+        service_id = [service_id] if isinstance(service_id, str) else service_id
+        model_id = [model_id] if isinstance(model_id, str) else model_id
+        session_id = [session_ids] if isinstance(session_ids, str) else session_ids

-        if ann_types is not None:
-            for type_name in ann_types:
-                anns = filter(lambda x: isinstance(x, eval(type_name)), anns)  # pylint: disable=W0123, W0640
+        if ignore_inactive:
+            anns = filter(lambda x: x.active, self.annotations)
+        else:
+            anns = self.annotations  # type:ignore

-        if cat_names is not None:
-            anns = filter(lambda x: x.category_name in cat_names, anns)
+        if category_names is not None:
+            anns = filter(lambda x: x.category_name in category_names, anns)  # type:ignore

         if ann_ids is not None:
-            anns = filter(lambda x: x.annotation_id in ann_ids, anns)
+            anns = filter(lambda x: x.annotation_id in ann_ids, anns)  # type:ignore

-        return list(anns)
+        if service_id is not None:
+            anns = filter(lambda x: x.generating_service in service_id, anns)  # type:ignore
+
+        if model_id is not None:
+            anns = filter(lambda x: x.generating_model in model_id, anns)  # type:ignore
+
+        if session_id is not None:
+            anns = filter(lambda x: x.session_id in session_id, anns)  # type:ignore
+
+        return list(anns)  # type:ignore

     def __getattr__(self, item: str) -> Any:
         if item not in self.get_attribute_names():
-            raise AttributeError(f"Attribute {item} is not supported for {type(self)}")
+            raise ImageError(f"Attribute {item} is not supported for {type(self)}")
         if self.summary is not None:
             if item in self.summary.sub_categories:
                 sub_cat = self.summary.get_sub_category(get_type(item))
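
Migration note: the `annotation_types` parameter (whose implementation relied on `eval`) is gone; `service_id`, `model_id`, `session_ids` and `ignore_inactive` replace it, and the new filters assume annotations carry the corresponding provenance attributes. A hedged sketch against the new signature, with placeholder ids and path:

from deepdoctection import Page

page = Page.from_file("page.json")  # placeholder path

tables = page.get_annotation(category_names="table")  # unchanged usage

# 0.32: filter by the pipeline component that generated the annotation,
# including annotations that have been deactivated
anns = page.get_annotation(service_id="some-service-id", ignore_inactive=False)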

@@ -524,8 +595,8 @@ class Page(Image):
         text_container: Optional[ObjectTypes] = None,
         floating_text_block_categories: Optional[Sequence[ObjectTypes]] = None,
         include_residual_text_container: bool = True,
-        base_page: Optional["Page"] = None,
-    ) -> "Page":
+        base_page: Optional[Page] = None,
+    ) -> Page:
         """
         Factory function for generating a `Page` instance from `image_orig`.

@@ -583,6 +654,7 @@
         page.summary = SummaryAnnotation.from_dict(**summary_dict)
         page.floating_text_block_categories = floating_text_block_categories  # type: ignore
         page.text_container = text_container  # type: ignore
+        page.include_residual_text_container = include_residual_text_container
         return page

     def _order(self, block: str) -> List[ImageAnnotationBaseView]:
@@ -596,7 +668,7 @@
         break_str = "\n" if line_break else " "
         for block in block_with_order:
             text += f"{block.text}{break_str}"
-        return text
+        return text[:-1]

     @property
     def text(self) -> str:
@@ -606,17 +678,35 @@
         return self._make_text()

     @property
-    def text_(self) -> Dict[str, Union[str, List[str]]]:
+    def text_(self) -> JsonDict:
         """Returns a dict `{"text": text string,
         "text_list": list of single words,
         "annotation_ids": word annotation ids`"""
         block_with_order = self._order("layouts")
-        text_list: List[str] = []
-        annotation_id_list: List[str] = []
+        text: List[str] = []
+        words: List[str] = []
+        ann_ids: List[str] = []
+        token_classes: List[str] = []
+        token_tags: List[str] = []
+        token_class_ids: List[str] = []
+        token_tag_ids: List[str] = []
         for block in block_with_order:
-            text_list.extend(block.text_["text_list"])  # type: ignore
-            annotation_id_list.extend(block.text_["annotation_ids"])  # type: ignore
-        return {"text": self.text, "text_list": text_list, "annotation_ids": annotation_id_list}
+            text.append(block.text_["text"])  # type: ignore
+            words.extend(block.text_["words"])  # type: ignore
+            ann_ids.extend(block.text_["ann_ids"])  # type: ignore
+            token_classes.extend(block.text_["token_classes"])  # type: ignore
+            token_tags.extend(block.text_["token_tags"])  # type: ignore
+            token_class_ids.extend(block.text_["token_class_ids"])  # type: ignore
+            token_tag_ids.extend(block.text_["token_tag_ids"])  # type: ignore
+        return {
+            "text": " ".join(text),
+            "words": words,
+            "ann_ids": ann_ids,
+            "token_classes": token_classes,
+            "token_tags": token_tags,
+            "token_class_ids": token_class_ids,
+            "token_tag_ids": token_tag_ids,
+        }

     def get_layout_context(self, annotation_id: str, context_size: int = 3) -> List[ImageAnnotationBaseView]:
         """For a given `annotation_id` get a list of `ImageAnnotation` that are nearby in terms of reading order.
@@ -629,10 +719,10 @@
         """
         ann = self.get_annotation(annotation_ids=annotation_id)[0]
         if ann.category_name not in self.floating_text_block_categories:
-            raise ValueError(
-                f"Annotation {annotation_id} with category_name {ann.category_name} is not a floating text "
-                f"block category. Cannot get context. Make sure to make this category a floating text "
-                f"block"
+            raise ImageError(
+                f"Cannot get context. Make sure to parametrize this category to a floating text: "
+                f"annotation_id: {annotation_id},"
+                f"category_name: {ann.category_name}"
             )
         block_with_order = self._order("layouts")
         position = block_with_order.index(ann)
@@ -727,6 +817,11 @@
         box_stack = []
         cells_found = False

+        if self.image is None and interactive:
+            logger.warning(
+                LoggingRecord("No image provided. Cannot display image in interactive mode", {"page_id": self.image_id})
+            )
+
         if debug_kwargs:
             anns = self.get_annotation(category_names=list(debug_kwargs.keys()))
             for ann in anns:
@@ -874,7 +969,7 @@
         text_container: Optional[ObjectTypes] = None,
         floating_text_block_categories: Optional[List[ObjectTypes]] = None,
         include_residual_text_container: bool = True,
-    ) -> "Page":
+    ) -> Page:
         """Reading JSON file and building a `Page` object with given config.
         :param file_path: Path to file
         :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
@@ -897,3 +992,11 @@
             for word in all_words
             if word.token_tag not in (TokenClasses.other, None)
         ]
+
+    def __copy__(self) -> Page:
+        return self.__class__.from_image(
+            self.image_orig,
+            self.text_container,
+            self.floating_text_block_categories,
+            self.include_residual_text_container,
+        )
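
The new `__copy__` makes `copy.copy` on a `Page` rebuild the view from the underlying `image_orig` with the same text container, floating text blocks and residual-container setting, rather than performing a plain shallow copy. A sketch, with a placeholder path:

from copy import copy

from deepdoctection import Page

page = Page.from_file("page.json")  # placeholder path
page_copy = copy(page)
# the copy is re-derived from page.image_orig, so both views share the image_id
assert page_copy.image_id == page.image_id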

deepdoctection/datasets/__init__.py

@@ -26,13 +26,10 @@ Create an info card, a DataFlowBaseBuilder derived instance, possibly a category
 DatasetBase derived instance to create a data set.
 """

-from ..utils.file_utils import pytorch_available
+from .adapter import *
 from .base import *
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import *
 from .instances import *
 from .registry import *
 from .save import *
-
-if pytorch_available():
-    from .adapter import *

deepdoctection/datasets/adapter.py

@@ -22,19 +22,22 @@ Module for wrapping datasets into a pytorch dataset framework.

 from typing import Any, Callable, Iterator, Mapping, Optional, Union

+from lazy_imports import try_import
+
 from ..dataflow import CacheData, CustomDataFromList, MapData, RepeatedData
 from ..datapoint.image import Image
 from ..datasets.base import DatasetBase
 from ..mapper.maputils import LabelSummarizer
 from ..utils.detection_types import DP, JsonDict
-from ..utils.file_utils import pytorch_available
 from ..utils.logger import LoggingRecord, log_once, logger
 from ..utils.settings import DatasetType, LayoutType, ObjectTypes, PageType, WordType
 from ..utils.tqdm import get_tqdm
 from .registry import get_dataset

-if pytorch_available():
+with try_import() as import_guard:
     from torch.utils.data import IterableDataset
+if not import_guard.is_successful():
+    from ..utils.mocks import IterableDataset  # type: ignore


 class DatasetAdapter(IterableDataset):  # type: ignore
@@ -165,4 +168,4 @@ class DatasetAdapter(IterableDataset):  # type: ignore
         return len(self.df)

     def __getitem__(self, item: Any) -> None:
-        raise NotImplementedError
+        raise NotImplementedError()
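
The eager `pytorch_available()` guard gives way to the `try_import` context manager from the `lazy-imports` package, with a stand-in class from the new `utils/mocks.py` when torch is absent. The same pattern in isolation, with a locally defined mock instead of the package's own:

from lazy_imports import try_import

with try_import() as import_guard:
    from torch.utils.data import IterableDataset

if not import_guard.is_successful():
    # torch is not installed: define a stand-in so subclassing still works
    class IterableDataset:  # minimal local mock
        pass


class DatasetAdapter(IterableDataset):
    pass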

deepdoctection/datasets/base.py

@@ -18,20 +18,24 @@
 """
 Module for the base class of datasets.
 """
+from __future__ import annotations

+import json
 import os
 import pprint
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from typing import Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union
+from inspect import signature
+from pathlib import Path
+from typing import Any, Dict, List, Mapping, Optional, Sequence, Tuple, Type, Union

 import numpy as np

 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
-from ..datapoint import Image
+from ..datapoint.image import Image
 from ..utils.detection_types import Pathlike
 from ..utils.logger import LoggingRecord, logger
-from ..utils.settings import ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from .dataflow_builder import DataFlowBaseBuilder
 from .info import DatasetCategories, DatasetInfo, get_merged_categories

@@ -51,9 +55,11 @@ class DatasetBase(ABC):
             self._dataflow_builder.splits = self._dataset_info.splits

         if not self.dataset_available() and self.is_built_in():
-            print(
-                f"Dataset {self._dataset_info.name} not locally found. Please download at {self._dataset_info.url}"
-                f" and place under {self._dataflow_builder.get_workdir()}"
+            logger.warning(
+                LoggingRecord(
+                    f"Dataset {self._dataset_info.name} not locally found. Please download at {self._dataset_info.url}"
+                    f" and place under {self._dataflow_builder.get_workdir()}"
+                )
             )

     @property
@@ -76,7 +82,7 @@
         Construct the DatasetCategory object.
         """

-        raise NotImplementedError
+        raise NotImplementedError()

     @classmethod
     @abstractmethod
@@ -85,7 +91,7 @@
         Construct the DatasetInfo object.
         """

-        raise NotImplementedError
+        raise NotImplementedError()

     @abstractmethod
     def _builder(self) -> DataFlowBaseBuilder:
@@ -93,7 +99,7 @@
         Construct the DataFlowBaseBuilder object. It needs to be implemented in the derived class.
         """

-        raise NotImplementedError
+        raise NotImplementedError()

     def dataset_available(self) -> bool:
         """
@@ -114,7 +120,7 @@

 class _BuiltInDataset(DatasetBase, ABC):
     """
-    Dataclass for built-in dataset. Do not use this it
+    Dataclass for built-in dataset. Do not use this
     """

     _name: Optional[str] = None
@@ -419,7 +425,7 @@ class CustomDataset(DatasetBase):
         """

         self.name = name
-        self.type = get_type(dataset_type)
+        self.type: DatasetType = get_type(dataset_type)  # type: ignore
         self.location = location
         self.init_categories = init_categories
         if init_sub_categories is None:
@@ -427,6 +433,11 @@
         else:
             self.init_sub_categories = init_sub_categories
         self.annotation_files = annotation_files
+        if signature(dataflow_builder.__init__).parameters.keys() != {"self", "location", "annotation_files"}:
+            raise TypeError(
+                "Dataflow builder must have the signature `def __init__(self, location: Pathlike, "
+                "annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None):`"
+            )
         self.dataflow_builder = dataflow_builder(self.location, self.annotation_files)
         super().__init__()

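This signature check rejects any dataflow builder whose constructor adds, renames or drops parameters, so custom builders must keep exactly the two-argument form. A conforming skeleton with a placeholder `build` body:

from typing import Any, Mapping, Optional, Sequence, Union

from deepdoctection.dataflow import DataFlow
from deepdoctection.datasets import DataFlowBaseBuilder
from deepdoctection.utils.detection_types import Pathlike


class MyBuilder(DataFlowBaseBuilder):
    def __init__(
        self,
        location: Pathlike,
        annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None,
    ):
        super().__init__(location, annotation_files)

    def build(self, **kwargs: Any) -> DataFlow:
        raise NotImplementedError()  # placeholder: yield Image datapoints here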

@@ -438,3 +449,67 @@

     def _builder(self) -> DataFlowBaseBuilder:
         return self.dataflow_builder
+
+    @staticmethod
+    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
+        """
+        This static method creates a CustomDataset instance from a dataset card.
+
+        A dataset card is a JSON file that contains metadata about the dataset such as its name, type, location,
+        initial categories, initial sub categories, and annotation files. The dataflow_builder parameter is a class
+        that inherits from DataFlowBaseBuilder and is used to build the dataflow for the dataset.
+
+        :param file_path: The path to the dataset card (JSON file).
+        :param dataflow_builder: The class used to build the dataflow for the dataset.
+        :return: A CustomDataset instance created from the dataset card.
+        """
+
+        with open(file_path, "r", encoding="UTF-8") as file:
+            meta_data = json.load(file)
+        meta_data["dataset_type"] = get_type(meta_data["dataset_type"])
+        meta_data["location"] = Path(meta_data["location"])
+        meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
+        meta_data["init_sub_categories"] = (
+            {
+                get_type(cat): {
+                    get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
+                    for sub_cat_key, sub_cat_values in sub_cats.items()
+                }
+                for cat, sub_cats in meta_data["init_sub_categories"].items()
+            }
+            if meta_data["init_sub_categories"] is not None
+            else None
+        )
+        return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
+
+    def as_dict(self) -> Mapping[str, Any]:
+        """
+        Return the meta-data of the dataset as a dictionary.
+
+        :return: A dictionary containing the meta-data of the dataset.
+        """
+        return {
+            "name": self.name,
+            "dataset_type": self.type,
+            "location": str(self.location),
+            "annotation_files": self.annotation_files,
+            "init_categories": [cat.value for cat in self.init_categories],
+            "init_sub_categories": {
+                cat.value: {
+                    sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
+                    for sub_cat_key, sub_cat_values in sub_cats.items()
+                }
+                for cat, sub_cats in self.init_sub_categories.items()
+            }
+            if self.init_sub_categories is not None
+            else None,
+        }
+
+    def save_dataset_card(self, file_path: str) -> None:
+        """
+        Save the dataset card to a JSON file.
+
+        :param file_path: file_path
+        """
+        with open(file_path, "w", encoding="UTF-8") as file:
+            json.dump(self.as_dict(), file, indent=4)
110
110
  :param kwargs: A custom set of arguments/values
111
111
  :return: dataflow
112
112
  """
113
- raise NotImplementedError
113
+ raise NotImplementedError()
114
114
 
115
115
  def get_annotation_file(self, split: str) -> str:
116
116
  """Get single annotation file."""

deepdoctection/datasets/info.py

@@ -24,7 +24,7 @@ from dataclasses import dataclass, field
 from itertools import chain
 from typing import Any, Dict, List, Literal, Mapping, Optional, Sequence, Set, Union, no_type_check, overload

-from ..utils.settings import DefaultType, ObjectTypes, TypeOrStr, get_type
+from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from ..utils.utils import call_only_once

 __all__ = ["DatasetInfo", "DatasetCategories", "get_merged_categories"]
@@ -89,7 +89,7 @@
     license: str = field(default="")
     url: Union[str, Sequence[str]] = field(default="")
     splits: Mapping[str, str] = field(default_factory=dict)
-    type: ObjectTypes = field(default=DefaultType.default_type)
+    type: DatasetType = field(default=DatasetType.default)

     def get_split(self, key: str) -> str:
         """
@@ -306,7 +306,7 @@

         _cat_to_sub_cat = {get_type(key): get_type(value) for key, value in cat_to_sub_cat.items()}
         if not self._allow_update:
-            raise PermissionError("Replacing categories with sub categories is not allowed")
+            raise RuntimeWarning("Replacing categories with sub categories is not allowed")
         self._categories_update = self.init_categories
         categories = self.get_categories(name_as_key=True)
         cats_or_sub_cats = [
@@ -332,7 +332,7 @@
         """

         if not self._allow_update:
-            raise PermissionError("Filtering categories is not allowed")
+            raise RuntimeWarning("Filtering categories is not allowed")
         if isinstance(categories, (ObjectTypes, str)):
             categories = [get_type(categories)]
         else:
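
Behavioural note: both guards now raise `RuntimeWarning` (used as an exception here, despite the name) instead of `PermissionError`, so callers catching the old type must adjust. A hedged sketch, assuming the built-in doclaynet dataset is registered and that filtering is locked for the instance at hand:

from deepdoctection.datasets import get_dataset

dataset = get_dataset("doclaynet")
try:
    dataset.dataflow.categories.filter_categories(categories="table")
except RuntimeWarning:
    pass  # 0.32 raises RuntimeWarning where 0.30 raised PermissionError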

deepdoctection/datasets/instances/doclaynet.py

@@ -25,6 +25,7 @@ Module for DocLayNet dataset. Place the dataset as follows
 ├── PNG
 │ ├── 0a0d43e301facee9e99cc33b9b16e732dd207135f4027e75f6aea2bf117535a2.png
 """
+from __future__ import annotations

 import os
 from typing import Mapping, Sequence, Union
@@ -109,7 +110,7 @@ class DocLayNet(DatasetBase):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)

-    def _builder(self) -> "DocLayNetBuilder":
+    def _builder(self) -> DocLayNetBuilder:
         return DocLayNetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)


@@ -209,7 +210,7 @@ class DocLayNetSeq(DatasetBase):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES_SEQ)

-    def _builder(self) -> "DocLayNetSeqBuilder":
+    def _builder(self) -> DocLayNetSeqBuilder:
         return DocLayNetSeqBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)


deepdoctection/datasets/instances/fintabnet.py

@@ -30,6 +30,7 @@ Module for Fintabnet dataset. Place the dataset as follows
 ├── FinTabNet_1.0.0_table_train.jsonl
 ├── FinTabNet_1.0.0_table_val.jsonl
 """
+from __future__ import annotations

 from pathlib import Path
 from typing import List, Mapping, Sequence, Union
@@ -133,7 +134,7 @@ class Fintabnet(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)

-    def _builder(self) -> "FintabnetBuilder":
+    def _builder(self) -> FintabnetBuilder:
         return FintabnetBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)


deepdoctection/datasets/instances/funsd.py

@@ -32,6 +32,7 @@ Module for Funsd dataset. Install the dataset following the folder structure
 │ ├── images
 │ │ ├── ...
 """
+from __future__ import annotations

 import os
 from typing import Dict, List, Mapping, Union
@@ -120,7 +121,7 @@ class Funsd(_BuiltInDataset):
     def _categories(self) -> DatasetCategories:
         return DatasetCategories(init_categories=_INIT_CATEGORIES, init_sub_categories=_SUB_CATEGORIES)

-    def _builder(self) -> "FunsdBuilder":
+    def _builder(self) -> FunsdBuilder:
         return FunsdBuilder(location=_LOCATION, annotation_files=_ANNOTATION_FILES)
