PyPI - deepdoctection - Versions diffs - 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl - Mend

deepdoctection 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (26)

deepdoctection/__init__.py +5 -1
deepdoctection/datapoint/__init__.py +1 -1
deepdoctection/datapoint/image.py +50 -1
deepdoctection/datapoint/view.py +149 -54
deepdoctection/datasets/base.py +196 -51
deepdoctection/extern/fastlang.py +4 -2
deepdoctection/mapper/laylmstruct.py +7 -7
deepdoctection/pipe/base.py +29 -25
deepdoctection/pipe/common.py +2 -2
deepdoctection/pipe/concurrency.py +2 -2
deepdoctection/pipe/language.py +2 -2
deepdoctection/pipe/layout.py +2 -2
deepdoctection/pipe/lm.py +13 -3
deepdoctection/pipe/order.py +9 -5
deepdoctection/pipe/refine.py +7 -7
deepdoctection/pipe/segment.py +30 -30
deepdoctection/pipe/sub_layout.py +2 -2
deepdoctection/pipe/text.py +10 -5
deepdoctection/pipe/transform.py +2 -4
deepdoctection/utils/file_utils.py +34 -0
deepdoctection/utils/types.py +0 -1
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/METADATA +4 -4
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/RECORD +26 -26
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/WHEEL +0 -0
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/licenses/LICENSE +0 -0
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/top_level.txt +0 -0

deepdoctection/__init__.py CHANGED Viewed

@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
 # pylint: enable=wrong-import-position
-__version__ = "0.43.6"
+__version__ = "0.44.1"
 _IMPORT_STRUCTURE = {
     "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -92,6 +92,7 @@ _IMPORT_STRUCTURE = {
         "convert_pdf_bytes_to_np_array_v2",
         "as_dict",
         "ImageAnnotationBaseView",
+        "MetaAnnotation",
         "Image",
         "Word",
         "Layout",
@@ -105,6 +106,7 @@ _IMPORT_STRUCTURE = {
         "DatasetAdapter",
         "DatasetBase",
         "MergeDataset",
+        "DatasetCard",
         "CustomDataset",
         "DataFlowBaseBuilder",
         "DatasetInfo",
@@ -313,6 +315,8 @@ _IMPORT_STRUCTURE = {
         "get_apted_requirement",
         "distance_available",
         "get_distance_requirement",
+        "numpy_v1_available",
+        "get_numpy_v1_requirement",
         "transformers_available",
         "get_transformers_requirement",
         "detectron2_available",

deepdoctection/datapoint/__init__.py CHANGED Viewed

@@ -34,5 +34,5 @@ After all, the point here is not to provide an optimal processing environment.
 from .annotation import *
 from .box import *
 from .convert import *
-from .image import Image
+from .image import Image, MetaAnnotation
 from .view import *

deepdoctection/datapoint/image.py CHANGED Viewed

@@ -25,7 +25,7 @@ from collections import defaultdict
 from dataclasses import dataclass, field
 from os import environ, fspath
 from pathlib import Path
-from typing import Any, Optional, Sequence, Union, no_type_check
+from typing import Any, Optional, Sequence, TypedDict, Union, no_type_check
 import numpy as np
 from numpy import uint8
@@ -40,6 +40,55 @@ from .box import crop_box_from_image, global_to_local_coords, intersection_box
 from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
+class MetaAnnotationDict(TypedDict):
+    """MetaAnnotationDict"""
+    image_annotations: list[str]
+    sub_categories: dict[str, dict[str, list[str]]]
+    relationships: dict[str, list[str]]
+    summaries: list[str]
+@dataclass(frozen=True)
+class MetaAnnotation:
+    """
+    An immutable dataclass that stores information about what `Image` are being
+    modified through a pipeline component.
+    Attributes:
+        image_annotations: Tuple of `ObjectTypes` representing image annotations.
+        sub_categories: Dictionary mapping `ObjectTypes` to dicts of `ObjectTypes` to sets of `ObjectTypes`
+        for sub-categories.
+        relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
+        summaries: Tuple of `ObjectTypes` representing summaries.
+    """
+    image_annotations: tuple[ObjectTypes, ...] = field(default=())
+    sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = field(default_factory=dict)
+    relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
+    summaries: tuple[ObjectTypes, ...] = field(default=())
+    def as_dict(self) -> MetaAnnotationDict:
+        """
+        Returns the MetaAnnotation as a dictionary, with all `ObjectTypes` converted to strings.
+        Returns:
+            A dictionary representation of the MetaAnnotation where all `ObjectTypes` are converted to strings.
+        """
+        return {
+            "image_annotations": [obj.value for obj in self.image_annotations],
+            "sub_categories": {
+                outer_key.value: {
+                    inner_key.value: [val.value for val in inner_values]
+                    for inner_key, inner_values in outer_value.items()
+                }
+                for outer_key, outer_value in self.sub_categories.items()
+            },
+            "relationships": {key.value: [val.value for val in values] for key, values in self.relationships.items()},
+            "summaries": [obj.value for obj in self.summaries],
+        }
 @dataclass
 class Image:
     """

deepdoctection/datapoint/view.py CHANGED Viewed

@@ -42,13 +42,60 @@ from ..utils.settings import (
     get_type,
 )
 from ..utils.transform import ResizeTransform, box_to_point4, point4_to_box
-from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, Text_, csv
+from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, csv
 from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
 from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
 from .box import BoundingBox, crop_box_from_image
 from .image import Image
+@dataclass(frozen=True)
+class Text_:
+    """
+    Immutable dataclass for storing structured text extraction results.
+    Attributes:
+        text: The concatenated text string.
+        words: List of word strings.
+        ann_ids: List of annotation IDs for each word.
+        token_classes: List of token class names for each word.
+        token_class_ann_ids: List of annotation IDs for each token class.
+        token_tags: List of token tag names for each word.
+        token_tag_ann_ids: List of annotation IDs for each token tag.
+        token_class_ids: List of token class IDs.
+        token_tag_ids: List of token tag IDs.
+    """
+    text: str = ""
+    words: list[str] = field(default_factory=list)
+    ann_ids: list[str] = field(default_factory=list)
+    token_classes: list[str] = field(default_factory=list)
+    token_class_ann_ids: list[str] = field(default_factory=list)
+    token_tags: list[str] = field(default_factory=list)
+    token_tag_ann_ids: list[str] = field(default_factory=list)
+    token_class_ids: list[str] = field(default_factory=list)
+    token_tag_ids: list[str] = field(default_factory=list)
+    def as_dict(self) -> dict[str, Union[list[str], str]]:
+        """
+        Returns the Text_ as a dictionary.
+        Returns:
+            A dictionary representation of the Text_ dataclass.
+        """
+        return {
+            "text": self.text,
+            "words": self.words,
+            "ann_ids": self.ann_ids,
+            "token_classes": self.token_classes,
+            "token_class_ann_ids": self.token_class_ann_ids,
+            "token_tags": self.token_tags,
+            "token_tag_ann_ids": self.token_tag_ann_ids,
+            "token_class_ids": self.token_class_ids,
+            "token_tag_ids": self.token_tag_ids,
+        }
 class ImageAnnotationBaseView(ImageAnnotation):
     """
     Consumption class for having easier access to categories added to an `ImageAnnotation`.
@@ -263,13 +310,28 @@ class Layout(ImageAnnotationBaseView):
         """
         words = self.get_ordered_words()
         if words:
-            characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
+            (
+                characters,
+                ann_ids,
+                token_classes,
+                token_class_ann_ids,
+                token_tags,
+                token_tag_ann_ids,
+                token_classes_ids,
+                token_tag_ids,
+            ) = map(list, zip(
                 *[
                     (
                         word.characters,
                         word.annotation_id,
                         word.token_class,
+                        word.get_sub_category(WordType.TOKEN_CLASS).annotation_id
+                        if WordType.TOKEN_CLASS in word.sub_categories
+                        else None,
                         word.token_tag,
+                        word.get_sub_category(WordType.TOKEN_TAG).annotation_id
+                        if WordType.TOKEN_TAG in word.sub_categories
+                        else None,
                         word.get_sub_category(WordType.TOKEN_CLASS).category_id
                         if WordType.TOKEN_CLASS in word.sub_categories
                         else None,
@@ -279,25 +341,40 @@ class Layout(ImageAnnotationBaseView):
                     )
                     for word in words
                 ]
-            )
+            ))
         else:
-            characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = (
-                [],  # type: ignore
-                [],  # type: ignore
-                [],  # type: ignore
-                [],  # type: ignore
-                [],  # type: ignore
-                [],  # type: ignore
+            (
+                characters,
+                ann_ids,
+                token_classes,
+                token_class_ann_ids,
+                token_tags,
+                token_tag_ann_ids,
+                token_classes_ids,
+                token_tag_ids,
+            ) = (
+                [],
+                [],
+                [],
+                [],
+                [],
+                [],
+                [],
+                [],
             )
-        return {
-            "text": " ".join(characters),
-            "words": characters,
-            "ann_ids": ann_ids,
-            "token_classes": token_classes,
-            "token_tags": token_tags,
-            "token_class_ids": token_classes_ids,
-            "token_tag_ids": token_tag_ids,
-        }
+        return Text_(
+            text=" ".join(characters), # type: ignore
+            words=characters, # type: ignore
+            ann_ids=ann_ids, # type: ignore
+            token_classes=token_classes, # type: ignore
+            token_class_ann_ids=token_class_ann_ids, # type: ignore
+            token_tags=token_tags, # type: ignore
+            token_tag_ann_ids=token_tag_ann_ids, # type: ignore
+            token_class_ids=token_classes_ids, # type: ignore
+            token_tag_ids=token_tag_ids, # type: ignore
+        )
     def get_attribute_names(self) -> set[str]:
         attr_names = (
@@ -590,14 +667,16 @@ class Table(Layout):
     @property
     def csv_(self) -> list[list[list[Text_]]]:
+        """
+        Returns:
+            A csv-style representation of a table as list of lists of cell.text_.
+        """
         cells = self.cells
         table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)]  # type: ignore
         for cell in cells:
             table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_)  # type: ignore
         return table_list
     def __str__(self) -> str:
         out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
         return out
@@ -624,26 +703,34 @@ class Table(Layout):
         words: list[str] = []
         ann_ids: list[str] = []
         token_classes: list[str] = []
+        token_class_ann_ids: list[str] = []
         token_tags: list[str] = []
+        token_tag_ann_ids: list[str] = []
         token_class_ids: list[str] = []
         token_tag_ids: list[str] = []
         for cell in cells:
-            text.append(cell.text_["text"])
-            words.extend(cell.text_["words"])
-            ann_ids.extend(cell.text_["ann_ids"])
-            token_classes.extend(cell.text_["token_classes"])
-            token_tags.extend(cell.text_["token_tags"])
-            token_class_ids.extend(cell.text_["token_class_ids"])
-            token_tag_ids.extend(cell.text_["token_tag_ids"])
-        return {
-            "text": " ".join(text),
-            "words": words,
-            "ann_ids": ann_ids,
-            "token_classes": token_classes,
-            "token_tags": token_tags,
-            "token_class_ids": token_class_ids,
-            "token_tag_ids": token_tag_ids,
-        }
+            text_ = cell.text_
+            text.append(text_.text)
+            words.extend(text_.words)
+            ann_ids.extend(text_.ann_ids)
+            token_classes.extend(text_.token_classes)
+            token_class_ann_ids.extend(text_.token_class_ann_ids)
+            token_tags.extend(text_.token_tags)
+            token_tag_ann_ids.extend(text_.token_tag_ann_ids)
+            token_class_ids.extend(text_.token_class_ids)
+            token_tag_ids.extend(text_.token_tag_ids)
+        return Text_(
+            text=" ".join(text),
+            words=words,
+            ann_ids=ann_ids,
+            token_classes=token_classes,
+            token_class_ann_ids=token_class_ann_ids,
+            token_tags=token_tags,
+            token_tag_ann_ids=token_tag_ann_ids,
+            token_class_ids=token_class_ids,
+            token_tag_ids=token_tag_ids,
+        )
     @property
     def words(self) -> list[ImageAnnotationBaseView]:
@@ -1051,7 +1138,7 @@ class Page(Image):
             ```python
                 {"text": text string,
-                 "text_list": list of single words,
+                 "words": list of single words,
                  "annotation_ids": word annotation ids}
          ```
         """
@@ -1060,26 +1147,34 @@ class Page(Image):
         words: list[str] = []
         ann_ids: list[str] = []
         token_classes: list[str] = []
+        token_class_ann_ids: list[str] = []
         token_tags: list[str] = []
+        token_tag_ann_ids: list[str] = []
         token_class_ids: list[str] = []
         token_tag_ids: list[str] = []
         for block in block_with_order:
-            text.append(block.text_["text"])  # type: ignore
-            words.extend(block.text_["words"])  # type: ignore
-            ann_ids.extend(block.text_["ann_ids"])  # type: ignore
-            token_classes.extend(block.text_["token_classes"])  # type: ignore
-            token_tags.extend(block.text_["token_tags"])  # type: ignore
-            token_class_ids.extend(block.text_["token_class_ids"])  # type: ignore
-            token_tag_ids.extend(block.text_["token_tag_ids"])  # type: ignore
-        return {
-            "text": " ".join(text),
-            "words": words,
-            "ann_ids": ann_ids,
-            "token_classes": token_classes,
-            "token_tags": token_tags,
-            "token_class_ids": token_class_ids,
-            "token_tag_ids": token_tag_ids,
-        }
+            text_ = block.text_
+            text.append(text_.text)  # type: ignore
+            words.extend(text_.words)  # type: ignore
+            ann_ids.extend(text_.ann_ids)  # type: ignore
+            token_classes.extend(text_.token_classes)  # type: ignore
+            token_class_ann_ids.extend(text_.token_class_ann_ids)  # type: ignore
+            token_tags.extend(text_.token_tags)  # type: ignore
+            token_tag_ann_ids.extend(text_.token_tag_ann_ids)  # type: ignore
+            token_class_ids.extend(text_.token_class_ids)  # type: ignore
+            token_tag_ids.extend(text_.token_tag_ids)  # type: ignore
+        return Text_(
+            text=" ".join(text),
+            words=words,
+            ann_ids=ann_ids,
+            token_classes=token_classes,
+            token_class_ann_ids=token_class_ann_ids,
+            token_tags=token_tags,
+            token_tag_ann_ids=token_tag_ann_ids,
+            token_class_ids=token_class_ids,
+            token_tag_ids=token_tag_ann_ids,
+        )
     def get_layout_context(self, annotation_id: str, context_size: int = 3) -> list[ImageAnnotationBaseView]:
         """

deepdoctection/datasets/base.py CHANGED Viewed

@@ -25,14 +25,15 @@ import os
 import pprint
 from abc import ABC, abstractmethod
 from collections import defaultdict
+from dataclasses import dataclass, field
 from inspect import signature
 from pathlib import Path
-from typing import Any, Mapping, Optional, Sequence, Type, Union
+from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union
 import numpy as np
 from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
 from ..utils.types import PathLikeOrStr
@@ -405,6 +406,194 @@ class MergeDataset(DatasetBase):
         self._dataflow_builder.categories = self._categories()
+class DatasetCardDict(TypedDict):
+    """DatasetCardDict"""
+    name: str
+    dataset_type: Union[str, Any]
+    location: str
+    init_categories: Sequence[Any]
+    init_sub_categories: dict[Any, dict[Any, list[Any]]]
+    annotation_files: Optional[dict[Any, Union[Any, Sequence[Any]]]]
+    description: str
+    service_id_to_meta_annotation: dict[str, Any]
+# Usage:
+# def as_dict(self, ...) -> DatasetCardDict:
+@dataclass
+class DatasetCard:
+    """
+    An immutable dataclass representing the metadata of a dataset, including categories, sub-categories,
+    storage location, annotation files, and description. It facilitates management and consistency checks
+    for annotations generated by pipeline components.
+    Attributes:
+        name: Name of the dataset.
+        dataset_type: Type of the dataset as `ObjectTypes`.
+        location: Storage location of the dataset as `Path`.
+        init_categories: List of all initial categories (`ObjectTypes`) present in the dataset.
+        init_sub_categories: Mapping from main categories to sub-categories and their possible values.
+        annotation_files: Optional mapping from split names to annotation files.
+        description: Description of the dataset.
+        service_id_to_meta_annotation: Mapping from service IDs to `MetaAnnotation` objects, storing
+            annotation structure for different pipeline components.
+    """
+    name: str
+    dataset_type: ObjectTypes
+    location: Path
+    init_categories: list[ObjectTypes] = field(default_factory=list)
+    init_sub_categories: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]] = field(default_factory=dict)
+    annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None
+    description: str = field(default="")
+    service_id_to_meta_annotation: dict[str, MetaAnnotation] = field(default_factory=dict)
+    def save_dataset_card(self, file_path: Union[str, Path]) -> None:
+        """Save the DatasetCard instance as a JSON file."""
+        with open(file_path, "w", encoding="utf-8") as f:
+            json.dump(self.as_dict(), f, indent=4)
+    @staticmethod
+    def load_dataset_card(file_path: PathLikeOrStr) -> DatasetCard:
+        """Load a DatasetCard instance from a JSON file."""
+        with open(file_path, "r", encoding="utf-8") as f:
+            data = json.load(f)
+            service_id_to_meta_annotation = {}
+            if "service_id_to_meta_annotation" in data:
+                for service_id, meta_ann_dict in data.pop("service_id_to_meta_annotation").items():
+                    meta_ann_dict["image_annotations"] = tuple(
+                        get_type(cat) for cat in meta_ann_dict["image_annotations"]
+                    )
+                    meta_ann_dict["sub_categories"] = {
+                        get_type(cat): {
+                            get_type(sub_cat): set({get_type(value) for value in values})
+                            for sub_cat, values in sub_cats.items()
+                        }
+                        for cat, sub_cats in meta_ann_dict["sub_categories"].items()
+                    }
+                    meta_ann_dict["relationships"] = {
+                        get_type(key): set({get_type(value) for value in values})
+                        for key, values in meta_ann_dict["relationships"].items()
+                    }
+                    meta_ann_dict["summaries"] = tuple(get_type(val) for val in meta_ann_dict["summaries"])
+                    service_id_to_meta_annotation[service_id] = MetaAnnotation(**meta_ann_dict)
+                data["service_id_to_meta_annotation"] = service_id_to_meta_annotation
+        return DatasetCard(**data)
+    def as_dict(self, keep_object_types: bool = False) -> DatasetCardDict:
+        """Convert the DatasetCard to a dictionary."""
+        if keep_object_types:
+            return {
+                "name": self.name,
+                "dataset_type": self.dataset_type,
+                "location": self.location.as_posix(),
+                "init_categories": self.init_categories,
+                "init_sub_categories": self.init_sub_categories,
+                "annotation_files": self.annotation_files,  # type: ignore
+                "description": self.description,
+                "service_id_to_meta_annotation": {
+                    key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
+                },
+            }
+        return {
+            "name": self.name,
+            "dataset_type": self.dataset_type.value,
+            "location": self.location.as_posix(),
+            "init_categories": [cat.value for cat in self.init_categories],
+            "init_sub_categories": {
+                cat.value: {
+                    sub_cat.value: list({value.value for value in values}) for sub_cat, values in sub_cats.items()
+                }
+                for cat, sub_cats in self.init_sub_categories.items()
+            },
+            "annotation_files": self.annotation_files,  # type: ignore
+            "description": self.description,
+            "service_id_to_meta_annotation": {
+                key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
+            },
+        }
+    def update_from_pipeline(
+        self, meta_annotations: MetaAnnotation, service_id_to_meta_annotation: Mapping[str, MetaAnnotation]
+    ) -> None:
+        """
+        Update the initial categories, sub-categories, and service ID to `MetaAnnotation` mapping
+        based on the results from a pipeline.
+        ```python
+           analyzer = dd.get_dd_analyzer(config_overwrite=["USE_OCR=True","USE_TABLE_SEGMENTATION=True"])
+           meta_annotations = analyzer.get_meta_annotation()
+           service_id_to_meta_annotation = analyzer.get_service_id_to_meta_annotation()
+           card.update_from_pipeline(meta_annotations, service_id_to_meta_annotation)
+        ```
+        Args:
+            meta_annotations: A `MetaAnnotation` object containing new or updated categories and sub-categories.
+            service_id_to_meta_annotation: A mapping from service IDs to `MetaAnnotation` objects generated by the
+             pipeline.
+        Adds any missing categories, sub-categories, and values to the respective attributes of the instance.
+        """
+        for category in meta_annotations.image_annotations:
+            if category not in self.init_categories:
+                self.init_categories.append(category)
+        for cat, sub_cats in meta_annotations.sub_categories.items():
+            if cat not in self.init_sub_categories:
+                self.init_sub_categories[cat] = {}
+            for sub_cat, values in sub_cats.items():
+                if sub_cat not in self.init_sub_categories[cat]:
+                    self.init_sub_categories[cat][sub_cat] = []
+                for value in values:
+                    if value not in self.init_sub_categories[cat][sub_cat]:
+                        self.init_sub_categories[cat][sub_cat].append(value)
+        for service_id, meta_annotation in service_id_to_meta_annotation.items():
+            if service_id not in self.service_id_to_meta_annotation:
+                self.service_id_to_meta_annotation[service_id] = meta_annotation
+    def __post_init__(self) -> None:
+        """
+        Perform internal consistency checks ensuring `init_categories` and
+        `init_sub_categories` align with `service_id_to_meta_annotation`.
+        """
+        self.dataset_type = get_type(self.dataset_type)
+        self.location = Path(self.location)
+        self.init_categories = [get_type(cat) for cat in self.init_categories]
+        self.init_sub_categories = {
+            get_type(outer_key): {
+                get_type(inner_key): [get_type(value) for value in inner_values]
+                for inner_key, inner_values in outer_value.items()
+            }
+            for outer_key, outer_value in self.init_sub_categories.items()
+        }
+        if self.service_id_to_meta_annotation is None:
+            return
+        # Check compatibility of image_annotations with init_categories
+        for service_id, meta_annotation in self.service_id_to_meta_annotation.items():
+            for annotation in meta_annotation.image_annotations:
+                if annotation not in self.init_categories:
+                    raise ValueError(
+                        f"Image annotation '{annotation}' in service ID '{service_id}' is not "
+                        f"present in `init_categories`."
+                    )
+            # Check compatibility of sub_categories
+            for cat, sub_cats in meta_annotation.sub_categories.items():
+                if not (
+                    cat in self.init_sub_categories
+                    and all(sub_cat in self.init_sub_categories[cat] for sub_cat in sub_cats)
+                ):
+                    raise ValueError(
+                        f"Sub-categories for category '{cat}' in service ID '{service_id}' "
+                        f"do not match with `init_sub_categories`."
+                    )
 class CustomDataset(DatasetBase):
     """
     A simple dataset interface that implements the boilerplate code and reduces complexity by merely leaving
@@ -512,53 +701,9 @@ class CustomDataset(DatasetBase):
         Returns:
             A CustomDataset instance created from the dataset card.
         """
-        with open(file_path, "r", encoding="UTF-8") as file:
-            meta_data = json.load(file)
-        meta_data["dataset_type"] = get_type(meta_data["dataset_type"])
-        meta_data["location"] = Path(meta_data["location"])
-        meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
-        meta_data["init_sub_categories"] = (
-            {
-                get_type(cat): {
-                    get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
-                    for sub_cat_key, sub_cat_values in sub_cats.items()
-                }
-                for cat, sub_cats in meta_data["init_sub_categories"].items()
-            }
-            if meta_data["init_sub_categories"] is not None
-            else None
+        dataset_card = DatasetCard.load_dataset_card(file_path)
+        dataset_card_as_dict = dataset_card.as_dict(True)
+        dataset_card_as_dict.pop("service_id_to_meta_annotation")  # type: ignore  # pylint: disable=E1123
+        return CustomDataset(  # pylint: disable=E1123
+            **dataset_card_as_dict, dataflow_builder=dataflow_builder  # type: ignore
         )
-        return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
-    def as_dict(self) -> Mapping[str, Any]:
-        """
-        Return:
-           The meta-data of the dataset as a dictionary.
-        """
-        return {
-            "name": self.name,
-            "dataset_type": self.type,
-            "location": str(self.location),
-            "annotation_files": self.annotation_files,
-            "init_categories": [cat.value for cat in self.init_categories],
-            "init_sub_categories": {
-                cat.value: {
-                    sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
-                    for sub_cat_key, sub_cat_values in sub_cats.items()
-                }
-                for cat, sub_cats in self.init_sub_categories.items()
-            }
-            if self.init_sub_categories is not None
-            else None,
-        }
-    def save_dataset_card(self, file_path: str) -> None:
-        """
-        Save the dataset card to a `JSON` file.
-        Args:
-            file_path: file_path
-        """
-        with open(file_path, "w", encoding="UTF-8") as file:
-            json.dump(self.as_dict(), file, indent=4)

deepdoctection 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl