deepdoctection 0.43.5__tar.gz → 0.44.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (155) hide show
  1. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/PKG-INFO +3 -3
  2. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/README.md +2 -2
  3. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/__init__.py +3 -1
  4. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/config.py +1 -1
  5. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/profiles.jsonl +1 -0
  6. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/__init__.py +1 -1
  7. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/image.py +49 -1
  8. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/view.py +27 -13
  9. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/base.py +195 -51
  10. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/model.py +1 -1
  11. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/base.py +29 -25
  12. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/common.py +2 -2
  13. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/concurrency.py +2 -2
  14. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/language.py +2 -2
  15. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/layout.py +2 -2
  16. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/lm.py +13 -3
  17. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/order.py +9 -5
  18. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/refine.py +7 -7
  19. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/segment.py +30 -30
  20. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/sub_layout.py +2 -2
  21. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/text.py +10 -5
  22. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/transform.py +2 -4
  23. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/PKG-INFO +3 -3
  24. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/LICENSE +0 -0
  25. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/__init__.py +0 -0
  26. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/dd.py +0 -0
  27. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/analyzer/factory.py +0 -0
  28. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/__init__.py +0 -0
  29. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/conf_dd_one.yaml +0 -0
  30. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  31. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/__init__.py +0 -0
  32. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/base.py +0 -0
  33. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/common.py +0 -0
  34. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/custom.py +0 -0
  35. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/custom_serialize.py +0 -0
  36. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/parallel_map.py +0 -0
  37. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/serialize.py +0 -0
  38. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/dataflow/stats.py +0 -0
  39. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/annotation.py +0 -0
  40. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/box.py +0 -0
  41. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datapoint/convert.py +0 -0
  42. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/__init__.py +0 -0
  43. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/adapter.py +0 -0
  44. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/dataflow_builder.py +0 -0
  45. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/info.py +0 -0
  46. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/__init__.py +0 -0
  47. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/doclaynet.py +0 -0
  48. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/fintabnet.py +0 -0
  49. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/funsd.py +0 -0
  50. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
  51. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/layouttest.py +0 -0
  52. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/publaynet.py +0 -0
  53. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
  54. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
  55. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
  56. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xfund.py +0 -0
  57. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  58. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  59. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/registry.py +0 -0
  60. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/datasets/save.py +0 -0
  61. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/__init__.py +0 -0
  62. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/accmetric.py +0 -0
  63. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/base.py +0 -0
  64. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/cocometric.py +0 -0
  65. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/eval.py +0 -0
  66. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/registry.py +0 -0
  67. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/tedsmetric.py +0 -0
  68. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/eval/tp_eval_callback.py +0 -0
  69. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/__init__.py +0 -0
  70. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/base.py +0 -0
  71. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/d2detect.py +0 -0
  72. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/deskew.py +0 -0
  73. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/doctrocr.py +0 -0
  74. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/fastlang.py +0 -0
  75. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/hfdetr.py +0 -0
  76. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/hflayoutlm.py +0 -0
  77. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/hflm.py +0 -0
  78. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pdftext.py +0 -0
  79. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pt/__init__.py +0 -0
  80. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pt/nms.py +0 -0
  81. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/pt/ptutils.py +0 -0
  82. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tessocr.py +0 -0
  83. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/texocr.py +0 -0
  84. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/__init__.py +0 -0
  85. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tfutils.py +0 -0
  86. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpcompat.py +0 -0
  87. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  88. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
  89. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  90. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
  91. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  92. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
  93. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
  94. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
  95. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
  96. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
  97. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
  98. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
  99. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
  100. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
  101. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
  102. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  103. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
  104. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  105. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/extern/tpdetect.py +0 -0
  106. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/__init__.py +0 -0
  107. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/cats.py +0 -0
  108. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/cocostruct.py +0 -0
  109. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/d2struct.py +0 -0
  110. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/hfstruct.py +0 -0
  111. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/laylmstruct.py +0 -0
  112. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/maputils.py +0 -0
  113. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/match.py +0 -0
  114. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/misc.py +0 -0
  115. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/pascalstruct.py +0 -0
  116. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/prodigystruct.py +0 -0
  117. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/pubstruct.py +0 -0
  118. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/tpstruct.py +0 -0
  119. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/mapper/xfundstruct.py +0 -0
  120. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/__init__.py +0 -0
  121. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/anngen.py +0 -0
  122. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/doctectionpipe.py +0 -0
  123. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/pipe/registry.py +0 -0
  124. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/py.typed +0 -0
  125. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/__init__.py +0 -0
  126. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/d2_frcnn_train.py +0 -0
  127. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/hf_detr_train.py +0 -0
  128. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/hf_layoutlm_train.py +0 -0
  129. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/train/tp_frcnn_train.py +0 -0
  130. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/__init__.py +0 -0
  131. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/concurrency.py +0 -0
  132. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/context.py +0 -0
  133. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/develop.py +0 -0
  134. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/env_info.py +0 -0
  135. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/error.py +0 -0
  136. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/file_utils.py +0 -0
  137. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/fs.py +0 -0
  138. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/identifier.py +0 -0
  139. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/logger.py +0 -0
  140. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/metacfg.py +0 -0
  141. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/mocks.py +0 -0
  142. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/pdf_utils.py +0 -0
  143. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/settings.py +0 -0
  144. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/tqdm.py +0 -0
  145. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/transform.py +0 -0
  146. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/types.py +0 -0
  147. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/utils.py +0 -0
  148. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection/utils/viz.py +0 -0
  149. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/SOURCES.txt +0 -0
  150. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/dependency_links.txt +0 -0
  151. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/requires.txt +0 -0
  152. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/deepdoctection.egg-info/top_level.txt +0 -0
  153. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/setup.cfg +0 -0
  154. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/setup.py +0 -0
  155. {deepdoctection-0.43.5 → deepdoctection-0.44.0}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deepdoctection
3
- Version: 0.43.5
3
+ Version: 0.44.0
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -321,7 +321,7 @@ For a simple setup which is enough to parse documents with the default setting,
321
321
 
322
322
  ```
323
323
  pip install transformers
324
- pip install python-doctr
324
+ pip install python-doctr==0.9.0
325
325
  pip install deepdoctection
326
326
  ```
327
327
 
@@ -329,7 +329,7 @@ pip install deepdoctection
329
329
 
330
330
  ```
331
331
  pip install tensorpack
332
- pip install python-doctr
332
+ pip install python-doctr==0.9.0
333
333
  pip install deepdoctection
334
334
  ```
335
335
 
@@ -178,7 +178,7 @@ For a simple setup which is enough to parse documents with the default setting,
178
178
 
179
179
  ```
180
180
  pip install transformers
181
- pip install python-doctr
181
+ pip install python-doctr==0.9.0
182
182
  pip install deepdoctection
183
183
  ```
184
184
 
@@ -186,7 +186,7 @@ pip install deepdoctection
186
186
 
187
187
  ```
188
188
  pip install tensorpack
189
- pip install python-doctr
189
+ pip install python-doctr==0.9.0
190
190
  pip install deepdoctection
191
191
  ```
192
192
 
@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
25
25
 
26
26
  # pylint: enable=wrong-import-position
27
27
 
28
- __version__ = "0.43.5"
28
+ __version__ = "0.44.0"
29
29
 
30
30
  _IMPORT_STRUCTURE = {
31
31
  "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
@@ -92,6 +92,7 @@ _IMPORT_STRUCTURE = {
92
92
  "convert_pdf_bytes_to_np_array_v2",
93
93
  "as_dict",
94
94
  "ImageAnnotationBaseView",
95
+ "MetaAnnotation",
95
96
  "Image",
96
97
  "Word",
97
98
  "Layout",
@@ -105,6 +106,7 @@ _IMPORT_STRUCTURE = {
105
106
  "DatasetAdapter",
106
107
  "DatasetBase",
107
108
  "MergeDataset",
109
+ "DatasetCard",
108
110
  "CustomDataset",
109
111
  "DataFlowBaseBuilder",
110
112
  "DatasetInfo",
@@ -629,7 +629,7 @@ cfg.PT.ENFORCE_WEIGHTS.ITEM = True
629
629
 
630
630
  # Specifies the PyTorch model weights for item detection.
631
631
  # Use either .pt or .safetensors files.
632
- cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin"
632
+ cfg.PT.ITEM.WEIGHTS = "deepdoctection/tatr_tab_struct_v2/model.safetensors"
633
633
 
634
634
  # Specifies the TorchScript model for item detection.
635
635
  # Use .ts files for deployment without model implementation dependencies.
@@ -30,3 +30,4 @@
30
30
  {"name": "Felix92/doctr-torch-parseq-multilingual-v1/pytorch_model.bin", "description": "", "size": [63286381], "tp_model": false, "config": "Felix92/doctr-torch-parseq-multilingual-v1/config.json", "preprocessor_config": null, "hf_repo_id": "Felix92/doctr-torch-parseq-multilingual-v1", "hf_model_name": "pytorch_model.bin", "hf_config_file": ["config.json"], "urls": null, "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "parseq", "padding": null}
31
31
  {"name": "doctr/crnn_vgg16_bn/pt/master-fde31e4a.pt", "description": "MASTER", "size": [63286381], "tp_model": false, "config": null, "preprocessor_config": null, "hf_repo_id": null, "hf_model_name": null, "hf_config_file": null, "urls": ["https://doctr-static.mindee.com/models?id=v0.7.0/master-fde31e4a.pt&src=0"], "categories": {}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "DoctrTextRecognizer", "architecture": "master", "padding": null}
32
32
  {"name": "Aryn/deformable-detr-DocLayNet/model.safetensors", "description": "Deformable DEtection TRansformer (DETR), trained on DocLayNet (including 80k annotated pages in 11 classes).", "size": [115511753], "tp_model": false, "config": "Aryn/deformable-detr-DocLayNet/config.json", "preprocessor_config": "Aryn/deformable-detr-DocLayNet/preprocessor_config.json", "hf_repo_id": "Aryn/deformable-detr-DocLayNet", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "default_type", "2": "caption", "11": "text", "12": "title", "3": "footnote", "4": "formula", "5": "list_item", "6": "page_footer", "7": "page_header", "8": "figure", "9": "section_header", "10": "table"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
33
+ {"name": "deepdoctection/tatr_tab_struct_v2/model.safetensors", "description": "Table Transformer (DETR) model trained on PubTables1M. It was introduced in the paper Aligning benchmark datasets for table structure recognition by Smock et al. This model is devoted to table structure recognition and assumes to receive a slightly cropped table as input. It will predict rows, columns and spanning cells. Use a padding of around 5 pixels. This artefact has been converted from deepdoctection/tatr_tab_struct_v2/pytorch_model.bin and should be used to reduce security issues", "size": [115511753], "tp_model": false, "config": "deepdoctection/tatr_tab_struct_v2/config.json", "preprocessor_config": "deepdoctection/tatr_tab_struct_v2/preprocessor_config.json", "hf_repo_id": "deepdoctection/tatr_tab_struct_v2", "hf_model_name": "model.safetensors", "hf_config_file": ["config.json", "preprocessor_config.json"], "urls": null, "categories": {"1": "table", "2": "column", "3": "row", "4": "column_header", "5": "projected_row_header", "6": "spanning"}, "categories_orig": null, "dl_library": "PT", "model_wrapper": "HFDetrDerivedDetector", "architecture": null, "padding": null}
@@ -34,5 +34,5 @@ After all, the point here is not to provide an optimal processing environment.
34
34
  from .annotation import *
35
35
  from .box import *
36
36
  from .convert import *
37
- from .image import Image
37
+ from .image import Image, MetaAnnotation
38
38
  from .view import *
@@ -25,7 +25,7 @@ from collections import defaultdict
25
25
  from dataclasses import dataclass, field
26
26
  from os import environ, fspath
27
27
  from pathlib import Path
28
- from typing import Any, Optional, Sequence, Union, no_type_check
28
+ from typing import Any, Optional, Sequence, TypedDict, Union, no_type_check
29
29
 
30
30
  import numpy as np
31
31
  from numpy import uint8
@@ -40,6 +40,54 @@ from .box import crop_box_from_image, global_to_local_coords, intersection_box
40
40
  from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
41
41
 
42
42
 
43
+ class MetaAnnotationDict(TypedDict):
44
+ """MetaAnnotationDict"""
45
+ image_annotations: list[str]
46
+ sub_categories: dict[str, dict[str, list[str]]]
47
+ relationships: dict[str, list[str]]
48
+ summaries: list[str]
49
+
50
+
51
+ @dataclass(frozen=True)
52
+ class MetaAnnotation:
53
+ """
54
+ An immutable dataclass that stores information about what `Image` are being
55
+ modified through a pipeline component.
56
+
57
+ Attributes:
58
+ image_annotations: Tuple of `ObjectTypes` representing image annotations.
59
+ sub_categories: Dictionary mapping `ObjectTypes` to dicts of `ObjectTypes` to sets of `ObjectTypes`
60
+ for sub-categories.
61
+ relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
62
+ summaries: Tuple of `ObjectTypes` representing summaries.
63
+ """
64
+
65
+ image_annotations: tuple[ObjectTypes, ...] = field(default=())
66
+ sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = field(default_factory=dict)
67
+ relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
68
+ summaries: tuple[ObjectTypes, ...] = field(default=())
69
+
70
+ def as_dict(self) -> MetaAnnotationDict:
71
+ """
72
+ Returns the MetaAnnotation as a dictionary, with all `ObjectTypes` converted to strings.
73
+
74
+ Returns:
75
+ A dictionary representation of the MetaAnnotation where all `ObjectTypes` are converted to strings.
76
+ """
77
+ return {
78
+ "image_annotations": [obj.value for obj in self.image_annotations],
79
+ "sub_categories": {
80
+ outer_key.value: {
81
+ inner_key.value: [val.value for val in inner_values]
82
+ for inner_key, inner_values in outer_value.items()
83
+ }
84
+ for outer_key, outer_value in self.sub_categories.items()
85
+ },
86
+ "relationships": {key.value: [val.value for val in values] for key, values in self.relationships.items()},
87
+ "summaries": [obj.value for obj in self.summaries],
88
+ }
89
+
90
+
43
91
  @dataclass
44
92
  class Image:
45
93
  """
@@ -195,7 +195,9 @@ class Word(ImageAnnotationBaseView):
195
195
  attr_names = (
196
196
  set(WordType)
197
197
  .union(super().get_attribute_names())
198
- .union({Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK})
198
+ .union(
199
+ {Relationships.READING_ORDER, Relationships.LAYOUT_LINK, Relationships.LINK, Relationships.SUCCESSOR}
200
+ )
199
201
  )
200
202
  return {attr_name.value if isinstance(attr_name, ObjectTypes) else attr_name for attr_name in attr_names}
201
203
 
@@ -384,16 +386,10 @@ class Table(Layout):
384
386
  Returns:
385
387
  A list of a table cells.
386
388
  """
387
- all_relation_ids = self.get_relationship(Relationships.CHILD)
388
- cell_anns: list[Cell] = self.base_page.get_annotation( # type: ignore
389
- annotation_ids=all_relation_ids,
390
- category_names=[
391
- LayoutType.CELL,
392
- CellType.HEADER,
393
- CellType.BODY,
394
- CellType.SPANNING,
395
- ],
396
- )
389
+ cell_anns: list[Cell] = []
390
+ for row_number in range(1, self.number_of_rows + 1): # type: ignore
391
+ cell_anns.extend(self.row(row_number)) # type: ignore
392
+
397
393
  return cell_anns
398
394
 
399
395
  @property
@@ -592,6 +588,18 @@ class Table(Layout):
592
588
  )
593
589
  return table_list
594
590
 
591
+ @property
592
+ def csv_(self) -> list[list[list[Text_]]]:
593
+ """
594
+ Returns:
595
+ A csv-style representation of a table as list of lists of cell.text_.
596
+ """
597
+ cells = self.cells
598
+ table_list = [[[] for _ in range(self.number_of_columns)] for _ in range(self.number_of_rows)] # type: ignore
599
+ for cell in cells:
600
+ table_list[cell.row_number - 1][cell.column_number - 1].append(cell.text_) # type: ignore
601
+ return table_list
602
+
595
603
  def __str__(self) -> str:
596
604
  out = " ".join([" ".join(row + ["\n"]) for row in self.csv])
597
605
  return out
@@ -599,7 +607,13 @@ class Table(Layout):
599
607
  @property
600
608
  def text(self) -> str:
601
609
  try:
602
- return str(self)
610
+ cells = self.cells
611
+ if not cells:
612
+ return super().text
613
+ text_list: list[str] = []
614
+ for cell in cells:
615
+ text_list.append(cell.text)
616
+ return " ".join(text_list)
603
617
  except (TypeError, AnnotationError):
604
618
  return super().text
605
619
 
@@ -616,7 +630,7 @@ class Table(Layout):
616
630
  token_class_ids: list[str] = []
617
631
  token_tag_ids: list[str] = []
618
632
  for cell in cells:
619
- text.extend(cell.text_["text"])
633
+ text.append(cell.text_["text"])
620
634
  words.extend(cell.text_["words"])
621
635
  ann_ids.extend(cell.text_["ann_ids"])
622
636
  token_classes.extend(cell.text_["token_classes"])
@@ -25,14 +25,15 @@ import os
25
25
  import pprint
26
26
  from abc import ABC, abstractmethod
27
27
  from collections import defaultdict
28
+ from dataclasses import dataclass, field
28
29
  from inspect import signature
29
30
  from pathlib import Path
30
- from typing import Any, Mapping, Optional, Sequence, Type, Union
31
+ from typing import Any, Mapping, Optional, Sequence, Type, TypedDict, Union
31
32
 
32
33
  import numpy as np
33
34
 
34
35
  from ..dataflow import CacheData, ConcatData, CustomDataFromList, DataFlow
35
- from ..datapoint.image import Image
36
+ from ..datapoint.image import Image, MetaAnnotation
36
37
  from ..utils.logger import LoggingRecord, logger
37
38
  from ..utils.settings import DatasetType, ObjectTypes, TypeOrStr, get_type
38
39
  from ..utils.types import PathLikeOrStr
@@ -405,6 +406,193 @@ class MergeDataset(DatasetBase):
405
406
  self._dataflow_builder.categories = self._categories()
406
407
 
407
408
 
409
+ class DatasetCardDict(TypedDict):
410
+ """DatasetCardDict"""
411
+ name: str
412
+ dataset_type: Union[str, Any]
413
+ location: str
414
+ init_categories: Sequence[Any]
415
+ init_sub_categories: dict[Any, dict[Any, list[Any]]]
416
+ annotation_files: Optional[dict[Any, Union[Any, Sequence[Any]]]]
417
+ description: str
418
+ service_id_to_meta_annotation: dict[str, Any]
419
+
420
+
421
+ # Usage:
422
+ # def as_dict(self, ...) -> DatasetCardDict:
423
+
424
+
425
+ @dataclass
426
+ class DatasetCard:
427
+ """
428
+ An immutable dataclass representing the metadata of a dataset, including categories, sub-categories,
429
+ storage location, annotation files, and description. It facilitates management and consistency checks
430
+ for annotations generated by pipeline components.
431
+
432
+ Attributes:
433
+ name: Name of the dataset.
434
+ dataset_type: Type of the dataset as `ObjectTypes`.
435
+ location: Storage location of the dataset as `Path`.
436
+ init_categories: List of all initial categories (`ObjectTypes`) present in the dataset.
437
+ init_sub_categories: Mapping from main categories to sub-categories and their possible values.
438
+ annotation_files: Optional mapping from split names to annotation files.
439
+ description: Description of the dataset.
440
+ service_id_to_meta_annotation: Mapping from service IDs to `MetaAnnotation` objects, storing
441
+ annotation structure for different pipeline components.
442
+ """
443
+
444
+ name: str
445
+ dataset_type: ObjectTypes
446
+ location: Path
447
+ init_categories: list[ObjectTypes] = field(default_factory=list)
448
+ init_sub_categories: dict[ObjectTypes, dict[ObjectTypes, list[ObjectTypes]]] = field(default_factory=dict)
449
+ annotation_files: Optional[Mapping[str, Union[str, Sequence[str]]]] = None
450
+ description: str = field(default="")
451
+ service_id_to_meta_annotation: dict[str, MetaAnnotation] = field(default_factory=dict)
452
+
453
+ def save_dataset_card(self, file_path: Union[str, Path]) -> None:
454
+ """Save the DatasetCard instance as a JSON file."""
455
+ with open(file_path, "w", encoding="utf-8") as f:
456
+ json.dump(self.as_dict(), f, indent=4)
457
+
458
+ @staticmethod
459
+ def load_dataset_card(file_path: PathLikeOrStr) -> DatasetCard:
460
+ """Load a DatasetCard instance from a JSON file."""
461
+ with open(file_path, "r", encoding="utf-8") as f:
462
+ data = json.load(f)
463
+ service_id_to_meta_annotation = {}
464
+ if "service_id_to_meta_annotation" in data:
465
+ for service_id, meta_ann_dict in data.pop("service_id_to_meta_annotation").items():
466
+ meta_ann_dict["image_annotations"] = tuple(
467
+ get_type(cat) for cat in meta_ann_dict["image_annotations"]
468
+ )
469
+ meta_ann_dict["sub_categories"] = {
470
+ get_type(cat): {
471
+ get_type(sub_cat): set({get_type(value) for value in values})
472
+ for sub_cat, values in sub_cats.items()
473
+ }
474
+ for cat, sub_cats in meta_ann_dict["sub_categories"].items()
475
+ }
476
+ meta_ann_dict["relationships"] = {
477
+ get_type(key): set({get_type(value) for value in values})
478
+ for key, values in meta_ann_dict["relationships"].items()
479
+ }
480
+ meta_ann_dict["summaries"] = tuple(get_type(val) for val in meta_ann_dict["summaries"])
481
+ service_id_to_meta_annotation[service_id] = MetaAnnotation(**meta_ann_dict)
482
+ data["service_id_to_meta_annotation"] = service_id_to_meta_annotation
483
+ return DatasetCard(**data)
484
+
485
+ def as_dict(self, keep_object_types: bool = False) -> DatasetCardDict:
486
+ """Convert the DatasetCard to a dictionary."""
487
+ if keep_object_types:
488
+ return {
489
+ "name": self.name,
490
+ "dataset_type": self.dataset_type,
491
+ "location": self.location.as_posix(),
492
+ "init_categories": self.init_categories,
493
+ "init_sub_categories": self.init_sub_categories,
494
+ "annotation_files": self.annotation_files, # type: ignore
495
+ "description": self.description,
496
+ "service_id_to_meta_annotation": {
497
+ key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
498
+ },
499
+ }
500
+ return {
501
+ "name": self.name,
502
+ "dataset_type": self.dataset_type.value,
503
+ "location": self.location.as_posix(),
504
+ "init_categories": [cat.value for cat in self.init_categories],
505
+ "init_sub_categories": {
506
+ cat.value: {
507
+ sub_cat.value: list({value.value for value in values}) for sub_cat, values in sub_cats.items()
508
+ }
509
+ for cat, sub_cats in self.init_sub_categories.items()
510
+ },
511
+ "annotation_files": self.annotation_files, # type: ignore
512
+ "description": self.description,
513
+ "service_id_to_meta_annotation": {
514
+ key: val.as_dict() for key, val in self.service_id_to_meta_annotation.items()
515
+ },
516
+ }
517
+
518
+ def update_from_pipeline(
519
+ self, meta_annotations: MetaAnnotation, service_id_to_meta_annotation: Mapping[str, MetaAnnotation]
520
+ ) -> None:
521
+ """
522
+ Update the initial categories, sub-categories, and service ID to `MetaAnnotation` mapping
523
+ based on the results from a pipeline.
524
+
525
+ ```python
526
+ analyzer = dd.get_dd_analyzer(config_overwrite=["USE_OCR=True","USE_TABLE_SEGMENTATION=True"])
527
+ meta_annotations = analyzer.get_meta_annotation()
528
+ service_id_to_meta_annotation = analyzer.get_service_id_to_meta_annotation()
529
+ card.update_from_pipeline(meta_annotations, service_id_to_meta_annotation)
530
+ ```
531
+
532
+ Args:
533
+ meta_annotations: A `MetaAnnotation` object containing new or updated categories and sub-categories.
534
+ service_id_to_meta_annotation: A mapping from service IDs to `MetaAnnotation` objects generated by the
535
+ pipeline.
536
+
537
+ Adds any missing categories, sub-categories, and values to the respective attributes of the instance.
538
+ """
539
+ for category in meta_annotations.image_annotations:
540
+ if category not in self.init_categories:
541
+ self.init_categories.append(category)
542
+ for cat, sub_cats in meta_annotations.sub_categories.items():
543
+ if cat not in self.init_sub_categories:
544
+ self.init_sub_categories[cat] = {}
545
+ for sub_cat, values in sub_cats.items():
546
+ if sub_cat not in self.init_sub_categories[cat]:
547
+ self.init_sub_categories[cat][sub_cat] = []
548
+ for value in values:
549
+ if value not in self.init_sub_categories[cat][sub_cat]:
550
+ self.init_sub_categories[cat][sub_cat].append(value)
551
+
552
+ for service_id, meta_annotation in service_id_to_meta_annotation.items():
553
+ if service_id not in self.service_id_to_meta_annotation:
554
+ self.service_id_to_meta_annotation[service_id] = meta_annotation
555
+
556
+ def __post_init__(self) -> None:
557
+ """
558
+ Perform internal consistency checks ensuring `init_categories` and
559
+ `init_sub_categories` align with `service_id_to_meta_annotation`.
560
+ """
561
+ self.dataset_type = get_type(self.dataset_type)
562
+ self.location = Path(self.location)
563
+ self.init_categories = [get_type(cat) for cat in self.init_categories]
564
+ self.init_sub_categories = {
565
+ get_type(outer_key): {
566
+ get_type(inner_key): [get_type(value) for value in inner_values]
567
+ for inner_key, inner_values in outer_value.items()
568
+ }
569
+ for outer_key, outer_value in self.init_sub_categories.items()
570
+ }
571
+
572
+ if self.service_id_to_meta_annotation is None:
573
+ return
574
+
575
+ # Check compatibility of image_annotations with init_categories
576
+ for service_id, meta_annotation in self.service_id_to_meta_annotation.items():
577
+ for annotation in meta_annotation.image_annotations:
578
+ if annotation not in self.init_categories:
579
+ raise ValueError(
580
+ f"Image annotation '{annotation}' in service ID '{service_id}' is not "
581
+ f"present in `init_categories`."
582
+ )
583
+
584
+ # Check compatibility of sub_categories
585
+ for cat, sub_cats in meta_annotation.sub_categories.items():
586
+ if not (
587
+ cat in self.init_sub_categories
588
+ and all(sub_cat in self.init_sub_categories[cat] for sub_cat in sub_cats)
589
+ ):
590
+ raise ValueError(
591
+ f"Sub-categories for category '{cat}' in service ID '{service_id}' "
592
+ f"do not match with `init_sub_categories`."
593
+ )
594
+
595
+
408
596
  class CustomDataset(DatasetBase):
409
597
  """
410
598
  A simple dataset interface that implements the boilerplate code and reduces complexity by merely leaving
@@ -512,53 +700,9 @@ class CustomDataset(DatasetBase):
512
700
  Returns:
513
701
  A CustomDataset instance created from the dataset card.
514
702
  """
515
-
516
- with open(file_path, "r", encoding="UTF-8") as file:
517
- meta_data = json.load(file)
518
- meta_data["dataset_type"] = get_type(meta_data["dataset_type"])
519
- meta_data["location"] = Path(meta_data["location"])
520
- meta_data["init_categories"] = [get_type(cat) for cat in meta_data["init_categories"]]
521
- meta_data["init_sub_categories"] = (
522
- {
523
- get_type(cat): {
524
- get_type(sub_cat_key): [get_type(sub_cat_value) for sub_cat_value in sub_cat_values]
525
- for sub_cat_key, sub_cat_values in sub_cats.items()
526
- }
527
- for cat, sub_cats in meta_data["init_sub_categories"].items()
528
- }
529
- if meta_data["init_sub_categories"] is not None
530
- else None
703
+ dataset_card = DatasetCard.load_dataset_card(file_path)
704
+ dataset_card_as_dict = dataset_card.as_dict(True)
705
+ dataset_card_as_dict.pop("service_id_to_meta_annotation") # type: ignore # pylint: disable=E1123
706
+ return CustomDataset( # pylint: disable=E1123
707
+ **dataset_card_as_dict, dataflow_builder=dataflow_builder # type: ignore
531
708
  )
532
- return CustomDataset(**meta_data, dataflow_builder=dataflow_builder)
533
-
534
- def as_dict(self) -> Mapping[str, Any]:
535
- """
536
- Return:
537
- The meta-data of the dataset as a dictionary.
538
- """
539
- return {
540
- "name": self.name,
541
- "dataset_type": self.type,
542
- "location": str(self.location),
543
- "annotation_files": self.annotation_files,
544
- "init_categories": [cat.value for cat in self.init_categories],
545
- "init_sub_categories": {
546
- cat.value: {
547
- sub_cat_key.value: [sub_cat_value.value for sub_cat_value in sub_cat_values]
548
- for sub_cat_key, sub_cat_values in sub_cats.items()
549
- }
550
- for cat, sub_cats in self.init_sub_categories.items()
551
- }
552
- if self.init_sub_categories is not None
553
- else None,
554
- }
555
-
556
- def save_dataset_card(self, file_path: str) -> None:
557
- """
558
- Save the dataset card to a `JSON` file.
559
-
560
- Args:
561
- file_path: file_path
562
- """
563
- with open(file_path, "w", encoding="UTF-8") as file:
564
- json.dump(self.as_dict(), file, indent=4)
@@ -306,7 +306,7 @@ class ModelCatalog:
306
306
 
307
307
  # Loading default profiles
308
308
  dd_profile_path = maybe_copy_config_to_cache(
309
- get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", False
309
+ get_package_path(), get_cache_dir_path(), "deepdoctection/configs/profiles.jsonl", True
310
310
  )
311
311
  ModelCatalog.load_profiles_from_file(dd_profile_path)
312
312
  # Additional profiles can be added
@@ -23,12 +23,11 @@ from __future__ import annotations
23
23
 
24
24
  from abc import ABC, abstractmethod
25
25
  from collections import defaultdict
26
- from dataclasses import dataclass, field
27
26
  from typing import Any, Callable, Mapping, Optional, Union
28
27
  from uuid import uuid1
29
28
 
30
29
  from ..dataflow import DataFlow, MapData
31
- from ..datapoint.image import Image
30
+ from ..datapoint.image import Image, MetaAnnotation
32
31
  from ..mapper.misc import curry
33
32
  from ..utils.context import timed_operation
34
33
  from ..utils.identifier import get_uuid_from_str
@@ -37,25 +36,6 @@ from ..utils.types import DP
37
36
  from .anngen import DatapointManager
38
37
 
39
38
 
40
- @dataclass(frozen=True)
41
- class MetaAnnotation:
42
- """
43
- A immutable dataclass that stores information about what `Image` are being
44
- modified through a pipeline component.
45
-
46
- Attributes:
47
- image_annotations: Tuple of `ObjectTypes` representing image annotations.
48
- sub_categories: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for sub-categories.
49
- relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
50
- summaries: Tuple of `ObjectTypes` representing summaries.
51
- """
52
-
53
- image_annotations: tuple[ObjectTypes, ...] = field(default=())
54
- sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
55
- relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
56
- summaries: tuple[ObjectTypes, ...] = field(default=())
57
-
58
-
59
39
  class PipelineComponent(ABC):
60
40
  """
61
41
  Base class for pipeline components.
@@ -427,15 +407,24 @@ class Pipeline(ABC):
427
407
  as well as summaries (list with sub categories).
428
408
  """
429
409
  image_annotations: list[ObjectTypes] = []
430
- sub_categories = defaultdict(set)
431
- relationships = defaultdict(set)
410
+ sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
411
+ relationships = defaultdict(set[ObjectTypes]) # type: ignore
432
412
  summaries: list[ObjectTypes] = []
433
413
  for component in self.pipe_component_list:
434
414
  meta_anns = component.get_meta_annotation()
435
415
  image_annotations.extend(meta_anns.image_annotations)
436
416
  for key, value in meta_anns.sub_categories.items():
437
- sub_categories[key].update(value)
438
- for key, value in meta_anns.relationships.items():
417
+ sub_dict = meta_anns.sub_categories[key]
418
+ for sub_cat, sub_cat_value in value.items():
419
+ if sub_cat in sub_dict:
420
+ sub_dict[sub_cat].update(sub_cat_value)
421
+ else:
422
+ sub_dict[sub_cat] = {sub_cat_value} # type: ignore
423
+ if key in sub_categories:
424
+ sub_categories[key].update(sub_dict)
425
+ else:
426
+ sub_categories[key] = sub_dict
427
+ for key, value in meta_anns.relationships.items(): # type: ignore
439
428
  relationships[key].update(value)
440
429
  summaries.extend(meta_anns.summaries)
441
430
  return MetaAnnotation(
@@ -445,6 +434,21 @@ class Pipeline(ABC):
445
434
  summaries=tuple(summaries),
446
435
  )
447
436
 
437
+ def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
438
+ """
439
+ Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
440
+
441
+ Returns:
442
+ `service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
443
+ category names and generated sub categories), relationships (dict with category names and generated
444
+ relationships) as well as summaries (list with sub categories).
445
+ """
446
+ service_id_to_meta_annotation = {}
447
+ for component in self.pipe_component_list:
448
+ meta_anns = component.get_meta_annotation()
449
+ service_id_to_meta_annotation[component.service_id] = meta_anns
450
+ return service_id_to_meta_annotation
451
+
448
452
  def get_pipeline_info(
449
453
  self, service_id: Optional[str] = None, name: Optional[str] = None
450
454
  ) -> Union[str, Mapping[str, str]]:
@@ -28,13 +28,13 @@ from typing import Literal, Mapping, Optional, Sequence, Union
28
28
  import numpy as np
29
29
 
30
30
  from ..dataflow import DataFlow, MapData
31
- from ..datapoint.image import Image
31
+ from ..datapoint.image import Image, MetaAnnotation
32
32
  from ..datapoint.view import IMAGE_DEFAULTS, Page
33
33
  from ..extern.base import DetectionResult
34
34
  from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
35
35
  from ..mapper.misc import to_image
36
36
  from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
37
- from .base import MetaAnnotation, PipelineComponent
37
+ from .base import PipelineComponent
38
38
  from .registry import pipeline_component_registry
39
39
 
40
40
  if os.environ.get("DD_USE_TORCH"):
@@ -29,11 +29,11 @@ from typing import Callable, Optional, Sequence, Union
29
29
  import tqdm
30
30
 
31
31
  from ..dataflow import DataFlow, MapData
32
- from ..datapoint.image import Image
32
+ from ..datapoint.image import Image, MetaAnnotation
33
33
  from ..utils.context import timed_operation
34
34
  from ..utils.tqdm import get_tqdm
35
35
  from ..utils.types import QueueType, TqdmType
36
- from .base import MetaAnnotation, PipelineComponent
36
+ from .base import PipelineComponent
37
37
  from .common import ImageParsingService, PageParsingService
38
38
  from .registry import pipeline_component_registry
39
39
 
@@ -20,12 +20,12 @@ Module for language detection pipeline component
20
20
  """
21
21
  from typing import Optional, Sequence
22
22
 
23
- from ..datapoint.image import Image
23
+ from ..datapoint.image import Image, MetaAnnotation
24
24
  from ..datapoint.view import ImageDefaults, Page
25
25
  from ..extern.base import LanguageDetector, ObjectDetector
26
26
  from ..utils.error import ImageError
27
27
  from ..utils.settings import PageType, TypeOrStr, get_type
28
- from .base import MetaAnnotation, PipelineComponent
28
+ from .base import PipelineComponent
29
29
  from .registry import pipeline_component_registry
30
30
 
31
31