PyPI - deepdoctection - Versions diffs - 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl - Mend

deepdoctection 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (26) hide show

deepdoctection/__init__.py +5 -1
deepdoctection/datapoint/__init__.py +1 -1
deepdoctection/datapoint/image.py +50 -1
deepdoctection/datapoint/view.py +149 -54
deepdoctection/datasets/base.py +196 -51
deepdoctection/extern/fastlang.py +4 -2
deepdoctection/mapper/laylmstruct.py +7 -7
deepdoctection/pipe/base.py +29 -25
deepdoctection/pipe/common.py +2 -2
deepdoctection/pipe/concurrency.py +2 -2
deepdoctection/pipe/language.py +2 -2
deepdoctection/pipe/layout.py +2 -2
deepdoctection/pipe/lm.py +13 -3
deepdoctection/pipe/order.py +9 -5
deepdoctection/pipe/refine.py +7 -7
deepdoctection/pipe/segment.py +30 -30
deepdoctection/pipe/sub_layout.py +2 -2
deepdoctection/pipe/text.py +10 -5
deepdoctection/pipe/transform.py +2 -4
deepdoctection/utils/file_utils.py +34 -0
deepdoctection/utils/types.py +0 -1
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/METADATA +4 -4
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/RECORD +26 -26
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/WHEEL +0 -0
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/licenses/LICENSE +0 -0
{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/top_level.txt +0 -0

deepdoctection/extern/fastlang.py CHANGED Viewed

@@ -29,7 +29,8 @@ from typing import Any, Mapping, Union
 from lazy_imports import try_import
-from ..utils.file_utils import Requirement, get_fasttext_requirement
+from ..utils.develop import deprecated
+from ..utils.file_utils import Requirement, get_fasttext_requirement, get_numpy_v1_requirement
 from ..utils.settings import TypeOrStr, get_type
 from ..utils.types import PathLikeOrStr
 from .base import DetectionResult, LanguageDetector, ModelCategories
@@ -69,6 +70,7 @@ class FasttextLangDetectorMixin(LanguageDetector, ABC):
         return "fasttext_" + "_".join(Path(path_weights).parts[-2:])
+@deprecated("As FastText archived, it will be deprecated in the near future.", "2025-08-17")
 class FasttextLangDetector(FasttextLangDetectorMixin):
     """
     Fasttext language detector wrapper. Two models provided in the fasttext library can be used to identify languages.
@@ -114,7 +116,7 @@ class FasttextLangDetector(FasttextLangDetectorMixin):
     @classmethod
     def get_requirements(cls) -> list[Requirement]:
-        return [get_fasttext_requirement()]
+        return [get_numpy_v1_requirement(), get_fasttext_requirement()]
     def clone(self) -> FasttextLangDetector:
         return self.__class__(self.path_weights, self.categories.get_categories(), self.categories_orig)

deepdoctection/mapper/laylmstruct.py CHANGED Viewed

@@ -806,17 +806,17 @@ def image_to_raw_lm_features(
     raw_features["image_id"] = page.image_id
     raw_features["width"] = page.width
     raw_features["height"] = page.height
-    raw_features["ann_ids"] = text_["ann_ids"]
-    raw_features["words"] = text_["words"]
+    raw_features["ann_ids"] = text_.ann_ids
+    raw_features["words"] = text_.words
     # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
     # raw_features_to_layoutlm_features
-    raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+    raw_features["bbox"] = [_CLS_BOX] * len(text_.words)
     raw_features["dataset_type"] = dataset_type
-    if use_token_tag and text_["token_tags"]:
-        raw_features["labels"] = text_["token_tags"]
-    elif text_["token_classes"]:
-        raw_features["labels"] = text_["token_classes"]
+    if use_token_tag and text_.token_tags:
+        raw_features["labels"] = text_.token_tags
+    elif text_.token_classes:
+        raw_features["labels"] = text_.token_classes
     elif page.document_type is not None:
         document_type_id = page.image_orig.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id - 1
         raw_features["labels"] = [document_type_id]

deepdoctection/pipe/base.py CHANGED Viewed

@@ -23,12 +23,11 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from collections import defaultdict
-from dataclasses import dataclass, field
 from typing import Any, Callable, Mapping, Optional, Union
 from uuid import uuid1
 from ..dataflow import DataFlow, MapData
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..mapper.misc import curry
 from ..utils.context import timed_operation
 from ..utils.identifier import get_uuid_from_str
@@ -37,25 +36,6 @@ from ..utils.types import DP
 from .anngen import DatapointManager
-@dataclass(frozen=True)
-class MetaAnnotation:
-    """
-    A immutable dataclass that stores information about what `Image` are being
-    modified through a pipeline component.
-    Attributes:
-        image_annotations: Tuple of `ObjectTypes` representing image annotations.
-        sub_categories: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for sub-categories.
-        relationships: Dictionary mapping `ObjectTypes` to sets of `ObjectTypes` for relationships.
-        summaries: Tuple of `ObjectTypes` representing summaries.
-    """
-    image_annotations: tuple[ObjectTypes, ...] = field(default=())
-    sub_categories: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
-    relationships: dict[ObjectTypes, set[ObjectTypes]] = field(default_factory=dict)
-    summaries: tuple[ObjectTypes, ...] = field(default=())
 class PipelineComponent(ABC):
     """
     Base class for pipeline components.
@@ -427,15 +407,24 @@ class Pipeline(ABC):
             as well as summaries (list with sub categories).
         """
         image_annotations: list[ObjectTypes] = []
-        sub_categories = defaultdict(set)
-        relationships = defaultdict(set)
+        sub_categories: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]] = {}
+        relationships = defaultdict(set[ObjectTypes])  # type: ignore
         summaries: list[ObjectTypes] = []
         for component in self.pipe_component_list:
             meta_anns = component.get_meta_annotation()
             image_annotations.extend(meta_anns.image_annotations)
             for key, value in meta_anns.sub_categories.items():
-                sub_categories[key].update(value)
-            for key, value in meta_anns.relationships.items():
+                sub_dict = meta_anns.sub_categories[key]
+                for sub_cat, sub_cat_value in value.items():
+                    if sub_cat in sub_dict:
+                        sub_dict[sub_cat].update(sub_cat_value)
+                    else:
+                        sub_dict[sub_cat] = {sub_cat_value}  # type: ignore
+                if key in sub_categories:
+                    sub_categories[key].update(sub_dict)
+                else:
+                    sub_categories[key] = sub_dict
+            for key, value in meta_anns.relationships.items():  # type: ignore
                 relationships[key].update(value)
             summaries.extend(meta_anns.summaries)
         return MetaAnnotation(
@@ -445,6 +434,21 @@ class Pipeline(ABC):
             summaries=tuple(summaries),
         )
+    def get_service_id_to_meta_annotation(self) -> Mapping[str, MetaAnnotation]:
+        """
+        Collects meta annotations from all pipeline components and return a dict of service id to its meta annotation.
+        Returns:
+            `service_id` to `MetaAnnotation` with information about image annotations (list), sub categories (dict with
+            category names and generated sub categories), relationships (dict with category names and generated
+            relationships) as well as summaries (list with sub categories).
+        """
+        service_id_to_meta_annotation = {}
+        for component in self.pipe_component_list:
+            meta_anns = component.get_meta_annotation()
+            service_id_to_meta_annotation[component.service_id] = meta_anns
+        return service_id_to_meta_annotation
     def get_pipeline_info(
         self, service_id: Optional[str] = None, name: Optional[str] = None
     ) -> Union[str, Mapping[str, str]]:

deepdoctection/pipe/common.py CHANGED Viewed

@@ -28,13 +28,13 @@ from typing import Literal, Mapping, Optional, Sequence, Union
 import numpy as np
 from ..dataflow import DataFlow, MapData
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..datapoint.view import IMAGE_DEFAULTS, Page
 from ..extern.base import DetectionResult
 from ..mapper.match import match_anns_by_distance, match_anns_by_intersection
 from ..mapper.misc import to_image
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .registry import pipeline_component_registry
 if os.environ.get("DD_USE_TORCH"):

deepdoctection/pipe/concurrency.py CHANGED Viewed

@@ -29,11 +29,11 @@ from typing import Callable, Optional, Sequence, Union
 import tqdm
 from ..dataflow import DataFlow, MapData
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..utils.context import timed_operation
 from ..utils.tqdm import get_tqdm
 from ..utils.types import QueueType, TqdmType
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .common import ImageParsingService, PageParsingService
 from .registry import pipeline_component_registry

deepdoctection/pipe/language.py CHANGED Viewed

@@ -20,12 +20,12 @@ Module for language detection pipeline component
 """
 from typing import Optional, Sequence
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..datapoint.view import ImageDefaults, Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .registry import pipeline_component_registry

deepdoctection/pipe/layout.py CHANGED Viewed

@@ -24,13 +24,13 @@ from typing import Optional, Sequence, Union
 import numpy as np
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..extern.base import ObjectDetector, PdfMiner
 from ..mapper.misc import curry
 from ..utils.error import ImageError
 from ..utils.settings import ObjectTypes
 from ..utils.transform import PadTransform
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .registry import pipeline_component_registry

deepdoctection/pipe/lm.py CHANGED Viewed

@@ -23,11 +23,11 @@ from __future__ import annotations
 from copy import copy
 from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..extern.base import SequenceClassResult
 from ..mapper.laylmstruct import image_to_layoutlm_features, image_to_lm_features
 from ..utils.settings import BioTag, LayoutType, ObjectTypes, PageType, TokenClasses, WordType
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .registry import pipeline_component_registry
 if TYPE_CHECKING:
@@ -246,7 +246,17 @@ class LMTokenClassifierService(PipelineComponent):
     def get_meta_annotation(self) -> MetaAnnotation:
         return MetaAnnotation(
             image_annotations=(),
-            sub_categories={LayoutType.WORD: {WordType.TOKEN_CLASS, WordType.TAG, WordType.TOKEN_TAG}},
+            sub_categories={
+                LayoutType.WORD: {
+                    WordType.TOKEN_CLASS: set(self.language_model.categories.categories_semantics)  # type: ignore
+                    if self.language_model.categories.categories_semantics
+                    else [],
+                    WordType.TAG: set(self.language_model.categories.categories_bio)  # type: ignore
+                    if self.language_model.categories.categories_bio
+                    else [],
+                    WordType.TOKEN_TAG: set(self.language_model.categories.get_categories(as_dict=False)),
+                }
+            },
             relationships={},
             summaries=(),
         )

deepdoctection/pipe/order.py CHANGED Viewed

@@ -31,11 +31,11 @@ import numpy as np
 from ..datapoint.annotation import ImageAnnotation
 from ..datapoint.box import BoundingBox, merge_boxes
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..datapoint.view import IMAGE_DEFAULTS
 from ..extern.base import DetectionResult
 from ..extern.tp.tpfrcnn.utils.np_box_ops import ioa as np_ioa
-from ..pipe.base import MetaAnnotation, PipelineComponent
+from ..pipe.base import PipelineComponent
 from ..pipe.registry import pipeline_component_registry
 from ..utils.logger import LoggingRecord, logger
 from ..utils.settings import LayoutType, ObjectTypes, Relationships, TypeOrStr, get_type
@@ -611,8 +611,8 @@ class TextLineService(TextLineServiceMixin):
         """
         return MetaAnnotation(
             image_annotations=(LayoutType.LINE,),
-            sub_categories={LayoutType.LINE: {Relationships.CHILD}},
-            relationships={},
+            sub_categories={},
+            relationships={LayoutType.LINE: {Relationships.CHILD}},
             summaries=(),
         )
@@ -818,7 +818,11 @@ class TextOrderService(TextLineServiceMixin):
         anns_with_reading_order = list(copy(self.floating_text_block_categories)) + add_category
         return MetaAnnotation(
             image_annotations=tuple(image_annotations),
-            sub_categories={category: {Relationships.READING_ORDER} for category in anns_with_reading_order},
+            sub_categories={  # type: ignore
+                category: {Relationships.READING_ORDER: {Relationships.READING_ORDER}}
+                for category in anns_with_reading_order
+            }
+            | {self.text_container: {Relationships.READING_ORDER: {Relationships.READING_ORDER}}},
             relationships={},
             summaries=(),
         )

deepdoctection/pipe/refine.py CHANGED Viewed

@@ -31,12 +31,12 @@ import networkx as nx  # type: ignore
 from ..datapoint.annotation import ImageAnnotation
 from ..datapoint.box import merge_boxes
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..extern.base import DetectionResult
 from ..mapper.maputils import MappingContextManager
 from ..utils.error import ImageError
 from ..utils.settings import CellType, LayoutType, ObjectTypes, Relationships, TableType, get_type
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .registry import pipeline_component_registry
 __all__ = ["TableSegmentationRefinementService", "generate_html_string"]
@@ -537,12 +537,12 @@ class TableSegmentationRefinementService(PipelineComponent):
             image_annotations=(),
             sub_categories={
                 LayoutType.CELL: {
-                    CellType.ROW_NUMBER,
-                    CellType.COLUMN_NUMBER,
-                    CellType.ROW_SPAN,
-                    CellType.COLUMN_SPAN,
+                    CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+                    CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+                    CellType.ROW_SPAN: {CellType.ROW_SPAN},
+                    CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
                 },
-                LayoutType.TABLE: {TableType.HTML},
+                LayoutType.TABLE: {TableType.HTML: {TableType.HTML}},
             },
             relationships={},
             summaries=(),

deepdoctection/pipe/segment.py CHANGED Viewed

@@ -29,13 +29,13 @@ import numpy as np
 from ..datapoint.annotation import ImageAnnotation
 from ..datapoint.box import BoundingBox, global_to_local_coords, intersection_box, intersection_boxes, iou, merge_boxes
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..extern.base import DetectionResult
 from ..mapper.maputils import MappingContextManager
 from ..mapper.match import match_anns_by_intersection
 from ..utils.error import ImageError
 from ..utils.settings import CellType, LayoutType, ObjectTypes, Relationships, TableType, TypeOrStr, get_type
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .refine import generate_html_string
 from .registry import pipeline_component_registry
@@ -974,13 +974,13 @@ class TableSegmentationService(PipelineComponent):
             image_annotations=(),
             sub_categories={
                 LayoutType.CELL: {
-                    CellType.ROW_NUMBER,
-                    CellType.COLUMN_NUMBER,
-                    CellType.ROW_SPAN,
-                    CellType.COLUMN_SPAN,
+                    CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+                    CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+                    CellType.ROW_SPAN: {CellType.ROW_SPAN},
+                    CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
                 },
-                LayoutType.ROW: {CellType.ROW_NUMBER},
-                LayoutType.COLUMN: {CellType.COLUMN_NUMBER},
+                LayoutType.ROW: {CellType.ROW_NUMBER: {CellType.ROW_NUMBER}},
+                LayoutType.COLUMN: {CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER}},
             },
             relationships={},
             summaries=(),
@@ -1314,37 +1314,37 @@ class PubtablesSegmentationService(PipelineComponent):
             image_annotations=(),
             sub_categories={
                 LayoutType.CELL: {
-                    CellType.ROW_NUMBER,
-                    CellType.COLUMN_NUMBER,
-                    CellType.ROW_SPAN,
-                    CellType.COLUMN_SPAN,
+                    CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+                    CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+                    CellType.ROW_SPAN: {CellType.ROW_SPAN},
+                    CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
                 },
                 CellType.SPANNING: {
-                    CellType.ROW_NUMBER,
-                    CellType.COLUMN_NUMBER,
-                    CellType.ROW_SPAN,
-                    CellType.COLUMN_SPAN,
+                    CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+                    CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+                    CellType.ROW_SPAN: {CellType.ROW_SPAN},
+                    CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
                 },
                 CellType.ROW_HEADER: {
-                    CellType.ROW_NUMBER,
-                    CellType.COLUMN_NUMBER,
-                    CellType.ROW_SPAN,
-                    CellType.COLUMN_SPAN,
+                    CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+                    CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+                    CellType.ROW_SPAN: {CellType.ROW_SPAN},
+                    CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
                 },
                 CellType.COLUMN_HEADER: {
-                    CellType.ROW_NUMBER,
-                    CellType.COLUMN_NUMBER,
-                    CellType.ROW_SPAN,
-                    CellType.COLUMN_SPAN,
+                    CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+                    CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+                    CellType.ROW_SPAN: {CellType.ROW_SPAN},
+                    CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
                 },
                 CellType.PROJECTED_ROW_HEADER: {
-                    CellType.ROW_NUMBER,
-                    CellType.COLUMN_NUMBER,
-                    CellType.ROW_SPAN,
-                    CellType.COLUMN_SPAN,
+                    CellType.ROW_NUMBER: {CellType.ROW_NUMBER},
+                    CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER},
+                    CellType.ROW_SPAN: {CellType.ROW_SPAN},
+                    CellType.COLUMN_SPAN: {CellType.COLUMN_SPAN},
                 },
-                LayoutType.ROW: {CellType.ROW_NUMBER},
-                LayoutType.COLUMN: {CellType.COLUMN_NUMBER},
+                LayoutType.ROW: {CellType.ROW_NUMBER: {CellType.ROW_NUMBER}},
+                LayoutType.COLUMN: {CellType.COLUMN_NUMBER: {CellType.COLUMN_NUMBER}},
             },
             relationships={},
             summaries=(),

deepdoctection/pipe/sub_layout.py CHANGED Viewed

@@ -28,12 +28,12 @@ import numpy as np
 from ..datapoint.annotation import ImageAnnotation
 from ..datapoint.box import crop_box_from_image
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..extern.base import DetectionResult, ObjectDetector, PdfMiner
 from ..utils.settings import ObjectTypes, Relationships, TypeOrStr, get_type
 from ..utils.transform import PadTransform
 from ..utils.types import PixelValues
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .registry import pipeline_component_registry

deepdoctection/pipe/text.py CHANGED Viewed

@@ -25,13 +25,13 @@ from copy import deepcopy
 from typing import Optional, Sequence, Union
 from ..datapoint.annotation import ImageAnnotation
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..extern.base import ObjectDetector, PdfMiner, TextRecognizer
 from ..extern.tessocr import TesseractOcrDetector
 from ..utils.error import ImageError
 from ..utils.settings import ObjectTypes, PageType, TypeOrStr, WordType, get_type
 from ..utils.types import PixelValues
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .registry import pipeline_component_registry
 __all__ = ["TextExtractionService"]
@@ -202,16 +202,21 @@ class TextExtractionService(PipelineComponent):
         return 1
     def get_meta_annotation(self) -> MetaAnnotation:
-        sub_cat_dict: dict[ObjectTypes, set[ObjectTypes]]
+        sub_cat_dict: dict[ObjectTypes, dict[ObjectTypes, set[ObjectTypes]]]
         if self.extract_from_category:
-            sub_cat_dict = {category: {WordType.CHARACTERS} for category in self.extract_from_category}
+            sub_cat_dict = {
+                category: {WordType.CHARACTERS: {WordType.CHARACTERS}} for category in self.extract_from_category
+            }
         else:
             if not isinstance(self.predictor, (ObjectDetector, PdfMiner)):
                 raise TypeError(
                     f"self.predictor must be of type ObjectDetector or PdfMiner but is of type "
                     f"{type(self.predictor)}"
                 )
-            sub_cat_dict = {category: {WordType.CHARACTERS} for category in self.predictor.get_category_names()}
+            sub_cat_dict = {
+                category: {WordType.CHARACTERS: {WordType.CHARACTERS}}
+                for category in self.predictor.get_category_names()
+            }
         return MetaAnnotation(
             image_annotations=self.predictor.get_category_names()
             if isinstance(self.predictor, (ObjectDetector, PdfMiner))

deepdoctection/pipe/transform.py CHANGED Viewed

@@ -22,9 +22,9 @@ Transform style pipeline components.
 from __future__ import annotations
 from .. import DetectionResult
-from ..datapoint.image import Image
+from ..datapoint.image import Image, MetaAnnotation
 from ..extern.base import ImageTransformer
-from .base import MetaAnnotation, PipelineComponent
+from .base import PipelineComponent
 from .registry import pipeline_component_registry
@@ -83,8 +83,6 @@ class SimpleTransformService(PipelineComponent):
             for detect_result in output_detect_results:
                 ann = dp.get_annotation(annotation_ids=detect_result.uuid)[0]
                 transformed_ann_id = self.dp_manager.set_image_annotation(detect_result)
-                if transformed_ann_id is None:
-                    print("here")
                 transformed_ann = self.dp_manager.datapoint.get_annotation(annotation_ids=transformed_ann_id)[0]
                 for key, sub_ann in ann.sub_categories.items():

deepdoctection/utils/file_utils.py CHANGED Viewed

@@ -18,6 +18,7 @@ from types import ModuleType
 from typing import Any, Union, no_type_check
 import importlib_metadata
+import numpy as np
 from packaging import version
 from .error import DependencyError
@@ -249,6 +250,39 @@ def get_distance_requirement() -> Requirement:
     return "distance", distance_available(), _DISTANCE_ERR_MSG
+_NUMPY_V1_ERR_MSG = "numpy v1 must be installed."
+def numpy_v1_available() -> bool:
+    """
+    Check if the installed NumPy version is version 1.
+    This helper function determines whether the currently installed version
+    of NumPy is version 1 by inspecting its major version number.
+    Returns:
+        True if the installed NumPy version is 1, otherwise False
+    """
+    major_version = np.__version__.split('.', maxsplit=1)[0]
+    print(f"major version: {major_version}")
+    if major_version in (1, "1"):
+        return True
+    return False
+def get_numpy_v1_requirement() -> Requirement:
+    """
+    Retrieves the requirement details for numpy version 1.
+    Returns:
+        A tuple containing three elements:
+            - The requirement name for numpy version 1.
+            - A Boolean value indicating whether numpy version 1 is available.
+            - An error message in case numpy version 1 is not available.
+    """
+    return "numpy v1", numpy_v1_available(), _NUMPY_V1_ERR_MSG
 # Transformers
 _TRANSFORMERS_AVAILABLE = importlib.util.find_spec("transformers") is not None
 _TRANSFORMERS_ERR_MSG = f"transformers must be installed. {_GENERIC_ERR_MSG}"

deepdoctection/utils/types.py CHANGED Viewed

@@ -70,7 +70,6 @@ AnnotationDict: TypeAlias = dict[str, Any]
 ImageDict: TypeAlias = dict[str, Any]
 # We use these types for output types of the Page object
-Text_: TypeAlias = dict[str, Any]
 HTML: TypeAlias = str
 csv: TypeAlias = list[list[str]]
 Chunks: TypeAlias = list[tuple[str, str, int, str, str, str, str]]

{deepdoctection-0.43.6.dist-info → deepdoctection-0.44.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deepdoctection
-Version: 0.43.6
+Version: 0.44.1
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -27,7 +27,7 @@ Requires-Dist: networkx>=2.7.1
 Requires-Dist: numpy<2.0,>=1.21
 Requires-Dist: packaging>=20.0
 Requires-Dist: Pillow>=10.0.0
-Requires-Dist: pypdf>=3.16.0
+Requires-Dist: pypdf>=6.0.0
 Requires-Dist: pypdfium2>=4.30.0
 Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: pyzmq>=16
@@ -46,7 +46,7 @@ Requires-Dist: networkx>=2.7.1; extra == "tf"
 Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
 Requires-Dist: packaging>=20.0; extra == "tf"
 Requires-Dist: Pillow>=10.0.0; extra == "tf"
-Requires-Dist: pypdf>=3.16.0; extra == "tf"
+Requires-Dist: pypdf>=6.0.0; extra == "tf"
 Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
 Requires-Dist: pyyaml>=6.0.1; extra == "tf"
 Requires-Dist: pyzmq>=16; extra == "tf"
@@ -78,7 +78,7 @@ Requires-Dist: networkx>=2.7.1; extra == "pt"
 Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
 Requires-Dist: packaging>=20.0; extra == "pt"
 Requires-Dist: Pillow>=10.0.0; extra == "pt"
-Requires-Dist: pypdf>=3.16.0; extra == "pt"
+Requires-Dist: pypdf>=6.0.0; extra == "pt"
 Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
 Requires-Dist: pyyaml>=6.0.1; extra == "pt"
 Requires-Dist: pyzmq>=16; extra == "pt"

deepdoctection 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.43.6__py3-none-any.whl → 0.44.1__py3-none-any.whl