PyPI - deepdoctection - Versions diffs - 0.33__tar.gz → 0.34__tar.gz - Mend

deepdoctection 0.33__tar.gz → 0.34__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (152) hide show

{deepdoctection-0.33 → deepdoctection-0.34}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepdoctection
-Version: 0.33
+Version: 0.34
 Summary: Repository for Document AI
 Home-page: https://github.com/deepdoctection/deepdoctection
 Author: Dr. Janis Meyer
@@ -29,6 +29,7 @@ Requires-Dist: Pillow>=10.0.0
 Requires-Dist: pypdf>=3.16.0
 Requires-Dist: pyyaml>=6.0.1
 Requires-Dist: pyzmq>=16
+Requires-Dist: scipy>=1.13.1
 Requires-Dist: termcolor>=1.1
 Requires-Dist: tabulate>=0.7.7
 Requires-Dist: tqdm==4.64.0
@@ -46,6 +47,7 @@ Requires-Dist: Pillow>=10.0.0; extra == "tf"
 Requires-Dist: pypdf>=3.16.0; extra == "tf"
 Requires-Dist: pyyaml>=6.0.1; extra == "tf"
 Requires-Dist: pyzmq>=16; extra == "tf"
+Requires-Dist: scipy>=1.13.1; extra == "tf"
 Requires-Dist: termcolor>=1.1; extra == "tf"
 Requires-Dist: tabulate>=0.7.7; extra == "tf"
 Requires-Dist: tqdm==4.64.0; extra == "tf"
@@ -76,6 +78,7 @@ Requires-Dist: Pillow>=10.0.0; extra == "pt"
 Requires-Dist: pypdf>=3.16.0; extra == "pt"
 Requires-Dist: pyyaml>=6.0.1; extra == "pt"
 Requires-Dist: pyzmq>=16; extra == "pt"
+Requires-Dist: scipy>=1.13.1; extra == "pt"
 Requires-Dist: termcolor>=1.1; extra == "pt"
 Requires-Dist: tabulate>=0.7.7; extra == "pt"
 Requires-Dist: tqdm==4.64.0; extra == "pt"

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/__init__.py RENAMED Viewed

@@ -15,7 +15,6 @@ if importlib.util.find_spec("dotenv") is not None:
 # pylint: disable=wrong-import-position
-import os
 import sys
 from typing import TYPE_CHECKING
@@ -25,11 +24,10 @@ from .utils.logger import LoggingRecord, logger
 # pylint: enable=wrong-import-position
-__version__ = 0.33
+__version__ = 0.34
 _IMPORT_STRUCTURE = {
     "analyzer": [
-        "maybe_copy_config_to_cache",
         "config_sanity_checks",
         "build_detector",
         "build_padder",
@@ -76,6 +74,7 @@ _IMPORT_STRUCTURE = {
     ],
     "datapoint": [
         "ann_from_dict",
+        "AnnotationMap",
         "Annotation",
         "CategoryAnnotation",
         "ImageAnnotation",
@@ -237,6 +236,7 @@ _IMPORT_STRUCTURE = {
         "LabelSummarizer",
         "curry",
         "match_anns_by_intersection",
+        "match_anns_by_distance",
         "to_image",
         "maybe_load_image",
         "maybe_remove_image",
@@ -265,6 +265,8 @@ _IMPORT_STRUCTURE = {
         "DetectResultGenerator",
         "SubImageLayoutService",
         "ImageCroppingService",
+        "IntersectionMatcher",
+        "NeighbourMatcher",
         "MatchingService",
         "PageParsingService",
         "AnnotationNmsService",
@@ -364,6 +366,7 @@ _IMPORT_STRUCTURE = {
         "get_configs_dir_path",
         "get_weights_dir_path",
         "get_dataset_dir_path",
+        "maybe_copy_config_to_cache",
         "is_uuid_like",
         "get_uuid_from_str",
         "get_uuid",

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/analyzer/dd.py RENAMED Viewed

@@ -27,7 +27,6 @@ from __future__ import annotations
 import os
 from os import environ
-from shutil import copyfile
 from typing import Optional, Union
 from lazy_imports import try_import
@@ -44,7 +43,7 @@ from ..extern.texocr import TextractOcrDetector
 from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
 from ..extern.tpdetect import TPFrcnnDetector
 from ..pipe.base import PipelineComponent
-from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
+from ..pipe.common import AnnotationNmsService, IntersectionMatcher, MatchingService, PageParsingService
 from ..pipe.doctectionpipe import DoctectionPipe
 from ..pipe.layout import ImageLayoutService
 from ..pipe.order import TextOrderService
@@ -55,10 +54,10 @@ from ..pipe.text import TextExtractionService
 from ..utils.env_info import ENV_VARS_TRUE
 from ..utils.error import DependencyError
 from ..utils.file_utils import detectron2_available, tensorpack_available
-from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
+from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
 from ..utils.logger import LoggingRecord, logger
 from ..utils.metacfg import AttrDict, set_config_by_yaml
-from ..utils.settings import CellType, LayoutType
+from ..utils.settings import CellType, LayoutType, Relationships
 from ..utils.transform import PadTransform
 from ..utils.types import PathLikeOrStr
@@ -67,7 +66,6 @@ with try_import() as image_guard:
 __all__ = [
-    "maybe_copy_config_to_cache",
     "config_sanity_checks",
     "build_detector",
     "build_padder",
@@ -77,31 +75,37 @@ __all__ = [
     "build_doctr_word",
     "get_dd_analyzer",
     "build_analyzer",
+    "set_config_by_yaml",
 ]
 _DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
 _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
-def maybe_copy_config_to_cache(
-    package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
-) -> str:
-    """
-    Initial copying of various files
-    :param package_path: base path to directory of source file `file_name`
-    :param configs_dir_path: base path to target directory
-    :param file_name: file to copy
-    :param force_copy: If file is already in target directory, will re-copy the file
-    :return: path to the copied file_name
-    """
-    absolute_path_source = os.path.join(package_path, file_name)
-    absolute_path = os.path.join(configs_dir_path, os.path.join("dd", os.path.split(file_name)[1]))
-    mkdir_p(os.path.split(absolute_path)[0])
-    if not os.path.isfile(absolute_path) or force_copy:
-        copyfile(absolute_path_source, absolute_path)
-    return absolute_path
+_MODEL_CHOICES = {
+    "layout": [
+        "layout/d2_model_0829999_layout_inf_only.pt",
+        "xrf_layout/model_final_inf_only.pt",
+        "microsoft/table-transformer-detection/pytorch_model.bin",
+    ],
+    "segmentation": [
+        "item/model-1620000_inf_only.data-00000-of-00001",
+        "xrf_item/model_final_inf_only.pt",
+        "microsoft/table-transformer-structure-recognition/pytorch_model.bin",
+        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
+    ],
+    "ocr": ["Tesseract", "DocTr", "Textract"],
+    "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
+    "doctr_recognition": [
+        "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
+        "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
+    ],
+    "llm": ["gpt-3.5-turbo", "gpt-4"],
+    "segmentation_choices": {
+        "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
+        "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
+        "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
+        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
+    },
+}
 def config_sanity_checks(cfg: AttrDict) -> None:
@@ -375,13 +379,17 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
         pipe_component_list.append(text)
     if cfg.USE_PDF_MINER or cfg.USE_OCR:
-        match = MatchingService(
-            parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
-            child_categories=LayoutType.WORD,
+        matcher = IntersectionMatcher(
             matching_rule=cfg.WORD_MATCHING.RULE,
             threshold=cfg.WORD_MATCHING.THRESHOLD,
             max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
         )
+        match = MatchingService(
+            parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
+            child_categories=LayoutType.WORD,
+            matcher=matcher,
+            relationship_key=Relationships.CHILD,
+        )
         pipe_component_list.append(match)
         order = TextOrderService(
@@ -444,9 +452,9 @@ def get_dd_analyzer(
     else:
         raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
     dd_one_config_path = maybe_copy_config_to_cache(
-        get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
+        get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
     )
-    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path(), _TESSERACT)
+    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)
     # Set up of the configuration and logging
     cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/annotation.py RENAMED Viewed

@@ -21,6 +21,7 @@ Dataclass for annotations and their derived classes.
 from __future__ import annotations
 from abc import ABC, abstractmethod
+from collections import defaultdict
 from dataclasses import dataclass, field
 from typing import Optional, Union, no_type_check
@@ -66,6 +67,16 @@ def ann_from_dict(cls, **kwargs: AnnotationDict):
     return ann
+@dataclass(frozen=True)
+class AnnotationMap:
+    """AnnotationMap to store all sub categories, relationship keys and summary keys of an annotation"""
+    image_annotation_id: str
+    sub_category_key: Optional[ObjectTypes] = None
+    relationship_key: Optional[ObjectTypes] = None
+    summary_key: Optional[ObjectTypes] = None
 @dataclass
 class Annotation(ABC):
     """
@@ -397,7 +408,8 @@ class CategoryAnnotation(Annotation):
                 except ValueError:
                     logger.warning(LoggingRecord(f"Relationship {key} cannot be removed because it does not exist"))
         else:
-            self.relationships[key].clear()
+            if key in self.relationships:
+                self.relationships[key].clear()
     def get_defining_attributes(self) -> list[str]:
         return ["category_name", "category_id"]
@@ -409,7 +421,7 @@ class CategoryAnnotation(Annotation):
         :return: list of attributes.
         """
-        return []
+        return ["_category_name"]
     @classmethod
     def from_dict(cls, **kwargs: AnnotationDict) -> CategoryAnnotation:
@@ -470,6 +482,32 @@ class ImageAnnotation(CategoryAnnotation):
             return self.image.summary.get_sub_category(key)
         raise AnnotationError(f"Summary does not exist for {self.annotation_id} and key: {key}")
+    def get_annotation_map(self) -> defaultdict[str, list[AnnotationMap]]:
+        """
+        Returns a defaultdict with annotation ids as keys and a list of AnnotationMap instances as values for all sub
+         categories, relationships and image summaries.
+        :return: defaultdict with annotation ids as keys and a list of AnnotationMap instances as values.
+        """
+        annotation_id_dict = defaultdict(list)
+        annotation_id_dict[self.annotation_id].append(AnnotationMap(image_annotation_id=self.annotation_id))
+        for sub_cat_key in self.sub_categories:
+            sub_cat = self.get_sub_category(sub_cat_key)
+            annotation_id_dict[sub_cat.annotation_id].append(
+                AnnotationMap(image_annotation_id=self.annotation_id, sub_category_key=sub_cat_key)
+            )
+        if self.image is not None:
+            for summary_cat_key in self.image.summary.sub_categories:
+                summary_cat = self.get_summary(summary_cat_key)
+                annotation_id_dict[summary_cat.annotation_id].append(
+                    AnnotationMap(image_annotation_id=self.annotation_id, summary_key=summary_cat_key)
+                )
+        for rel_key in self.relationships:
+            for rel_ann_ids in self.get_relationship(rel_key):
+                annotation_id_dict[rel_ann_ids].append(
+                    AnnotationMap(image_annotation_id=self.annotation_id, relationship_key=rel_key)
+                )
+        return annotation_id_dict
 @dataclass
 class ContainerAnnotation(CategoryAnnotation):

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/image.py RENAMED Viewed

@@ -21,10 +21,11 @@ Dataclass Image
 from __future__ import annotations
 import json
+from collections import defaultdict
 from dataclasses import dataclass, field
 from os import environ
 from pathlib import Path
-from typing import Any, Iterable, Optional, Sequence, Union, no_type_check
+from typing import Any, Optional, Sequence, Union, no_type_check
 import numpy as np
 from numpy import uint8
@@ -33,7 +34,7 @@ from ..utils.error import AnnotationError, BoundingBoxError, ImageError, UUIDErr
 from ..utils.identifier import get_uuid, is_uuid_like
 from ..utils.settings import ObjectTypes, SummaryType, get_type
 from ..utils.types import ImageDict, PathLikeOrStr, PixelValues
-from .annotation import Annotation, BoundingBox, CategoryAnnotation, ImageAnnotation
+from .annotation import Annotation, AnnotationMap, BoundingBox, CategoryAnnotation, ImageAnnotation
 from .box import crop_box_from_image, global_to_local_coords, intersection_box
 from .convert import as_dict, convert_b64_to_np_array, convert_np_array_to_b64, convert_pdf_bytes_to_np_array_v2
@@ -303,6 +304,15 @@ class Image:
         return self.embeddings[image_id]
+    def remove_embedding(self, image_id: str) -> None:
+        """
+        Remove an embedding from the image.
+        :param image_id: uuid string of the embedding image
+        """
+        if image_id in self.embeddings:
+            self.embeddings.pop(image_id)
     def _self_embedding(self) -> None:
         if self._bbox is not None:
             self.set_embedding(self.image_id, self._bbox)
@@ -387,39 +397,6 @@ class Image:
         return list(anns)
-    def get_annotation_iter(
-        self,
-        category_names: Optional[Union[str, ObjectTypes, Sequence[Union[str, ObjectTypes]]]] = None,
-        annotation_ids: Optional[Union[str, Sequence[str]]] = None,
-        service_id: Optional[Union[str, Sequence[str]]] = None,
-        model_id: Optional[Union[str, Sequence[str]]] = None,
-        session_ids: Optional[Union[str, Sequence[str]]] = None,
-        ignore_inactive: bool = True,
-    ) -> Iterable[ImageAnnotation]:
-        """
-        Get annotation as an iterator. Same as `get_annotation` but returns an iterator instead of a list.
-        :param category_names: A single name or list of names
-        :param annotation_ids: A single id or list of ids
-        :param service_id: A single service name or list of service names
-        :param model_id: A single model name or list of model names
-        :param session_ids: A single session id or list of session ids
-        :param ignore_inactive: If set to `True` only active annotations are returned.
-        :return: A (possibly empty) list of annotations
-        """
-        return iter(
-            self.get_annotation(
-                category_names=category_names,
-                annotation_ids=annotation_ids,
-                service_id=service_id,
-                model_id=model_id,
-                session_ids=session_ids,
-                ignore_inactive=ignore_inactive,
-            )
-        )
     def as_dict(self) -> dict[str, Any]:
         """
         Returns the full image dataclass as dict. Uses the custom `convert.as_dict` to disregard attributes
@@ -441,7 +418,7 @@ class Image:
         A list of attributes to suspend from as_dict creation.
         """
-        return ["_image"]
+        return ["_image", "_annotation_ids"]
     def define_annotation_id(self, annotation: Annotation) -> str:
         """
@@ -456,7 +433,11 @@ class Image:
         attributes_values = [str(getattr(annotation, attribute)) for attribute in attributes]
         return get_uuid(*attributes_values, str(self.image_id))
-    def remove(self, annotation: ImageAnnotation) -> None:
+    def remove(
+        self,
+        annotation_ids: Optional[Union[str, list[str]]] = None,
+        service_ids: Optional[Union[str, list[str]]] = None,
+    ) -> None:
         """
         Instead of removing consider deactivating annotations.
@@ -464,9 +445,66 @@ class Image:
         :param annotation: The annotation to remove
         """
+        ann_id_to_annotation_maps = self.get_annotation_id_to_annotation_maps()
+        if annotation_ids is not None:
+            annotation_ids = [annotation_ids] if isinstance(annotation_ids, str) else annotation_ids
+            for ann_id in annotation_ids:
+                if ann_id not in ann_id_to_annotation_maps:
+                    raise ImageError(f"Annotation with id {ann_id} not found")
+                annotation_maps = ann_id_to_annotation_maps[ann_id]
+                for annotation_map in annotation_maps:
+                    self._remove_by_annotation_id(ann_id, annotation_map)
+        if service_ids is not None:
+            service_ids = [service_ids] if isinstance(service_ids, str) else service_ids
+            service_id_to_annotation_id = self.get_service_id_to_annotation_id()
+            for service_id in service_ids:
+                if service_id not in service_id_to_annotation_id:
+                    raise ImageError(f"Service id {service_id} not found")
+                annotation_ids = service_id_to_annotation_id[service_id]
+                for ann_id in annotation_ids:
+                    if ann_id not in ann_id_to_annotation_maps:
+                        raise ImageError(f"Annotation with id {ann_id} not found")
+                    annotation_maps = ann_id_to_annotation_maps[ann_id]
+                    for annotation_map in annotation_maps:
+                        self._remove_by_annotation_id(ann_id, annotation_map)
+    def _remove_by_annotation_id(self, annotation_id: str, location_dict: AnnotationMap) -> None:
+        image_annotation_id = location_dict.image_annotation_id
+        annotations = self.get_annotation(annotation_ids=image_annotation_id)
+        if not annotations:
+            return
+        # There can only be one annotation with a given id
+        annotation = annotations[0]
+        if (
+            location_dict.sub_category_key is None
+            and location_dict.relationship_key is None
+            and location_dict.summary_key is None
+        ):
+            self.annotations.remove(annotation)
+            self._annotation_ids.remove(annotation.annotation_id)
+        sub_category_key = location_dict.sub_category_key
+        if sub_category_key is not None:
+            annotation.remove_sub_category(sub_category_key)
+        relationship_key = location_dict.relationship_key
-        self.annotations.remove(annotation)
-        self._annotation_ids.remove(annotation.annotation_id)
+        if relationship_key is not None:
+            annotation.remove_relationship(relationship_key, annotation_id)
+        summary_key = location_dict.summary_key
+        if summary_key is not None:
+            if annotation.image is not None:
+                annotation.image.summary.remove_sub_category(summary_key)
     def image_ann_to_image(self, annotation_id: str, crop_image: bool = False) -> None:
         """
@@ -580,6 +618,7 @@ class Image:
         if summary_dict := kwargs.get("_summary", kwargs.get("summary")):
             image.summary = CategoryAnnotation.from_dict(**summary_dict)
             image.summary.category_name = SummaryType.SUMMARY
         return image
     @classmethod
@@ -645,7 +684,7 @@ class Image:
         highest_hierarchy_only: bool = False,
         path: Optional[PathLikeOrStr] = None,
         dry: bool = False,
-    ) -> Optional[ImageDict]:
+    ) -> Optional[Union[ImageDict, str]]:
         """
         Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
         base64 encodings.
@@ -677,8 +716,45 @@ class Image:
             return export_dict
         with open(path_json, "w", encoding="UTF-8") as file:
             json.dump(export_dict, file, indent=2)
-        return None
+        return path_json
     def get_categories_from_current_state(self) -> set[str]:
         """Returns all active dumped categories"""
         return {ann.category_name for ann in self.get_annotation()}
+    def get_service_id_to_annotation_id(self) -> defaultdict[str, list[str]]:
+        """
+        Returns a dictionary with service ids as keys and lists of annotation ids that have been generated by the
+        service
+        :return: default with service ids as keys and lists of annotation ids as values
+        """
+        service_id_dict = defaultdict(list)
+        for ann in self.get_annotation():
+            if ann.service_id:
+                service_id_dict[ann.service_id].append(ann.annotation_id)
+            for sub_cat_key in ann.sub_categories:
+                sub_cat = ann.get_sub_category(sub_cat_key)
+                if sub_cat.service_id:
+                    service_id_dict[sub_cat.service_id].append(sub_cat.annotation_id)
+            if ann.image is not None:
+                for summary_cat_key in ann.image.summary:
+                    summary_cat = ann.get_summary(summary_cat_key)
+                    if summary_cat.service_id:
+                        service_id_dict[summary_cat.service_id].append(summary_cat.annotation_id)
+        return service_id_dict
+    def get_annotation_id_to_annotation_maps(self) -> defaultdict[str, list[AnnotationMap]]:
+        """
+        Returns a dictionary with annotation ids as keys and lists of AnnotationMap as values. The range of ids
+        is the union of all ImageAnnotation, CategoryAnnotation and ContainerAnnotation of the image.
+        :return: default dict with annotation ids as keys and lists of AnnotationMap as values
+        """
+        all_ann_id_dict = defaultdict(list)
+        for ann in self.get_annotation():
+            ann_id_dict = ann.get_annotation_map()
+            for key, val in ann_id_dict.items():
+                all_ann_id_dict[key].extend(val)
+        return all_ann_id_dict

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datapoint/view.py RENAMED Viewed

@@ -971,7 +971,7 @@ class Page(Image):
         highest_hierarchy_only: bool = False,
         path: Optional[PathLikeOrStr] = None,
         dry: bool = False,
-    ) -> Optional[ImageDict]:
+    ) -> Optional[Union[ImageDict, str]]:
         """
         Export image as dictionary. As numpy array cannot be serialized `image` values will be converted into
         base64 encodings.

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/base.py RENAMED Viewed

@@ -451,7 +451,7 @@ class CustomDataset(DatasetBase):
         return self.dataflow_builder
     @staticmethod
-    def from_dataset_card(file_path: str, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
+    def from_dataset_card(file_path: PathLikeOrStr, dataflow_builder: Type[DataFlowBaseBuilder]) -> CustomDataset:
         """
         This static method creates a CustomDataset instance from a dataset card.

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/fintabnet.py RENAMED Viewed

@@ -264,7 +264,7 @@ class FintabnetBuilder(DataFlowBaseBuilder):
                     add_summary=True,
                 ),
             )
-            df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation_iter(category_names=LayoutType.TABLE)])
+            df = MapData(df, lambda dp: [ann.image for ann in dp.get_annotation(category_names=LayoutType.TABLE)])
             df = FlattenData(df)
             df = MapData(df, lambda dp: dp[0])

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/datasets/instances/xfund.py RENAMED Viewed

@@ -180,13 +180,35 @@ class XfundBuilder(DataFlowBaseBuilder):
             "answer": TokenClasses.ANSWER,
             "header": TokenClasses.HEADER,
         }
-        ner_token_to_id_mapping = self.categories.get_sub_categories(
-            categories=LayoutType.WORD,
-            sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
-            keys=False,
-            values_as_dict=True,
-            name_as_key=True,
-        )
+        if LayoutType.WORD in self.categories.get_categories(filtered=True, name_as_key=True):
+            ner_token_to_id_mapping = self.categories.get_sub_categories(
+                categories=LayoutType.WORD,
+                sub_categories={LayoutType.WORD: [WordType.TOKEN_TAG, WordType.TAG, WordType.TOKEN_CLASS]},
+                keys=False,
+                values_as_dict=True,
+                name_as_key=True,
+            )
+        else:
+            ner_token_to_id_mapping = {
+                LayoutType.WORD: {
+                    WordType.TAG: {BioTag.BEGIN: 3, BioTag.INSIDE: 1, BioTag.OUTSIDE: 2},
+                    WordType.TOKEN_CLASS: {
+                        TokenClasses.ANSWER: 3,
+                        TokenClasses.HEADER: 4,
+                        TokenClasses.OTHER: 1,
+                        TokenClasses.QUESTION: 2,
+                    },
+                    WordType.TOKEN_TAG: {
+                        TokenClassWithTag.B_ANSWER: 1,
+                        TokenClassWithTag.B_HEADER: 2,
+                        TokenClassWithTag.B_QUESTION: 3,
+                        TokenClassWithTag.I_ANSWER: 4,
+                        TokenClassWithTag.I_HEADER: 5,
+                        TokenClassWithTag.I_QUESTION: 6,
+                        BioTag.OUTSIDE: 7,
+                    },
+                }
+            }
         df = MapData(
             df,
             xfund_to_image(

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/eval/eval.py RENAMED Viewed

@@ -293,6 +293,8 @@ class Evaluator:
         show_words = kwargs.pop("show_words", False)
         show_token_class = kwargs.pop("show_token_class", True)
         ignore_default_token_class = kwargs.pop("ignore_default_token_class", False)
+        floating_text_block_categories = kwargs.pop("floating_text_block_categories", None)
+        include_residual_text_containers = kwargs.pop("include_residual_Text_containers", True)
         df_gt = self.dataset.dataflow.build(**kwargs)
         df_pr = self.dataset.dataflow.build(**kwargs)
@@ -301,7 +303,11 @@ class Evaluator:
         df_pr = MapData(df_pr, deepcopy)
         df_pr = self._clean_up_predict_dataflow_annotations(df_pr)
-        page_parsing_component = PageParsingService(text_container=LayoutType.WORD)
+        page_parsing_component = PageParsingService(
+            text_container=LayoutType.WORD,
+            floating_text_block_categories=floating_text_block_categories,  # type: ignore
+            include_residual_text_container=bool(include_residual_text_containers),
+        )
         df_gt = page_parsing_component.predict_dataflow(df_gt)
         if self.pipe_component:

{deepdoctection-0.33 → deepdoctection-0.34}/deepdoctection/extern/model.py RENAMED Viewed

@@ -1051,7 +1051,8 @@ class ModelCatalog:
         with jsonlines.open(path) as reader:
             for obj in reader:
                 if not obj["name"] in ModelCatalog.CATALOG:
-                    obj["categories"] = {int(key): get_type(val) for key, val in obj["categories"].items()}
+                    categories = obj.get("categories") or {}
+                    obj["categories"] = {int(key): get_type(val) for key, val in categories.items()}
                     ModelCatalog.register(obj["name"], ModelProfile(**obj))
     @staticmethod

deepdoctection 0.33__tar.gz → 0.34__tar.gz

Potentially problematic release.

deepdoctection 0.33__tar.gz → 0.34__tar.gz