deepdoctection 0.37.2__py3-none-any.whl → 0.38__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -41,6 +41,7 @@ with try_import() as tr_import_guard:
     from transformers import (  # pylint: disable=W0611
         AutoFeatureExtractor,
         DetrFeatureExtractor,
+        DetrImageProcessor,
         PretrainedConfig,
         TableTransformerForObjectDetection,
     )
@@ -55,7 +56,7 @@ def _detr_post_processing(
 def detr_predict_image(
     np_img: PixelValues,
     predictor: TableTransformerForObjectDetection,
-    feature_extractor: DetrFeatureExtractor,
+    feature_extractor: DetrImageProcessor,
     device: torch.device,
     threshold: float,
     nms_threshold: float,
@@ -224,13 +225,13 @@ class HFDetrDerivedDetector(HFDetrDerivedDetectorMixin):
         )

     @staticmethod
-    def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrFeatureExtractor:
+    def get_pre_processor(path_feature_extractor_config: PathLikeOrStr) -> DetrImageProcessor:
         """
         Builds the feature extractor

         :return: DetrFeatureExtractor
         """
-        return AutoFeatureExtractor.from_pretrained(
+        return DetrImageProcessor.from_pretrained(
             pretrained_model_name_or_path=os.fspath(path_feature_extractor_config)
         )

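The three hunks above track a transformers deprecation: the vision `FeatureExtractor` classes gave way to `ImageProcessor` classes, and `DetrImageProcessor` is the drop-in replacement for `DetrFeatureExtractor`. A minimal sketch of the new preprocessor construction, with a placeholder config path:

    import os

    from transformers import DetrImageProcessor

    # Placeholder path; deepdoctection resolves it from the profile of the
    # downloaded Table Transformer weights.
    path_feature_extractor_config = "/path/to/preprocessor_config.json"

    # from_pretrained accepts a hub id, a local directory or a JSON config file,
    # just like the AutoFeatureExtractor call it replaces.
    preprocessor = DetrImageProcessor.from_pretrained(
        pretrained_model_name_or_path=os.fspath(path_feature_extractor_config)
    )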
@@ -48,7 +48,7 @@ with try_import() as pt_import_guard:
     import torch.nn.functional as F

 with try_import() as tr_import_guard:
-    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD  # type:ignore
+    from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
     from transformers import (
         LayoutLMForSequenceClassification,
         LayoutLMForTokenClassification,
@@ -67,20 +67,6 @@ with try_import() as tr_import_guard:
     )

 if TYPE_CHECKING:
-    LayoutTokenModels: TypeAlias = Union[
-        LayoutLMForTokenClassification,
-        LayoutLMv2ForTokenClassification,
-        LayoutLMv3ForTokenClassification,
-        LiltForTokenClassification,
-    ]
-
-    LayoutSequenceModels: TypeAlias = Union[
-        LayoutLMForSequenceClassification,
-        LayoutLMv2ForSequenceClassification,
-        LayoutLMv3ForSequenceClassification,
-        LiltForSequenceClassification,
-    ]
-
     HfLayoutTokenModels: TypeAlias = Union[
         LayoutLMForTokenClassification,
         LayoutLMv2ForTokenClassification,
@@ -147,7 +133,7 @@ def predict_token_classes(
     token_type_ids: torch.Tensor,
     boxes: torch.Tensor,
     tokens: list[list[str]],
-    model: LayoutTokenModels,
+    model: HfLayoutTokenModels,
     images: Optional[torch.Tensor] = None,
 ) -> list[TokenClassResult]:
     """
@@ -205,7 +191,7 @@ def predict_sequence_classes(
     attention_mask: torch.Tensor,
     token_type_ids: torch.Tensor,
     boxes: torch.Tensor,
-    model: LayoutSequenceModels,
+    model: HfLayoutSequenceModels,
     images: Optional[torch.Tensor] = None,
 ) -> SequenceClassResult:
     """
@@ -254,6 +240,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
         categories_bio: Optional[Sequence[TypeOrStr]] = None,
         categories: Optional[Mapping[int, TypeOrStr]] = None,
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         """
         :param path_config_json: path to .json config file
@@ -281,6 +268,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             init_categories=categories, categories_semantics=categories_semantics, categories_bio=categories_bio
         )
         self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer

     @classmethod
     def get_requirements(cls) -> list[Requirement]:
@@ -342,6 +330,7 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
             self.categories.categories_bio,
             self.categories.get_categories(),
             self.device,
+            self.use_xlm_tokenizer,
         )

     @staticmethod
@@ -427,13 +416,15 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
                                   Tokenizer.
         """
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -540,13 +531,15 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
                                   default value.
         """
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
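Because `use_xlm_tokenizer` is now stored on the base class and forwarded in the `@@ -342` hunk above, the flag survives cloning instead of silently falling back to the default. A hedged usage sketch, with placeholder paths and category labels:

    # LayoutXLM reuses the LayoutLMv2 architecture but requires the
    # XLM-RoBERTa tokenizer, which is what the flag selects.
    model = HFLayoutLmv2TokenClassifier(
        path_config_json="/path/to/config.json",
        path_weights="/path/to/pytorch_model.bin",
        categories={1: "B-header", 2: "I-header", 3: "O"},  # placeholder labels
        use_xlm_tokenizer=True,
    )
    # As of 0.38 the clone keeps the tokenizer choice.
    model_clone = model.clone()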
@@ -666,13 +659,15 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
         :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLMv3 model with a different
                                   tokenizer.
         """
-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
        )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -746,19 +741,23 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
         path_weights: PathLikeOrStr,
         categories: Mapping[int, TypeOrStr],
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
     ):
         self.path_config = Path(path_config_json)
         self.path_weights = Path(path_weights)
         self.categories = ModelCategories(init_categories=categories)

         self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer

     @classmethod
     def get_requirements(cls) -> list[Requirement]:
         return [get_pytorch_requirement(), get_transformers_requirement()]

     def clone(self) -> HFLayoutLmSequenceClassifierBase:
-        return self.__class__(self.path_config, self.path_weights, self.categories.get_categories(), self.device)
+        return self.__class__(
+            self.path_config, self.path_weights, self.categories.get_categories(), self.device, self.use_xlm_tokenizer
+        )

     def _validate_encodings(
         self, **encodings: Union[list[list[str]], torch.Tensor]
@@ -856,13 +855,13 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -939,13 +938,13 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1029,13 +1028,13 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLMv3")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1142,13 +1141,15 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
         :param device: The device (cpu,"cuda"), where to place the model.
         """

-        super().__init__(path_config_json, path_weights, categories_semantics, categories_bio, categories, device)
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
         self.name = self.get_name(path_weights, "LiLT")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
@@ -1232,13 +1233,13 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
-        super().__init__(path_config_json, path_weights, categories, device)
+        super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LiLT")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
-            self.model.__class__.__name__, use_xlm_tokenizer
+            self.model.__class__.__name__, self.use_xlm_tokenizer
         )

     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
@@ -1270,3 +1271,19 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):

     def clear_model(self) -> None:
         self.model = None
+
+
+if TYPE_CHECKING:
+    LayoutTokenModels: TypeAlias = Union[
+        HFLayoutLmTokenClassifier,
+        HFLayoutLmv2TokenClassifier,
+        HFLayoutLmv3TokenClassifier,
+        HFLiltTokenClassifier,
+    ]
+
+    LayoutSequenceModels: TypeAlias = Union[
+        HFLayoutLmSequenceClassifier,
+        HFLayoutLmv2SequenceClassifier,
+        HFLayoutLmv3SequenceClassifier,
+        HFLiltSequenceClassifier,
+    ]
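With this hunk the `LayoutTokenModels` and `LayoutSequenceModels` aliases, removed from the import section in the `@@ -67,20 +67,6` hunk, reappear as unions over deepdoctection's own wrapper classes rather than the raw transformers models, which the `Hf...` aliases continue to cover. Since they live behind `TYPE_CHECKING`, the aliases are resolved only by the type checker and cost nothing at runtime, roughly as in this sketch:

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Only evaluated by the type checker; no transformers import at runtime.
        from deepdoctection.extern.hflayoutlm import LayoutTokenModels

    def run_token_classifier(model: "LayoutTokenModels") -> None:
        # The string annotation defers evaluation, so this is valid even though
        # LayoutTokenModels does not exist when the module actually runs.
        ...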
@@ -188,7 +188,7 @@ class DoctectionPipe(Pipeline):

         df = MapData(df, _proto_process(path, doc_path))
         if dataset_dataflow is None:
-            df = MapData(df, _to_image(dpi=os.environ.get("DPI", 300)))  # pylint: disable=E1120
+            df = MapData(df, _to_image(dpi=int(os.environ.get("DPI", 300))))  # pylint: disable=E1120
         return df

     @staticmethod
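The added `int(...)` cast fixes a genuine bug: `os.environ` values are always strings, so the old code passed `dpi` as a `str` whenever the `DPI` variable was set, and as an `int` only via the fallback. A quick illustration:

    import os

    os.environ["DPI"] = "200"
    os.environ.get("DPI", 300)        # -> "200" (str): environment values are strings
    int(os.environ.get("DPI", 300))   # -> 200 (int), and -> 300 when DPI is unset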
deepdoctection/pipe/lm.py CHANGED
@@ -30,7 +30,7 @@ from .base import MetaAnnotation, PipelineComponent
 from .registry import pipeline_component_registry

 if TYPE_CHECKING:
-    from ..extern.hflayoutlm import HfLayoutSequenceModels, HfLayoutTokenModels
+    from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels


 @pipeline_component_registry.register("LMTokenClassifierService")
@@ -66,7 +66,7 @@ class LMTokenClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: HfLayoutTokenModels,
+        language_model: LayoutTokenModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -155,11 +155,11 @@ class LMTokenClassifierService(PipelineComponent):
             else:
                 token_class_name_id = None
             self.dp_manager.set_category_annotation(
-                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid
+                token.semantic_name, token_class_name_id, WordType.TOKEN_CLASS, token.uuid, token.score
             )
             self.dp_manager.set_category_annotation(token.bio_tag, None, WordType.TAG, token.uuid)
             self.dp_manager.set_category_annotation(
-                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid
+                token.class_name, token.class_id, WordType.TOKEN_TAG, token.uuid, token.score
             )
             words_populated.append(token.uuid)

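Passing `token.score` through means the `TOKEN_CLASS` and `TOKEN_TAG` sub-categories now record the model confidence instead of dropping it. A hedged readback sketch, assuming deepdoctection's usual annotation accessors (attribute names not verified against 0.38):

    # Iterate the word annotations of a processed datapoint `dp`:
    for word in dp.get_annotation(category_names=LayoutType.WORD):
        token_class = word.get_sub_category(WordType.TOKEN_CLASS)
        print(token_class.category_name, token_class.score)  # score no longer None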
@@ -188,7 +188,7 @@ class LMTokenClassifierService(PipelineComponent):
         # multiple threads
         return self.__class__(
             copy(self.tokenizer),
-            self.language_model.clone(),
+            self.language_model.clone(),  # type: ignore
             self.padding,
             self.truncation,
             self.return_overflowing_tokens,
@@ -260,7 +260,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: HfLayoutSequenceModels,
+        language_model: LayoutSequenceModels,
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -309,7 +309,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def clone(self) -> LMSequenceClassifierService:
         return self.__class__(
             copy(self.tokenizer),
-            self.language_model.clone(),
+            self.language_model.clone(),  # type: ignore
             self.padding,
             self.truncation,
             self.return_overflowing_tokens,
@@ -295,28 +295,21 @@ def _html_table(
     return html


-def generate_html_string(table: ImageAnnotation) -> list[str]:
+def generate_html_string(table: ImageAnnotation, cell_names: Sequence[ObjectTypes]) -> list[str]:
     """
     Takes the table segmentation by using table cells row number, column numbers etc. and generates a html
     representation.

     :param table: An annotation that has a not None image and fully segmented cell annotation.
+    :param cell_names: List of cell names that are used for the table segmentation. Note: It must be ensured
+        that all cells have a row number, column number, row span and column span and that the dissection
+        by rows and columns is completely covered by cells.
     :return: HTML representation of the table
     """
     if table.image is None:
         raise ImageError("table.image cannot be None")
     table_image = table.image
-    cells = table_image.get_annotation(
-        category_names=[
-            LayoutType.CELL,
-            CellType.HEADER,
-            CellType.BODY,
-            CellType.SPANNING,
-            CellType.ROW_HEADER,
-            CellType.COLUMN_HEADER,
-            CellType.PROJECTED_ROW_HEADER,
-        ]
-    )
+    cells = table_image.get_annotation(category_names=cell_names)
     number_of_rows = table_image.summary.get_sub_category(TableType.NUMBER_OF_ROWS).category_id
     number_of_cols = table_image.summary.get_sub_category(TableType.NUMBER_OF_COLUMNS).category_id
     table_list = []
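The caller now decides which cell categories enter the HTML generation instead of relying on the hard-coded list. A hedged call sketch reusing some of the category names removed above; in the `TableSegmentationRefinementService` hunk below, the list comes from the service's own `self.cell_names`, so it is driven by pipeline configuration:

    html_rows = generate_html_string(
        table,
        cell_names=[
            LayoutType.CELL,
            CellType.SPANNING,
            CellType.ROW_HEADER,
            CellType.COLUMN_HEADER,
        ],
    )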
@@ -485,7 +478,7 @@ class TableSegmentationRefinementService(PipelineComponent):
         self.dp_manager.set_summary_annotation(
             TableType.MAX_COL_SPAN, TableType.MAX_COL_SPAN, max_col_span, annotation_id=table.annotation_id
         )
-        html = generate_html_string(table)
+        html = generate_html_string(table, self.cell_names)
         self.dp_manager.set_container_annotation(TableType.HTML, -1, TableType.HTML, table.annotation_id, html)

     def clone(self) -> TableSegmentationRefinementService: