deepdoctection-0.44.0-py3-none-any.whl → deepdoctection-0.45.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -42,13 +42,60 @@ from ..utils.settings import (
     get_type,
 )
 from ..utils.transform import ResizeTransform, box_to_point4, point4_to_box
-from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, Text_, csv
+from ..utils.types import HTML, AnnotationDict, Chunks, ImageDict, PathLikeOrStr, PixelValues, csv
 from ..utils.viz import draw_boxes, interactive_imshow, viz_handler
 from .annotation import CategoryAnnotation, ContainerAnnotation, ImageAnnotation, ann_from_dict
 from .box import BoundingBox, crop_box_from_image
 from .image import Image
 
 
+@dataclass(frozen=True)
+class Text_:
+    """
+    Immutable dataclass for storing structured text extraction results.
+
+    Attributes:
+        text: The concatenated text string.
+        words: List of word strings.
+        ann_ids: List of annotation IDs for each word.
+        token_classes: List of token class names for each word.
+        token_class_ann_ids: List of annotation IDs for each token class.
+        token_tags: List of token tag names for each word.
+        token_tag_ann_ids: List of annotation IDs for each token tag.
+        token_class_ids: List of token class IDs.
+        token_tag_ids: List of token tag IDs.
+    """
+
+    text: str = ""
+    words: list[str] = field(default_factory=list)
+    ann_ids: list[str] = field(default_factory=list)
+    token_classes: list[str] = field(default_factory=list)
+    token_class_ann_ids: list[str] = field(default_factory=list)
+    token_tags: list[str] = field(default_factory=list)
+    token_tag_ann_ids: list[str] = field(default_factory=list)
+    token_class_ids: list[str] = field(default_factory=list)
+    token_tag_ids: list[str] = field(default_factory=list)
+
+    def as_dict(self) -> dict[str, Union[list[str], str]]:
+        """
+        Returns the Text_ as a dictionary.
+
+        Returns:
+            A dictionary representation of the Text_ dataclass.
+        """
+        return {
+            "text": self.text,
+            "words": self.words,
+            "ann_ids": self.ann_ids,
+            "token_classes": self.token_classes,
+            "token_class_ann_ids": self.token_class_ann_ids,
+            "token_tags": self.token_tags,
+            "token_tag_ann_ids": self.token_tag_ann_ids,
+            "token_class_ids": self.token_class_ids,
+            "token_tag_ids": self.token_tag_ids,
+        }
+
+
 class ImageAnnotationBaseView(ImageAnnotation):
     """
     Consumption class for having easier access to categories added to an `ImageAnnotation`.
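As a quick orientation for the new container, a minimal usage sketch (illustrative values only; the import path `deepdoctection.datapoint.view` is an assumption based on the relative imports in this hunk):

```python
from deepdoctection.datapoint.view import Text_  # assumed location of the new class

# All fields default to "" or an empty list, so partial construction is fine.
text_ = Text_(text="Invoice 2023", words=["Invoice", "2023"], ann_ids=["a1", "a2"])

print(text_.words)      # ['Invoice', '2023']
print(text_.as_dict())  # plain-dict view, e.g. for JSON serialization

# frozen=True makes instances immutable:
# text_.text = "other"  # would raise dataclasses.FrozenInstanceError
```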
@@ -263,41 +310,73 @@ class Layout(ImageAnnotationBaseView):
         """
         words = self.get_ordered_words()
         if words:
-            characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = zip(
-                *[
-                    (
-                        word.characters,
-                        word.annotation_id,
-                        word.token_class,
-                        word.token_tag,
-                        word.get_sub_category(WordType.TOKEN_CLASS).category_id
-                        if WordType.TOKEN_CLASS in word.sub_categories
-                        else None,
-                        word.get_sub_category(WordType.TOKEN_TAG).category_id
-                        if WordType.TOKEN_TAG in word.sub_categories
-                        else None,
-                    )
-                    for word in words
-                ]
+            (
+                characters,
+                ann_ids,
+                token_classes,
+                token_class_ann_ids,
+                token_tags,
+                token_tag_ann_ids,
+                token_classes_ids,
+                token_tag_ids,
+            ) = map(
+                list,
+                zip(
+                    *[
+                        (
+                            word.characters,
+                            word.annotation_id,
+                            word.token_class,
+                            word.get_sub_category(WordType.TOKEN_CLASS).annotation_id
+                            if WordType.TOKEN_CLASS in word.sub_categories
+                            else None,
+                            word.token_tag,
+                            word.get_sub_category(WordType.TOKEN_TAG).annotation_id
+                            if WordType.TOKEN_TAG in word.sub_categories
+                            else None,
+                            word.get_sub_category(WordType.TOKEN_CLASS).category_id
+                            if WordType.TOKEN_CLASS in word.sub_categories
+                            else None,
+                            word.get_sub_category(WordType.TOKEN_TAG).category_id
+                            if WordType.TOKEN_TAG in word.sub_categories
+                            else None,
+                        )
+                        for word in words
+                    ]
+                ),
             )
         else:
-            characters, ann_ids, token_classes, token_tags, token_classes_ids, token_tag_ids = (
-                [],  # type: ignore
-                [],  # type: ignore
-                [],  # type: ignore
-                [],  # type: ignore
-                [],  # type: ignore
-                [],  # type: ignore
+            (
+                characters,
+                ann_ids,
+                token_classes,
+                token_class_ann_ids,
+                token_tags,
+                token_tag_ann_ids,
+                token_classes_ids,
+                token_tag_ids,
+            ) = (
+                [],
+                [],
+                [],
+                [],
+                [],
+                [],
+                [],
+                [],
             )
-        return {
-            "text": " ".join(characters),
-            "words": characters,
-            "ann_ids": ann_ids,
-            "token_classes": token_classes,
-            "token_tags": token_tags,
-            "token_class_ids": token_classes_ids,
-            "token_tag_ids": token_tag_ids,
-        }
+
+        return Text_(
+            text=" ".join(characters),  # type: ignore
+            words=characters,  # type: ignore
+            ann_ids=ann_ids,  # type: ignore
+            token_classes=token_classes,  # type: ignore
+            token_class_ann_ids=token_class_ann_ids,  # type: ignore
+            token_tags=token_tags,  # type: ignore
+            token_tag_ann_ids=token_tag_ann_ids,  # type: ignore
+            token_class_ids=token_classes_ids,  # type: ignore
+            token_tag_ids=token_tag_ids,  # type: ignore
+        )
 
     def get_attribute_names(self) -> set[str]:
         attr_names = (
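The visible API change in this hunk: `Layout.text_` now returns a `Text_` instance instead of a plain dict, so key lookups become attribute access. A hedged before/after sketch for downstream code:

```python
# 0.44.0: text_ was a dict
# token_classes = layout.text_["token_classes"]

# 0.45.0: text_ is a Text_ dataclass, so keys become attributes
token_classes = layout.text_.token_classes

# dict-style consumers can bridge via the new as_dict() helper
token_classes = layout.text_.as_dict()["token_classes"]
```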
@@ -387,9 +466,9 @@ class Table(Layout):
         A list of a table cells.
         """
         cell_anns: list[Cell] = []
-        for row_number in range(1, self.number_of_rows + 1):  # type: ignore
-            cell_anns.extend(self.row(row_number))  # type: ignore
-
+        if self.number_of_rows:
+            for row_number in range(1, self.number_of_rows + 1):  # type: ignore
+                cell_anns.extend(self.row(row_number))  # type: ignore
         return cell_anns
 
     @property
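The new guard handles tables where `number_of_rows` is unset (presumably `None`); the 0.44.0 code would evaluate `None + 1` and raise `TypeError: unsupported operand type(s) for +: 'NoneType' and 'int'`. A minimal stand-in illustration:

```python
number_of_rows = None  # stand-in for a table without row annotations

cells: list[int] = []
if number_of_rows:  # 0.45.0: skip the loop when the count is None (or 0)
    for row_number in range(1, number_of_rows + 1):
        cells.append(row_number)
# 0.44.0 reached range(1, None + 1) here and crashed
```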
@@ -626,26 +705,33 @@ class Table(Layout):
         words: list[str] = []
         ann_ids: list[str] = []
         token_classes: list[str] = []
+        token_class_ann_ids: list[str] = []
         token_tags: list[str] = []
+        token_tag_ann_ids: list[str] = []
         token_class_ids: list[str] = []
         token_tag_ids: list[str] = []
         for cell in cells:
-            text.append(cell.text_["text"])
-            words.extend(cell.text_["words"])
-            ann_ids.extend(cell.text_["ann_ids"])
-            token_classes.extend(cell.text_["token_classes"])
-            token_tags.extend(cell.text_["token_tags"])
-            token_class_ids.extend(cell.text_["token_class_ids"])
-            token_tag_ids.extend(cell.text_["token_tag_ids"])
-        return {
-            "text": " ".join(text),
-            "words": words,
-            "ann_ids": ann_ids,
-            "token_classes": token_classes,
-            "token_tags": token_tags,
-            "token_class_ids": token_class_ids,
-            "token_tag_ids": token_tag_ids,
-        }
+            text_ = cell.text_
+            text.append(text_.text)
+            words.extend(text_.words)
+            ann_ids.extend(text_.ann_ids)
+            token_classes.extend(text_.token_classes)
+            token_class_ann_ids.extend(text_.token_class_ann_ids)
+            token_tags.extend(text_.token_tags)
+            token_tag_ann_ids.extend(text_.token_tag_ann_ids)
+            token_class_ids.extend(text_.token_class_ids)
+            token_tag_ids.extend(text_.token_tag_ids)
+        return Text_(
+            text=" ".join(text),
+            words=words,
+            ann_ids=ann_ids,
+            token_classes=token_classes,
+            token_class_ann_ids=token_class_ann_ids,
+            token_tags=token_tags,
+            token_tag_ann_ids=token_tag_ann_ids,
+            token_class_ids=token_class_ids,
+            token_tag_ids=token_tag_ids,
+        )
 
     @property
     def words(self) -> list[ImageAnnotationBaseView]:
@@ -1053,7 +1139,7 @@ class Page(Image):
 
        ```python
        {"text": text string,
-        "text_list": list of single words,
+        "words": list of single words,
         "annotation_ids": word annotation ids}
        ```
        """
@@ -1062,26 +1148,33 @@ class Page(Image):
         words: list[str] = []
         ann_ids: list[str] = []
         token_classes: list[str] = []
+        token_class_ann_ids: list[str] = []
         token_tags: list[str] = []
+        token_tag_ann_ids: list[str] = []
         token_class_ids: list[str] = []
         token_tag_ids: list[str] = []
         for block in block_with_order:
-            text.append(block.text_["text"])  # type: ignore
-            words.extend(block.text_["words"])  # type: ignore
-            ann_ids.extend(block.text_["ann_ids"])  # type: ignore
-            token_classes.extend(block.text_["token_classes"])  # type: ignore
-            token_tags.extend(block.text_["token_tags"])  # type: ignore
-            token_class_ids.extend(block.text_["token_class_ids"])  # type: ignore
-            token_tag_ids.extend(block.text_["token_tag_ids"])  # type: ignore
-        return {
-            "text": " ".join(text),
-            "words": words,
-            "ann_ids": ann_ids,
-            "token_classes": token_classes,
-            "token_tags": token_tags,
-            "token_class_ids": token_class_ids,
-            "token_tag_ids": token_tag_ids,
-        }
+            text_ = block.text_
+            text.append(text_.text)  # type: ignore
+            words.extend(text_.words)  # type: ignore
+            ann_ids.extend(text_.ann_ids)  # type: ignore
+            token_classes.extend(text_.token_classes)  # type: ignore
+            token_class_ann_ids.extend(text_.token_class_ann_ids)  # type: ignore
+            token_tags.extend(text_.token_tags)  # type: ignore
+            token_tag_ann_ids.extend(text_.token_tag_ann_ids)  # type: ignore
+            token_class_ids.extend(text_.token_class_ids)  # type: ignore
+            token_tag_ids.extend(text_.token_tag_ids)  # type: ignore
+        return Text_(
+            text=" ".join(text),
+            words=words,
+            ann_ids=ann_ids,
+            token_classes=token_classes,
+            token_class_ann_ids=token_class_ann_ids,
+            token_tags=token_tags,
+            token_tag_ann_ids=token_tag_ann_ids,
+            token_class_ids=token_class_ids,
+            token_tag_ids=token_tag_ann_ids,
+        )
 
     def get_layout_context(self, annotation_id: str, context_size: int = 3) -> list[ImageAnnotationBaseView]:
         """
@@ -408,6 +408,7 @@ class MergeDataset(DatasetBase):
 
 class DatasetCardDict(TypedDict):
     """DatasetCardDict"""
+
     name: str
     dataset_type: Union[str, Any]
     location: str
@@ -26,6 +26,7 @@ from .doctrocr import *
 from .fastlang import *
 from .hfdetr import *
 from .hflayoutlm import *
+from .hflm import *
 from .model import *
 from .pdftext import *
 from .tessocr import *
@@ -91,7 +91,7 @@ def d2_predict_image(
     """
     height, width = np_img.shape[:2]
     resized_img = resizer.get_transform(np_img).apply_image(np_img)
-    image = torch.as_tensor(resized_img.astype("float32").transpose(2, 0, 1))
+    image = torch.as_tensor(resized_img.astype(np.float32).transpose(2, 0, 1))
 
     with torch.no_grad():
         inputs = {"image": image, "height": height, "width": width}
@@ -29,13 +29,14 @@ from typing import Any, Mapping, Union
 
 from lazy_imports import try_import
 
-from ..utils.file_utils import Requirement, get_fasttext_requirement
+from ..utils.develop import deprecated
+from ..utils.file_utils import Requirement, get_fasttext_requirement, get_numpy_v1_requirement
 from ..utils.settings import TypeOrStr, get_type
 from ..utils.types import PathLikeOrStr
 from .base import DetectionResult, LanguageDetector, ModelCategories
 
 with try_import() as import_guard:
-    from fasttext import load_model  # type: ignore
+    from fasttext import load_model  # type: ignore # pylint: disable=E0401
 
 
 class FasttextLangDetectorMixin(LanguageDetector, ABC):
@@ -61,7 +62,7 @@ class FasttextLangDetectorMixin(LanguageDetector, ABC):
         Returns:
             `DetectionResult` filled with `text` and `score`
         """
-        return DetectionResult(text=self.categories_orig[output[0][0]], score=output[1][0])
+        return DetectionResult(class_name=self.categories_orig[output[0][0]], score=output[1][0])
 
     @staticmethod
     def get_name(path_weights: PathLikeOrStr) -> str:
@@ -69,6 +70,7 @@ class FasttextLangDetectorMixin(LanguageDetector, ABC):
         return "fasttext_" + "_".join(Path(path_weights).parts[-2:])
 
 
+@deprecated("As FastText archived, it will be deprecated in the near future.", "2025-08-17")
 class FasttextLangDetector(FasttextLangDetectorMixin):
     """
     Fasttext language detector wrapper. Two models provided in the fasttext library can be used to identify languages.
@@ -114,7 +116,7 @@ class FasttextLangDetector(FasttextLangDetectorMixin):
 
     @classmethod
     def get_requirements(cls) -> list[Requirement]:
-        return [get_fasttext_requirement()]
+        return [get_numpy_v1_requirement(), get_fasttext_requirement()]
 
     def clone(self) -> FasttextLangDetector:
         return self.__class__(self.path_weights, self.categories.get_categories(), self.categories_orig)
@@ -126,10 +126,13 @@ def get_tokenizer_from_model_class(model_class: str, use_xlm_tokenizer: bool) ->
         ("XLMRobertaForSequenceClassification", True): XLMRobertaTokenizerFast.from_pretrained(
             "FacebookAI/xlm-roberta-base"
         ),
+        ("XLMRobertaForTokenClassification", True): XLMRobertaTokenizerFast.from_pretrained(
+            "FacebookAI/xlm-roberta-base"
+        ),
     }[(model_class, use_xlm_tokenizer)]
 
 
-def predict_token_classes(
+def predict_token_classes_from_layoutlm(
     uuids: list[list[str]],
     input_ids: torch.Tensor,
     attention_mask: torch.Tensor,
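Besides renaming `predict_token_classes` to `predict_token_classes_from_layoutlm`, this hunk extends the tokenizer lookup so token-classification XLM-R model classes resolve as well. A hedged sketch of the newly valid call (assuming the `FacebookAI/xlm-roberta-base` weights are available locally or downloadable):

```python
# In 0.44.0 this key was missing from the mapping and raised KeyError;
# 0.45.0 adds the ("XLMRobertaForTokenClassification", True) entry.
tokenizer = get_tokenizer_from_model_class(
    model_class="XLMRobertaForTokenClassification",
    use_xlm_tokenizer=True,
)
```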
@@ -192,7 +195,7 @@ def predict_token_classes(
     return all_token_classes
 
 
-def predict_sequence_classes(
+def predict_sequence_classes_from_layoutlm(
     input_ids: torch.Tensor,
     attention_mask: torch.Tensor,
     token_type_ids: torch.Tensor,
@@ -462,7 +465,7 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
 
         ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
 
-        results = predict_token_classes(
+        results = predict_token_classes_from_layoutlm(
             ann_ids, input_ids, attention_mask, token_type_ids, boxes, tokens, self.model, None
         )
 
@@ -586,7 +589,7 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
             images = images.to(self.device)
         else:
             raise ValueError(f"images must be list but is {type(images)}")
-        results = predict_token_classes(
+        results = predict_token_classes_from_layoutlm(
             ann_ids, input_ids, attention_mask, token_type_ids, boxes, tokens, self.model, images
         )
 
@@ -710,7 +713,7 @@ class HFLayoutLmv3TokenClassifier(HFLayoutLmTokenClassifierBase):
             images = images.to(self.device)
         else:
             raise ValueError(f"images must be list but is {type(images)}")
-        results = predict_token_classes(
+        results = predict_token_classes_from_layoutlm(
             ann_ids, input_ids, attention_mask, token_type_ids, boxes, tokens, self.model, images
         )
 
@@ -909,7 +912,7 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         """
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
 
-        result = predict_sequence_classes(
+        result = predict_sequence_classes_from_layoutlm(
             input_ids,
             attention_mask,
             token_type_ids,
@@ -1021,7 +1024,12 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         else:
             raise ValueError(f"images must be list but is {type(images)}")
 
-        result = predict_sequence_classes(input_ids, attention_mask, token_type_ids, boxes, self.model, images)
+        result = predict_sequence_classes_from_layoutlm(input_ids,
+                                                        attention_mask,
+                                                        token_type_ids,
+                                                        boxes,
+                                                        self.model,
+                                                        images)
 
         result.class_id += 1
         result.class_name = self.categories.categories[result.class_id]
@@ -1115,7 +1123,12 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         else:
             raise ValueError(f"images must be list but is {type(images)}")
 
-        result = predict_sequence_classes(input_ids, attention_mask, token_type_ids, boxes, self.model, images)
+        result = predict_sequence_classes_from_layoutlm(input_ids,
+                                                        attention_mask,
+                                                        token_type_ids,
+                                                        boxes,
+                                                        self.model,
+                                                        images)
 
         result.class_id += 1
         result.class_name = self.categories.categories[result.class_id]
@@ -1245,7 +1258,7 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
 
         ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
 
-        results = predict_token_classes(
+        results = predict_token_classes_from_layoutlm(
             ann_ids, input_ids, attention_mask, token_type_ids, boxes, tokens, self.model, None
         )
 
@@ -1323,7 +1336,7 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
 
-        result = predict_sequence_classes(
+        result = predict_sequence_classes_from_layoutlm(
             input_ids,
             attention_mask,
             token_type_ids,