PyPI - deepdoctection - Versions diffs - 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl - Mend

deepdoctection 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (24)

deepdoctection/__init__.py +6 -3
deepdoctection/analyzer/config.py +41 -0
deepdoctection/analyzer/factory.py +249 -1
deepdoctection/configs/profiles.jsonl +2 -1
deepdoctection/datapoint/image.py +1 -0
deepdoctection/datapoint/view.py +162 -69
deepdoctection/datasets/base.py +1 -0
deepdoctection/extern/__init__.py +1 -0
deepdoctection/extern/d2detect.py +1 -1
deepdoctection/extern/fastlang.py +6 -4
deepdoctection/extern/hflayoutlm.py +23 -10
deepdoctection/extern/hflm.py +432 -7
deepdoctection/mapper/laylmstruct.py +7 -7
deepdoctection/pipe/language.py +4 -4
deepdoctection/pipe/lm.py +7 -3
deepdoctection/utils/file_utils.py +34 -0
deepdoctection/utils/settings.py +2 -0
deepdoctection/utils/types.py +0 -1
deepdoctection/utils/viz.py +3 -3
{deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/METADATA +15 -15
{deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/RECORD +24 -24
{deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/WHEEL +0 -0
{deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/licenses/LICENSE +0 -0
{deepdoctection-0.44.0.dist-info → deepdoctection-0.45.0.dist-info}/top_level.txt +0 -0

deepdoctection/extern/hflm.py CHANGED Viewed

@@ -20,16 +20,28 @@ Wrapper for the HF Language Model for sequence and token classification
 """
 from __future__ import annotations
+import os
 from abc import ABC
+from collections import defaultdict
 from pathlib import Path
-from typing import Literal, Mapping, Optional, Union
+from typing import TYPE_CHECKING, Any, Literal, Mapping, Optional, Sequence, Union
 from lazy_imports import try_import
+from typing_extensions import TypeAlias
 from ..utils.file_utils import get_pytorch_requirement, get_transformers_requirement
 from ..utils.settings import TypeOrStr
 from ..utils.types import JsonDict, PathLikeOrStr, Requirement
-from .base import LMSequenceClassifier, ModelCategories, SequenceClassResult
+from .base import (
+    DetectionResult,
+    LanguageDetector,
+    LMSequenceClassifier,
+    LMTokenClassifier,
+    ModelCategories,
+    NerModelCategories,
+    SequenceClassResult,
+    TokenClassResult,
+)
 from .hflayoutlm import get_tokenizer_from_model_class
 from .pt.ptutils import get_torch_device
@@ -38,14 +50,63 @@ with try_import() as pt_import_guard:
     import torch.nn.functional as F
 with try_import() as tr_import_guard:
-    from transformers import PretrainedConfig, XLMRobertaForSequenceClassification
+    from transformers import (
+        PretrainedConfig,
+        XLMRobertaForSequenceClassification,
+        XLMRobertaForTokenClassification,
+        XLMRobertaTokenizerFast,
+    )
-def predict_sequence_classes(
+def predict_token_classes_from_lm(
+    uuids: list[list[str]],
     input_ids: torch.Tensor,
     attention_mask: torch.Tensor,
     token_type_ids: torch.Tensor,
-    model: Union[XLMRobertaForSequenceClassification],
+    tokens: list[list[str]],
+    model: XLMRobertaForTokenClassification,
+) -> list[TokenClassResult]:
+    """
+    Args:
+        uuids: A list of uuids that correspond to a word that induces the resulting token
+        input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+        attention_mask: The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+        token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+        tokens: List of original tokens taken from `LayoutLMTokenizer`
+        model: layoutlm model for token classification
+    Returns:
+        A list of `TokenClassResult`s
+    """
+    outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
+    soft_max = F.softmax(outputs.logits, dim=2)
+    score = torch.max(soft_max, dim=2)[0].tolist()
+    token_class_predictions_ = outputs.logits.argmax(-1).tolist()
+    input_ids_list = input_ids.tolist()
+    all_results = defaultdict(list)
+    for idx, uuid_list in enumerate(uuids):
+        for pos, token in enumerate(uuid_list):
+            all_results[token].append(
+                (input_ids_list[idx][pos], token_class_predictions_[idx][pos], tokens[idx][pos], score[idx][pos])
+            )
+    all_token_classes = []
+    for uuid, res in all_results.items():
+        res.sort(key=lambda x: x[3], reverse=True)
+        output = res[0]
+        all_token_classes.append(
+            TokenClassResult(uuid=uuid, token_id=output[0], class_id=output[1], token=output[2], score=output[3])
+        )
+    return all_token_classes
+def predict_sequence_classes_from_lm(
+    input_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    token_type_ids: torch.Tensor,
+    model: XLMRobertaForSequenceClassification,
 ) -> SequenceClassResult:
     """
     Args:
@@ -66,6 +127,250 @@ def predict_sequence_classes(
     return SequenceClassResult(class_id=sequence_class_predictions, score=float(score))  # type: ignore
+class HFLmTokenClassifierBase(LMTokenClassifier, ABC):
+    """
+    Abstract base class for wrapping Bert-like models for token classification into the framework.
+    """
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+        categories_bio: Optional[Sequence[TypeOrStr]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = False,
+    ):
+        """
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
+                                 entities self. To be consistent with detectors use only values `>0`. Conversion will
+                                 be done internally.
+            categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. `BIO`). To be
+                           consistent with detectors use only `values>0`. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of NER categories
+            device: The device (cpu,"cuda"), where to place the model.
+            use_xlm_tokenizer: True if one uses the `LayoutXLM` or a lilt model built with a xlm language model, e.g.
+                              `info-xlm` or `roberta-xlm`. (`LayoutXLM` cannot be distinguished from LayoutLMv2).
+        """
+        if categories is None:
+            if categories_semantics is None:
+                raise ValueError("If categories is None then categories_semantics cannot be None")
+            if categories_bio is None:
+                raise ValueError("If categories is None then categories_bio cannot be None")
+        self.path_config = Path(path_config_json)
+        self.path_weights = Path(path_weights)
+        self.categories = NerModelCategories(
+            init_categories=categories, categories_semantics=categories_semantics, categories_bio=categories_bio
+        )
+        self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
+    @classmethod
+    def get_requirements(cls) -> list[Requirement]:
+        return [get_pytorch_requirement(), get_transformers_requirement()]
+    def _map_category_names(self, token_results: list[TokenClassResult]) -> list[TokenClassResult]:
+        for result in token_results:
+            result.class_name = self.categories.categories[result.class_id + 1]
+            output = self.categories.disentangle_token_class_and_tag(result.class_name)
+            if output is not None:
+                token_class, tag = output
+                result.semantic_name = token_class
+                result.bio_tag = tag
+            else:
+                result.semantic_name = result.class_name
+            result.class_id += 1
+        return token_results
+    def _validate_encodings(
+        self, **encodings: Any
+    ) -> tuple[list[list[str]], list[str], torch.Tensor, torch.Tensor, torch.Tensor, list[list[str]]]:
+        image_ids = encodings.get("image_ids", [])
+        ann_ids = encodings.get("ann_ids")
+        input_ids = encodings.get("input_ids")
+        attention_mask = encodings.get("attention_mask")
+        token_type_ids = encodings.get("token_type_ids")
+        tokens = encodings.get("tokens")
+        assert isinstance(ann_ids, list), type(ann_ids)
+        if len(set(image_ids)) > 1:
+            raise ValueError("HFLmTokenClassifier accepts for inference only one image.")
+        if isinstance(input_ids, torch.Tensor):
+            input_ids = input_ids.to(self.device)
+        else:
+            raise ValueError(f"input_ids must be list but is {type(input_ids)}")
+        if isinstance(attention_mask, torch.Tensor):
+            attention_mask = attention_mask.to(self.device)
+        else:
+            raise ValueError(f"attention_mask must be list but is {type(attention_mask)}")
+        if isinstance(token_type_ids, torch.Tensor):
+            token_type_ids = token_type_ids.to(self.device)
+        else:
+            raise ValueError(f"token_type_ids must be list but is {type(token_type_ids)}")
+        if not isinstance(tokens, list):
+            raise ValueError(f"tokens must be list but is {type(tokens)}")
+        return ann_ids, image_ids, input_ids, attention_mask, token_type_ids, tokens
+    def clone(self) -> HFLmTokenClassifierBase:
+        return self.__class__(
+            self.path_config,
+            self.path_weights,
+            self.categories.categories_semantics,
+            self.categories.categories_bio,
+            self.categories.get_categories(),
+            self.device,
+            self.use_xlm_tokenizer,
+        )
+    @staticmethod
+    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
+        """Returns the name of the model"""
+        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+    @staticmethod
+    def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
+        """
+        A refinement for adding the tokenizer class name to the model configs.
+        Args:
+            model_class_name: The model name, e.g. `model.__class__.__name__`
+            use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
+        Returns:
+            The name of the tokenizer class.
+        """
+        tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
+        return tokenizer.__class__.__name__
+    @staticmethod
+    def image_to_raw_features_mapping() -> str:
+        """Returns the mapping function to convert images into raw features."""
+        return "image_to_raw_lm_features"
+    @staticmethod
+    def image_to_features_mapping() -> str:
+        """Returns the mapping function to convert images into features."""
+        return "image_to_lm_features"
+class HFLmTokenClassifier(HFLmTokenClassifierBase):
+    """
+    A wrapper class for `transformers.XLMRobertaForTokenClassification` and similar models to use within a pipeline
+    component. Check <https://huggingface.co/docs/transformers/model_doc/xlm-roberta> for documentation of the
+    model itself.
+    Note that this model is equipped with a head that is only useful for classifying the tokens. For sequence
+    classification and other things please use another model of the family.
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
+        # hf tokenizer and token classifier
+        tokenizer = XLMRobertaTokenizerFast.from_pretrained("FacebookAI/xlm-roberta-base")
+        roberta = XLMRobertaForTokenClassification("path/to/config.json","path/to/model.bin",
+                                              categories=["first_name", "surname", "street"])
+        # token classification service
+        roberta_service = LMTokenClassifierService(tokenizer,roberta)
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,roberta_service])
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
+        for dp in df:
+            ...
+        ```
+    """
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories_semantics: Optional[Sequence[TypeOrStr]] = None,
+        categories_bio: Optional[Sequence[TypeOrStr]] = None,
+        categories: Optional[Mapping[int, TypeOrStr]] = None,
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = True,
+    ):
+        """
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
+                                 entities self. To be consistent with detectors use only values `>0`. Conversion will
+                                 be done internally.
+            categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. BIO). To be
+                           consistent with detectors use only values>0. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of NER categories
+            device: The device (cpu,"cuda"), where to place the model.
+            use_xlm_tokenizer: Do not change this value unless you pre-trained a bert-like model with a different
+                              Tokenizer.
+        """
+        super().__init__(
+            path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
+        )
+        self.name = self.get_name(path_weights, "bert-like-token-classification")
+        self.model_id = self.get_model_id()
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        self.model.to(self.device)
+        self.model.config.tokenizer_class = self.get_tokenizer_class_name(
+            self.model.__class__.__name__, self.use_xlm_tokenizer
+        )
+    def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
+        """
+        Launch inference on bert-like models for token classification. Pass the following arguments
+        Args:
+            encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+                       attention_mask: The associated attention masks from padded sequences taken from
+                                       `LayoutLMTokenizer`
+                       token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+                       boxes: Torch tensor of bounding boxes of type `xyxy`
+                       tokens: List of original tokens taken from `LayoutLMTokenizer`
+        Returns:
+            A list of `TokenClassResult`s
+        """
+        ann_ids, _, input_ids, attention_mask, token_type_ids, tokens = self._validate_encodings(**encodings)
+        results = predict_token_classes_from_lm(ann_ids, input_ids, attention_mask, token_type_ids, tokens, self.model)
+        return self._map_category_names(results)
+    @staticmethod
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> XLMRobertaForTokenClassification:
+        """
+        Get the inner (wrapped) model.
+        Args:
+            path_config_json: path to .json config file
+            path_weights: path to model artifact
+        Returns:
+            `nn.Module`
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
+        return XLMRobertaForTokenClassification.from_pretrained(
+            pretrained_model_name_or_path=os.fspath(path_weights), config=config
+        )
+    def clear_model(self) -> None:
+        self.model = None
 class HFLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     """
     Abstract base class for wrapping Bert-type models for sequence classification into the deepdoctection framework.
@@ -208,10 +513,11 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
         use_xlm_tokenizer: bool = True,
     ):
         super().__init__(path_config_json, path_weights, categories, device)
-        self.name = self.get_name(path_weights, "bert-like")
+        self.name = self.get_name(path_weights, "bert-like-sequence-classification")
         self.model_id = self.get_model_id()
         self.model = self.get_wrapped_model(path_config_json, path_weights)
         self.model.to(self.device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
         self.model.config.tokenizer_class = self.get_tokenizer_class_name(
             self.model.__class__.__name__, use_xlm_tokenizer
         )
@@ -219,7 +525,7 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
         input_ids, attention_mask, token_type_ids = self._validate_encodings(**encodings)
-        result = predict_sequence_classes(
+        result = predict_sequence_classes_from_lm(
             input_ids,
             attention_mask,
             token_type_ids,
@@ -262,3 +568,122 @@ class HFLmSequenceClassifier(HFLmSequenceClassifierBase):
     def clear_model(self) -> None:
         self.model = None
+class HFLmLanguageDetector(LanguageDetector):
+    """
+    Language detector using HuggingFace's `XLMRobertaForSequenceClassification`.
+    This class wraps a multilingual sequence classification model (XLMRobertaForSequenceClassification)
+    for language detection tasks. Input text is tokenized and truncated/padded to a maximum length of 512 tokens.
+    The prediction returns a `DetectionResult` containing the detected language code and its confidence score.
+    """
+    def __init__(
+        self,
+        path_config_json: PathLikeOrStr,
+        path_weights: PathLikeOrStr,
+        categories: Mapping[int, TypeOrStr],
+        device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
+        use_xlm_tokenizer: bool = True,
+    ):
+        super().__init__()
+        self.path_config = Path(path_config_json)
+        self.path_weights = Path(path_weights)
+        self.categories = ModelCategories(init_categories=categories)
+        self.device = get_torch_device(device)
+        self.use_xlm_tokenizer = use_xlm_tokenizer
+        self.model = self.get_wrapped_model(path_config_json, path_weights)
+        self.model.to(self.device)
+        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")
+        self.name = self.get_name(path_weights, "bert-like-language-detection")
+        self.model_id = self.get_model_id()
+    def predict(self, text_string: str) -> DetectionResult:
+        """
+        Predict the language of the input sequence.
+        Args:
+            text_string: The input text sequence to classify.
+        Returns:
+            DetectionResult: The detected language and its confidence score.
+        """
+        encoding = self.tokenizer(
+            text_string,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=512,
+        )
+        input_ids = encoding["input_ids"].to(self.device)
+        attention_mask = encoding["attention_mask"].to(self.device)
+        token_type_ids = encoding.get("token_type_ids")
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.to(self.device)
+        else:
+            token_type_ids = torch.zeros_like(input_ids)
+        self.model.eval()
+        with torch.no_grad():
+            outputs = self.model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+            )
+            probs = torch.softmax(outputs.logits, dim=-1)
+            score, class_id_tensor = torch.max(probs, dim=-1)
+            class_id = int(class_id_tensor.item() + 1)
+            lang = self.categories.categories[class_id]
+        return DetectionResult(class_name=lang, score=float(score.item()))
+    def clear_model(self) -> None:
+        self.model = None
+    @classmethod
+    def get_requirements(cls) -> list[Requirement]:
+        return [get_pytorch_requirement(), get_transformers_requirement()]
+    @staticmethod
+    def get_wrapped_model(
+        path_config_json: PathLikeOrStr, path_weights: PathLikeOrStr
+    ) -> XLMRobertaForSequenceClassification:
+        """
+        Get the inner (wrapped) model.
+        Args:
+            path_config_json: path to .json config file
+            path_weights: path to model artifact
+        Returns:
+            `XLMRobertaForSequenceClassification`
+        """
+        config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
+        return XLMRobertaForSequenceClassification.from_pretrained(
+            pretrained_model_name_or_path=path_weights, config=config
+        )
+    def clone(self) -> HFLmLanguageDetector:
+        return self.__class__(
+            self.path_config, self.path_weights, self.categories.get_categories(), self.device, self.use_xlm_tokenizer
+        )
+    @staticmethod
+    def get_name(path_weights: PathLikeOrStr, architecture: str) -> str:
+        """
+        Returns the name of the model
+        Args:
+            path_weights: Path to model weights
+            architecture: Architecture name
+        Returns:
+            str: Model name
+        """
+        return f"Transformers_{architecture}_" + "_".join(Path(path_weights).parts[-2:])
+if TYPE_CHECKING:
+    LmTokenModels: TypeAlias = Union[HFLmTokenClassifier,]
+    LmSequenceModels: TypeAlias = Union[HFLmSequenceClassifier,]

deepdoctection/mapper/laylmstruct.py CHANGED Viewed

@@ -806,17 +806,17 @@ def image_to_raw_lm_features(
     raw_features["image_id"] = page.image_id
     raw_features["width"] = page.width
     raw_features["height"] = page.height
-    raw_features["ann_ids"] = text_["ann_ids"]
-    raw_features["words"] = text_["words"]
+    raw_features["ann_ids"] = text_.ann_ids
+    raw_features["words"] = text_.words
     # We use a dummy bounding box for all bounding boxes so that we can pass the raw features to
     # raw_features_to_layoutlm_features
-    raw_features["bbox"] = [_CLS_BOX] * len(text_["words"])
+    raw_features["bbox"] = [_CLS_BOX] * len(text_.words)
     raw_features["dataset_type"] = dataset_type
-    if use_token_tag and text_["token_tags"]:
-        raw_features["labels"] = text_["token_tags"]
-    elif text_["token_classes"]:
-        raw_features["labels"] = text_["token_classes"]
+    if use_token_tag and text_.token_tags:
+        raw_features["labels"] = text_.token_tags
+    elif text_.token_classes:
+        raw_features["labels"] = text_.token_classes
     elif page.document_type is not None:
         document_type_id = page.image_orig.summary.get_sub_category(PageType.DOCUMENT_TYPE).category_id - 1
         raw_features["labels"] = [document_type_id]

deepdoctection/pipe/language.py CHANGED Viewed

@@ -21,7 +21,7 @@ Module for language detection pipeline component
 from typing import Optional, Sequence
 from ..datapoint.image import Image, MetaAnnotation
-from ..datapoint.view import ImageDefaults, Page
+from ..datapoint.view import IMAGE_DEFAULTS, Page
 from ..extern.base import LanguageDetector, ObjectDetector
 from ..utils.error import ImageError
 from ..utils.settings import PageType, TypeOrStr, get_type
@@ -75,11 +75,11 @@ class LanguageDetectionService(PipelineComponent):
         self.predictor = language_detector
         self.text_detector = text_detector
-        self.text_container = get_type(text_container) if text_container is not None else ImageDefaults.TEXT_CONTAINER
+        self.text_container = get_type(text_container) if text_container is not None else IMAGE_DEFAULTS.TEXT_CONTAINER
         self.floating_text_block_categories = (
             tuple(get_type(text_block) for text_block in floating_text_block_categories)
             if (floating_text_block_categories is not None)
-            else ()
+            else IMAGE_DEFAULTS.FLOATING_TEXT_BLOCK_CATEGORIES
         )
         super().__init__(self._get_name(self.predictor.name))
@@ -109,7 +109,7 @@ class LanguageDetectionService(PipelineComponent):
             text = " ".join((result.text for result in detect_result_list if result.text is not None))
         predict_result = self.predictor.predict(text)
         self.dp_manager.set_summary_annotation(
-            PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.text, predict_result.score
+            PageType.LANGUAGE, PageType.LANGUAGE, 1, predict_result.class_name, predict_result.score
         )
     def clone(self) -> PipelineComponent:

deepdoctection/pipe/lm.py CHANGED Viewed

@@ -20,6 +20,7 @@ Module for token classification pipeline
 """
 from __future__ import annotations
+import inspect
 from copy import copy
 from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Sequence, Union
@@ -32,6 +33,7 @@ from .registry import pipeline_component_registry
 if TYPE_CHECKING:
     from ..extern.hflayoutlm import LayoutSequenceModels, LayoutTokenModels
+    from ..extern.hflm import LmSequenceModels, LmTokenModels
 @pipeline_component_registry.register("LMTokenClassifierService")
@@ -70,7 +72,7 @@ class LMTokenClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: LayoutTokenModels,
+        language_model: Union[LayoutTokenModels, LmTokenModels],
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,
@@ -124,7 +126,7 @@ class LMTokenClassifierService(PipelineComponent):
                                            might not get sent to the model because they are categorized as not
                                            eligible token (e.g. empty string). If set to `True` it will assign all
                                            words without token the `BioTag.outside` token.
-            segment_positions: Using bounding boxes of segment instead of words improves model accuracy
+            segment_positions: Using bounding boxes of segments instead of words improves model accuracy
                                significantly for models that have been trained on segments rather than words.
                                Choose a single or a sequence of layout segments to use their bounding boxes. Note,
                                that the layout segments need to have a child-relationship with words. If a word
@@ -271,6 +273,8 @@ class LMTokenClassifierService(PipelineComponent):
                 f"You want to use {type(self.tokenizer)} but you should use {tokenizer_class_name} "
                 f"in this framework"
             )
+        func_params = inspect.signature(self.mapping_to_lm_input_func).parameters
+        self.required_kwargs = {k: v for k, v in self.required_kwargs.items() if k in func_params}
     @staticmethod
     def image_to_features_func(mapping_str: str) -> Callable[..., Callable[[Image], Optional[Any]]]:
@@ -318,7 +322,7 @@ class LMSequenceClassifierService(PipelineComponent):
     def __init__(
         self,
         tokenizer: Any,
-        language_model: LayoutSequenceModels,
+        language_model: Union[LayoutSequenceModels, LmSequenceModels],
         padding: Literal["max_length", "do_not_pad", "longest"] = "max_length",
         truncation: bool = True,
         return_overflowing_tokens: bool = False,

deepdoctection/utils/file_utils.py CHANGED Viewed

@@ -18,6 +18,7 @@ from types import ModuleType
 from typing import Any, Union, no_type_check
 import importlib_metadata
+import numpy as np
 from packaging import version
 from .error import DependencyError
@@ -249,6 +250,39 @@ def get_distance_requirement() -> Requirement:
     return "distance", distance_available(), _DISTANCE_ERR_MSG
+_NUMPY_V1_ERR_MSG = "numpy v1 must be installed."
+def numpy_v1_available() -> bool:
+    """
+    Check if the installed NumPy version is version 1.
+    This helper function determines whether the currently installed version
+    of NumPy is version 1 by inspecting its major version number.
+    Returns:
+        True if the installed NumPy version is 1, otherwise False
+    """
+    major_version = np.__version__.split(".", maxsplit=1)[0]
+    print(f"major version: {major_version}")
+    if major_version in (1, "1"):
+        return True
+    return False
+def get_numpy_v1_requirement() -> Requirement:
+    """
+    Retrieves the requirement details for numpy version 1.
+    Returns:
+        A tuple containing three elements:
+            - The requirement name for numpy version 1.
+            - A Boolean value indicating whether numpy version 1 is available.
+            - An error message in case numpy version 1 is not available.
+    """
+    return "numpy v1", numpy_v1_available(), _NUMPY_V1_ERR_MSG
 # Transformers
 _TRANSFORMERS_AVAILABLE = importlib.util.find_spec("transformers") is not None
 _TRANSFORMERS_ERR_MSG = f"transformers must be installed. {_GENERIC_ERR_MSG}"

deepdoctection/utils/settings.py CHANGED Viewed

@@ -108,6 +108,7 @@ class DocumentType(ObjectTypes):
     GOVERNMENT_TENDERS = "government_tenders"
     MANUALS = "manuals"
     PATENTS = "patents"
+    BANK_STATEMENT = "bank_statement"
 @object_types_registry.register("LayoutType")
@@ -296,6 +297,7 @@ class Languages(ObjectTypes):
     BOSNIAN = "bos"
     NORWEGIAN_NOVOSIBIRSK = "nno"
     URDU = "urd"
+    SWAHILI = "swa"
     NOT_DEFINED = "nn"

deepdoctection/utils/types.py CHANGED Viewed

@@ -70,7 +70,6 @@ AnnotationDict: TypeAlias = dict[str, Any]
 ImageDict: TypeAlias = dict[str, Any]
 # We use these types for output types of the Page object
-Text_: TypeAlias = dict[str, Any]
 HTML: TypeAlias = str
 csv: TypeAlias = list[list[str]]
 Chunks: TypeAlias = list[tuple[str, str, int, str, str, str, str]]

deepdoctection 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.44.0__py3-none-any.whl → 0.45.0__py3-none-any.whl