PyPI - deepdoctection - Versions diffs - 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl - Mend

deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of deepdoctection might be problematic. Click here for more details.

Files changed (124) hide show

deepdoctection/__init__.py +2 -1
deepdoctection/analyzer/__init__.py +2 -1
deepdoctection/analyzer/config.py +904 -0
deepdoctection/analyzer/dd.py +36 -62
deepdoctection/analyzer/factory.py +311 -141
deepdoctection/configs/conf_dd_one.yaml +100 -44
deepdoctection/configs/profiles.jsonl +32 -0
deepdoctection/dataflow/__init__.py +9 -6
deepdoctection/dataflow/base.py +33 -15
deepdoctection/dataflow/common.py +96 -75
deepdoctection/dataflow/custom.py +36 -29
deepdoctection/dataflow/custom_serialize.py +135 -91
deepdoctection/dataflow/parallel_map.py +33 -31
deepdoctection/dataflow/serialize.py +15 -10
deepdoctection/dataflow/stats.py +41 -28
deepdoctection/datapoint/__init__.py +4 -6
deepdoctection/datapoint/annotation.py +104 -66
deepdoctection/datapoint/box.py +190 -130
deepdoctection/datapoint/convert.py +66 -39
deepdoctection/datapoint/image.py +151 -95
deepdoctection/datapoint/view.py +383 -236
deepdoctection/datasets/__init__.py +2 -6
deepdoctection/datasets/adapter.py +11 -11
deepdoctection/datasets/base.py +118 -81
deepdoctection/datasets/dataflow_builder.py +18 -12
deepdoctection/datasets/info.py +76 -57
deepdoctection/datasets/instances/__init__.py +6 -2
deepdoctection/datasets/instances/doclaynet.py +17 -14
deepdoctection/datasets/instances/fintabnet.py +16 -22
deepdoctection/datasets/instances/funsd.py +11 -6
deepdoctection/datasets/instances/iiitar13k.py +9 -9
deepdoctection/datasets/instances/layouttest.py +9 -9
deepdoctection/datasets/instances/publaynet.py +9 -9
deepdoctection/datasets/instances/pubtables1m.py +13 -13
deepdoctection/datasets/instances/pubtabnet.py +13 -15
deepdoctection/datasets/instances/rvlcdip.py +8 -8
deepdoctection/datasets/instances/xfund.py +11 -9
deepdoctection/datasets/registry.py +18 -11
deepdoctection/datasets/save.py +12 -11
deepdoctection/eval/__init__.py +3 -2
deepdoctection/eval/accmetric.py +72 -52
deepdoctection/eval/base.py +29 -10
deepdoctection/eval/cocometric.py +14 -12
deepdoctection/eval/eval.py +56 -41
deepdoctection/eval/registry.py +6 -3
deepdoctection/eval/tedsmetric.py +24 -9
deepdoctection/eval/tp_eval_callback.py +13 -12
deepdoctection/extern/__init__.py +1 -1
deepdoctection/extern/base.py +176 -97
deepdoctection/extern/d2detect.py +127 -92
deepdoctection/extern/deskew.py +19 -10
deepdoctection/extern/doctrocr.py +157 -106
deepdoctection/extern/fastlang.py +25 -17
deepdoctection/extern/hfdetr.py +137 -60
deepdoctection/extern/hflayoutlm.py +329 -248
deepdoctection/extern/hflm.py +67 -33
deepdoctection/extern/model.py +108 -762
deepdoctection/extern/pdftext.py +37 -12
deepdoctection/extern/pt/nms.py +15 -1
deepdoctection/extern/pt/ptutils.py +13 -9
deepdoctection/extern/tessocr.py +87 -54
deepdoctection/extern/texocr.py +29 -14
deepdoctection/extern/tp/tfutils.py +36 -8
deepdoctection/extern/tp/tpcompat.py +54 -16
deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
deepdoctection/extern/tpdetect.py +4 -2
deepdoctection/mapper/__init__.py +1 -1
deepdoctection/mapper/cats.py +117 -76
deepdoctection/mapper/cocostruct.py +35 -17
deepdoctection/mapper/d2struct.py +56 -29
deepdoctection/mapper/hfstruct.py +32 -19
deepdoctection/mapper/laylmstruct.py +221 -185
deepdoctection/mapper/maputils.py +71 -35
deepdoctection/mapper/match.py +76 -62
deepdoctection/mapper/misc.py +68 -44
deepdoctection/mapper/pascalstruct.py +13 -12
deepdoctection/mapper/prodigystruct.py +33 -19
deepdoctection/mapper/pubstruct.py +42 -32
deepdoctection/mapper/tpstruct.py +39 -19
deepdoctection/mapper/xfundstruct.py +20 -13
deepdoctection/pipe/__init__.py +1 -2
deepdoctection/pipe/anngen.py +104 -62
deepdoctection/pipe/base.py +226 -107
deepdoctection/pipe/common.py +206 -123
deepdoctection/pipe/concurrency.py +74 -47
deepdoctection/pipe/doctectionpipe.py +108 -47
deepdoctection/pipe/language.py +41 -24
deepdoctection/pipe/layout.py +45 -18
deepdoctection/pipe/lm.py +146 -78
deepdoctection/pipe/order.py +196 -113
deepdoctection/pipe/refine.py +111 -63
deepdoctection/pipe/registry.py +1 -1
deepdoctection/pipe/segment.py +213 -142
deepdoctection/pipe/sub_layout.py +76 -46
deepdoctection/pipe/text.py +52 -33
deepdoctection/pipe/transform.py +8 -6
deepdoctection/train/d2_frcnn_train.py +87 -69
deepdoctection/train/hf_detr_train.py +72 -40
deepdoctection/train/hf_layoutlm_train.py +85 -46
deepdoctection/train/tp_frcnn_train.py +56 -28
deepdoctection/utils/concurrency.py +59 -16
deepdoctection/utils/context.py +40 -19
deepdoctection/utils/develop.py +25 -17
deepdoctection/utils/env_info.py +85 -36
deepdoctection/utils/error.py +16 -10
deepdoctection/utils/file_utils.py +246 -62
deepdoctection/utils/fs.py +162 -43
deepdoctection/utils/identifier.py +29 -16
deepdoctection/utils/logger.py +49 -32
deepdoctection/utils/metacfg.py +83 -21
deepdoctection/utils/pdf_utils.py +119 -62
deepdoctection/utils/settings.py +24 -10
deepdoctection/utils/tqdm.py +10 -5
deepdoctection/utils/transform.py +182 -46
deepdoctection/utils/utils.py +61 -28
deepdoctection/utils/viz.py +150 -104
deepdoctection-0.43.dist-info/METADATA +376 -0
deepdoctection-0.43.dist-info/RECORD +149 -0
{deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
deepdoctection/analyzer/_config.py +0 -146
deepdoctection-0.42.0.dist-info/METADATA +0 -431
deepdoctection-0.42.0.dist-info/RECORD +0 -148
{deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
{deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0

deepdoctection/extern/hflayoutlm.py CHANGED Viewed

@@ -16,7 +16,7 @@
 # limitations under the License.
 """
-HF Layoutlm model for diverse downstream tasks.
+HF Layoutlm models.
 """
 from __future__ import annotations
@@ -87,9 +87,12 @@ def get_tokenizer_from_model_class(model_class: str, use_xlm_tokenizer: bool) ->
     We do not use the tokenizer for a particular model that the transformer library provides. This mapping therefore
     returns the tokenizer that should be used for a particular model.
-    :param model_class: The model as stated in the transformer library.
-    :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
-    :return: Tokenizer instance to use.
+    Args:
+        model_class: The model as stated in the transformer library.
+        use_xlm_tokenizer: True if one uses the `LayoutXLM`. (The model cannot be distinguished from `LayoutLMv2`).
+    Returns:
+        Tokenizer instance to use.
     """
     return {
         ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
@@ -137,15 +140,18 @@ def predict_token_classes(
     images: Optional[torch.Tensor] = None,
 ) -> list[TokenClassResult]:
     """
-    :param uuids: A list of uuids that correspond to a word that induces the resulting token
-    :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
-    :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
-    :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
-    :param boxes: Torch tensor of bounding boxes of type 'xyxy'
-    :param tokens: List of original tokens taken from LayoutLMTokenizer
-    :param model: layoutlm model for token classification
-    :param images: A list of torch image tensors or None
-    :return: A list of TokenClassResults
+    Args:
+        uuids: A list of uuids that correspond to a word that induces the resulting token
+        input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+        attention_mask: The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+        token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+        boxes: Torch tensor of bounding boxes of type 'xyxy'
+        tokens: List of original tokens taken from `LayoutLMTokenizer`
+        model: layoutlm model for token classification
+        images: A list of torch image tensors or None
+    Returns:
+        A list of `TokenClassResult`s
     """
     if images is None:
@@ -195,13 +201,16 @@ def predict_sequence_classes(
     images: Optional[torch.Tensor] = None,
 ) -> SequenceClassResult:
     """
-    :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
-    :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
-    :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
-    :param boxes: Torch tensor of bounding boxes of type 'xyxy'
-    :param model: layoutlm model for sequence classification
-    :param images: A list of torch image tensors or None
-    :return: SequenceClassResult
+    Args:
+        input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+        attention_mask: The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+        token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+        boxes: Torch tensor of bounding boxes of type `xyxy`
+        model: layoutlm model for sequence classification
+        images: A list of torch image tensors or None
+    Returns:
+        SequenceClassResult
     """
     if images is None:
@@ -229,7 +238,7 @@ def predict_sequence_classes(
 class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
     """
-    Abstract base class for wrapping LayoutLM models for token classification into the deepdoctection framework.
+    Abstract base class for wrapping `LayoutLM` models for token classification into the framework.
     """
     def __init__(
@@ -243,17 +252,18 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
         use_xlm_tokenizer: bool = False,
     ):
         """
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
-                                     entities self. To be consistent with detectors use only values >0. Conversion will
-                                     be done internally.
-        :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
-                               consistent with detectors use only values>0. Conversion will be done internally.
-        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
-        :param device: The device (cpu,"cuda"), where to place the model.
-        :param use_xlm_tokenizer: True if one uses the LayoutXLM or a lilt model built with a xlm language model, e.g.
-                                  info-xlm or roberta-xlm. (LayoutXLM cannot be distinguished from LayoutLMv2).
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
+                                 entities themselves. To be consistent with detectors use only values `>0`. Conversion will
+                                 be done internally.
+            categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. `BIO`). To be
+                           consistent with detectors use only `values>0`. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of NER categories
+            device: The device ("cpu", "cuda"), where to place the model.
+            use_xlm_tokenizer: True if one uses the `LayoutXLM` or a lilt model built with a xlm language model, e.g.
+                              `info-xlm` or `roberta-xlm`. (`LayoutXLM` cannot be distinguished from LayoutLMv2).
         """
         if categories is None:
@@ -340,10 +350,15 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
     @staticmethod
     def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
-        """A refinement for adding the tokenizer class name to the model configs.
+        """
+        A refinement for adding the tokenizer class name to the model configs.
+        Args:
+            model_class_name: The model name, e.g. `model.__class__.__name__`
+            use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
-        :param model_class_name: The model name, e.g. model.__class__.__name__
-        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+        Returns:
+            The name of the tokenizer class.
         """
         tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
         return tokenizer.__class__.__name__
@@ -366,31 +381,32 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
     Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
     classification and other things please use another model of the family.
-    **Example**
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
-            # setting up compulsory ocr service
-            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-            tess = TesseractOcrDetector(tesseract_config_path)
-            ocr_service = TextExtractionService(tess)
+        # hf tokenizer and token classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
+                                              categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                           'E-header', 'E-question', 'I-answer', 'I-header',
+                                                           'I-question', 'O', 'S-answer', 'S-header',
+                                                           'S-question'])
-            # hf tokenizer and token classifier
-            tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
-            layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
-                                                  categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
-                                                               'E-header', 'E-question', 'I-answer', 'I-header',
-                                                               'I-question', 'O', 'S-answer', 'S-header',
-                                                               'S-question'])
+        # token classification service
+        layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
-            # token classification service
-            layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
-            pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
-            path = "path/to/some/form"
-            df = pipe.analyze(path=path)
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
-            for dp in df:
-                ...
+        for dp in df:
+            ...
+        ```
     """
     def __init__(
@@ -404,17 +420,18 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         use_xlm_tokenizer: bool = False,
     ):
         """
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
-                                     entities self. To be consistent with detectors use only values >0. Conversion will
-                                     be done internally.
-        :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
-                               consistent with detectors use only values>0. Conversion will be done internally.
-        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
-        :param device: The device (cpu,"cuda"), where to place the model.
-        :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
-                                  Tokenizer.
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
+                                 entities themselves. To be consistent with detectors use only values `>0`. Conversion will
+                                 be done internally.
+            categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. BIO). To be
+                           consistent with detectors use only values>0. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of NER categories
+            device: The device (cpu,"cuda"), where to place the model.
+            use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
+                              Tokenizer.
         """
         super().__init__(
             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
@@ -431,17 +448,16 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments
-        `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
-        `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
-        `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
-        `boxes:` Torch tensor of bounding boxes of type 'xyxy'
+        Args:
+            encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+                       attention_mask: The associated attention masks from padded sequences taken from
+                                       `LayoutLMTokenizer`
+                       token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+                       boxes: Torch tensor of bounding boxes of type `xyxy`
+                       tokens: List of original tokens taken from `LayoutLMTokenizer`
-        `tokens:` List of original tokens taken from `LayoutLMTokenizer`
-        :return: A list of TokenClassResults
+        Returns:
+            A list of `TokenClassResult`s
         """
         ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
@@ -459,9 +475,12 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
         """
         Get the inner (wrapped) model.
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :return: 'nn.Module'
+        Args:
+            path_config_json: path to .json config file
+            path_weights: path to model artifact
+        Returns:
+            `nn.Module`
         """
         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
         return LayoutLMForTokenClassification.from_pretrained(
@@ -481,31 +500,32 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
     Note, that you must use `LayoutLMTokenizerFast` as tokenizer. `LayoutLMv2TokenizerFast` will not be accepted.
-    **Example**
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
-            # setting up compulsory ocr service
-            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-            tess = TesseractOcrDetector(tesseract_config_path)
-            ocr_service = TextExtractionService(tess)
+        # hf tokenizer and token classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmv2TokenClassifier("path/to/config.json","path/to/model.bin",
+                                              categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                           'E-header', 'E-question', 'I-answer', 'I-header',
+                                                           'I-question', 'O', 'S-answer', 'S-header',
+                                                           'S-question'])
-            # hf tokenizer and token classifier
-            tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
-            layoutlm = HFLayoutLmv2TokenClassifier("path/to/config.json","path/to/model.bin",
-                                                  categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
-                                                               'E-header', 'E-question', 'I-answer', 'I-header',
-                                                               'I-question', 'O', 'S-answer', 'S-header',
-                                                               'S-question'])
-            # token classification service
-            layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
+        # token classification service
+        layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
-            pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
-            path = "path/to/some/form"
-            df = pipe.analyze(path=path)
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
-            for dp in df:
-                ...
+        for dp in df:
+            ...
+        ```
     """
     def __init__(
@@ -519,17 +539,18 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
         use_xlm_tokenizer: bool = False,
     ):
         """
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
-                                     entities self. To be consistent with detectors use only values >0. Conversion will
-                                     be done internally.
-        :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
-                               consistent with detectors use only values>0. Conversion will be done internally.
-        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
-        :param device: The device (cpu,"cuda"), where to place the model.
-        :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
-                                  default value.
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
+                                 entities themselves. To be consistent with detectors use only values `>0`. Conversion will
+                                 be done internally.
+            categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. `BIO`). To be
+                           consistent with detectors use only values>0. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of `NER` categories
+            device: The device (cpu,"cuda"), where to place the model.
+            use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a `LayoutLMv2` model keep the
+                              default value.
         """
         super().__init__(
             path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
@@ -544,19 +565,18 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
         """
-        Launch inference on LayoutLm for token classification. Pass the following arguments
-        `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
-        `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
-        `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
-        `boxes:` Torch tensor of bounding boxes of type `xyxy`
-        `tokens:` List of original tokens taken from `LayoutLMTokenizer`
-        :return: A list of TokenClassResults
+        Launch inference on `LayoutLm` for token classification. Pass the following arguments:
+        Args:
+            encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+                       attention_mask: The associated attention masks from padded sequences taken from
+                                       `LayoutLMTokenizer`
+                       token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+                       boxes: Torch tensor of bounding boxes of type `xyxy`
+                       tokens: List of original tokens taken from `LayoutLMTokenizer`
+        Returns:
+            A list of `TokenClassResult`s
         """
         ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
@@ -799,8 +819,9 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
     def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
         """A refinement for adding the tokenizer class name to the model configs.
-        :param model_class_name: The model name, e.g. model.__class__.__name__
-        :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
+        Args:
+            model_class_name: The model name, e.g. `model.__class__.__name__`
+            use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
         """
         tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
         return tokenizer.__class__.__name__
@@ -823,28 +844,29 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
     Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
     classification and other things please use another model of the family.
-    **Example**
-            # setting up compulsory ocr service
-            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-            tess = TesseractOcrDetector(tesseract_config_path)
-            ocr_service = TextExtractionService(tess)
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
-            # hf tokenizer and token classifier
-            tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
-            layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json","path/to/model.bin",
-                                                  categories=["handwritten", "presentation", "resume"])
+        # hf tokenizer and sequence classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json","path/to/model.bin",
+                                              categories=["handwritten", "presentation", "resume"])
-            # token classification service
-            layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
+        # sequence classification service
+        layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
-            pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
-            path = "path/to/some/form"
-            df = pipe.analyze(path=path)
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
-            for dp in df:
-                ...
+        for dp in df:
+            ...
+        ```
     """
     def __init__(
@@ -855,6 +877,16 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
+        """
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories: A dict with key (indices) and values (category names) for sequence classification.
+                        To be consistent with detectors use only values `>0`. Conversion will be done internally.
+            device: The device ("cpu","cuda"), where to place the model.
+            use_xlm_tokenizer: Do not change this value unless you pre-trained a `LayoutLM` model with a different
+                              Tokenizer.
+        """
         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLM")
         self.model_id = self.get_model_id()
@@ -865,6 +897,16 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         )
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
+        """
+        Launch inference on LayoutLm for sequence classification. Pass the following arguments:
+        Args:
+            encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+                       attention_mask: The associated attention masks from padded sequences taken from
+                                       `LayoutLMTokenizer`
+                       token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+                       boxes: Torch tensor of bounding boxes of type `xyxy`
+        """
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
         result = predict_sequence_classes(
@@ -886,9 +928,12 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         """
         Get the inner (wrapped) model.
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :return: 'nn.Module'
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+        Returns:
+            `nn.Module`
         """
         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
         return LayoutLMForSequenceClassification.from_pretrained(
@@ -906,28 +951,29 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
     itself. Note that this model is equipped with a head that is only useful for classifying the input sequence. For
     token classification and other things please use another model of the family.
-    **Example**
-            # setting up compulsory ocr service
-            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-            tess = TesseractOcrDetector(tesseract_config_path)
-            ocr_service = TextExtractionService(tess)
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
-            # hf tokenizer and token classifier
-            tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
-            layoutlm = HFLayoutLmv2SequenceClassifier("path/to/config.json","path/to/model.bin",
-                                                  categories=["handwritten", "presentation", "resume"])
+        # hf tokenizer and sequence classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        layoutlm = HFLayoutLmv2SequenceClassifier("path/to/config.json","path/to/model.bin",
+                                              categories=["handwritten", "presentation", "resume"])
-            # token classification service
-            layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
+        # sequence classification service
+        layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
-            pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
-            path = "path/to/some/form"
-            df = pipe.analyze(path=path)
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
-            for dp in df:
-                ...
+        for dp in df:
+            ...
+        ```
     """
     def __init__(
@@ -938,6 +984,16 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
         use_xlm_tokenizer: bool = False,
     ):
+        """
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories: A dict with key (indices) and values (category names) for sequence classification.
+                        To be consistent with detectors use only values `>0`. Conversion will be done internally.
+            device: The device ("cpu","cuda"), where to place the model.
+            use_xlm_tokenizer: Do not change this value unless you pre-trained a `LayoutLM` model with a different
+                              Tokenizer.
+        """
         super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
         self.name = self.get_name(path_weights, "LayoutLMv2")
         self.model_id = self.get_model_id()
@@ -948,6 +1004,16 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         )
     def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
+        """
+        Launch inference on LayoutLm for sequence classification. Pass the following arguments
+        Args:
+            encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+                       attention_mask: The associated attention masks from padded sequences taken from
+                                       `LayoutLMTokenizer`
+                       token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+                       boxes: Torch tensor of bounding boxes of type `xyxy`
+        """
         input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
         images = encodings.get("image")
         if isinstance(images, torch.Tensor):
@@ -976,9 +1042,12 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         """
         Get the inner (wrapped) model.
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :return: 'nn.Module'
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+        Returns:
+            `nn.Module`
         """
         config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
         return LayoutLMv2ForSequenceClassification.from_pretrained(
@@ -996,28 +1065,29 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
     itself. Note that this model is equipped with a head that is only useful for classifying the input sequence. For
     token classification and other things please use another model of the family.
-    **Example**
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
-            # setting up compulsory ocr service
-            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-            tess = TesseractOcrDetector(tesseract_config_path)
-            ocr_service = TextExtractionService(tess)
+        # hf tokenizer and sequence classifier
+        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+        layoutlm = HFLayoutLmv3SequenceClassifier("path/to/config.json","path/to/model.bin",
+                                              categories=["handwritten", "presentation", "resume"])
-            # hf tokenizer and token classifier
-            tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
-            layoutlm = HFLayoutLmv3SequenceClassifier("path/to/config.json","path/to/model.bin",
-                                                  categories=["handwritten", "presentation", "resume"])
+        # sequence classification service
+        layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
-            # token classification service
-            layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
-            pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
-            path = "path/to/some/form"
-            df = pipe.analyze(path=path)
-            for dp in df:
-                ...
+        for dp in df:
+            ...
+        ```
     """
     def __init__(
@@ -1072,9 +1142,12 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
         """
         Get the inner (wrapped) model.
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :return: 'nn.Module'
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+        Returns:
+            `nn.Module`
         """
         config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
         return LayoutLMv3ForSequenceClassification.from_pretrained(
@@ -1092,31 +1165,32 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
     Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
     classification and other things please use another model of the family.
-    **Example**
-            # setting up compulsory ocr service
-            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-            tess = TesseractOcrDetector(tesseract_config_path)
-            ocr_service = TextExtractionService(tess)
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
-            # hf tokenizer and token classifier
-            tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
-            lilt = HFLiltTokenClassifier("path/to/config.json","path/to/model.bin",
-                                                  categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
-                                                               'E-header', 'E-question', 'I-answer', 'I-header',
-                                                               'I-question', 'O', 'S-answer', 'S-header',
-                                                               'S-question'])
+        # hf tokenizer and token classifier
+        tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+        lilt = HFLiltTokenClassifier("path/to/config.json","path/to/model.bin",
+                                              categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
+                                                           'E-header', 'E-question', 'I-answer', 'I-header',
+                                                           'I-question', 'O', 'S-answer', 'S-header',
+                                                           'S-question'])
-            # token classification service
-            lilt_service = LMTokenClassifierService(tokenizer,lilt)
+        # token classification service
+        lilt_service = LMTokenClassifierService(tokenizer,lilt)
-            pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
-            path = "path/to/some/form"
-            df = pipe.analyze(path=path)
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
-            for dp in df:
-                ...
+        for dp in df:
+            ...
+        ```
     """
     def __init__(
@@ -1130,15 +1204,16 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
         use_xlm_tokenizer: bool = False,
     ):
         """
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
-                                     entities self. To be consistent with detectors use only values >0. Conversion will
-                                     be done internally.
-        :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
-                               consistent with detectors use only values>0. Conversion will be done internally.
-        :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
-        :param device: The device (cpu,"cuda"), where to place the model.
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+            categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
+                                 entities themselves. To be consistent with detectors use only values `>0`. Conversion will
+                                 be done internally.
+            categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. `BIO`). To be
+                           consistent with detectors use only values `>0`. Conversion will be done internally.
+            categories: If you have a pre-trained model you can pass a complete dict of `NER` categories
+            device: The device ("cpu","cuda"), where to place the model.
         """
         super().__init__(
@@ -1156,17 +1231,16 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
         """
         Launch inference on LayoutLm for token classification. Pass the following arguments
-        `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
-        `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
+        Args:
+            encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
+                       attention_mask: The associated attention masks from padded sequences taken from
+                                       `LayoutLMTokenizer`
+                       token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
+                       boxes: Torch tensor of bounding boxes of type `xyxy`
+                       tokens: List of original tokens taken from `LayoutLMTokenizer`
-        `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
-        `boxes:` Torch tensor of bounding boxes of type 'xyxy'
-        `tokens:` List of original tokens taken from `LayoutLMTokenizer`
-        :return: A list of TokenClassResults
+        Returns:
+            A list of `TokenClassResult`s
         """
         ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
@@ -1182,9 +1256,12 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
         """
         Get the inner (wrapped) model.
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :return: 'nn.Module'
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+        Returns:
+            `nn.Module`
         """
         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
         return LiltForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
@@ -1200,29 +1277,30 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
     Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
     classification and other things please use another model of the family.
-    **Example**
+    Example:
+        ```python
+        # setting up compulsory ocr service
+        tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
+        tess = TesseractOcrDetector(tesseract_config_path)
+        ocr_service = TextExtractionService(tess)
-            # setting up compulsory ocr service
-            tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
-            tess = TesseractOcrDetector(tesseract_config_path)
-            ocr_service = TextExtractionService(tess)
+        # hf tokenizer and sequence classifier
+        tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
+        lilt = HFLiltSequenceClassifier("path/to/config.json",
+                                            "path/to/model.bin",
+                                            categories=["handwritten", "presentation", "resume"])
-            # hf tokenizer and sequence classifier
-            tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
-            lilt = HFLiltSequenceClassifier("path/to/config.json",
-                                                "path/to/model.bin",
-                                                categories=["handwritten", "presentation", "resume"])
+        # sequence classification service
+        lilt_service = LMSequenceClassifierService(tokenizer,lilt)
-            # sequence classification service
-            lilt_service = LMSequenceClassifierService(tokenizer,lilt)
+        pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
-            pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
-            path = "path/to/some/form"
-            df = pipe.analyze(path=path)
+        path = "path/to/some/form"
+        df = pipe.analyze(path=path)
-            for dp in df:
-                ...
+        for dp in df:
+            ...
+        ```
     """
     def __init__(
@@ -1262,9 +1340,12 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
         """
         Get the inner (wrapped) model.
-        :param path_config_json: path to .json config file
-        :param path_weights: path to model artifact
-        :return: 'nn.Module'
+        Args:
+            path_config_json: path to `.json` config file
+            path_weights: path to model artifact
+        Returns:
+            `nn.Module`
         """
         config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
         return LiltForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)

deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl

Potentially problematic release.

deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl