deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of deepdoctection might be problematic.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +919 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +162 -108
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +205 -119
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +26 -17
- deepdoctection/utils/env_info.py +86 -37
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -71
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.1.dist-info/METADATA +376 -0
- deepdoctection-0.43.1.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
deepdoctection/mapper/laylmstruct.py

@@ -89,35 +89,33 @@ def image_to_raw_layoutlm_features(
     segment_positions: Optional[Union[LayoutType, Sequence[LayoutType]]] = None,
 ) -> Optional[RawLayoutLMFeatures]:
     """
-    …
+    Maps a datapoint into an intermediate format for LayoutLM. Features are provided in a dict and this mapping
     can be used for sequence or token classification as well as for inference. To generate input features for the model
-    please …
-    …
-    :return: dictionary with the following arguments:
-             'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+    please use `raw_features_to_layoutlm_features`.
+
+    Args:
+        dp: `Image`.
+        dataset_type: Either `SEQUENCE_CLASSIFICATION` or `TOKEN_CLASSIFICATION`. When using a built-in dataset use
+            this.
+        input_width: Max width of box coordinates. Transforms the image and all box coordinates accordingly.
+        input_height: Target height of box coordinates. Transforms the image and all box coordinates accordingly.
+        image_width: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_width`, whereas
+            the image has to be resized to a different width. This input will only resize the `image` width.
+        image_height: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_height`,
+            whereas the image has to be resized to a different height. This input will only resize the `image` height.
+        color_mode: Either `BGR` or `RGB`. Note that LayoutLMv2 uses `BGR` because of Detectron2 backbone, whereas
+            LayoutLMv3 uses `RGB`.
+        pixel_mean: (3,) array for `BGR` or `RGB` mean.
+        pixel_std: (3,) array for `BGR` or `RGB` std.
+        use_token_tag: Used only for `dataset_type="token_classification"`. If `True`, uses labels from subcategory
+            `WordType.token_tag` (with `B,I,O` suffix), otherwise `WordType.token_class`.
+        segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly.
+            Choose a single or a sequence of layout segments to use their bounding boxes. The layout segments need to
+            have a child-relationship with words. If a word does not appear as child, it will use the word bounding box.
+
+    Returns:
+        Dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`, `bbox`, and
+        `dataset_type`.
     """
 
     raw_features: RawLayoutLMFeatures = RawLayoutLMFeatures({})
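The `input_width`/`input_height` arguments above encode the LayoutLM convention of normalizing all box coordinates to a 0-1000 grid. A minimal sketch of that normalization in plain Python (independent of deepdoctection's own transform code; the function and variable names here are illustrative only):

```python
from typing import List, Tuple

Box = Tuple[float, float, float, float]  # absolute (x0, y0, x1, y1) in pixels

def normalize_boxes(boxes: List[Box], page_width: float, page_height: float,
                    input_width: int = 1000, input_height: int = 1000) -> List[List[int]]:
    """Rescale absolute pixel boxes to the 0-1000 grid LayoutLM models expect."""
    return [
        [
            round(x0 / page_width * input_width),
            round(y0 / page_height * input_height),
            round(x1 / page_width * input_width),
            round(y1 / page_height * input_height),
        ]
        for x0, y0, x1, y1 in boxes
    ]

# A word box on a 612x792 pt PDF page ends up as [[100, 100, 200, 200]]
print(normalize_boxes([(61.2, 79.2, 122.4, 158.4)], 612, 792))
```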
@@ -212,9 +210,13 @@ def image_to_raw_layoutlm_features(
 
 def layoutlm_features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
     """
-    …
+    Converts a list of floats to PyTorch tensors.
+
+    Args:
+        features: `LayoutLMFeatures`.
+
+    Returns:
+        `LayoutLMFeatures`.
     """
 
     _image_key = "pixel_values" if "pixel_values" in features else "image"
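What the fixed-up `layoutlm_features_to_pt_tensors` docstring describes is a plain list-to-tensor conversion. A hedged sketch of the idea (the key names follow the docstrings in this diff; the actual deepdoctection implementation may differ in detail):

```python
import torch

TENSOR_KEYS = {"input_ids", "token_type_ids", "attention_mask", "bbox", "labels"}

def to_pt_tensors(features: dict) -> dict:
    """Convert list-valued model inputs to torch tensors, leaving metadata as-is."""
    return {k: torch.tensor(v) if k in TENSOR_KEYS else v for k, v in features.items()}

batch = {
    "input_ids": [[101, 7592, 102]],
    "attention_mask": [[1, 1, 1]],
    "bbox": [[[0, 0, 0, 0], [10, 10, 50, 20], [0, 0, 0, 0]]],
    "ann_ids": [["a-1"]],  # metadata: stays a list
}
print(to_pt_tensors(batch)["bbox"].shape)  # torch.Size([1, 3, 4])
```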
@@ -240,13 +242,23 @@ def _tokenize_with_sliding_window(
     return_tensors: Optional[Literal["pt"]] = None,
 ) -> Union[JsonDict, BatchEncoding]:
     """
-    Runs a tokenizer …
-    If there are overflowing tokens, sliding windows …
-    …
+    Runs a tokenizer. If there are no overflowing tokens, the tokenizer output will be returned as is.
+    If there are overflowing tokens, sliding windows are built. Sliding windows are prepared from raw tokenized outputs
+    by running the tokenizer a second time without truncating and building the sliding windows from this output.
+
+    Note:
+        The current implementation has a bug: for higher batch sizes it will only return overflowing samples.
+        If the dataset consists of many samples with lots of tokens, use a low per device batch size.
+
+    Args:
+        raw_features: List of `RawLayoutLMFeatures` or `RawLMFeatures`.
+        tokenizer: `PreTrainedTokenizerFast`.
+        sliding_window_stride: Stride for sliding window.
+        max_batch_size: Maximum batch size.
+        return_tensors: If `pt`, returns torch tensors.
+
+    Returns:
+        `JsonDict` or `BatchEncoding`.
     """
     # first try: we require return_overflowing_tokens=True. If the number of raw features is equal to
     # overflow_to_sample_mapping then there is nothing more to do because the sample has less than max_length
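The sliding-window behaviour documented above sits on top of the standard Hugging Face fast-tokenizer options `stride` and `return_overflowing_tokens`. A standalone illustration (deepdoctection's `_tokenize_with_sliding_window` additionally carries box and label bookkeeping, which is omitted here):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # a fast tokenizer
text = "one two three four five six seven eight nine ten eleven twelve"

enc = tokenizer(
    text,
    max_length=8,                 # window size, including [CLS]/[SEP]
    truncation=True,
    stride=2,                     # overlap between consecutive windows
    return_overflowing_tokens=True,
)
print(len(enc["input_ids"]))              # number of windows, > 1 for this text
print(enc["overflow_to_sample_mapping"])  # window -> original sample index
```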
@@ -413,33 +425,36 @@ def raw_features_to_layoutlm_features(
     remove_bounding_boxes: bool = False,
 ) -> LayoutLMFeatures:
     """
-    …
+    Maps raw features to tokenized input sequences for LayoutLM models.
+
+    Args:
+        raw_features: A dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`,
+            `boxes`, `dataset_type`.
+        tokenizer: A fast tokenizer for the model. The conventional Python-based tokenizer provided by the
+            Transformers library does not return essential word_id/token_id mappings, making feature generation
+            more difficult. Only fast tokenizers are allowed.
+        padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+            `do_not_pad`.
+        truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+            maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+            removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+            provided. If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model
+            maximum admissible input size).
+        return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens can
+            be returned as an additional batch element. In this case, the number of input batch samples will be smaller
+            than the output batch samples.
+        return_tensors: If `pt`, returns torch tensors. If not provided, batches will be lists of lists.
+        remove_columns_for_training: Removes all superfluous columns that are not required for training.
+        sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding windows
+            will be created with each window having `max_length` sequence input. When using
+            `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows shifted
+            `sliding_window_stride` to the right.
+        max_batch_size: Maximum batch size.
+        remove_bounding_boxes: If `True`, removes bounding box features.
+
+    Returns:
+        Dictionary with the following arguments: `image_ids`, `width`, `height`, `ann_ids`, `input_ids`,
+        `token_type_ids`, `attention_mask`, `bbox`, `labels`.
     """
 
     if isinstance(raw_features, dict):
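The reason the docstring insists on fast tokenizers is the `word_ids()` mapping, which is what allows word-level boxes and labels to be propagated to sub-word tokens. A minimal sketch of that alignment (standard transformers API; the words and boxes are invented):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # fast by default

words = ["Invoice", "No.", "4711"]
boxes = [[50, 40, 180, 60], [190, 40, 230, 60], [240, 40, 300, 60]]

enc = tokenizer(words, is_split_into_words=True)
# word_ids() maps every token back to its source word (None for special tokens),
# so each sub-word token inherits the bounding box of its word.
token_boxes = [boxes[i] if i is not None else [0, 0, 0, 0] for i in enc.word_ids(0)]
print(list(zip(enc.tokens(), token_boxes)))
```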
@@ -578,28 +593,30 @@ def raw_features_to_layoutlm_features(
 @dataclass
 class LayoutLMDataCollator:
     """
-    Data collator that will dynamically tokenize, pad and truncate the inputs received.
-    …
+    Data collator that will dynamically tokenize, pad, and truncate the inputs received.
+
+    Args:
+        tokenizer: A fast tokenizer for the model. The conventional Python-based tokenizer provided by the
+            Transformers library does not return essential word_id/token_id mappings, making feature generation
+            more difficult. Only fast tokenizers are allowed.
+        padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+            `do_not_pad`.
+        truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+            maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+            removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+            provided.
+            If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model maximum
+            admissible input size).
+        return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens can
+            be returned as an additional batch element. In this case, the number of input batch samples will be smaller
+            than the output batch samples.
+        return_tensors: If `pt`, returns torch tensors. If not provided, batches will be lists of lists.
+        sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding windows
+            will be created with each window having `max_length` sequence input. When using
+            `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows
+            shifted `sliding_window_stride` to the right.
+        max_batch_size: Maximum batch size.
+        remove_bounding_box_features: If `True`, removes bounding box features.
     """
 
     tokenizer: PreTrainedTokenizerFast
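`LayoutLMDataCollator` is the familiar dataclass-collator pattern: a configured callable handed to a `DataLoader` as `collate_fn`. A generic, runnable sketch of the pattern (the padding logic here is deliberately trivial; the real collator tokenizes, pads, and truncates as documented above):

```python
from dataclasses import dataclass
from typing import List

import torch
from torch.utils.data import DataLoader

@dataclass
class PadCollator:
    pad_id: int = 0

    def __call__(self, samples: List[List[int]]) -> torch.Tensor:
        # Pad every sample in the batch to the length of the longest one.
        width = max(len(s) for s in samples)
        return torch.tensor([s + [self.pad_id] * (width - len(s)) for s in samples])

loader = DataLoader([[101, 2023, 102], [101, 102]], batch_size=2, collate_fn=PadCollator())
print(next(iter(loader)))  # a padded 2x3 batch
```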
@@ -621,11 +638,15 @@ class LayoutLMDataCollator:
 
     def __call__(self, raw_features: Union[RawLayoutLMFeatures, list[RawLayoutLMFeatures]]) -> LayoutLMFeatures:
         """
-        …
+        Calls the `DataCollator` to form model inputs for training and inference.
+
+        Args:
+            raw_features: A dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`,
+                `words`, `boxes`, `dataset_type`.
+
+        Returns:
+            `LayoutLMFeatures` with arguments `image_ids`, `width`, `height`, `ann_ids`, `input_ids`,
+            `token_type_ids`, `attention_masks`, `boxes`, `labels`.
         """
         return raw_features_to_layoutlm_features(
             raw_features,  # type: ignore
@@ -660,54 +681,57 @@ def image_to_layoutlm_features(
     sliding_window_stride: int = 0,
 ) -> Optional[LayoutLMFeatures]:
     """
-    Mapping function to generate …
+    Mapping function to generate LayoutLM features from `Image` to be used for inference in a pipeline component.
     `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
     with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
     used internally in `LMTokenClassifierService`.
 
-    …
+    Example:
+        ```python
+        tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                             categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+        ```
+
+    Args:
+        dp: `Image` datapoint.
+        tokenizer: Tokenizer compatible with the language model.
+        padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+            `do_not_pad`.
+        truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+            maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+            removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+            provided.
+            If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model maximum
+            admissible input size).
+        return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens
+            can be returned as an additional batch element. In this case, the number of input batch samples will be
+            smaller than the output batch samples.
+        return_tensors: Output tensor features. Either `pt` for PyTorch models or `None` if features should be
+            returned in list objects.
+        input_width: Standard input size for image coordinates. All LayoutLM models require input features to be
+            normalized to an image width equal to 1000.
+        input_height: Standard input size for image coordinates. All LayoutLM models require input features to be
+            normalized to an image height equal to 1000.
+        image_width: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_width`, whereas
+            the image has to be resized to a different width. This input will only resize the `image` width.
+        image_height: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_height`,
+            whereas the image has to be resized to a different height. This input will only resize the `image` height.
+        color_mode: Either `BGR` or `RGB`. Note that LayoutLMv2 uses `BGR` because of Detectron2 backbone, whereas
+            LayoutLMv3 uses `RGB`.
+        pixel_mean: (3,) array for `BGR` or `RGB` mean.
+        pixel_std: (3,) array for `BGR` or `RGB` std.
+        segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly.
+            Choose a single or a sequence of layout segments to use their bounding boxes. The layout segments need to
+            have a child-relationship with words. If a word does not appear as child, it will use the word bounding box.
+        sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding
+            windows will be created with each window having `max_length` sequence input. When using
+            `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows shifted
+            `sliding_window_stride` to the right.
+
+    Returns:
+        A dict of LayoutLM features.
     """
     raw_features = image_to_raw_layoutlm_features(
         None,
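The `Example` block in this docstring needs deepdoctection's own classes, but the tokenizer half can be exercised with transformers alone. A small sketch (the checkpoint name is taken from the example above; `LayoutLMTokenizerFast` is the fast variant these mappers require, and the words are invented):

```python
from transformers import LayoutLMTokenizerFast

tokenizer = LayoutLMTokenizerFast.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
enc = tokenizer(["Total", "amount:", "42.00"], is_split_into_words=True,
                padding="max_length", max_length=512, truncation=True)
print(len(enc["input_ids"]))  # 512, padded to the full sequence length
print(enc.word_ids(0)[:6])    # token -> word alignment used to attach boxes
```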
@@ -745,28 +769,36 @@ def image_to_raw_lm_features(
     include_residual_text_container: bool = False,
 ) -> Optional[RawLMFeatures]:
     """
-    …
+    Maps a datapoint into an intermediate format for BERT-like models. Features are provided in a dict and
     this mapping can be used for sequence or token classification as well as for inference. To generate input features
-    for the model please …
-    …
+    for the model, please use `raw_features_to_layoutlm_features`.
+
+    Args:
+        dp: `Image`.
+        dataset_type: Either `SEQUENCE_CLASSIFICATION` or `TOKEN_CLASSIFICATION`. When using a built-in dataset use
+            this.
+        use_token_tag: Used only for `dataset_type="token_classification"`. If `True`, uses labels from subcategory
+            `WordType.token_tag` (with `B,I,O` suffix), otherwise `WordType.token_class`.
+        text_container: A `LayoutType` to get the text from. It will steer the output of `Layout.words`.
+        floating_text_block_categories: A list of top-level layout objects.
+        include_residual_text_container: Regards synthetic text line annotations as floating text blocks and therefore
+            incorporates all image annotations of category `word` when building text strings.
+
+    Returns:
+        Dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`, `bbox`, and
+        `dataset_type`.
     """
 
     raw_features: RawLMFeatures = RawLMFeatures({})
 
-    …
+    # We do not need to configure residual_text_block_categories here, because text_ does ignore these layout sections
+    # anyway
+    page = Page.from_image(
+        image_orig=dp,
+        text_container=text_container,
+        floating_text_block_categories=floating_text_block_categories,
+        include_residual_text_container=include_residual_text_container,
+    )
 
     text_ = page.text_
 
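The `use_token_tag` switch above selects between plain token classes and `B`/`I`/`O`-suffixed token tags. A pure-Python illustration of the difference (not deepdoctection internals; the category names are invented):

```python
from typing import List

def to_bio(token_classes: List[str]) -> List[str]:
    """Derive B-/I-/O-style tags from a per-word sequence of token classes."""
    tags, prev = [], None
    for cls in token_classes:
        if cls == "other":
            tags.append("O")
        elif cls == prev:
            tags.append(f"I-{cls.upper()}")  # continuation of the same entity
        else:
            tags.append(f"B-{cls.upper()}")  # beginning of a new entity
        prev = cls
    return tags

print(to_bio(["question", "question", "answer", "other"]))
# ['B-QUESTION', 'I-QUESTION', 'B-ANSWER', 'O']
```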
@@ -808,42 +840,46 @@ def image_to_lm_features(
     include_residual_text_container: bool = False,
 ) -> Optional[LayoutLMFeatures]:
     """
-    Mapping function to generate …
+    Mapping function to generate LayoutLM features from `Image` to be used for inference in a pipeline component.
     `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
     with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
     used internally in `LMTokenClassifierService`.
 
-    …
+    Example:
+        ```python
+        tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+        layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+                                             categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+        layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+        ```
+
+    Args:
+        dp: `Image` datapoint.
+        tokenizer: Tokenizer compatible with the language model.
+        padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+            `do_not_pad`.
+        truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+            maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+            removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+            provided.
+            If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model maximum
+            admissible input size).
+        return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens
+            can be returned as an additional batch element. In this case, the number of input batch samples will be
+            smaller than the output batch samples.
+        return_tensors: Output tensor features. Either `pt` for PyTorch models or `None` if features should be
+            returned in list objects.
+        sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding
+            windows will be created with each window having `max_length` sequence input. When using
+            `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows
+            shifted `sliding_window_stride` to the right.
+        text_container: A `LayoutType` to get the text from. It will steer the output of `Layout.words`.
+        floating_text_block_categories: A list of top-level layout objects.
+        include_residual_text_container: Regards synthetic text line annotations as floating text blocks and therefore
+            incorporates all image annotations of category `word` when building text strings.
+
+    Returns:
+        A dict of LM features.
     """
     raw_features = image_to_raw_lm_features(  # pylint: disable=E1102
         dataset_type=None,
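To see the shapes these mappers ultimately deliver to the model, here is an end-to-end sketch using plain transformers (a real base checkpoint, but invented words, boxes, and label count; the classification head is randomly initialized, so the logits are meaningful only in shape, and deepdoctection drives roughly this flow through `LMTokenClassifierService`):

```python
import torch
from transformers import LayoutLMForTokenClassification, LayoutLMTokenizerFast

tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
model = LayoutLMForTokenClassification.from_pretrained(
    "microsoft/layoutlm-base-uncased", num_labels=3
)

words = ["Name:", "Jane", "Doe"]
boxes = [[40, 30, 110, 50], [120, 30, 170, 50], [180, 30, 230, 50]]  # 0-1000 grid

enc = tokenizer(words, is_split_into_words=True, return_tensors="pt")
# Attach a box to every token via the word_ids alignment; special tokens get zeros.
bbox = torch.tensor([[boxes[i] if i is not None else [0, 0, 0, 0]
                      for i in enc.word_ids(0)]])

with torch.no_grad():
    logits = model(input_ids=enc["input_ids"],
                   attention_mask=enc["attention_mask"], bbox=bbox).logits
print(logits.shape)  # (1, sequence_length, 3)
```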