deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of deepdoctection has been flagged as potentially problematic.

Files changed (124)
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -89,35 +89,33 @@ def image_to_raw_layoutlm_features(
  segment_positions: Optional[Union[LayoutType, Sequence[LayoutType]]] = None,
  ) -> Optional[RawLayoutLMFeatures]:
  """
- Mapping a datapoint into an intermediate format for layoutlm. Features will be provided into a dict and this mapping
+ Maps a datapoint into an intermediate format for LayoutLM. Features are provided in a dict and this mapping
  can be used for sequence or token classification as well as for inference. To generate input features for the model
- please `use raw_features_to_layoutlm_features`.
-
-
- :param dp: Image
- :param dataset_type: Either SEQUENCE_CLASSIFICATION or TOKEN_CLASSIFICATION. When using a built-in dataset use
- :param input_width: max width of box coordinates. Under the hood, it will transform the image and all box
- coordinates accordingly.
- :param input_height: target height of box coordinates. Under the hood, it will transform the image and all box
- coordinates accordingly.
- :param image_width: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to input_width, whereas
- the image has to be resized to a different width. This input will only resize the `image` width.
- :param image_height: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to input_height,
- whereas the image has to be resized to a different height. This input will only resize the
- `image` height.
- :param color_mode: Either "BGR" or "RGB". Note, that LayoutLMv2 uses "BGR" because of Detectron2 backbone, whereas
- LayoutLMv3 uses "RGB".
- :param pixel_mean: (3,) array for "BGR" resp. "RGB" mean
- :param pixel_std: (3,) array for "BGR" resp. "RGB" std
- :param use_token_tag: Will only be used for dataset_type="token_classification". If use_token_tag=True, will use
- labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
- `WordType.token_class`.
- :param segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly.
- Choose a single or a sequence of layout segments to use their bounding boxes. Note, that
- the layout segments need to have a child-relationship with words. If a word does not
- appear as child, it will use the word bounding box.
- :return: dictionary with the following arguments:
- 'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+ please use `raw_features_to_layoutlm_features`.
+
+ Args:
+ dp: `Image`.
+ dataset_type: Either `SEQUENCE_CLASSIFICATION` or `TOKEN_CLASSIFICATION`. When using a built-in dataset use
+ this.
+ input_width: Max width of box coordinates. Transforms the image and all box coordinates accordingly.
+ input_height: Target height of box coordinates. Transforms the image and all box coordinates accordingly.
+ image_width: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_width`, whereas
+ the image has to be resized to a different width. This input will only resize the `image` width.
+ image_height: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_height`,
+ whereas the image has to be resized to a different height. This input will only resize the `image` height.
+ color_mode: Either `BGR` or `RGB`. Note that LayoutLMv2 uses `BGR` because of Detectron2 backbone, whereas
+ LayoutLMv3 uses `RGB`.
+ pixel_mean: (3,) array for `BGR` or `RGB` mean.
+ pixel_std: (3,) array for `BGR` or `RGB` std.
+ use_token_tag: Used only for `dataset_type="token_classification"`. If `True`, uses labels from subcategory
+ `WordType.token_tag` (with `B,I,O` suffix), otherwise `WordType.token_class`.
+ segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly.
+ Choose a single or a sequence of layout segments to use their bounding boxes. The layout segments need to
+ have a child-relationship with words. If a word does not appear as child, it will use the word bounding box.
+
+ Returns:
+ Dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`, `bbox`, and
+ `dataset_type`.
  """

  raw_features: RawLayoutLMFeatures = RawLayoutLMFeatures({})
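For orientation, a minimal sketch of how the intermediate format described by the rewritten docstring might be produced. The import paths, the direct-call style (the mappers in this module appear to be curried, judging by the `pylint: disable=E1102` hints later in the diff), and the argument values are illustrative assumptions, not something this diff establishes:

```python
# Hypothetical usage sketch for image_to_raw_layoutlm_features.
from deepdoctection.datapoint.image import Image
from deepdoctection.mapper.laylmstruct import image_to_raw_layoutlm_features


def build_raw_features(dp: Image):
    """Map one `Image` datapoint into the intermediate LayoutLM format."""
    raw = image_to_raw_layoutlm_features(
        dp,
        dataset_type="token_classification",  # or "sequence_classification"
        input_width=1000,   # LayoutLM models expect boxes normalized to 1000
        input_height=1000,
        use_token_tag=True,  # labels from WordType.token_tag (B/I/O suffixes)
    )
    # On success: a dict with image_id, width, height, ann_ids, words, bbox, dataset_type
    return raw
```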
@@ -212,9 +210,13 @@ def image_to_raw_layoutlm_features(
 
  def layoutlm_features_to_pt_tensors(features: LayoutLMFeatures) -> LayoutLMFeatures:
  """
- Converting list of floats to pytorch tensors
- :param features: LayoutLMFeatures
- :return: LayoutLMFeatures
+ Converts a list of floats to PyTorch tensors.
+
+ Args:
+ features: `LayoutLMFeatures`.
+
+ Returns:
+ `LayoutLMFeatures`.
  """

  _image_key = "pixel_values" if "pixel_values" in features else "image"
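The conversion itself is simple; a conceptual, standalone equivalent of such a list-to-tensor step might look like the following. Only the `pixel_values`/`image` key branch is taken from the hunk above; the rest is an assumption, not the library's code:

```python
# Conceptual sketch: convert list-valued features to torch tensors, leaving
# an already-prepared image tensor untouched.
import torch


def features_to_pt_tensors(features: dict) -> dict:
    image_key = "pixel_values" if "pixel_values" in features else "image"
    for key, value in features.items():
        if key != image_key and not isinstance(value, torch.Tensor):
            features[key] = torch.tensor(value)
    return features
```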
@@ -240,13 +242,23 @@ def _tokenize_with_sliding_window(
  return_tensors: Optional[Literal["pt"]] = None,
  ) -> Union[JsonDict, BatchEncoding]:
  """
- Runs a tokenizer: If there are no overflowing tokens, the tokenizer output will be returned as it is.
- If there are overflowing tokens, sliding windows have to be built. As it is easier to prepare the sliding windows
- from raw tokenized outputs we run the tokenizer a second time without truncating and build the sliding windows from
- this second output.
- The current implementation has a bug in that sense, that for higher batch sizes it will only return overflowing
- samples. It is therefore recommended that if the dataset consist of many samples with lots of tokens one should
- use a low per device batch size.
+ Runs a tokenizer. If there are no overflowing tokens, the tokenizer output will be returned as is.
+ If there are overflowing tokens, sliding windows are built. Sliding windows are prepared from raw tokenized outputs
+ by running the tokenizer a second time without truncating and building the sliding windows from this output.
+
+ Note:
+ The current implementation has a bug: for higher batch sizes it will only return overflowing samples.
+ If the dataset consists of many samples with lots of tokens, use a low per device batch size.
+
+ Args:
+ raw_features: List of `RawLayoutLMFeatures` or `RawLMFeatures`.
+ tokenizer: `PreTrainedTokenizerFast`.
+ sliding_window_stride: Stride for sliding window.
+ max_batch_size: Maximum batch size.
+ return_tensors: If `pt`, returns torch tensors.
+
+ Returns:
+ `JsonDict` or `BatchEncoding`.
  """
  # first try: we require return_overflowing_tokens=True. If the number of raw features is equal to
  # overflow_to_sample_mapping then there is nothing more to do because the sample has less than max_length
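To make the sliding-window semantics described in the docstring concrete, here is a self-contained toy sketch of windowing an over-long token sequence with a stride. It illustrates the idea only (windows of at most `max_length` tokens, shifted `stride` to the right) and is not the private helper's implementation:

```python
# Toy illustration of sliding windows over a token sequence.
def sliding_windows(token_ids: list[int], max_length: int, stride: int) -> list[list[int]]:
    if len(token_ids) <= max_length:
        return [token_ids]
    step = stride if stride > 0 else max_length  # stride=0: disjoint windows
    windows = []
    for start in range(0, len(token_ids), step):
        windows.append(token_ids[start : start + max_length])
        if start + max_length >= len(token_ids):
            break
    return windows


assert sliding_windows(list(range(10)), max_length=4, stride=2) == [
    [0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9],
]
```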
@@ -413,33 +425,36 @@ def raw_features_to_layoutlm_features(
  remove_bounding_boxes: bool = False,
  ) -> LayoutLMFeatures:
  """
- Mapping raw features to tokenized input sequences for LayoutLM models.
-
- :param raw_features: A dictionary with the following arguments: `image_id, width, height, ann_ids, words,
- boxes, dataset_type`.
- :param tokenizer: A fast tokenizer for the model. Note, that the conventional python based tokenizer provided by the
- Transformer library do not return essential word_id/token_id mappings making the feature
- generation a lot more difficult. We therefore do not allow these tokenizer.
- :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
- `do_not_pad`.
- :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
- maximum acceptable input length for the model if that argument is not provided. This will
- truncate token by token, removing a token from the longest sequence in the pair if a pair of
- sequences (or a batch of pairs) is provided.
- If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
- model maximum admissible input size).
- :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens can
- be returned as an additional batch element. Not that in this case, the number of input
- batch samples will be smaller than the output batch samples.
- :param return_tensors: If `pt` will return torch Tensors. If no argument is provided that the batches will be lists
- of lists.
- :param remove_columns_for_training: Will remove all superfluous columns that are not required for training.
- :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length sliding windows
- will be created with each window having max_length sequence input. When using
- `sliding_window_stride=0` no strides will be created, otherwise it will create slides
- with windows shifted `sliding_window_stride` to the right.
- :return: dictionary with the following arguments: `image_ids, width, height, ann_ids, input_ids,
- token_type_ids, attention_mask, bbox, labels`.
+ Maps raw features to tokenized input sequences for LayoutLM models.
+
+ Args:
+ raw_features: A dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`,
+ `boxes`, `dataset_type`.
+ tokenizer: A fast tokenizer for the model. The conventional Python-based tokenizer provided by the
+ Transformers library does not return essential word_id/token_id mappings, making feature generation
+ more difficult. Only fast tokenizers are allowed.
+ padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+ `do_not_pad`.
+ truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+ removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+ provided. If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model
+ maximum admissible input size).
+ return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens can
+ be returned as an additional batch element. In this case, the number of input batch samples will be smaller
+ than the output batch samples.
+ return_tensors: If `pt`, returns torch tensors. If not provided, batches will be lists of lists.
+ remove_columns_for_training: Removes all superfluous columns that are not required for training.
+ sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding windows
+ will be created with each window having `max_length` sequence input. When using
+ `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows shifted
+ `sliding_window_stride` to the right.
+ max_batch_size: Maximum batch size.
+ remove_bounding_boxes: If `True`, removes bounding box features.
+
+ Returns:
+ Dictionary with the following arguments: `image_ids`, `width`, `height`, `ann_ids`, `input_ids`,
+ `token_type_ids`, `attention_mask`, `bbox`, `labels`.
  """

  if isinstance(raw_features, dict):
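A hedged usage sketch for this mapper. `LayoutLMTokenizerFast` and the checkpoint name are real Hugging Face identifiers; the import path and the exact call pattern follow the Args list above but are otherwise assumptions:

```python
from transformers import LayoutLMTokenizerFast

# Assumed import path, matching the file this diff touches.
from deepdoctection.mapper.laylmstruct import raw_features_to_layoutlm_features

tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")


def to_model_inputs(raw_features):
    # Pad to max_length, truncate over-long sequences, return torch tensors.
    return raw_features_to_layoutlm_features(
        raw_features,
        tokenizer,
        padding="max_length",
        truncation=True,
        return_overflowing_tokens=False,
        return_tensors="pt",  # omit to get lists of lists instead
    )
```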
@@ -578,28 +593,30 @@ def raw_features_to_layoutlm_features(
  @dataclass
  class LayoutLMDataCollator:
  """
- Data collator that will dynamically tokenize, pad and truncate the inputs received.
-
- :param tokenizer: A fast tokenizer for the model. Note, that the conventional python based tokenizer provided by the
- Transformer library do not return essential word_id/token_id mappings making the feature
- generation a lot more difficult. We therefore do not allow these tokenizer.
- :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
- `do_not_pad`.
- :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
- maximum acceptable input length for the model if that argument is not provided. This will
- truncate token by token, removing a token from the longest sequence in the pair if a pair of
- sequences (or a batch of pairs) is provided.
- If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
- model maximum admissible input size).
- :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens can
- be returned as an additional batch element. Not that in this case, the number of input
- batch samples will be smaller than the output batch samples.
- :param return_tensors: If `pt` will return torch Tensors. If no argument is provided that the batches will be lists
- of lists.
- :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length sliding windows
- will be created with each window having max_length sequence input. When using
- `sliding_window_stride=0` no strides will be created, otherwise it will create slides
- with windows shifted `sliding_window_stride` to the right.
+ Data collator that will dynamically tokenize, pad, and truncate the inputs received.
+
+ Args:
+ tokenizer: A fast tokenizer for the model. The conventional Python-based tokenizer provided by the
+ Transformers library does not return essential word_id/token_id mappings, making feature generation
+ more difficult. Only fast tokenizers are allowed.
+ padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+ `do_not_pad`.
+ truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+ removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+ provided.
+ If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model maximum
+ admissible input size).
+ return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens can
+ be returned as an additional batch element. In this case, the number of input batch samples will be smaller
+ than the output batch samples.
+ return_tensors: If `pt`, returns torch tensors. If not provided, batches will be lists of lists.
+ sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding windows
+ will be created with each window having `max_length` sequence input. When using
+ `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows
+ shifted `sliding_window_stride` to the right.
+ max_batch_size: Maximum batch size.
+ remove_bounding_box_features: If `True`, removes bounding box features.
  """

  tokenizer: PreTrainedTokenizerFast
@@ -621,11 +638,15 @@ class LayoutLMDataCollator:
 
  def __call__(self, raw_features: Union[RawLayoutLMFeatures, list[RawLayoutLMFeatures]]) -> LayoutLMFeatures:
  """
- Calling the DataCollator to form model inputs for training and inference. Takes a single raw
- :param raw_features: A dictionary with the following arguments: `image_id, width, height, ann_ids, words,
- boxes, dataset_type`.
- :return: LayoutLMFeatures with arguments `image_ids, width, height, ann_ids, input_ids,
- token_type_ids, attention_masks, boxes, labels`.
+ Calls the `DataCollator` to form model inputs for training and inference.
+
+ Args:
+ raw_features: A dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`,
+ `boxes`, `dataset_type`.
+
+ Returns:
+ `LayoutLMFeatures` with arguments `image_ids`, `width`, `height`, `ann_ids`, `input_ids`,
+ `token_type_ids`, `attention_masks`, `boxes`, `labels`.
  """
  return raw_features_to_layoutlm_features(
  raw_features, # type: ignore
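Putting the two collator hunks together, a sketch of instantiating and calling the collator. Field names follow the Args list above; the default values and the full set of dataclass fields are assumptions:

```python
from transformers import LayoutLMTokenizerFast

# Assumed import path, matching the file this diff touches.
from deepdoctection.mapper.laylmstruct import LayoutLMDataCollator

collator = LayoutLMDataCollator(
    tokenizer=LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased"),
    padding="max_length",
    truncation=True,
    return_overflowing_tokens=True,  # sliding windows become extra batch elements
    return_tensors="pt",
    sliding_window_stride=0,
)
# batch = collator(list_of_raw_features)  # -> LayoutLMFeatures ready for the model
```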
@@ -660,54 +681,57 @@ def image_to_layoutlm_features(
  sliding_window_stride: int = 0,
  ) -> Optional[LayoutLMFeatures]:
  """
- Mapping function to generate layoutlm features from `Image` to be used for inference in a pipeline component.
+ Mapping function to generate LayoutLM features from `Image` to be used for inference in a pipeline component.
  `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
  with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
  used internally in `LMTokenClassifierService`.

- tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
- layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
- categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
-
- layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
-
- :param dp: Image datapoint
- :param tokenizer: Tokenizer compatible with the language model
- :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
- `do_not_pad`.
- :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
- maximum acceptable input length for the model if that argument is not provided. This will
- truncate token by token, removing a token from the longest sequence in the pair if a pair of
- sequences (or a batch of pairs) is provided.
- If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
- model maximum admissible input size).
- :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
- can be returned as an additional batch element. Not that in this case, the number
- of input batch samples will be smaller than the output batch samples.
- :param return_tensors: Output tensor features. Either 'pt' for PyTorch models or None, if features should be
- returned in list objects.
- :param input_width: Standard input size for image coordinates. All LayoutLM models require input features to be
- normalized to an image width equal to 1000.
- :param input_height: Standard input size for image coordinates. All LayoutLM models require input features to be
- normalized to an image height equal to 1000.
- :param image_width: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to input_width, whereas
- the image has to be resized to a different width. This input will only resize the `image` width.
- :param image_height: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to input_height,
- whereas the image has to be resized to a different height. This input will only resize the
- `image` height.
- :param color_mode: Either "BGR" or "RGB". Note, that LayoutLMv2 uses "BGR" because of Detectron2 backbone, whereas
- LayoutLMv3 uses "RGB".
- :param pixel_mean: (3,) array for "BGR" resp. "RGB" mean
- :param pixel_std: (3,) array for "BGR" resp. "RGB" std
- :param segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly.
- Choose a single or a sequence of layout segments to use their bounding boxes. Note, that
- the layout segments need to have a child-relationship with words. If a word does not
- appear as child, it will use the word bounding box.
- :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length a sliding
- windows will be created with each window having max_length sequence input. When using
- `sliding_window_stride=0` no strides will be created, otherwise it will create slides
- with windows shifted `sliding_window_stride` to the right.
- :return: A dict of layoutlm features
+ Example:
+ ```python
+ tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+ layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+ categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+ layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+ ```
+
+ Args:
+ dp: `Image` datapoint.
+ tokenizer: Tokenizer compatible with the language model.
+ padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+ `do_not_pad`.
+ truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+ removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+ provided.
+ If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model maximum
+ admissible input size).
+ return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens
+ can be returned as an additional batch element. In this case, the number of input batch samples will be
+ smaller than the output batch samples.
+ return_tensors: Output tensor features. Either `pt` for PyTorch models or `None` if features should be
+ returned in list objects.
+ input_width: Standard input size for image coordinates. All LayoutLM models require input features to be
+ normalized to an image width equal to 1000.
+ input_height: Standard input size for image coordinates. All LayoutLM models require input features to be
+ normalized to an image height equal to 1000.
+ image_width: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_width`, whereas
+ the image has to be resized to a different width. This input will only resize the `image` width.
+ image_height: Some models (e.g. `Layoutlmv2`) assume box coordinates to be normalized to `input_height`,
+ whereas the image has to be resized to a different height. This input will only resize the `image` height.
+ color_mode: Either `BGR` or `RGB`. Note that LayoutLMv2 uses `BGR` because of Detectron2 backbone, whereas
+ LayoutLMv3 uses `RGB`.
+ pixel_mean: (3,) array for `BGR` or `RGB` mean.
+ pixel_std: (3,) array for `BGR` or `RGB` std.
+ segment_positions: Using bounding boxes of segment instead of words improves model accuracy significantly.
+ Choose a single or a sequence of layout segments to use their bounding boxes. The layout segments need to
+ have a child-relationship with words. If a word does not appear as child, it will use the word bounding box.
+ sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding
+ windows will be created with each window having `max_length` sequence input. When using
+ `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows shifted
+ `sliding_window_stride` to the right.
+
+ Returns:
+ A dict of LayoutLM features.
  """
  raw_features = image_to_raw_layoutlm_features(
  None,
@@ -745,28 +769,36 @@ def image_to_raw_lm_features(
  include_residual_text_container: bool = False,
  ) -> Optional[RawLMFeatures]:
  """
- Mapping a datapoint into an intermediate format for bert-like models. Features will be provided into a dict and
+ Maps a datapoint into an intermediate format for BERT-like models. Features are provided in a dict and
  this mapping can be used for sequence or token classification as well as for inference. To generate input features
- for the model please `use raw_features_to_layoutlm_features`.
-
-
- :param dp: Image
- :param dataset_type: Either SEQUENCE_CLASSIFICATION or TOKEN_CLASSIFICATION. When using a built-in dataset use
- :param use_token_tag: Will only be used for dataset_type="token_classification". If use_token_tag=True, will use
- labels from sub category `WordType.token_tag` (with `B,I,O` suffix), otherwise
- `WordType.token_class`.
- :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
- :param floating_text_block_categories: A list of top level layout objects
- :param include_residual_text_container: This will regard synthetic text line annotations as floating text
- blocks and therefore incorporate all image annotations of category
- `word` when building text strings.
- :return: dictionary with the following arguments:
- 'image_id', 'width', 'height', 'ann_ids', 'words', 'bbox' and 'dataset_type'.
+ for the model, please use `raw_features_to_layoutlm_features`.
+
+ Args:
+ dp: `Image`.
+ dataset_type: Either `SEQUENCE_CLASSIFICATION` or `TOKEN_CLASSIFICATION`. When using a built-in dataset use
+ this.
+ use_token_tag: Used only for `dataset_type="token_classification"`. If `True`, uses labels from subcategory
+ `WordType.token_tag` (with `B,I,O` suffix), otherwise `WordType.token_class`.
+ text_container: A `LayoutType` to get the text from. It will steer the output of `Layout.words`.
+ floating_text_block_categories: A list of top-level layout objects.
+ include_residual_text_container: Regards synthetic text line annotations as floating text blocks and therefore
+ incorporates all image annotations of category `word` when building text strings.
+
+ Returns:
+ Dictionary with the following arguments: `image_id`, `width`, `height`, `ann_ids`, `words`, `bbox`, and
+ `dataset_type`.
  """

  raw_features: RawLMFeatures = RawLMFeatures({})

- page = Page.from_image(dp, text_container, floating_text_block_categories, include_residual_text_container)
+ # We do not need to configure residual_text_block_categories here, because text_ does ignore these layout sections
+ # anyway
+ page = Page.from_image(
+ image_orig=dp,
+ text_container=text_container,
+ floating_text_block_categories=floating_text_block_categories,
+ include_residual_text_container=include_residual_text_container,
+ )

  text_ = page.text_

@@ -808,42 +840,46 @@ def image_to_lm_features(
  include_residual_text_container: bool = False,
  ) -> Optional[LayoutLMFeatures]:
  """
- Mapping function to generate layoutlm features from `Image` to be used for inference in a pipeline component.
+ Mapping function to generate LayoutLM features from `Image` to be used for inference in a pipeline component.
  `LanguageModelPipelineComponent` has a positional argument `mapping_to_lm_input_func` that must be chosen
  with respect to the language model chosen. This mapper is devoted to generating features for LayoutLM. It will be
  used internally in `LMTokenClassifierService`.

- tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
- layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
- categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
-
- layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
-
- :param dp: Image datapoint
- :param tokenizer: Tokenizer compatible with the language model
- :param padding: A padding strategy to be passed to the tokenizer. Must bei either `max_length, longest` or
- `do_not_pad`.
- :param truncation: If "True" will truncate to a maximum length specified with the argument max_length or to the
- maximum acceptable input length for the model if that argument is not provided. This will
- truncate token by token, removing a token from the longest sequence in the pair if a pair of
- sequences (or a batch of pairs) is provided.
- If `False` then no truncation (i.e., can output batch with sequence lengths greater than the
- model maximum admissible input size).
- :param return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows the overflowing tokens
- can be returned as an additional batch element. Not that in this case, the number
- of input batch samples will be smaller than the output batch samples.
- :param return_tensors: Output tensor features. Either 'pt' for PyTorch models or None, if features should be
- returned in list objects.
- :param sliding_window_stride: If the output of the tokenizer exceeds the max_length sequence length a sliding
- windows will be created with each window having max_length sequence input. When using
- `sliding_window_stride=0` no strides will be created, otherwise it will create slides
- with windows shifted `sliding_window_stride` to the right.
- :param text_container: A LayoutType to get the text from. It will steer the output of `Layout.words`.
- :param floating_text_block_categories: A list of top level layout objects
- :param include_residual_text_container: This will regard synthetic text line annotations as floating text
- blocks and therefore incorporate all image annotations of category
- `word` when building text strings.
- :return: A dict of lm features
+ Example:
+ ```python
+ tokenizer = LayoutLMTokenizer.from_pretrained("mrm8488/layoutlm-finetuned-funsd")
+ layoutlm = HFLayoutLmTokenClassifier("path/to/config.json", "path/to/model.bin",
+ categories_explicit=['B-ANSWER', 'B-QUESTION', 'O'])
+ layoutlm_service = LMTokenClassifierService(tokenizer, layoutlm)
+ ```
+
+ Args:
+ dp: `Image` datapoint.
+ tokenizer: Tokenizer compatible with the language model.
+ padding: Padding strategy to be passed to the tokenizer. Must be either `max_length`, `longest`, or
+ `do_not_pad`.
+ truncation: If `True`, truncates to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. Truncates token by token,
+ removing a token from the longest sequence in the pair if a pair of sequences (or a batch of pairs) is
+ provided.
+ If `False`, no truncation (i.e., can output batch with sequence lengths greater than the model maximum
+ admissible input size).
+ return_overflowing_tokens: If a sequence (due to a truncation strategy) overflows, the overflowing tokens
+ can be returned as an additional batch element. In this case, the number of input batch samples will be
+ smaller than the output batch samples.
+ return_tensors: Output tensor features. Either `pt` for PyTorch models or `None` if features should be
+ returned in list objects.
+ sliding_window_stride: If the output of the tokenizer exceeds the `max_length` sequence length, sliding
+ windows will be created with each window having `max_length` sequence input. When using
+ `sliding_window_stride=0`, no strides will be created; otherwise, it will create slides with windows
+ shifted `sliding_window_stride` to the right.
+ text_container: A `LayoutType` to get the text from. It will steer the output of `Layout.words`.
+ floating_text_block_categories: A list of top-level layout objects.
+ include_residual_text_container: Regards synthetic text line annotations as floating text blocks and therefore
+ incorporates all image annotations of category `word` when building text strings.
+
+ Returns:
+ A dict of LM features.
  """
  raw_features = image_to_raw_lm_features( # pylint: disable=E1102
  dataset_type=None,