deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (124) hide show
  1. deepdoctection/__init__.py +4 -2
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +919 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +162 -108
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +205 -119
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +26 -17
  104. deepdoctection/utils/env_info.py +86 -37
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -71
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.1.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.1.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
@@ -16,7 +16,7 @@
16
16
  # limitations under the License.
17
17
 
18
18
  """
19
- HF Layoutlm model for diverse downstream tasks.
19
+ HF Layoutlm models.
20
20
  """
21
21
  from __future__ import annotations
22
22
 
@@ -87,9 +87,12 @@ def get_tokenizer_from_model_class(model_class: str, use_xlm_tokenizer: bool) ->
87
87
  We do not use the tokenizer for a particular model that the transformer library provides. The mapping therefore
88
88
  returns the tokenizer that should be used for a particular model.
89
89
 
90
- :param model_class: The model as stated in the transformer library.
91
- :param use_xlm_tokenizer: True if one uses the LayoutXLM. (The model cannot be distinguished from LayoutLMv2).
92
- :return: Tokenizer instance to use.
90
+ Args:
91
+ model_class: The model as stated in the transformer library.
92
+ use_xlm_tokenizer: True if one uses the `LayoutXLM`. (The model cannot be distinguished from `LayoutLMv2`).
93
+
94
+ Returns:
95
+ Tokenizer instance to use.
93
96
  """
94
97
  return {
95
98
  ("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
@@ -137,15 +140,18 @@ def predict_token_classes(
137
140
  images: Optional[torch.Tensor] = None,
138
141
  ) -> list[TokenClassResult]:
139
142
  """
140
- :param uuids: A list of uuids that correspond to a word that induces the resulting token
141
- :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
142
- :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
143
- :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
144
- :param boxes: Torch tensor of bounding boxes of type 'xyxy'
145
- :param tokens: List of original tokens taken from LayoutLMTokenizer
146
- :param model: layoutlm model for token classification
147
- :param images: A list of torch image tensors or None
148
- :return: A list of TokenClassResults
143
+ Args:
144
+ uuids: A list of uuids that correspond to a word that induces the resulting token
145
+ input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
146
+ attention_mask: The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
147
+ token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
148
+ boxes: Torch tensor of bounding boxes of type 'xyxy'
149
+ tokens: List of original tokens taken from `LayoutLMTokenizer`
150
+ model: layoutlm model for token classification
151
+ images: A list of torch image tensors or None
152
+
153
+ Returns:
154
+ A list of `TokenClassResult`s
149
155
  """
150
156
 
151
157
  if images is None:
@@ -195,13 +201,16 @@ def predict_sequence_classes(
195
201
  images: Optional[torch.Tensor] = None,
196
202
  ) -> SequenceClassResult:
197
203
  """
198
- :param input_ids: Token converted to ids to be taken from LayoutLMTokenizer
199
- :param attention_mask: The associated attention masks from padded sequences taken from LayoutLMTokenizer
200
- :param token_type_ids: Torch tensor of token type ids taken from LayoutLMTokenizer
201
- :param boxes: Torch tensor of bounding boxes of type 'xyxy'
202
- :param model: layoutlm model for sequence classification
203
- :param images: A list of torch image tensors or None
204
- :return: SequenceClassResult
204
+ Args:
205
+ input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
206
+ attention_mask: The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
207
+ token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
208
+ boxes: Torch tensor of bounding boxes of type `xyxy`
209
+ model: layoutlm model for sequence classification
210
+ images: A list of torch image tensors or None
211
+
212
+ Returns:
213
+ SequenceClassResult
205
214
  """
206
215
 
207
216
  if images is None:
@@ -229,7 +238,7 @@ def predict_sequence_classes(
229
238
 
230
239
  class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
231
240
  """
232
- Abstract base class for wrapping LayoutLM models for token classification into the deepdoctection framework.
241
+ Abstract base class for wrapping `LayoutLM` models for token classification into the framework.
233
242
  """
234
243
 
235
244
  def __init__(
@@ -243,17 +252,18 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
243
252
  use_xlm_tokenizer: bool = False,
244
253
  ):
245
254
  """
246
- :param path_config_json: path to .json config file
247
- :param path_weights: path to model artifact
248
- :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
249
- entities self. To be consistent with detectors use only values >0. Conversion will
250
- be done internally.
251
- :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
252
- consistent with detectors use only values>0. Conversion will be done internally.
253
- :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
254
- :param device: The device (cpu,"cuda"), where to place the model.
255
- :param use_xlm_tokenizer: True if one uses the LayoutXLM or a lilt model built with a xlm language model, e.g.
256
- info-xlm or roberta-xlm. (LayoutXLM cannot be distinguished from LayoutLMv2).
255
+ Args:
256
+ path_config_json: path to `.json` config file
257
+ path_weights: path to model artifact
258
+ categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
259
+ entities self. To be consistent with detectors use only values `>0`. Conversion will
260
+ be done internally.
261
+ categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. `BIO`). To be
262
+ consistent with detectors use only `values>0`. Conversion will be done internally.
263
+ categories: If you have a pre-trained model you can pass a complete dict of NER categories
264
+ device: The device (cpu,"cuda"), where to place the model.
265
+ use_xlm_tokenizer: True if one uses the `LayoutXLM` or a lilt model built with a xlm language model, e.g.
266
+ `info-xlm` or `roberta-xlm`. (`LayoutXLM` cannot be distinguished from LayoutLMv2).
257
267
  """
258
268
 
259
269
  if categories is None:
@@ -340,10 +350,15 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
340
350
 
341
351
  @staticmethod
342
352
  def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
343
- """A refinement for adding the tokenizer class name to the model configs.
353
+ """
354
+ A refinement for adding the tokenizer class name to the model configs.
355
+
356
+ Args:
357
+ model_class_name: The model name, e.g. `model.__class__.__name__`
358
+ use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
344
359
 
345
- :param model_class_name: The model name, e.g. model.__class__.__name__
346
- :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
360
+ Returns:
361
+ The name of the tokenizer class.
347
362
  """
348
363
  tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
349
364
  return tokenizer.__class__.__name__
@@ -366,31 +381,32 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
366
381
  Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
367
382
  classification and other things please use another model of the family.
368
383
 
369
- **Example**
384
+ Example:
385
+ ```python
386
+ # setting up compulsory ocr service
387
+ tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
388
+ tess = TesseractOcrDetector(tesseract_config_path)
389
+ ocr_service = TextExtractionService(tess)
370
390
 
371
- # setting up compulsory ocr service
372
- tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
373
- tess = TesseractOcrDetector(tesseract_config_path)
374
- ocr_service = TextExtractionService(tess)
391
+ # hf tokenizer and token classifier
392
+ tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
393
+ layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
394
+ categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
395
+ 'E-header', 'E-question', 'I-answer', 'I-header',
396
+ 'I-question', 'O', 'S-answer', 'S-header',
397
+ 'S-question'])
375
398
 
376
- # hf tokenizer and token classifier
377
- tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
378
- layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
379
- categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
380
- 'E-header', 'E-question', 'I-answer', 'I-header',
381
- 'I-question', 'O', 'S-answer', 'S-header',
382
- 'S-question'])
399
+ # token classification service
400
+ layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
383
401
 
384
- # token classification service
385
- layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
386
-
387
- pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
402
+ pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
388
403
 
389
- path = "path/to/some/form"
390
- df = pipe.analyze(path=path)
404
+ path = "path/to/some/form"
405
+ df = pipe.analyze(path=path)
391
406
 
392
- for dp in df:
393
- ...
407
+ for dp in df:
408
+ ...
409
+ ```
394
410
  """
395
411
 
396
412
  def __init__(
@@ -404,17 +420,18 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
404
420
  use_xlm_tokenizer: bool = False,
405
421
  ):
406
422
  """
407
- :param path_config_json: path to .json config file
408
- :param path_weights: path to model artifact
409
- :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
410
- entities self. To be consistent with detectors use only values >0. Conversion will
411
- be done internally.
412
- :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
413
- consistent with detectors use only values>0. Conversion will be done internally.
414
- :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
415
- :param device: The device (cpu,"cuda"), where to place the model.
416
- :param use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
417
- Tokenizer.
423
+ Args:
424
+ path_config_json: path to `.json` config file
425
+ path_weights: path to model artifact
426
+ categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
427
+ entities self. To be consistent with detectors use only values `>0`. Conversion will
428
+ be done internally.
429
+ categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. BIO). To be
430
+ consistent with detectors use only values>0. Conversion will be done internally.
431
+ categories: If you have a pre-trained model you can pass a complete dict of NER categories
432
+ device: The device (cpu,"cuda"), where to place the model.
433
+ use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
434
+ Tokenizer.
418
435
  """
419
436
  super().__init__(
420
437
  path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
@@ -431,17 +448,16 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
431
448
  """
432
449
  Launch inference on LayoutLm for token classification. Pass the following arguments
433
450
 
434
- `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
435
-
436
- `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
437
-
438
- `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
439
-
440
- `boxes:` Torch tensor of bounding boxes of type 'xyxy'
451
+ Args:
452
+ encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
453
+ attention_mask: The associated attention masks from padded sequences taken from
454
+ `LayoutLMTokenizer`
455
+ token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
456
+ boxes: Torch tensor of bounding boxes of type `xyxy`
457
+ tokens: List of original tokens taken from `LayoutLMTokenizer`
441
458
 
442
- `tokens:` List of original tokens taken from `LayoutLMTokenizer`
443
-
444
- :return: A list of TokenClassResults
459
+ Returns:
460
+ A list of `TokenClassResult`s
445
461
  """
446
462
 
447
463
  ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
@@ -459,9 +475,12 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
459
475
  """
460
476
  Get the inner (wrapped) model.
461
477
 
462
- :param path_config_json: path to .json config file
463
- :param path_weights: path to model artifact
464
- :return: 'nn.Module'
478
+ Args:
479
+ path_config_json: path to .json config file
480
+ path_weights: path to model artifact
481
+
482
+ Returns:
483
+ `nn.Module`
465
484
  """
466
485
  config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
467
486
  return LayoutLMForTokenClassification.from_pretrained(
@@ -481,31 +500,32 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
481
500
 
482
501
  Note, that you must use `LayoutLMTokenizerFast` as tokenizer. `LayoutLMv2TokenizerFast` will not be accepted.
483
502
 
484
- **Example**
503
+ Example:
504
+ ```python
505
+ # setting up compulsory ocr service
506
+ tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
507
+ tess = TesseractOcrDetector(tesseract_config_path)
508
+ ocr_service = TextExtractionService(tess)
485
509
 
486
- # setting up compulsory ocr service
487
- tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
488
- tess = TesseractOcrDetector(tesseract_config_path)
489
- ocr_service = TextExtractionService(tess)
510
+ # hf tokenizer and token classifier
511
+ tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
512
+ layoutlm = HFLayoutLmv2TokenClassifier("path/to/config.json","path/to/model.bin",
513
+ categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
514
+ 'E-header', 'E-question', 'I-answer', 'I-header',
515
+ 'I-question', 'O', 'S-answer', 'S-header',
516
+ 'S-question'])
490
517
 
491
- # hf tokenizer and token classifier
492
- tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
493
- layoutlm = HFLayoutLmv2TokenClassifier("path/to/config.json","path/to/model.bin",
494
- categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
495
- 'E-header', 'E-question', 'I-answer', 'I-header',
496
- 'I-question', 'O', 'S-answer', 'S-header',
497
- 'S-question'])
498
-
499
- # token classification service
500
- layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
518
+ # token classification service
519
+ layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
501
520
 
502
- pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
521
+ pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
503
522
 
504
- path = "path/to/some/form"
505
- df = pipe.analyze(path=path)
523
+ path = "path/to/some/form"
524
+ df = pipe.analyze(path=path)
506
525
 
507
- for dp in df:
508
- ...
526
+ for dp in df:
527
+ ...
528
+ ```
509
529
  """
510
530
 
511
531
  def __init__(
@@ -519,17 +539,18 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
519
539
  use_xlm_tokenizer: bool = False,
520
540
  ):
521
541
  """
522
- :param path_config_json: path to .json config file
523
- :param path_weights: path to model artifact
524
- :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
525
- entities self. To be consistent with detectors use only values >0. Conversion will
526
- be done internally.
527
- :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
528
- consistent with detectors use only values>0. Conversion will be done internally.
529
- :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
530
- :param device: The device (cpu,"cuda"), where to place the model.
531
- :param use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a LayoutLMv2 model keep the
532
- default value.
542
+ Args:
543
+ path_config_json: path to `.json` config file
544
+ path_weights: path to model artifact
545
+ categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
546
+ entities self. To be consistent with detectors use only values `>0`. Conversion will
547
+ be done internally.
548
+ categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. `BIO`). To be
549
+ consistent with detectors use only values>0. Conversion will be done internally.
550
+ categories: If you have a pre-trained model you can pass a complete dict of `NER` categories
551
+ device: The device (cpu,"cuda"), where to place the model.
552
+ use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a `LayoutLMv2` model keep the
553
+ default value.
533
554
  """
534
555
  super().__init__(
535
556
  path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
@@ -544,19 +565,18 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
544
565
 
545
566
  def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
546
567
  """
547
- Launch inference on LayoutLm for token classification. Pass the following arguments
548
-
549
- `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
550
-
551
- `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
552
-
553
- `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
554
-
555
- `boxes:` Torch tensor of bounding boxes of type `xyxy`
556
-
557
- `tokens:` List of original tokens taken from `LayoutLMTokenizer`
558
-
559
- :return: A list of TokenClassResults
568
+ Launch inference on `LayoutLm` for token classification. Pass the following arguments
569
+
570
+ Args:
571
+ encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
572
+ attention_mask: The associated attention masks from padded sequences taken from
573
+ `LayoutLMTokenizer`
574
+ token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
575
+ boxes: Torch tensor of bounding boxes of type `xyxy`
576
+ tokens: List of original tokens taken from `LayoutLMTokenizer`
577
+
578
+ Returns:
579
+ A list of `TokenClassResult`s
560
580
  """
561
581
 
562
582
  ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
@@ -799,8 +819,9 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
799
819
  def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
800
820
  """A refinement for adding the tokenizer class name to the model configs.
801
821
 
802
- :param model_class_name: The model name, e.g. model.__class__.__name__
803
- :param use_xlm_tokenizer: Whether to use a XLM tokenizer.
822
+ Args:
823
+ model_class_name: The model name, e.g. `model.__class__.__name__`
824
+ use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
804
825
  """
805
826
  tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
806
827
  return tokenizer.__class__.__name__
@@ -823,28 +844,29 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
823
844
  Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
824
845
  classification and other things please use another model of the family.
825
846
 
826
- **Example**
827
-
828
- # setting up compulsory ocr service
829
- tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
830
- tess = TesseractOcrDetector(tesseract_config_path)
831
- ocr_service = TextExtractionService(tess)
847
+ Example:
848
+ ```python
849
+ # setting up compulsory ocr service
850
+ tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
851
+ tess = TesseractOcrDetector(tesseract_config_path)
852
+ ocr_service = TextExtractionService(tess)
832
853
 
833
- # hf tokenizer and token classifier
834
- tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
835
- layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json","path/to/model.bin",
836
- categories=["handwritten", "presentation", "resume"])
854
+ # hf tokenizer and token classifier
855
+ tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
856
+ layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json","path/to/model.bin",
857
+ categories=["handwritten", "presentation", "resume"])
837
858
 
838
- # token classification service
839
- layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
859
+ # token classification service
860
+ layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
840
861
 
841
- pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
862
+ pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
842
863
 
843
- path = "path/to/some/form"
844
- df = pipe.analyze(path=path)
864
+ path = "path/to/some/form"
865
+ df = pipe.analyze(path=path)
845
866
 
846
- for dp in df:
847
- ...
867
+ for dp in df:
868
+ ...
869
+ ```
848
870
  """
849
871
 
850
872
  def __init__(
@@ -855,6 +877,16 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
855
877
  device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
856
878
  use_xlm_tokenizer: bool = False,
857
879
  ):
880
+ """
881
+ Args:
882
+ path_config_json: path to `.json` config file
883
+ path_weights: path to model artifact
884
+ categories: A dict with key (indices) and values (category names) for sequence classification.
885
+ To be consistent with detectors use only values `>0`. Conversion will be done internally.
886
+ device: The device ("cpu","cuda"), where to place the model.
887
+ use_xlm_tokenizer: Do not change this value unless you pre-trained a `LayoutLM` model with a different
888
+ Tokenizer.
889
+ """
858
890
  super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
859
891
  self.name = self.get_name(path_weights, "LayoutLM")
860
892
  self.model_id = self.get_model_id()
@@ -865,6 +897,16 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
865
897
  )
866
898
 
867
899
  def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
900
+ """
901
+ Launch inference on LayoutLm for sequence classification. Pass the following arguments
902
+
903
+ Args:
904
+ encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
905
+ attention_mask: The associated attention masks from padded sequences taken from
906
+ `LayoutLMTokenizer`
907
+ token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
908
+ boxes: Torch tensor of bounding boxes of type `xyxy`
909
+ """
868
910
  input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
869
911
 
870
912
  result = predict_sequence_classes(
@@ -886,9 +928,12 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
886
928
  """
887
929
  Get the inner (wrapped) model.
888
930
 
889
- :param path_config_json: path to .json config file
890
- :param path_weights: path to model artifact
891
- :return: 'nn.Module'
931
+ Args:
932
+ path_config_json: path to `.json` config file
933
+ path_weights: path to model artifact
934
+
935
+ Returns:
936
+ 'nn.Module'
892
937
  """
893
938
  config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
894
939
  return LayoutLMForSequenceClassification.from_pretrained(
@@ -906,28 +951,29 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
906
951
  itself. Note that this model is equipped with a head that is only useful for classifying the input sequence. For
907
952
  token classification and other things please use another model of the family.
908
953
 
909
- **Example**
910
-
911
- # setting up compulsory ocr service
912
- tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
913
- tess = TesseractOcrDetector(tesseract_config_path)
914
- ocr_service = TextExtractionService(tess)
954
+ Example:
955
+ ```python
956
+ # setting up compulsory ocr service
957
+ tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
958
+ tess = TesseractOcrDetector(tesseract_config_path)
959
+ ocr_service = TextExtractionService(tess)
915
960
 
916
- # hf tokenizer and token classifier
917
- tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
918
- layoutlm = HFLayoutLmv2SequenceClassifier("path/to/config.json","path/to/model.bin",
919
- categories=["handwritten", "presentation", "resume"])
961
+ # hf tokenizer and token classifier
962
+ tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
963
+ layoutlm = HFLayoutLmv2SequenceClassifier("path/to/config.json","path/to/model.bin",
964
+ categories=["handwritten", "presentation", "resume"])
920
965
 
921
- # token classification service
922
- layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
966
+ # token classification service
967
+ layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
923
968
 
924
- pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
969
+ pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
925
970
 
926
- path = "path/to/some/form"
927
- df = pipe.analyze(path=path)
971
+ path = "path/to/some/form"
972
+ df = pipe.analyze(path=path)
928
973
 
929
- for dp in df:
930
- ...
974
+ for dp in df:
975
+ ...
976
+ ```
931
977
  """
932
978
 
933
979
  def __init__(
@@ -938,6 +984,16 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
938
984
  device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
939
985
  use_xlm_tokenizer: bool = False,
940
986
  ):
987
+ """
988
+ Args:
989
+ path_config_json: path to `.json` config file
990
+ path_weights: path to model artifact
991
+ categories: A dict with key (indices) and values (category names) for sequence classification.
992
+ To be consistent with detectors use only values `>0`. Conversion will be done internally.
993
+ device: The device ("cpu","cuda"), where to place the model.
994
+ use_xlm_tokenizer: Do not change this value unless you pre-trained a `LayoutLM` model with a different
995
+ Tokenizer.
996
+ """
941
997
  super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
942
998
  self.name = self.get_name(path_weights, "LayoutLMv2")
943
999
  self.model_id = self.get_model_id()
@@ -948,6 +1004,16 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
948
1004
  )
949
1005
 
950
1006
  def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
1007
+ """
1008
+ Launch inference on LayoutLm for sequence classification. Pass the following arguments
1009
+
1010
+ Args:
1011
+ encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
1012
+ attention_mask: The associated attention masks from padded sequences taken from
1013
+ `LayoutLMTokenizer`
1014
+ token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
1015
+ boxes: Torch tensor of bounding boxes of type `xyxy`
1016
+ """
951
1017
  input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
952
1018
  images = encodings.get("image")
953
1019
  if isinstance(images, torch.Tensor):
@@ -976,9 +1042,12 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
976
1042
  """
977
1043
  Get the inner (wrapped) model.
978
1044
 
979
- :param path_config_json: path to .json config file
980
- :param path_weights: path to model artifact
981
- :return: 'nn.Module'
1045
+ Args:
1046
+ path_config_json: path to `.json` config file
1047
+ path_weights: path to model artifact
1048
+
1049
+ Returns:
1050
+ 'nn.Module'
982
1051
  """
983
1052
  config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
984
1053
  return LayoutLMv2ForSequenceClassification.from_pretrained(
@@ -996,28 +1065,29 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
996
1065
  itself. Note that this model is equipped with a head that is only useful for classifying the input sequence. For
997
1066
  token classification and other things please use another model of the family.
998
1067
 
999
- **Example**
1068
+ Example:
1069
+ ```python
1070
+ # setting up compulsory ocr service
1071
+ tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
1072
+ tess = TesseractOcrDetector(tesseract_config_path)
1073
+ ocr_service = TextExtractionService(tess)
1000
1074
 
1001
- # setting up compulsory ocr service
1002
- tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
1003
- tess = TesseractOcrDetector(tesseract_config_path)
1004
- ocr_service = TextExtractionService(tess)
1075
+ # hf tokenizer and token classifier
1076
+ tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
1077
+ layoutlm = HFLayoutLmv3SequenceClassifier("path/to/config.json","path/to/model.bin",
1078
+ categories=["handwritten", "presentation", "resume"])
1005
1079
 
1006
- # hf tokenizer and token classifier
1007
- tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
1008
- layoutlm = HFLayoutLmv3SequenceClassifier("path/to/config.json","path/to/model.bin",
1009
- categories=["handwritten", "presentation", "resume"])
1080
+ # token classification service
1081
+ layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
1010
1082
 
1011
- # token classification service
1012
- layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
1083
+ pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
1013
1084
 
1014
- pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
1085
+ path = "path/to/some/form"
1086
+ df = pipe.analyze(path=path)
1015
1087
 
1016
- path = "path/to/some/form"
1017
- df = pipe.analyze(path=path)
1018
-
1019
- for dp in df:
1020
- ...
1088
+ for dp in df:
1089
+ ...
1090
+ ```
1021
1091
  """
1022
1092
 
1023
1093
  def __init__(
@@ -1072,9 +1142,12 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
1072
1142
  """
1073
1143
  Get the inner (wrapped) model.
1074
1144
 
1075
- :param path_config_json: path to .json config file
1076
- :param path_weights: path to model artifact
1077
- :return: 'nn.Module'
1145
+ Args:
1146
+ path_config_json: path to `.json` config file
1147
+ path_weights: path to model artifact
1148
+
1149
+ Returns:
1150
+ 'nn.Module'
1078
1151
  """
1079
1152
  config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
1080
1153
  return LayoutLMv3ForSequenceClassification.from_pretrained(
@@ -1092,31 +1165,32 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
1092
1165
  Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
1093
1166
  classification and other things please use another model of the family.
1094
1167
 
1095
- **Example**
1096
-
1097
- # setting up compulsory ocr service
1098
- tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
1099
- tess = TesseractOcrDetector(tesseract_config_path)
1100
- ocr_service = TextExtractionService(tess)
1168
+ Example:
1169
+ ```python
1170
+ # setting up compulsory ocr service
1171
+ tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
1172
+ tess = TesseractOcrDetector(tesseract_config_path)
1173
+ ocr_service = TextExtractionService(tess)
1101
1174
 
1102
- # hf tokenizer and token classifier
1103
- tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
1104
- lilt = HFLiltTokenClassifier("path/to/config.json","path/to/model.bin",
1105
- categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
1106
- 'E-header', 'E-question', 'I-answer', 'I-header',
1107
- 'I-question', 'O', 'S-answer', 'S-header',
1108
- 'S-question'])
1175
+ # hf tokenizer and token classifier
1176
+ tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
1177
+ lilt = HFLiltTokenClassifier("path/to/config.json","path/to/model.bin",
1178
+ categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
1179
+ 'E-header', 'E-question', 'I-answer', 'I-header',
1180
+ 'I-question', 'O', 'S-answer', 'S-header',
1181
+ 'S-question'])
1109
1182
 
1110
- # token classification service
1111
- lilt_service = LMTokenClassifierService(tokenizer,lilt)
1183
+ # token classification service
1184
+ lilt_service = LMTokenClassifierService(tokenizer,lilt)
1112
1185
 
1113
- pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
1186
+ pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
1114
1187
 
1115
- path = "path/to/some/form"
1116
- df = pipe.analyze(path=path)
1188
+ path = "path/to/some/form"
1189
+ df = pipe.analyze(path=path)
1117
1190
 
1118
- for dp in df:
1119
- ...
1191
+ for dp in df:
1192
+ ...
1193
+ ```
1120
1194
  """
1121
1195
 
1122
1196
  def __init__(
@@ -1130,15 +1204,16 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
1130
1204
  use_xlm_tokenizer: bool = False,
1131
1205
  ):
1132
1206
  """
1133
- :param path_config_json: path to .json config file
1134
- :param path_weights: path to model artifact
1135
- :param categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
1136
- entities self. To be consistent with detectors use only values >0. Conversion will
1137
- be done internally.
1138
- :param categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. BIO). To be
1139
- consistent with detectors use only values>0. Conversion will be done internally.
1140
- :param categories: If you have a pre-trained model you can pass a complete dict of NER categories
1141
- :param device: The device (cpu,"cuda"), where to place the model.
1207
+ Args:
1208
+ path_config_json: path to `.json` config file
1209
+ path_weights: path to model artifact
1210
+ categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
1211
+ entities self. To be consistent with detectors use only values `>0`. Conversion will
1212
+ be done internally.
1213
+ categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. `BIO`). To be
1214
+ consistent with detectors use only values>0. Conversion will be done internally.
1215
+ categories: If you have a pre-trained model you can pass a complete dict of `NER` categories
1216
+ device: The device ("cpu","cuda"), where to place the model.
1142
1217
  """
1143
1218
 
1144
1219
  super().__init__(
@@ -1156,17 +1231,16 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
1156
1231
  """
1157
1232
  Launch inference on LayoutLm for token classification. Pass the following arguments
1158
1233
 
1159
- `input_ids:` Token converted to ids to be taken from `LayoutLMTokenizer`
1160
-
1161
- `attention_mask:` The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
1234
+ Args:
1235
+ encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
1236
+ attention_mask: The associated attention masks from padded sequences taken from
1237
+ `LayoutLMTokenizer`
1238
+ token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
1239
+ boxes: Torch tensor of bounding boxes of type `xyxy`
1240
+ tokens: List of original tokens taken from `LayoutLMTokenizer`
1162
1241
 
1163
- `token_type_ids:` Torch tensor of token type ids taken from `LayoutLMTokenizer`
1164
-
1165
- `boxes:` Torch tensor of bounding boxes of type 'xyxy'
1166
-
1167
- `tokens:` List of original tokens taken from `LayoutLMTokenizer`
1168
-
1169
- :return: A list of TokenClassResults
1242
+ Returns:
1243
+ A list of `TokenClassResult`s
1170
1244
  """
1171
1245
 
1172
1246
  ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
@@ -1182,9 +1256,12 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
1182
1256
  """
1183
1257
  Get the inner (wrapped) model.
1184
1258
 
1185
- :param path_config_json: path to .json config file
1186
- :param path_weights: path to model artifact
1187
- :return: 'nn.Module'
1259
+ Args:
1260
+ path_config_json: path to `.json` config file
1261
+ path_weights: path to model artifact
1262
+
1263
+ Returns:
1264
+ `nn.Module`
1188
1265
  """
1189
1266
  config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
1190
1267
  return LiltForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
@@ -1200,29 +1277,30 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
1200
1277
  Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
1201
1278
  classification and other things please use another model of the family.
1202
1279
 
1203
- **Example**
1280
+ Example:
1281
+ ```python
1282
+ # setting up compulsory ocr service
1283
+ tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
1284
+ tess = TesseractOcrDetector(tesseract_config_path)
1285
+ ocr_service = TextExtractionService(tess)
1204
1286
 
1205
- # setting up compulsory ocr service
1206
- tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
1207
- tess = TesseractOcrDetector(tesseract_config_path)
1208
- ocr_service = TextExtractionService(tess)
1287
+ # hf tokenizer and sequence classifier
1288
+ tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
1289
+ lilt = HFLiltSequenceClassifier("path/to/config.json",
1290
+ "path/to/model.bin",
1291
+ categories=["handwritten", "presentation", "resume"])
1209
1292
 
1210
- # hf tokenizer and sequence classifier
1211
- tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
1212
- lilt = HFLiltSequenceClassifier("path/to/config.json",
1213
- "path/to/model.bin",
1214
- categories=["handwritten", "presentation", "resume"])
1293
+ # sequence classification service
1294
+ lilt_service = LMSequenceClassifierService(tokenizer,lilt)
1215
1295
 
1216
- # sequence classification service
1217
- lilt_service = LMSequenceClassifierService(tokenizer,lilt)
1296
+ pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
1218
1297
 
1219
- pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
1220
-
1221
- path = "path/to/some/form"
1222
- df = pipe.analyze(path=path)
1298
+ path = "path/to/some/form"
1299
+ df = pipe.analyze(path=path)
1223
1300
 
1224
- for dp in df:
1225
- ...
1301
+ for dp in df:
1302
+ ...
1303
+ ```
1226
1304
  """
1227
1305
 
1228
1306
  def __init__(
@@ -1262,9 +1340,12 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
1262
1340
  """
1263
1341
  Get the inner (wrapped) model.
1264
1342
 
1265
- :param path_config_json: path to .json config file
1266
- :param path_weights: path to model artifact
1267
- :return: 'nn.Module'
1343
+ Args:
1344
+ path_config_json: path to `.json` config file
1345
+ path_weights: path to model artifact
1346
+
1347
+ Returns:
1348
+ `nn.Module`
1268
1349
  """
1269
1350
  config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
1270
1351
  return LiltForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)