deepdoctection 0.42.0__py3-none-any.whl → 0.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +2 -1
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +904 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +157 -106
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +196 -113
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +25 -17
- deepdoctection/utils/env_info.py +85 -36
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -62
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.dist-info/METADATA +376 -0
- deepdoctection-0.43.dist-info/RECORD +149 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/WHEEL +1 -1
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.0.dist-info/METADATA +0 -431
- deepdoctection-0.42.0.dist-info/RECORD +0 -148
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.0.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
# limitations under the License.
|
|
17
17
|
|
|
18
18
|
"""
|
|
19
|
-
HF Layoutlm
|
|
19
|
+
HF Layoutlm models.
|
|
20
20
|
"""
|
|
21
21
|
from __future__ import annotations
|
|
22
22
|
|
|
@@ -87,9 +87,12 @@ def get_tokenizer_from_model_class(model_class: str, use_xlm_tokenizer: bool) ->
|
|
|
87
87
|
We do not use the tokenizer for a particular model that the transformer library provides. This mapping therefore
|
|
88
88
|
returns the tokenizer that should be used for a particular model.
|
|
89
89
|
|
|
90
|
-
:
|
|
91
|
-
|
|
92
|
-
|
|
90
|
+
Args:
|
|
91
|
+
model_class: The model as stated in the transformer library.
|
|
92
|
+
use_xlm_tokenizer: True if one uses the `LayoutXLM`. (The model cannot be distinguished from `LayoutLMv2`).
|
|
93
|
+
|
|
94
|
+
Returns:
|
|
95
|
+
Tokenizer instance to use.
|
|
93
96
|
"""
|
|
94
97
|
return {
|
|
95
98
|
("LayoutLMForTokenClassification", False): LayoutLMTokenizerFast.from_pretrained(
|
|
@@ -137,15 +140,18 @@ def predict_token_classes(
|
|
|
137
140
|
images: Optional[torch.Tensor] = None,
|
|
138
141
|
) -> list[TokenClassResult]:
|
|
139
142
|
"""
|
|
140
|
-
:
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
143
|
+
Args:
|
|
144
|
+
uuids: A list of uuids that correspond to a word that induces the resulting token
|
|
145
|
+
input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
|
|
146
|
+
attention_mask: The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
|
|
147
|
+
token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
|
|
148
|
+
boxes: Torch tensor of bounding boxes of type 'xyxy'
|
|
149
|
+
tokens: List of original tokens taken from `LayoutLMTokenizer`
|
|
150
|
+
model: layoutlm model for token classification
|
|
151
|
+
images: A list of torch image tensors or None
|
|
152
|
+
|
|
153
|
+
Returns:
|
|
154
|
+
A list of `TokenClassResult`s
|
|
149
155
|
"""
|
|
150
156
|
|
|
151
157
|
if images is None:
|
|
@@ -195,13 +201,16 @@ def predict_sequence_classes(
|
|
|
195
201
|
images: Optional[torch.Tensor] = None,
|
|
196
202
|
) -> SequenceClassResult:
|
|
197
203
|
"""
|
|
198
|
-
:
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
204
|
+
Args:
|
|
205
|
+
input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
|
|
206
|
+
attention_mask: The associated attention masks from padded sequences taken from `LayoutLMTokenizer`
|
|
207
|
+
token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
|
|
208
|
+
boxes: Torch tensor of bounding boxes of type `xyxy`
|
|
209
|
+
model: layoutlm model for sequence classification
|
|
210
|
+
images: A list of torch image tensors or None
|
|
211
|
+
|
|
212
|
+
Returns:
|
|
213
|
+
SequenceClassResult
|
|
205
214
|
"""
|
|
206
215
|
|
|
207
216
|
if images is None:
|
|
@@ -229,7 +238,7 @@ def predict_sequence_classes(
|
|
|
229
238
|
|
|
230
239
|
class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
|
|
231
240
|
"""
|
|
232
|
-
Abstract base class for wrapping LayoutLM models for token classification into the
|
|
241
|
+
Abstract base class for wrapping `LayoutLM` models for token classification into the framework.
|
|
233
242
|
"""
|
|
234
243
|
|
|
235
244
|
def __init__(
|
|
@@ -243,17 +252,18 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
|
|
|
243
252
|
use_xlm_tokenizer: bool = False,
|
|
244
253
|
):
|
|
245
254
|
"""
|
|
246
|
-
:
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
255
|
+
Args:
|
|
256
|
+
path_config_json: path to `.json` config file
|
|
257
|
+
path_weights: path to model artifact
|
|
258
|
+
categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
|
|
259
|
+
entities themselves. To be consistent with detectors use only values `>0`. Conversion will
|
|
260
|
+
be done internally.
|
|
261
|
+
categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. `BIO`). To be
|
|
262
|
+
consistent with detectors use only `values>0`. Conversion will be done internally.
|
|
263
|
+
categories: If you have a pre-trained model you can pass a complete dict of NER categories
|
|
264
|
+
device: The device ("cpu","cuda"), where to place the model.
|
|
265
|
+
use_xlm_tokenizer: True if one uses the `LayoutXLM` or a lilt model built with a xlm language model, e.g.
|
|
266
|
+
`info-xlm` or `roberta-xlm`. (`LayoutXLM` cannot be distinguished from LayoutLMv2).
|
|
257
267
|
"""
|
|
258
268
|
|
|
259
269
|
if categories is None:
|
|
@@ -340,10 +350,15 @@ class HFLayoutLmTokenClassifierBase(LMTokenClassifier, ABC):
|
|
|
340
350
|
|
|
341
351
|
@staticmethod
|
|
342
352
|
def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
|
|
343
|
-
"""
|
|
353
|
+
"""
|
|
354
|
+
A refinement for adding the tokenizer class name to the model configs.
|
|
355
|
+
|
|
356
|
+
Args:
|
|
357
|
+
model_class_name: The model name, e.g. `model.__class__.__name__`
|
|
358
|
+
use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
|
|
344
359
|
|
|
345
|
-
:
|
|
346
|
-
|
|
360
|
+
Returns:
|
|
361
|
+
The name of the tokenizer class.
|
|
347
362
|
"""
|
|
348
363
|
tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
|
|
349
364
|
return tokenizer.__class__.__name__
|
|
@@ -366,31 +381,32 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
366
381
|
Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
|
|
367
382
|
classification and other things please use another model of the family.
|
|
368
383
|
|
|
369
|
-
|
|
384
|
+
Example:
|
|
385
|
+
```python
|
|
386
|
+
# setting up compulsory ocr service
|
|
387
|
+
tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
|
|
388
|
+
tess = TesseractOcrDetector(tesseract_config_path)
|
|
389
|
+
ocr_service = TextExtractionService(tess)
|
|
370
390
|
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
391
|
+
# hf tokenizer and token classifier
|
|
392
|
+
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
|
|
393
|
+
layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
|
|
394
|
+
categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
|
|
395
|
+
'E-header', 'E-question', 'I-answer', 'I-header',
|
|
396
|
+
'I-question', 'O', 'S-answer', 'S-header',
|
|
397
|
+
'S-question'])
|
|
375
398
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
layoutlm = HFLayoutLmTokenClassifier("path/to/config.json","path/to/model.bin",
|
|
379
|
-
categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
|
|
380
|
-
'E-header', 'E-question', 'I-answer', 'I-header',
|
|
381
|
-
'I-question', 'O', 'S-answer', 'S-header',
|
|
382
|
-
'S-question'])
|
|
399
|
+
# token classification service
|
|
400
|
+
layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
|
|
383
401
|
|
|
384
|
-
|
|
385
|
-
layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
|
|
386
|
-
|
|
387
|
-
pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
|
|
402
|
+
pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
|
|
388
403
|
|
|
389
|
-
|
|
390
|
-
|
|
404
|
+
path = "path/to/some/form"
|
|
405
|
+
df = pipe.analyze(path=path)
|
|
391
406
|
|
|
392
|
-
|
|
393
|
-
|
|
407
|
+
for dp in df:
|
|
408
|
+
...
|
|
409
|
+
```
|
|
394
410
|
"""
|
|
395
411
|
|
|
396
412
|
def __init__(
|
|
@@ -404,17 +420,18 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
404
420
|
use_xlm_tokenizer: bool = False,
|
|
405
421
|
):
|
|
406
422
|
"""
|
|
407
|
-
:
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
423
|
+
Args:
|
|
424
|
+
path_config_json: path to `.json` config file
|
|
425
|
+
path_weights: path to model artifact
|
|
426
|
+
categories_semantics: A dict with key (indices) and values (category names) for NER semantics, i.e. the
|
|
427
|
+
entities themselves. To be consistent with detectors use only values `>0`. Conversion will
|
|
428
|
+
be done internally.
|
|
429
|
+
categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. BIO). To be
|
|
430
|
+
consistent with detectors use only values>0. Conversion will be done internally.
|
|
431
|
+
categories: If you have a pre-trained model you can pass a complete dict of NER categories
|
|
432
|
+
device: The device ("cpu","cuda"), where to place the model.
|
|
433
|
+
use_xlm_tokenizer: Do not change this value unless you pre-trained a LayoutLM model with a different
|
|
434
|
+
Tokenizer.
|
|
418
435
|
"""
|
|
419
436
|
super().__init__(
|
|
420
437
|
path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
|
|
@@ -431,17 +448,16 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
431
448
|
"""
|
|
432
449
|
Launch inference on LayoutLm for token classification. Pass the following arguments
|
|
433
450
|
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
451
|
+
Args:
|
|
452
|
+
encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
|
|
453
|
+
attention_mask: The associated attention masks from padded sequences taken from
|
|
454
|
+
`LayoutLMTokenizer`
|
|
455
|
+
token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
|
|
456
|
+
boxes: Torch tensor of bounding boxes of type `xyxy`
|
|
457
|
+
tokens: List of original tokens taken from `LayoutLMTokenizer`
|
|
441
458
|
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
:return: A list of TokenClassResults
|
|
459
|
+
Returns:
|
|
460
|
+
A list of `TokenClassResult`s
|
|
445
461
|
"""
|
|
446
462
|
|
|
447
463
|
ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
|
|
@@ -459,9 +475,12 @@ class HFLayoutLmTokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
459
475
|
"""
|
|
460
476
|
Get the inner (wrapped) model.
|
|
461
477
|
|
|
462
|
-
:
|
|
463
|
-
|
|
464
|
-
|
|
478
|
+
Args:
|
|
479
|
+
path_config_json: path to .json config file
|
|
480
|
+
path_weights: path to model artifact
|
|
481
|
+
|
|
482
|
+
Returns:
|
|
483
|
+
`nn.Module`
|
|
465
484
|
"""
|
|
466
485
|
config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
|
|
467
486
|
return LayoutLMForTokenClassification.from_pretrained(
|
|
@@ -481,31 +500,32 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
481
500
|
|
|
482
501
|
Note, that you must use `LayoutLMTokenizerFast` as tokenizer. `LayoutLMv2TokenizerFast` will not be accepted.
|
|
483
502
|
|
|
484
|
-
|
|
503
|
+
Example:
|
|
504
|
+
```python
|
|
505
|
+
# setting up compulsory ocr service
|
|
506
|
+
tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
|
|
507
|
+
tess = TesseractOcrDetector(tesseract_config_path)
|
|
508
|
+
ocr_service = TextExtractionService(tess)
|
|
485
509
|
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
510
|
+
# hf tokenizer and token classifier
|
|
511
|
+
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
|
|
512
|
+
layoutlm = HFLayoutLmv2TokenClassifier("path/to/config.json","path/to/model.bin",
|
|
513
|
+
categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
|
|
514
|
+
'E-header', 'E-question', 'I-answer', 'I-header',
|
|
515
|
+
'I-question', 'O', 'S-answer', 'S-header',
|
|
516
|
+
'S-question'])
|
|
490
517
|
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
layoutlm = HFLayoutLmv2TokenClassifier("path/to/config.json","path/to/model.bin",
|
|
494
|
-
categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
|
|
495
|
-
'E-header', 'E-question', 'I-answer', 'I-header',
|
|
496
|
-
'I-question', 'O', 'S-answer', 'S-header',
|
|
497
|
-
'S-question'])
|
|
498
|
-
|
|
499
|
-
# token classification service
|
|
500
|
-
layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
|
|
518
|
+
# token classification service
|
|
519
|
+
layoutlm_service = LMTokenClassifierService(tokenizer,layoutlm)
|
|
501
520
|
|
|
502
|
-
|
|
521
|
+
pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
|
|
503
522
|
|
|
504
|
-
|
|
505
|
-
|
|
523
|
+
path = "path/to/some/form"
|
|
524
|
+
df = pipe.analyze(path=path)
|
|
506
525
|
|
|
507
|
-
|
|
508
|
-
|
|
526
|
+
for dp in df:
|
|
527
|
+
...
|
|
528
|
+
```
|
|
509
529
|
"""
|
|
510
530
|
|
|
511
531
|
def __init__(
|
|
@@ -519,17 +539,18 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
519
539
|
use_xlm_tokenizer: bool = False,
|
|
520
540
|
):
|
|
521
541
|
"""
|
|
522
|
-
:
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
542
|
+
Args:
|
|
543
|
+
path_config_json: path to `.json` config file
|
|
544
|
+
path_weights: path to model artifact
|
|
545
|
+
categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
|
|
546
|
+
entities themselves. To be consistent with detectors use only values `>0`. Conversion will
|
|
547
|
+
be done internally.
|
|
548
|
+
categories_bio: A dict with key (indices) and values (category names) for `NER` tags (i.e. `BIO`). To be
|
|
549
|
+
consistent with detectors use only values>0. Conversion will be done internally.
|
|
550
|
+
categories: If you have a pre-trained model you can pass a complete dict of `NER` categories
|
|
551
|
+
device: The device ("cpu","cuda"), where to place the model.
|
|
552
|
+
use_xlm_tokenizer: Set to True if you use a LayoutXLM model. If you use a `LayoutLMv2` model keep the
|
|
553
|
+
default value.
|
|
533
554
|
"""
|
|
534
555
|
super().__init__(
|
|
535
556
|
path_config_json, path_weights, categories_semantics, categories_bio, categories, device, use_xlm_tokenizer
|
|
@@ -544,19 +565,18 @@ class HFLayoutLmv2TokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
544
565
|
|
|
545
566
|
def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> list[TokenClassResult]:
|
|
546
567
|
"""
|
|
547
|
-
Launch inference on LayoutLm for token classification. Pass the following arguments
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
:return: A list of TokenClassResults
|
|
568
|
+
Launch inference on `LayoutLm` for token classification. Pass the following arguments
|
|
569
|
+
|
|
570
|
+
Args:
|
|
571
|
+
encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
|
|
572
|
+
attention_mask: The associated attention masks from padded sequences taken from
|
|
573
|
+
`LayoutLMTokenizer`
|
|
574
|
+
token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
|
|
575
|
+
boxes: Torch tensor of bounding boxes of type `xyxy`
|
|
576
|
+
tokens: List of original tokens taken from `LayoutLMTokenizer`
|
|
577
|
+
|
|
578
|
+
Returns:
|
|
579
|
+
A list of `TokenClassResult`s
|
|
560
580
|
"""
|
|
561
581
|
|
|
562
582
|
ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
|
|
@@ -799,8 +819,9 @@ class HFLayoutLmSequenceClassifierBase(LMSequenceClassifier, ABC):
|
|
|
799
819
|
def get_tokenizer_class_name(model_class_name: str, use_xlm_tokenizer: bool) -> str:
|
|
800
820
|
"""A refinement for adding the tokenizer class name to the model configs.
|
|
801
821
|
|
|
802
|
-
:
|
|
803
|
-
|
|
822
|
+
Args:
|
|
823
|
+
model_class_name: The model name, e.g. `model.__class__.__name__`
|
|
824
|
+
use_xlm_tokenizer: Whether to use a `XLM` tokenizer.
|
|
804
825
|
"""
|
|
805
826
|
tokenizer = get_tokenizer_from_model_class(model_class_name, use_xlm_tokenizer)
|
|
806
827
|
return tokenizer.__class__.__name__
|
|
@@ -823,28 +844,29 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
823
844
|
Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
|
|
824
845
|
classification and other things please use another model of the family.
|
|
825
846
|
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
847
|
+
Example:
|
|
848
|
+
```python
|
|
849
|
+
# setting up compulsory ocr service
|
|
850
|
+
tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
|
|
851
|
+
tess = TesseractOcrDetector(tesseract_config_path)
|
|
852
|
+
ocr_service = TextExtractionService(tess)
|
|
832
853
|
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
854
|
+
# hf tokenizer and sequence classifier
|
|
855
|
+
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
|
|
856
|
+
layoutlm = HFLayoutLmSequenceClassifier("path/to/config.json","path/to/model.bin",
|
|
857
|
+
categories=["handwritten", "presentation", "resume"])
|
|
837
858
|
|
|
838
|
-
|
|
839
|
-
|
|
859
|
+
# sequence classification service
|
|
860
|
+
layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
|
|
840
861
|
|
|
841
|
-
|
|
862
|
+
pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
|
|
842
863
|
|
|
843
|
-
|
|
844
|
-
|
|
864
|
+
path = "path/to/some/form"
|
|
865
|
+
df = pipe.analyze(path=path)
|
|
845
866
|
|
|
846
|
-
|
|
847
|
-
|
|
867
|
+
for dp in df:
|
|
868
|
+
...
|
|
869
|
+
```
|
|
848
870
|
"""
|
|
849
871
|
|
|
850
872
|
def __init__(
|
|
@@ -855,6 +877,16 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
855
877
|
device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
|
|
856
878
|
use_xlm_tokenizer: bool = False,
|
|
857
879
|
):
|
|
880
|
+
"""
|
|
881
|
+
Args:
|
|
882
|
+
path_config_json: path to `.json` config file
|
|
883
|
+
path_weights: path to model artifact
|
|
884
|
+
categories: A dict with key (indices) and values (category names) for sequence classification.
|
|
885
|
+
To be consistent with detectors use only values `>0`. Conversion will be done internally.
|
|
886
|
+
device: The device ("cpu","cuda"), where to place the model.
|
|
887
|
+
use_xlm_tokenizer: Do not change this value unless you pre-trained a `LayoutLM` model with a different
|
|
888
|
+
Tokenizer.
|
|
889
|
+
"""
|
|
858
890
|
super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
|
|
859
891
|
self.name = self.get_name(path_weights, "LayoutLM")
|
|
860
892
|
self.model_id = self.get_model_id()
|
|
@@ -865,6 +897,16 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
865
897
|
)
|
|
866
898
|
|
|
867
899
|
def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
|
|
900
|
+
"""
|
|
901
|
+
Launch inference on LayoutLm for sequence classification. Pass the following arguments
|
|
902
|
+
|
|
903
|
+
Args:
|
|
904
|
+
encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
|
|
905
|
+
attention_mask: The associated attention masks from padded sequences taken from
|
|
906
|
+
`LayoutLMTokenizer`
|
|
907
|
+
token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
|
|
908
|
+
boxes: Torch tensor of bounding boxes of type `xyxy`
|
|
909
|
+
"""
|
|
868
910
|
input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
|
|
869
911
|
|
|
870
912
|
result = predict_sequence_classes(
|
|
@@ -886,9 +928,12 @@ class HFLayoutLmSequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
886
928
|
"""
|
|
887
929
|
Get the inner (wrapped) model.
|
|
888
930
|
|
|
889
|
-
:
|
|
890
|
-
|
|
891
|
-
|
|
931
|
+
Args:
|
|
932
|
+
path_config_json: path to `.json` config file
|
|
933
|
+
path_weights: path to model artifact
|
|
934
|
+
|
|
935
|
+
Returns:
|
|
936
|
+
`nn.Module`
|
|
892
937
|
"""
|
|
893
938
|
config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
|
|
894
939
|
return LayoutLMForSequenceClassification.from_pretrained(
|
|
@@ -906,28 +951,29 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
906
951
|
itself. Note that this model is equipped with a head that is only useful for classifying the input sequence. For
|
|
907
952
|
token classification and other things please use another model of the family.
|
|
908
953
|
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
954
|
+
Example:
|
|
955
|
+
```python
|
|
956
|
+
# setting up compulsory ocr service
|
|
957
|
+
tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
|
|
958
|
+
tess = TesseractOcrDetector(tesseract_config_path)
|
|
959
|
+
ocr_service = TextExtractionService(tess)
|
|
915
960
|
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
961
|
+
# hf tokenizer and sequence classifier
|
|
962
|
+
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
|
|
963
|
+
layoutlm = HFLayoutLmv2SequenceClassifier("path/to/config.json","path/to/model.bin",
|
|
964
|
+
categories=["handwritten", "presentation", "resume"])
|
|
920
965
|
|
|
921
|
-
|
|
922
|
-
|
|
966
|
+
# sequence classification service
|
|
967
|
+
layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
|
|
923
968
|
|
|
924
|
-
|
|
969
|
+
pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
|
|
925
970
|
|
|
926
|
-
|
|
927
|
-
|
|
971
|
+
path = "path/to/some/form"
|
|
972
|
+
df = pipe.analyze(path=path)
|
|
928
973
|
|
|
929
|
-
|
|
930
|
-
|
|
974
|
+
for dp in df:
|
|
975
|
+
...
|
|
976
|
+
```
|
|
931
977
|
"""
|
|
932
978
|
|
|
933
979
|
def __init__(
|
|
@@ -938,6 +984,16 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
938
984
|
device: Optional[Union[Literal["cpu", "cuda"], torch.device]] = None,
|
|
939
985
|
use_xlm_tokenizer: bool = False,
|
|
940
986
|
):
|
|
987
|
+
"""
|
|
988
|
+
Args:
|
|
989
|
+
path_config_json: path to `.json` config file
|
|
990
|
+
path_weights: path to model artifact
|
|
991
|
+
categories: A dict with key (indices) and values (category names) for sequence classification.
|
|
992
|
+
To be consistent with detectors use only values `>0`. Conversion will be done internally.
|
|
993
|
+
device: The device ("cpu","cuda"), where to place the model.
|
|
994
|
+
use_xlm_tokenizer: Do not change this value unless you pre-trained a `LayoutLM` model with a different
|
|
995
|
+
Tokenizer.
|
|
996
|
+
"""
|
|
941
997
|
super().__init__(path_config_json, path_weights, categories, device, use_xlm_tokenizer)
|
|
942
998
|
self.name = self.get_name(path_weights, "LayoutLMv2")
|
|
943
999
|
self.model_id = self.get_model_id()
|
|
@@ -948,6 +1004,16 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
948
1004
|
)
|
|
949
1005
|
|
|
950
1006
|
def predict(self, **encodings: Union[list[list[str]], torch.Tensor]) -> SequenceClassResult:
|
|
1007
|
+
"""
|
|
1008
|
+
Launch inference on LayoutLm for sequence classification. Pass the following arguments
|
|
1009
|
+
|
|
1010
|
+
Args:
|
|
1011
|
+
encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
|
|
1012
|
+
attention_mask: The associated attention masks from padded sequences taken from
|
|
1013
|
+
`LayoutLMTokenizer`
|
|
1014
|
+
token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
|
|
1015
|
+
boxes: Torch tensor of bounding boxes of type `xyxy`
|
|
1016
|
+
"""
|
|
951
1017
|
input_ids, attention_mask, token_type_ids, boxes = self._validate_encodings(**encodings)
|
|
952
1018
|
images = encodings.get("image")
|
|
953
1019
|
if isinstance(images, torch.Tensor):
|
|
@@ -976,9 +1042,12 @@ class HFLayoutLmv2SequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
976
1042
|
"""
|
|
977
1043
|
Get the inner (wrapped) model.
|
|
978
1044
|
|
|
979
|
-
:
|
|
980
|
-
|
|
981
|
-
|
|
1045
|
+
Args:
|
|
1046
|
+
path_config_json: path to `.json` config file
|
|
1047
|
+
path_weights: path to model artifact
|
|
1048
|
+
|
|
1049
|
+
Returns:
|
|
1050
|
+
`nn.Module`
|
|
982
1051
|
"""
|
|
983
1052
|
config = LayoutLMv2Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
|
|
984
1053
|
return LayoutLMv2ForSequenceClassification.from_pretrained(
|
|
@@ -996,28 +1065,29 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
996
1065
|
itself. Note that this model is equipped with a head that is only useful for classifying the input sequence. For
|
|
997
1066
|
token classification and other things please use another model of the family.
|
|
998
1067
|
|
|
999
|
-
|
|
1068
|
+
Example:
|
|
1069
|
+
```python
|
|
1070
|
+
# setting up compulsory ocr service
|
|
1071
|
+
tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
|
|
1072
|
+
tess = TesseractOcrDetector(tesseract_config_path)
|
|
1073
|
+
ocr_service = TextExtractionService(tess)
|
|
1000
1074
|
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1075
|
+
# hf tokenizer and sequence classifier
|
|
1076
|
+
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
|
1077
|
+
layoutlm = HFLayoutLmv3SequenceClassifier("path/to/config.json","path/to/model.bin",
|
|
1078
|
+
categories=["handwritten", "presentation", "resume"])
|
|
1005
1079
|
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
layoutlm = HFLayoutLmv3SequenceClassifier("path/to/config.json","path/to/model.bin",
|
|
1009
|
-
categories=["handwritten", "presentation", "resume"])
|
|
1080
|
+
# sequence classification service
|
|
1081
|
+
layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
|
|
1010
1082
|
|
|
1011
|
-
|
|
1012
|
-
layoutlm_service = LMSequenceClassifierService(tokenizer,layoutlm)
|
|
1083
|
+
pipe = DoctectionPipe(pipeline_component_list=[ocr_service,layoutlm_service])
|
|
1013
1084
|
|
|
1014
|
-
|
|
1085
|
+
path = "path/to/some/form"
|
|
1086
|
+
df = pipe.analyze(path=path)
|
|
1015
1087
|
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
for dp in df:
|
|
1020
|
-
...
|
|
1088
|
+
for dp in df:
|
|
1089
|
+
...
|
|
1090
|
+
```
|
|
1021
1091
|
"""
|
|
1022
1092
|
|
|
1023
1093
|
def __init__(
|
|
@@ -1072,9 +1142,12 @@ class HFLayoutLmv3SequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
1072
1142
|
"""
|
|
1073
1143
|
Get the inner (wrapped) model.
|
|
1074
1144
|
|
|
1075
|
-
:
|
|
1076
|
-
|
|
1077
|
-
|
|
1145
|
+
Args:
|
|
1146
|
+
path_config_json: path to `.json` config file
|
|
1147
|
+
path_weights: path to model artifact
|
|
1148
|
+
|
|
1149
|
+
Returns:
|
|
1150
|
+
`nn.Module`
|
|
1078
1151
|
"""
|
|
1079
1152
|
config = LayoutLMv3Config.from_pretrained(pretrained_model_name_or_path=os.fspath(path_config_json))
|
|
1080
1153
|
return LayoutLMv3ForSequenceClassification.from_pretrained(
|
|
@@ -1092,31 +1165,32 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
1092
1165
|
Note that this model is equipped with a head that is only useful when classifying tokens. For sequence
|
|
1093
1166
|
classification and other things please use another model of the family.
|
|
1094
1167
|
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1168
|
+
Example:
|
|
1169
|
+
```python
|
|
1170
|
+
# setting up compulsory ocr service
|
|
1171
|
+
tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
|
|
1172
|
+
tess = TesseractOcrDetector(tesseract_config_path)
|
|
1173
|
+
ocr_service = TextExtractionService(tess)
|
|
1101
1174
|
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1175
|
+
# hf tokenizer and token classifier
|
|
1176
|
+
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
|
|
1177
|
+
lilt = HFLiltTokenClassifier("path/to/config.json","path/to/model.bin",
|
|
1178
|
+
categories= ['B-answer', 'B-header', 'B-question', 'E-answer',
|
|
1179
|
+
'E-header', 'E-question', 'I-answer', 'I-header',
|
|
1180
|
+
'I-question', 'O', 'S-answer', 'S-header',
|
|
1181
|
+
'S-question'])
|
|
1109
1182
|
|
|
1110
|
-
|
|
1111
|
-
|
|
1183
|
+
# token classification service
|
|
1184
|
+
lilt_service = LMTokenClassifierService(tokenizer,lilt)
|
|
1112
1185
|
|
|
1113
|
-
|
|
1186
|
+
pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
|
|
1114
1187
|
|
|
1115
|
-
|
|
1116
|
-
|
|
1188
|
+
path = "path/to/some/form"
|
|
1189
|
+
df = pipe.analyze(path=path)
|
|
1117
1190
|
|
|
1118
|
-
|
|
1119
|
-
|
|
1191
|
+
for dp in df:
|
|
1192
|
+
...
|
|
1193
|
+
```
|
|
1120
1194
|
"""
|
|
1121
1195
|
|
|
1122
1196
|
def __init__(
|
|
@@ -1130,15 +1204,16 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
1130
1204
|
use_xlm_tokenizer: bool = False,
|
|
1131
1205
|
):
|
|
1132
1206
|
"""
|
|
1133
|
-
:
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1207
|
+
Args:
|
|
1208
|
+
path_config_json: path to `.json` config file
|
|
1209
|
+
path_weights: path to model artifact
|
|
1210
|
+
categories_semantics: A dict with key (indices) and values (category names) for `NER` semantics, i.e. the
|
|
1211
|
+
entities themselves. To be consistent with detectors use only values `>0`. Conversion will
|
|
1212
|
+
be done internally.
|
|
1213
|
+
categories_bio: A dict with key (indices) and values (category names) for NER tags (i.e. `BIO`). To be
|
|
1214
|
+
consistent with detectors use only values `>0`. Conversion will be done internally.
|
|
1215
|
+
categories: If you have a pre-trained model you can pass a complete dict of `NER` categories
|
|
1216
|
+
device: The device ("cpu","cuda"), where to place the model.
|
|
1142
1217
|
"""
|
|
1143
1218
|
|
|
1144
1219
|
super().__init__(
|
|
@@ -1156,17 +1231,16 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
1156
1231
|
"""
|
|
1157
1232
|
Launch inference on LayoutLm for token classification. Pass the following arguments
|
|
1158
1233
|
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1234
|
+
Args:
|
|
1235
|
+
encodings: input_ids: Token converted to ids to be taken from `LayoutLMTokenizer`
|
|
1236
|
+
attention_mask: The associated attention masks from padded sequences taken from
|
|
1237
|
+
`LayoutLMTokenizer`
|
|
1238
|
+
token_type_ids: Torch tensor of token type ids taken from `LayoutLMTokenizer`
|
|
1239
|
+
boxes: Torch tensor of bounding boxes of type `xyxy`
|
|
1240
|
+
tokens: List of original tokens taken from `LayoutLMTokenizer`
|
|
1162
1241
|
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
`boxes:` Torch tensor of bounding boxes of type 'xyxy'
|
|
1166
|
-
|
|
1167
|
-
`tokens:` List of original tokens taken from `LayoutLMTokenizer`
|
|
1168
|
-
|
|
1169
|
-
:return: A list of TokenClassResults
|
|
1242
|
+
Returns:
|
|
1243
|
+
A list of `TokenClassResult`s
|
|
1170
1244
|
"""
|
|
1171
1245
|
|
|
1172
1246
|
ann_ids, _, input_ids, attention_mask, token_type_ids, boxes, tokens = self._validate_encodings(**encodings)
|
|
@@ -1182,9 +1256,12 @@ class HFLiltTokenClassifier(HFLayoutLmTokenClassifierBase):
|
|
|
1182
1256
|
"""
|
|
1183
1257
|
Get the inner (wrapped) model.
|
|
1184
1258
|
|
|
1185
|
-
:
|
|
1186
|
-
|
|
1187
|
-
|
|
1259
|
+
Args:
|
|
1260
|
+
path_config_json: path to `.json` config file
|
|
1261
|
+
path_weights: path to model artifact
|
|
1262
|
+
|
|
1263
|
+
Returns:
|
|
1264
|
+
`nn.Module`
|
|
1188
1265
|
"""
|
|
1189
1266
|
config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
|
|
1190
1267
|
return LiltForTokenClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
|
|
@@ -1200,29 +1277,30 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
1200
1277
|
Note that this model is equipped with a head that is only useful for classifying the input sequence. For token
|
|
1201
1278
|
classification and other things please use another model of the family.
|
|
1202
1279
|
|
|
1203
|
-
|
|
1280
|
+
Example:
|
|
1281
|
+
```python
|
|
1282
|
+
# setting up compulsory ocr service
|
|
1283
|
+
tesseract_config_path = ModelCatalog.get_full_path_configs("/dd/conf_tesseract.yaml")
|
|
1284
|
+
tess = TesseractOcrDetector(tesseract_config_path)
|
|
1285
|
+
ocr_service = TextExtractionService(tess)
|
|
1204
1286
|
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1287
|
+
# hf tokenizer and sequence classifier
|
|
1288
|
+
tokenizer = LayoutLMTokenizerFast.from_pretrained("microsoft/layoutlm-base-uncased")
|
|
1289
|
+
lilt = HFLiltSequenceClassifier("path/to/config.json",
|
|
1290
|
+
"path/to/model.bin",
|
|
1291
|
+
categories=["handwritten", "presentation", "resume"])
|
|
1209
1292
|
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
lilt = HFLiltSequenceClassifier("path/to/config.json",
|
|
1213
|
-
"path/to/model.bin",
|
|
1214
|
-
categories=["handwritten", "presentation", "resume"])
|
|
1293
|
+
# sequence classification service
|
|
1294
|
+
lilt_service = LMSequenceClassifierService(tokenizer,lilt)
|
|
1215
1295
|
|
|
1216
|
-
|
|
1217
|
-
lilt_service = LMSequenceClassifierService(tokenizer,lilt)
|
|
1296
|
+
pipe = DoctectionPipe(pipeline_component_list=[ocr_service,lilt_service])
|
|
1218
1297
|
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
path = "path/to/some/form"
|
|
1222
|
-
df = pipe.analyze(path=path)
|
|
1298
|
+
path = "path/to/some/form"
|
|
1299
|
+
df = pipe.analyze(path=path)
|
|
1223
1300
|
|
|
1224
|
-
|
|
1225
|
-
|
|
1301
|
+
for dp in df:
|
|
1302
|
+
...
|
|
1303
|
+
```
|
|
1226
1304
|
"""
|
|
1227
1305
|
|
|
1228
1306
|
def __init__(
|
|
@@ -1262,9 +1340,12 @@ class HFLiltSequenceClassifier(HFLayoutLmSequenceClassifierBase):
|
|
|
1262
1340
|
"""
|
|
1263
1341
|
Get the inner (wrapped) model.
|
|
1264
1342
|
|
|
1265
|
-
:
|
|
1266
|
-
|
|
1267
|
-
|
|
1343
|
+
Args:
|
|
1344
|
+
path_config_json: path to `.json` config file
|
|
1345
|
+
path_weights: path to model artifact
|
|
1346
|
+
|
|
1347
|
+
Returns:
|
|
1348
|
+
`nn.Module`
|
|
1268
1349
|
"""
|
|
1269
1350
|
config = PretrainedConfig.from_pretrained(pretrained_model_name_or_path=path_config_json)
|
|
1270
1351
|
return LiltForSequenceClassification.from_pretrained(pretrained_model_name_or_path=path_weights, config=config)
|