deepdoctection 0.31__tar.gz → 0.33__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.31 → deepdoctection-0.33}/PKG-INFO +30 -21
- {deepdoctection-0.31 → deepdoctection-0.33}/README.md +14 -7
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/__init__.py +16 -29
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/analyzer/dd.py +70 -59
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/configs/conf_dd_one.yaml +34 -31
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/common.py +9 -5
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/custom.py +5 -5
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/custom_serialize.py +75 -18
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/parallel_map.py +3 -3
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/serialize.py +4 -4
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/stats.py +3 -3
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/annotation.py +41 -56
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/box.py +9 -8
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/convert.py +6 -6
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/image.py +56 -44
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/view.py +245 -150
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/__init__.py +1 -4
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/adapter.py +35 -26
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/base.py +14 -12
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/dataflow_builder.py +3 -3
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/info.py +24 -26
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/doclaynet.py +51 -51
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/fintabnet.py +46 -46
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/funsd.py +25 -24
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/iiitar13k.py +13 -10
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/layouttest.py +4 -3
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/publaynet.py +5 -5
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtables1m.py +24 -21
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/pubtabnet.py +32 -30
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/rvlcdip.py +30 -30
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/xfund.py +26 -26
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/save.py +6 -6
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/__init__.py +1 -4
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/accmetric.py +32 -33
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/base.py +8 -9
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/cocometric.py +15 -13
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/eval.py +41 -37
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/tedsmetric.py +30 -23
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/tp_eval_callback.py +16 -19
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/__init__.py +2 -7
- deepdoctection-0.33/deepdoctection/extern/base.py +644 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/d2detect.py +85 -113
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/deskew.py +14 -11
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/doctrocr.py +141 -130
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/fastlang.py +27 -18
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/hfdetr.py +71 -62
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/hflayoutlm.py +504 -211
- deepdoctection-0.33/deepdoctection/extern/hflm.py +230 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/model.py +488 -302
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/pdftext.py +23 -19
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/pt/__init__.py +1 -3
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection-0.33/deepdoctection/extern/pt/ptutils.py +59 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tessocr.py +39 -38
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/texocr.py +18 -18
- deepdoctection-0.33/deepdoctection/extern/tp/tfutils.py +105 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpcompat.py +21 -14
- deepdoctection-0.33/deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection-0.33/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/config/config.py +13 -10
- deepdoctection-0.33/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +18 -8
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +14 -9
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +22 -17
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +21 -14
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +19 -11
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/preproc.py +12 -8
- {deepdoctection-0.31/tests/datapoint → deepdoctection-0.33/deepdoctection/extern/tp/tpfrcnn/utils}/__init__.py +4 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tpdetect.py +45 -53
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/__init__.py +3 -8
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/cats.py +27 -29
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/cocostruct.py +10 -10
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/d2struct.py +27 -26
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/hfstruct.py +13 -8
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/laylmstruct.py +178 -37
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/maputils.py +12 -11
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/match.py +2 -2
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/misc.py +11 -9
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/pascalstruct.py +4 -4
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/prodigystruct.py +5 -5
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/pubstruct.py +84 -92
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/tpstruct.py +5 -5
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/mapper/xfundstruct.py +33 -33
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/__init__.py +1 -1
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/anngen.py +12 -14
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/base.py +52 -106
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/common.py +72 -59
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/concurrency.py +16 -11
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/doctectionpipe.py +24 -21
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/language.py +20 -25
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/layout.py +20 -16
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/lm.py +75 -105
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/order.py +194 -89
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/refine.py +111 -124
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/segment.py +156 -161
- deepdoctection-0.31/deepdoctection/pipe/cell.py → deepdoctection-0.33/deepdoctection/pipe/sub_layout.py +50 -40
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/text.py +37 -36
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/transform.py +19 -16
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/__init__.py +6 -12
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/d2_frcnn_train.py +48 -41
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/hf_detr_train.py +41 -30
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/hf_layoutlm_train.py +153 -135
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/train/tp_frcnn_train.py +32 -31
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/concurrency.py +1 -1
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/context.py +13 -6
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/develop.py +4 -4
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/env_info.py +87 -125
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/file_utils.py +6 -11
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/fs.py +22 -18
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/identifier.py +2 -2
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/logger.py +16 -15
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/metacfg.py +7 -7
- deepdoctection-0.33/deepdoctection/utils/mocks.py +93 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/pdf_utils.py +11 -11
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/settings.py +185 -181
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/tqdm.py +1 -1
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/transform.py +14 -9
- deepdoctection-0.33/deepdoctection/utils/types.py +104 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/utils.py +7 -7
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/viz.py +74 -72
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/PKG-INFO +30 -21
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/SOURCES.txt +5 -91
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/requires.txt +14 -11
- {deepdoctection-0.31 → deepdoctection-0.33}/setup.cfg +10 -1
- {deepdoctection-0.31 → deepdoctection-0.33}/setup.py +10 -9
- {deepdoctection-0.31 → deepdoctection-0.33}/tests/test_utils.py +8 -0
- deepdoctection-0.31/deepdoctection/extern/base.py +0 -439
- deepdoctection-0.31/deepdoctection/extern/pt/ptutils.py +0 -49
- deepdoctection-0.31/deepdoctection/extern/tp/tfutils.py +0 -57
- deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- deepdoctection-0.31/deepdoctection/utils/detection_types.py +0 -68
- deepdoctection-0.31/tests/__init__.py +0 -22
- deepdoctection-0.31/tests/analyzer/__init__.py +0 -16
- deepdoctection-0.31/tests/analyzer/test_dd.py +0 -202
- deepdoctection-0.31/tests/conftest.py +0 -498
- deepdoctection-0.31/tests/data.py +0 -1632
- deepdoctection-0.31/tests/dataflow/__init__.py +0 -16
- deepdoctection-0.31/tests/dataflow/conftest.py +0 -95
- deepdoctection-0.31/tests/dataflow/test_common.py +0 -219
- deepdoctection-0.31/tests/dataflow/test_custom.py +0 -60
- deepdoctection-0.31/tests/dataflow/test_custom_serialize.py +0 -177
- deepdoctection-0.31/tests/dataflow/test_parallel_map.py +0 -66
- deepdoctection-0.31/tests/dataflow/test_stats.py +0 -103
- deepdoctection-0.31/tests/datapoint/conftest.py +0 -262
- deepdoctection-0.31/tests/datapoint/test_annotation.py +0 -170
- deepdoctection-0.31/tests/datapoint/test_box.py +0 -416
- deepdoctection-0.31/tests/datapoint/test_convert.py +0 -52
- deepdoctection-0.31/tests/datapoint/test_image.py +0 -387
- deepdoctection-0.31/tests/datapoint/test_view.py +0 -150
- deepdoctection-0.31/tests/datasets/__init__.py +0 -16
- deepdoctection-0.31/tests/datasets/instances/__init__.py +0 -16
- deepdoctection-0.31/tests/datasets/instances/conftest.py +0 -35
- deepdoctection-0.31/tests/datasets/instances/test_doclaynet.py +0 -43
- deepdoctection-0.31/tests/datasets/instances/test_fintabnet.py +0 -70
- deepdoctection-0.31/tests/datasets/instances/test_funsd.py +0 -58
- deepdoctection-0.31/tests/datasets/instances/test_iiitar13k.py +0 -42
- deepdoctection-0.31/tests/datasets/instances/test_layouttest.py +0 -63
- deepdoctection-0.31/tests/datasets/instances/test_publaynet.py +0 -64
- deepdoctection-0.31/tests/datasets/instances/test_pubtables1m.py +0 -66
- deepdoctection-0.31/tests/datasets/instances/test_pubtabnet.py +0 -65
- deepdoctection-0.31/tests/datasets/instances/test_rvlcdip.py +0 -46
- deepdoctection-0.31/tests/datasets/test_adapter.py +0 -77
- deepdoctection-0.31/tests/datasets/test_info.py +0 -273
- deepdoctection-0.31/tests/datasets/test_registry.py +0 -75
- deepdoctection-0.31/tests/eval/__init__.py +0 -16
- deepdoctection-0.31/tests/eval/conftest.py +0 -107
- deepdoctection-0.31/tests/eval/test_accmetric.py +0 -364
- deepdoctection-0.31/tests/eval/test_cocometric.py +0 -123
- deepdoctection-0.31/tests/eval/test_eval.py +0 -86
- deepdoctection-0.31/tests/eval/test_registry.py +0 -84
- deepdoctection-0.31/tests/eval/test_tedsmetric.py +0 -40
- deepdoctection-0.31/tests/extern/__init__.py +0 -0
- deepdoctection-0.31/tests/extern/conftest.py +0 -108
- deepdoctection-0.31/tests/extern/data.py +0 -102
- deepdoctection-0.31/tests/extern/test_deskew.py +0 -67
- deepdoctection-0.31/tests/extern/test_doctrocr.py +0 -190
- deepdoctection-0.31/tests/extern/test_fastlang.py +0 -64
- deepdoctection-0.31/tests/extern/test_hfdetr.py +0 -116
- deepdoctection-0.31/tests/extern/test_hflayoutlm.py +0 -492
- deepdoctection-0.31/tests/extern/test_pdftext.py +0 -70
- deepdoctection-0.31/tests/extern/test_tessocr.py +0 -164
- deepdoctection-0.31/tests/extern/test_texocr.py +0 -52
- deepdoctection-0.31/tests/extern/test_tpdetect.py +0 -123
- deepdoctection-0.31/tests/mapper/__init__.py +0 -16
- deepdoctection-0.31/tests/mapper/conftest.py +0 -297
- deepdoctection-0.31/tests/mapper/data.py +0 -2182
- deepdoctection-0.31/tests/mapper/test_cats.py +0 -305
- deepdoctection-0.31/tests/mapper/test_cocostruct.py +0 -91
- deepdoctection-0.31/tests/mapper/test_d2struct.py +0 -56
- deepdoctection-0.31/tests/mapper/test_hfstruct.py +0 -59
- deepdoctection-0.31/tests/mapper/test_iiitar13k.py +0 -64
- deepdoctection-0.31/tests/mapper/test_laylmstruct.py +0 -141
- deepdoctection-0.31/tests/mapper/test_misc.py +0 -72
- deepdoctection-0.31/tests/mapper/test_prodigystruct.py +0 -78
- deepdoctection-0.31/tests/mapper/test_pubstruct.py +0 -170
- deepdoctection-0.31/tests/mapper/test_tpstruct.py +0 -51
- deepdoctection-0.31/tests/mapper/test_utils.py +0 -83
- deepdoctection-0.31/tests/mapper/test_xfundstruct.py +0 -68
- deepdoctection-0.31/tests/pipe/__init__.py +0 -16
- deepdoctection-0.31/tests/pipe/test_anngen.py +0 -179
- deepdoctection-0.31/tests/pipe/test_cell.py +0 -144
- deepdoctection-0.31/tests/pipe/test_common.py +0 -107
- deepdoctection-0.31/tests/pipe/test_language.py +0 -76
- deepdoctection-0.31/tests/pipe/test_layout.py +0 -66
- deepdoctection-0.31/tests/pipe/test_lm.py +0 -119
- deepdoctection-0.31/tests/pipe/test_order.py +0 -197
- deepdoctection-0.31/tests/pipe/test_refine.py +0 -325
- deepdoctection-0.31/tests/pipe/test_registry.py +0 -58
- deepdoctection-0.31/tests/pipe/test_segment.py +0 -392
- deepdoctection-0.31/tests/pipe/test_text.py +0 -208
- deepdoctection-0.31/tests/pipe/test_transform.py +0 -65
- deepdoctection-0.31/tests/train/__init__.py +0 -16
- deepdoctection-0.31/tests/train/conftest.py +0 -118
- deepdoctection-0.31/tests/train/test_d2_frcnn_train.py +0 -64
- deepdoctection-0.31/tests/train/test_tp_frcnn_train.py +0 -99
- deepdoctection-0.31/tests_d2/__init__.py +0 -20
- deepdoctection-0.31/tests_d2/conftest.py +0 -56
- deepdoctection-0.31/tests_d2/test_d2detect.py +0 -95
- {deepdoctection-0.31 → deepdoctection-0.33}/LICENSE +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.31 → deepdoctection-0.33}/deepdoctection.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.31
|
|
3
|
+
Version: 0.33
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -9,21 +9,21 @@ Classifier: Development Status :: 4 - Beta
|
|
|
9
9
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
10
|
Classifier: Natural Language :: English
|
|
11
11
|
Classifier: Operating System :: POSIX :: Linux
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
13
12
|
Classifier: Programming Language :: Python :: 3.9
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.10
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
15
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
-
Requires-Python: >=3.8
|
|
16
|
+
Requires-Python: >=3.9
|
|
18
17
|
Description-Content-Type: text/markdown
|
|
19
18
|
License-File: LICENSE
|
|
20
19
|
Requires-Dist: catalogue==2.0.10
|
|
21
20
|
Requires-Dist: huggingface_hub>=0.12.0
|
|
22
21
|
Requires-Dist: importlib-metadata>=5.0.0
|
|
23
22
|
Requires-Dist: jsonlines==3.1.0
|
|
23
|
+
Requires-Dist: lazy-imports==0.3.1
|
|
24
24
|
Requires-Dist: mock==4.0.3
|
|
25
25
|
Requires-Dist: networkx>=2.7.1
|
|
26
|
-
Requires-Dist: numpy
|
|
26
|
+
Requires-Dist: numpy<2.0,>=1.21
|
|
27
27
|
Requires-Dist: packaging>=20.0
|
|
28
28
|
Requires-Dist: Pillow>=10.0.0
|
|
29
29
|
Requires-Dist: pypdf>=3.16.0
|
|
@@ -37,9 +37,10 @@ Requires-Dist: catalogue==2.0.10; extra == "tf"
|
|
|
37
37
|
Requires-Dist: huggingface_hub>=0.12.0; extra == "tf"
|
|
38
38
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
|
|
39
39
|
Requires-Dist: jsonlines==3.1.0; extra == "tf"
|
|
40
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "tf"
|
|
40
41
|
Requires-Dist: mock==4.0.3; extra == "tf"
|
|
41
42
|
Requires-Dist: networkx>=2.7.1; extra == "tf"
|
|
42
|
-
Requires-Dist: numpy
|
|
43
|
+
Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
|
|
43
44
|
Requires-Dist: packaging>=20.0; extra == "tf"
|
|
44
45
|
Requires-Dist: Pillow>=10.0.0; extra == "tf"
|
|
45
46
|
Requires-Dist: pypdf>=3.16.0; extra == "tf"
|
|
@@ -52,10 +53,10 @@ Requires-Dist: tensorpack==0.11; extra == "tf"
|
|
|
52
53
|
Requires-Dist: protobuf==3.20.1; extra == "tf"
|
|
53
54
|
Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
|
|
54
55
|
Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
|
|
55
|
-
Requires-Dist: python-doctr==0.
|
|
56
|
+
Requires-Dist: python-doctr==0.8.1; extra == "tf"
|
|
56
57
|
Requires-Dist: pycocotools>=2.0.2; extra == "tf"
|
|
57
|
-
Requires-Dist: boto3; extra == "tf"
|
|
58
|
-
Requires-Dist: pdfplumber>=0.
|
|
58
|
+
Requires-Dist: boto3==1.34.102; extra == "tf"
|
|
59
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
|
|
59
60
|
Requires-Dist: fasttext==0.9.2; extra == "tf"
|
|
60
61
|
Requires-Dist: jdeskew>=0.2.2; extra == "tf"
|
|
61
62
|
Requires-Dist: apted==1.0.3; extra == "tf"
|
|
@@ -66,9 +67,10 @@ Requires-Dist: catalogue==2.0.10; extra == "pt"
|
|
|
66
67
|
Requires-Dist: huggingface_hub>=0.12.0; extra == "pt"
|
|
67
68
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
|
|
68
69
|
Requires-Dist: jsonlines==3.1.0; extra == "pt"
|
|
70
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "pt"
|
|
69
71
|
Requires-Dist: mock==4.0.3; extra == "pt"
|
|
70
72
|
Requires-Dist: networkx>=2.7.1; extra == "pt"
|
|
71
|
-
Requires-Dist: numpy
|
|
73
|
+
Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
|
|
72
74
|
Requires-Dist: packaging>=20.0; extra == "pt"
|
|
73
75
|
Requires-Dist: Pillow>=10.0.0; extra == "pt"
|
|
74
76
|
Requires-Dist: pypdf>=3.16.0; extra == "pt"
|
|
@@ -80,9 +82,9 @@ Requires-Dist: tqdm==4.64.0; extra == "pt"
|
|
|
80
82
|
Requires-Dist: timm>=0.9.16; extra == "pt"
|
|
81
83
|
Requires-Dist: transformers>=4.36.0; extra == "pt"
|
|
82
84
|
Requires-Dist: accelerate>=0.29.1; extra == "pt"
|
|
83
|
-
Requires-Dist: python-doctr==0.
|
|
84
|
-
Requires-Dist: boto3; extra == "pt"
|
|
85
|
-
Requires-Dist: pdfplumber>=0.
|
|
85
|
+
Requires-Dist: python-doctr==0.8.1; extra == "pt"
|
|
86
|
+
Requires-Dist: boto3==1.34.102; extra == "pt"
|
|
87
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
|
|
86
88
|
Requires-Dist: fasttext==0.9.2; extra == "pt"
|
|
87
89
|
Requires-Dist: jdeskew>=0.2.2; extra == "pt"
|
|
88
90
|
Requires-Dist: apted==1.0.3; extra == "pt"
|
|
@@ -90,10 +92,10 @@ Requires-Dist: distance==0.1.3; extra == "pt"
|
|
|
90
92
|
Requires-Dist: lxml>=4.9.1; extra == "pt"
|
|
91
93
|
Provides-Extra: docs
|
|
92
94
|
Requires-Dist: tensorpack==0.11; extra == "docs"
|
|
93
|
-
Requires-Dist: boto3; extra == "docs"
|
|
95
|
+
Requires-Dist: boto3==1.34.102; extra == "docs"
|
|
94
96
|
Requires-Dist: transformers>=4.36.0; extra == "docs"
|
|
95
97
|
Requires-Dist: accelerate>=0.29.1; extra == "docs"
|
|
96
|
-
Requires-Dist: pdfplumber>=0.
|
|
98
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
|
|
97
99
|
Requires-Dist: lxml>=4.9.1; extra == "docs"
|
|
98
100
|
Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
|
|
99
101
|
Requires-Dist: jdeskew>=0.2.2; extra == "docs"
|
|
@@ -153,7 +155,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
153
155
|
- Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
|
|
154
156
|
- Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
|
|
155
157
|
- Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
|
|
156
|
-
- Document and token classification with all LayoutLM models provided by the
|
|
158
|
+
- Document and token classification with all LayoutLM models provided by the
|
|
159
|
+
[**Transformer library**](https://github.com/huggingface/transformers).
|
|
157
160
|
(Yes, you can use any LayoutLM-model with any of the provided OCR-or pdfplumber tools straight away!).
|
|
158
161
|
- Table detection and table structure recognition with
|
|
159
162
|
[**table-transformer**](https://github.com/microsoft/table-transformer).
|
|
@@ -163,10 +166,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
163
166
|
- Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
|
|
164
167
|
Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
|
|
165
168
|
[docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more infos.
|
|
166
|
-
- Document layout analysis and table recognition now runs with
|
|
167
|
-
|
|
168
|
-
|
|
169
|
+
- Document layout analysis and table recognition now runs with
|
|
170
|
+
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
171
|
+
anymore for basic inference.
|
|
172
|
+
- [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
169
173
|
(not contained in the built-in Analyzer).
|
|
174
|
+
- [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
175
|
+
[**transformers**](https://github.com/huggingface/transformers).
|
|
176
|
+
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
177
|
+
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
178
|
+
LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
|
|
170
179
|
|
|
171
180
|
**deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
|
|
172
181
|
post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
|
|
@@ -257,9 +266,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
|
|
|
257
266
|
separately.
|
|
258
267
|
|
|
259
268
|
- Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
|
|
260
|
-
- Python >= 3.8
|
|
261
|
-
- 1.
|
|
262
|
-
|
|
269
|
+
- Python >= 3.9
|
|
270
|
+
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
271
|
+
In general, if you want to train or fine-tune models, a GPU is required.
|
|
263
272
|
- **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
|
|
264
273
|
images.
|
|
265
274
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
@@ -31,7 +31,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
31
31
|
- Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
|
|
32
32
|
- Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
|
|
33
33
|
- Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
|
|
34
|
-
- Document and token classification with all LayoutLM models provided by the
|
|
34
|
+
- Document and token classification with all LayoutLM models provided by the
|
|
35
|
+
[**Transformer library**](https://github.com/huggingface/transformers).
|
|
35
36
|
(Yes, you can use any LayoutLM-model with any of the provided OCR-or pdfplumber tools straight away!).
|
|
36
37
|
- Table detection and table structure recognition with
|
|
37
38
|
[**table-transformer**](https://github.com/microsoft/table-transformer).
|
|
@@ -41,10 +42,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
41
42
|
- Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
|
|
42
43
|
Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
|
|
43
44
|
[docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more infos.
|
|
44
|
-
- Document layout analysis and table recognition now runs with
|
|
45
|
-
|
|
46
|
-
|
|
45
|
+
- Document layout analysis and table recognition now runs with
|
|
46
|
+
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
47
|
+
anymore for basic inference.
|
|
48
|
+
- [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
47
49
|
(not contained in the built-in Analyzer).
|
|
50
|
+
- [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
51
|
+
[**transformers**](https://github.com/huggingface/transformers).
|
|
52
|
+
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
53
|
+
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
54
|
+
LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
|
|
48
55
|
|
|
49
56
|
**deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
|
|
50
57
|
post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
|
|
@@ -135,9 +142,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
|
|
|
135
142
|
separately.
|
|
136
143
|
|
|
137
144
|
- Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
|
|
138
|
-
- Python >= 3.8
|
|
139
|
-
- 1.
|
|
140
|
-
|
|
145
|
+
- Python >= 3.9
|
|
146
|
+
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
147
|
+
In general, if you want to train or fine-tune models, a GPU is required.
|
|
141
148
|
- **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
|
|
142
149
|
images.
|
|
143
150
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
@@ -19,15 +19,13 @@ import os
|
|
|
19
19
|
import sys
|
|
20
20
|
from typing import TYPE_CHECKING
|
|
21
21
|
|
|
22
|
-
from
|
|
23
|
-
|
|
24
|
-
from .utils.env_info import auto_select_lib_and_device
|
|
22
|
+
from .utils.env_info import collect_env_info
|
|
25
23
|
from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
|
|
26
|
-
from .utils.logger import logger
|
|
24
|
+
from .utils.logger import LoggingRecord, logger
|
|
27
25
|
|
|
28
26
|
# pylint: enable=wrong-import-position
|
|
29
27
|
|
|
30
|
-
__version__ = 0.
|
|
28
|
+
__version__ = 0.33
|
|
31
29
|
|
|
32
30
|
_IMPORT_STRUCTURE = {
|
|
33
31
|
"analyzer": [
|
|
@@ -162,6 +160,8 @@ _IMPORT_STRUCTURE = {
|
|
|
162
160
|
"EvalCallback",
|
|
163
161
|
],
|
|
164
162
|
"extern": [
|
|
163
|
+
"ModelCategories",
|
|
164
|
+
"NerModelCategories",
|
|
165
165
|
"PredictorBase",
|
|
166
166
|
"DetectionResult",
|
|
167
167
|
"ObjectDetector",
|
|
@@ -182,6 +182,7 @@ _IMPORT_STRUCTURE = {
|
|
|
182
182
|
"DocTrRotationTransformer",
|
|
183
183
|
"FasttextLangDetector",
|
|
184
184
|
"HFDetrDerivedDetector",
|
|
185
|
+
"get_tokenizer_from_architecture",
|
|
185
186
|
"HFLayoutLmTokenClassifierBase",
|
|
186
187
|
"HFLayoutLmTokenClassifier",
|
|
187
188
|
"HFLayoutLmv2TokenClassifier",
|
|
@@ -189,6 +190,9 @@ _IMPORT_STRUCTURE = {
|
|
|
189
190
|
"HFLayoutLmSequenceClassifier",
|
|
190
191
|
"HFLayoutLmv2SequenceClassifier",
|
|
191
192
|
"HFLayoutLmv3SequenceClassifier",
|
|
193
|
+
"HFLiltTokenClassifier",
|
|
194
|
+
"HFLiltSequenceClassifier",
|
|
195
|
+
"HFLmSequenceClassifier",
|
|
192
196
|
"ModelProfile",
|
|
193
197
|
"ModelCatalog",
|
|
194
198
|
"print_model_infos",
|
|
@@ -268,11 +272,11 @@ _IMPORT_STRUCTURE = {
|
|
|
268
272
|
"DoctectionPipe",
|
|
269
273
|
"LanguageDetectionService",
|
|
270
274
|
"ImageLayoutService",
|
|
271
|
-
"get_tokenizer_from_architecture",
|
|
272
275
|
"LMTokenClassifierService",
|
|
273
276
|
"LMSequenceClassifierService",
|
|
274
277
|
"OrderGenerator",
|
|
275
278
|
"TextLineGenerator",
|
|
279
|
+
"TextLineService",
|
|
276
280
|
"TextOrderService",
|
|
277
281
|
"TableSegmentationRefinementService",
|
|
278
282
|
"generate_html_string",
|
|
@@ -297,14 +301,13 @@ _IMPORT_STRUCTURE = {
|
|
|
297
301
|
"save_tmp_file",
|
|
298
302
|
"timed_operation",
|
|
299
303
|
"collect_env_info",
|
|
300
|
-
"get_device",
|
|
301
|
-
"auto_select_lib_and_device",
|
|
302
304
|
"auto_select_viz_library",
|
|
303
305
|
"get_tensorflow_requirement",
|
|
304
306
|
"tf_addons_available",
|
|
305
307
|
"get_tf_addons_requirements",
|
|
306
308
|
"tensorpack_available",
|
|
307
309
|
"get_tensorpack_requirement",
|
|
310
|
+
"pytorch_available",
|
|
308
311
|
"get_pytorch_requirement",
|
|
309
312
|
"lxml_available",
|
|
310
313
|
"get_lxml_requirement",
|
|
@@ -418,25 +421,9 @@ _IMPORT_STRUCTURE = {
|
|
|
418
421
|
],
|
|
419
422
|
}
|
|
420
423
|
|
|
421
|
-
|
|
422
|
-
# disable TF warnings for versions > 2.4.1
|
|
423
|
-
if tf_available():
|
|
424
|
-
if version.parse(get_tf_version()) > version.parse("2.4.1"):
|
|
425
|
-
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
|
|
426
|
-
try:
|
|
427
|
-
import tensorflow.python.util.deprecation as deprecation # type: ignore # pylint: disable=E0401,R0402
|
|
428
|
-
|
|
429
|
-
deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
|
|
430
|
-
except Exception: # pylint: disable=W0703
|
|
431
|
-
try:
|
|
432
|
-
from tensorflow.python.util import deprecation # type: ignore # pylint: disable=E0401
|
|
433
|
-
|
|
434
|
-
deprecation._PRINT_DEPRECATION_WARNINGS = False # pylint: disable=W0212
|
|
435
|
-
except Exception: # pylint: disable=W0703
|
|
436
|
-
pass
|
|
437
|
-
|
|
438
424
|
# Setting some environment variables so that standard functions can be invoked with available hardware
|
|
439
|
-
|
|
425
|
+
env_info = collect_env_info()
|
|
426
|
+
logger.debug(LoggingRecord(msg=env_info))
|
|
440
427
|
|
|
441
428
|
|
|
442
429
|
# Direct imports for type-checking
|
|
@@ -444,10 +431,10 @@ if TYPE_CHECKING:
|
|
|
444
431
|
from .analyzer import *
|
|
445
432
|
from .dataflow import *
|
|
446
433
|
from .datapoint import *
|
|
447
|
-
from .datasets import *
|
|
434
|
+
from .datasets import * # type: ignore
|
|
448
435
|
from .eval import *
|
|
449
|
-
from .extern import *
|
|
450
|
-
from .mapper import *
|
|
436
|
+
from .extern import * # type: ignore
|
|
437
|
+
from .mapper import * # type: ignore
|
|
451
438
|
from .pipe import *
|
|
452
439
|
from .train import *
|
|
453
440
|
from .utils import *
|
|
@@ -23,51 +23,46 @@ Module for **deep**doctection analyzer.
|
|
|
23
23
|
-user factory with a reduced config setting
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
|
-
import
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
27
28
|
import os
|
|
28
29
|
from os import environ
|
|
29
30
|
from shutil import copyfile
|
|
30
|
-
from typing import
|
|
31
|
+
from typing import Optional, Union
|
|
32
|
+
|
|
33
|
+
from lazy_imports import try_import
|
|
31
34
|
|
|
32
35
|
from ..extern.base import ObjectDetector
|
|
36
|
+
from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
|
|
33
37
|
from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
|
|
38
|
+
from ..extern.hfdetr import HFDetrDerivedDetector
|
|
34
39
|
from ..extern.model import ModelCatalog, ModelDownloadManager
|
|
35
40
|
from ..extern.pdftext import PdfPlumberTextDetector
|
|
41
|
+
from ..extern.pt.ptutils import get_torch_device
|
|
36
42
|
from ..extern.tessocr import TesseractOcrDetector
|
|
37
43
|
from ..extern.texocr import TextractOcrDetector
|
|
44
|
+
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
45
|
+
from ..extern.tpdetect import TPFrcnnDetector
|
|
38
46
|
from ..pipe.base import PipelineComponent
|
|
39
|
-
from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
|
|
40
47
|
from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
|
|
41
48
|
from ..pipe.doctectionpipe import DoctectionPipe
|
|
42
49
|
from ..pipe.layout import ImageLayoutService
|
|
43
50
|
from ..pipe.order import TextOrderService
|
|
44
51
|
from ..pipe.refine import TableSegmentationRefinementService
|
|
45
52
|
from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
|
|
53
|
+
from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
|
|
46
54
|
from ..pipe.text import TextExtractionService
|
|
47
|
-
from ..utils.
|
|
48
|
-
from ..utils.
|
|
49
|
-
from ..utils.file_utils import
|
|
50
|
-
boto3_available,
|
|
51
|
-
detectron2_available,
|
|
52
|
-
pytorch_available,
|
|
53
|
-
tensorpack_available,
|
|
54
|
-
tf_available,
|
|
55
|
-
)
|
|
55
|
+
from ..utils.env_info import ENV_VARS_TRUE
|
|
56
|
+
from ..utils.error import DependencyError
|
|
57
|
+
from ..utils.file_utils import detectron2_available, tensorpack_available
|
|
56
58
|
from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
|
|
57
59
|
from ..utils.logger import LoggingRecord, logger
|
|
58
60
|
from ..utils.metacfg import AttrDict, set_config_by_yaml
|
|
59
61
|
from ..utils.settings import CellType, LayoutType
|
|
60
62
|
from ..utils.transform import PadTransform
|
|
63
|
+
from ..utils.types import PathLikeOrStr
|
|
61
64
|
|
|
62
|
-
|
|
63
|
-
from ..extern.tp.tfutils import disable_tp_layer_logging
|
|
64
|
-
from ..extern.tpdetect import TPFrcnnDetector
|
|
65
|
-
|
|
66
|
-
if pytorch_available():
|
|
67
|
-
from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
|
|
68
|
-
from ..extern.hfdetr import HFDetrDerivedDetector
|
|
69
|
-
|
|
70
|
-
if boto3_available():
|
|
65
|
+
with try_import() as image_guard:
|
|
71
66
|
from botocore.config import Config # type: ignore
|
|
72
67
|
|
|
73
68
|
|
|
@@ -89,7 +84,7 @@ _TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
|
|
|
89
84
|
|
|
90
85
|
|
|
91
86
|
def maybe_copy_config_to_cache(
|
|
92
|
-
package_path:
|
|
87
|
+
package_path: PathLikeOrStr, configs_dir_path: PathLikeOrStr, file_name: str, force_copy: bool = True
|
|
93
88
|
) -> str:
|
|
94
89
|
"""
|
|
95
90
|
Initial copying of various files
|
|
@@ -123,7 +118,7 @@ def config_sanity_checks(cfg: AttrDict) -> None:
|
|
|
123
118
|
|
|
124
119
|
def build_detector(
|
|
125
120
|
cfg: AttrDict, mode: str
|
|
126
|
-
) -> Union[
|
|
121
|
+
) -> Union[D2FrcnnDetector, TPFrcnnDetector, HFDetrDerivedDetector, D2FrcnnTracingDetector]:
|
|
127
122
|
"""Building a D2-Detector, a TP-Detector as Detr-Detector or a D2-Torch Tracing Detector according to
|
|
128
123
|
the config
|
|
129
124
|
|
|
@@ -141,8 +136,8 @@ def build_detector(
|
|
|
141
136
|
config_path = ModelCatalog.get_full_path_configs(weights)
|
|
142
137
|
weights_path = ModelDownloadManager.maybe_download_weights_and_configs(weights)
|
|
143
138
|
profile = ModelCatalog.get_profile(weights)
|
|
144
|
-
categories = profile.categories
|
|
145
|
-
|
|
139
|
+
categories = profile.categories if profile.categories is not None else {}
|
|
140
|
+
|
|
146
141
|
if profile.model_wrapper in ("TPFrcnnDetector",):
|
|
147
142
|
return TPFrcnnDetector(config_path, weights_path, categories, filter_categories=filter_categories)
|
|
148
143
|
if profile.model_wrapper in ("D2FrcnnDetector",):
|
|
@@ -210,11 +205,13 @@ def build_sub_image_service(detector: ObjectDetector, cfg: AttrDict, mode: str)
|
|
|
210
205
|
padder = None
|
|
211
206
|
if mode == "ITEM":
|
|
212
207
|
if detector.__class__.__name__ in ("HFDetrDerivedDetector",):
|
|
213
|
-
exclude_category_ids.extend([
|
|
208
|
+
exclude_category_ids.extend([1, 3, 4, 5, 6])
|
|
214
209
|
padder = build_padder(cfg, mode)
|
|
215
|
-
detect_result_generator = DetectResultGenerator(
|
|
210
|
+
detect_result_generator = DetectResultGenerator(
|
|
211
|
+
categories=detector.categories.categories, exclude_category_ids=exclude_category_ids
|
|
212
|
+
)
|
|
216
213
|
return SubImageLayoutService(
|
|
217
|
-
detector, [LayoutType.
|
|
214
|
+
detector, [LayoutType.TABLE, LayoutType.TABLE_ROTATED], None, detect_result_generator, padder
|
|
218
215
|
)
|
|
219
216
|
|
|
220
217
|
|
|
@@ -241,9 +238,9 @@ def build_ocr(cfg: AttrDict) -> Union[TesseractOcrDetector, DoctrTextRecognizer,
|
|
|
241
238
|
)
|
|
242
239
|
if cfg.OCR.USE_TEXTRACT:
|
|
243
240
|
credentials_kwargs = {
|
|
244
|
-
"aws_access_key_id": environ.get("ACCESS_KEY"),
|
|
245
|
-
"aws_secret_access_key": environ.get("SECRET_KEY"),
|
|
246
|
-
"config": Config(region_name=environ.get("REGION")),
|
|
241
|
+
"aws_access_key_id": environ.get("ACCESS_KEY", None),
|
|
242
|
+
"aws_secret_access_key": environ.get("SECRET_KEY", None),
|
|
243
|
+
"config": Config(region_name=environ.get("REGION", None)),
|
|
247
244
|
}
|
|
248
245
|
return TextractOcrDetector(**credentials_kwargs)
|
|
249
246
|
raise ValueError("You have set USE_OCR=True but any of USE_TESSERACT, USE_DOCTR, USE_TEXTRACT is set to False")
|
|
@@ -268,7 +265,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
268
265
|
:param cfg: A configuration
|
|
269
266
|
:return: Analyzer pipeline
|
|
270
267
|
"""
|
|
271
|
-
pipe_component_list:
|
|
268
|
+
pipe_component_list: list[PipelineComponent] = []
|
|
272
269
|
|
|
273
270
|
if cfg.USE_LAYOUT:
|
|
274
271
|
d_layout = build_detector(cfg, "LAYOUT")
|
|
@@ -308,22 +305,22 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
308
305
|
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
|
|
309
306
|
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
|
|
310
307
|
cfg.SEGMENTATION.CELL_CATEGORY_ID,
|
|
311
|
-
LayoutType.
|
|
308
|
+
LayoutType.TABLE,
|
|
312
309
|
[
|
|
313
|
-
CellType.
|
|
314
|
-
CellType.
|
|
315
|
-
CellType.
|
|
316
|
-
CellType.
|
|
317
|
-
LayoutType.
|
|
310
|
+
CellType.SPANNING,
|
|
311
|
+
CellType.ROW_HEADER,
|
|
312
|
+
CellType.COLUMN_HEADER,
|
|
313
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
314
|
+
LayoutType.CELL,
|
|
318
315
|
],
|
|
319
316
|
[
|
|
320
|
-
CellType.
|
|
321
|
-
CellType.
|
|
322
|
-
CellType.
|
|
323
|
-
CellType.
|
|
317
|
+
CellType.SPANNING,
|
|
318
|
+
CellType.ROW_HEADER,
|
|
319
|
+
CellType.COLUMN_HEADER,
|
|
320
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
324
321
|
],
|
|
325
|
-
[LayoutType.
|
|
326
|
-
[CellType.
|
|
322
|
+
[LayoutType.ROW, LayoutType.COLUMN],
|
|
323
|
+
[CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
|
|
327
324
|
stretch_rule=cfg.SEGMENTATION.STRETCH_RULE,
|
|
328
325
|
)
|
|
329
326
|
pipe_component_list.append(pubtables)
|
|
@@ -335,20 +332,29 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
335
332
|
cfg.SEGMENTATION.FULL_TABLE_TILING,
|
|
336
333
|
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS,
|
|
337
334
|
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS,
|
|
338
|
-
LayoutType.
|
|
339
|
-
[CellType.
|
|
340
|
-
[LayoutType.
|
|
341
|
-
[CellType.
|
|
335
|
+
LayoutType.TABLE,
|
|
336
|
+
[CellType.HEADER, CellType.BODY, LayoutType.CELL],
|
|
337
|
+
[LayoutType.ROW, LayoutType.COLUMN],
|
|
338
|
+
[CellType.ROW_NUMBER, CellType.COLUMN_NUMBER],
|
|
342
339
|
cfg.SEGMENTATION.STRETCH_RULE,
|
|
343
340
|
)
|
|
344
341
|
pipe_component_list.append(table_segmentation)
|
|
345
342
|
|
|
346
343
|
if cfg.USE_TABLE_REFINEMENT:
|
|
347
|
-
table_segmentation_refinement = TableSegmentationRefinementService(
|
|
344
|
+
table_segmentation_refinement = TableSegmentationRefinementService(
|
|
345
|
+
[LayoutType.TABLE, LayoutType.TABLE_ROTATED],
|
|
346
|
+
[
|
|
347
|
+
LayoutType.CELL,
|
|
348
|
+
CellType.COLUMN_HEADER,
|
|
349
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
350
|
+
CellType.SPANNING,
|
|
351
|
+
CellType.ROW_HEADER,
|
|
352
|
+
],
|
|
353
|
+
)
|
|
348
354
|
pipe_component_list.append(table_segmentation_refinement)
|
|
349
355
|
|
|
350
356
|
if cfg.USE_PDF_MINER:
|
|
351
|
-
pdf_text = PdfPlumberTextDetector()
|
|
357
|
+
pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
|
|
352
358
|
d_text = TextExtractionService(pdf_text)
|
|
353
359
|
pipe_component_list.append(d_text)
|
|
354
360
|
|
|
@@ -362,7 +368,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
362
368
|
|
|
363
369
|
ocr = build_ocr(cfg)
|
|
364
370
|
skip_if_text_extracted = cfg.USE_PDF_MINER
|
|
365
|
-
extract_from_roi = LayoutType.
|
|
371
|
+
extract_from_roi = LayoutType.WORD if cfg.OCR.USE_DOCTR else None
|
|
366
372
|
text = TextExtractionService(
|
|
367
373
|
ocr, skip_if_text_extracted=skip_if_text_extracted, extract_from_roi=extract_from_roi
|
|
368
374
|
)
|
|
@@ -371,7 +377,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
371
377
|
if cfg.USE_PDF_MINER or cfg.USE_OCR:
|
|
372
378
|
match = MatchingService(
|
|
373
379
|
parent_categories=cfg.WORD_MATCHING.PARENTAL_CATEGORIES,
|
|
374
|
-
child_categories=LayoutType.
|
|
380
|
+
child_categories=LayoutType.WORD,
|
|
375
381
|
matching_rule=cfg.WORD_MATCHING.RULE,
|
|
376
382
|
threshold=cfg.WORD_MATCHING.THRESHOLD,
|
|
377
383
|
max_parent_only=cfg.WORD_MATCHING.MAX_PARENT_ONLY,
|
|
@@ -379,7 +385,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
379
385
|
pipe_component_list.append(match)
|
|
380
386
|
|
|
381
387
|
order = TextOrderService(
|
|
382
|
-
text_container=LayoutType.
|
|
388
|
+
text_container=LayoutType.WORD,
|
|
383
389
|
text_block_categories=cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES,
|
|
384
390
|
floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
|
|
385
391
|
include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
|
|
@@ -391,7 +397,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
391
397
|
pipe_component_list.append(order)
|
|
392
398
|
|
|
393
399
|
page_parsing_service = PageParsingService(
|
|
394
|
-
text_container=LayoutType.
|
|
400
|
+
text_container=LayoutType.WORD,
|
|
395
401
|
floating_text_block_categories=cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES,
|
|
396
402
|
include_residual_text_container=cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER,
|
|
397
403
|
)
|
|
@@ -401,9 +407,9 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
401
407
|
|
|
402
408
|
|
|
403
409
|
def get_dd_analyzer(
|
|
404
|
-
reset_config_file: bool =
|
|
405
|
-
config_overwrite: Optional[
|
|
406
|
-
path_config_file: Optional[
|
|
410
|
+
reset_config_file: bool = True,
|
|
411
|
+
config_overwrite: Optional[list[str]] = None,
|
|
412
|
+
path_config_file: Optional[PathLikeOrStr] = None,
|
|
407
413
|
) -> DoctectionPipe:
|
|
408
414
|
"""
|
|
409
415
|
Factory function for creating the built-in **deep**doctection analyzer.
|
|
@@ -430,8 +436,13 @@ def get_dd_analyzer(
|
|
|
430
436
|
:return: A DoctectionPipe instance with given configs
|
|
431
437
|
"""
|
|
432
438
|
config_overwrite = [] if config_overwrite is None else config_overwrite
|
|
433
|
-
lib = "TF" if
|
|
434
|
-
|
|
439
|
+
lib = "TF" if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE else "PT"
|
|
440
|
+
if lib == "TF":
|
|
441
|
+
device = get_tf_device()
|
|
442
|
+
elif lib == "PT":
|
|
443
|
+
device = get_torch_device()
|
|
444
|
+
else:
|
|
445
|
+
raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
|
|
435
446
|
dd_one_config_path = maybe_copy_config_to_cache(
|
|
436
447
|
get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
|
|
437
448
|
)
|
|
@@ -1,38 +1,38 @@
|
|
|
1
1
|
USE_LAYOUT: True
|
|
2
2
|
USE_TABLE_SEGMENTATION: True
|
|
3
3
|
TF:
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
4
|
+
LAYOUT:
|
|
5
|
+
WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
|
|
6
|
+
FILTER:
|
|
7
|
+
CELL:
|
|
8
|
+
WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
|
|
9
|
+
FILTER:
|
|
10
|
+
ITEM:
|
|
11
|
+
WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
|
|
12
|
+
FILTER:
|
|
13
13
|
PT:
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
14
|
+
LAYOUT:
|
|
15
|
+
WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
|
|
16
|
+
WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
|
|
17
|
+
FILTER:
|
|
18
|
+
PAD:
|
|
19
|
+
TOP: 60
|
|
20
|
+
RIGHT: 60
|
|
21
|
+
BOTTOM: 60
|
|
22
|
+
LEFT: 60
|
|
23
|
+
ITEM:
|
|
24
|
+
WEIGHTS: item/d2_model_1639999_item_inf_only.pt
|
|
25
|
+
WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
|
|
26
|
+
FILTER:
|
|
27
|
+
PAD:
|
|
28
|
+
TOP: 60
|
|
29
|
+
RIGHT: 60
|
|
30
|
+
BOTTOM: 60
|
|
31
|
+
LEFT: 60
|
|
32
|
+
CELL:
|
|
33
|
+
WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
|
|
34
|
+
WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
|
|
35
|
+
FILTER:
|
|
36
36
|
LAYOUT_NMS_PAIRS:
|
|
37
37
|
COMBINATIONS:
|
|
38
38
|
THRESHOLDS:
|
|
@@ -48,6 +48,9 @@ SEGMENTATION:
|
|
|
48
48
|
STRETCH_RULE: equal
|
|
49
49
|
USE_TABLE_REFINEMENT: True
|
|
50
50
|
USE_PDF_MINER: False
|
|
51
|
+
PDF_MINER:
|
|
52
|
+
X_TOLERANCE: 3
|
|
53
|
+
Y_TOLERANCE: 3
|
|
51
54
|
USE_OCR: True
|
|
52
55
|
OCR:
|
|
53
56
|
USE_TESSERACT: True
|