deepdoctection 0.31__tar.gz → 0.32__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.31 → deepdoctection-0.32}/PKG-INFO +27 -18
- {deepdoctection-0.31 → deepdoctection-0.32}/README.md +14 -7
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/__init__.py +35 -28
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/analyzer/dd.py +30 -24
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/configs/conf_dd_one.yaml +34 -31
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/annotation.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/box.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/image.py +13 -7
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/view.py +95 -24
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/__init__.py +1 -4
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/adapter.py +5 -2
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/base.py +5 -3
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/info.py +2 -2
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/doclaynet.py +3 -2
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/fintabnet.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/funsd.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/iiitar13k.py +5 -2
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/layouttest.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/publaynet.py +2 -2
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/pubtables1m.py +6 -3
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/pubtabnet.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/rvlcdip.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/xfund.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/__init__.py +1 -4
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/cocometric.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/eval.py +17 -13
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/tedsmetric.py +14 -11
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/tp_eval_callback.py +9 -3
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/__init__.py +2 -7
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/d2detect.py +24 -32
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/deskew.py +4 -2
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/doctrocr.py +75 -81
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/fastlang.py +4 -2
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/hfdetr.py +22 -28
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/hflayoutlm.py +335 -103
- deepdoctection-0.32/deepdoctection/extern/hflm.py +225 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/model.py +56 -47
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/pdftext.py +8 -4
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/pt/__init__.py +1 -3
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection-0.32/deepdoctection/extern/pt/ptutils.py +57 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/texocr.py +4 -2
- deepdoctection-0.32/deepdoctection/extern/tp/tfutils.py +91 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpcompat.py +10 -7
- deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/preproc.py +7 -3
- {deepdoctection-0.31/tests/datapoint → deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/utils}/__init__.py +4 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tpdetect.py +5 -8
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/__init__.py +3 -8
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/d2struct.py +8 -6
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/hfstruct.py +6 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/laylmstruct.py +163 -20
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/maputils.py +3 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/misc.py +6 -3
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/tpstruct.py +2 -2
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/__init__.py +1 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/common.py +11 -9
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/concurrency.py +2 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/layout.py +3 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/lm.py +32 -64
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/order.py +142 -35
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/refine.py +8 -14
- deepdoctection-0.31/deepdoctection/pipe/cell.py → deepdoctection-0.32/deepdoctection/pipe/sub_layout.py +1 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/__init__.py +6 -12
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/d2_frcnn_train.py +21 -16
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/hf_detr_train.py +18 -11
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/hf_layoutlm_train.py +118 -101
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/train/tp_frcnn_train.py +21 -19
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/env_info.py +41 -117
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/logger.py +1 -0
- deepdoctection-0.32/deepdoctection/utils/mocks.py +93 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/settings.py +1 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/viz.py +4 -3
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/PKG-INFO +27 -18
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/SOURCES.txt +4 -90
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/requires.txt +11 -8
- {deepdoctection-0.31 → deepdoctection-0.32}/setup.cfg +7 -1
- {deepdoctection-0.31 → deepdoctection-0.32}/setup.py +9 -8
- {deepdoctection-0.31 → deepdoctection-0.32}/tests/test_utils.py +8 -0
- deepdoctection-0.31/deepdoctection/extern/pt/ptutils.py +0 -49
- deepdoctection-0.31/deepdoctection/extern/tp/tfutils.py +0 -57
- deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- deepdoctection-0.31/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- deepdoctection-0.31/tests/__init__.py +0 -22
- deepdoctection-0.31/tests/analyzer/__init__.py +0 -16
- deepdoctection-0.31/tests/analyzer/test_dd.py +0 -202
- deepdoctection-0.31/tests/conftest.py +0 -498
- deepdoctection-0.31/tests/data.py +0 -1632
- deepdoctection-0.31/tests/dataflow/__init__.py +0 -16
- deepdoctection-0.31/tests/dataflow/conftest.py +0 -95
- deepdoctection-0.31/tests/dataflow/test_common.py +0 -219
- deepdoctection-0.31/tests/dataflow/test_custom.py +0 -60
- deepdoctection-0.31/tests/dataflow/test_custom_serialize.py +0 -177
- deepdoctection-0.31/tests/dataflow/test_parallel_map.py +0 -66
- deepdoctection-0.31/tests/dataflow/test_stats.py +0 -103
- deepdoctection-0.31/tests/datapoint/conftest.py +0 -262
- deepdoctection-0.31/tests/datapoint/test_annotation.py +0 -170
- deepdoctection-0.31/tests/datapoint/test_box.py +0 -416
- deepdoctection-0.31/tests/datapoint/test_convert.py +0 -52
- deepdoctection-0.31/tests/datapoint/test_image.py +0 -387
- deepdoctection-0.31/tests/datapoint/test_view.py +0 -150
- deepdoctection-0.31/tests/datasets/__init__.py +0 -16
- deepdoctection-0.31/tests/datasets/instances/__init__.py +0 -16
- deepdoctection-0.31/tests/datasets/instances/conftest.py +0 -35
- deepdoctection-0.31/tests/datasets/instances/test_doclaynet.py +0 -43
- deepdoctection-0.31/tests/datasets/instances/test_fintabnet.py +0 -70
- deepdoctection-0.31/tests/datasets/instances/test_funsd.py +0 -58
- deepdoctection-0.31/tests/datasets/instances/test_iiitar13k.py +0 -42
- deepdoctection-0.31/tests/datasets/instances/test_layouttest.py +0 -63
- deepdoctection-0.31/tests/datasets/instances/test_publaynet.py +0 -64
- deepdoctection-0.31/tests/datasets/instances/test_pubtables1m.py +0 -66
- deepdoctection-0.31/tests/datasets/instances/test_pubtabnet.py +0 -65
- deepdoctection-0.31/tests/datasets/instances/test_rvlcdip.py +0 -46
- deepdoctection-0.31/tests/datasets/test_adapter.py +0 -77
- deepdoctection-0.31/tests/datasets/test_info.py +0 -273
- deepdoctection-0.31/tests/datasets/test_registry.py +0 -75
- deepdoctection-0.31/tests/eval/__init__.py +0 -16
- deepdoctection-0.31/tests/eval/conftest.py +0 -107
- deepdoctection-0.31/tests/eval/test_accmetric.py +0 -364
- deepdoctection-0.31/tests/eval/test_cocometric.py +0 -123
- deepdoctection-0.31/tests/eval/test_eval.py +0 -86
- deepdoctection-0.31/tests/eval/test_registry.py +0 -84
- deepdoctection-0.31/tests/eval/test_tedsmetric.py +0 -40
- deepdoctection-0.31/tests/extern/__init__.py +0 -0
- deepdoctection-0.31/tests/extern/conftest.py +0 -108
- deepdoctection-0.31/tests/extern/data.py +0 -102
- deepdoctection-0.31/tests/extern/test_deskew.py +0 -67
- deepdoctection-0.31/tests/extern/test_doctrocr.py +0 -190
- deepdoctection-0.31/tests/extern/test_fastlang.py +0 -64
- deepdoctection-0.31/tests/extern/test_hfdetr.py +0 -116
- deepdoctection-0.31/tests/extern/test_hflayoutlm.py +0 -492
- deepdoctection-0.31/tests/extern/test_pdftext.py +0 -70
- deepdoctection-0.31/tests/extern/test_tessocr.py +0 -164
- deepdoctection-0.31/tests/extern/test_texocr.py +0 -52
- deepdoctection-0.31/tests/extern/test_tpdetect.py +0 -123
- deepdoctection-0.31/tests/mapper/__init__.py +0 -16
- deepdoctection-0.31/tests/mapper/conftest.py +0 -297
- deepdoctection-0.31/tests/mapper/data.py +0 -2182
- deepdoctection-0.31/tests/mapper/test_cats.py +0 -305
- deepdoctection-0.31/tests/mapper/test_cocostruct.py +0 -91
- deepdoctection-0.31/tests/mapper/test_d2struct.py +0 -56
- deepdoctection-0.31/tests/mapper/test_hfstruct.py +0 -59
- deepdoctection-0.31/tests/mapper/test_iiitar13k.py +0 -64
- deepdoctection-0.31/tests/mapper/test_laylmstruct.py +0 -141
- deepdoctection-0.31/tests/mapper/test_misc.py +0 -72
- deepdoctection-0.31/tests/mapper/test_prodigystruct.py +0 -78
- deepdoctection-0.31/tests/mapper/test_pubstruct.py +0 -170
- deepdoctection-0.31/tests/mapper/test_tpstruct.py +0 -51
- deepdoctection-0.31/tests/mapper/test_utils.py +0 -83
- deepdoctection-0.31/tests/mapper/test_xfundstruct.py +0 -68
- deepdoctection-0.31/tests/pipe/__init__.py +0 -16
- deepdoctection-0.31/tests/pipe/test_anngen.py +0 -179
- deepdoctection-0.31/tests/pipe/test_cell.py +0 -144
- deepdoctection-0.31/tests/pipe/test_common.py +0 -107
- deepdoctection-0.31/tests/pipe/test_language.py +0 -76
- deepdoctection-0.31/tests/pipe/test_layout.py +0 -66
- deepdoctection-0.31/tests/pipe/test_lm.py +0 -119
- deepdoctection-0.31/tests/pipe/test_order.py +0 -197
- deepdoctection-0.31/tests/pipe/test_refine.py +0 -325
- deepdoctection-0.31/tests/pipe/test_registry.py +0 -58
- deepdoctection-0.31/tests/pipe/test_segment.py +0 -392
- deepdoctection-0.31/tests/pipe/test_text.py +0 -208
- deepdoctection-0.31/tests/pipe/test_transform.py +0 -65
- deepdoctection-0.31/tests/train/__init__.py +0 -16
- deepdoctection-0.31/tests/train/conftest.py +0 -118
- deepdoctection-0.31/tests/train/test_d2_frcnn_train.py +0 -64
- deepdoctection-0.31/tests/train/test_tp_frcnn_train.py +0 -99
- deepdoctection-0.31/tests_d2/__init__.py +0 -20
- deepdoctection-0.31/tests_d2/conftest.py +0 -56
- deepdoctection-0.31/tests_d2/test_d2detect.py +0 -95
- {deepdoctection-0.31 → deepdoctection-0.32}/LICENSE +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datapoint/convert.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/base.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tessocr.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/match.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/prodigystruct.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/anngen.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/base.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/segment.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/detection_types.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/file_utils.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/fs.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/metacfg.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/pdf_utils.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.31 → deepdoctection-0.32}/deepdoctection.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.31
|
|
3
|
+
Version: 0.32
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -9,18 +9,18 @@ Classifier: Development Status :: 4 - Beta
|
|
|
9
9
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
10
|
Classifier: Natural Language :: English
|
|
11
11
|
Classifier: Operating System :: POSIX :: Linux
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
13
12
|
Classifier: Programming Language :: Python :: 3.9
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.10
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
15
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
-
Requires-Python: >=3.8
|
|
16
|
+
Requires-Python: >=3.9
|
|
18
17
|
Description-Content-Type: text/markdown
|
|
19
18
|
License-File: LICENSE
|
|
20
19
|
Requires-Dist: catalogue==2.0.10
|
|
21
20
|
Requires-Dist: huggingface_hub>=0.12.0
|
|
22
21
|
Requires-Dist: importlib-metadata>=5.0.0
|
|
23
22
|
Requires-Dist: jsonlines==3.1.0
|
|
23
|
+
Requires-Dist: lazy-imports==0.3.1
|
|
24
24
|
Requires-Dist: mock==4.0.3
|
|
25
25
|
Requires-Dist: networkx>=2.7.1
|
|
26
26
|
Requires-Dist: numpy>=1.21
|
|
@@ -37,6 +37,7 @@ Requires-Dist: catalogue==2.0.10; extra == "tf"
|
|
|
37
37
|
Requires-Dist: huggingface_hub>=0.12.0; extra == "tf"
|
|
38
38
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
|
|
39
39
|
Requires-Dist: jsonlines==3.1.0; extra == "tf"
|
|
40
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "tf"
|
|
40
41
|
Requires-Dist: mock==4.0.3; extra == "tf"
|
|
41
42
|
Requires-Dist: networkx>=2.7.1; extra == "tf"
|
|
42
43
|
Requires-Dist: numpy>=1.21; extra == "tf"
|
|
@@ -52,10 +53,10 @@ Requires-Dist: tensorpack==0.11; extra == "tf"
|
|
|
52
53
|
Requires-Dist: protobuf==3.20.1; extra == "tf"
|
|
53
54
|
Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
|
|
54
55
|
Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
|
|
55
|
-
Requires-Dist: python-doctr==0.
|
|
56
|
+
Requires-Dist: python-doctr==0.8.1; extra == "tf"
|
|
56
57
|
Requires-Dist: pycocotools>=2.0.2; extra == "tf"
|
|
57
|
-
Requires-Dist: boto3; extra == "tf"
|
|
58
|
-
Requires-Dist: pdfplumber>=0.
|
|
58
|
+
Requires-Dist: boto3==1.34.102; extra == "tf"
|
|
59
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
|
|
59
60
|
Requires-Dist: fasttext==0.9.2; extra == "tf"
|
|
60
61
|
Requires-Dist: jdeskew>=0.2.2; extra == "tf"
|
|
61
62
|
Requires-Dist: apted==1.0.3; extra == "tf"
|
|
@@ -66,6 +67,7 @@ Requires-Dist: catalogue==2.0.10; extra == "pt"
|
|
|
66
67
|
Requires-Dist: huggingface_hub>=0.12.0; extra == "pt"
|
|
67
68
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
|
|
68
69
|
Requires-Dist: jsonlines==3.1.0; extra == "pt"
|
|
70
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "pt"
|
|
69
71
|
Requires-Dist: mock==4.0.3; extra == "pt"
|
|
70
72
|
Requires-Dist: networkx>=2.7.1; extra == "pt"
|
|
71
73
|
Requires-Dist: numpy>=1.21; extra == "pt"
|
|
@@ -80,9 +82,9 @@ Requires-Dist: tqdm==4.64.0; extra == "pt"
|
|
|
80
82
|
Requires-Dist: timm>=0.9.16; extra == "pt"
|
|
81
83
|
Requires-Dist: transformers>=4.36.0; extra == "pt"
|
|
82
84
|
Requires-Dist: accelerate>=0.29.1; extra == "pt"
|
|
83
|
-
Requires-Dist: python-doctr==0.
|
|
84
|
-
Requires-Dist: boto3; extra == "pt"
|
|
85
|
-
Requires-Dist: pdfplumber>=0.
|
|
85
|
+
Requires-Dist: python-doctr==0.8.1; extra == "pt"
|
|
86
|
+
Requires-Dist: boto3==1.34.102; extra == "pt"
|
|
87
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
|
|
86
88
|
Requires-Dist: fasttext==0.9.2; extra == "pt"
|
|
87
89
|
Requires-Dist: jdeskew>=0.2.2; extra == "pt"
|
|
88
90
|
Requires-Dist: apted==1.0.3; extra == "pt"
|
|
@@ -90,10 +92,10 @@ Requires-Dist: distance==0.1.3; extra == "pt"
|
|
|
90
92
|
Requires-Dist: lxml>=4.9.1; extra == "pt"
|
|
91
93
|
Provides-Extra: docs
|
|
92
94
|
Requires-Dist: tensorpack==0.11; extra == "docs"
|
|
93
|
-
Requires-Dist: boto3; extra == "docs"
|
|
95
|
+
Requires-Dist: boto3==1.34.102; extra == "docs"
|
|
94
96
|
Requires-Dist: transformers>=4.36.0; extra == "docs"
|
|
95
97
|
Requires-Dist: accelerate>=0.29.1; extra == "docs"
|
|
96
|
-
Requires-Dist: pdfplumber>=0.
|
|
98
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
|
|
97
99
|
Requires-Dist: lxml>=4.9.1; extra == "docs"
|
|
98
100
|
Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
|
|
99
101
|
Requires-Dist: jdeskew>=0.2.2; extra == "docs"
|
|
@@ -153,7 +155,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
153
155
|
- Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
|
|
154
156
|
- Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
|
|
155
157
|
- Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
|
|
156
|
-
- Document and token classification with all LayoutLM models provided by the
|
|
158
|
+
- Document and token classification with all LayoutLM models provided by the
|
|
159
|
+
[**Transformer library**](https://github.com/huggingface/transformers).
|
|
157
160
|
(Yes, you can use any LayoutLM model with any of the provided OCR or pdfplumber tools straight away!).
|
|
158
161
|
- Table detection and table structure recognition with
|
|
159
162
|
[**table-transformer**](https://github.com/microsoft/table-transformer).
|
|
@@ -163,10 +166,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
163
166
|
- Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
|
|
164
167
|
Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
|
|
165
168
|
[docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more infos.
|
|
166
|
-
- Document layout analysis and table recognition now runs with
|
|
167
|
-
|
|
168
|
-
|
|
169
|
+
- Document layout analysis and table recognition now runs with
|
|
170
|
+
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
171
|
+
anymore for basic inference.
|
|
172
|
+
- [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
169
173
|
(not contained in the built-in Analyzer).
|
|
174
|
+
- [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
175
|
+
[**transformers**](https://github.com/huggingface/transformers).
|
|
176
|
+
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
177
|
+
that look promising, especially if you want to train a model on non-English data. The training script for
|
|
178
|
+
LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
|
|
170
179
|
|
|
171
180
|
**deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
|
|
172
181
|
post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
|
|
@@ -257,9 +266,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
|
|
|
257
266
|
separately.
|
|
258
267
|
|
|
259
268
|
- Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
|
|
260
|
-
- Python >= 3.8
|
|
261
|
-
- 1.
|
|
262
|
-
|
|
269
|
+
- Python >= 3.9
|
|
270
|
+
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
271
|
+
In general, if you want to train or fine-tune models, a GPU is required.
|
|
263
272
|
- **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
|
|
264
273
|
images.
|
|
265
274
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
@@ -31,7 +31,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
31
31
|
- Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
|
|
32
32
|
- Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
|
|
33
33
|
- Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
|
|
34
|
-
- Document and token classification with all LayoutLM models provided by the
|
|
34
|
+
- Document and token classification with all LayoutLM models provided by the
|
|
35
|
+
[**Transformer library**](https://github.com/huggingface/transformers).
|
|
35
36
|
(Yes, you can use any LayoutLM model with any of the provided OCR or pdfplumber tools straight away!).
|
|
36
37
|
- Table detection and table structure recognition with
|
|
37
38
|
[**table-transformer**](https://github.com/microsoft/table-transformer).
|
|
@@ -41,10 +42,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
41
42
|
- Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
|
|
42
43
|
Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
|
|
43
44
|
[docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more infos.
|
|
44
|
-
- Document layout analysis and table recognition now runs with
|
|
45
|
-
|
|
46
|
-
|
|
45
|
+
- Document layout analysis and table recognition now runs with
|
|
46
|
+
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
47
|
+
anymore for basic inference.
|
|
48
|
+
- [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
47
49
|
(not contained in the built-in Analyzer).
|
|
50
|
+
- [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
51
|
+
[**transformers**](https://github.com/huggingface/transformers).
|
|
52
|
+
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
53
|
+
that look promising, especially if you want to train a model on non-English data. The training script for
|
|
54
|
+
LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
|
|
48
55
|
|
|
49
56
|
**deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
|
|
50
57
|
post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
|
|
@@ -135,9 +142,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
|
|
|
135
142
|
separately.
|
|
136
143
|
|
|
137
144
|
- Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
|
|
138
|
-
- Python >= 3.8
|
|
139
|
-
- 1.
|
|
140
|
-
|
|
145
|
+
- Python >= 3.9
|
|
146
|
+
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
147
|
+
In general, if you want to train or fine-tune models, a GPU is required.
|
|
141
148
|
- **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
|
|
142
149
|
images.
|
|
143
150
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
@@ -19,15 +19,13 @@ import os
|
|
|
19
19
|
import sys
|
|
20
20
|
from typing import TYPE_CHECKING
|
|
21
21
|
|
|
22
|
-
from
|
|
23
|
-
|
|
24
|
-
from .utils.env_info import auto_select_lib_and_device
|
|
22
|
+
from .utils.env_info import collect_env_info
|
|
25
23
|
from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
|
|
26
|
-
from .utils.logger import logger
|
|
24
|
+
from .utils.logger import LoggingRecord, logger
|
|
27
25
|
|
|
28
26
|
# pylint: enable=wrong-import-position
|
|
29
27
|
|
|
30
|
-
__version__ = 0.
|
|
28
|
+
__version__ = 0.32
|
|
31
29
|
|
|
32
30
|
_IMPORT_STRUCTURE = {
|
|
33
31
|
"analyzer": [
|
|
@@ -182,6 +180,7 @@ _IMPORT_STRUCTURE = {
|
|
|
182
180
|
"DocTrRotationTransformer",
|
|
183
181
|
"FasttextLangDetector",
|
|
184
182
|
"HFDetrDerivedDetector",
|
|
183
|
+
"get_tokenizer_from_architecture",
|
|
185
184
|
"HFLayoutLmTokenClassifierBase",
|
|
186
185
|
"HFLayoutLmTokenClassifier",
|
|
187
186
|
"HFLayoutLmv2TokenClassifier",
|
|
@@ -189,6 +188,9 @@ _IMPORT_STRUCTURE = {
|
|
|
189
188
|
"HFLayoutLmSequenceClassifier",
|
|
190
189
|
"HFLayoutLmv2SequenceClassifier",
|
|
191
190
|
"HFLayoutLmv3SequenceClassifier",
|
|
191
|
+
"HFLiltTokenClassifier",
|
|
192
|
+
"HFLiltSequenceClassifier",
|
|
193
|
+
"HFLmSequenceClassifier",
|
|
192
194
|
"ModelProfile",
|
|
193
195
|
"ModelCatalog",
|
|
194
196
|
"print_model_infos",
|
|
@@ -268,11 +270,11 @@ _IMPORT_STRUCTURE = {
|
|
|
268
270
|
"DoctectionPipe",
|
|
269
271
|
"LanguageDetectionService",
|
|
270
272
|
"ImageLayoutService",
|
|
271
|
-
"get_tokenizer_from_architecture",
|
|
272
273
|
"LMTokenClassifierService",
|
|
273
274
|
"LMSequenceClassifierService",
|
|
274
275
|
"OrderGenerator",
|
|
275
276
|
"TextLineGenerator",
|
|
277
|
+
"TextLineService",
|
|
276
278
|
"TextOrderService",
|
|
277
279
|
"TableSegmentationRefinementService",
|
|
278
280
|
"generate_html_string",
|
|
@@ -297,14 +299,13 @@ _IMPORT_STRUCTURE = {
|
|
|
297
299
|
"save_tmp_file",
|
|
298
300
|
"timed_operation",
|
|
299
301
|
"collect_env_info",
|
|
300
|
-
"get_device",
|
|
301
|
-
"auto_select_lib_and_device",
|
|
302
302
|
"auto_select_viz_library",
|
|
303
303
|
"get_tensorflow_requirement",
|
|
304
304
|
"tf_addons_available",
|
|
305
305
|
"get_tf_addons_requirements",
|
|
306
306
|
"tensorpack_available",
|
|
307
307
|
"get_tensorpack_requirement",
|
|
308
|
+
"pytorch_available",
|
|
308
309
|
"get_pytorch_requirement",
|
|
309
310
|
"lxml_available",
|
|
310
311
|
"get_lxml_requirement",
|
|
@@ -418,25 +419,31 @@ _IMPORT_STRUCTURE = {
|
|
|
418
419
|
],
|
|
419
420
|
}
|
|
420
421
|
|
|
422
|
+
# Setting some environment variables so that standard functions can be invoked with available hardware
|
|
423
|
+
env_info = collect_env_info()
|
|
424
|
+
logger.debug(LoggingRecord(msg=env_info))
|
|
421
425
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
except Exception: # pylint: disable=W0703
|
|
436
|
-
pass
|
|
426
|
+
if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
|
|
427
|
+
os.environ["DD_USE_TORCH"] = "1"
|
|
428
|
+
os.environ["USE_TORCH"] = "1"
|
|
429
|
+
if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
|
|
430
|
+
os.environ["DD_USE_TF"] = "1"
|
|
431
|
+
os.environ["USE_TF"] = "1"
|
|
432
|
+
if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
|
|
433
|
+
logger.warning(
|
|
434
|
+
"Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
|
|
435
|
+
"behaviour, set DD_USE_TORCH to None before importing deepdoctection."
|
|
436
|
+
)
|
|
437
|
+
os.environ.pop("DD_USE_TF")
|
|
438
|
+
os.environ.pop("USE_TF")
|
|
437
439
|
|
|
438
|
-
|
|
439
|
-
|
|
440
|
+
if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
|
|
441
|
+
logger.warning(
|
|
442
|
+
LoggingRecord(
|
|
443
|
+
msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
|
|
444
|
+
"model from the library."
|
|
445
|
+
)
|
|
446
|
+
)
|
|
440
447
|
|
|
441
448
|
|
|
442
449
|
# Direct imports for type-checking
|
|
@@ -444,10 +451,10 @@ if TYPE_CHECKING:
|
|
|
444
451
|
from .analyzer import *
|
|
445
452
|
from .dataflow import *
|
|
446
453
|
from .datapoint import *
|
|
447
|
-
from .datasets import *
|
|
454
|
+
from .datasets import * # type: ignore
|
|
448
455
|
from .eval import *
|
|
449
|
-
from .extern import *
|
|
450
|
-
from .mapper import *
|
|
456
|
+
from .extern import * # type: ignore
|
|
457
|
+
from .mapper import * # type: ignore
|
|
451
458
|
from .pipe import *
|
|
452
459
|
from .train import *
|
|
453
460
|
from .utils import *
|
|
@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
|
|
|
23
23
|
-user factory with a reduced config setting
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
|
-
import ast
|
|
27
26
|
import os
|
|
28
27
|
from os import environ
|
|
29
28
|
from shutil import copyfile
|
|
30
29
|
from typing import List, Optional, Union
|
|
31
30
|
|
|
31
|
+
from lazy_imports import try_import
|
|
32
|
+
|
|
32
33
|
from ..extern.base import ObjectDetector
|
|
34
|
+
from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
|
|
33
35
|
from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
|
|
36
|
+
from ..extern.hfdetr import HFDetrDerivedDetector
|
|
34
37
|
from ..extern.model import ModelCatalog, ModelDownloadManager
|
|
35
38
|
from ..extern.pdftext import PdfPlumberTextDetector
|
|
39
|
+
from ..extern.pt.ptutils import get_torch_device
|
|
36
40
|
from ..extern.tessocr import TesseractOcrDetector
|
|
37
41
|
from ..extern.texocr import TextractOcrDetector
|
|
42
|
+
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
43
|
+
from ..extern.tpdetect import TPFrcnnDetector
|
|
38
44
|
from ..pipe.base import PipelineComponent
|
|
39
|
-
from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
|
|
40
45
|
from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
|
|
41
46
|
from ..pipe.doctectionpipe import DoctectionPipe
|
|
42
47
|
from ..pipe.layout import ImageLayoutService
|
|
43
48
|
from ..pipe.order import TextOrderService
|
|
44
49
|
from ..pipe.refine import TableSegmentationRefinementService
|
|
45
50
|
from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
|
|
51
|
+
from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
|
|
46
52
|
from ..pipe.text import TextExtractionService
|
|
47
53
|
from ..utils.detection_types import Pathlike
|
|
48
|
-
from ..utils.
|
|
49
|
-
from ..utils.file_utils import
|
|
50
|
-
boto3_available,
|
|
51
|
-
detectron2_available,
|
|
52
|
-
pytorch_available,
|
|
53
|
-
tensorpack_available,
|
|
54
|
-
tf_available,
|
|
55
|
-
)
|
|
54
|
+
from ..utils.error import DependencyError
|
|
55
|
+
from ..utils.file_utils import detectron2_available, tensorpack_available
|
|
56
56
|
from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
|
|
57
57
|
from ..utils.logger import LoggingRecord, logger
|
|
58
58
|
from ..utils.metacfg import AttrDict, set_config_by_yaml
|
|
59
59
|
from ..utils.settings import CellType, LayoutType
|
|
60
60
|
from ..utils.transform import PadTransform
|
|
61
61
|
|
|
62
|
-
|
|
63
|
-
from ..extern.tp.tfutils import disable_tp_layer_logging
|
|
64
|
-
from ..extern.tpdetect import TPFrcnnDetector
|
|
65
|
-
|
|
66
|
-
if pytorch_available():
|
|
67
|
-
from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
|
|
68
|
-
from ..extern.hfdetr import HFDetrDerivedDetector
|
|
69
|
-
|
|
70
|
-
if boto3_available():
|
|
62
|
+
with try_import() as image_guard:
|
|
71
63
|
from botocore.config import Config # type: ignore
|
|
72
64
|
|
|
73
65
|
|
|
@@ -344,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
344
336
|
pipe_component_list.append(table_segmentation)
|
|
345
337
|
|
|
346
338
|
if cfg.USE_TABLE_REFINEMENT:
|
|
347
|
-
table_segmentation_refinement = TableSegmentationRefinementService(
|
|
339
|
+
table_segmentation_refinement = TableSegmentationRefinementService(
|
|
340
|
+
[LayoutType.table, LayoutType.table_rotated],
|
|
341
|
+
[
|
|
342
|
+
LayoutType.cell,
|
|
343
|
+
CellType.column_header,
|
|
344
|
+
CellType.projected_row_header,
|
|
345
|
+
CellType.spanning,
|
|
346
|
+
CellType.row_header,
|
|
347
|
+
],
|
|
348
|
+
)
|
|
348
349
|
pipe_component_list.append(table_segmentation_refinement)
|
|
349
350
|
|
|
350
351
|
if cfg.USE_PDF_MINER:
|
|
351
|
-
pdf_text = PdfPlumberTextDetector()
|
|
352
|
+
pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
|
|
352
353
|
d_text = TextExtractionService(pdf_text)
|
|
353
354
|
pipe_component_list.append(d_text)
|
|
354
355
|
|
|
@@ -401,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
401
402
|
|
|
402
403
|
|
|
403
404
|
def get_dd_analyzer(
|
|
404
|
-
reset_config_file: bool =
|
|
405
|
+
reset_config_file: bool = True,
|
|
405
406
|
config_overwrite: Optional[List[str]] = None,
|
|
406
407
|
path_config_file: Optional[Pathlike] = None,
|
|
407
408
|
) -> DoctectionPipe:
|
|
@@ -430,8 +431,13 @@ def get_dd_analyzer(
|
|
|
430
431
|
:return: A DoctectionPipe instance with given configs
|
|
431
432
|
"""
|
|
432
433
|
config_overwrite = [] if config_overwrite is None else config_overwrite
|
|
433
|
-
lib = "TF" if
|
|
434
|
-
|
|
434
|
+
lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
|
|
435
|
+
if lib == "TF":
|
|
436
|
+
device = get_tf_device()
|
|
437
|
+
elif lib == "PT":
|
|
438
|
+
device = get_torch_device()
|
|
439
|
+
else:
|
|
440
|
+
raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
|
|
435
441
|
dd_one_config_path = maybe_copy_config_to_cache(
|
|
436
442
|
get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
|
|
437
443
|
)
|
|
@@ -1,38 +1,38 @@
|
|
|
1
1
|
USE_LAYOUT: True
|
|
2
2
|
USE_TABLE_SEGMENTATION: True
|
|
3
3
|
TF:
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
4
|
+
LAYOUT:
|
|
5
|
+
WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
|
|
6
|
+
FILTER:
|
|
7
|
+
CELL:
|
|
8
|
+
WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
|
|
9
|
+
FILTER:
|
|
10
|
+
ITEM:
|
|
11
|
+
WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
|
|
12
|
+
FILTER:
|
|
13
13
|
PT:
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
14
|
+
LAYOUT:
|
|
15
|
+
WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
|
|
16
|
+
WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
|
|
17
|
+
FILTER:
|
|
18
|
+
PAD:
|
|
19
|
+
TOP: 60
|
|
20
|
+
RIGHT: 60
|
|
21
|
+
BOTTOM: 60
|
|
22
|
+
LEFT: 60
|
|
23
|
+
ITEM:
|
|
24
|
+
WEIGHTS: item/d2_model_1639999_item_inf_only.pt
|
|
25
|
+
WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
|
|
26
|
+
FILTER:
|
|
27
|
+
PAD:
|
|
28
|
+
TOP: 60
|
|
29
|
+
RIGHT: 60
|
|
30
|
+
BOTTOM: 60
|
|
31
|
+
LEFT: 60
|
|
32
|
+
CELL:
|
|
33
|
+
WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
|
|
34
|
+
WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
|
|
35
|
+
FILTER:
|
|
36
36
|
LAYOUT_NMS_PAIRS:
|
|
37
37
|
COMBINATIONS:
|
|
38
38
|
THRESHOLDS:
|
|
@@ -48,6 +48,9 @@ SEGMENTATION:
|
|
|
48
48
|
STRETCH_RULE: equal
|
|
49
49
|
USE_TABLE_REFINEMENT: True
|
|
50
50
|
USE_PDF_MINER: False
|
|
51
|
+
PDF_MINER:
|
|
52
|
+
X_TOLERANCE: 3
|
|
53
|
+
Y_TOLERANCE: 3
|
|
51
54
|
USE_OCR: True
|
|
52
55
|
OCR:
|
|
53
56
|
USE_TESSERACT: True
|
|
@@ -504,5 +504,6 @@ class ContainerAnnotation(CategoryAnnotation):
|
|
|
504
504
|
@classmethod
|
|
505
505
|
def from_dict(cls, **kwargs: JsonDict) -> "SummaryAnnotation":
|
|
506
506
|
container_ann = ann_from_dict(cls, **kwargs)
|
|
507
|
-
|
|
507
|
+
value = kwargs.get("value", "")
|
|
508
|
+
container_ann.value = value if isinstance(value, str) else list(value)
|
|
508
509
|
return container_ann
|
|
@@ -25,6 +25,7 @@ from typing import List, Optional, Sequence, no_type_check
|
|
|
25
25
|
|
|
26
26
|
import numpy as np
|
|
27
27
|
import numpy.typing as npt
|
|
28
|
+
from lazy_imports import try_import
|
|
28
29
|
from numpy import float32
|
|
29
30
|
|
|
30
31
|
from ..utils.detection_types import ImageType
|
|
@@ -32,7 +33,7 @@ from ..utils.error import BoundingBoxError
|
|
|
32
33
|
from ..utils.file_utils import cocotools_available
|
|
33
34
|
from ..utils.logger import LoggingRecord, logger
|
|
34
35
|
|
|
35
|
-
|
|
36
|
+
with try_import() as import_guard:
|
|
36
37
|
import pycocotools.mask as coco_mask
|
|
37
38
|
|
|
38
39
|
|
|
@@ -18,6 +18,8 @@
|
|
|
18
18
|
"""
|
|
19
19
|
Dataclass Image
|
|
20
20
|
"""
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
21
23
|
import json
|
|
22
24
|
from dataclasses import dataclass, field
|
|
23
25
|
from os import environ
|
|
@@ -202,7 +204,7 @@ class Image:
|
|
|
202
204
|
self._bbox = None
|
|
203
205
|
self.embeddings.pop(self.image_id)
|
|
204
206
|
|
|
205
|
-
def get_image(self) ->
|
|
207
|
+
def get_image(self) -> _Img: # type: ignore # pylint: disable=E0602
|
|
206
208
|
"""
|
|
207
209
|
Get the image either in base64 string representation or as np.array.
|
|
208
210
|
|
|
@@ -531,16 +533,20 @@ class Image:
|
|
|
531
533
|
)
|
|
532
534
|
ann.image.dump(sub_image)
|
|
533
535
|
|
|
534
|
-
def remove_image_from_lower_hierachy(self) -> None:
|
|
536
|
+
def remove_image_from_lower_hierachy(self, pixel_values_only: bool = False) -> None:
|
|
535
537
|
"""Will remove all images from image annotations."""
|
|
536
538
|
for ann in self.annotations:
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
539
|
+
if pixel_values_only:
|
|
540
|
+
if ann.image is not None:
|
|
541
|
+
ann.image.clear_image()
|
|
542
|
+
else:
|
|
543
|
+
absolute_bounding_box = ann.get_bounding_box(self.image_id)
|
|
544
|
+
ann.bounding_box = absolute_bounding_box
|
|
545
|
+
ann.image = None
|
|
540
546
|
|
|
541
547
|
@classmethod
|
|
542
548
|
@no_type_check
|
|
543
|
-
def from_dict(cls, **kwargs) ->
|
|
549
|
+
def from_dict(cls, **kwargs) -> Image:
|
|
544
550
|
"""
|
|
545
551
|
Create `Image` instance from dict.
|
|
546
552
|
|
|
@@ -571,7 +577,7 @@ class Image:
|
|
|
571
577
|
|
|
572
578
|
@classmethod
|
|
573
579
|
@no_type_check
|
|
574
|
-
def from_file(cls, file_path: str) ->
|
|
580
|
+
def from_file(cls, file_path: str) -> Image:
|
|
575
581
|
"""
|
|
576
582
|
Create `Image` instance from .json file.
|
|
577
583
|
|