deepdoctection 0.30__tar.gz → 0.32__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.30 → deepdoctection-0.32}/PKG-INFO +57 -73
- {deepdoctection-0.30 → deepdoctection-0.32}/README.md +15 -6
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/__init__.py +38 -29
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/analyzer/dd.py +36 -29
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/configs/conf_dd_one.yaml +34 -31
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/base.py +0 -19
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/custom.py +4 -3
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/custom_serialize.py +14 -5
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/parallel_map.py +12 -11
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/serialize.py +5 -4
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/annotation.py +35 -13
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/box.py +3 -5
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/convert.py +3 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/image.py +79 -36
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/view.py +152 -49
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/__init__.py +1 -4
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/adapter.py +6 -3
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/base.py +86 -11
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/dataflow_builder.py +1 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/info.py +4 -4
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/doclaynet.py +3 -2
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/fintabnet.py +2 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/funsd.py +2 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/iiitar13k.py +5 -2
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/layouttest.py +4 -8
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/publaynet.py +2 -2
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/pubtables1m.py +6 -3
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/pubtabnet.py +2 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/rvlcdip.py +2 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/xfund.py +2 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/__init__.py +1 -4
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/accmetric.py +1 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/base.py +5 -4
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/cocometric.py +2 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/eval.py +19 -15
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/tedsmetric.py +14 -11
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/tp_eval_callback.py +14 -7
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/__init__.py +2 -7
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/base.py +39 -13
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/d2detect.py +182 -90
- deepdoctection-0.32/deepdoctection/extern/deskew.py +82 -0
- deepdoctection-0.32/deepdoctection/extern/doctrocr.py +526 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/fastlang.py +49 -9
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/hfdetr.py +106 -55
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/hflayoutlm.py +441 -122
- deepdoctection-0.32/deepdoctection/extern/hflm.py +225 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/model.py +56 -47
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/pdftext.py +10 -5
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/pt/__init__.py +1 -3
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/pt/nms.py +6 -2
- deepdoctection-0.32/deepdoctection/extern/pt/ptutils.py +57 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tessocr.py +134 -22
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/texocr.py +6 -2
- deepdoctection-0.32/deepdoctection/extern/tp/tfutils.py +91 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpcompat.py +14 -11
- deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/__init__.py +20 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/common.py +7 -3
- deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +20 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/config/config.py +9 -6
- deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +20 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +17 -7
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +12 -6
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +9 -4
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +8 -5
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +16 -11
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +17 -10
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +14 -8
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +15 -10
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/predict.py +9 -4
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/preproc.py +8 -9
- {deepdoctection-0.30/tests/datapoint → deepdoctection-0.32/deepdoctection/extern/tp/tpfrcnn/utils}/__init__.py +4 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +10 -2
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tpdetect.py +54 -30
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/__init__.py +3 -8
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/d2struct.py +9 -7
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/hfstruct.py +7 -2
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/laylmstruct.py +164 -21
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/maputils.py +16 -3
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/misc.py +6 -3
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/prodigystruct.py +1 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/pubstruct.py +10 -10
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/tpstruct.py +3 -3
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/__init__.py +1 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/anngen.py +35 -8
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/base.py +53 -19
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/common.py +23 -13
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/concurrency.py +2 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/doctectionpipe.py +2 -2
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/language.py +3 -2
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/layout.py +6 -3
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/lm.py +34 -66
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/order.py +142 -35
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/refine.py +26 -24
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/segment.py +21 -16
- deepdoctection-0.30/deepdoctection/pipe/cell.py → deepdoctection-0.32/deepdoctection/pipe/sub_layout.py +30 -9
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/text.py +14 -8
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/transform.py +16 -9
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/__init__.py +6 -12
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/d2_frcnn_train.py +36 -28
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/hf_detr_train.py +26 -17
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/hf_layoutlm_train.py +133 -111
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/train/tp_frcnn_train.py +21 -19
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/__init__.py +3 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/concurrency.py +1 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/context.py +2 -2
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/env_info.py +41 -84
- deepdoctection-0.32/deepdoctection/utils/error.py +84 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/file_utils.py +4 -15
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/fs.py +7 -7
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/logger.py +1 -0
- deepdoctection-0.32/deepdoctection/utils/mocks.py +93 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/pdf_utils.py +5 -4
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/settings.py +6 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/transform.py +1 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/utils.py +0 -6
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/viz.py +48 -5
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/PKG-INFO +57 -73
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/SOURCES.txt +5 -90
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/requires.txt +40 -65
- {deepdoctection-0.30 → deepdoctection-0.32}/setup.cfg +7 -1
- {deepdoctection-0.30 → deepdoctection-0.32}/setup.py +23 -23
- {deepdoctection-0.30 → deepdoctection-0.32}/tests/test_utils.py +8 -0
- deepdoctection-0.30/deepdoctection/extern/deskew.py +0 -55
- deepdoctection-0.30/deepdoctection/extern/doctrocr.py +0 -344
- deepdoctection-0.30/deepdoctection/extern/pt/ptutils.py +0 -48
- deepdoctection-0.30/deepdoctection/extern/tp/tfutils.py +0 -57
- deepdoctection-0.30/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- deepdoctection-0.30/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- deepdoctection-0.30/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- deepdoctection-0.30/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- deepdoctection-0.30/tests/__init__.py +0 -22
- deepdoctection-0.30/tests/analyzer/__init__.py +0 -16
- deepdoctection-0.30/tests/analyzer/test_dd.py +0 -202
- deepdoctection-0.30/tests/conftest.py +0 -499
- deepdoctection-0.30/tests/data.py +0 -1592
- deepdoctection-0.30/tests/dataflow/__init__.py +0 -16
- deepdoctection-0.30/tests/dataflow/conftest.py +0 -95
- deepdoctection-0.30/tests/dataflow/test_common.py +0 -219
- deepdoctection-0.30/tests/dataflow/test_custom.py +0 -60
- deepdoctection-0.30/tests/dataflow/test_custom_serialize.py +0 -177
- deepdoctection-0.30/tests/dataflow/test_parallel_map.py +0 -66
- deepdoctection-0.30/tests/dataflow/test_stats.py +0 -103
- deepdoctection-0.30/tests/datapoint/conftest.py +0 -262
- deepdoctection-0.30/tests/datapoint/test_annotation.py +0 -170
- deepdoctection-0.30/tests/datapoint/test_box.py +0 -416
- deepdoctection-0.30/tests/datapoint/test_convert.py +0 -52
- deepdoctection-0.30/tests/datapoint/test_image.py +0 -341
- deepdoctection-0.30/tests/datapoint/test_view.py +0 -150
- deepdoctection-0.30/tests/datasets/__init__.py +0 -16
- deepdoctection-0.30/tests/datasets/instances/__init__.py +0 -16
- deepdoctection-0.30/tests/datasets/instances/conftest.py +0 -35
- deepdoctection-0.30/tests/datasets/instances/test_doclaynet.py +0 -43
- deepdoctection-0.30/tests/datasets/instances/test_fintabnet.py +0 -70
- deepdoctection-0.30/tests/datasets/instances/test_funsd.py +0 -58
- deepdoctection-0.30/tests/datasets/instances/test_iiitar13k.py +0 -42
- deepdoctection-0.30/tests/datasets/instances/test_layouttest.py +0 -63
- deepdoctection-0.30/tests/datasets/instances/test_publaynet.py +0 -64
- deepdoctection-0.30/tests/datasets/instances/test_pubtables1m.py +0 -66
- deepdoctection-0.30/tests/datasets/instances/test_pubtabnet.py +0 -65
- deepdoctection-0.30/tests/datasets/instances/test_rvlcdip.py +0 -46
- deepdoctection-0.30/tests/datasets/test_adapter.py +0 -77
- deepdoctection-0.30/tests/datasets/test_info.py +0 -273
- deepdoctection-0.30/tests/datasets/test_registry.py +0 -75
- deepdoctection-0.30/tests/eval/__init__.py +0 -16
- deepdoctection-0.30/tests/eval/conftest.py +0 -107
- deepdoctection-0.30/tests/eval/test_accmetric.py +0 -364
- deepdoctection-0.30/tests/eval/test_cocometric.py +0 -123
- deepdoctection-0.30/tests/eval/test_eval.py +0 -86
- deepdoctection-0.30/tests/eval/test_registry.py +0 -84
- deepdoctection-0.30/tests/eval/test_tedsmetric.py +0 -40
- deepdoctection-0.30/tests/extern/__init__.py +0 -0
- deepdoctection-0.30/tests/extern/conftest.py +0 -99
- deepdoctection-0.30/tests/extern/data.py +0 -100
- deepdoctection-0.30/tests/extern/test_deskew.py +0 -57
- deepdoctection-0.30/tests/extern/test_doctrocr.py +0 -146
- deepdoctection-0.30/tests/extern/test_fastlang.py +0 -64
- deepdoctection-0.30/tests/extern/test_hfdetr.py +0 -116
- deepdoctection-0.30/tests/extern/test_hflayoutlm.py +0 -492
- deepdoctection-0.30/tests/extern/test_pdftext.py +0 -70
- deepdoctection-0.30/tests/extern/test_tessocr.py +0 -105
- deepdoctection-0.30/tests/extern/test_texocr.py +0 -52
- deepdoctection-0.30/tests/extern/test_tpdetect.py +0 -123
- deepdoctection-0.30/tests/mapper/__init__.py +0 -16
- deepdoctection-0.30/tests/mapper/conftest.py +0 -297
- deepdoctection-0.30/tests/mapper/data.py +0 -2182
- deepdoctection-0.30/tests/mapper/test_cats.py +0 -305
- deepdoctection-0.30/tests/mapper/test_cocostruct.py +0 -91
- deepdoctection-0.30/tests/mapper/test_d2struct.py +0 -56
- deepdoctection-0.30/tests/mapper/test_hfstruct.py +0 -59
- deepdoctection-0.30/tests/mapper/test_iiitar13k.py +0 -64
- deepdoctection-0.30/tests/mapper/test_laylmstruct.py +0 -141
- deepdoctection-0.30/tests/mapper/test_misc.py +0 -72
- deepdoctection-0.30/tests/mapper/test_prodigystruct.py +0 -78
- deepdoctection-0.30/tests/mapper/test_pubstruct.py +0 -170
- deepdoctection-0.30/tests/mapper/test_tpstruct.py +0 -51
- deepdoctection-0.30/tests/mapper/test_utils.py +0 -83
- deepdoctection-0.30/tests/mapper/test_xfundstruct.py +0 -68
- deepdoctection-0.30/tests/pipe/__init__.py +0 -16
- deepdoctection-0.30/tests/pipe/test_anngen.py +0 -179
- deepdoctection-0.30/tests/pipe/test_cell.py +0 -123
- deepdoctection-0.30/tests/pipe/test_common.py +0 -107
- deepdoctection-0.30/tests/pipe/test_language.py +0 -76
- deepdoctection-0.30/tests/pipe/test_layout.py +0 -65
- deepdoctection-0.30/tests/pipe/test_lm.py +0 -119
- deepdoctection-0.30/tests/pipe/test_order.py +0 -197
- deepdoctection-0.30/tests/pipe/test_refine.py +0 -325
- deepdoctection-0.30/tests/pipe/test_registry.py +0 -58
- deepdoctection-0.30/tests/pipe/test_segment.py +0 -392
- deepdoctection-0.30/tests/pipe/test_text.py +0 -204
- deepdoctection-0.30/tests/pipe/test_transform.py +0 -63
- deepdoctection-0.30/tests/train/__init__.py +0 -16
- deepdoctection-0.30/tests/train/conftest.py +0 -118
- deepdoctection-0.30/tests/train/test_d2_frcnn_train.py +0 -64
- deepdoctection-0.30/tests/train/test_tp_frcnn_train.py +0 -99
- deepdoctection-0.30/tests_d2/__init__.py +0 -20
- deepdoctection-0.30/tests_d2/conftest.py +0 -56
- deepdoctection-0.30/tests_d2/test_d2detect.py +0 -95
- {deepdoctection-0.30 → deepdoctection-0.32}/LICENSE +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/analyzer/__init__.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/match.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/detection_types.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/metacfg.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.30 → deepdoctection-0.32}/deepdoctection.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.30
|
|
3
|
+
Version: 0.32
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -9,94 +9,96 @@ Classifier: Development Status :: 4 - Beta
|
|
|
9
9
|
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
10
|
Classifier: Natural Language :: English
|
|
11
11
|
Classifier: Operating System :: POSIX :: Linux
|
|
12
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
13
12
|
Classifier: Programming Language :: Python :: 3.9
|
|
14
13
|
Classifier: Programming Language :: Python :: 3.10
|
|
15
14
|
Classifier: Programming Language :: Python :: 3.11
|
|
16
15
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
-
Requires-Python: >=3.8
|
|
16
|
+
Requires-Python: >=3.9
|
|
18
17
|
Description-Content-Type: text/markdown
|
|
19
18
|
License-File: LICENSE
|
|
20
|
-
Requires-Dist: catalogue==2.0.7
|
|
19
|
+
Requires-Dist: catalogue==2.0.10
|
|
21
20
|
Requires-Dist: huggingface_hub>=0.12.0
|
|
22
|
-
Requires-Dist: importlib-metadata>=4.11.2
|
|
21
|
+
Requires-Dist: importlib-metadata>=5.0.0
|
|
23
22
|
Requires-Dist: jsonlines==3.1.0
|
|
23
|
+
Requires-Dist: lazy-imports==0.3.1
|
|
24
24
|
Requires-Dist: mock==4.0.3
|
|
25
25
|
Requires-Dist: networkx>=2.7.1
|
|
26
26
|
Requires-Dist: numpy>=1.21
|
|
27
27
|
Requires-Dist: packaging>=20.0
|
|
28
28
|
Requires-Dist: Pillow>=10.0.0
|
|
29
29
|
Requires-Dist: pypdf>=3.16.0
|
|
30
|
-
Requires-Dist: pyyaml==6.0
|
|
30
|
+
Requires-Dist: pyyaml>=6.0.1
|
|
31
31
|
Requires-Dist: pyzmq>=16
|
|
32
32
|
Requires-Dist: termcolor>=1.1
|
|
33
33
|
Requires-Dist: tabulate>=0.7.7
|
|
34
34
|
Requires-Dist: tqdm==4.64.0
|
|
35
35
|
Provides-Extra: tf
|
|
36
|
-
Requires-Dist: catalogue==2.0.7; extra == "tf"
|
|
36
|
+
Requires-Dist: catalogue==2.0.10; extra == "tf"
|
|
37
37
|
Requires-Dist: huggingface_hub>=0.12.0; extra == "tf"
|
|
38
|
-
Requires-Dist: importlib-metadata>=4.11.2; extra == "tf"
|
|
38
|
+
Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
|
|
39
39
|
Requires-Dist: jsonlines==3.1.0; extra == "tf"
|
|
40
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "tf"
|
|
40
41
|
Requires-Dist: mock==4.0.3; extra == "tf"
|
|
41
42
|
Requires-Dist: networkx>=2.7.1; extra == "tf"
|
|
42
43
|
Requires-Dist: numpy>=1.21; extra == "tf"
|
|
43
44
|
Requires-Dist: packaging>=20.0; extra == "tf"
|
|
44
45
|
Requires-Dist: Pillow>=10.0.0; extra == "tf"
|
|
45
46
|
Requires-Dist: pypdf>=3.16.0; extra == "tf"
|
|
46
|
-
Requires-Dist: pyyaml==6.0; extra == "tf"
|
|
47
|
+
Requires-Dist: pyyaml>=6.0.1; extra == "tf"
|
|
47
48
|
Requires-Dist: pyzmq>=16; extra == "tf"
|
|
48
49
|
Requires-Dist: termcolor>=1.1; extra == "tf"
|
|
49
50
|
Requires-Dist: tabulate>=0.7.7; extra == "tf"
|
|
50
51
|
Requires-Dist: tqdm==4.64.0; extra == "tf"
|
|
51
|
-
Requires-Dist: tensorpack; extra == "tf"
|
|
52
|
+
Requires-Dist: tensorpack==0.11; extra == "tf"
|
|
52
53
|
Requires-Dist: protobuf==3.20.1; extra == "tf"
|
|
53
54
|
Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
|
|
54
55
|
Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
|
|
55
|
-
Requires-Dist: python-doctr==0.7.0; extra == "tf"
|
|
56
|
+
Requires-Dist: python-doctr==0.8.1; extra == "tf"
|
|
56
57
|
Requires-Dist: pycocotools>=2.0.2; extra == "tf"
|
|
57
|
-
Requires-Dist: boto3; extra == "tf"
|
|
58
|
-
Requires-Dist: pdfplumber>=0.7.1; extra == "tf"
|
|
59
|
-
Requires-Dist: fasttext; extra == "tf"
|
|
60
|
-
Requires-Dist: jdeskew; extra == "tf"
|
|
58
|
+
Requires-Dist: boto3==1.34.102; extra == "tf"
|
|
59
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
|
|
60
|
+
Requires-Dist: fasttext==0.9.2; extra == "tf"
|
|
61
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "tf"
|
|
61
62
|
Requires-Dist: apted==1.0.3; extra == "tf"
|
|
62
63
|
Requires-Dist: distance==0.1.3; extra == "tf"
|
|
63
64
|
Requires-Dist: lxml>=4.9.1; extra == "tf"
|
|
64
65
|
Provides-Extra: pt
|
|
65
|
-
Requires-Dist: catalogue==2.0.7; extra == "pt"
|
|
66
|
+
Requires-Dist: catalogue==2.0.10; extra == "pt"
|
|
66
67
|
Requires-Dist: huggingface_hub>=0.12.0; extra == "pt"
|
|
67
|
-
Requires-Dist: importlib-metadata>=4.11.2; extra == "pt"
|
|
68
|
+
Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
|
|
68
69
|
Requires-Dist: jsonlines==3.1.0; extra == "pt"
|
|
70
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "pt"
|
|
69
71
|
Requires-Dist: mock==4.0.3; extra == "pt"
|
|
70
72
|
Requires-Dist: networkx>=2.7.1; extra == "pt"
|
|
71
73
|
Requires-Dist: numpy>=1.21; extra == "pt"
|
|
72
74
|
Requires-Dist: packaging>=20.0; extra == "pt"
|
|
73
75
|
Requires-Dist: Pillow>=10.0.0; extra == "pt"
|
|
74
76
|
Requires-Dist: pypdf>=3.16.0; extra == "pt"
|
|
75
|
-
Requires-Dist: pyyaml==6.0; extra == "pt"
|
|
77
|
+
Requires-Dist: pyyaml>=6.0.1; extra == "pt"
|
|
76
78
|
Requires-Dist: pyzmq>=16; extra == "pt"
|
|
77
79
|
Requires-Dist: termcolor>=1.1; extra == "pt"
|
|
78
80
|
Requires-Dist: tabulate>=0.7.7; extra == "pt"
|
|
79
81
|
Requires-Dist: tqdm==4.64.0; extra == "pt"
|
|
80
|
-
Requires-Dist: timm; extra == "pt"
|
|
82
|
+
Requires-Dist: timm>=0.9.16; extra == "pt"
|
|
81
83
|
Requires-Dist: transformers>=4.36.0; extra == "pt"
|
|
82
|
-
Requires-Dist: accelerate; extra == "pt"
|
|
83
|
-
Requires-Dist: python-doctr==0.7.0; extra == "pt"
|
|
84
|
-
Requires-Dist: boto3; extra == "pt"
|
|
85
|
-
Requires-Dist: pdfplumber>=0.7.1; extra == "pt"
|
|
86
|
-
Requires-Dist: fasttext; extra == "pt"
|
|
87
|
-
Requires-Dist: jdeskew; extra == "pt"
|
|
84
|
+
Requires-Dist: accelerate>=0.29.1; extra == "pt"
|
|
85
|
+
Requires-Dist: python-doctr==0.8.1; extra == "pt"
|
|
86
|
+
Requires-Dist: boto3==1.34.102; extra == "pt"
|
|
87
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
|
|
88
|
+
Requires-Dist: fasttext==0.9.2; extra == "pt"
|
|
89
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "pt"
|
|
88
90
|
Requires-Dist: apted==1.0.3; extra == "pt"
|
|
89
91
|
Requires-Dist: distance==0.1.3; extra == "pt"
|
|
90
92
|
Requires-Dist: lxml>=4.9.1; extra == "pt"
|
|
91
93
|
Provides-Extra: docs
|
|
92
|
-
Requires-Dist: tensorpack; extra == "docs"
|
|
93
|
-
Requires-Dist: boto3; extra == "docs"
|
|
94
|
+
Requires-Dist: tensorpack==0.11; extra == "docs"
|
|
95
|
+
Requires-Dist: boto3==1.34.102; extra == "docs"
|
|
94
96
|
Requires-Dist: transformers>=4.36.0; extra == "docs"
|
|
95
|
-
Requires-Dist: accelerate; extra == "docs"
|
|
96
|
-
Requires-Dist: pdfplumber>=0.7.1; extra == "docs"
|
|
97
|
+
Requires-Dist: accelerate>=0.29.1; extra == "docs"
|
|
98
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
|
|
97
99
|
Requires-Dist: lxml>=4.9.1; extra == "docs"
|
|
98
|
-
Requires-Dist: lxml-stubs; extra == "docs"
|
|
99
|
-
Requires-Dist: jdeskew; extra == "docs"
|
|
100
|
+
Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
|
|
101
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "docs"
|
|
100
102
|
Requires-Dist: jinja2==3.0.3; extra == "docs"
|
|
101
103
|
Requires-Dist: mkdocs-material; extra == "docs"
|
|
102
104
|
Requires-Dist: mkdocstrings-python; extra == "docs"
|
|
@@ -105,47 +107,20 @@ Provides-Extra: dev
|
|
|
105
107
|
Requires-Dist: python-dotenv==1.0.0; extra == "dev"
|
|
106
108
|
Requires-Dist: click; extra == "dev"
|
|
107
109
|
Requires-Dist: black==23.7.0; extra == "dev"
|
|
108
|
-
Requires-Dist: isort; extra == "dev"
|
|
110
|
+
Requires-Dist: isort==5.13.2; extra == "dev"
|
|
109
111
|
Requires-Dist: pylint==2.17.4; extra == "dev"
|
|
110
112
|
Requires-Dist: mypy==1.4.1; extra == "dev"
|
|
111
113
|
Requires-Dist: wandb; extra == "dev"
|
|
112
|
-
Requires-Dist: types-PyYAML; extra == "dev"
|
|
113
|
-
Requires-Dist: types-termcolor; extra == "dev"
|
|
114
|
-
Requires-Dist: types-tabulate; extra == "dev"
|
|
115
|
-
Requires-Dist: types-tqdm; extra == "dev"
|
|
116
|
-
Requires-Dist: lxml-stubs; extra == "dev"
|
|
117
|
-
Requires-Dist: types-Pillow; extra == "dev"
|
|
118
|
-
Requires-Dist: types-urllib3; extra == "dev"
|
|
114
|
+
Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
|
|
115
|
+
Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
|
|
116
|
+
Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
|
|
117
|
+
Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
|
|
118
|
+
Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
|
|
119
|
+
Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
|
|
120
|
+
Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
|
|
119
121
|
Provides-Extra: test
|
|
120
|
-
Requires-Dist: pytest; extra == "test"
|
|
122
|
+
Requires-Dist: pytest==8.0.2; extra == "test"
|
|
121
123
|
Requires-Dist: pytest-cov; extra == "test"
|
|
122
|
-
Provides-Extra: hf
|
|
123
|
-
Requires-Dist: catalogue==2.0.7; extra == "hf"
|
|
124
|
-
Requires-Dist: huggingface_hub>=0.12.0; extra == "hf"
|
|
125
|
-
Requires-Dist: importlib-metadata>=4.11.2; extra == "hf"
|
|
126
|
-
Requires-Dist: jsonlines==3.1.0; extra == "hf"
|
|
127
|
-
Requires-Dist: mock==4.0.3; extra == "hf"
|
|
128
|
-
Requires-Dist: networkx>=2.7.1; extra == "hf"
|
|
129
|
-
Requires-Dist: numpy>=1.21; extra == "hf"
|
|
130
|
-
Requires-Dist: packaging>=20.0; extra == "hf"
|
|
131
|
-
Requires-Dist: Pillow>=10.0.0; extra == "hf"
|
|
132
|
-
Requires-Dist: pypdf>=3.16.0; extra == "hf"
|
|
133
|
-
Requires-Dist: pyyaml==6.0; extra == "hf"
|
|
134
|
-
Requires-Dist: pyzmq>=16; extra == "hf"
|
|
135
|
-
Requires-Dist: termcolor>=1.1; extra == "hf"
|
|
136
|
-
Requires-Dist: tabulate>=0.7.7; extra == "hf"
|
|
137
|
-
Requires-Dist: tqdm==4.64.0; extra == "hf"
|
|
138
|
-
Requires-Dist: timm; extra == "hf"
|
|
139
|
-
Requires-Dist: transformers>=4.36.0; extra == "hf"
|
|
140
|
-
Requires-Dist: accelerate; extra == "hf"
|
|
141
|
-
Requires-Dist: python-doctr==0.7.0; extra == "hf"
|
|
142
|
-
Requires-Dist: boto3; extra == "hf"
|
|
143
|
-
Requires-Dist: pdfplumber>=0.7.1; extra == "hf"
|
|
144
|
-
Requires-Dist: fasttext; extra == "hf"
|
|
145
|
-
Requires-Dist: jdeskew; extra == "hf"
|
|
146
|
-
Requires-Dist: apted==1.0.3; extra == "hf"
|
|
147
|
-
Requires-Dist: distance==0.1.3; extra == "hf"
|
|
148
|
-
Requires-Dist: lxml>=4.9.1; extra == "hf"
|
|
149
124
|
|
|
150
125
|
|
|
151
126
|
<p align="center">
|
|
@@ -180,7 +155,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
180
155
|
- Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
|
|
181
156
|
- Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
|
|
182
157
|
- Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
|
|
183
|
-
- Document and token classification with all LayoutLM models provided by the
|
|
158
|
+
- Document and token classification with all LayoutLM models provided by the
|
|
159
|
+
[**Transformer library**](https://github.com/huggingface/transformers).
|
|
184
160
|
(Yes, you can use any LayoutLM model with any of the provided OCR or pdfplumber tools straight away!).
|
|
185
161
|
- Table detection and table structure recognition with
|
|
186
162
|
[**table-transformer**](https://github.com/microsoft/table-transformer).
|
|
@@ -190,8 +166,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
190
166
|
- Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
|
|
191
167
|
Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
|
|
192
168
|
[docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more information.
|
|
193
|
-
- Document layout analysis and table recognition now runs with
|
|
194
|
-
|
|
169
|
+
- Document layout analysis and table recognition now runs with
|
|
170
|
+
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
171
|
+
anymore for basic inference.
|
|
172
|
+
- [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
173
|
+
(not contained in the built-in Analyzer).
|
|
174
|
+
- [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
175
|
+
[**transformers**](https://github.com/huggingface/transformers).
|
|
176
|
+
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
177
|
+
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
178
|
+
LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
|
|
195
179
|
|
|
196
180
|
**deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
|
|
197
181
|
post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
|
|
@@ -282,9 +266,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
|
|
|
282
266
|
separately.
|
|
283
267
|
|
|
284
268
|
- Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
|
|
285
|
-
- Python >= 3.
|
|
286
|
-
- 1.
|
|
287
|
-
|
|
269
|
+
- Python >= 3.9
|
|
270
|
+
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
271
|
+
In general, if you want to train or fine-tune models, a GPU is required.
|
|
288
272
|
- **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
|
|
289
273
|
images.
|
|
290
274
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
@@ -31,7 +31,8 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
31
31
|
- Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
|
|
32
32
|
- Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
|
|
33
33
|
- Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
|
|
34
|
-
- Document and token classification with all LayoutLM models provided by the
|
|
34
|
+
- Document and token classification with all LayoutLM models provided by the
|
|
35
|
+
[**Transformer library**](https://github.com/huggingface/transformers).
|
|
35
36
|
(Yes, you can use any LayoutLM model with any of the provided OCR or pdfplumber tools straight away!).
|
|
36
37
|
- Table detection and table structure recognition with
|
|
37
38
|
[**table-transformer**](https://github.com/microsoft/table-transformer).
|
|
@@ -41,8 +42,16 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
41
42
|
- Comprehensive configuration of **analyzer** like choosing different models, output parsing, OCR selection.
|
|
42
43
|
Check this [notebook](https://github.com/deepdoctection/notebooks/blob/main/Analyzer_Configuration.ipynb) or the
|
|
43
44
|
[docs](https://deepdoctection.readthedocs.io/en/latest/tutorials/analyzer_configuration_notebook/) for more information.
|
|
44
|
-
- Document layout analysis and table recognition now runs with
|
|
45
|
-
|
|
45
|
+
- Document layout analysis and table recognition now run with
|
|
46
|
+
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
47
|
+
anymore for basic inference.
|
|
48
|
+
- [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
49
|
+
(not contained in the built-in Analyzer).
|
|
50
|
+
- [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
51
|
+
[**transformers**](https://github.com/huggingface/transformers).
|
|
52
|
+
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
53
|
+
that look promising, especially if you want to train a model on non-English data. The training script for
|
|
54
|
+
LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
|
|
46
55
|
|
|
47
56
|
**deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
|
|
48
57
|
post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
|
|
@@ -133,9 +142,9 @@ Everything in the overview listed below the **deep**doctection layer are necessa
|
|
|
133
142
|
separately.
|
|
134
143
|
|
|
135
144
|
- Linux or macOS. (Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available)
|
|
136
|
-
- Python >= 3.
|
|
137
|
-
- 1.
|
|
138
|
-
|
|
145
|
+
- Python >= 3.9
|
|
146
|
+
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
147
|
+
In general, if you want to train or fine-tune models, a GPU is required.
|
|
139
148
|
- **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
|
|
140
149
|
images.
|
|
141
150
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
@@ -19,15 +19,13 @@ import os
|
|
|
19
19
|
import sys
|
|
20
20
|
from typing import TYPE_CHECKING
|
|
21
21
|
|
|
22
|
-
from
|
|
23
|
-
|
|
24
|
-
from .utils.env_info import auto_select_lib_and_device
|
|
22
|
+
from .utils.env_info import collect_env_info
|
|
25
23
|
from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
|
|
26
|
-
from .utils.logger import logger
|
|
24
|
+
from .utils.logger import LoggingRecord, logger
|
|
27
25
|
|
|
28
26
|
# pylint: enable=wrong-import-position
|
|
29
27
|
|
|
30
|
-
__version__ = 0.
|
|
28
|
+
__version__ = 0.32
|
|
31
29
|
|
|
32
30
|
_IMPORT_STRUCTURE = {
|
|
33
31
|
"analyzer": [
|
|
@@ -179,8 +177,10 @@ _IMPORT_STRUCTURE = {
|
|
|
179
177
|
"Jdeskewer",
|
|
180
178
|
"DoctrTextlineDetector",
|
|
181
179
|
"DoctrTextRecognizer",
|
|
180
|
+
"DocTrRotationTransformer",
|
|
182
181
|
"FasttextLangDetector",
|
|
183
182
|
"HFDetrDerivedDetector",
|
|
183
|
+
"get_tokenizer_from_architecture",
|
|
184
184
|
"HFLayoutLmTokenClassifierBase",
|
|
185
185
|
"HFLayoutLmTokenClassifier",
|
|
186
186
|
"HFLayoutLmv2TokenClassifier",
|
|
@@ -188,12 +188,16 @@ _IMPORT_STRUCTURE = {
|
|
|
188
188
|
"HFLayoutLmSequenceClassifier",
|
|
189
189
|
"HFLayoutLmv2SequenceClassifier",
|
|
190
190
|
"HFLayoutLmv3SequenceClassifier",
|
|
191
|
+
"HFLiltTokenClassifier",
|
|
192
|
+
"HFLiltSequenceClassifier",
|
|
193
|
+
"HFLmSequenceClassifier",
|
|
191
194
|
"ModelProfile",
|
|
192
195
|
"ModelCatalog",
|
|
193
196
|
"print_model_infos",
|
|
194
197
|
"ModelDownloadManager",
|
|
195
198
|
"PdfPlumberTextDetector",
|
|
196
199
|
"TesseractOcrDetector",
|
|
200
|
+
"TesseractRotationTransformer",
|
|
197
201
|
"TextractOcrDetector",
|
|
198
202
|
"TPFrcnnDetector",
|
|
199
203
|
],
|
|
@@ -266,11 +270,11 @@ _IMPORT_STRUCTURE = {
|
|
|
266
270
|
"DoctectionPipe",
|
|
267
271
|
"LanguageDetectionService",
|
|
268
272
|
"ImageLayoutService",
|
|
269
|
-
"get_tokenizer_from_architecture",
|
|
270
273
|
"LMTokenClassifierService",
|
|
271
274
|
"LMSequenceClassifierService",
|
|
272
275
|
"OrderGenerator",
|
|
273
276
|
"TextLineGenerator",
|
|
277
|
+
"TextLineService",
|
|
274
278
|
"TextOrderService",
|
|
275
279
|
"TableSegmentationRefinementService",
|
|
276
280
|
"generate_html_string",
|
|
@@ -279,7 +283,7 @@ _IMPORT_STRUCTURE = {
|
|
|
279
283
|
"PubtablesSegmentationService",
|
|
280
284
|
"SegmentationResult",
|
|
281
285
|
"TextExtractionService",
|
|
282
|
-
"
|
|
286
|
+
"SimpleTransformService",
|
|
283
287
|
],
|
|
284
288
|
"train": [
|
|
285
289
|
"D2Trainer",
|
|
@@ -295,14 +299,13 @@ _IMPORT_STRUCTURE = {
|
|
|
295
299
|
"save_tmp_file",
|
|
296
300
|
"timed_operation",
|
|
297
301
|
"collect_env_info",
|
|
298
|
-
"get_device",
|
|
299
|
-
"auto_select_lib_and_device",
|
|
300
302
|
"auto_select_viz_library",
|
|
301
303
|
"get_tensorflow_requirement",
|
|
302
304
|
"tf_addons_available",
|
|
303
305
|
"get_tf_addons_requirements",
|
|
304
306
|
"tensorpack_available",
|
|
305
307
|
"get_tensorpack_requirement",
|
|
308
|
+
"pytorch_available",
|
|
306
309
|
"get_pytorch_requirement",
|
|
307
310
|
"lxml_available",
|
|
308
311
|
"get_lxml_requirement",
|
|
@@ -416,25 +419,31 @@ _IMPORT_STRUCTURE = {
|
|
|
416
419
|
],
|
|
417
420
|
}
|
|
418
421
|
|
|
422
|
+
# Setting some environment variables so that standard functions can be invoked with available hardware
|
|
423
|
+
env_info = collect_env_info()
|
|
424
|
+
logger.debug(LoggingRecord(msg=env_info))
|
|
419
425
|
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
except Exception: # pylint: disable=W0703
|
|
434
|
-
pass
|
|
426
|
+
if os.environ.get("PYTORCH_AVAILABLE") and os.environ.get("DD_USE_TORCH") is None:
|
|
427
|
+
os.environ["DD_USE_TORCH"] = "1"
|
|
428
|
+
os.environ["USE_TORCH"] = "1"
|
|
429
|
+
if os.environ.get("TENSORFLOW_AVAILABLE") and os.environ.get("DD_USE_TF") is None:
|
|
430
|
+
os.environ["DD_USE_TF"] = "1"
|
|
431
|
+
os.environ["USE_TF"] = "1"
|
|
432
|
+
if os.environ.get("DD_USE_TORCH") and os.environ.get("DD_USE_TF"):
|
|
433
|
+
logger.warning(
|
|
434
|
+
"Both DD_USE_TORCH and DD_USE_TF are set. Defaulting to PyTorch. If you want a different "
|
|
435
|
+
"behaviour, set DD_USE_TORCH to None before importing deepdoctection."
|
|
436
|
+
)
|
|
437
|
+
os.environ.pop("DD_USE_TF")
|
|
438
|
+
os.environ.pop("USE_TF")
|
|
435
439
|
|
|
436
|
-
|
|
437
|
-
|
|
440
|
+
if not os.environ.get("PYTORCH_AVAILABLE") and not os.environ.get("TENSORFLOW_AVAILABLE"):
|
|
441
|
+
logger.warning(
|
|
442
|
+
LoggingRecord(
|
|
443
|
+
msg="Neither Tensorflow or Pytorch are available. You will not be able to use any Deep Learning "
|
|
444
|
+
"model from the library."
|
|
445
|
+
)
|
|
446
|
+
)
|
|
438
447
|
|
|
439
448
|
|
|
440
449
|
# Direct imports for type-checking
|
|
@@ -442,10 +451,10 @@ if TYPE_CHECKING:
|
|
|
442
451
|
from .analyzer import *
|
|
443
452
|
from .dataflow import *
|
|
444
453
|
from .datapoint import *
|
|
445
|
-
from .datasets import *
|
|
454
|
+
from .datasets import * # type: ignore
|
|
446
455
|
from .eval import *
|
|
447
|
-
from .extern import *
|
|
448
|
-
from .mapper import *
|
|
456
|
+
from .extern import * # type: ignore
|
|
457
|
+
from .mapper import * # type: ignore
|
|
449
458
|
from .pipe import *
|
|
450
459
|
from .train import *
|
|
451
460
|
from .utils import *
|
|
@@ -23,51 +23,43 @@ Module for **deep**doctection analyzer.
|
|
|
23
23
|
-user factory with a reduced config setting
|
|
24
24
|
"""
|
|
25
25
|
|
|
26
|
-
import ast
|
|
27
26
|
import os
|
|
28
27
|
from os import environ
|
|
29
28
|
from shutil import copyfile
|
|
30
29
|
from typing import List, Optional, Union
|
|
31
30
|
|
|
31
|
+
from lazy_imports import try_import
|
|
32
|
+
|
|
32
33
|
from ..extern.base import ObjectDetector
|
|
34
|
+
from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
|
|
33
35
|
from ..extern.doctrocr import DoctrTextlineDetector, DoctrTextRecognizer
|
|
36
|
+
from ..extern.hfdetr import HFDetrDerivedDetector
|
|
34
37
|
from ..extern.model import ModelCatalog, ModelDownloadManager
|
|
35
38
|
from ..extern.pdftext import PdfPlumberTextDetector
|
|
39
|
+
from ..extern.pt.ptutils import get_torch_device
|
|
36
40
|
from ..extern.tessocr import TesseractOcrDetector
|
|
37
41
|
from ..extern.texocr import TextractOcrDetector
|
|
42
|
+
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
43
|
+
from ..extern.tpdetect import TPFrcnnDetector
|
|
38
44
|
from ..pipe.base import PipelineComponent
|
|
39
|
-
from ..pipe.cell import DetectResultGenerator, SubImageLayoutService
|
|
40
45
|
from ..pipe.common import AnnotationNmsService, MatchingService, PageParsingService
|
|
41
46
|
from ..pipe.doctectionpipe import DoctectionPipe
|
|
42
47
|
from ..pipe.layout import ImageLayoutService
|
|
43
48
|
from ..pipe.order import TextOrderService
|
|
44
49
|
from ..pipe.refine import TableSegmentationRefinementService
|
|
45
50
|
from ..pipe.segment import PubtablesSegmentationService, TableSegmentationService
|
|
51
|
+
from ..pipe.sub_layout import DetectResultGenerator, SubImageLayoutService
|
|
46
52
|
from ..pipe.text import TextExtractionService
|
|
47
53
|
from ..utils.detection_types import Pathlike
|
|
48
|
-
from ..utils.
|
|
49
|
-
from ..utils.file_utils import
|
|
50
|
-
boto3_available,
|
|
51
|
-
detectron2_available,
|
|
52
|
-
pytorch_available,
|
|
53
|
-
tensorpack_available,
|
|
54
|
-
tf_available,
|
|
55
|
-
)
|
|
54
|
+
from ..utils.error import DependencyError
|
|
55
|
+
from ..utils.file_utils import detectron2_available, tensorpack_available
|
|
56
56
|
from ..utils.fs import get_configs_dir_path, get_package_path, mkdir_p
|
|
57
57
|
from ..utils.logger import LoggingRecord, logger
|
|
58
58
|
from ..utils.metacfg import AttrDict, set_config_by_yaml
|
|
59
59
|
from ..utils.settings import CellType, LayoutType
|
|
60
60
|
from ..utils.transform import PadTransform
|
|
61
61
|
|
|
62
|
-
|
|
63
|
-
from ..extern.tp.tfutils import disable_tp_layer_logging
|
|
64
|
-
from ..extern.tpdetect import TPFrcnnDetector
|
|
65
|
-
|
|
66
|
-
if pytorch_available():
|
|
67
|
-
from ..extern.d2detect import D2FrcnnDetector, D2FrcnnTracingDetector
|
|
68
|
-
from ..extern.hfdetr import HFDetrDerivedDetector
|
|
69
|
-
|
|
70
|
-
if boto3_available():
|
|
62
|
+
with try_import() as image_guard:
|
|
71
63
|
from botocore.config import Config # type: ignore
|
|
72
64
|
|
|
73
65
|
|
|
@@ -113,11 +105,12 @@ def config_sanity_checks(cfg: AttrDict) -> None:
|
|
|
113
105
|
"""Some config sanity checks"""
|
|
114
106
|
if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
|
|
115
107
|
raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")
|
|
116
|
-
if cfg.
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
108
|
+
if cfg.USE_OCR:
|
|
109
|
+
if cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT != 1:
|
|
110
|
+
raise ValueError(
|
|
111
|
+
"Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
|
|
112
|
+
"and set the other two to False. Only one OCR system can be activated."
|
|
113
|
+
)
|
|
121
114
|
|
|
122
115
|
|
|
123
116
|
def build_detector(
|
|
@@ -343,11 +336,20 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
343
336
|
pipe_component_list.append(table_segmentation)
|
|
344
337
|
|
|
345
338
|
if cfg.USE_TABLE_REFINEMENT:
|
|
346
|
-
table_segmentation_refinement = TableSegmentationRefinementService(
|
|
339
|
+
table_segmentation_refinement = TableSegmentationRefinementService(
|
|
340
|
+
[LayoutType.table, LayoutType.table_rotated],
|
|
341
|
+
[
|
|
342
|
+
LayoutType.cell,
|
|
343
|
+
CellType.column_header,
|
|
344
|
+
CellType.projected_row_header,
|
|
345
|
+
CellType.spanning,
|
|
346
|
+
CellType.row_header,
|
|
347
|
+
],
|
|
348
|
+
)
|
|
347
349
|
pipe_component_list.append(table_segmentation_refinement)
|
|
348
350
|
|
|
349
351
|
if cfg.USE_PDF_MINER:
|
|
350
|
-
pdf_text = PdfPlumberTextDetector()
|
|
352
|
+
pdf_text = PdfPlumberTextDetector(x_tolerance=cfg.PDF_MINER.X_TOLERANCE, y_tolerance=cfg.PDF_MINER.Y_TOLERANCE)
|
|
351
353
|
d_text = TextExtractionService(pdf_text)
|
|
352
354
|
pipe_component_list.append(d_text)
|
|
353
355
|
|
|
@@ -400,7 +402,7 @@ def build_analyzer(cfg: AttrDict) -> DoctectionPipe:
|
|
|
400
402
|
|
|
401
403
|
|
|
402
404
|
def get_dd_analyzer(
|
|
403
|
-
reset_config_file: bool =
|
|
405
|
+
reset_config_file: bool = True,
|
|
404
406
|
config_overwrite: Optional[List[str]] = None,
|
|
405
407
|
path_config_file: Optional[Pathlike] = None,
|
|
406
408
|
) -> DoctectionPipe:
|
|
@@ -429,8 +431,13 @@ def get_dd_analyzer(
|
|
|
429
431
|
:return: A DoctectionPipe instance with given configs
|
|
430
432
|
"""
|
|
431
433
|
config_overwrite = [] if config_overwrite is None else config_overwrite
|
|
432
|
-
lib = "TF" if
|
|
433
|
-
|
|
434
|
+
lib = "TF" if os.environ.get("DD_USE_TF") else "PT"
|
|
435
|
+
if lib == "TF":
|
|
436
|
+
device = get_tf_device()
|
|
437
|
+
elif lib == "PT":
|
|
438
|
+
device = get_torch_device()
|
|
439
|
+
else:
|
|
440
|
+
raise DependencyError("At least one of the env variables DD_USE_TF or DD_USE_TORCH must be set.")
|
|
434
441
|
dd_one_config_path = maybe_copy_config_to_cache(
|
|
435
442
|
get_package_path(), get_configs_dir_path(), _DD_ONE, reset_config_file
|
|
436
443
|
)
|
|
@@ -1,38 +1,38 @@
|
|
|
1
1
|
USE_LAYOUT: True
|
|
2
2
|
USE_TABLE_SEGMENTATION: True
|
|
3
3
|
TF:
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
4
|
+
LAYOUT:
|
|
5
|
+
WEIGHTS: layout/model-800000_inf_only.data-00000-of-00001
|
|
6
|
+
FILTER:
|
|
7
|
+
CELL:
|
|
8
|
+
WEIGHTS: cell/model-1800000_inf_only.data-00000-of-00001
|
|
9
|
+
FILTER:
|
|
10
|
+
ITEM:
|
|
11
|
+
WEIGHTS: item/model-1620000_inf_only.data-00000-of-00001
|
|
12
|
+
FILTER:
|
|
13
13
|
PT:
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
14
|
+
LAYOUT:
|
|
15
|
+
WEIGHTS: layout/d2_model_0829999_layout_inf_only.pt
|
|
16
|
+
WEIGHTS_TS: layout/d2_model_0829999_layout_inf_only.ts
|
|
17
|
+
FILTER:
|
|
18
|
+
PAD:
|
|
19
|
+
TOP: 60
|
|
20
|
+
RIGHT: 60
|
|
21
|
+
BOTTOM: 60
|
|
22
|
+
LEFT: 60
|
|
23
|
+
ITEM:
|
|
24
|
+
WEIGHTS: item/d2_model_1639999_item_inf_only.pt
|
|
25
|
+
WEIGHTS_TS: item/d2_model_1639999_item_inf_only.ts
|
|
26
|
+
FILTER:
|
|
27
|
+
PAD:
|
|
28
|
+
TOP: 60
|
|
29
|
+
RIGHT: 60
|
|
30
|
+
BOTTOM: 60
|
|
31
|
+
LEFT: 60
|
|
32
|
+
CELL:
|
|
33
|
+
WEIGHTS: cell/d2_model_1849999_cell_inf_only.pt
|
|
34
|
+
WEIGHTS_TS: cell/d2_model_1849999_cell_inf_only.ts
|
|
35
|
+
FILTER:
|
|
36
36
|
LAYOUT_NMS_PAIRS:
|
|
37
37
|
COMBINATIONS:
|
|
38
38
|
THRESHOLDS:
|
|
@@ -48,6 +48,9 @@ SEGMENTATION:
|
|
|
48
48
|
STRETCH_RULE: equal
|
|
49
49
|
USE_TABLE_REFINEMENT: True
|
|
50
50
|
USE_PDF_MINER: False
|
|
51
|
+
PDF_MINER:
|
|
52
|
+
X_TOLERANCE: 3
|
|
53
|
+
Y_TOLERANCE: 3
|
|
51
54
|
USE_OCR: True
|
|
52
55
|
OCR:
|
|
53
56
|
USE_TESSERACT: True
|
|
@@ -17,25 +17,6 @@ from typing import Any, Iterator, no_type_check
|
|
|
17
17
|
from ..utils.utils import get_rng
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
class DataFlowTerminated(BaseException):
|
|
21
|
-
"""
|
|
22
|
-
An exception indicating that the DataFlow is unable to produce any more
|
|
23
|
-
data, i.e. something wrong happened so that calling `__iter__`
|
|
24
|
-
cannot give a valid iterator anymore.
|
|
25
|
-
In most DataFlow this will never be raised.
|
|
26
|
-
"""
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class DataFlowResetStateNotCalled(BaseException):
|
|
30
|
-
"""
|
|
31
|
-
An exception indicating that `reset_state()` has not been called before starting
|
|
32
|
-
iteration.
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
def __init__(self) -> None:
|
|
36
|
-
super().__init__("Iterating a dataflow requires .reset_state() to be called first")
|
|
37
|
-
|
|
38
|
-
|
|
39
20
|
class DataFlowReentrantGuard:
|
|
40
21
|
"""
|
|
41
22
|
A tool to enforce non-reentrancy.
|