deepdoctection 0.42.1__py3-none-any.whl → 0.43.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection/__init__.py +4 -2
- deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection/analyzer/config.py +919 -0
- deepdoctection/analyzer/dd.py +36 -62
- deepdoctection/analyzer/factory.py +311 -141
- deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection/dataflow/__init__.py +9 -6
- deepdoctection/dataflow/base.py +33 -15
- deepdoctection/dataflow/common.py +96 -75
- deepdoctection/dataflow/custom.py +36 -29
- deepdoctection/dataflow/custom_serialize.py +135 -91
- deepdoctection/dataflow/parallel_map.py +33 -31
- deepdoctection/dataflow/serialize.py +15 -10
- deepdoctection/dataflow/stats.py +41 -28
- deepdoctection/datapoint/__init__.py +4 -6
- deepdoctection/datapoint/annotation.py +104 -66
- deepdoctection/datapoint/box.py +190 -130
- deepdoctection/datapoint/convert.py +66 -39
- deepdoctection/datapoint/image.py +151 -95
- deepdoctection/datapoint/view.py +383 -236
- deepdoctection/datasets/__init__.py +2 -6
- deepdoctection/datasets/adapter.py +11 -11
- deepdoctection/datasets/base.py +118 -81
- deepdoctection/datasets/dataflow_builder.py +18 -12
- deepdoctection/datasets/info.py +76 -57
- deepdoctection/datasets/instances/__init__.py +6 -2
- deepdoctection/datasets/instances/doclaynet.py +17 -14
- deepdoctection/datasets/instances/fintabnet.py +16 -22
- deepdoctection/datasets/instances/funsd.py +11 -6
- deepdoctection/datasets/instances/iiitar13k.py +9 -9
- deepdoctection/datasets/instances/layouttest.py +9 -9
- deepdoctection/datasets/instances/publaynet.py +9 -9
- deepdoctection/datasets/instances/pubtables1m.py +13 -13
- deepdoctection/datasets/instances/pubtabnet.py +13 -15
- deepdoctection/datasets/instances/rvlcdip.py +8 -8
- deepdoctection/datasets/instances/xfund.py +11 -9
- deepdoctection/datasets/registry.py +18 -11
- deepdoctection/datasets/save.py +12 -11
- deepdoctection/eval/__init__.py +3 -2
- deepdoctection/eval/accmetric.py +72 -52
- deepdoctection/eval/base.py +29 -10
- deepdoctection/eval/cocometric.py +14 -12
- deepdoctection/eval/eval.py +56 -41
- deepdoctection/eval/registry.py +6 -3
- deepdoctection/eval/tedsmetric.py +24 -9
- deepdoctection/eval/tp_eval_callback.py +13 -12
- deepdoctection/extern/__init__.py +1 -1
- deepdoctection/extern/base.py +176 -97
- deepdoctection/extern/d2detect.py +127 -92
- deepdoctection/extern/deskew.py +19 -10
- deepdoctection/extern/doctrocr.py +162 -108
- deepdoctection/extern/fastlang.py +25 -17
- deepdoctection/extern/hfdetr.py +137 -60
- deepdoctection/extern/hflayoutlm.py +329 -248
- deepdoctection/extern/hflm.py +67 -33
- deepdoctection/extern/model.py +108 -762
- deepdoctection/extern/pdftext.py +37 -12
- deepdoctection/extern/pt/nms.py +15 -1
- deepdoctection/extern/pt/ptutils.py +13 -9
- deepdoctection/extern/tessocr.py +87 -54
- deepdoctection/extern/texocr.py +29 -14
- deepdoctection/extern/tp/tfutils.py +36 -8
- deepdoctection/extern/tp/tpcompat.py +54 -16
- deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- deepdoctection/extern/tpdetect.py +4 -2
- deepdoctection/mapper/__init__.py +1 -1
- deepdoctection/mapper/cats.py +117 -76
- deepdoctection/mapper/cocostruct.py +35 -17
- deepdoctection/mapper/d2struct.py +56 -29
- deepdoctection/mapper/hfstruct.py +32 -19
- deepdoctection/mapper/laylmstruct.py +221 -185
- deepdoctection/mapper/maputils.py +71 -35
- deepdoctection/mapper/match.py +76 -62
- deepdoctection/mapper/misc.py +68 -44
- deepdoctection/mapper/pascalstruct.py +13 -12
- deepdoctection/mapper/prodigystruct.py +33 -19
- deepdoctection/mapper/pubstruct.py +42 -32
- deepdoctection/mapper/tpstruct.py +39 -19
- deepdoctection/mapper/xfundstruct.py +20 -13
- deepdoctection/pipe/__init__.py +1 -2
- deepdoctection/pipe/anngen.py +104 -62
- deepdoctection/pipe/base.py +226 -107
- deepdoctection/pipe/common.py +206 -123
- deepdoctection/pipe/concurrency.py +74 -47
- deepdoctection/pipe/doctectionpipe.py +108 -47
- deepdoctection/pipe/language.py +41 -24
- deepdoctection/pipe/layout.py +45 -18
- deepdoctection/pipe/lm.py +146 -78
- deepdoctection/pipe/order.py +205 -119
- deepdoctection/pipe/refine.py +111 -63
- deepdoctection/pipe/registry.py +1 -1
- deepdoctection/pipe/segment.py +213 -142
- deepdoctection/pipe/sub_layout.py +76 -46
- deepdoctection/pipe/text.py +52 -33
- deepdoctection/pipe/transform.py +8 -6
- deepdoctection/train/d2_frcnn_train.py +87 -69
- deepdoctection/train/hf_detr_train.py +72 -40
- deepdoctection/train/hf_layoutlm_train.py +85 -46
- deepdoctection/train/tp_frcnn_train.py +56 -28
- deepdoctection/utils/concurrency.py +59 -16
- deepdoctection/utils/context.py +40 -19
- deepdoctection/utils/develop.py +26 -17
- deepdoctection/utils/env_info.py +86 -37
- deepdoctection/utils/error.py +16 -10
- deepdoctection/utils/file_utils.py +246 -71
- deepdoctection/utils/fs.py +162 -43
- deepdoctection/utils/identifier.py +29 -16
- deepdoctection/utils/logger.py +49 -32
- deepdoctection/utils/metacfg.py +83 -21
- deepdoctection/utils/pdf_utils.py +119 -62
- deepdoctection/utils/settings.py +24 -10
- deepdoctection/utils/tqdm.py +10 -5
- deepdoctection/utils/transform.py +182 -46
- deepdoctection/utils/utils.py +61 -28
- deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43.1.dist-info/METADATA +376 -0
- deepdoctection-0.43.1.dist-info/RECORD +149 -0
- deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.1.dist-info/METADATA +0 -431
- deepdoctection-0.42.1.dist-info/RECORD +0 -148
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/WHEEL +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/licenses/LICENSE +0 -0
- {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.1.dist-info}/top_level.txt +0 -0
deepdoctection/analyzer/dd.py
CHANGED
|
@@ -15,12 +15,10 @@
|
|
|
15
15
|
# See the License for the specific language governing permissions and
|
|
16
16
|
# limitations under the License.
|
|
17
17
|
|
|
18
|
-
"""
|
|
19
|
-
Module for **deep**doctection analyzer.
|
|
20
|
-
|
|
21
|
-
-factory build_analyzer for a given config
|
|
22
18
|
|
|
23
|
-
|
|
19
|
+
"""
|
|
20
|
+
- factory `build_analyzer` for a given config
|
|
21
|
+
- user factory with a reduced config setting
|
|
24
22
|
"""
|
|
25
23
|
|
|
26
24
|
from __future__ import annotations
|
|
@@ -32,12 +30,12 @@ from ..extern.pt.ptutils import get_torch_device
|
|
|
32
30
|
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
33
31
|
from ..pipe.doctectionpipe import DoctectionPipe
|
|
34
32
|
from ..utils.env_info import ENV_VARS_TRUE
|
|
35
|
-
from ..utils.file_utils import
|
|
33
|
+
from ..utils.file_utils import detectron2_available, tensorpack_available
|
|
36
34
|
from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
|
|
37
35
|
from ..utils.logger import LoggingRecord, logger
|
|
38
36
|
from ..utils.metacfg import set_config_by_yaml
|
|
39
37
|
from ..utils.types import PathLikeOrStr
|
|
40
|
-
from ._config import cfg
|
|
38
|
+
from .config import cfg
|
|
41
39
|
from .factory import ServiceFactory
|
|
42
40
|
|
|
43
41
|
__all__ = [
|
|
@@ -47,32 +45,6 @@ __all__ = [
|
|
|
47
45
|
|
|
48
46
|
_DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
|
|
49
47
|
_TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
|
|
50
|
-
_MODEL_CHOICES = {
|
|
51
|
-
"layout": [
|
|
52
|
-
"layout/d2_model_0829999_layout_inf_only.pt",
|
|
53
|
-
"xrf_layout/model_final_inf_only.pt",
|
|
54
|
-
"microsoft/table-transformer-detection/pytorch_model.bin",
|
|
55
|
-
],
|
|
56
|
-
"segmentation": [
|
|
57
|
-
"item/model-1620000_inf_only.data-00000-of-00001",
|
|
58
|
-
"xrf_item/model_final_inf_only.pt",
|
|
59
|
-
"microsoft/table-transformer-structure-recognition/pytorch_model.bin",
|
|
60
|
-
"deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
|
|
61
|
-
],
|
|
62
|
-
"ocr": ["Tesseract", "DocTr", "Textract"],
|
|
63
|
-
"doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
|
|
64
|
-
"doctr_recognition": [
|
|
65
|
-
"doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
|
|
66
|
-
"doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
|
|
67
|
-
],
|
|
68
|
-
"llm": ["gpt-3.5-turbo", "gpt-4"],
|
|
69
|
-
"segmentation_choices": {
|
|
70
|
-
"item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
|
|
71
|
-
"xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
|
|
72
|
-
"microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
|
|
73
|
-
"deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
|
|
74
|
-
},
|
|
75
|
-
}
|
|
76
48
|
|
|
77
49
|
|
|
78
50
|
def config_sanity_checks() -> None:
|
|
@@ -89,32 +61,35 @@ def config_sanity_checks() -> None:
|
|
|
89
61
|
|
|
90
62
|
def get_dd_analyzer(
|
|
91
63
|
reset_config_file: bool = True,
|
|
64
|
+
load_default_config_file: bool = False,
|
|
92
65
|
config_overwrite: Optional[list[str]] = None,
|
|
93
66
|
path_config_file: Optional[PathLikeOrStr] = None,
|
|
94
67
|
) -> DoctectionPipe:
|
|
95
68
|
"""
|
|
96
69
|
Factory function for creating the built-in **deep**doctection analyzer.
|
|
97
70
|
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
71
|
+
Info:
|
|
72
|
+
The Standard Analyzer is a pipeline that comprises the following analysis components:
|
|
73
|
+
|
|
74
|
+
- Rotation
|
|
75
|
+
- Document layout analysis
|
|
76
|
+
- Table segmentation
|
|
77
|
+
- Text extraction/OCR
|
|
78
|
+
- Reading order
|
|
79
|
+
- Layout linking
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
reset_config_file: This will copy the `.yaml` file with default variables to the `.cache` and therefore
|
|
83
|
+
resetting all configurations if set to `True`.
|
|
84
|
+
load_default_config_file: This will load the default config file from the `.cache` directory if set to `True`.
|
|
85
|
+
If set to `False`, the config file will be ignored.
|
|
86
|
+
config_overwrite: Passing a list of string arguments and values to overwrite the `.yaml`
|
|
87
|
+
configuration with highest priority, e.g. `["USE_TABLE_SEGMENTATION=False", "USE_OCR=False",
|
|
88
|
+
"TF.LAYOUT.WEIGHTS=my_fancy_pytorch_model"]`.
|
|
89
|
+
path_config_file: Path to a custom config file. Can be outside of the `.cache` directory.
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
DoctectionPipe: A `DoctectionPipe` instance with given configs.
|
|
118
93
|
"""
|
|
119
94
|
config_overwrite = [] if config_overwrite is None else config_overwrite
|
|
120
95
|
if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE:
|
|
@@ -131,25 +106,24 @@ def get_dd_analyzer(
|
|
|
131
106
|
)
|
|
132
107
|
maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)
|
|
133
108
|
|
|
134
|
-
# Set up of the configuration and logging
|
|
135
|
-
file_cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
|
|
136
|
-
cfg.freeze(freezed=False)
|
|
137
|
-
cfg.overwrite_config(file_cfg)
|
|
138
|
-
|
|
139
109
|
cfg.freeze(freezed=False)
|
|
110
|
+
if load_default_config_file:
|
|
111
|
+
# Set up of the configuration and logging
|
|
112
|
+
file_cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
|
|
113
|
+
cfg.overwrite_config(file_cfg)
|
|
140
114
|
cfg.LANGUAGE = None
|
|
141
115
|
cfg.LIB = lib
|
|
142
116
|
cfg.DEVICE = device
|
|
143
117
|
if not detectron2_available() or cfg.PT.LAYOUT.WEIGHTS is None:
|
|
144
|
-
cfg.PT.ENFORCE_WEIGHTS.LAYOUT=False
|
|
118
|
+
cfg.PT.ENFORCE_WEIGHTS.LAYOUT = False
|
|
145
119
|
if not detectron2_available() or cfg.PT.ITEM.WEIGHTS is None:
|
|
146
|
-
cfg.PT.ENFORCE_WEIGHTS.ITEM=False
|
|
120
|
+
cfg.PT.ENFORCE_WEIGHTS.ITEM = False
|
|
147
121
|
if not detectron2_available() or cfg.PT.CELL.WEIGHTS is None:
|
|
148
|
-
cfg.PT.ENFORCE_WEIGHTS.CELL=False
|
|
149
|
-
cfg.freeze()
|
|
122
|
+
cfg.PT.ENFORCE_WEIGHTS.CELL = False
|
|
150
123
|
|
|
151
124
|
if config_overwrite:
|
|
152
125
|
cfg.update_args(config_overwrite)
|
|
126
|
+
cfg.freeze()
|
|
153
127
|
|
|
154
128
|
config_sanity_checks()
|
|
155
129
|
logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict())) # type: ignore
|