deepdoctection 0.33__tar.gz → 0.35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.33 → deepdoctection-0.35}/PKG-INFO +20 -11
- {deepdoctection-0.33 → deepdoctection-0.35}/README.md +10 -7
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/__init__.py +11 -12
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/analyzer/__init__.py +1 -0
- deepdoctection-0.35/deepdoctection/analyzer/_config.py +150 -0
- deepdoctection-0.35/deepdoctection/analyzer/dd.py +154 -0
- deepdoctection-0.35/deepdoctection/analyzer/factory.py +522 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/configs/conf_dd_one.yaml +1 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/annotation.py +41 -3
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/convert.py +6 -4
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/image.py +132 -46
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/view.py +2 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/base.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/fintabnet.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/xfund.py +29 -7
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/eval.py +7 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/model.py +2 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/pdftext.py +96 -5
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tessocr.py +1 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/cats.py +11 -13
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/cocostruct.py +6 -2
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/d2struct.py +2 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/laylmstruct.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/match.py +31 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/misc.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/prodigystruct.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/anngen.py +27 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/base.py +23 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/common.py +123 -38
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/segment.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/sub_layout.py +1 -1
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/env_info.py +31 -2
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/file_utils.py +19 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/fs.py +27 -4
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/metacfg.py +12 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/pdf_utils.py +114 -6
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/settings.py +3 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/PKG-INFO +20 -11
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/SOURCES.txt +2 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/requires.txt +9 -3
- {deepdoctection-0.33 → deepdoctection-0.35}/setup.cfg +7 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/setup.py +6 -2
- deepdoctection-0.33/deepdoctection/analyzer/dd.py +0 -470
- {deepdoctection-0.33 → deepdoctection-0.35}/LICENSE +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/box.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/base.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/fastlang.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/lm.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/order.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/refine.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/logger.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/viz.py +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.33 → deepdoctection-0.35}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.33
|
|
3
|
+
Version: 0.35
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
|
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
19
|
Requires-Dist: catalogue==2.0.10
|
|
20
|
-
Requires-Dist: huggingface_hub
|
|
20
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0
|
|
21
21
|
Requires-Dist: importlib-metadata>=5.0.0
|
|
22
22
|
Requires-Dist: jsonlines==3.1.0
|
|
23
23
|
Requires-Dist: lazy-imports==0.3.1
|
|
@@ -27,14 +27,16 @@ Requires-Dist: numpy<2.0,>=1.21
|
|
|
27
27
|
Requires-Dist: packaging>=20.0
|
|
28
28
|
Requires-Dist: Pillow>=10.0.0
|
|
29
29
|
Requires-Dist: pypdf>=3.16.0
|
|
30
|
+
Requires-Dist: pypdfium2>=4.30.0
|
|
30
31
|
Requires-Dist: pyyaml>=6.0.1
|
|
31
32
|
Requires-Dist: pyzmq>=16
|
|
33
|
+
Requires-Dist: scipy>=1.13.1
|
|
32
34
|
Requires-Dist: termcolor>=1.1
|
|
33
35
|
Requires-Dist: tabulate>=0.7.7
|
|
34
36
|
Requires-Dist: tqdm==4.64.0
|
|
35
37
|
Provides-Extra: tf
|
|
36
38
|
Requires-Dist: catalogue==2.0.10; extra == "tf"
|
|
37
|
-
Requires-Dist: huggingface_hub
|
|
39
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
|
|
38
40
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
|
|
39
41
|
Requires-Dist: jsonlines==3.1.0; extra == "tf"
|
|
40
42
|
Requires-Dist: lazy-imports==0.3.1; extra == "tf"
|
|
@@ -44,8 +46,10 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
|
|
|
44
46
|
Requires-Dist: packaging>=20.0; extra == "tf"
|
|
45
47
|
Requires-Dist: Pillow>=10.0.0; extra == "tf"
|
|
46
48
|
Requires-Dist: pypdf>=3.16.0; extra == "tf"
|
|
49
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
|
|
47
50
|
Requires-Dist: pyyaml>=6.0.1; extra == "tf"
|
|
48
51
|
Requires-Dist: pyzmq>=16; extra == "tf"
|
|
52
|
+
Requires-Dist: scipy>=1.13.1; extra == "tf"
|
|
49
53
|
Requires-Dist: termcolor>=1.1; extra == "tf"
|
|
50
54
|
Requires-Dist: tabulate>=0.7.7; extra == "tf"
|
|
51
55
|
Requires-Dist: tqdm==4.64.0; extra == "tf"
|
|
@@ -64,7 +68,7 @@ Requires-Dist: distance==0.1.3; extra == "tf"
|
|
|
64
68
|
Requires-Dist: lxml>=4.9.1; extra == "tf"
|
|
65
69
|
Provides-Extra: pt
|
|
66
70
|
Requires-Dist: catalogue==2.0.10; extra == "pt"
|
|
67
|
-
Requires-Dist: huggingface_hub
|
|
71
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
|
|
68
72
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
|
|
69
73
|
Requires-Dist: jsonlines==3.1.0; extra == "pt"
|
|
70
74
|
Requires-Dist: lazy-imports==0.3.1; extra == "pt"
|
|
@@ -74,8 +78,10 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
|
|
|
74
78
|
Requires-Dist: packaging>=20.0; extra == "pt"
|
|
75
79
|
Requires-Dist: Pillow>=10.0.0; extra == "pt"
|
|
76
80
|
Requires-Dist: pypdf>=3.16.0; extra == "pt"
|
|
81
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
|
|
77
82
|
Requires-Dist: pyyaml>=6.0.1; extra == "pt"
|
|
78
83
|
Requires-Dist: pyzmq>=16; extra == "pt"
|
|
84
|
+
Requires-Dist: scipy>=1.13.1; extra == "pt"
|
|
79
85
|
Requires-Dist: termcolor>=1.1; extra == "pt"
|
|
80
86
|
Requires-Dist: tabulate>=0.7.7; extra == "pt"
|
|
81
87
|
Requires-Dist: tqdm==4.64.0; extra == "pt"
|
|
@@ -169,9 +175,9 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
169
175
|
- Document layout analysis and table recognition now runs with
|
|
170
176
|
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
171
177
|
anymore for basic inference.
|
|
172
|
-
-
|
|
178
|
+
- More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
173
179
|
(not contained in the built-in Analyzer).
|
|
174
|
-
-
|
|
180
|
+
- Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
175
181
|
[**transformers**](https://github.com/huggingface/transformers).
|
|
176
182
|
We have added a model wrapper for token classification with LiLT and added a some LiLT models to the model catalog
|
|
177
183
|
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
@@ -260,7 +266,7 @@ documentation.
|
|
|
260
266
|
|
|
261
267
|
## Requirements
|
|
262
268
|
|
|
263
|
-

|
|
264
270
|
|
|
265
271
|
Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
|
|
266
272
|
separately.
|
|
@@ -269,13 +275,16 @@ separately.
|
|
|
269
275
|
- Python >= 3.9
|
|
270
276
|
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
271
277
|
In general, if you want to train or fine-tune models, a GPU is required.
|
|
272
|
-
|
|
273
|
-
images.
|
|
278
|
+
|
|
274
279
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
275
280
|
and [PyTorch](https://pytorch.org/get-started/locally/).
|
|
276
281
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
|
|
277
282
|
engine has to be installed separately.
|
|
278
283
|
|
|
284
|
+
|
|
285
|
+
- For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
|
|
286
|
+
documents into images. For release `v.0.35.0` this dependency will be optional.
|
|
287
|
+
|
|
279
288
|
The following overview shows the availability of the models in conjunction with the DL framework.
|
|
280
289
|
|
|
281
290
|
| Task | PyTorch | Torchscript | Tensorflow |
|
|
@@ -393,8 +402,8 @@ to develop this framework.
|
|
|
393
402
|
## Problems
|
|
394
403
|
|
|
395
404
|
We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
|
|
396
|
-
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every
|
|
397
|
-
to
|
|
405
|
+
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
|
|
406
|
+
to 12 weeks.
|
|
398
407
|
|
|
399
408
|
## If you like **deep**doctection ...
|
|
400
409
|
|
|
@@ -45,9 +45,9 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
45
45
|
- Document layout analysis and table recognition now runs with
|
|
46
46
|
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
47
47
|
anymore for basic inference.
|
|
48
|
-
-
|
|
48
|
+
- More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
49
49
|
(not contained in the built-in Analyzer).
|
|
50
|
-
-
|
|
50
|
+
- Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
51
51
|
[**transformers**](https://github.com/huggingface/transformers).
|
|
52
52
|
We have added a model wrapper for token classification with LiLT and added a some LiLT models to the model catalog
|
|
53
53
|
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
@@ -136,7 +136,7 @@ documentation.
|
|
|
136
136
|
|
|
137
137
|
## Requirements
|
|
138
138
|
|
|
139
|
-

|
|
140
140
|
|
|
141
141
|
Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
|
|
142
142
|
separately.
|
|
@@ -145,13 +145,16 @@ separately.
|
|
|
145
145
|
- Python >= 3.9
|
|
146
146
|
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
147
147
|
In general, if you want to train or fine-tune models, a GPU is required.
|
|
148
|
-
|
|
149
|
-
images.
|
|
148
|
+
|
|
150
149
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
151
150
|
and [PyTorch](https://pytorch.org/get-started/locally/).
|
|
152
151
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
|
|
153
152
|
engine has to be installed separately.
|
|
154
153
|
|
|
154
|
+
|
|
155
|
+
- For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
|
|
156
|
+
documents into images. For release `v.0.35.0` this dependency will be optional.
|
|
157
|
+
|
|
155
158
|
The following overview shows the availability of the models in conjunction with the DL framework.
|
|
156
159
|
|
|
157
160
|
| Task | PyTorch | Torchscript | Tensorflow |
|
|
@@ -269,8 +272,8 @@ to develop this framework.
|
|
|
269
272
|
## Problems
|
|
270
273
|
|
|
271
274
|
We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
|
|
272
|
-
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every
|
|
273
|
-
to
|
|
275
|
+
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
|
|
276
|
+
to 12 weeks.
|
|
274
277
|
|
|
275
278
|
## If you like **deep**doctection ...
|
|
276
279
|
|
|
@@ -15,30 +15,22 @@ if importlib.util.find_spec("dotenv") is not None:
|
|
|
15
15
|
|
|
16
16
|
|
|
17
17
|
# pylint: disable=wrong-import-position
|
|
18
|
-
import os
|
|
19
18
|
import sys
|
|
20
19
|
from typing import TYPE_CHECKING
|
|
21
20
|
|
|
22
|
-
from .utils.env_info import collect_env_info
|
|
21
|
+
from .utils.env_info import auto_select_pdf_render_framework, collect_env_info
|
|
23
22
|
from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
|
|
24
23
|
from .utils.logger import LoggingRecord, logger
|
|
25
24
|
|
|
26
25
|
# pylint: enable=wrong-import-position
|
|
27
26
|
|
|
28
|
-
__version__ = 0.33
|
|
27
|
+
__version__ = 0.35
|
|
29
28
|
|
|
30
29
|
_IMPORT_STRUCTURE = {
|
|
31
30
|
"analyzer": [
|
|
32
|
-
"maybe_copy_config_to_cache",
|
|
33
31
|
"config_sanity_checks",
|
|
34
|
-
"build_detector",
|
|
35
|
-
"build_padder",
|
|
36
|
-
"build_service",
|
|
37
|
-
"build_sub_image_service",
|
|
38
|
-
"build_ocr",
|
|
39
|
-
"build_doctr_word",
|
|
40
32
|
"get_dd_analyzer",
|
|
41
|
-
"
|
|
33
|
+
"ServiceFactory"
|
|
42
34
|
],
|
|
43
35
|
"configs": [],
|
|
44
36
|
"dataflow": [
|
|
@@ -76,6 +68,7 @@ _IMPORT_STRUCTURE = {
|
|
|
76
68
|
],
|
|
77
69
|
"datapoint": [
|
|
78
70
|
"ann_from_dict",
|
|
71
|
+
"AnnotationMap",
|
|
79
72
|
"Annotation",
|
|
80
73
|
"CategoryAnnotation",
|
|
81
74
|
"ImageAnnotation",
|
|
@@ -198,6 +191,7 @@ _IMPORT_STRUCTURE = {
|
|
|
198
191
|
"print_model_infos",
|
|
199
192
|
"ModelDownloadManager",
|
|
200
193
|
"PdfPlumberTextDetector",
|
|
194
|
+
"Pdfmium2TextDetector",
|
|
201
195
|
"TesseractOcrDetector",
|
|
202
196
|
"TesseractRotationTransformer",
|
|
203
197
|
"TextractOcrDetector",
|
|
@@ -237,6 +231,7 @@ _IMPORT_STRUCTURE = {
|
|
|
237
231
|
"LabelSummarizer",
|
|
238
232
|
"curry",
|
|
239
233
|
"match_anns_by_intersection",
|
|
234
|
+
"match_anns_by_distance",
|
|
240
235
|
"to_image",
|
|
241
236
|
"maybe_load_image",
|
|
242
237
|
"maybe_remove_image",
|
|
@@ -265,6 +260,8 @@ _IMPORT_STRUCTURE = {
|
|
|
265
260
|
"DetectResultGenerator",
|
|
266
261
|
"SubImageLayoutService",
|
|
267
262
|
"ImageCroppingService",
|
|
263
|
+
"IntersectionMatcher",
|
|
264
|
+
"NeighbourMatcher",
|
|
268
265
|
"MatchingService",
|
|
269
266
|
"PageParsingService",
|
|
270
267
|
"AnnotationNmsService",
|
|
@@ -302,6 +299,7 @@ _IMPORT_STRUCTURE = {
|
|
|
302
299
|
"timed_operation",
|
|
303
300
|
"collect_env_info",
|
|
304
301
|
"auto_select_viz_library",
|
|
302
|
+
"auto_select_pdf_render_framework",
|
|
305
303
|
"get_tensorflow_requirement",
|
|
306
304
|
"tf_addons_available",
|
|
307
305
|
"get_tf_addons_requirements",
|
|
@@ -364,6 +362,7 @@ _IMPORT_STRUCTURE = {
|
|
|
364
362
|
"get_configs_dir_path",
|
|
365
363
|
"get_weights_dir_path",
|
|
366
364
|
"get_dataset_dir_path",
|
|
365
|
+
"maybe_copy_config_to_cache",
|
|
367
366
|
"is_uuid_like",
|
|
368
367
|
"get_uuid_from_str",
|
|
369
368
|
"get_uuid",
|
|
@@ -424,7 +423,7 @@ _IMPORT_STRUCTURE = {
|
|
|
424
423
|
# Setting some environment variables so that standard functions can be invoked with available hardware
|
|
425
424
|
env_info = collect_env_info()
|
|
426
425
|
logger.debug(LoggingRecord(msg=env_info))
|
|
427
|
-
|
|
426
|
+
auto_select_pdf_render_framework()
|
|
428
427
|
|
|
429
428
|
# Direct imports for type-checking
|
|
430
429
|
if TYPE_CHECKING:
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: config.py
|
|
3
|
+
|
|
4
|
+
# Copyright 2024 Dr. Janis Meyer. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
"""Pipeline configuration for deepdoctection analyzer. Do not change the defaults in this file. """
|
|
19
|
+
|
|
20
|
+
from ..utils.metacfg import AttrDict
|
|
21
|
+
from ..utils.settings import CellType, LayoutType
|
|
22
|
+
|
|
23
|
+
cfg = AttrDict()
|
|
24
|
+
|
|
25
|
+
cfg.LANGUAGE = None
|
|
26
|
+
cfg.LIB = None
|
|
27
|
+
cfg.DEVICE = None
|
|
28
|
+
cfg.USE_ROTATOR = False
|
|
29
|
+
cfg.USE_LAYOUT = True
|
|
30
|
+
cfg.USE_TABLE_SEGMENTATION = True
|
|
31
|
+
|
|
32
|
+
cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
|
|
33
|
+
cfg.TF.LAYOUT.FILTER = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
|
|
37
|
+
cfg.TF.CELL.FILTER = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
|
|
41
|
+
cfg.TF.ITEM.FILTER = None
|
|
42
|
+
|
|
43
|
+
cfg.PT.LAYOUT.WEIGHTS = "layout/d2_model_0829999_layout_inf_only.pt"
|
|
44
|
+
cfg.PT.LAYOUT.WEIGHTS_TS = "layout/d2_model_0829999_layout_inf_only.ts"
|
|
45
|
+
cfg.PT.LAYOUT.FILTER = None
|
|
46
|
+
cfg.PT.LAYOUT.PAD.TOP = 60
|
|
47
|
+
cfg.PT.LAYOUT.PAD.RIGHT = 60
|
|
48
|
+
cfg.PT.LAYOUT.PAD.BOTTOM = 60
|
|
49
|
+
cfg.PT.LAYOUT.PAD.LEFT = 60
|
|
50
|
+
|
|
51
|
+
cfg.PT.ITEM.WEIGHTS = "item/d2_model_1639999_item_inf_only.pt"
|
|
52
|
+
cfg.PT.ITEM.WEIGHTS_TS = "item/d2_model_1639999_item_inf_only.ts"
|
|
53
|
+
cfg.PT.ITEM.FILTER = None
|
|
54
|
+
cfg.PT.ITEM.PAD.TOP = 60
|
|
55
|
+
cfg.PT.ITEM.PAD.RIGHT = 60
|
|
56
|
+
cfg.PT.ITEM.PAD.BOTTOM = 60
|
|
57
|
+
cfg.PT.ITEM.PAD.LEFT = 60
|
|
58
|
+
|
|
59
|
+
cfg.PT.CELL.WEIGHTS = "cell/d2_model_1849999_cell_inf_only.pt"
|
|
60
|
+
cfg.PT.CELL.WEIGHTS_TS = "cell/d2_model_1849999_cell_inf_only.ts"
|
|
61
|
+
cfg.PT.CELL.FILTER = None
|
|
62
|
+
|
|
63
|
+
cfg.USE_LAYOUT_NMS = False
|
|
64
|
+
cfg.LAYOUT_NMS_PAIRS.COMBINATIONS = None
|
|
65
|
+
cfg.LAYOUT_NMS_PAIRS.THRESHOLDS = None
|
|
66
|
+
cfg.LAYOUT_NMS_PAIRS.PRIORITY = None
|
|
67
|
+
|
|
68
|
+
cfg.SEGMENTATION.ASSIGNMENT_RULE = "ioa"
|
|
69
|
+
cfg.SEGMENTATION.THRESHOLD_ROWS = 0.4
|
|
70
|
+
cfg.SEGMENTATION.THRESHOLD_COLS = 0.4
|
|
71
|
+
cfg.SEGMENTATION.FULL_TABLE_TILING = True
|
|
72
|
+
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS = 0.001
|
|
73
|
+
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS = 0.001
|
|
74
|
+
cfg.SEGMENTATION.CELL_CATEGORY_ID = 12
|
|
75
|
+
cfg.SEGMENTATION.TABLE_NAME = LayoutType.TABLE
|
|
76
|
+
cfg.SEGMENTATION.PUBTABLES_CELL_NAMES = [
|
|
77
|
+
CellType.SPANNING,
|
|
78
|
+
CellType.ROW_HEADER,
|
|
79
|
+
CellType.COLUMN_HEADER,
|
|
80
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
81
|
+
LayoutType.CELL,
|
|
82
|
+
]
|
|
83
|
+
cfg.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES = [
|
|
84
|
+
CellType.SPANNING,
|
|
85
|
+
CellType.ROW_HEADER,
|
|
86
|
+
CellType.COLUMN_HEADER,
|
|
87
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
88
|
+
]
|
|
89
|
+
cfg.SEGMENTATION.PUBTABLES_ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
|
|
90
|
+
cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
|
|
91
|
+
cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
|
|
92
|
+
cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
|
|
93
|
+
cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
|
|
94
|
+
|
|
95
|
+
cfg.SEGMENTATION.STRETCH_RULE = "equal"
|
|
96
|
+
|
|
97
|
+
cfg.USE_TABLE_REFINEMENT = True
|
|
98
|
+
cfg.USE_PDF_MINER = False
|
|
99
|
+
|
|
100
|
+
cfg.PDF_MINER.X_TOLERANCE = 3
|
|
101
|
+
cfg.PDF_MINER.Y_TOLERANCE = 3
|
|
102
|
+
|
|
103
|
+
cfg.USE_OCR = True
|
|
104
|
+
|
|
105
|
+
cfg.OCR.USE_TESSERACT = True
|
|
106
|
+
cfg.OCR.USE_DOCTR = False
|
|
107
|
+
cfg.OCR.USE_TEXTRACT = False
|
|
108
|
+
cfg.OCR.CONFIG.TESSERACT = "dd/conf_tesseract.yaml"
|
|
109
|
+
|
|
110
|
+
cfg.OCR.WEIGHTS.DOCTR_WORD.TF = "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip"
|
|
111
|
+
cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
|
|
112
|
+
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
|
|
113
|
+
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
|
|
114
|
+
|
|
115
|
+
cfg.TEXT_CONTAINER = LayoutType.WORD
|
|
116
|
+
cfg.WORD_MATCHING.PARENTAL_CATEGORIES = [
|
|
117
|
+
LayoutType.TEXT,
|
|
118
|
+
LayoutType.TITLE,
|
|
119
|
+
LayoutType.LIST,
|
|
120
|
+
LayoutType.CELL,
|
|
121
|
+
CellType.COLUMN_HEADER,
|
|
122
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
123
|
+
CellType.SPANNING,
|
|
124
|
+
CellType.ROW_HEADER,
|
|
125
|
+
]
|
|
126
|
+
cfg.WORD_MATCHING.RULE = "ioa"
|
|
127
|
+
cfg.WORD_MATCHING.THRESHOLD = 0.6
|
|
128
|
+
cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
|
|
129
|
+
|
|
130
|
+
cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = [
|
|
131
|
+
LayoutType.TEXT,
|
|
132
|
+
LayoutType.TITLE,
|
|
133
|
+
LayoutType.LIST,
|
|
134
|
+
LayoutType.CELL,
|
|
135
|
+
CellType.COLUMN_HEADER,
|
|
136
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
137
|
+
CellType.SPANNING,
|
|
138
|
+
CellType.ROW_HEADER,
|
|
139
|
+
]
|
|
140
|
+
cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = [
|
|
141
|
+
LayoutType.TEXT,
|
|
142
|
+
LayoutType.TITLE,
|
|
143
|
+
LayoutType.LIST,
|
|
144
|
+
]
|
|
145
|
+
cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = False
|
|
146
|
+
cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
|
|
147
|
+
cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
|
|
148
|
+
cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
|
|
149
|
+
cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
|
|
150
|
+
cfg.freeze()
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: dd.py
|
|
3
|
+
|
|
4
|
+
# Copyright 2021 Dr. Janis Meyer. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
Module for **deep**doctection analyzer.
|
|
20
|
+
|
|
21
|
+
-factory build_analyzer for a given config
|
|
22
|
+
|
|
23
|
+
-user factory with a reduced config setting
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import os
|
|
29
|
+
from typing import Optional
|
|
30
|
+
|
|
31
|
+
from ..extern.pt.ptutils import get_torch_device
|
|
32
|
+
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
33
|
+
from ..pipe.doctectionpipe import DoctectionPipe
|
|
34
|
+
from ..utils.env_info import ENV_VARS_TRUE
|
|
35
|
+
from ..utils.error import DependencyError
|
|
36
|
+
from ..utils.file_utils import tensorpack_available
|
|
37
|
+
from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
|
|
38
|
+
from ..utils.logger import LoggingRecord, logger
|
|
39
|
+
from ..utils.metacfg import set_config_by_yaml
|
|
40
|
+
from ..utils.types import PathLikeOrStr
|
|
41
|
+
from ._config import cfg
|
|
42
|
+
from .factory import ServiceFactory
|
|
43
|
+
|
|
44
|
+
# Public API of this module.
__all__ = [
    "config_sanity_checks",
    "get_dd_analyzer",
]

# Config files handed to `maybe_copy_config_to_cache` in `get_dd_analyzer`, together with the package path.
_DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
_TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"
# Lists of model weights / engine names grouped by pipeline component.
# NOTE(review): nothing in this module reads `_MODEL_CHOICES`; presumably it is consumed by external tooling
# (e.g. a UI offering these choices) -- confirm before changing or removing entries.
_MODEL_CHOICES = {
    "layout": [
        "layout/d2_model_0829999_layout_inf_only.pt",
        "xrf_layout/model_final_inf_only.pt",
        "microsoft/table-transformer-detection/pytorch_model.bin",
    ],
    "segmentation": [
        "item/model-1620000_inf_only.data-00000-of-00001",
        "xrf_item/model_final_inf_only.pt",
        "microsoft/table-transformer-structure-recognition/pytorch_model.bin",
        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
    ],
    "ocr": ["Tesseract", "DocTr", "Textract"],
    "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
    "doctr_recognition": [
        "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
        "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
    ],
    "llm": ["gpt-3.5-turbo", "gpt-4"],
    # Keyed by the entries of "segmentation" above; the value is a second weights path or None.
    # NOTE(review): presumably the companion cell model that goes with the item model -- confirm.
    "segmentation_choices": {
        "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
        "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
        "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
    },
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def config_sanity_checks() -> None:
    """
    Validate the OCR related flags of the module level `cfg`.

    Two combinations are rejected:

    - `USE_PDF_MINER`, `USE_OCR` and `OCR.USE_DOCTR` all being set at the same time
    - `USE_OCR` being set while the flags `OCR.USE_TESSERACT`, `OCR.USE_DOCTR` and `OCR.USE_TEXTRACT` do not
      add up to exactly one

    :raises ValueError: If one of the combinations above is found
    """
    if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
        raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")

    # Without OCR there is no engine to choose, hence nothing else to verify.
    if not cfg.USE_OCR:
        return

    # The flags are added up, so exactly one enabled engine gives a total of 1.
    enabled_engines = cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT
    if enabled_engines == 1:
        return

    raise ValueError(
        "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
        "and set the other two to False. Only one OCR system can be activated."
    )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def get_dd_analyzer(
    reset_config_file: bool = True,
    config_overwrite: Optional[list[str]] = None,
    path_config_file: Optional[PathLikeOrStr] = None,
) -> DoctectionPipe:
    """
    Factory function for creating the built-in **deep**doctection analyzer.

    The Standard Analyzer is a pipeline that comprises the following analysis components:

    - Document layout analysis

    - Table segmentation

    - Text extraction/OCR

    - Reading order

    We refer to the various notebooks and docs for running an analyzer and changing the configs.

    The deep learning framework is selected from the environment variable `DD_USE_TF` only: if its value is one
    of `ENV_VARS_TRUE`, Tensorflow is used, otherwise (including when the variable is not set) PyTorch is used.

    Note, that this function mutates the module level `cfg`: it is overwritten with the values of the `.yaml`
    file, then `LANGUAGE`, `LIB` and `DEVICE` are set and finally `config_overwrite` is applied.

    :param reset_config_file: This will copy the `.yaml` file with default variables to the `.cache` and therefore
                              resetting all configurations if set to `True`.
    :param config_overwrite: Passing a list of string arguments and values to overwrite the `.yaml` configuration with
                             highest priority, e.g. ["USE_TABLE_SEGMENTATION=False",
                                                     "USE_OCR=False",
                                                     "TF.LAYOUT.WEIGHTS=my_fancy_pytorch_model"]
    :param path_config_file: Path to a custom config file. Can be outside of the .cache directory.
    :return: A DoctectionPipe instance with given configs
    :raises ValueError: If the final configuration does not pass `config_sanity_checks`
    """
    # Only two outcomes are possible here. The former third branch raising a `DependencyError` could never
    # be reached, because `lib` was always either "TF" or "PT".
    if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE:
        lib = "TF"
        device = get_tf_device()
    else:
        lib = "PT"
        device = get_torch_device()

    package_path = get_package_path()
    cache_config_dir = get_configs_dir_path() / "dd"
    dd_one_config_path = maybe_copy_config_to_cache(package_path, cache_config_dir, _DD_ONE, reset_config_file)
    maybe_copy_config_to_cache(package_path, cache_config_dir, _TESSERACT)

    # Set up of the configuration and logging. A custom config file takes precedence over the cached default.
    file_cfg = set_config_by_yaml(path_config_file or dd_one_config_path)
    cfg.freeze(freezed=False)
    cfg.overwrite_config(file_cfg)

    # NOTE(review): unfreezing a second time is kept from the previous implementation. Presumably
    # `overwrite_config` may leave the config frozen -- confirm before removing this call.
    cfg.freeze(freezed=False)
    cfg.LANGUAGE = None
    cfg.LIB = lib
    cfg.DEVICE = device
    cfg.freeze()

    # Passing `None` or an empty list leaves the configuration untouched.
    if config_overwrite:
        cfg.update_args(config_overwrite)

    config_sanity_checks()
    logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict()))  # type: ignore

    # will silent all TP logging while building the tower
    if tensorpack_available():
        disable_tp_layer_logging()

    return ServiceFactory.build_analyzer(cfg)
|