deepdoctection 0.34__tar.gz → 0.36__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.34 → deepdoctection-0.36}/PKG-INFO +23 -13
- {deepdoctection-0.34 → deepdoctection-0.36}/README.md +16 -9
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/__init__.py +7 -14
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/analyzer/__init__.py +1 -0
- deepdoctection-0.36/deepdoctection/analyzer/_config.py +142 -0
- deepdoctection-0.36/deepdoctection/analyzer/dd.py +154 -0
- deepdoctection-0.36/deepdoctection/analyzer/factory.py +718 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/configs/conf_dd_one.yaml +5 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/annotation.py +1 -1
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/convert.py +6 -4
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/image.py +16 -6
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/view.py +91 -15
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/cocometric.py +59 -13
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/pdftext.py +96 -5
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tessocr.py +1 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/match.py +4 -2
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/env_info.py +30 -1
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/file_utils.py +19 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/metacfg.py +12 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/pdf_utils.py +86 -3
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/utils.py +39 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/viz.py +16 -13
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/PKG-INFO +23 -13
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/SOURCES.txt +2 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/requires.txt +6 -3
- {deepdoctection-0.34 → deepdoctection-0.36}/setup.cfg +4 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/setup.py +4 -2
- deepdoctection-0.34/deepdoctection/analyzer/dd.py +0 -478
- {deepdoctection-0.34 → deepdoctection-0.36}/LICENSE +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/box.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/fintabnet.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/xfund.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/fastlang.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/model.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/d2struct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/laylmstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/misc.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/prodigystruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/anngen.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/common.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/lm.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/order.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/refine.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/segment.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/sub_layout.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/fs.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/logger.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/settings.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.34 → deepdoctection-0.36}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.34
|
|
3
|
+
Version: 0.36
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
|
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
19
|
Requires-Dist: catalogue==2.0.10
|
|
20
|
-
Requires-Dist: huggingface_hub
|
|
20
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0
|
|
21
21
|
Requires-Dist: importlib-metadata>=5.0.0
|
|
22
22
|
Requires-Dist: jsonlines==3.1.0
|
|
23
23
|
Requires-Dist: lazy-imports==0.3.1
|
|
@@ -27,6 +27,7 @@ Requires-Dist: numpy<2.0,>=1.21
|
|
|
27
27
|
Requires-Dist: packaging>=20.0
|
|
28
28
|
Requires-Dist: Pillow>=10.0.0
|
|
29
29
|
Requires-Dist: pypdf>=3.16.0
|
|
30
|
+
Requires-Dist: pypdfium2>=4.30.0
|
|
30
31
|
Requires-Dist: pyyaml>=6.0.1
|
|
31
32
|
Requires-Dist: pyzmq>=16
|
|
32
33
|
Requires-Dist: scipy>=1.13.1
|
|
@@ -35,7 +36,7 @@ Requires-Dist: tabulate>=0.7.7
|
|
|
35
36
|
Requires-Dist: tqdm==4.64.0
|
|
36
37
|
Provides-Extra: tf
|
|
37
38
|
Requires-Dist: catalogue==2.0.10; extra == "tf"
|
|
38
|
-
Requires-Dist: huggingface_hub
|
|
39
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
|
|
39
40
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
|
|
40
41
|
Requires-Dist: jsonlines==3.1.0; extra == "tf"
|
|
41
42
|
Requires-Dist: lazy-imports==0.3.1; extra == "tf"
|
|
@@ -45,6 +46,7 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
|
|
|
45
46
|
Requires-Dist: packaging>=20.0; extra == "tf"
|
|
46
47
|
Requires-Dist: Pillow>=10.0.0; extra == "tf"
|
|
47
48
|
Requires-Dist: pypdf>=3.16.0; extra == "tf"
|
|
49
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
|
|
48
50
|
Requires-Dist: pyyaml>=6.0.1; extra == "tf"
|
|
49
51
|
Requires-Dist: pyzmq>=16; extra == "tf"
|
|
50
52
|
Requires-Dist: scipy>=1.13.1; extra == "tf"
|
|
@@ -66,7 +68,7 @@ Requires-Dist: distance==0.1.3; extra == "tf"
|
|
|
66
68
|
Requires-Dist: lxml>=4.9.1; extra == "tf"
|
|
67
69
|
Provides-Extra: pt
|
|
68
70
|
Requires-Dist: catalogue==2.0.10; extra == "pt"
|
|
69
|
-
Requires-Dist: huggingface_hub
|
|
71
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
|
|
70
72
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
|
|
71
73
|
Requires-Dist: jsonlines==3.1.0; extra == "pt"
|
|
72
74
|
Requires-Dist: lazy-imports==0.3.1; extra == "pt"
|
|
@@ -76,6 +78,7 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
|
|
|
76
78
|
Requires-Dist: packaging>=20.0; extra == "pt"
|
|
77
79
|
Requires-Dist: Pillow>=10.0.0; extra == "pt"
|
|
78
80
|
Requires-Dist: pypdf>=3.16.0; extra == "pt"
|
|
81
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
|
|
79
82
|
Requires-Dist: pyyaml>=6.0.1; extra == "pt"
|
|
80
83
|
Requires-Dist: pyzmq>=16; extra == "pt"
|
|
81
84
|
Requires-Dist: scipy>=1.13.1; extra == "pt"
|
|
@@ -172,13 +175,17 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
172
175
|
- Document layout analysis and table recognition now runs with
|
|
173
176
|
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
174
177
|
anymore for basic inference.
|
|
175
|
-
-
|
|
176
|
-
|
|
177
|
-
- [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
178
|
+
- More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
179
|
+
- Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
178
180
|
[**transformers**](https://github.com/huggingface/transformers).
|
|
179
181
|
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
180
182
|
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
181
|
-
LayoutLM can be used for LiLT as well
|
|
183
|
+
LayoutLM can be used for LiLT as well.
|
|
184
|
+
- [**new**] There are two notebooks available that show how to write a
|
|
185
|
+
[custom predictor](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) based on
|
|
186
|
+
a third party library that has not been supported yet and how to use
|
|
187
|
+
[advanced configuration](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) to
|
|
188
|
+
get links between layout segments e.g. captions and tables or figures.
|
|
182
189
|
|
|
183
190
|
**deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
|
|
184
191
|
post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
|
|
@@ -263,7 +270,7 @@ documentation.
|
|
|
263
270
|
|
|
264
271
|
## Requirements
|
|
265
272
|
|
|
266
|
-

|
|
267
274
|
|
|
268
275
|
Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
|
|
269
276
|
separately.
|
|
@@ -272,13 +279,16 @@ separately.
|
|
|
272
279
|
- Python >= 3.9
|
|
273
280
|
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
274
281
|
In general, if you want to train or fine-tune models, a GPU is required.
|
|
275
|
-
|
|
276
|
-
images.
|
|
282
|
+
|
|
277
283
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
278
284
|
and [PyTorch](https://pytorch.org/get-started/locally/).
|
|
279
285
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
|
|
280
286
|
engine has to be installed separately.
|
|
281
287
|
|
|
288
|
+
|
|
289
|
+
- For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
|
|
290
|
+
documents into images. For release `v.0.35.0` this dependency will be optional.
|
|
291
|
+
|
|
282
292
|
The following overview shows the availability of the models in conjunction with the DL framework.
|
|
283
293
|
|
|
284
294
|
| Task | PyTorch | Torchscript | Tensorflow |
|
|
@@ -396,8 +406,8 @@ to develop this framework.
|
|
|
396
406
|
## Problems
|
|
397
407
|
|
|
398
408
|
We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
|
|
399
|
-
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every
|
|
400
|
-
to
|
|
409
|
+
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
|
|
410
|
+
to 12 weeks.
|
|
401
411
|
|
|
402
412
|
## If you like **deep**doctection ...
|
|
403
413
|
|
|
@@ -45,13 +45,17 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
45
45
|
- Document layout analysis and table recognition now runs with
|
|
46
46
|
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
47
47
|
anymore for basic inference.
|
|
48
|
-
-
|
|
49
|
-
|
|
50
|
-
- [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
48
|
+
- More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
49
|
+
- Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
51
50
|
[**transformers**](https://github.com/huggingface/transformers).
|
|
52
51
|
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
53
52
|
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
54
|
-
LayoutLM can be used for LiLT as well
|
|
53
|
+
LayoutLM can be used for LiLT as well.
|
|
54
|
+
- [**new**] There are two notebooks available that show how to write a
|
|
55
|
+
[custom predictor](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) based on
|
|
56
|
+
a third party library that has not been supported yet and how to use
|
|
57
|
+
[advanced configuration](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) to
|
|
58
|
+
get links between layout segments e.g. captions and tables or figures.
|
|
55
59
|
|
|
56
60
|
**deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
|
|
57
61
|
post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
|
|
@@ -136,7 +140,7 @@ documentation.
|
|
|
136
140
|
|
|
137
141
|
## Requirements
|
|
138
142
|
|
|
139
|
-

|
|
140
144
|
|
|
141
145
|
Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
|
|
142
146
|
separately.
|
|
@@ -145,13 +149,16 @@ separately.
|
|
|
145
149
|
- Python >= 3.9
|
|
146
150
|
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
147
151
|
In general, if you want to train or fine-tune models, a GPU is required.
|
|
148
|
-
|
|
149
|
-
images.
|
|
152
|
+
|
|
150
153
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
151
154
|
and [PyTorch](https://pytorch.org/get-started/locally/).
|
|
152
155
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
|
|
153
156
|
engine has to be installed separately.
|
|
154
157
|
|
|
158
|
+
|
|
159
|
+
- For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
|
|
160
|
+
documents into images. For release `v.0.35.0` this dependency will be optional.
|
|
161
|
+
|
|
155
162
|
The following overview shows the availability of the models in conjunction with the DL framework.
|
|
156
163
|
|
|
157
164
|
| Task | PyTorch | Torchscript | Tensorflow |
|
|
@@ -269,8 +276,8 @@ to develop this framework.
|
|
|
269
276
|
## Problems
|
|
270
277
|
|
|
271
278
|
We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
|
|
272
|
-
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every
|
|
273
|
-
to
|
|
279
|
+
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
|
|
280
|
+
to 12 weeks.
|
|
274
281
|
|
|
275
282
|
## If you like **deep**doctection ...
|
|
276
283
|
|
|
@@ -18,26 +18,16 @@ if importlib.util.find_spec("dotenv") is not None:
|
|
|
18
18
|
import sys
|
|
19
19
|
from typing import TYPE_CHECKING
|
|
20
20
|
|
|
21
|
-
from .utils.env_info import collect_env_info
|
|
21
|
+
from .utils.env_info import auto_select_pdf_render_framework, collect_env_info
|
|
22
22
|
from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
|
|
23
23
|
from .utils.logger import LoggingRecord, logger
|
|
24
24
|
|
|
25
25
|
# pylint: enable=wrong-import-position
|
|
26
26
|
|
|
27
|
-
__version__ = 0.34
|
|
27
|
+
__version__ = 0.36
|
|
28
28
|
|
|
29
29
|
_IMPORT_STRUCTURE = {
|
|
30
|
-
"analyzer": [
|
|
31
|
-
"config_sanity_checks",
|
|
32
|
-
"build_detector",
|
|
33
|
-
"build_padder",
|
|
34
|
-
"build_service",
|
|
35
|
-
"build_sub_image_service",
|
|
36
|
-
"build_ocr",
|
|
37
|
-
"build_doctr_word",
|
|
38
|
-
"get_dd_analyzer",
|
|
39
|
-
"build_analyzer",
|
|
40
|
-
],
|
|
30
|
+
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
|
|
41
31
|
"configs": [],
|
|
42
32
|
"dataflow": [
|
|
43
33
|
"DataFlowTerminated",
|
|
@@ -197,6 +187,7 @@ _IMPORT_STRUCTURE = {
|
|
|
197
187
|
"print_model_infos",
|
|
198
188
|
"ModelDownloadManager",
|
|
199
189
|
"PdfPlumberTextDetector",
|
|
190
|
+
"Pdfmium2TextDetector",
|
|
200
191
|
"TesseractOcrDetector",
|
|
201
192
|
"TesseractRotationTransformer",
|
|
202
193
|
"TextractOcrDetector",
|
|
@@ -304,6 +295,7 @@ _IMPORT_STRUCTURE = {
|
|
|
304
295
|
"timed_operation",
|
|
305
296
|
"collect_env_info",
|
|
306
297
|
"auto_select_viz_library",
|
|
298
|
+
"auto_select_pdf_render_framework",
|
|
307
299
|
"get_tensorflow_requirement",
|
|
308
300
|
"tf_addons_available",
|
|
309
301
|
"get_tf_addons_requirements",
|
|
@@ -383,6 +375,7 @@ _IMPORT_STRUCTURE = {
|
|
|
383
375
|
"get_pdf_file_writer",
|
|
384
376
|
"PDFStreamer",
|
|
385
377
|
"pdf_to_np_array",
|
|
378
|
+
"split_pdf",
|
|
386
379
|
"ObjectTypes",
|
|
387
380
|
"TypeOrStr",
|
|
388
381
|
"object_types_registry",
|
|
@@ -427,7 +420,7 @@ _IMPORT_STRUCTURE = {
|
|
|
427
420
|
# Setting some environment variables so that standard functions can be invoked with available hardware
|
|
428
421
|
env_info = collect_env_info()
|
|
429
422
|
logger.debug(LoggingRecord(msg=env_info))
|
|
430
|
-
|
|
423
|
+
auto_select_pdf_render_framework()
|
|
431
424
|
|
|
432
425
|
# Direct imports for type-checking
|
|
433
426
|
if TYPE_CHECKING:
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: _config.py
|
|
3
|
+
|
|
4
|
+
# Copyright 2024 Dr. Janis Meyer. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
"""Pipeline configuration for deepdoctection analyzer. Do not change the defaults in this file. """
|
|
19
|
+
|
|
20
|
+
from ..datapoint.view import IMAGE_DEFAULTS
|
|
21
|
+
from ..utils.metacfg import AttrDict
|
|
22
|
+
from ..utils.settings import CellType, LayoutType
|
|
23
|
+
|
|
24
|
+
cfg = AttrDict()
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
cfg.LANGUAGE = None
|
|
28
|
+
cfg.LIB = None
|
|
29
|
+
cfg.DEVICE = None
|
|
30
|
+
cfg.USE_ROTATOR = False
|
|
31
|
+
cfg.USE_LAYOUT = True
|
|
32
|
+
cfg.USE_TABLE_SEGMENTATION = True
|
|
33
|
+
|
|
34
|
+
cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
|
|
35
|
+
cfg.TF.LAYOUT.FILTER = None
|
|
36
|
+
|
|
37
|
+
cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
|
|
38
|
+
cfg.TF.CELL.FILTER = None
|
|
39
|
+
|
|
40
|
+
cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
|
|
41
|
+
cfg.TF.ITEM.FILTER = None
|
|
42
|
+
|
|
43
|
+
cfg.PT.LAYOUT.WEIGHTS = "layout/d2_model_0829999_layout_inf_only.pt"
|
|
44
|
+
cfg.PT.LAYOUT.WEIGHTS_TS = "layout/d2_model_0829999_layout_inf_only.ts"
|
|
45
|
+
cfg.PT.LAYOUT.FILTER = None
|
|
46
|
+
cfg.PT.LAYOUT.PAD.TOP = 60
|
|
47
|
+
cfg.PT.LAYOUT.PAD.RIGHT = 60
|
|
48
|
+
cfg.PT.LAYOUT.PAD.BOTTOM = 60
|
|
49
|
+
cfg.PT.LAYOUT.PAD.LEFT = 60
|
|
50
|
+
|
|
51
|
+
cfg.PT.ITEM.WEIGHTS = "item/d2_model_1639999_item_inf_only.pt"
|
|
52
|
+
cfg.PT.ITEM.WEIGHTS_TS = "item/d2_model_1639999_item_inf_only.ts"
|
|
53
|
+
cfg.PT.ITEM.FILTER = None
|
|
54
|
+
cfg.PT.ITEM.PAD.TOP = 60
|
|
55
|
+
cfg.PT.ITEM.PAD.RIGHT = 60
|
|
56
|
+
cfg.PT.ITEM.PAD.BOTTOM = 60
|
|
57
|
+
cfg.PT.ITEM.PAD.LEFT = 60
|
|
58
|
+
|
|
59
|
+
cfg.PT.CELL.WEIGHTS = "cell/d2_model_1849999_cell_inf_only.pt"
|
|
60
|
+
cfg.PT.CELL.WEIGHTS_TS = "cell/d2_model_1849999_cell_inf_only.ts"
|
|
61
|
+
cfg.PT.CELL.FILTER = None
|
|
62
|
+
|
|
63
|
+
cfg.USE_LAYOUT_NMS = False
|
|
64
|
+
cfg.LAYOUT_NMS_PAIRS.COMBINATIONS = None
|
|
65
|
+
cfg.LAYOUT_NMS_PAIRS.THRESHOLDS = None
|
|
66
|
+
cfg.LAYOUT_NMS_PAIRS.PRIORITY = None
|
|
67
|
+
|
|
68
|
+
cfg.SEGMENTATION.ASSIGNMENT_RULE = "ioa"
|
|
69
|
+
cfg.SEGMENTATION.THRESHOLD_ROWS = 0.4
|
|
70
|
+
cfg.SEGMENTATION.THRESHOLD_COLS = 0.4
|
|
71
|
+
cfg.SEGMENTATION.FULL_TABLE_TILING = True
|
|
72
|
+
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS = 0.001
|
|
73
|
+
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS = 0.001
|
|
74
|
+
cfg.SEGMENTATION.CELL_CATEGORY_ID = 12
|
|
75
|
+
cfg.SEGMENTATION.TABLE_NAME = LayoutType.TABLE
|
|
76
|
+
cfg.SEGMENTATION.PUBTABLES_CELL_NAMES = [
|
|
77
|
+
CellType.SPANNING,
|
|
78
|
+
CellType.ROW_HEADER,
|
|
79
|
+
CellType.COLUMN_HEADER,
|
|
80
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
81
|
+
LayoutType.CELL,
|
|
82
|
+
]
|
|
83
|
+
cfg.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES = [
|
|
84
|
+
CellType.SPANNING,
|
|
85
|
+
CellType.ROW_HEADER,
|
|
86
|
+
CellType.COLUMN_HEADER,
|
|
87
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
88
|
+
]
|
|
89
|
+
cfg.SEGMENTATION.PUBTABLES_ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
|
|
90
|
+
cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
|
|
91
|
+
cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
|
|
92
|
+
cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
|
|
93
|
+
cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
|
|
94
|
+
|
|
95
|
+
cfg.SEGMENTATION.STRETCH_RULE = "equal"
|
|
96
|
+
|
|
97
|
+
cfg.USE_TABLE_REFINEMENT = True
|
|
98
|
+
cfg.USE_PDF_MINER = False
|
|
99
|
+
|
|
100
|
+
cfg.PDF_MINER.X_TOLERANCE = 3
|
|
101
|
+
cfg.PDF_MINER.Y_TOLERANCE = 3
|
|
102
|
+
|
|
103
|
+
cfg.USE_OCR = True
|
|
104
|
+
|
|
105
|
+
cfg.OCR.USE_TESSERACT = True
|
|
106
|
+
cfg.OCR.USE_DOCTR = False
|
|
107
|
+
cfg.OCR.USE_TEXTRACT = False
|
|
108
|
+
cfg.OCR.CONFIG.TESSERACT = "dd/conf_tesseract.yaml"
|
|
109
|
+
|
|
110
|
+
cfg.OCR.WEIGHTS.DOCTR_WORD.TF = "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip"
|
|
111
|
+
cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
|
|
112
|
+
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
|
|
113
|
+
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
|
|
114
|
+
|
|
115
|
+
cfg.TEXT_CONTAINER = IMAGE_DEFAULTS["text_container"]
|
|
116
|
+
cfg.WORD_MATCHING.PARENTAL_CATEGORIES = [
|
|
117
|
+
LayoutType.TEXT,
|
|
118
|
+
LayoutType.TITLE,
|
|
119
|
+
LayoutType.LIST,
|
|
120
|
+
LayoutType.CELL,
|
|
121
|
+
CellType.COLUMN_HEADER,
|
|
122
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
123
|
+
CellType.SPANNING,
|
|
124
|
+
CellType.ROW_HEADER,
|
|
125
|
+
]
|
|
126
|
+
cfg.WORD_MATCHING.RULE = "ioa"
|
|
127
|
+
cfg.WORD_MATCHING.THRESHOLD = 0.6
|
|
128
|
+
cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
|
|
129
|
+
|
|
130
|
+
cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS["text_block_categories"]
|
|
131
|
+
cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS["floating_text_block_categories"]
|
|
132
|
+
cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = False
|
|
133
|
+
cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
|
|
134
|
+
cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
|
|
135
|
+
cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
|
|
136
|
+
cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
|
|
137
|
+
|
|
138
|
+
cfg.USE_LAYOUT_LINK = False
|
|
139
|
+
cfg.LAYOUT_LINK.PARENTAL_CATEGORIES = []
|
|
140
|
+
cfg.LAYOUT_LINK.CHILD_CATEGORIES = []
|
|
141
|
+
|
|
142
|
+
cfg.freeze()
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: dd.py
|
|
3
|
+
|
|
4
|
+
# Copyright 2021 Dr. Janis Meyer. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
Module for **deep**doctection analyzer.
|
|
20
|
+
|
|
21
|
+
-factory build_analyzer for a given config
|
|
22
|
+
|
|
23
|
+
-user factory with a reduced config setting
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import os
|
|
29
|
+
from typing import Optional
|
|
30
|
+
|
|
31
|
+
from ..extern.pt.ptutils import get_torch_device
|
|
32
|
+
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
33
|
+
from ..pipe.doctectionpipe import DoctectionPipe
|
|
34
|
+
from ..utils.env_info import ENV_VARS_TRUE
|
|
35
|
+
from ..utils.error import DependencyError
|
|
36
|
+
from ..utils.file_utils import tensorpack_available
|
|
37
|
+
from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
|
|
38
|
+
from ..utils.logger import LoggingRecord, logger
|
|
39
|
+
from ..utils.metacfg import set_config_by_yaml
|
|
40
|
+
from ..utils.types import PathLikeOrStr
|
|
41
|
+
from ._config import cfg
|
|
42
|
+
from .factory import ServiceFactory
|
|
43
|
+
|
|
44
|
+
# Names exported by this module.
__all__ = ["config_sanity_checks", "get_dd_analyzer"]

# Package-relative config files that `get_dd_analyzer` passes to `maybe_copy_config_to_cache`.
_DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
_TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"

# Maps every selectable segmentation entry to a second weights file, or to `None` where no second file is listed.
# NOTE(review): the values look like the matching cell models - confirm against the consumer of this mapping.
_SEGMENTATION_CHOICES = {
    "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
    "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
    "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
    "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
}

# Selectable identifiers grouped by component. The "segmentation" list is exactly the key sequence of the mapping
# above, so it is derived from it instead of being spelled out a second time.
_MODEL_CHOICES = {
    "layout": [
        "layout/d2_model_0829999_layout_inf_only.pt",
        "xrf_layout/model_final_inf_only.pt",
        "microsoft/table-transformer-detection/pytorch_model.bin",
    ],
    "segmentation": list(_SEGMENTATION_CHOICES),
    "ocr": ["Tesseract", "DocTr", "Textract"],
    "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
    "doctr_recognition": [
        "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
        "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
    ],
    "llm": ["gpt-3.5-turbo", "gpt-4"],
    "segmentation_choices": _SEGMENTATION_CHOICES,
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def config_sanity_checks() -> None:
    """
    Validate the text extraction settings stored in the module-level `cfg`.

    :raises ValueError: If `USE_PDF_MINER`, `USE_OCR` and `OCR.USE_DOCTR` are all enabled, or if `USE_OCR` is enabled
                        and the `OCR.USE_TESSERACT`, `OCR.USE_DOCTR` and `OCR.USE_TEXTRACT` flags do not add up to
                        exactly one.
    """
    if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
        raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")

    # The engine flags are only relevant when OCR is switched on at all.
    if not cfg.USE_OCR:
        return

    # Adding the flags counts how many OCR engines are activated.
    active_engines = cfg.OCR.USE_TESSERACT + cfg.OCR.USE_DOCTR + cfg.OCR.USE_TEXTRACT
    if active_engines != 1:
        raise ValueError(
            "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
            "and set the other two to False. Only one OCR system can be activated."
        )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def get_dd_analyzer(
    reset_config_file: bool = True,
    config_overwrite: Optional[list[str]] = None,
    path_config_file: Optional[PathLikeOrStr] = None,
) -> DoctectionPipe:
    """
    Factory function for creating the built-in **deep**doctection analyzer.

    The Standard Analyzer is a pipeline that comprises the following analysis components:

    - Document layout analysis

    - Table segmentation

    - Text extraction/OCR

    - Reading order

    We refer to the various notebooks and docs for running an analyzer and changing the configs.

    The deep learning library is chosen from the environment: TensorFlow if `DD_USE_TF` is set to one of the values in
    `ENV_VARS_TRUE`, PyTorch in every other case.

    :param reset_config_file: If `True`, the `.yaml` file with default variables is copied to the `.cache` again,
                              thereby resetting all configurations.
    :param config_overwrite: Passing a list of string arguments and values to overwrite the `.yaml` configuration with
                             highest priority, e.g. ["USE_TABLE_SEGMENTATION=False",
                                                     "USE_OCR=False",
                                                     "TF.LAYOUT.WEIGHTS=my_fancy_pytorch_model"]
    :param path_config_file: Path to a custom config file. Can be outside of the .cache directory.
    :return: A DoctectionPipe instance with given configs
    :raises ValueError: If the resulting configuration is rejected by `config_sanity_checks`.
    """
    config_overwrite = [] if config_overwrite is None else config_overwrite

    # Only DD_USE_TF is inspected. Everything else falls back to PyTorch, so there is no third case to handle.
    if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE:
        lib = "TF"
        device = get_tf_device()
    else:
        lib = "PT"
        device = get_torch_device()

    dd_one_config_path = maybe_copy_config_to_cache(
        get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
    )
    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)

    # Set up of the configuration and logging
    # A custom config file, if given, takes the place of the cached default file.
    file_cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
    cfg.freeze(freezed=False)
    cfg.overwrite_config(file_cfg)

    # NOTE(review): this second unfreeze is kept on purpose. Presumably the values merged from `file_cfg` arrive
    # frozen - confirm before removing it.
    cfg.freeze(freezed=False)
    # These three values are set programmatically after the file values have been merged.
    cfg.LANGUAGE = None
    cfg.LIB = lib
    cfg.DEVICE = device
    cfg.freeze()

    # Explicit overwrites are applied last and therefore have the highest priority.
    if config_overwrite:
        cfg.update_args(config_overwrite)

    config_sanity_checks()
    logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict()))  # type: ignore

    # Silence all TP layer logging before the pipeline is built
    if tensorpack_available():
        disable_tp_layer_logging()

    return ServiceFactory.build_analyzer(cfg)
|