deepdoctection 0.34__tar.gz → 0.35__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- {deepdoctection-0.34 → deepdoctection-0.35}/PKG-INFO +17 -11
- {deepdoctection-0.34 → deepdoctection-0.35}/README.md +10 -7
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/__init__.py +6 -10
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/analyzer/__init__.py +1 -0
- deepdoctection-0.35/deepdoctection/analyzer/_config.py +150 -0
- deepdoctection-0.35/deepdoctection/analyzer/dd.py +154 -0
- deepdoctection-0.35/deepdoctection/analyzer/factory.py +522 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/configs/conf_dd_one.yaml +1 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datapoint/annotation.py +1 -1
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datapoint/convert.py +6 -4
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datapoint/image.py +16 -6
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datapoint/view.py +1 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/pdftext.py +96 -5
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tessocr.py +1 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/env_info.py +30 -1
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/file_utils.py +19 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/metacfg.py +12 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/pdf_utils.py +86 -3
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection.egg-info/PKG-INFO +17 -11
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection.egg-info/SOURCES.txt +2 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection.egg-info/requires.txt +6 -3
- {deepdoctection-0.34 → deepdoctection-0.35}/setup.cfg +4 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/setup.py +3 -1
- deepdoctection-0.34/deepdoctection/analyzer/dd.py +0 -478
- {deepdoctection-0.34 → deepdoctection-0.35}/LICENSE +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/dataflow/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/dataflow/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/dataflow/common.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/dataflow/custom.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/dataflow/custom_serialize.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/dataflow/parallel_map.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/dataflow/serialize.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/dataflow/stats.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datapoint/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datapoint/box.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/adapter.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/dataflow_builder.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/info.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/doclaynet.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/fintabnet.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/funsd.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/layouttest.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/publaynet.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/xfund.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/registry.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/datasets/save.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/eval/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/eval/accmetric.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/eval/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/eval/cocometric.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/eval/eval.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/eval/registry.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/eval/tedsmetric.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/eval/tp_eval_callback.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/d2detect.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/deskew.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/doctrocr.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/fastlang.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/hfdetr.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/hflayoutlm.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/hflm.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/model.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/pt/nms.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/pt/ptutils.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/texocr.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tfutils.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpcompat.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/extern/tpdetect.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/cats.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/cocostruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/d2struct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/hfstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/laylmstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/maputils.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/match.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/misc.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/pascalstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/prodigystruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/pubstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/tpstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/mapper/xfundstruct.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/anngen.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/base.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/common.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/concurrency.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/doctectionpipe.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/language.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/layout.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/lm.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/order.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/refine.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/registry.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/segment.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/sub_layout.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/text.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/pipe/transform.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/train/d2_frcnn_train.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/train/hf_detr_train.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/train/hf_layoutlm_train.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/train/tp_frcnn_train.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/concurrency.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/context.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/develop.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/error.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/fs.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/identifier.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/logger.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/settings.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/tqdm.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/transform.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/utils.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection/utils/viz.py +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.34 → deepdoctection-0.35}/tests/test_utils.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: deepdoctection
|
|
3
|
-
Version: 0.34
|
|
3
|
+
Version: 0.35
|
|
4
4
|
Summary: Repository for Document AI
|
|
5
5
|
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
6
|
Author: Dr. Janis Meyer
|
|
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
|
|
|
17
17
|
Description-Content-Type: text/markdown
|
|
18
18
|
License-File: LICENSE
|
|
19
19
|
Requires-Dist: catalogue==2.0.10
|
|
20
|
-
Requires-Dist: huggingface_hub
|
|
20
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0
|
|
21
21
|
Requires-Dist: importlib-metadata>=5.0.0
|
|
22
22
|
Requires-Dist: jsonlines==3.1.0
|
|
23
23
|
Requires-Dist: lazy-imports==0.3.1
|
|
@@ -27,6 +27,7 @@ Requires-Dist: numpy<2.0,>=1.21
|
|
|
27
27
|
Requires-Dist: packaging>=20.0
|
|
28
28
|
Requires-Dist: Pillow>=10.0.0
|
|
29
29
|
Requires-Dist: pypdf>=3.16.0
|
|
30
|
+
Requires-Dist: pypdfium2>=4.30.0
|
|
30
31
|
Requires-Dist: pyyaml>=6.0.1
|
|
31
32
|
Requires-Dist: pyzmq>=16
|
|
32
33
|
Requires-Dist: scipy>=1.13.1
|
|
@@ -35,7 +36,7 @@ Requires-Dist: tabulate>=0.7.7
|
|
|
35
36
|
Requires-Dist: tqdm==4.64.0
|
|
36
37
|
Provides-Extra: tf
|
|
37
38
|
Requires-Dist: catalogue==2.0.10; extra == "tf"
|
|
38
|
-
Requires-Dist: huggingface_hub
|
|
39
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
|
|
39
40
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
|
|
40
41
|
Requires-Dist: jsonlines==3.1.0; extra == "tf"
|
|
41
42
|
Requires-Dist: lazy-imports==0.3.1; extra == "tf"
|
|
@@ -45,6 +46,7 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
|
|
|
45
46
|
Requires-Dist: packaging>=20.0; extra == "tf"
|
|
46
47
|
Requires-Dist: Pillow>=10.0.0; extra == "tf"
|
|
47
48
|
Requires-Dist: pypdf>=3.16.0; extra == "tf"
|
|
49
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
|
|
48
50
|
Requires-Dist: pyyaml>=6.0.1; extra == "tf"
|
|
49
51
|
Requires-Dist: pyzmq>=16; extra == "tf"
|
|
50
52
|
Requires-Dist: scipy>=1.13.1; extra == "tf"
|
|
@@ -66,7 +68,7 @@ Requires-Dist: distance==0.1.3; extra == "tf"
|
|
|
66
68
|
Requires-Dist: lxml>=4.9.1; extra == "tf"
|
|
67
69
|
Provides-Extra: pt
|
|
68
70
|
Requires-Dist: catalogue==2.0.10; extra == "pt"
|
|
69
|
-
Requires-Dist: huggingface_hub
|
|
71
|
+
Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
|
|
70
72
|
Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
|
|
71
73
|
Requires-Dist: jsonlines==3.1.0; extra == "pt"
|
|
72
74
|
Requires-Dist: lazy-imports==0.3.1; extra == "pt"
|
|
@@ -76,6 +78,7 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
|
|
|
76
78
|
Requires-Dist: packaging>=20.0; extra == "pt"
|
|
77
79
|
Requires-Dist: Pillow>=10.0.0; extra == "pt"
|
|
78
80
|
Requires-Dist: pypdf>=3.16.0; extra == "pt"
|
|
81
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
|
|
79
82
|
Requires-Dist: pyyaml>=6.0.1; extra == "pt"
|
|
80
83
|
Requires-Dist: pyzmq>=16; extra == "pt"
|
|
81
84
|
Requires-Dist: scipy>=1.13.1; extra == "pt"
|
|
@@ -172,9 +175,9 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
172
175
|
- Document layout analysis and table recognition now runs with
|
|
173
176
|
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
174
177
|
anymore for basic inference.
|
|
175
|
-
-
|
|
178
|
+
- More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
176
179
|
(not contained in the built-in Analyzer).
|
|
177
|
-
-
|
|
180
|
+
- Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
178
181
|
[**transformers**](https://github.com/huggingface/transformers).
|
|
179
182
|
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
180
183
|
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
@@ -263,7 +266,7 @@ documentation.
|
|
|
263
266
|
|
|
264
267
|
## Requirements
|
|
265
268
|
|
|
266
|
-

|
|
267
270
|
|
|
268
271
|
Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
|
|
269
272
|
separately.
|
|
@@ -272,13 +275,16 @@ separately.
|
|
|
272
275
|
- Python >= 3.9
|
|
273
276
|
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
274
277
|
In general, if you want to train or fine-tune models, a GPU is required.
|
|
275
|
-
|
|
276
|
-
images.
|
|
278
|
+
|
|
277
279
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
278
280
|
and [PyTorch](https://pytorch.org/get-started/locally/).
|
|
279
281
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
|
|
280
282
|
engine has to be installed separately.
|
|
281
283
|
|
|
284
|
+
|
|
285
|
+
- For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
|
|
286
|
+
documents into images. For release `v.0.35.0` this dependency will be optional.
|
|
287
|
+
|
|
282
288
|
The following overview shows the availability of the models in conjunction with the DL framework.
|
|
283
289
|
|
|
284
290
|
| Task | PyTorch | Torchscript | Tensorflow |
|
|
@@ -396,8 +402,8 @@ to develop this framework.
|
|
|
396
402
|
## Problems
|
|
397
403
|
|
|
398
404
|
We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
|
|
399
|
-
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every
|
|
400
|
-
to
|
|
405
|
+
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
|
|
406
|
+
to 12 weeks.
|
|
401
407
|
|
|
402
408
|
## If you like **deep**doctection ...
|
|
403
409
|
|
|
@@ -45,9 +45,9 @@ pipelines. Its core function does not depend on any specific deep learning libra
|
|
|
45
45
|
- Document layout analysis and table recognition now runs with
|
|
46
46
|
[**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
|
|
47
47
|
anymore for basic inference.
|
|
48
|
-
-
|
|
48
|
+
- More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
|
|
49
49
|
(not contained in the built-in Analyzer).
|
|
50
|
-
-
|
|
50
|
+
- Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
|
|
51
51
|
[**transformers**](https://github.com/huggingface/transformers).
|
|
52
52
|
We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
|
|
53
53
|
that seem to look promising, especially if you want to train a model on non-english data. The training script for
|
|
@@ -136,7 +136,7 @@ documentation.
|
|
|
136
136
|
|
|
137
137
|
## Requirements
|
|
138
138
|
|
|
139
|
-

|
|
140
140
|
|
|
141
141
|
Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
|
|
142
142
|
separately.
|
|
@@ -145,13 +145,16 @@ separately.
|
|
|
145
145
|
- Python >= 3.9
|
|
146
146
|
- 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
147
147
|
In general, if you want to train or fine-tune models, a GPU is required.
|
|
148
|
-
|
|
149
|
-
images.
|
|
148
|
+
|
|
150
149
|
- With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
|
|
151
150
|
and [PyTorch](https://pytorch.org/get-started/locally/).
|
|
152
151
|
- [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
|
|
153
152
|
engine has to be installed separately.
|
|
154
153
|
|
|
154
|
+
|
|
155
|
+
- For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
|
|
156
|
+
documents into images. For release `v.0.35.0` this dependency will be optional.
|
|
157
|
+
|
|
155
158
|
The following overview shows the availability of the models in conjunction with the DL framework.
|
|
156
159
|
|
|
157
160
|
| Task | PyTorch | Torchscript | Tensorflow |
|
|
@@ -269,8 +272,8 @@ to develop this framework.
|
|
|
269
272
|
## Problems
|
|
270
273
|
|
|
271
274
|
We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
|
|
272
|
-
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every
|
|
273
|
-
to
|
|
275
|
+
repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
|
|
276
|
+
to 12 weeks.
|
|
274
277
|
|
|
275
278
|
## If you like **deep**doctection ...
|
|
276
279
|
|
|
@@ -18,25 +18,19 @@ if importlib.util.find_spec("dotenv") is not None:
|
|
|
18
18
|
import sys
|
|
19
19
|
from typing import TYPE_CHECKING
|
|
20
20
|
|
|
21
|
-
from .utils.env_info import collect_env_info
|
|
21
|
+
from .utils.env_info import auto_select_pdf_render_framework, collect_env_info
|
|
22
22
|
from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
|
|
23
23
|
from .utils.logger import LoggingRecord, logger
|
|
24
24
|
|
|
25
25
|
# pylint: enable=wrong-import-position
|
|
26
26
|
|
|
27
|
-
__version__ = 0.34
|
|
27
|
+
__version__ = 0.35
|
|
28
28
|
|
|
29
29
|
_IMPORT_STRUCTURE = {
|
|
30
30
|
"analyzer": [
|
|
31
31
|
"config_sanity_checks",
|
|
32
|
-
"build_detector",
|
|
33
|
-
"build_padder",
|
|
34
|
-
"build_service",
|
|
35
|
-
"build_sub_image_service",
|
|
36
|
-
"build_ocr",
|
|
37
|
-
"build_doctr_word",
|
|
38
32
|
"get_dd_analyzer",
|
|
39
|
-
"
|
|
33
|
+
"ServiceFactory"
|
|
40
34
|
],
|
|
41
35
|
"configs": [],
|
|
42
36
|
"dataflow": [
|
|
@@ -197,6 +191,7 @@ _IMPORT_STRUCTURE = {
|
|
|
197
191
|
"print_model_infos",
|
|
198
192
|
"ModelDownloadManager",
|
|
199
193
|
"PdfPlumberTextDetector",
|
|
194
|
+
"Pdfmium2TextDetector",
|
|
200
195
|
"TesseractOcrDetector",
|
|
201
196
|
"TesseractRotationTransformer",
|
|
202
197
|
"TextractOcrDetector",
|
|
@@ -304,6 +299,7 @@ _IMPORT_STRUCTURE = {
|
|
|
304
299
|
"timed_operation",
|
|
305
300
|
"collect_env_info",
|
|
306
301
|
"auto_select_viz_library",
|
|
302
|
+
"auto_select_pdf_render_framework",
|
|
307
303
|
"get_tensorflow_requirement",
|
|
308
304
|
"tf_addons_available",
|
|
309
305
|
"get_tf_addons_requirements",
|
|
@@ -427,7 +423,7 @@ _IMPORT_STRUCTURE = {
|
|
|
427
423
|
# Setting some environment variables so that standard functions can be invoked with available hardware
|
|
428
424
|
env_info = collect_env_info()
|
|
429
425
|
logger.debug(LoggingRecord(msg=env_info))
|
|
430
|
-
|
|
426
|
+
auto_select_pdf_render_framework()
|
|
431
427
|
|
|
432
428
|
# Direct imports for type-checking
|
|
433
429
|
if TYPE_CHECKING:
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: _config.py
|
|
3
|
+
|
|
4
|
+
# Copyright 2024 Dr. Janis Meyer. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
"""Pipeline configuration for deepdoctection analyzer. Do not change the defaults in this file. """
|
|
19
|
+
|
|
20
|
+
from ..utils.metacfg import AttrDict
|
|
21
|
+
from ..utils.settings import CellType, LayoutType
|
|
22
|
+
|
|
23
|
+
cfg = AttrDict()
|
|
24
|
+
|
|
25
|
+
cfg.LANGUAGE = None
|
|
26
|
+
cfg.LIB = None
|
|
27
|
+
cfg.DEVICE = None
|
|
28
|
+
cfg.USE_ROTATOR = False
|
|
29
|
+
cfg.USE_LAYOUT = True
|
|
30
|
+
cfg.USE_TABLE_SEGMENTATION = True
|
|
31
|
+
|
|
32
|
+
cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
|
|
33
|
+
cfg.TF.LAYOUT.FILTER = None
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
|
|
37
|
+
cfg.TF.CELL.FILTER = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
|
|
41
|
+
cfg.TF.ITEM.FILTER = None
|
|
42
|
+
|
|
43
|
+
cfg.PT.LAYOUT.WEIGHTS = "layout/d2_model_0829999_layout_inf_only.pt"
|
|
44
|
+
cfg.PT.LAYOUT.WEIGHTS_TS = "layout/d2_model_0829999_layout_inf_only.ts"
|
|
45
|
+
cfg.PT.LAYOUT.FILTER = None
|
|
46
|
+
cfg.PT.LAYOUT.PAD.TOP = 60
|
|
47
|
+
cfg.PT.LAYOUT.PAD.RIGHT = 60
|
|
48
|
+
cfg.PT.LAYOUT.PAD.BOTTOM = 60
|
|
49
|
+
cfg.PT.LAYOUT.PAD.LEFT = 60
|
|
50
|
+
|
|
51
|
+
cfg.PT.ITEM.WEIGHTS = "item/d2_model_1639999_item_inf_only.pt"
|
|
52
|
+
cfg.PT.ITEM.WEIGHTS_TS = "item/d2_model_1639999_item_inf_only.ts"
|
|
53
|
+
cfg.PT.ITEM.FILTER = None
|
|
54
|
+
cfg.PT.ITEM.PAD.TOP = 60
|
|
55
|
+
cfg.PT.ITEM.PAD.RIGHT = 60
|
|
56
|
+
cfg.PT.ITEM.PAD.BOTTOM = 60
|
|
57
|
+
cfg.PT.ITEM.PAD.LEFT = 60
|
|
58
|
+
|
|
59
|
+
cfg.PT.CELL.WEIGHTS = "cell/d2_model_1849999_cell_inf_only.pt"
|
|
60
|
+
cfg.PT.CELL.WEIGHTS_TS = "cell/d2_model_1849999_cell_inf_only.ts"
|
|
61
|
+
cfg.PT.CELL.FILTER = None
|
|
62
|
+
|
|
63
|
+
cfg.USE_LAYOUT_NMS = False
|
|
64
|
+
cfg.LAYOUT_NMS_PAIRS.COMBINATIONS = None
|
|
65
|
+
cfg.LAYOUT_NMS_PAIRS.THRESHOLDS = None
|
|
66
|
+
cfg.LAYOUT_NMS_PAIRS.PRIORITY = None
|
|
67
|
+
|
|
68
|
+
cfg.SEGMENTATION.ASSIGNMENT_RULE = "ioa"
|
|
69
|
+
cfg.SEGMENTATION.THRESHOLD_ROWS = 0.4
|
|
70
|
+
cfg.SEGMENTATION.THRESHOLD_COLS = 0.4
|
|
71
|
+
cfg.SEGMENTATION.FULL_TABLE_TILING = True
|
|
72
|
+
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS = 0.001
|
|
73
|
+
cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS = 0.001
|
|
74
|
+
cfg.SEGMENTATION.CELL_CATEGORY_ID = 12
|
|
75
|
+
cfg.SEGMENTATION.TABLE_NAME = LayoutType.TABLE
|
|
76
|
+
cfg.SEGMENTATION.PUBTABLES_CELL_NAMES = [
|
|
77
|
+
CellType.SPANNING,
|
|
78
|
+
CellType.ROW_HEADER,
|
|
79
|
+
CellType.COLUMN_HEADER,
|
|
80
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
81
|
+
LayoutType.CELL,
|
|
82
|
+
]
|
|
83
|
+
cfg.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES = [
|
|
84
|
+
CellType.SPANNING,
|
|
85
|
+
CellType.ROW_HEADER,
|
|
86
|
+
CellType.COLUMN_HEADER,
|
|
87
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
88
|
+
]
|
|
89
|
+
cfg.SEGMENTATION.PUBTABLES_ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
|
|
90
|
+
cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
|
|
91
|
+
cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
|
|
92
|
+
cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
|
|
93
|
+
cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
|
|
94
|
+
|
|
95
|
+
cfg.SEGMENTATION.STRETCH_RULE = "equal"
|
|
96
|
+
|
|
97
|
+
cfg.USE_TABLE_REFINEMENT = True
|
|
98
|
+
cfg.USE_PDF_MINER = False
|
|
99
|
+
|
|
100
|
+
cfg.PDF_MINER.X_TOLERANCE = 3
|
|
101
|
+
cfg.PDF_MINER.Y_TOLERANCE = 3
|
|
102
|
+
|
|
103
|
+
cfg.USE_OCR = True
|
|
104
|
+
|
|
105
|
+
cfg.OCR.USE_TESSERACT = True
|
|
106
|
+
cfg.OCR.USE_DOCTR = False
|
|
107
|
+
cfg.OCR.USE_TEXTRACT = False
|
|
108
|
+
cfg.OCR.CONFIG.TESSERACT = "dd/conf_tesseract.yaml"
|
|
109
|
+
|
|
110
|
+
cfg.OCR.WEIGHTS.DOCTR_WORD.TF = "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip"
|
|
111
|
+
cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
|
|
112
|
+
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
|
|
113
|
+
cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
|
|
114
|
+
|
|
115
|
+
cfg.TEXT_CONTAINER = LayoutType.WORD
|
|
116
|
+
cfg.WORD_MATCHING.PARENTAL_CATEGORIES = [
|
|
117
|
+
LayoutType.TEXT,
|
|
118
|
+
LayoutType.TITLE,
|
|
119
|
+
LayoutType.LIST,
|
|
120
|
+
LayoutType.CELL,
|
|
121
|
+
CellType.COLUMN_HEADER,
|
|
122
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
123
|
+
CellType.SPANNING,
|
|
124
|
+
CellType.ROW_HEADER,
|
|
125
|
+
]
|
|
126
|
+
cfg.WORD_MATCHING.RULE = "ioa"
|
|
127
|
+
cfg.WORD_MATCHING.THRESHOLD = 0.6
|
|
128
|
+
cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
|
|
129
|
+
|
|
130
|
+
cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = [
|
|
131
|
+
LayoutType.TEXT,
|
|
132
|
+
LayoutType.TITLE,
|
|
133
|
+
LayoutType.LIST,
|
|
134
|
+
LayoutType.CELL,
|
|
135
|
+
CellType.COLUMN_HEADER,
|
|
136
|
+
CellType.PROJECTED_ROW_HEADER,
|
|
137
|
+
CellType.SPANNING,
|
|
138
|
+
CellType.ROW_HEADER,
|
|
139
|
+
]
|
|
140
|
+
cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = [
|
|
141
|
+
LayoutType.TEXT,
|
|
142
|
+
LayoutType.TITLE,
|
|
143
|
+
LayoutType.LIST,
|
|
144
|
+
]
|
|
145
|
+
cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = False
|
|
146
|
+
cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
|
|
147
|
+
cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
|
|
148
|
+
cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
|
|
149
|
+
cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
|
|
150
|
+
cfg.freeze()
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: dd.py
|
|
3
|
+
|
|
4
|
+
# Copyright 2021 Dr. Janis Meyer. All rights reserved.
|
|
5
|
+
#
|
|
6
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
7
|
+
# you may not use this file except in compliance with the License.
|
|
8
|
+
# You may obtain a copy of the License at
|
|
9
|
+
#
|
|
10
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
11
|
+
#
|
|
12
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
13
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
14
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
15
|
+
# See the License for the specific language governing permissions and
|
|
16
|
+
# limitations under the License.
|
|
17
|
+
|
|
18
|
+
"""
|
|
19
|
+
Module for **deep**doctection analyzer.
|
|
20
|
+
|
|
21
|
+
-factory build_analyzer for a given config
|
|
22
|
+
|
|
23
|
+
-user factory with a reduced config setting
|
|
24
|
+
"""
|
|
25
|
+
|
|
26
|
+
from __future__ import annotations
|
|
27
|
+
|
|
28
|
+
import os
|
|
29
|
+
from typing import Optional
|
|
30
|
+
|
|
31
|
+
from ..extern.pt.ptutils import get_torch_device
|
|
32
|
+
from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
|
|
33
|
+
from ..pipe.doctectionpipe import DoctectionPipe
|
|
34
|
+
from ..utils.env_info import ENV_VARS_TRUE
|
|
35
|
+
from ..utils.error import DependencyError
|
|
36
|
+
from ..utils.file_utils import tensorpack_available
|
|
37
|
+
from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
|
|
38
|
+
from ..utils.logger import LoggingRecord, logger
|
|
39
|
+
from ..utils.metacfg import set_config_by_yaml
|
|
40
|
+
from ..utils.types import PathLikeOrStr
|
|
41
|
+
from ._config import cfg
|
|
42
|
+
from .factory import ServiceFactory
|
|
43
|
+
|
|
44
|
+
# Public API of this module: the config validator and the analyzer factory.
__all__ = ["config_sanity_checks", "get_dd_analyzer"]
|
|
48
|
+
|
|
49
|
+
# Package-relative locations of the default config files that `get_dd_analyzer` copies into the config cache.
_DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
_TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"

# Known model/weight identifiers grouped by pipeline role. "segmentation_choices" maps every entry of
# "segmentation" to a second identifier, or to `None` where no second model is listed.
# NOTE(review): this table is not referenced in the visible part of the module - confirm where it is consumed.
_MODEL_CHOICES = {
    "layout": [
        "layout/d2_model_0829999_layout_inf_only.pt",
        "xrf_layout/model_final_inf_only.pt",
        "microsoft/table-transformer-detection/pytorch_model.bin",
    ],
    "segmentation": [
        "item/model-1620000_inf_only.data-00000-of-00001",
        "xrf_item/model_final_inf_only.pt",
        "microsoft/table-transformer-structure-recognition/pytorch_model.bin",
        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
    ],
    "ocr": ["Tesseract", "DocTr", "Textract"],
    "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
    "doctr_recognition": [
        "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
        "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
    ],
    "llm": ["gpt-3.5-turbo", "gpt-4"],
    "segmentation_choices": {
        "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
        "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
        "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
    },
}
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def config_sanity_checks() -> None:
    """
    Validate the module level config `cfg` and fail early on combinations that are not supported.

    :raises ValueError: If `USE_PDF_MINER`, `USE_OCR` and `OCR.USE_DOCTR` are all set, or if `USE_OCR` is set and the
                        sum of `OCR.USE_TESSERACT`, `OCR.USE_DOCTR` and `OCR.USE_TEXTRACT` is different from 1.
    """
    if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
        raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")

    # The OCR engine flags are only inspected when OCR is switched on at all
    if not cfg.USE_OCR:
        return

    tesseract = cfg.OCR.USE_TESSERACT
    doctr = cfg.OCR.USE_DOCTR
    textract = cfg.OCR.USE_TEXTRACT
    # Adding the flags counts the activated engines (assumes the flags are booleans)
    if tesseract + doctr + textract != 1:
        raise ValueError(
            "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
            "and set the other two to False. Only one OCR system can be activated."
        )
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def get_dd_analyzer(
    reset_config_file: bool = True,
    config_overwrite: Optional[list[str]] = None,
    path_config_file: Optional[PathLikeOrStr] = None,
) -> DoctectionPipe:
    """
    Factory function for creating the built-in **deep**doctection analyzer.

    The Standard Analyzer is a pipeline that comprises the following analysis components:

    - Document layout analysis

    - Table segmentation

    - Text extraction/OCR

    - Reading order

    We refer to the various notebooks and docs for running an analyzer and changing the configs.

    The deep learning library is selected by the env variable `DD_USE_TF` only: if its value is contained in
    `ENV_VARS_TRUE`, `cfg.LIB` is set to `"TF"` and the device is taken from `get_tf_device()`. In every other case
    `cfg.LIB` is set to `"PT"` and the device is taken from `get_torch_device()`.

    :param reset_config_file: If set to `True`, the `.yaml` file with the default variables is copied to the `.cache`
                              again, thereby resetting all configurations.
    :param config_overwrite: Passing a list of string arguments and values to overwrite the `.yaml` configuration with
                             highest priority, e.g. ["USE_TABLE_SEGMENTATION=False",
                                                     "USE_OCR=False",
                                                     "TF.LAYOUT.WEIGHTS=my_fancy_pytorch_model"]
    :param path_config_file: Path to a custom config file. Can be outside of the .cache directory. If given, it is
                             loaded instead of the cached default config.
    :return: A DoctectionPipe instance with given configs
    :raises ValueError: If the resulting configuration does not pass `config_sanity_checks`.
    """
    config_overwrite = [] if config_overwrite is None else config_overwrite

    # Only two outcomes are possible here, so no third "nothing selected" branch is needed
    if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE:
        lib = "TF"
        device = get_tf_device()
    else:
        lib = "PT"
        device = get_torch_device()

    dd_one_config_path = maybe_copy_config_to_cache(
        get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
    )
    # The Tesseract config is copied without passing `reset_config_file`
    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)

    # Set up of the configuration and logging
    file_cfg = set_config_by_yaml(dd_one_config_path if not path_config_file else path_config_file)
    cfg.freeze(freezed=False)
    cfg.overwrite_config(file_cfg)

    # Unfreeze once more before setting the run time values.
    # NOTE(review): presumably `overwrite_config` can leave (parts of) `cfg` frozen - confirm before removing this call
    cfg.freeze(freezed=False)
    cfg.LANGUAGE = None
    cfg.LIB = lib
    cfg.DEVICE = device
    cfg.freeze()

    # Overwrites passed by the caller are applied last and therefore win over the `.yaml` values
    if config_overwrite:
        cfg.update_args(config_overwrite)

    config_sanity_checks()
    logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict()))  # type: ignore

    # will silent all TP logging while building the tower
    if tensorpack_available():
        disable_tp_layer_logging()

    return ServiceFactory.build_analyzer(cfg)
|