deepdoctection 0.42.0__tar.gz → 0.43__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of deepdoctection might be problematic. Click here for more details.
- deepdoctection-0.43/PKG-INFO +376 -0
- deepdoctection-0.43/README.md +233 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/__init__.py +2 -1
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection-0.43/deepdoctection/analyzer/config.py +904 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/analyzer/dd.py +36 -62
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/analyzer/factory.py +311 -141
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/configs/conf_dd_one.yaml +100 -44
- deepdoctection-0.43/deepdoctection/configs/profiles.jsonl +32 -0
- deepdoctection-0.43/deepdoctection/dataflow/__init__.py +25 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/dataflow/base.py +33 -15
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/dataflow/common.py +96 -75
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/dataflow/custom.py +36 -29
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/dataflow/custom_serialize.py +135 -91
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/dataflow/parallel_map.py +33 -31
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/dataflow/serialize.py +15 -10
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/dataflow/stats.py +41 -28
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datapoint/__init__.py +4 -6
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datapoint/annotation.py +104 -66
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datapoint/box.py +190 -130
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datapoint/convert.py +66 -39
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datapoint/image.py +151 -95
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datapoint/view.py +383 -236
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/__init__.py +2 -6
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/adapter.py +11 -11
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/base.py +118 -81
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/dataflow_builder.py +18 -12
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/info.py +76 -57
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/__init__.py +6 -2
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/doclaynet.py +17 -14
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/fintabnet.py +16 -22
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/funsd.py +11 -6
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/iiitar13k.py +9 -9
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/layouttest.py +9 -9
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/publaynet.py +9 -9
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/pubtables1m.py +13 -13
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/pubtabnet.py +13 -15
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/rvlcdip.py +8 -8
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/xfund.py +11 -9
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/registry.py +18 -11
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/save.py +12 -11
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/eval/__init__.py +3 -2
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/eval/accmetric.py +72 -52
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/eval/base.py +29 -10
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/eval/cocometric.py +14 -12
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/eval/eval.py +56 -41
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/eval/registry.py +6 -3
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/eval/tedsmetric.py +24 -9
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/eval/tp_eval_callback.py +13 -12
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/__init__.py +1 -1
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/base.py +176 -97
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/d2detect.py +127 -92
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/deskew.py +19 -10
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/doctrocr.py +157 -106
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/fastlang.py +25 -17
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/hfdetr.py +137 -60
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/hflayoutlm.py +329 -248
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/hflm.py +67 -33
- deepdoctection-0.43/deepdoctection/extern/model.py +506 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/pdftext.py +37 -12
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/pt/nms.py +15 -1
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/pt/ptutils.py +13 -9
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tessocr.py +87 -54
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/texocr.py +29 -14
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tfutils.py +36 -8
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpcompat.py +54 -16
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tpdetect.py +4 -2
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/__init__.py +1 -1
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/cats.py +117 -76
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/cocostruct.py +35 -17
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/d2struct.py +56 -29
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/hfstruct.py +32 -19
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/laylmstruct.py +221 -185
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/maputils.py +71 -35
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/match.py +76 -62
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/misc.py +68 -44
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/pascalstruct.py +13 -12
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/prodigystruct.py +33 -19
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/pubstruct.py +42 -32
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/tpstruct.py +39 -19
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/mapper/xfundstruct.py +20 -13
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/__init__.py +1 -2
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/anngen.py +104 -62
- deepdoctection-0.43/deepdoctection/pipe/base.py +496 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/common.py +206 -123
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/concurrency.py +74 -47
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/doctectionpipe.py +108 -47
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/language.py +41 -24
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/layout.py +45 -18
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/lm.py +146 -78
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/order.py +196 -113
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/refine.py +111 -63
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/registry.py +1 -1
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/segment.py +213 -142
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/sub_layout.py +76 -46
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/text.py +52 -33
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/pipe/transform.py +8 -6
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/train/d2_frcnn_train.py +87 -69
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/train/hf_detr_train.py +72 -40
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/train/hf_layoutlm_train.py +85 -46
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/train/tp_frcnn_train.py +56 -28
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/concurrency.py +59 -16
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/context.py +40 -19
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/develop.py +25 -17
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/env_info.py +85 -36
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/error.py +16 -10
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/file_utils.py +246 -62
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/fs.py +162 -43
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/identifier.py +29 -16
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/logger.py +49 -32
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/metacfg.py +83 -21
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/pdf_utils.py +119 -62
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/settings.py +24 -10
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/tqdm.py +10 -5
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/transform.py +182 -46
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/utils.py +61 -28
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/viz.py +150 -104
- deepdoctection-0.43/deepdoctection.egg-info/PKG-INFO +376 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection.egg-info/SOURCES.txt +2 -1
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection.egg-info/requires.txt +5 -4
- {deepdoctection-0.42.0 → deepdoctection-0.43}/setup.cfg +4 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/setup.py +5 -12
- deepdoctection-0.42.0/PKG-INFO +0 -431
- deepdoctection-0.42.0/README.md +0 -290
- deepdoctection-0.42.0/deepdoctection/analyzer/_config.py +0 -146
- deepdoctection-0.42.0/deepdoctection/dataflow/__init__.py +0 -22
- deepdoctection-0.42.0/deepdoctection/extern/model.py +0 -1160
- deepdoctection-0.42.0/deepdoctection/pipe/base.py +0 -377
- deepdoctection-0.42.0/deepdoctection.egg-info/PKG-INFO +0 -431
- {deepdoctection-0.42.0 → deepdoctection-0.43}/LICENSE +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/pt/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/train/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/__init__.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/mocks.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection/utils/types.py +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/deepdoctection.egg-info/top_level.txt +0 -0
- {deepdoctection-0.42.0 → deepdoctection-0.43}/tests/test_utils.py +0 -0
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deepdoctection
|
|
3
|
+
Version: 0.43
|
|
4
|
+
Summary: Repository for Document AI
|
|
5
|
+
Home-page: https://github.com/deepdoctection/deepdoctection
|
|
6
|
+
Author: Dr. Janis Meyer
|
|
7
|
+
License: Apache License 2.0
|
|
8
|
+
Classifier: Development Status :: 4 - Beta
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Natural Language :: English
|
|
11
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: >=3.9
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
License-File: LICENSE
|
|
20
|
+
Requires-Dist: catalogue==2.0.10
|
|
21
|
+
Requires-Dist: huggingface_hub>=0.26.0
|
|
22
|
+
Requires-Dist: importlib-metadata>=5.0.0
|
|
23
|
+
Requires-Dist: jsonlines==3.1.0
|
|
24
|
+
Requires-Dist: lazy-imports==0.3.1
|
|
25
|
+
Requires-Dist: mock==4.0.3
|
|
26
|
+
Requires-Dist: networkx>=2.7.1
|
|
27
|
+
Requires-Dist: numpy<2.0,>=1.21
|
|
28
|
+
Requires-Dist: packaging>=20.0
|
|
29
|
+
Requires-Dist: Pillow>=10.0.0
|
|
30
|
+
Requires-Dist: pypdf>=3.16.0
|
|
31
|
+
Requires-Dist: pypdfium2>=4.30.0
|
|
32
|
+
Requires-Dist: pyyaml>=6.0.1
|
|
33
|
+
Requires-Dist: pyzmq>=16
|
|
34
|
+
Requires-Dist: scipy>=1.13.1
|
|
35
|
+
Requires-Dist: termcolor>=1.1
|
|
36
|
+
Requires-Dist: tabulate>=0.7.7
|
|
37
|
+
Requires-Dist: tqdm>=4.64.0
|
|
38
|
+
Provides-Extra: tf
|
|
39
|
+
Requires-Dist: catalogue==2.0.10; extra == "tf"
|
|
40
|
+
Requires-Dist: huggingface_hub>=0.26.0; extra == "tf"
|
|
41
|
+
Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
|
|
42
|
+
Requires-Dist: jsonlines==3.1.0; extra == "tf"
|
|
43
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "tf"
|
|
44
|
+
Requires-Dist: mock==4.0.3; extra == "tf"
|
|
45
|
+
Requires-Dist: networkx>=2.7.1; extra == "tf"
|
|
46
|
+
Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
|
|
47
|
+
Requires-Dist: packaging>=20.0; extra == "tf"
|
|
48
|
+
Requires-Dist: Pillow>=10.0.0; extra == "tf"
|
|
49
|
+
Requires-Dist: pypdf>=3.16.0; extra == "tf"
|
|
50
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
|
|
51
|
+
Requires-Dist: pyyaml>=6.0.1; extra == "tf"
|
|
52
|
+
Requires-Dist: pyzmq>=16; extra == "tf"
|
|
53
|
+
Requires-Dist: scipy>=1.13.1; extra == "tf"
|
|
54
|
+
Requires-Dist: termcolor>=1.1; extra == "tf"
|
|
55
|
+
Requires-Dist: tabulate>=0.7.7; extra == "tf"
|
|
56
|
+
Requires-Dist: tqdm>=4.64.0; extra == "tf"
|
|
57
|
+
Requires-Dist: tensorpack==0.11; extra == "tf"
|
|
58
|
+
Requires-Dist: protobuf==3.20.1; extra == "tf"
|
|
59
|
+
Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
|
|
60
|
+
Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
|
|
61
|
+
Requires-Dist: python-doctr==0.9.0; extra == "tf"
|
|
62
|
+
Requires-Dist: pycocotools>=2.0.2; extra == "tf"
|
|
63
|
+
Requires-Dist: boto3==1.34.102; extra == "tf"
|
|
64
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
|
|
65
|
+
Requires-Dist: fasttext-wheel; extra == "tf"
|
|
66
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "tf"
|
|
67
|
+
Requires-Dist: apted==1.0.3; extra == "tf"
|
|
68
|
+
Requires-Dist: distance==0.1.3; extra == "tf"
|
|
69
|
+
Requires-Dist: lxml>=4.9.1; extra == "tf"
|
|
70
|
+
Provides-Extra: pt
|
|
71
|
+
Requires-Dist: catalogue==2.0.10; extra == "pt"
|
|
72
|
+
Requires-Dist: huggingface_hub>=0.26.0; extra == "pt"
|
|
73
|
+
Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
|
|
74
|
+
Requires-Dist: jsonlines==3.1.0; extra == "pt"
|
|
75
|
+
Requires-Dist: lazy-imports==0.3.1; extra == "pt"
|
|
76
|
+
Requires-Dist: mock==4.0.3; extra == "pt"
|
|
77
|
+
Requires-Dist: networkx>=2.7.1; extra == "pt"
|
|
78
|
+
Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
|
|
79
|
+
Requires-Dist: packaging>=20.0; extra == "pt"
|
|
80
|
+
Requires-Dist: Pillow>=10.0.0; extra == "pt"
|
|
81
|
+
Requires-Dist: pypdf>=3.16.0; extra == "pt"
|
|
82
|
+
Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
|
|
83
|
+
Requires-Dist: pyyaml>=6.0.1; extra == "pt"
|
|
84
|
+
Requires-Dist: pyzmq>=16; extra == "pt"
|
|
85
|
+
Requires-Dist: scipy>=1.13.1; extra == "pt"
|
|
86
|
+
Requires-Dist: termcolor>=1.1; extra == "pt"
|
|
87
|
+
Requires-Dist: tabulate>=0.7.7; extra == "pt"
|
|
88
|
+
Requires-Dist: tqdm>=4.64.0; extra == "pt"
|
|
89
|
+
Requires-Dist: timm>=0.9.16; extra == "pt"
|
|
90
|
+
Requires-Dist: transformers>=4.48.0; extra == "pt"
|
|
91
|
+
Requires-Dist: accelerate>=0.29.1; extra == "pt"
|
|
92
|
+
Requires-Dist: python-doctr==0.9.0; extra == "pt"
|
|
93
|
+
Requires-Dist: pycocotools>=2.0.2; extra == "pt"
|
|
94
|
+
Requires-Dist: boto3==1.34.102; extra == "pt"
|
|
95
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
|
|
96
|
+
Requires-Dist: fasttext-wheel; extra == "pt"
|
|
97
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "pt"
|
|
98
|
+
Requires-Dist: apted==1.0.3; extra == "pt"
|
|
99
|
+
Requires-Dist: distance==0.1.3; extra == "pt"
|
|
100
|
+
Requires-Dist: lxml>=4.9.1; extra == "pt"
|
|
101
|
+
Provides-Extra: docs
|
|
102
|
+
Requires-Dist: tensorpack==0.11; extra == "docs"
|
|
103
|
+
Requires-Dist: boto3==1.34.102; extra == "docs"
|
|
104
|
+
Requires-Dist: transformers>=4.48.0; extra == "docs"
|
|
105
|
+
Requires-Dist: accelerate>=0.29.1; extra == "docs"
|
|
106
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
|
|
107
|
+
Requires-Dist: lxml>=4.9.1; extra == "docs"
|
|
108
|
+
Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
|
|
109
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "docs"
|
|
110
|
+
Requires-Dist: jinja2; extra == "docs"
|
|
111
|
+
Requires-Dist: mkdocs-material; extra == "docs"
|
|
112
|
+
Requires-Dist: mkdocstrings-python; extra == "docs"
|
|
113
|
+
Requires-Dist: griffe==0.25.0; extra == "docs"
|
|
114
|
+
Provides-Extra: dev
|
|
115
|
+
Requires-Dist: python-dotenv==1.0.0; extra == "dev"
|
|
116
|
+
Requires-Dist: click; extra == "dev"
|
|
117
|
+
Requires-Dist: black==23.7.0; extra == "dev"
|
|
118
|
+
Requires-Dist: isort==5.13.2; extra == "dev"
|
|
119
|
+
Requires-Dist: pylint==2.17.4; extra == "dev"
|
|
120
|
+
Requires-Dist: mypy==1.4.1; extra == "dev"
|
|
121
|
+
Requires-Dist: wandb; extra == "dev"
|
|
122
|
+
Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
|
|
123
|
+
Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
|
|
124
|
+
Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
|
|
125
|
+
Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
|
|
126
|
+
Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
|
|
127
|
+
Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
|
|
128
|
+
Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
|
|
129
|
+
Provides-Extra: test
|
|
130
|
+
Requires-Dist: pytest==8.0.2; extra == "test"
|
|
131
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
132
|
+
Dynamic: author
|
|
133
|
+
Dynamic: classifier
|
|
134
|
+
Dynamic: description
|
|
135
|
+
Dynamic: description-content-type
|
|
136
|
+
Dynamic: home-page
|
|
137
|
+
Dynamic: license
|
|
138
|
+
Dynamic: license-file
|
|
139
|
+
Dynamic: provides-extra
|
|
140
|
+
Dynamic: requires-dist
|
|
141
|
+
Dynamic: requires-python
|
|
142
|
+
Dynamic: summary
|
|
143
|
+
|
|
144
|
+
<p align="center">
|
|
145
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_logo.png" alt="Deep Doctection Logo" width="60%">
|
|
146
|
+
</p>
|
|
147
|
+
|
|
148
|
+

|
|
149
|
+

|
|
150
|
+

|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
------------------------------------------------------------------------------------------------------------------------
|
|
154
|
+
# NEW
|
|
155
|
+
|
|
156
|
+
Version `v.0.43` includes a significant redesign of the Analyzer's default configuration. Key changes include:
|
|
157
|
+
|
|
158
|
+
* More powerful models for Document Layout Analysis and OCR.
|
|
159
|
+
* Expanded functionality.
|
|
160
|
+
* Fewer dependencies.
|
|
161
|
+
|
|
162
|
+
------------------------------------------------------------------------------------------------------------------------
|
|
163
|
+
|
|
164
|
+
<p align="center">
|
|
165
|
+
<h1 align="center">
|
|
166
|
+
A Package for Document Understanding
|
|
167
|
+
</h1>
|
|
168
|
+
</p>
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
**deep**doctection is a Python library that orchestrates Scan and PDF document layout analysis and extraction for RAG.
|
|
173
|
+
It also provides a framework for training, evaluating and inferencing Document AI models.
|
|
174
|
+
|
|
175
|
+
Check the demo of a document layout analysis pipeline with OCR on 🤗
|
|
176
|
+
[**Hugging Face spaces**](https://huggingface.co/spaces/deepdoctection/deepdoctection).
|
|
177
|
+
|
|
178
|
+
# Overview
|
|
179
|
+
|
|
180
|
+
- Document layout analysis and table recognition in PyTorch with
|
|
181
|
+
[**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) and
|
|
182
|
+
[**Transformers**](https://github.com/huggingface/transformers)
|
|
183
|
+
or Tensorflow and [**Tensorpack**](https://github.com/tensorpack),
|
|
184
|
+
- OCR with support of [**Tesseract**](https://github.com/tesseract-ocr/tesseract), [**DocTr**](https://github.com/mindee/doctr) and
|
|
185
|
+
[**AWS Textract**](https://aws.amazon.com/textract/),
|
|
186
|
+
- Document and token classification with the [**LayoutLM**](https://github.com/microsoft/unilm) family,
|
|
187
|
+
[**LiLT**](https://github.com/jpWang/LiLT) and selected
|
|
188
|
+
[**Bert**](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)-style models, including features like sliding windows.
|
|
189
|
+
- Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
|
|
190
|
+
- Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
|
|
191
|
+
- Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
|
|
192
|
+
- Fine-tuning and evaluation tools.
|
|
193
|
+
- Lots of [tutorials](https://github.com/deepdoctection/notebooks)
|
|
194
|
+
|
|
195
|
+
Have a look at the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb)
|
|
196
|
+
for an easy start.
|
|
197
|
+
|
|
198
|
+
Check the [**release notes**](https://github.com/deepdoctection/deepdoctection/releases) for recent updates.
|
|
199
|
+
|
|
200
|
+
# Example
|
|
201
|
+
|
|
202
|
+
```python
|
|
203
|
+
import deepdoctection as dd
|
|
204
|
+
from IPython.core.display import HTML
|
|
205
|
+
from matplotlib import pyplot as plt
|
|
206
|
+
|
|
207
|
+
analyzer = dd.get_dd_analyzer() # instantiate the built-in analyzer similar to the Hugging Face space demo
|
|
208
|
+
|
|
209
|
+
df = analyzer.analyze(path = "/path/to/your/doc.pdf") # setting up pipeline
|
|
210
|
+
df.reset_state() # Trigger some initialization
|
|
211
|
+
|
|
212
|
+
doc = iter(df)
|
|
213
|
+
page = next(doc)
|
|
214
|
+
|
|
215
|
+
image = page.viz(show_figures=True, show_residual_layouts=True)
|
|
216
|
+
plt.figure(figsize = (25,17))
|
|
217
|
+
plt.axis('off')
|
|
218
|
+
plt.imshow(image)
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
<p align="center">
|
|
222
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_rm_sample.png"
|
|
223
|
+
alt="sample" width="40%">
|
|
224
|
+
</p>
|
|
225
|
+
|
|
226
|
+
```
|
|
227
|
+
HTML(page.tables[0].html)
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
<p align="center">
|
|
231
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_rm_table.png"
|
|
232
|
+
alt="table" width="40%">
|
|
233
|
+
</p>
|
|
234
|
+
|
|
235
|
+
```
|
|
236
|
+
print(page.text)
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
<p align="center">
|
|
240
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_rm_text.png"
|
|
241
|
+
alt="text" width="40%">
|
|
242
|
+
</p>
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
## Requirements
|
|
247
|
+
|
|
248
|
+

|
|
249
|
+
|
|
250
|
+
- Linux or macOS. Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available.
|
|
251
|
+
- Python >= 3.9
|
|
252
|
+
- 1.13 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
253
|
+
Tensorflow support will be stopped from Python 3.11 onwards.
|
|
254
|
+
- To fine-tune models, a GPU is recommended.
|
|
255
|
+
|
|
256
|
+
| Task | PyTorch | Torchscript | Tensorflow |
|
|
257
|
+
|---------------------------------------------|:-------:|----------------|:------------:|
|
|
258
|
+
| Layout detection via Detectron2/Tensorpack | ✅ | ✅ (CPU only) | ✅ (GPU only) |
|
|
259
|
+
| Table recognition via Detectron2/Tensorpack | ✅ | ✅ (CPU only) | ✅ (GPU only) |
|
|
260
|
+
| Table transformer via Transformers | ✅ | ❌ | ❌ |
|
|
261
|
+
| Deformable-Detr | ✅ | ❌ | ❌ |
|
|
262
|
+
| DocTr | ✅ | ❌ | ✅ |
|
|
263
|
+
| LayoutLM (v1, v2, v3, XLM) via Transformers | ✅ | ❌ | ❌ |
|
|
264
|
+
|
|
265
|
+
## Installation
|
|
266
|
+
|
|
267
|
+
We recommend using a virtual environment.
|
|
268
|
+
|
|
269
|
+
#### Get started installation
|
|
270
|
+
|
|
271
|
+
For a simple setup which is enough to parse documents with the default setting, install the following:
|
|
272
|
+
|
|
273
|
+
**PyTorch**
|
|
274
|
+
|
|
275
|
+
```
|
|
276
|
+
pip install transformers
|
|
277
|
+
pip install python-doctr
|
|
278
|
+
pip install deepdoctection
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
**TensorFlow**
|
|
282
|
+
|
|
283
|
+
```
|
|
284
|
+
pip install tensorpack
|
|
285
|
+
pip install python-doctr
|
|
286
|
+
pip install deepdoctection
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
Both setups are sufficient to run the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb).
|
|
290
|
+
|
|
291
|
+
#### Full installation
|
|
292
|
+
|
|
293
|
+
The following installation will give you ALL models available within the Deep Learning framework as well as all models
|
|
294
|
+
that are independent of Tensorflow/PyTorch.
|
|
295
|
+
|
|
296
|
+
**PyTorch**
|
|
297
|
+
|
|
298
|
+
First install **Detectron2** separately as it is not distributed via PyPI. Check the instructions
|
|
299
|
+
[here](https://detectron2.readthedocs.io/en/latest/tutorials/install.html) or try:
|
|
300
|
+
|
|
301
|
+
```
|
|
302
|
+
pip install detectron2@git+https://github.com/deepdoctection/detectron2.git
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
Then install **deep**doctection with all its dependencies:
|
|
306
|
+
|
|
307
|
+
```
|
|
308
|
+
pip install deepdoctection[pt]
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
**Tensorflow**
|
|
312
|
+
|
|
313
|
+
```
|
|
314
|
+
pip install deepdoctection[tf]
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
For further information, please consult the [**full installation instructions**](https://deepdoctection.readthedocs.io/en/latest/install/).
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
### Installation from source
|
|
322
|
+
|
|
323
|
+
Download the repository or clone via
|
|
324
|
+
|
|
325
|
+
```
|
|
326
|
+
git clone https://github.com/deepdoctection/deepdoctection.git
|
|
327
|
+
```
|
|
328
|
+
|
|
329
|
+
**PyTorch**
|
|
330
|
+
|
|
331
|
+
```
|
|
332
|
+
cd deepdoctection
|
|
333
|
+
pip install ".[pt]" # or "pip install -e .[pt]"
|
|
334
|
+
```
|
|
335
|
+
|
|
336
|
+
**Tensorflow**
|
|
337
|
+
|
|
338
|
+
```
|
|
339
|
+
cd deepdoctection
|
|
340
|
+
pip install ".[tf]" # or "pip install -e .[tf]"
|
|
341
|
+
```
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
### Running a Docker container from Docker hub
|
|
346
|
+
|
|
347
|
+
Pre-existing Docker images can be downloaded from the
|
|
348
|
+
[Docker hub](https://hub.docker.com/r/deepdoctection/deepdoctection).
|
|
349
|
+
|
|
350
|
+
```
|
|
351
|
+
docker pull deepdoctection/deepdoctection:<release_tag>
|
|
352
|
+
```
|
|
353
|
+
|
|
354
|
+
Use the Docker compose file `./docker/pytorch-gpu/docker-compose.yaml`.
|
|
355
|
+
In the `.env` file provided, specify the host directory where **deep**doctection's cache should be stored.
|
|
356
|
+
Additionally, specify a working directory to mount files to be processed into the container.
|
|
357
|
+
|
|
358
|
+
```
|
|
359
|
+
docker compose up -d
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
will start the container. There is no endpoint exposed, though.
|
|
363
|
+
|
|
364
|
+
## Credits
|
|
365
|
+
|
|
366
|
+
We thank all libraries that provide high quality code and pre-trained models. Without them, it would have been impossible
|
|
367
|
+
to develop this framework.
|
|
368
|
+
|
|
369
|
+
|
|
370
|
+
## If you like **deep**doctection ...
|
|
371
|
+
|
|
372
|
+
...you can easily support the project by making it more visible. Leaving a star or a recommendation will help.
|
|
373
|
+
|
|
374
|
+
## License
|
|
375
|
+
|
|
376
|
+
Distributed under the Apache 2.0 License. Check [LICENSE](https://github.com/deepdoctection/deepdoctection/blob/master/LICENSE) for additional information.
|
|
@@ -0,0 +1,233 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_logo.png" alt="Deep Doctection Logo" width="60%">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+

|
|
6
|
+

|
|
7
|
+

|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
------------------------------------------------------------------------------------------------------------------------
|
|
11
|
+
# NEW
|
|
12
|
+
|
|
13
|
+
Version `v.0.43` includes a significant redesign of the Analyzer's default configuration. Key changes include:
|
|
14
|
+
|
|
15
|
+
* More powerful models for Document Layout Analysis and OCR.
|
|
16
|
+
* Expanded functionality.
|
|
17
|
+
* Fewer dependencies.
|
|
18
|
+
|
|
19
|
+
------------------------------------------------------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
<p align="center">
|
|
22
|
+
<h1 align="center">
|
|
23
|
+
A Package for Document Understanding
|
|
24
|
+
</h1>
|
|
25
|
+
</p>
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
**deep**doctection is a Python library that orchestrates Scan and PDF document layout analysis and extraction for RAG.
|
|
30
|
+
It also provides a framework for training, evaluating and inferencing Document AI models.
|
|
31
|
+
|
|
32
|
+
Check the demo of a document layout analysis pipeline with OCR on 🤗
|
|
33
|
+
[**Hugging Face spaces**](https://huggingface.co/spaces/deepdoctection/deepdoctection).
|
|
34
|
+
|
|
35
|
+
# Overview
|
|
36
|
+
|
|
37
|
+
- Document layout analysis and table recognition in PyTorch with
|
|
38
|
+
[**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) and
|
|
39
|
+
[**Transformers**](https://github.com/huggingface/transformers)
|
|
40
|
+
or Tensorflow and [**Tensorpack**](https://github.com/tensorpack),
|
|
41
|
+
- OCR with support of [**Tesseract**](https://github.com/tesseract-ocr/tesseract), [**DocTr**](https://github.com/mindee/doctr) and
|
|
42
|
+
[**AWS Textract**](https://aws.amazon.com/textract/),
|
|
43
|
+
- Document and token classification with the [**LayoutLM**](https://github.com/microsoft/unilm) family,
|
|
44
|
+
[**LiLT**](https://github.com/jpWang/LiLT) and selected
|
|
45
|
+
[**Bert**](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)-style models, including features like sliding windows.
|
|
46
|
+
- Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
|
|
47
|
+
- Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
|
|
48
|
+
- Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
|
|
49
|
+
- Fine-tuning and evaluation tools.
|
|
50
|
+
- Lots of [tutorials](https://github.com/deepdoctection/notebooks)
|
|
51
|
+
|
|
52
|
+
Have a look at the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb)
|
|
53
|
+
for an easy start.
|
|
54
|
+
|
|
55
|
+
Check the [**release notes**](https://github.com/deepdoctection/deepdoctection/releases) for recent updates.
|
|
56
|
+
|
|
57
|
+
# Example
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import deepdoctection as dd
|
|
61
|
+
from IPython.core.display import HTML
|
|
62
|
+
from matplotlib import pyplot as plt
|
|
63
|
+
|
|
64
|
+
analyzer = dd.get_dd_analyzer() # instantiate the built-in analyzer similar to the Hugging Face space demo
|
|
65
|
+
|
|
66
|
+
df = analyzer.analyze(path = "/path/to/your/doc.pdf") # setting up pipeline
|
|
67
|
+
df.reset_state() # Trigger some initialization
|
|
68
|
+
|
|
69
|
+
doc = iter(df)
|
|
70
|
+
page = next(doc)
|
|
71
|
+
|
|
72
|
+
image = page.viz(show_figures=True, show_residual_layouts=True)
|
|
73
|
+
plt.figure(figsize = (25,17))
|
|
74
|
+
plt.axis('off')
|
|
75
|
+
plt.imshow(image)
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
<p align="center">
|
|
79
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_rm_sample.png"
|
|
80
|
+
alt="sample" width="40%">
|
|
81
|
+
</p>
|
|
82
|
+
|
|
83
|
+
```
|
|
84
|
+
HTML(page.tables[0].html)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
<p align="center">
|
|
88
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_rm_table.png"
|
|
89
|
+
alt="table" width="40%">
|
|
90
|
+
</p>
|
|
91
|
+
|
|
92
|
+
```
|
|
93
|
+
print(page.text)
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
<p align="center">
|
|
97
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_rm_text.png"
|
|
98
|
+
alt="text" width="40%">
|
|
99
|
+
</p>
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
## Requirements
|
|
104
|
+
|
|
105
|
+

|
|
106
|
+
|
|
107
|
+
- Linux or macOS. Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available.
|
|
108
|
+
- Python >= 3.9
|
|
109
|
+
- 1.13 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
|
|
110
|
+
Tensorflow support will be stopped from Python 3.11 onwards.
|
|
111
|
+
- To fine-tune models, a GPU is recommended.
|
|
112
|
+
|
|
113
|
+
| Task | PyTorch | Torchscript | Tensorflow |
|
|
114
|
+
|---------------------------------------------|:-------:|----------------|:------------:|
|
|
115
|
+
| Layout detection via Detectron2/Tensorpack | ✅ | ✅ (CPU only) | ✅ (GPU only) |
|
|
116
|
+
| Table recognition via Detectron2/Tensorpack | ✅ | ✅ (CPU only) | ✅ (GPU only) |
|
|
117
|
+
| Table transformer via Transformers | ✅ | ❌ | ❌ |
|
|
118
|
+
| Deformable-Detr | ✅ | ❌ | ❌ |
|
|
119
|
+
| DocTr | ✅ | ❌ | ✅ |
|
|
120
|
+
| LayoutLM (v1, v2, v3, XLM) via Transformers | ✅ | ❌ | ❌ |
|
|
121
|
+
|
|
122
|
+
## Installation
|
|
123
|
+
|
|
124
|
+
We recommend using a virtual environment.
|
|
125
|
+
|
|
126
|
+
#### Get started installation
|
|
127
|
+
|
|
128
|
+
For a simple setup which is enough to parse documents with the default setting, install the following:
|
|
129
|
+
|
|
130
|
+
**PyTorch**
|
|
131
|
+
|
|
132
|
+
```
|
|
133
|
+
pip install transformers
|
|
134
|
+
pip install python-doctr
|
|
135
|
+
pip install deepdoctection
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**TensorFlow**
|
|
139
|
+
|
|
140
|
+
```
|
|
141
|
+
pip install tensorpack
|
|
142
|
+
pip install python-doctr
|
|
143
|
+
pip install deepdoctection
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
Both setups are sufficient to run the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb).
|
|
147
|
+
|
|
148
|
+
#### Full installation
|
|
149
|
+
|
|
150
|
+
The following installation will give you ALL models available within the Deep Learning framework as well as all models
|
|
151
|
+
that are independent of Tensorflow/PyTorch.
|
|
152
|
+
|
|
153
|
+
**PyTorch**
|
|
154
|
+
|
|
155
|
+
First install **Detectron2** separately as it is not distributed via PyPI. Check the instructions
|
|
156
|
+
[here](https://detectron2.readthedocs.io/en/latest/tutorials/install.html) or try:
|
|
157
|
+
|
|
158
|
+
```
|
|
159
|
+
pip install detectron2@git+https://github.com/deepdoctection/detectron2.git
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
Then install **deep**doctection with all its dependencies:
|
|
163
|
+
|
|
164
|
+
```
|
|
165
|
+
pip install deepdoctection[pt]
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
**Tensorflow**
|
|
169
|
+
|
|
170
|
+
```
|
|
171
|
+
pip install deepdoctection[tf]
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
For further information, please consult the [**full installation instructions**](https://deepdoctection.readthedocs.io/en/latest/install/).
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
### Installation from source
|
|
179
|
+
|
|
180
|
+
Download the repository or clone via
|
|
181
|
+
|
|
182
|
+
```
|
|
183
|
+
git clone https://github.com/deepdoctection/deepdoctection.git
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
**PyTorch**
|
|
187
|
+
|
|
188
|
+
```
|
|
189
|
+
cd deepdoctection
|
|
190
|
+
pip install ".[pt]" # or "pip install -e .[pt]"
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
**Tensorflow**
|
|
194
|
+
|
|
195
|
+
```
|
|
196
|
+
cd deepdoctection
|
|
197
|
+
pip install ".[tf]" # or "pip install -e .[tf]"
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
### Running a Docker container from Docker hub
|
|
203
|
+
|
|
204
|
+
Pre-existing Docker images can be downloaded from the [Docker hub](https://hub.docker.com/r/deepdoctection/deepdoctection).
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
```
|
|
208
|
+
docker pull deepdoctection/deepdoctection:<release_tag>
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
Use the Docker compose file `./docker/pytorch-gpu/docker-compose.yaml`.
|
|
212
|
+
In the `.env` file provided, specify the host directory where **deep**doctection's cache should be stored.
|
|
213
|
+
Additionally, specify a working directory to mount files to be processed into the container.
|
|
214
|
+
|
|
215
|
+
```
|
|
216
|
+
docker compose up -d
|
|
217
|
+
```
|
|
218
|
+
|
|
219
|
+
This will start the container. There is no endpoint exposed, though.
|
|
220
|
+
|
|
221
|
+
## Credits
|
|
222
|
+
|
|
223
|
+
We thank all libraries that provide high-quality code and pre-trained models. Without them, it would have been impossible
|
|
224
|
+
to develop this framework.
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
## If you like **deep**doctection ...
|
|
228
|
+
|
|
229
|
+
...you can easily support the project by making it more visible. Leaving a star or a recommendation will help.
|
|
230
|
+
|
|
231
|
+
## License
|
|
232
|
+
|
|
233
|
+
Distributed under the Apache 2.0 License. Check [LICENSE](https://github.com/deepdoctection/deepdoctection/blob/master/LICENSE) for additional information.
|
|
@@ -25,7 +25,7 @@ from .utils.logger import LoggingRecord, logger
|
|
|
25
25
|
|
|
26
26
|
# pylint: enable=wrong-import-position
|
|
27
27
|
|
|
28
|
-
__version__ = "0.
|
|
28
|
+
__version__ = "0.43"
|
|
29
29
|
|
|
30
30
|
_IMPORT_STRUCTURE = {
|
|
31
31
|
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
|
|
@@ -358,6 +358,7 @@ _IMPORT_STRUCTURE = {
|
|
|
358
358
|
"FileExtensionError",
|
|
359
359
|
"sub_path",
|
|
360
360
|
"get_package_path",
|
|
361
|
+
"get_cache_dir_path",
|
|
361
362
|
"get_configs_dir_path",
|
|
362
363
|
"get_weights_dir_path",
|
|
363
364
|
"get_dataset_dir_path",
|