deepdoctection 0.40.0__tar.gz → 1.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepdoctection-1.0.3/PKG-INFO +133 -0
- deepdoctection-1.0.3/README.md +75 -0
- deepdoctection-1.0.3/pyproject.toml +108 -0
- deepdoctection-1.0.3/setup.cfg +4 -0
- deepdoctection-1.0.3/src/deepdoctection/__init__.py +176 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/analyzer/__init__.py +2 -1
- deepdoctection-1.0.3/src/deepdoctection/analyzer/config.py +903 -0
- deepdoctection-1.0.3/src/deepdoctection/analyzer/dd.py +121 -0
- deepdoctection-1.0.3/src/deepdoctection/analyzer/factory.py +1807 -0
- deepdoctection-1.0.3/src/deepdoctection/configs/conf_dd_one.yaml +186 -0
- deepdoctection-1.0.3/src/deepdoctection/configs/profiles.jsonl +25 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/__init__.py +8 -3
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/accmetric.py +88 -62
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/base.py +39 -16
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/cocometric.py +28 -21
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/eval.py +80 -62
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/registry.py +6 -3
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/tedsmetric.py +40 -24
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/__init__.py +2 -3
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/base.py +292 -99
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/d2detect.py +252 -105
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/deskew.py +25 -15
- deepdoctection-1.0.3/src/deepdoctection/extern/doctrocr.py +517 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/hfdetr.py +144 -66
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/hflayoutlm.py +340 -391
- deepdoctection-1.0.3/src/deepdoctection/extern/hflm.py +639 -0
- deepdoctection-1.0.3/src/deepdoctection/extern/model.py +498 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/pdftext.py +43 -17
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/tessocr.py +118 -64
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/texocr.py +44 -19
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/__init__.py +1 -2
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/anngen.py +139 -70
- deepdoctection-1.0.3/src/deepdoctection/pipe/base.py +501 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/common.py +205 -100
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/concurrency.py +93 -63
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/doctectionpipe.py +133 -70
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/language.py +48 -30
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/layout.py +52 -24
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/lm.py +174 -102
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/order.py +236 -137
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/refine.py +156 -91
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/registry.py +1 -1
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/segment.py +349 -223
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/sub_layout.py +88 -53
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/text.py +76 -45
- deepdoctection-1.0.3/src/deepdoctection/pipe/transform.py +114 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/train/__init__.py +1 -4
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/train/d2_frcnn_train.py +106 -84
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/train/hf_detr_train.py +84 -51
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/train/hf_layoutlm_train.py +160 -150
- deepdoctection-1.0.3/src/deepdoctection.egg-info/PKG-INFO +133 -0
- deepdoctection-1.0.3/src/deepdoctection.egg-info/SOURCES.txt +56 -0
- deepdoctection-1.0.3/src/deepdoctection.egg-info/requires.txt +43 -0
- deepdoctection-0.40.0/LICENSE +0 -201
- deepdoctection-0.40.0/PKG-INFO +0 -431
- deepdoctection-0.40.0/README.md +0 -290
- deepdoctection-0.40.0/deepdoctection/__init__.py +0 -452
- deepdoctection-0.40.0/deepdoctection/analyzer/_config.py +0 -143
- deepdoctection-0.40.0/deepdoctection/analyzer/dd.py +0 -155
- deepdoctection-0.40.0/deepdoctection/analyzer/factory.py +0 -753
- deepdoctection-0.40.0/deepdoctection/configs/conf_dd_one.yaml +0 -145
- deepdoctection-0.40.0/deepdoctection/dataflow/__init__.py +0 -22
- deepdoctection-0.40.0/deepdoctection/dataflow/base.py +0 -145
- deepdoctection-0.40.0/deepdoctection/dataflow/common.py +0 -316
- deepdoctection-0.40.0/deepdoctection/dataflow/custom.py +0 -196
- deepdoctection-0.40.0/deepdoctection/dataflow/custom_serialize.py +0 -630
- deepdoctection-0.40.0/deepdoctection/dataflow/parallel_map.py +0 -444
- deepdoctection-0.40.0/deepdoctection/dataflow/serialize.py +0 -148
- deepdoctection-0.40.0/deepdoctection/dataflow/stats.py +0 -271
- deepdoctection-0.40.0/deepdoctection/datapoint/__init__.py +0 -40
- deepdoctection-0.40.0/deepdoctection/datapoint/annotation.py +0 -531
- deepdoctection-0.40.0/deepdoctection/datapoint/box.py +0 -750
- deepdoctection-0.40.0/deepdoctection/datapoint/convert.py +0 -210
- deepdoctection-0.40.0/deepdoctection/datapoint/image.py +0 -795
- deepdoctection-0.40.0/deepdoctection/datapoint/view.py +0 -1231
- deepdoctection-0.40.0/deepdoctection/datasets/__init__.py +0 -35
- deepdoctection-0.40.0/deepdoctection/datasets/adapter.py +0 -177
- deepdoctection-0.40.0/deepdoctection/datasets/base.py +0 -525
- deepdoctection-0.40.0/deepdoctection/datasets/dataflow_builder.py +0 -120
- deepdoctection-0.40.0/deepdoctection/datasets/info.py +0 -446
- deepdoctection-0.40.0/deepdoctection/datasets/instances/__init__.py +0 -55
- deepdoctection-0.40.0/deepdoctection/datasets/instances/doclaynet.py +0 -298
- deepdoctection-0.40.0/deepdoctection/datasets/instances/fintabnet.py +0 -294
- deepdoctection-0.40.0/deepdoctection/datasets/instances/funsd.py +0 -200
- deepdoctection-0.40.0/deepdoctection/datasets/instances/iiitar13k.py +0 -196
- deepdoctection-0.40.0/deepdoctection/datasets/instances/layouttest.py +0 -134
- deepdoctection-0.40.0/deepdoctection/datasets/instances/publaynet.py +0 -151
- deepdoctection-0.40.0/deepdoctection/datasets/instances/pubtables1m.py +0 -334
- deepdoctection-0.40.0/deepdoctection/datasets/instances/pubtabnet.py +0 -214
- deepdoctection-0.40.0/deepdoctection/datasets/instances/rvlcdip.py +0 -182
- deepdoctection-0.40.0/deepdoctection/datasets/instances/xfund.py +0 -241
- deepdoctection-0.40.0/deepdoctection/datasets/instances/xsl/__init__.py +0 -16
- deepdoctection-0.40.0/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -45
- deepdoctection-0.40.0/deepdoctection/datasets/registry.py +0 -101
- deepdoctection-0.40.0/deepdoctection/datasets/save.py +0 -95
- deepdoctection-0.40.0/deepdoctection/eval/tp_eval_callback.py +0 -136
- deepdoctection-0.40.0/deepdoctection/extern/doctrocr.py +0 -545
- deepdoctection-0.40.0/deepdoctection/extern/fastlang.py +0 -122
- deepdoctection-0.40.0/deepdoctection/extern/hflm.py +0 -230
- deepdoctection-0.40.0/deepdoctection/extern/model.py +0 -1160
- deepdoctection-0.40.0/deepdoctection/extern/pt/__init__.py +0 -23
- deepdoctection-0.40.0/deepdoctection/extern/pt/nms.py +0 -40
- deepdoctection-0.40.0/deepdoctection/extern/pt/ptutils.py +0 -59
- deepdoctection-0.40.0/deepdoctection/extern/tp/__init__.py +0 -20
- deepdoctection-0.40.0/deepdoctection/extern/tp/tfutils.py +0 -105
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpcompat.py +0 -138
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -20
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/common.py +0 -128
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -20
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -319
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -20
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -290
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -362
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -221
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -153
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -302
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -491
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -133
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -218
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -131
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -303
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -20
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -79
- deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -102
- deepdoctection-0.40.0/deepdoctection/extern/tpdetect.py +0 -189
- deepdoctection-0.40.0/deepdoctection/mapper/__init__.py +0 -38
- deepdoctection-0.40.0/deepdoctection/mapper/cats.py +0 -383
- deepdoctection-0.40.0/deepdoctection/mapper/cocostruct.py +0 -150
- deepdoctection-0.40.0/deepdoctection/mapper/d2struct.py +0 -212
- deepdoctection-0.40.0/deepdoctection/mapper/hfstruct.py +0 -153
- deepdoctection-0.40.0/deepdoctection/mapper/laylmstruct.py +0 -867
- deepdoctection-0.40.0/deepdoctection/mapper/maputils.py +0 -238
- deepdoctection-0.40.0/deepdoctection/mapper/match.py +0 -197
- deepdoctection-0.40.0/deepdoctection/mapper/misc.py +0 -215
- deepdoctection-0.40.0/deepdoctection/mapper/pascalstruct.py +0 -97
- deepdoctection-0.40.0/deepdoctection/mapper/prodigystruct.py +0 -188
- deepdoctection-0.40.0/deepdoctection/mapper/pubstruct.py +0 -530
- deepdoctection-0.40.0/deepdoctection/mapper/tpstruct.py +0 -117
- deepdoctection-0.40.0/deepdoctection/mapper/xfundstruct.py +0 -200
- deepdoctection-0.40.0/deepdoctection/pipe/base.py +0 -377
- deepdoctection-0.40.0/deepdoctection/pipe/transform.py +0 -88
- deepdoctection-0.40.0/deepdoctection/train/tp_frcnn_train.py +0 -332
- deepdoctection-0.40.0/deepdoctection/utils/__init__.py +0 -71
- deepdoctection-0.40.0/deepdoctection/utils/concurrency.py +0 -144
- deepdoctection-0.40.0/deepdoctection/utils/context.py +0 -133
- deepdoctection-0.40.0/deepdoctection/utils/develop.py +0 -106
- deepdoctection-0.40.0/deepdoctection/utils/env_info.py +0 -564
- deepdoctection-0.40.0/deepdoctection/utils/error.py +0 -84
- deepdoctection-0.40.0/deepdoctection/utils/file_utils.py +0 -732
- deepdoctection-0.40.0/deepdoctection/utils/fs.py +0 -322
- deepdoctection-0.40.0/deepdoctection/utils/identifier.py +0 -83
- deepdoctection-0.40.0/deepdoctection/utils/logger.py +0 -288
- deepdoctection-0.40.0/deepdoctection/utils/metacfg.py +0 -183
- deepdoctection-0.40.0/deepdoctection/utils/mocks.py +0 -93
- deepdoctection-0.40.0/deepdoctection/utils/pdf_utils.py +0 -371
- deepdoctection-0.40.0/deepdoctection/utils/settings.py +0 -442
- deepdoctection-0.40.0/deepdoctection/utils/tqdm.py +0 -61
- deepdoctection-0.40.0/deepdoctection/utils/transform.py +0 -224
- deepdoctection-0.40.0/deepdoctection/utils/types.py +0 -104
- deepdoctection-0.40.0/deepdoctection/utils/utils.py +0 -196
- deepdoctection-0.40.0/deepdoctection/utils/viz.py +0 -775
- deepdoctection-0.40.0/deepdoctection.egg-info/PKG-INFO +0 -431
- deepdoctection-0.40.0/deepdoctection.egg-info/SOURCES.txt +0 -153
- deepdoctection-0.40.0/deepdoctection.egg-info/requires.txt +0 -116
- deepdoctection-0.40.0/setup.cfg +0 -119
- deepdoctection-0.40.0/setup.py +0 -254
- deepdoctection-0.40.0/tests/test_utils.py +0 -90
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/configs/__init__.py +0 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/configs/conf_tesseract.yaml +0 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/py.typed +0 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection.egg-info/dependency_links.txt +0 -0
- {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection.egg-info/top_level.txt +0 -0
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: deepdoctection
|
|
3
|
+
Version: 1.0.3
|
|
4
|
+
Summary: Repository for Document AI - server/inference core package
|
|
5
|
+
Author: Dr. Janis Meyer
|
|
6
|
+
License: Apache License 2.0
|
|
7
|
+
Project-URL: Homepage, https://github.com/deepdoctection/deepdoctection
|
|
8
|
+
Project-URL: Documentation, https://deepdoctection.readthedocs.io
|
|
9
|
+
Project-URL: Repository, https://github.com/deepdoctection/deepdoctection
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
12
|
+
Classifier: Natural Language :: English
|
|
13
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
Requires-Dist: dd-core[full]>=1.0.1
|
|
21
|
+
Requires-Dist: huggingface_hub>=0.26.0
|
|
22
|
+
Provides-Extra: full
|
|
23
|
+
Requires-Dist: dd-datasets[full]>=1.0.1; extra == "full"
|
|
24
|
+
Requires-Dist: boto3==1.34.102; extra == "full"
|
|
25
|
+
Requires-Dist: pdfplumber>=0.11.0; extra == "full"
|
|
26
|
+
Requires-Dist: jdeskew>=0.2.2; extra == "full"
|
|
27
|
+
Requires-Dist: networkx>=2.7.1; extra == "full"
|
|
28
|
+
Requires-Dist: apted==1.0.3; extra == "full"
|
|
29
|
+
Requires-Dist: distance==0.1.3; extra == "full"
|
|
30
|
+
Requires-Dist: lxml>=4.9.1; extra == "full"
|
|
31
|
+
Requires-Dist: pycocotools>=2.0.2; extra == "full"
|
|
32
|
+
Requires-Dist: timm>=0.9.16; extra == "full"
|
|
33
|
+
Requires-Dist: transformers<5.0.0,>=4.48.0; extra == "full"
|
|
34
|
+
Requires-Dist: accelerate>=0.29.1; extra == "full"
|
|
35
|
+
Requires-Dist: python-doctr>=1.0.0; extra == "full"
|
|
36
|
+
Provides-Extra: types
|
|
37
|
+
Requires-Dist: dd_core[types]; extra == "types"
|
|
38
|
+
Requires-Dist: lxml-stubs>=0.5.1; extra == "types"
|
|
39
|
+
Provides-Extra: dev
|
|
40
|
+
Requires-Dist: black==25.11.0; extra == "dev"
|
|
41
|
+
Requires-Dist: isort==7.0.0; extra == "dev"
|
|
42
|
+
Requires-Dist: pylint==4.0.2; extra == "dev"
|
|
43
|
+
Requires-Dist: mypy==1.4.1; extra == "dev"
|
|
44
|
+
Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
|
|
45
|
+
Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
|
|
46
|
+
Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
|
|
47
|
+
Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
|
|
48
|
+
Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
|
|
49
|
+
Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
|
|
50
|
+
Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
|
|
51
|
+
Provides-Extra: test
|
|
52
|
+
Requires-Dist: pytest==9.0.1; extra == "test"
|
|
53
|
+
Requires-Dist: pytest-cov; extra == "test"
|
|
54
|
+
Provides-Extra: docs
|
|
55
|
+
Requires-Dist: mkdocs-material==9.7.0; extra == "docs"
|
|
56
|
+
Requires-Dist: mkdocstrings-python==1.19.0; extra == "docs"
|
|
57
|
+
Requires-Dist: griffe==1.13; extra == "docs"
|
|
58
|
+
|
|
59
|
+
<p align="center">
|
|
60
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/_imgs/dd_logo.png" alt="Deep Doctection Logo" width="60%">
|
|
61
|
+
</p>
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
# deepdoctection
|
|
65
|
+
|
|
66
|
+
**deepdoctection** is the main package for running and training models. It provides the
|
|
67
|
+
pipeline framework, model wrappers, built-in pipelines, training scripts and evaluation methods.
|
|
68
|
+
|
|
69
|
+
The base package only installs the necessary dependencies for running inference with some selected models.
|
|
70
|
+
For training, evaluating as well as running all available models, the full package needs to be installed.
|
|
71
|
+
|
|
72
|
+
## Overview
|
|
73
|
+
|
|
74
|
+
- **analyzer**: Configuration and factory functions for creating document analysis pipelines and the built-in analyzer.
|
|
75
|
+
- **configs**: YAML configuration for pipelines and model profiles for the model catalogue.
|
|
76
|
+
- **extern**: External model wrappers (Detectron2, DocTr, HuggingFace Transformers, Tesseract, PdfPlumber, etc.)
|
|
77
|
+
- **pipe**: Pipeline components and services.
|
|
78
|
+
- **eval**: Evaluation metrics and Evaluator.
|
|
79
|
+
- **train**: Training utilities and training scripts for Detectron2 and selected Transformer models.
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
## Installation
|
|
83
|
+
|
|
84
|
+
### Basic Installation
|
|
85
|
+
|
|
86
|
+
For inference use cases, install the base package:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
uv pip install deepdoctection
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
**Important**: Various dependencies must be installed separately:
|
|
93
|
+
|
|
94
|
+
- **PyTorch**: Follow instructions at https://pytorch.org/get-started/locally/ according to your os and hardware.
|
|
95
|
+
- **Transformers**: `pip install "transformers>=4.48.0,<5.0.0"` (if using HF models)
|
|
96
|
+
- **Timm**: `pip install "timm>=0.9.16"` (necessary if using some dedicated HF models)
|
|
97
|
+
- **DocTr**: `pip install "python-doctr>=1.0.0"` (if using DocTr models)
|
|
98
|
+
- **Detectron2**: Follow instructions at https://detectron2.readthedocs.io/en/latest/tutorials/install.html
|
|
99
|
+
- **PDFPlumber**: `pip install "pdfplumber>=0.11.0"`
|
|
100
|
+
- **JDeskew**: `pip install "jdeskew>=0.2.2"`
|
|
101
|
+
- **Boto3**: `pip install boto3==1.34.102`
|
|
102
|
+
|
|
103
|
+
For running evaluation with various metrics, you can also install and then use:
|
|
104
|
+
|
|
105
|
+
- **APTED**: `pip install apted==1.0.3`
|
|
106
|
+
- **Distance**: `pip install distance==0.1.3`
|
|
107
|
+
- **Pycocotools**: `pip install "pycocotools>=2.0.2"`
|
|
108
|
+
|
|
109
|
+
Image processing is supported by PIL or OpenCV. PIL is used by default and will always be installed. If
|
|
110
|
+
you prefer to use OpenCV, you can install it:
|
|
111
|
+
|
|
112
|
+
- **OpenCV**: `pip install opencv-python==4.8.0.76`
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
### Full Installation (Training & Evaluation)
|
|
116
|
+
|
|
117
|
+
For one large install with all dependencies (except PyTorch), run:
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
uv pip install deepdoctection[full]
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
### Development Installation
|
|
124
|
+
|
|
125
|
+
For development purposes, clone the repository and install it in editable mode.
|
|
126
|
+
|
|
127
|
+
## License
|
|
128
|
+
|
|
129
|
+
Apache License 2.0
|
|
130
|
+
|
|
131
|
+
## Author
|
|
132
|
+
|
|
133
|
+
Dr. Janis Meyer
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/_imgs/dd_logo.png" alt="Deep Doctection Logo" width="60%">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# deepdoctection
|
|
7
|
+
|
|
8
|
+
**deepdoctection** is the main package for running and training models. It provides the
|
|
9
|
+
pipeline framework, model wrappers, built-in pipelines, training scripts and evaluation methods.
|
|
10
|
+
|
|
11
|
+
The base package only installs the necessary dependencies for running inference with some selected models.
|
|
12
|
+
For training, evaluating as well as running all available models, the full package needs to be installed.
|
|
13
|
+
|
|
14
|
+
## Overview
|
|
15
|
+
|
|
16
|
+
- **analyzer**: Configuration and factory functions for creating document analysis pipelines and the built-in analyzer.
|
|
17
|
+
- **configs**: YAML configuration for pipelines and model profiles for the model catalogue.
|
|
18
|
+
- **extern**: External model wrappers (Detectron2, DocTr, HuggingFace Transformers, Tesseract, PdfPlumber, etc.)
|
|
19
|
+
- **pipe**: Pipeline components and services.
|
|
20
|
+
- **eval**: Evaluation metrics and Evaluator.
|
|
21
|
+
- **train**: Training utilities and training scripts for Detectron2 and selected Transformer models.
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
## Installation
|
|
25
|
+
|
|
26
|
+
### Basic Installation
|
|
27
|
+
|
|
28
|
+
For inference use cases, install the base package:
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
uv pip install deepdoctection
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
**Important**: Various dependencies must be installed separately:
|
|
35
|
+
|
|
36
|
+
- **PyTorch**: Follow instructions at https://pytorch.org/get-started/locally/ according to your os and hardware.
|
|
37
|
+
- **Transformers**: `pip install "transformers>=4.48.0,<5.0.0"` (if using HF models)
|
|
38
|
+
- **Timm**: `pip install "timm>=0.9.16"` (necessary if using some dedicated HF models)
|
|
39
|
+
- **DocTr**: `pip install "python-doctr>=1.0.0"` (if using DocTr models)
|
|
40
|
+
- **Detectron2**: Follow instructions at https://detectron2.readthedocs.io/en/latest/tutorials/install.html
|
|
41
|
+
- **PDFPlumber**: `pip install "pdfplumber>=0.11.0"`
|
|
42
|
+
- **JDeskew**: `pip install "jdeskew>=0.2.2"`
|
|
43
|
+
- **Boto3**: `pip install boto3==1.34.102`
|
|
44
|
+
|
|
45
|
+
For running evaluation with various metrics, you can also install and then use:
|
|
46
|
+
|
|
47
|
+
- **APTED**: `pip install apted==1.0.3`
|
|
48
|
+
- **Distance**: `pip install distance==0.1.3`
|
|
49
|
+
- **Pycocotools**: `pip install "pycocotools>=2.0.2"`
|
|
50
|
+
|
|
51
|
+
Image processing is supported by PIL or OpenCV. PIL is used by default and will always be installed. If
|
|
52
|
+
you prefer to use OpenCV, you can install it:
|
|
53
|
+
|
|
54
|
+
- **OpenCV**: `pip install opencv-python==4.8.0.76`
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
### Full Installation (Training & Evaluation)
|
|
58
|
+
|
|
59
|
+
For one large install with all dependencies (except PyTorch), run:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
uv pip install deepdoctection[full]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### Development Installation
|
|
66
|
+
|
|
67
|
+
For development purposes, clone the repository and install it in editable mode.
|
|
68
|
+
|
|
69
|
+
## License
|
|
70
|
+
|
|
71
|
+
Apache License 2.0
|
|
72
|
+
|
|
73
|
+
## Author
|
|
74
|
+
|
|
75
|
+
Dr. Janis Meyer
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "deepdoctection"
|
|
7
|
+
version = "1.0.3"
|
|
8
|
+
authors = [
|
|
9
|
+
{name = "Dr. Janis Meyer"}
|
|
10
|
+
]
|
|
11
|
+
description = "Repository for Document AI - server/inference core package"
|
|
12
|
+
readme = "README.md"
|
|
13
|
+
license = {text = "Apache License 2.0"}
|
|
14
|
+
requires-python = ">=3.10"
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"License :: OSI Approved :: Apache Software License",
|
|
18
|
+
"Natural Language :: English",
|
|
19
|
+
"Operating System :: POSIX :: Linux",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
dependencies = [
|
|
27
|
+
"dd-core[full]>=1.0.1",
|
|
28
|
+
"huggingface_hub>=0.26.0",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.optional-dependencies]
|
|
32
|
+
full = [
|
|
33
|
+
"dd-datasets[full]>=1.0.1",
|
|
34
|
+
# Additional dependencies/ Non DL predictors
|
|
35
|
+
"boto3==1.34.102",
|
|
36
|
+
"pdfplumber>=0.11.0",
|
|
37
|
+
"jdeskew>=0.2.2",
|
|
38
|
+
"networkx>=2.7.1",
|
|
39
|
+
# Some libs for evals
|
|
40
|
+
"apted==1.0.3",
|
|
41
|
+
"distance==0.1.3",
|
|
42
|
+
"lxml>=4.9.1",
|
|
43
|
+
"pycocotools>=2.0.2",
|
|
44
|
+
# DL dependencies
|
|
45
|
+
"timm>=0.9.16",
|
|
46
|
+
"transformers>=4.48.0,<5.0.0",
|
|
47
|
+
"accelerate>=0.29.1",
|
|
48
|
+
"python-doctr>=1.0.0",
|
|
49
|
+
]
|
|
50
|
+
|
|
51
|
+
types = [
|
|
52
|
+
"dd_core[types]",
|
|
53
|
+
"lxml-stubs>=0.5.1",
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
dev = [
|
|
57
|
+
"black==25.11.0",
|
|
58
|
+
"isort==7.0.0",
|
|
59
|
+
"pylint==4.0.2",
|
|
60
|
+
"mypy==1.4.1",
|
|
61
|
+
"types-PyYAML>=6.0.12.12",
|
|
62
|
+
"types-termcolor>=1.1.3",
|
|
63
|
+
"types-tabulate>=0.9.0.3",
|
|
64
|
+
"types-tqdm>=4.66.0.5",
|
|
65
|
+
"types-Pillow>=10.2.0.20240406",
|
|
66
|
+
"types-urllib3>=1.26.25.14",
|
|
67
|
+
"lxml-stubs>=0.5.1",
|
|
68
|
+
]
|
|
69
|
+
|
|
70
|
+
test = [
|
|
71
|
+
"pytest==9.0.1",
|
|
72
|
+
"pytest-cov",
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
docs = [
|
|
76
|
+
"mkdocs-material==9.7.0",
|
|
77
|
+
"mkdocstrings-python==1.19.0",
|
|
78
|
+
"griffe==1.13"
|
|
79
|
+
]
|
|
80
|
+
|
|
81
|
+
[project.urls]
|
|
82
|
+
Homepage = "https://github.com/deepdoctection/deepdoctection"
|
|
83
|
+
Documentation = "https://deepdoctection.readthedocs.io"
|
|
84
|
+
Repository = "https://github.com/deepdoctection/deepdoctection"
|
|
85
|
+
|
|
86
|
+
[tool.setuptools]
|
|
87
|
+
package-dir = {"" = "src"}
|
|
88
|
+
|
|
89
|
+
[tool.setuptools.packages.find]
|
|
90
|
+
where = ["src"]
|
|
91
|
+
|
|
92
|
+
[tool.setuptools.package-data]
|
|
93
|
+
deepdoctection = ["py.typed", "configs/*.yaml", "configs/*.jsonl"]
|
|
94
|
+
|
|
95
|
+
[tool.black]
|
|
96
|
+
line-length = 120
|
|
97
|
+
target-version = ['py310']
|
|
98
|
+
|
|
99
|
+
[tool.isort]
|
|
100
|
+
profile = "black"
|
|
101
|
+
line_length = 120
|
|
102
|
+
|
|
103
|
+
[tool.mypy]
|
|
104
|
+
python_version = "3.10"
|
|
105
|
+
warn_return_any = true
|
|
106
|
+
warn_unused_configs = true
|
|
107
|
+
ignore_missing_imports = true
|
|
108
|
+
|
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
# File: __init__.py
|
|
3
|
+
|
|
4
|
+
"""
|
|
5
|
+
Init file for deepdoctection package. This file is used to import all submodules and to set some environment variables
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import sys
|
|
9
|
+
from typing import TYPE_CHECKING, Dict, List
|
|
10
|
+
|
|
11
|
+
from dd_core.utils.env_info import collect_env_info
|
|
12
|
+
from dd_core.utils.file_utils import _LazyModule
|
|
13
|
+
from dd_core.utils.logger import LoggingRecord, logger
|
|
14
|
+
|
|
15
|
+
__version__ = "1.0.3"
|
|
16
|
+
_IMPORT_STRUCTURE = {
|
|
17
|
+
"analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
|
|
18
|
+
"eval": [
|
|
19
|
+
"AccuracyMetric",
|
|
20
|
+
"ConfusionMetric",
|
|
21
|
+
"PrecisionMetric",
|
|
22
|
+
"RecallMetric",
|
|
23
|
+
"F1Metric",
|
|
24
|
+
"PrecisionMetricMicro",
|
|
25
|
+
"RecallMetricMicro",
|
|
26
|
+
"F1MetricMicro",
|
|
27
|
+
"MetricBase",
|
|
28
|
+
"CocoMetric",
|
|
29
|
+
"Evaluator",
|
|
30
|
+
"metric_registry",
|
|
31
|
+
"get_metric",
|
|
32
|
+
"TableTree",
|
|
33
|
+
"CustomConfig",
|
|
34
|
+
"TEDS",
|
|
35
|
+
"TedsMetric",
|
|
36
|
+
],
|
|
37
|
+
"extern": [
|
|
38
|
+
"ModelCategories",
|
|
39
|
+
"NerModelCategories",
|
|
40
|
+
"PredictorBase",
|
|
41
|
+
"DetectionResult",
|
|
42
|
+
"ObjectDetector",
|
|
43
|
+
"PdfMiner",
|
|
44
|
+
"TextRecognizer",
|
|
45
|
+
"TokenClassResult",
|
|
46
|
+
"SequenceClassResult",
|
|
47
|
+
"LMTokenClassifier",
|
|
48
|
+
"LMSequenceClassifier",
|
|
49
|
+
"LanguageDetector",
|
|
50
|
+
"ImageTransformer",
|
|
51
|
+
"DeterministicImageTransformer",
|
|
52
|
+
"InferenceResize",
|
|
53
|
+
"D2FrcnnDetector",
|
|
54
|
+
"D2FrcnnTracingDetector",
|
|
55
|
+
"Jdeskewer",
|
|
56
|
+
"DoctrTextlineDetector",
|
|
57
|
+
"DoctrTextRecognizer",
|
|
58
|
+
"DocTrRotationTransformer",
|
|
59
|
+
"HFDetrDerivedDetector",
|
|
60
|
+
"get_tokenizer_from_architecture",
|
|
61
|
+
"HFLayoutLmTokenClassifierBase",
|
|
62
|
+
"HFLayoutLmTokenClassifier",
|
|
63
|
+
"HFLayoutLmv2TokenClassifier",
|
|
64
|
+
"HFLayoutLmv3TokenClassifier",
|
|
65
|
+
"HFLayoutLmSequenceClassifier",
|
|
66
|
+
"HFLayoutLmv2SequenceClassifier",
|
|
67
|
+
"HFLayoutLmv3SequenceClassifier",
|
|
68
|
+
"HFLiltTokenClassifier",
|
|
69
|
+
"HFLiltSequenceClassifier",
|
|
70
|
+
"HFLmTokenClassifier",
|
|
71
|
+
"HFLmSequenceClassifier",
|
|
72
|
+
"HFLmLanguageDetector",
|
|
73
|
+
"ModelProfile",
|
|
74
|
+
"ModelCatalog",
|
|
75
|
+
"print_model_infos",
|
|
76
|
+
"ModelDownloadManager",
|
|
77
|
+
"PdfPlumberTextDetector",
|
|
78
|
+
"Pdfmium2TextDetector",
|
|
79
|
+
"TesseractOcrDetector",
|
|
80
|
+
"TesseractRotationTransformer",
|
|
81
|
+
"TextractOcrDetector",
|
|
82
|
+
],
|
|
83
|
+
"pipe": [
|
|
84
|
+
"DatapointManager",
|
|
85
|
+
"PipelineComponent",
|
|
86
|
+
"PredictorPipelineComponent",
|
|
87
|
+
"LanguageModelPipelineComponent",
|
|
88
|
+
"ImageTransformPipelineComponent",
|
|
89
|
+
"Pipeline",
|
|
90
|
+
"DetectResultGenerator",
|
|
91
|
+
"SubImageLayoutService",
|
|
92
|
+
"ImageCroppingService",
|
|
93
|
+
"IntersectionMatcher",
|
|
94
|
+
"NeighbourMatcher",
|
|
95
|
+
"FamilyCompound",
|
|
96
|
+
"MatchingService",
|
|
97
|
+
"PageParsingService",
|
|
98
|
+
"AnnotationNmsService",
|
|
99
|
+
"MultiThreadPipelineComponent",
|
|
100
|
+
"DoctectionPipe",
|
|
101
|
+
"LanguageDetectionService",
|
|
102
|
+
"skip_if_category_or_service_extracted",
|
|
103
|
+
"ImageLayoutService",
|
|
104
|
+
"LMTokenClassifierService",
|
|
105
|
+
"LMSequenceClassifierService",
|
|
106
|
+
"OrderGenerator",
|
|
107
|
+
"TextLineGenerator",
|
|
108
|
+
"TextLineService",
|
|
109
|
+
"TextOrderService",
|
|
110
|
+
"TableSegmentationRefinementService",
|
|
111
|
+
"generate_html_string",
|
|
112
|
+
"pipeline_component_registry",
|
|
113
|
+
"TableSegmentationService",
|
|
114
|
+
"PubtablesSegmentationService",
|
|
115
|
+
"SegmentationResult",
|
|
116
|
+
"TextExtractionService",
|
|
117
|
+
"SimpleTransformService",
|
|
118
|
+
],
|
|
119
|
+
"train": [
|
|
120
|
+
"D2Trainer",
|
|
121
|
+
"train_d2_faster_rcnn",
|
|
122
|
+
"LayoutLMTrainer",
|
|
123
|
+
"train_hf_layoutlm",
|
|
124
|
+
"DetrDerivedTrainer",
|
|
125
|
+
"train_hf_detr",
|
|
126
|
+
],
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
# Setting some environment variables so that standard functions can be invoked with available hardware
|
|
131
|
+
env_info = collect_env_info()
|
|
132
|
+
logger.debug(LoggingRecord(msg=env_info))
|
|
133
|
+
|
|
134
|
+
# Build extra objects for the lazy module, starting with the version
|
|
135
|
+
_extra_objects: Dict[str, object] = {"__version__": __version__}
|
|
136
|
+
|
|
137
|
+
# Re-export all public attributes from dd_core under deepdoctection namespace
|
|
138
|
+
import dd_core # pylint: disable=C0413
|
|
139
|
+
|
|
140
|
+
for _name in dir(dd_core):
|
|
141
|
+
if _name.startswith("_"):
|
|
142
|
+
continue
|
|
143
|
+
# Optional: if dd_core defines __all__, you could respect it instead:
|
|
144
|
+
# if hasattr(dd_core, "__all__") and _name not in dd_core.__all__:
|
|
145
|
+
# continue
|
|
146
|
+
_extra_objects[_name] = getattr(dd_core, _name)
|
|
147
|
+
|
|
148
|
+
try:
|
|
149
|
+
import dd_datasets
|
|
150
|
+
|
|
151
|
+
for _name in dir(dd_datasets):
|
|
152
|
+
if _name.startswith("_"):
|
|
153
|
+
continue
|
|
154
|
+
_extra_objects[_name] = getattr(dd_datasets, _name)
|
|
155
|
+
except ImportError:
|
|
156
|
+
pass
|
|
157
|
+
|
|
158
|
+
# Direct imports for type-checking
|
|
159
|
+
if TYPE_CHECKING:
|
|
160
|
+
from dd_core import *
|
|
161
|
+
from dd_datasets import *
|
|
162
|
+
|
|
163
|
+
from .analyzer import *
|
|
164
|
+
from .eval import *
|
|
165
|
+
from .extern import * # type: ignore
|
|
166
|
+
from .pipe import *
|
|
167
|
+
from .train import *
|
|
168
|
+
|
|
169
|
+
else:
|
|
170
|
+
sys.modules[__name__] = _LazyModule(
|
|
171
|
+
__name__,
|
|
172
|
+
globals()["__file__"],
|
|
173
|
+
_IMPORT_STRUCTURE,
|
|
174
|
+
module_spec=globals().get("__spec__"),
|
|
175
|
+
extra_objects=_extra_objects,
|
|
176
|
+
)
|