deepdoctection 0.40.0__tar.gz → 1.0.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. deepdoctection-1.0.3/PKG-INFO +133 -0
  2. deepdoctection-1.0.3/README.md +75 -0
  3. deepdoctection-1.0.3/pyproject.toml +108 -0
  4. deepdoctection-1.0.3/setup.cfg +4 -0
  5. deepdoctection-1.0.3/src/deepdoctection/__init__.py +176 -0
  6. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/analyzer/__init__.py +2 -1
  7. deepdoctection-1.0.3/src/deepdoctection/analyzer/config.py +903 -0
  8. deepdoctection-1.0.3/src/deepdoctection/analyzer/dd.py +121 -0
  9. deepdoctection-1.0.3/src/deepdoctection/analyzer/factory.py +1807 -0
  10. deepdoctection-1.0.3/src/deepdoctection/configs/conf_dd_one.yaml +186 -0
  11. deepdoctection-1.0.3/src/deepdoctection/configs/profiles.jsonl +25 -0
  12. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/__init__.py +8 -3
  13. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/accmetric.py +88 -62
  14. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/base.py +39 -16
  15. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/cocometric.py +28 -21
  16. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/eval.py +80 -62
  17. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/registry.py +6 -3
  18. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/eval/tedsmetric.py +40 -24
  19. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/__init__.py +2 -3
  20. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/base.py +292 -99
  21. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/d2detect.py +252 -105
  22. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/deskew.py +25 -15
  23. deepdoctection-1.0.3/src/deepdoctection/extern/doctrocr.py +517 -0
  24. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/hfdetr.py +144 -66
  25. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/hflayoutlm.py +340 -391
  26. deepdoctection-1.0.3/src/deepdoctection/extern/hflm.py +639 -0
  27. deepdoctection-1.0.3/src/deepdoctection/extern/model.py +498 -0
  28. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/pdftext.py +43 -17
  29. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/tessocr.py +118 -64
  30. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/extern/texocr.py +44 -19
  31. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/__init__.py +1 -2
  32. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/anngen.py +139 -70
  33. deepdoctection-1.0.3/src/deepdoctection/pipe/base.py +501 -0
  34. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/common.py +205 -100
  35. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/concurrency.py +93 -63
  36. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/doctectionpipe.py +133 -70
  37. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/language.py +48 -30
  38. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/layout.py +52 -24
  39. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/lm.py +174 -102
  40. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/order.py +236 -137
  41. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/refine.py +156 -91
  42. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/registry.py +1 -1
  43. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/segment.py +349 -223
  44. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/sub_layout.py +88 -53
  45. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/pipe/text.py +76 -45
  46. deepdoctection-1.0.3/src/deepdoctection/pipe/transform.py +114 -0
  47. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/train/__init__.py +1 -4
  48. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/train/d2_frcnn_train.py +106 -84
  49. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/train/hf_detr_train.py +84 -51
  50. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/train/hf_layoutlm_train.py +160 -150
  51. deepdoctection-1.0.3/src/deepdoctection.egg-info/PKG-INFO +133 -0
  52. deepdoctection-1.0.3/src/deepdoctection.egg-info/SOURCES.txt +56 -0
  53. deepdoctection-1.0.3/src/deepdoctection.egg-info/requires.txt +43 -0
  54. deepdoctection-0.40.0/LICENSE +0 -201
  55. deepdoctection-0.40.0/PKG-INFO +0 -431
  56. deepdoctection-0.40.0/README.md +0 -290
  57. deepdoctection-0.40.0/deepdoctection/__init__.py +0 -452
  58. deepdoctection-0.40.0/deepdoctection/analyzer/_config.py +0 -143
  59. deepdoctection-0.40.0/deepdoctection/analyzer/dd.py +0 -155
  60. deepdoctection-0.40.0/deepdoctection/analyzer/factory.py +0 -753
  61. deepdoctection-0.40.0/deepdoctection/configs/conf_dd_one.yaml +0 -145
  62. deepdoctection-0.40.0/deepdoctection/dataflow/__init__.py +0 -22
  63. deepdoctection-0.40.0/deepdoctection/dataflow/base.py +0 -145
  64. deepdoctection-0.40.0/deepdoctection/dataflow/common.py +0 -316
  65. deepdoctection-0.40.0/deepdoctection/dataflow/custom.py +0 -196
  66. deepdoctection-0.40.0/deepdoctection/dataflow/custom_serialize.py +0 -630
  67. deepdoctection-0.40.0/deepdoctection/dataflow/parallel_map.py +0 -444
  68. deepdoctection-0.40.0/deepdoctection/dataflow/serialize.py +0 -148
  69. deepdoctection-0.40.0/deepdoctection/dataflow/stats.py +0 -271
  70. deepdoctection-0.40.0/deepdoctection/datapoint/__init__.py +0 -40
  71. deepdoctection-0.40.0/deepdoctection/datapoint/annotation.py +0 -531
  72. deepdoctection-0.40.0/deepdoctection/datapoint/box.py +0 -750
  73. deepdoctection-0.40.0/deepdoctection/datapoint/convert.py +0 -210
  74. deepdoctection-0.40.0/deepdoctection/datapoint/image.py +0 -795
  75. deepdoctection-0.40.0/deepdoctection/datapoint/view.py +0 -1231
  76. deepdoctection-0.40.0/deepdoctection/datasets/__init__.py +0 -35
  77. deepdoctection-0.40.0/deepdoctection/datasets/adapter.py +0 -177
  78. deepdoctection-0.40.0/deepdoctection/datasets/base.py +0 -525
  79. deepdoctection-0.40.0/deepdoctection/datasets/dataflow_builder.py +0 -120
  80. deepdoctection-0.40.0/deepdoctection/datasets/info.py +0 -446
  81. deepdoctection-0.40.0/deepdoctection/datasets/instances/__init__.py +0 -55
  82. deepdoctection-0.40.0/deepdoctection/datasets/instances/doclaynet.py +0 -298
  83. deepdoctection-0.40.0/deepdoctection/datasets/instances/fintabnet.py +0 -294
  84. deepdoctection-0.40.0/deepdoctection/datasets/instances/funsd.py +0 -200
  85. deepdoctection-0.40.0/deepdoctection/datasets/instances/iiitar13k.py +0 -196
  86. deepdoctection-0.40.0/deepdoctection/datasets/instances/layouttest.py +0 -134
  87. deepdoctection-0.40.0/deepdoctection/datasets/instances/publaynet.py +0 -151
  88. deepdoctection-0.40.0/deepdoctection/datasets/instances/pubtables1m.py +0 -334
  89. deepdoctection-0.40.0/deepdoctection/datasets/instances/pubtabnet.py +0 -214
  90. deepdoctection-0.40.0/deepdoctection/datasets/instances/rvlcdip.py +0 -182
  91. deepdoctection-0.40.0/deepdoctection/datasets/instances/xfund.py +0 -241
  92. deepdoctection-0.40.0/deepdoctection/datasets/instances/xsl/__init__.py +0 -16
  93. deepdoctection-0.40.0/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -45
  94. deepdoctection-0.40.0/deepdoctection/datasets/registry.py +0 -101
  95. deepdoctection-0.40.0/deepdoctection/datasets/save.py +0 -95
  96. deepdoctection-0.40.0/deepdoctection/eval/tp_eval_callback.py +0 -136
  97. deepdoctection-0.40.0/deepdoctection/extern/doctrocr.py +0 -545
  98. deepdoctection-0.40.0/deepdoctection/extern/fastlang.py +0 -122
  99. deepdoctection-0.40.0/deepdoctection/extern/hflm.py +0 -230
  100. deepdoctection-0.40.0/deepdoctection/extern/model.py +0 -1160
  101. deepdoctection-0.40.0/deepdoctection/extern/pt/__init__.py +0 -23
  102. deepdoctection-0.40.0/deepdoctection/extern/pt/nms.py +0 -40
  103. deepdoctection-0.40.0/deepdoctection/extern/pt/ptutils.py +0 -59
  104. deepdoctection-0.40.0/deepdoctection/extern/tp/__init__.py +0 -20
  105. deepdoctection-0.40.0/deepdoctection/extern/tp/tfutils.py +0 -105
  106. deepdoctection-0.40.0/deepdoctection/extern/tp/tpcompat.py +0 -138
  107. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -20
  108. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/common.py +0 -128
  109. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -20
  110. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -319
  111. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -20
  112. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -290
  113. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -362
  114. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -221
  115. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -153
  116. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -302
  117. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -491
  118. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -133
  119. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -218
  120. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -131
  121. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -303
  122. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -20
  123. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -79
  124. deepdoctection-0.40.0/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -102
  125. deepdoctection-0.40.0/deepdoctection/extern/tpdetect.py +0 -189
  126. deepdoctection-0.40.0/deepdoctection/mapper/__init__.py +0 -38
  127. deepdoctection-0.40.0/deepdoctection/mapper/cats.py +0 -383
  128. deepdoctection-0.40.0/deepdoctection/mapper/cocostruct.py +0 -150
  129. deepdoctection-0.40.0/deepdoctection/mapper/d2struct.py +0 -212
  130. deepdoctection-0.40.0/deepdoctection/mapper/hfstruct.py +0 -153
  131. deepdoctection-0.40.0/deepdoctection/mapper/laylmstruct.py +0 -867
  132. deepdoctection-0.40.0/deepdoctection/mapper/maputils.py +0 -238
  133. deepdoctection-0.40.0/deepdoctection/mapper/match.py +0 -197
  134. deepdoctection-0.40.0/deepdoctection/mapper/misc.py +0 -215
  135. deepdoctection-0.40.0/deepdoctection/mapper/pascalstruct.py +0 -97
  136. deepdoctection-0.40.0/deepdoctection/mapper/prodigystruct.py +0 -188
  137. deepdoctection-0.40.0/deepdoctection/mapper/pubstruct.py +0 -530
  138. deepdoctection-0.40.0/deepdoctection/mapper/tpstruct.py +0 -117
  139. deepdoctection-0.40.0/deepdoctection/mapper/xfundstruct.py +0 -200
  140. deepdoctection-0.40.0/deepdoctection/pipe/base.py +0 -377
  141. deepdoctection-0.40.0/deepdoctection/pipe/transform.py +0 -88
  142. deepdoctection-0.40.0/deepdoctection/train/tp_frcnn_train.py +0 -332
  143. deepdoctection-0.40.0/deepdoctection/utils/__init__.py +0 -71
  144. deepdoctection-0.40.0/deepdoctection/utils/concurrency.py +0 -144
  145. deepdoctection-0.40.0/deepdoctection/utils/context.py +0 -133
  146. deepdoctection-0.40.0/deepdoctection/utils/develop.py +0 -106
  147. deepdoctection-0.40.0/deepdoctection/utils/env_info.py +0 -564
  148. deepdoctection-0.40.0/deepdoctection/utils/error.py +0 -84
  149. deepdoctection-0.40.0/deepdoctection/utils/file_utils.py +0 -732
  150. deepdoctection-0.40.0/deepdoctection/utils/fs.py +0 -322
  151. deepdoctection-0.40.0/deepdoctection/utils/identifier.py +0 -83
  152. deepdoctection-0.40.0/deepdoctection/utils/logger.py +0 -288
  153. deepdoctection-0.40.0/deepdoctection/utils/metacfg.py +0 -183
  154. deepdoctection-0.40.0/deepdoctection/utils/mocks.py +0 -93
  155. deepdoctection-0.40.0/deepdoctection/utils/pdf_utils.py +0 -371
  156. deepdoctection-0.40.0/deepdoctection/utils/settings.py +0 -442
  157. deepdoctection-0.40.0/deepdoctection/utils/tqdm.py +0 -61
  158. deepdoctection-0.40.0/deepdoctection/utils/transform.py +0 -224
  159. deepdoctection-0.40.0/deepdoctection/utils/types.py +0 -104
  160. deepdoctection-0.40.0/deepdoctection/utils/utils.py +0 -196
  161. deepdoctection-0.40.0/deepdoctection/utils/viz.py +0 -775
  162. deepdoctection-0.40.0/deepdoctection.egg-info/PKG-INFO +0 -431
  163. deepdoctection-0.40.0/deepdoctection.egg-info/SOURCES.txt +0 -153
  164. deepdoctection-0.40.0/deepdoctection.egg-info/requires.txt +0 -116
  165. deepdoctection-0.40.0/setup.cfg +0 -119
  166. deepdoctection-0.40.0/setup.py +0 -254
  167. deepdoctection-0.40.0/tests/test_utils.py +0 -90
  168. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/configs/__init__.py +0 -0
  169. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  170. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection/py.typed +0 -0
  171. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection.egg-info/dependency_links.txt +0 -0
  172. {deepdoctection-0.40.0 → deepdoctection-1.0.3/src}/deepdoctection.egg-info/top_level.txt +0 -0
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: deepdoctection
3
+ Version: 1.0.3
4
+ Summary: Repository for Document AI - server/inference core package
5
+ Author: Dr. Janis Meyer
6
+ License: Apache License 2.0
7
+ Project-URL: Homepage, https://github.com/deepdoctection/deepdoctection
8
+ Project-URL: Documentation, https://deepdoctection.readthedocs.io
9
+ Project-URL: Repository, https://github.com/deepdoctection/deepdoctection
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Natural Language :: English
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: dd-core[full]>=1.0.1
21
+ Requires-Dist: huggingface_hub>=0.26.0
22
+ Provides-Extra: full
23
+ Requires-Dist: dd-datasets[full]>=1.0.1; extra == "full"
24
+ Requires-Dist: boto3==1.34.102; extra == "full"
25
+ Requires-Dist: pdfplumber>=0.11.0; extra == "full"
26
+ Requires-Dist: jdeskew>=0.2.2; extra == "full"
27
+ Requires-Dist: networkx>=2.7.1; extra == "full"
28
+ Requires-Dist: apted==1.0.3; extra == "full"
29
+ Requires-Dist: distance==0.1.3; extra == "full"
30
+ Requires-Dist: lxml>=4.9.1; extra == "full"
31
+ Requires-Dist: pycocotools>=2.0.2; extra == "full"
32
+ Requires-Dist: timm>=0.9.16; extra == "full"
33
+ Requires-Dist: transformers<5.0.0,>=4.48.0; extra == "full"
34
+ Requires-Dist: accelerate>=0.29.1; extra == "full"
35
+ Requires-Dist: python-doctr>=1.0.0; extra == "full"
36
+ Provides-Extra: types
37
+ Requires-Dist: dd_core[types]; extra == "types"
38
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "types"
39
+ Provides-Extra: dev
40
+ Requires-Dist: black==25.11.0; extra == "dev"
41
+ Requires-Dist: isort==7.0.0; extra == "dev"
42
+ Requires-Dist: pylint==4.0.2; extra == "dev"
43
+ Requires-Dist: mypy==1.4.1; extra == "dev"
44
+ Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
45
+ Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
46
+ Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
47
+ Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
48
+ Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
49
+ Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
50
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
51
+ Provides-Extra: test
52
+ Requires-Dist: pytest==9.0.1; extra == "test"
53
+ Requires-Dist: pytest-cov; extra == "test"
54
+ Provides-Extra: docs
55
+ Requires-Dist: mkdocs-material==9.7.0; extra == "docs"
56
+ Requires-Dist: mkdocstrings-python==1.19.0; extra == "docs"
57
+ Requires-Dist: griffe==1.13; extra == "docs"
58
+
59
+ <p align="center">
60
+ <img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/_imgs/dd_logo.png" alt="Deep Doctection Logo" width="60%">
61
+ </p>
62
+
63
+
64
+ # deepdoctection
65
+
66
+ **deepdoctection** is the main package for running and training models. It provides the
67
+ pipeline framework, model wrappers, built-in pipelines, training scripts and evaluation methods.
68
+
69
+ The base package only installs the necessary dependencies for running inference with some selected models.
70
+ For training, evaluating as well as running all available models, the full package needs to be installed.
71
+
72
+ ## Overview
73
+
74
+ - **analyzer**: Configuration and factory functions for creating document analysis pipelines and the built-in analyzer.
75
+ - **configs**: YAML configuration for pipelines and model profiles for the model catalogue.
76
+ - **extern**: External model wrappers (Detectron2, DocTr, HuggingFace Transformers, Tesseract, PdfPlumber, etc.)
77
+ - **pipe**: Pipeline components and services.
78
+ - **eval**: Evaluation metrics and Evaluator.
79
+ - **train**: Training utilities and training scripts for Detectron2 and selected Transformer models.
80
+
81
+
82
+ ## Installation
83
+
84
+ ### Basic Installation
85
+
86
+ For inference use cases, install the base package:
87
+
88
+ ```bash
89
+ uv pip install deepdoctection
90
+ ```
91
+
92
+ **Important**: Various dependencies must be installed separately:
93
+
94
+ - **PyTorch**: Follow instructions at https://pytorch.org/get-started/locally/ according to your OS and hardware.
95
+ - **Transformers**: `pip install "transformers>=4.48.0"` (if using HF models)
96
+ - **Timm**: `pip install "timm>=0.9.16"` (necessary if using some dedicated HF models)
97
+ - **DocTr**: `pip install "python-doctr>=1.0.0"` (if using DocTr models)
98
+ - **Detectron2**: Follow instructions at https://detectron2.readthedocs.io/en/latest/tutorials/install.html
99
+ - **PDFPlumber**: `pip install "pdfplumber>=0.11.0"`
100
+ - **JDeskew**: `pip install "jdeskew>=0.2.2"`
101
+ - **Boto3**: `pip install boto3==1.34.102`
102
+
103
+ For running evaluation with various metrics, you can also install and then use:
104
+
105
+ - **APTED**: `pip install apted==1.0.3`
106
+ - **Distance**: `pip install distance==0.1.3`
107
+ - **Pycocotools**: `pip install "pycocotools>=2.0.2"`
108
+
109
+ Image processing is supported by PIL or OpenCV. PIL is used by default and will always be installed. If
110
+ you prefer to use OpenCV, you can install it:
111
+
112
+ - **OpenCV**: `pip install opencv-python==4.8.0.76`
113
+
114
+
115
+ ### Full Installation (Training & Evaluation)
116
+
117
+ For one large install with all dependencies (except PyTorch), run:
118
+
119
+ ```bash
120
+ uv pip install "deepdoctection[full]"
121
+ ```
122
+
123
+ ### Development Installation
124
+
125
+ For development purposes, clone the repository and install in editable mode.
126
+
127
+ ## License
128
+
129
+ Apache License 2.0
130
+
131
+ ## Author
132
+
133
+ Dr. Janis Meyer
@@ -0,0 +1,75 @@
1
+ <p align="center">
2
+ <img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/_imgs/dd_logo.png" alt="Deep Doctection Logo" width="60%">
3
+ </p>
4
+
5
+
6
+ # deepdoctection
7
+
8
+ **deepdoctection** is the main package for running and training models. It provides the
9
+ pipeline framework, model wrappers, built-in pipelines, training scripts and evaluation methods.
10
+
11
+ The base package only installs the necessary dependencies for running inference with some selected models.
12
+ For training, evaluating as well as running all available models, the full package needs to be installed.
13
+
14
+ ## Overview
15
+
16
+ - **analyzer**: Configuration and factory functions for creating document analysis pipelines and the built-in analyzer.
17
+ - **configs**: YAML configuration for pipelines and model profiles for the model catalogue.
18
+ - **extern**: External model wrappers (Detectron2, DocTr, HuggingFace Transformers, Tesseract, PdfPlumber, etc.)
19
+ - **pipe**: Pipeline components and services.
20
+ - **eval**: Evaluation metrics and Evaluator.
21
+ - **train**: Training utilities and training scripts for Detectron2 and selected Transformer models.
22
+
23
+
24
+ ## Installation
25
+
26
+ ### Basic Installation
27
+
28
+ For inference use cases, install the base package:
29
+
30
+ ```bash
31
+ uv pip install deepdoctection
32
+ ```
33
+
34
+ **Important**: Various dependencies must be installed separately:
35
+
36
+ - **PyTorch**: Follow instructions at https://pytorch.org/get-started/locally/ according to your OS and hardware.
37
+ - **Transformers**: `pip install "transformers>=4.48.0"` (if using HF models)
38
+ - **Timm**: `pip install "timm>=0.9.16"` (necessary if using some dedicated HF models)
39
+ - **DocTr**: `pip install "python-doctr>=1.0.0"` (if using DocTr models)
40
+ - **Detectron2**: Follow instructions at https://detectron2.readthedocs.io/en/latest/tutorials/install.html
41
+ - **PDFPlumber**: `pip install "pdfplumber>=0.11.0"`
42
+ - **JDeskew**: `pip install "jdeskew>=0.2.2"`
43
+ - **Boto3**: `pip install boto3==1.34.102`
44
+
45
+ For running evaluation with various metrics, you can also install and then use:
46
+
47
+ - **APTED**: `pip install apted==1.0.3`
48
+ - **Distance**: `pip install distance==0.1.3`
49
+ - **Pycocotools**: `pip install "pycocotools>=2.0.2"`
50
+
51
+ Image processing is supported by PIL or OpenCV. PIL is used by default and will always be installed. If
52
+ you prefer to use OpenCV, you can install it:
53
+
54
+ - **OpenCV**: `pip install opencv-python==4.8.0.76`
55
+
56
+
57
+ ### Full Installation (Training & Evaluation)
58
+
59
+ For one large install with all dependencies (except PyTorch), run:
60
+
61
+ ```bash
62
+ uv pip install "deepdoctection[full]"
63
+ ```
64
+
65
+ ### Development Installation
66
+
67
+ For development purposes, clone the repository and install in editable mode.
68
+
69
+ ## License
70
+
71
+ Apache License 2.0
72
+
73
+ ## Author
74
+
75
+ Dr. Janis Meyer
@@ -0,0 +1,108 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "deepdoctection"
7
+ version = "1.0.3"
8
+ authors = [
9
+ {name = "Dr. Janis Meyer"}
10
+ ]
11
+ description = "Repository for Document AI - server/inference core package"
12
+ readme = "README.md"
13
+ license = {text = "Apache License 2.0"}
14
+ requires-python = ">=3.10"
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "License :: OSI Approved :: Apache Software License",
18
+ "Natural Language :: English",
19
+ "Operating System :: POSIX :: Linux",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ ]
25
+
26
+ dependencies = [
27
+ "dd-core[full]>=1.0.1",
28
+ "huggingface_hub>=0.26.0",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ full = [
33
+ "dd-datasets[full]>=1.0.1",
34
+ # Additional dependencies/ Non DL predictors
35
+ "boto3==1.34.102",
36
+ "pdfplumber>=0.11.0",
37
+ "jdeskew>=0.2.2",
38
+ "networkx>=2.7.1",
39
+ # Some libs for evals
40
+ "apted==1.0.3",
41
+ "distance==0.1.3",
42
+ "lxml>=4.9.1",
43
+ "pycocotools>=2.0.2",
44
+ # DL dependencies
45
+ "timm>=0.9.16",
46
+ "transformers>=4.48.0,<5.0.0",
47
+ "accelerate>=0.29.1",
48
+ "python-doctr>=1.0.0",
49
+ ]
50
+
51
+ types = [
52
+ "dd_core[types]",
53
+ "lxml-stubs>=0.5.1",
54
+ ]
55
+
56
+ dev = [
57
+ "black==25.11.0",
58
+ "isort==7.0.0",
59
+ "pylint==4.0.2",
60
+ "mypy==1.4.1",
61
+ "types-PyYAML>=6.0.12.12",
62
+ "types-termcolor>=1.1.3",
63
+ "types-tabulate>=0.9.0.3",
64
+ "types-tqdm>=4.66.0.5",
65
+ "types-Pillow>=10.2.0.20240406",
66
+ "types-urllib3>=1.26.25.14",
67
+ "lxml-stubs>=0.5.1",
68
+ ]
69
+
70
+ test = [
71
+ "pytest==9.0.1",
72
+ "pytest-cov",
73
+ ]
74
+
75
+ docs = [
76
+ "mkdocs-material==9.7.0",
77
+ "mkdocstrings-python==1.19.0",
78
+ "griffe==1.13"
79
+ ]
80
+
81
+ [project.urls]
82
+ Homepage = "https://github.com/deepdoctection/deepdoctection"
83
+ Documentation = "https://deepdoctection.readthedocs.io"
84
+ Repository = "https://github.com/deepdoctection/deepdoctection"
85
+
86
+ [tool.setuptools]
87
+ package-dir = {"" = "src"}
88
+
89
+ [tool.setuptools.packages.find]
90
+ where = ["src"]
91
+
92
+ [tool.setuptools.package-data]
93
+ deepdoctection = ["py.typed", "configs/*.yaml", "configs/*.jsonl"]
94
+
95
+ [tool.black]
96
+ line-length = 120
97
+ target-version = ['py310']
98
+
99
+ [tool.isort]
100
+ profile = "black"
101
+ line_length = 120
102
+
103
+ [tool.mypy]
104
+ python_version = "3.10"
105
+ warn_return_any = true
106
+ warn_unused_configs = true
107
+ ignore_missing_imports = true
108
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,176 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: __init__.py
3
+
4
+ """
5
+ Init file for deepdoctection package. This file is used to import all submodules and to set some environment variables
6
+ """
7
+
8
+ import sys
9
+ from typing import TYPE_CHECKING, Dict, List
10
+
11
+ from dd_core.utils.env_info import collect_env_info
12
+ from dd_core.utils.file_utils import _LazyModule
13
+ from dd_core.utils.logger import LoggingRecord, logger
14
+
15
+ __version__ = "1.0.3"
16
+ _IMPORT_STRUCTURE = {
17
+ "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
18
+ "eval": [
19
+ "AccuracyMetric",
20
+ "ConfusionMetric",
21
+ "PrecisionMetric",
22
+ "RecallMetric",
23
+ "F1Metric",
24
+ "PrecisionMetricMicro",
25
+ "RecallMetricMicro",
26
+ "F1MetricMicro",
27
+ "MetricBase",
28
+ "CocoMetric",
29
+ "Evaluator",
30
+ "metric_registry",
31
+ "get_metric",
32
+ "TableTree",
33
+ "CustomConfig",
34
+ "TEDS",
35
+ "TedsMetric",
36
+ ],
37
+ "extern": [
38
+ "ModelCategories",
39
+ "NerModelCategories",
40
+ "PredictorBase",
41
+ "DetectionResult",
42
+ "ObjectDetector",
43
+ "PdfMiner",
44
+ "TextRecognizer",
45
+ "TokenClassResult",
46
+ "SequenceClassResult",
47
+ "LMTokenClassifier",
48
+ "LMSequenceClassifier",
49
+ "LanguageDetector",
50
+ "ImageTransformer",
51
+ "DeterministicImageTransformer",
52
+ "InferenceResize",
53
+ "D2FrcnnDetector",
54
+ "D2FrcnnTracingDetector",
55
+ "Jdeskewer",
56
+ "DoctrTextlineDetector",
57
+ "DoctrTextRecognizer",
58
+ "DocTrRotationTransformer",
59
+ "HFDetrDerivedDetector",
60
+ "get_tokenizer_from_architecture",
61
+ "HFLayoutLmTokenClassifierBase",
62
+ "HFLayoutLmTokenClassifier",
63
+ "HFLayoutLmv2TokenClassifier",
64
+ "HFLayoutLmv3TokenClassifier",
65
+ "HFLayoutLmSequenceClassifier",
66
+ "HFLayoutLmv2SequenceClassifier",
67
+ "HFLayoutLmv3SequenceClassifier",
68
+ "HFLiltTokenClassifier",
69
+ "HFLiltSequenceClassifier",
70
+ "HFLmTokenClassifier",
71
+ "HFLmSequenceClassifier",
72
+ "HFLmLanguageDetector",
73
+ "ModelProfile",
74
+ "ModelCatalog",
75
+ "print_model_infos",
76
+ "ModelDownloadManager",
77
+ "PdfPlumberTextDetector",
78
+ "Pdfmium2TextDetector",
79
+ "TesseractOcrDetector",
80
+ "TesseractRotationTransformer",
81
+ "TextractOcrDetector",
82
+ ],
83
+ "pipe": [
84
+ "DatapointManager",
85
+ "PipelineComponent",
86
+ "PredictorPipelineComponent",
87
+ "LanguageModelPipelineComponent",
88
+ "ImageTransformPipelineComponent",
89
+ "Pipeline",
90
+ "DetectResultGenerator",
91
+ "SubImageLayoutService",
92
+ "ImageCroppingService",
93
+ "IntersectionMatcher",
94
+ "NeighbourMatcher",
95
+ "FamilyCompound",
96
+ "MatchingService",
97
+ "PageParsingService",
98
+ "AnnotationNmsService",
99
+ "MultiThreadPipelineComponent",
100
+ "DoctectionPipe",
101
+ "LanguageDetectionService",
102
+ "skip_if_category_or_service_extracted",
103
+ "ImageLayoutService",
104
+ "LMTokenClassifierService",
105
+ "LMSequenceClassifierService",
106
+ "OrderGenerator",
107
+ "TextLineGenerator",
108
+ "TextLineService",
109
+ "TextOrderService",
110
+ "TableSegmentationRefinementService",
111
+ "generate_html_string",
112
+ "pipeline_component_registry",
113
+ "TableSegmentationService",
114
+ "PubtablesSegmentationService",
115
+ "SegmentationResult",
116
+ "TextExtractionService",
117
+ "SimpleTransformService",
118
+ ],
119
+ "train": [
120
+ "D2Trainer",
121
+ "train_d2_faster_rcnn",
122
+ "LayoutLMTrainer",
123
+ "train_hf_layoutlm",
124
+ "DetrDerivedTrainer",
125
+ "train_hf_detr",
126
+ ],
127
+ }
128
+
129
+
130
+ # Setting some environment variables so that standard functions can be invoked with available hardware
131
+ env_info = collect_env_info()
132
+ logger.debug(LoggingRecord(msg=env_info))
133
+
134
+ # Build extra objects for the lazy module, starting with the version
135
+ _extra_objects: Dict[str, object] = {"__version__": __version__}
136
+
137
+ # Re-export all public attributes from dd_core under deepdoctection namespace
138
+ import dd_core # pylint: disable=C0413
139
+
140
+ for _name in dir(dd_core):
141
+ if _name.startswith("_"):
142
+ continue
143
+ # Optional: if dd_core defines __all__, you could respect it instead:
144
+ # if hasattr(dd_core, "__all__") and _name not in dd_core.__all__:
145
+ # continue
146
+ _extra_objects[_name] = getattr(dd_core, _name)
147
+
148
+ try:
149
+ import dd_datasets
150
+
151
+ for _name in dir(dd_datasets):
152
+ if _name.startswith("_"):
153
+ continue
154
+ _extra_objects[_name] = getattr(dd_datasets, _name)
155
+ except ImportError:
156
+ pass
157
+
158
+ # Direct imports for type-checking
159
+ if TYPE_CHECKING:
160
+ from dd_core import *
161
+ from dd_datasets import *
162
+
163
+ from .analyzer import *
164
+ from .eval import *
165
+ from .extern import * # type: ignore
166
+ from .pipe import *
167
+ from .train import *
168
+
169
+ else:
170
+ sys.modules[__name__] = _LazyModule(
171
+ __name__,
172
+ globals()["__file__"],
173
+ _IMPORT_STRUCTURE,
174
+ module_spec=globals().get("__spec__"),
175
+ extra_objects=_extra_objects,
176
+ )
@@ -16,8 +16,9 @@
16
16
  # limitations under the License.
17
17
 
18
18
  """
19
- Package for pre-built pipelines
19
+ # Configs, Factory functions and Pre-built pipelines
20
20
  """
21
21
 
22
+ from .config import *
22
23
  from .dd import *
23
24
  from .factory import *