deepdoctection 0.46.2__tar.gz → 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (164) hide show
  1. deepdoctection-1.0.0/PKG-INFO +133 -0
  2. deepdoctection-1.0.0/README.md +75 -0
  3. deepdoctection-1.0.0/pyproject.toml +108 -0
  4. deepdoctection-1.0.0/setup.cfg +4 -0
  5. deepdoctection-1.0.0/src/deepdoctection/__init__.py +176 -0
  6. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/analyzer/config.py +70 -131
  7. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/analyzer/dd.py +17 -31
  8. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/analyzer/factory.py +173 -89
  9. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/configs/conf_dd_one.yaml +37 -52
  10. deepdoctection-1.0.0/src/deepdoctection/configs/profiles.jsonl +25 -0
  11. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/eval/__init__.py +5 -1
  12. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/eval/accmetric.py +16 -10
  13. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/eval/base.py +10 -6
  14. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/eval/cocometric.py +13 -9
  15. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/eval/eval.py +24 -21
  16. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/eval/tedsmetric.py +16 -15
  17. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/__init__.py +0 -2
  18. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/base.py +13 -15
  19. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/d2detect.py +124 -12
  20. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/deskew.py +5 -4
  21. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/doctrocr.py +49 -146
  22. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/hfdetr.py +7 -6
  23. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/hflayoutlm.py +24 -163
  24. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/hflm.py +22 -72
  25. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/model.py +44 -52
  26. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/pdftext.py +6 -5
  27. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/tessocr.py +14 -9
  28. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/extern/texocr.py +15 -5
  29. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/anngen.py +22 -8
  30. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/base.py +8 -7
  31. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/common.py +11 -15
  32. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/concurrency.py +18 -15
  33. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/doctectionpipe.py +30 -28
  34. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/language.py +6 -5
  35. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/layout.py +8 -7
  36. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/lm.py +17 -27
  37. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/order.py +11 -7
  38. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/refine.py +30 -22
  39. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/segment.py +106 -55
  40. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/sub_layout.py +8 -7
  41. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/text.py +9 -8
  42. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/transform.py +2 -1
  43. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/train/__init__.py +1 -4
  44. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/train/d2_frcnn_train.py +19 -15
  45. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/train/hf_detr_train.py +12 -11
  46. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/train/hf_layoutlm_train.py +80 -109
  47. deepdoctection-1.0.0/src/deepdoctection.egg-info/PKG-INFO +133 -0
  48. deepdoctection-1.0.0/src/deepdoctection.egg-info/SOURCES.txt +56 -0
  49. deepdoctection-1.0.0/src/deepdoctection.egg-info/requires.txt +43 -0
  50. deepdoctection-0.46.2/LICENSE +0 -201
  51. deepdoctection-0.46.2/PKG-INFO +0 -418
  52. deepdoctection-0.46.2/README.md +0 -282
  53. deepdoctection-0.46.2/deepdoctection/__init__.py +0 -465
  54. deepdoctection-0.46.2/deepdoctection/configs/profiles.jsonl +0 -34
  55. deepdoctection-0.46.2/deepdoctection/dataflow/__init__.py +0 -25
  56. deepdoctection-0.46.2/deepdoctection/dataflow/base.py +0 -163
  57. deepdoctection-0.46.2/deepdoctection/dataflow/common.py +0 -337
  58. deepdoctection-0.46.2/deepdoctection/dataflow/custom.py +0 -203
  59. deepdoctection-0.46.2/deepdoctection/dataflow/custom_serialize.py +0 -674
  60. deepdoctection-0.46.2/deepdoctection/dataflow/parallel_map.py +0 -452
  61. deepdoctection-0.46.2/deepdoctection/dataflow/serialize.py +0 -153
  62. deepdoctection-0.46.2/deepdoctection/dataflow/stats.py +0 -284
  63. deepdoctection-0.46.2/deepdoctection/datapoint/__init__.py +0 -38
  64. deepdoctection-0.46.2/deepdoctection/datapoint/annotation.py +0 -569
  65. deepdoctection-0.46.2/deepdoctection/datapoint/box.py +0 -810
  66. deepdoctection-0.46.2/deepdoctection/datapoint/convert.py +0 -213
  67. deepdoctection-0.46.2/deepdoctection/datapoint/image.py +0 -900
  68. deepdoctection-0.46.2/deepdoctection/datapoint/view.py +0 -1582
  69. deepdoctection-0.46.2/deepdoctection/datasets/__init__.py +0 -31
  70. deepdoctection-0.46.2/deepdoctection/datasets/adapter.py +0 -177
  71. deepdoctection-0.46.2/deepdoctection/datasets/base.py +0 -709
  72. deepdoctection-0.46.2/deepdoctection/datasets/dataflow_builder.py +0 -126
  73. deepdoctection-0.46.2/deepdoctection/datasets/info.py +0 -465
  74. deepdoctection-0.46.2/deepdoctection/datasets/instances/__init__.py +0 -59
  75. deepdoctection-0.46.2/deepdoctection/datasets/instances/doclaynet.py +0 -301
  76. deepdoctection-0.46.2/deepdoctection/datasets/instances/fintabnet.py +0 -288
  77. deepdoctection-0.46.2/deepdoctection/datasets/instances/funsd.py +0 -205
  78. deepdoctection-0.46.2/deepdoctection/datasets/instances/iiitar13k.py +0 -196
  79. deepdoctection-0.46.2/deepdoctection/datasets/instances/layouttest.py +0 -134
  80. deepdoctection-0.46.2/deepdoctection/datasets/instances/publaynet.py +0 -151
  81. deepdoctection-0.46.2/deepdoctection/datasets/instances/pubtables1m.py +0 -334
  82. deepdoctection-0.46.2/deepdoctection/datasets/instances/pubtabnet.py +0 -212
  83. deepdoctection-0.46.2/deepdoctection/datasets/instances/rvlcdip.py +0 -182
  84. deepdoctection-0.46.2/deepdoctection/datasets/instances/xfund.py +0 -243
  85. deepdoctection-0.46.2/deepdoctection/datasets/instances/xsl/__init__.py +0 -16
  86. deepdoctection-0.46.2/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -45
  87. deepdoctection-0.46.2/deepdoctection/datasets/registry.py +0 -108
  88. deepdoctection-0.46.2/deepdoctection/datasets/save.py +0 -96
  89. deepdoctection-0.46.2/deepdoctection/eval/tp_eval_callback.py +0 -137
  90. deepdoctection-0.46.2/deepdoctection/extern/fastlang.py +0 -132
  91. deepdoctection-0.46.2/deepdoctection/extern/pt/__init__.py +0 -23
  92. deepdoctection-0.46.2/deepdoctection/extern/pt/nms.py +0 -54
  93. deepdoctection-0.46.2/deepdoctection/extern/pt/ptutils.py +0 -63
  94. deepdoctection-0.46.2/deepdoctection/extern/tp/__init__.py +0 -20
  95. deepdoctection-0.46.2/deepdoctection/extern/tp/tfutils.py +0 -133
  96. deepdoctection-0.46.2/deepdoctection/extern/tp/tpcompat.py +0 -176
  97. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -20
  98. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/common.py +0 -128
  99. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -20
  100. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -335
  101. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -20
  102. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -290
  103. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -362
  104. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -221
  105. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -153
  106. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -302
  107. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -491
  108. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -133
  109. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -218
  110. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -131
  111. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -303
  112. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -20
  113. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -79
  114. deepdoctection-0.46.2/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -102
  115. deepdoctection-0.46.2/deepdoctection/extern/tpdetect.py +0 -191
  116. deepdoctection-0.46.2/deepdoctection/mapper/__init__.py +0 -38
  117. deepdoctection-0.46.2/deepdoctection/mapper/cats.py +0 -424
  118. deepdoctection-0.46.2/deepdoctection/mapper/cocostruct.py +0 -168
  119. deepdoctection-0.46.2/deepdoctection/mapper/d2struct.py +0 -302
  120. deepdoctection-0.46.2/deepdoctection/mapper/hfstruct.py +0 -166
  121. deepdoctection-0.46.2/deepdoctection/mapper/laylmstruct.py +0 -902
  122. deepdoctection-0.46.2/deepdoctection/mapper/maputils.py +0 -274
  123. deepdoctection-0.46.2/deepdoctection/mapper/match.py +0 -211
  124. deepdoctection-0.46.2/deepdoctection/mapper/misc.py +0 -239
  125. deepdoctection-0.46.2/deepdoctection/mapper/pascalstruct.py +0 -98
  126. deepdoctection-0.46.2/deepdoctection/mapper/prodigystruct.py +0 -202
  127. deepdoctection-0.46.2/deepdoctection/mapper/pubstruct.py +0 -540
  128. deepdoctection-0.46.2/deepdoctection/mapper/tpstruct.py +0 -161
  129. deepdoctection-0.46.2/deepdoctection/mapper/xfundstruct.py +0 -207
  130. deepdoctection-0.46.2/deepdoctection/train/tp_frcnn_train.py +0 -360
  131. deepdoctection-0.46.2/deepdoctection/utils/__init__.py +0 -71
  132. deepdoctection-0.46.2/deepdoctection/utils/concurrency.py +0 -187
  133. deepdoctection-0.46.2/deepdoctection/utils/context.py +0 -154
  134. deepdoctection-0.46.2/deepdoctection/utils/develop.py +0 -114
  135. deepdoctection-0.46.2/deepdoctection/utils/env_info.py +0 -613
  136. deepdoctection-0.46.2/deepdoctection/utils/error.py +0 -90
  137. deepdoctection-0.46.2/deepdoctection/utils/file_utils.py +0 -970
  138. deepdoctection-0.46.2/deepdoctection/utils/fs.py +0 -441
  139. deepdoctection-0.46.2/deepdoctection/utils/identifier.py +0 -96
  140. deepdoctection-0.46.2/deepdoctection/utils/logger.py +0 -342
  141. deepdoctection-0.46.2/deepdoctection/utils/metacfg.py +0 -245
  142. deepdoctection-0.46.2/deepdoctection/utils/mocks.py +0 -93
  143. deepdoctection-0.46.2/deepdoctection/utils/pdf_utils.py +0 -428
  144. deepdoctection-0.46.2/deepdoctection/utils/settings.py +0 -465
  145. deepdoctection-0.46.2/deepdoctection/utils/tqdm.py +0 -66
  146. deepdoctection-0.46.2/deepdoctection/utils/transform.py +0 -520
  147. deepdoctection-0.46.2/deepdoctection/utils/types.py +0 -103
  148. deepdoctection-0.46.2/deepdoctection/utils/utils.py +0 -229
  149. deepdoctection-0.46.2/deepdoctection/utils/viz.py +0 -734
  150. deepdoctection-0.46.2/deepdoctection.egg-info/PKG-INFO +0 -418
  151. deepdoctection-0.46.2/deepdoctection.egg-info/SOURCES.txt +0 -154
  152. deepdoctection-0.46.2/deepdoctection.egg-info/requires.txt +0 -110
  153. deepdoctection-0.46.2/setup.cfg +0 -123
  154. deepdoctection-0.46.2/setup.py +0 -246
  155. deepdoctection-0.46.2/tests/test_utils.py +0 -90
  156. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/analyzer/__init__.py +0 -0
  157. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/configs/__init__.py +0 -0
  158. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  159. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/eval/registry.py +0 -0
  160. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/__init__.py +0 -0
  161. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/pipe/registry.py +0 -0
  162. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection/py.typed +0 -0
  163. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection.egg-info/dependency_links.txt +0 -0
  164. {deepdoctection-0.46.2 → deepdoctection-1.0.0/src}/deepdoctection.egg-info/top_level.txt +0 -0
@@ -0,0 +1,133 @@
1
+ Metadata-Version: 2.4
2
+ Name: deepdoctection
3
+ Version: 1.0.0
4
+ Summary: Repository for Document AI - server/inference core package
5
+ Author: Dr. Janis Meyer
6
+ License: Apache License 2.0
7
+ Project-URL: Homepage, https://github.com/deepdoctection/deepdoctection
8
+ Project-URL: Documentation, https://deepdoctection.readthedocs.io
9
+ Project-URL: Repository, https://github.com/deepdoctection/deepdoctection
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: License :: OSI Approved :: Apache Software License
12
+ Classifier: Natural Language :: English
13
+ Classifier: Operating System :: POSIX :: Linux
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ Requires-Dist: dd-core[full]>=1.0
21
+ Requires-Dist: huggingface_hub>=0.26.0
22
+ Provides-Extra: full
23
+ Requires-Dist: dd-datasets[full]>=1.0; extra == "full"
24
+ Requires-Dist: boto3==1.34.102; extra == "full"
25
+ Requires-Dist: pdfplumber>=0.11.0; extra == "full"
26
+ Requires-Dist: jdeskew>=0.2.2; extra == "full"
27
+ Requires-Dist: networkx>=2.7.1; extra == "full"
28
+ Requires-Dist: apted==1.0.3; extra == "full"
29
+ Requires-Dist: distance==0.1.3; extra == "full"
30
+ Requires-Dist: lxml>=4.9.1; extra == "full"
31
+ Requires-Dist: pycocotools>=2.0.2; extra == "full"
32
+ Requires-Dist: timm>=0.9.16; extra == "full"
33
+ Requires-Dist: transformers<5.0.0,>=4.48.0; extra == "full"
34
+ Requires-Dist: accelerate>=0.29.1; extra == "full"
35
+ Requires-Dist: python-doctr>=1.0.0; extra == "full"
36
+ Provides-Extra: types
37
+ Requires-Dist: dd_core[types]; extra == "types"
38
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "types"
39
+ Provides-Extra: dev
40
+ Requires-Dist: black==25.11.0; extra == "dev"
41
+ Requires-Dist: isort==7.0.0; extra == "dev"
42
+ Requires-Dist: pylint==4.0.2; extra == "dev"
43
+ Requires-Dist: mypy==1.4.1; extra == "dev"
44
+ Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
45
+ Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
46
+ Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
47
+ Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
48
+ Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
49
+ Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
50
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
51
+ Provides-Extra: test
52
+ Requires-Dist: pytest==9.0.1; extra == "test"
53
+ Requires-Dist: pytest-cov; extra == "test"
54
+ Provides-Extra: docs
55
+ Requires-Dist: mkdocs-material==9.7.0; extra == "docs"
56
+ Requires-Dist: mkdocstrings-python==1.19.0; extra == "docs"
57
+ Requires-Dist: griffe==0.25.0; extra == "docs"
58
+
59
+ <p align="center">
60
+ <img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/_imgs/dd_logo.png" alt="Deep Doctection Logo" width="60%">
61
+ </p>
62
+
63
+
64
+ # deepdoctection
65
+
66
+ **deepdoctection** is the main package for running and training models. It provides the
67
+ pipeline framework, model wrappers, built-in pipelines, training scripts and evaluation methods.
68
+
69
+ The base package only installs the necessary dependencies for running inference with some selected models.
70
+ For training, evaluating as well as running all available models, the full package needs to be installed.
71
+
72
+ ## Overview
73
+
74
+ - **analyzer**: Configuration and factory functions for creating document analysis pipelines and the built-in analyzer.
75
+ - **configs**: YAML configuration for pipelines and model profiles for the model catalogue.
76
+ - **extern**: External model wrappers (Detectron2, DocTr, HuggingFace Transformers, Tesseract, PdfPlumber, etc.)
77
+ - **pipe**: Pipeline components and services.
78
+ - **eval**: Evaluation metrics and Evaluator.
79
+ - **train**: Training utilities and training scripts for Detectron2 and selected Transformer models.
80
+
81
+
82
+ ## Installation
83
+
84
+ ### Basic Installation
85
+
86
+ For inference use cases, install the base package:
87
+
88
+ ```bash
89
+ uv pip install deepdoctection
90
+ ```
91
+
92
+ **Important**: Various dependencies must be installed separately:
93
+
94
+ - **PyTorch**: Follow the instructions at https://pytorch.org/get-started/locally/ according to your OS and hardware.
95
+ - **Transformers**: `pip install "transformers>=4.48.0"` (if using HF models)
96
+ - **Timm**: `pip install "timm>=0.9.16"` (necessary if using some dedicated HF models)
97
+ - **DocTr**: `pip install "python-doctr>=1.0.0"` (if using DocTr models)
98
+ - **Detectron2**: Follow instructions at https://detectron2.readthedocs.io/en/latest/tutorials/install.html
99
+ - **PDFPlumber**: `pip install "pdfplumber>=0.11.0"`
100
+ - **JDeskew**: `pip install "jdeskew>=0.2.2"`
101
+ - **Boto3**: `pip install boto3==1.34.102`
102
+
103
+ For running evaluation with various metrics, you can also install and then use:
104
+
105
+ - **APTED**: `pip install apted==1.0.3`
106
+ - **Distance**: `pip install distance==0.1.3`
107
+ - **Pycocotools**: `pip install "pycocotools>=2.0.2"`
108
+
109
+ Image processing is supported by PIL or OpenCV. PIL is used by default and will always be installed. If
110
+ you prefer to use OpenCV, you can install it:
111
+
112
+ - **OpenCV**: `pip install opencv-python==4.8.0.76`
113
+
114
+
115
+ ### Full Installation (Training & Evaluation)
116
+
117
+ For one large install with all dependencies (except PyTorch), run:
118
+
119
+ ```bash
120
+ uv pip install deepdoctection[full]
121
+ ```
122
+
123
+ ### Development Installation
124
+
125
+ For development purposes, clone the repository and install it in editable mode.
126
+
127
+ ## License
128
+
129
+ Apache License 2.0
130
+
131
+ ## Author
132
+
133
+ Dr. Janis Meyer
@@ -0,0 +1,75 @@
1
+ <p align="center">
2
+ <img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/_imgs/dd_logo.png" alt="Deep Doctection Logo" width="60%">
3
+ </p>
4
+
5
+
6
+ # deepdoctection
7
+
8
+ **deepdoctection** is the main package for running and training models. It provides the
9
+ pipeline framework, model wrappers, built-in pipelines, training scripts and evaluation methods.
10
+
11
+ The base package only installs the necessary dependencies for running inference with some selected models.
12
+ For training, evaluating as well as running all available models, the full package needs to be installed.
13
+
14
+ ## Overview
15
+
16
+ - **analyzer**: Configuration and factory functions for creating document analysis pipelines and the built-in analyzer.
17
+ - **configs**: YAML configuration for pipelines and model profiles for the model catalogue.
18
+ - **extern**: External model wrappers (Detectron2, DocTr, HuggingFace Transformers, Tesseract, PdfPlumber, etc.)
19
+ - **pipe**: Pipeline components and services.
20
+ - **eval**: Evaluation metrics and Evaluator.
21
+ - **train**: Training utilities and training scripts for Detectron2 and selected Transformer models.
22
+
23
+
24
+ ## Installation
25
+
26
+ ### Basic Installation
27
+
28
+ For inference use cases, install the base package:
29
+
30
+ ```bash
31
+ uv pip install deepdoctection
32
+ ```
33
+
34
+ **Important**: Various dependencies must be installed separately:
35
+
36
+ - **PyTorch**: Follow the instructions at https://pytorch.org/get-started/locally/ according to your OS and hardware.
37
+ - **Transformers**: `pip install "transformers>=4.48.0"` (if using HF models)
38
+ - **Timm**: `pip install "timm>=0.9.16"` (necessary if using some dedicated HF models)
39
+ - **DocTr**: `pip install "python-doctr>=1.0.0"` (if using DocTr models)
40
+ - **Detectron2**: Follow instructions at https://detectron2.readthedocs.io/en/latest/tutorials/install.html
41
+ - **PDFPlumber**: `pip install "pdfplumber>=0.11.0"`
42
+ - **JDeskew**: `pip install "jdeskew>=0.2.2"`
43
+ - **Boto3**: `pip install boto3==1.34.102`
44
+
45
+ For running evaluation with various metrics, you can also install and then use:
46
+
47
+ - **APTED**: `pip install apted==1.0.3`
48
+ - **Distance**: `pip install distance==0.1.3`
49
+ - **Pycocotools**: `pip install "pycocotools>=2.0.2"`
50
+
51
+ Image processing is supported by PIL or OpenCV. PIL is used by default and will always be installed. If
52
+ you prefer to use OpenCV, you can install it:
53
+
54
+ - **OpenCV**: `pip install opencv-python==4.8.0.76`
55
+
56
+
57
+ ### Full Installation (Training & Evaluation)
58
+
59
+ For one large install with all dependencies (except PyTorch), run:
60
+
61
+ ```bash
62
+ uv pip install deepdoctection[full]
63
+ ```
64
+
65
+ ### Development Installation
66
+
67
+ For development purposes, clone the repository and install it in editable mode.
68
+
69
+ ## License
70
+
71
+ Apache License 2.0
72
+
73
+ ## Author
74
+
75
+ Dr. Janis Meyer
@@ -0,0 +1,108 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "deepdoctection"
7
+ version = "1.0.0"
8
+ authors = [
9
+ {name = "Dr. Janis Meyer"}
10
+ ]
11
+ description = "Repository for Document AI - server/inference core package"
12
+ readme = "README.md"
13
+ license = {text = "Apache License 2.0"}
14
+ requires-python = ">=3.10"
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "License :: OSI Approved :: Apache Software License",
18
+ "Natural Language :: English",
19
+ "Operating System :: POSIX :: Linux",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
24
+ ]
25
+
26
+ dependencies = [
27
+ "dd-core[full]>=1.0",
28
+ "huggingface_hub>=0.26.0",
29
+ ]
30
+
31
+ [project.optional-dependencies]
32
+ full = [
33
+ "dd-datasets[full]>=1.0",
34
+ # Additional dependencies/ Non DL predictors
35
+ "boto3==1.34.102",
36
+ "pdfplumber>=0.11.0",
37
+ "jdeskew>=0.2.2",
38
+ "networkx>=2.7.1",
39
+ # Some libs for evals
40
+ "apted==1.0.3",
41
+ "distance==0.1.3",
42
+ "lxml>=4.9.1",
43
+ "pycocotools>=2.0.2",
44
+ # DL dependencies
45
+ "timm>=0.9.16",
46
+ "transformers>=4.48.0,<5.0.0",
47
+ "accelerate>=0.29.1",
48
+ "python-doctr>=1.0.0",
49
+ ]
50
+
51
+ types = [
52
+ "dd_core[types]",
53
+ "lxml-stubs>=0.5.1",
54
+ ]
55
+
56
+ dev = [
57
+ "black==25.11.0",
58
+ "isort==7.0.0",
59
+ "pylint==4.0.2",
60
+ "mypy==1.4.1",
61
+ "types-PyYAML>=6.0.12.12",
62
+ "types-termcolor>=1.1.3",
63
+ "types-tabulate>=0.9.0.3",
64
+ "types-tqdm>=4.66.0.5",
65
+ "types-Pillow>=10.2.0.20240406",
66
+ "types-urllib3>=1.26.25.14",
67
+ "lxml-stubs>=0.5.1",
68
+ ]
69
+
70
+ test = [
71
+ "pytest==9.0.1",
72
+ "pytest-cov",
73
+ ]
74
+
75
+ docs = [
76
+ "mkdocs-material==9.7.0",
77
+ "mkdocstrings-python==1.19.0",
78
+ "griffe==0.25.0"
79
+ ]
80
+
81
+ [project.urls]
82
+ Homepage = "https://github.com/deepdoctection/deepdoctection"
83
+ Documentation = "https://deepdoctection.readthedocs.io"
84
+ Repository = "https://github.com/deepdoctection/deepdoctection"
85
+
86
+ [tool.setuptools]
87
+ package-dir = {"" = "src"}
88
+
89
+ [tool.setuptools.packages.find]
90
+ where = ["src"]
91
+
92
+ [tool.setuptools.package-data]
93
+ deepdoctection = ["py.typed", "configs/*.yaml", "configs/*.jsonl"]
94
+
95
+ [tool.black]
96
+ line-length = 120
97
+ target-version = ['py310']
98
+
99
+ [tool.isort]
100
+ profile = "black"
101
+ line_length = 120
102
+
103
+ [tool.mypy]
104
+ python_version = "3.10"
105
+ warn_return_any = true
106
+ warn_unused_configs = true
107
+ ignore_missing_imports = true
108
+
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,176 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: __init__.py
3
+
4
+ """
5
+ Init file for deepdoctection package. This file is used to import all submodules and to set some environment variables
6
+ """
7
+
8
+ import sys
9
+ from typing import TYPE_CHECKING, Dict, List
10
+
11
+ from dd_core.utils.env_info import collect_env_info
12
+ from dd_core.utils.file_utils import _LazyModule
13
+ from dd_core.utils.logger import LoggingRecord, logger
14
+
15
+ __version__ = "1.0.0"
16
+ _IMPORT_STRUCTURE = {
17
+ "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory", "update_cfg_from_defaults"],
18
+ "eval": [
19
+ "AccuracyMetric",
20
+ "ConfusionMetric",
21
+ "PrecisionMetric",
22
+ "RecallMetric",
23
+ "F1Metric",
24
+ "PrecisionMetricMicro",
25
+ "RecallMetricMicro",
26
+ "F1MetricMicro",
27
+ "MetricBase",
28
+ "CocoMetric",
29
+ "Evaluator",
30
+ "metric_registry",
31
+ "get_metric",
32
+ "TableTree",
33
+ "CustomConfig",
34
+ "TEDS",
35
+ "TedsMetric",
36
+ ],
37
+ "extern": [
38
+ "ModelCategories",
39
+ "NerModelCategories",
40
+ "PredictorBase",
41
+ "DetectionResult",
42
+ "ObjectDetector",
43
+ "PdfMiner",
44
+ "TextRecognizer",
45
+ "TokenClassResult",
46
+ "SequenceClassResult",
47
+ "LMTokenClassifier",
48
+ "LMSequenceClassifier",
49
+ "LanguageDetector",
50
+ "ImageTransformer",
51
+ "DeterministicImageTransformer",
52
+ "InferenceResize",
53
+ "D2FrcnnDetector",
54
+ "D2FrcnnTracingDetector",
55
+ "Jdeskewer",
56
+ "DoctrTextlineDetector",
57
+ "DoctrTextRecognizer",
58
+ "DocTrRotationTransformer",
59
+ "HFDetrDerivedDetector",
60
+ "get_tokenizer_from_architecture",
61
+ "HFLayoutLmTokenClassifierBase",
62
+ "HFLayoutLmTokenClassifier",
63
+ "HFLayoutLmv2TokenClassifier",
64
+ "HFLayoutLmv3TokenClassifier",
65
+ "HFLayoutLmSequenceClassifier",
66
+ "HFLayoutLmv2SequenceClassifier",
67
+ "HFLayoutLmv3SequenceClassifier",
68
+ "HFLiltTokenClassifier",
69
+ "HFLiltSequenceClassifier",
70
+ "HFLmTokenClassifier",
71
+ "HFLmSequenceClassifier",
72
+ "HFLmLanguageDetector",
73
+ "ModelProfile",
74
+ "ModelCatalog",
75
+ "print_model_infos",
76
+ "ModelDownloadManager",
77
+ "PdfPlumberTextDetector",
78
+ "Pdfmium2TextDetector",
79
+ "TesseractOcrDetector",
80
+ "TesseractRotationTransformer",
81
+ "TextractOcrDetector",
82
+ ],
83
+ "pipe": [
84
+ "DatapointManager",
85
+ "PipelineComponent",
86
+ "PredictorPipelineComponent",
87
+ "LanguageModelPipelineComponent",
88
+ "ImageTransformPipelineComponent",
89
+ "Pipeline",
90
+ "DetectResultGenerator",
91
+ "SubImageLayoutService",
92
+ "ImageCroppingService",
93
+ "IntersectionMatcher",
94
+ "NeighbourMatcher",
95
+ "FamilyCompound",
96
+ "MatchingService",
97
+ "PageParsingService",
98
+ "AnnotationNmsService",
99
+ "MultiThreadPipelineComponent",
100
+ "DoctectionPipe",
101
+ "LanguageDetectionService",
102
+ "skip_if_category_or_service_extracted",
103
+ "ImageLayoutService",
104
+ "LMTokenClassifierService",
105
+ "LMSequenceClassifierService",
106
+ "OrderGenerator",
107
+ "TextLineGenerator",
108
+ "TextLineService",
109
+ "TextOrderService",
110
+ "TableSegmentationRefinementService",
111
+ "generate_html_string",
112
+ "pipeline_component_registry",
113
+ "TableSegmentationService",
114
+ "PubtablesSegmentationService",
115
+ "SegmentationResult",
116
+ "TextExtractionService",
117
+ "SimpleTransformService",
118
+ ],
119
+ "train": [
120
+ "D2Trainer",
121
+ "train_d2_faster_rcnn",
122
+ "LayoutLMTrainer",
123
+ "train_hf_layoutlm",
124
+ "DetrDerivedTrainer",
125
+ "train_hf_detr",
126
+ ],
127
+ }
128
+
129
+
130
+ # Setting some environment variables so that standard functions can be invoked with available hardware
131
+ env_info = collect_env_info()
132
+ logger.debug(LoggingRecord(msg=env_info))
133
+
134
+ # Build extra objects for the lazy module, starting with the version
135
+ _extra_objects: Dict[str, object] = {"__version__": __version__}
136
+
137
+ # Re-export all public attributes from dd_core under deepdoctection namespace
138
+ import dd_core # pylint: disable=C0413
139
+
140
+ for _name in dir(dd_core):
141
+ if _name.startswith("_"):
142
+ continue
143
+ # Optional: if dd_core defines __all__, you could respect it instead:
144
+ # if hasattr(dd_core, "__all__") and _name not in dd_core.__all__:
145
+ # continue
146
+ _extra_objects[_name] = getattr(dd_core, _name)
147
+
148
+ try:
149
+ import dd_datasets
150
+
151
+ for _name in dir(dd_datasets):
152
+ if _name.startswith("_"):
153
+ continue
154
+ _extra_objects[_name] = getattr(dd_datasets, _name)
155
+ except ImportError:
156
+ pass
157
+
158
+ # Direct imports for type-checking
159
+ if TYPE_CHECKING:
160
+ from dd_core import *
161
+ from dd_datasets import *
162
+
163
+ from .analyzer import *
164
+ from .eval import *
165
+ from .extern import * # type: ignore
166
+ from .pipe import *
167
+ from .train import *
168
+
169
+ else:
170
+ sys.modules[__name__] = _LazyModule(
171
+ __name__,
172
+ globals()["__file__"],
173
+ _IMPORT_STRUCTURE,
174
+ module_spec=globals().get("__spec__"),
175
+ extra_objects=_extra_objects,
176
+ )