deepdoctection 0.42.1__py3-none-any.whl → 0.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (124) hide show
  1. deepdoctection/__init__.py +2 -1
  2. deepdoctection/analyzer/__init__.py +2 -1
  3. deepdoctection/analyzer/config.py +904 -0
  4. deepdoctection/analyzer/dd.py +36 -62
  5. deepdoctection/analyzer/factory.py +311 -141
  6. deepdoctection/configs/conf_dd_one.yaml +100 -44
  7. deepdoctection/configs/profiles.jsonl +32 -0
  8. deepdoctection/dataflow/__init__.py +9 -6
  9. deepdoctection/dataflow/base.py +33 -15
  10. deepdoctection/dataflow/common.py +96 -75
  11. deepdoctection/dataflow/custom.py +36 -29
  12. deepdoctection/dataflow/custom_serialize.py +135 -91
  13. deepdoctection/dataflow/parallel_map.py +33 -31
  14. deepdoctection/dataflow/serialize.py +15 -10
  15. deepdoctection/dataflow/stats.py +41 -28
  16. deepdoctection/datapoint/__init__.py +4 -6
  17. deepdoctection/datapoint/annotation.py +104 -66
  18. deepdoctection/datapoint/box.py +190 -130
  19. deepdoctection/datapoint/convert.py +66 -39
  20. deepdoctection/datapoint/image.py +151 -95
  21. deepdoctection/datapoint/view.py +383 -236
  22. deepdoctection/datasets/__init__.py +2 -6
  23. deepdoctection/datasets/adapter.py +11 -11
  24. deepdoctection/datasets/base.py +118 -81
  25. deepdoctection/datasets/dataflow_builder.py +18 -12
  26. deepdoctection/datasets/info.py +76 -57
  27. deepdoctection/datasets/instances/__init__.py +6 -2
  28. deepdoctection/datasets/instances/doclaynet.py +17 -14
  29. deepdoctection/datasets/instances/fintabnet.py +16 -22
  30. deepdoctection/datasets/instances/funsd.py +11 -6
  31. deepdoctection/datasets/instances/iiitar13k.py +9 -9
  32. deepdoctection/datasets/instances/layouttest.py +9 -9
  33. deepdoctection/datasets/instances/publaynet.py +9 -9
  34. deepdoctection/datasets/instances/pubtables1m.py +13 -13
  35. deepdoctection/datasets/instances/pubtabnet.py +13 -15
  36. deepdoctection/datasets/instances/rvlcdip.py +8 -8
  37. deepdoctection/datasets/instances/xfund.py +11 -9
  38. deepdoctection/datasets/registry.py +18 -11
  39. deepdoctection/datasets/save.py +12 -11
  40. deepdoctection/eval/__init__.py +3 -2
  41. deepdoctection/eval/accmetric.py +72 -52
  42. deepdoctection/eval/base.py +29 -10
  43. deepdoctection/eval/cocometric.py +14 -12
  44. deepdoctection/eval/eval.py +56 -41
  45. deepdoctection/eval/registry.py +6 -3
  46. deepdoctection/eval/tedsmetric.py +24 -9
  47. deepdoctection/eval/tp_eval_callback.py +13 -12
  48. deepdoctection/extern/__init__.py +1 -1
  49. deepdoctection/extern/base.py +176 -97
  50. deepdoctection/extern/d2detect.py +127 -92
  51. deepdoctection/extern/deskew.py +19 -10
  52. deepdoctection/extern/doctrocr.py +157 -106
  53. deepdoctection/extern/fastlang.py +25 -17
  54. deepdoctection/extern/hfdetr.py +137 -60
  55. deepdoctection/extern/hflayoutlm.py +329 -248
  56. deepdoctection/extern/hflm.py +67 -33
  57. deepdoctection/extern/model.py +108 -762
  58. deepdoctection/extern/pdftext.py +37 -12
  59. deepdoctection/extern/pt/nms.py +15 -1
  60. deepdoctection/extern/pt/ptutils.py +13 -9
  61. deepdoctection/extern/tessocr.py +87 -54
  62. deepdoctection/extern/texocr.py +29 -14
  63. deepdoctection/extern/tp/tfutils.py +36 -8
  64. deepdoctection/extern/tp/tpcompat.py +54 -16
  65. deepdoctection/extern/tp/tpfrcnn/config/config.py +20 -4
  66. deepdoctection/extern/tpdetect.py +4 -2
  67. deepdoctection/mapper/__init__.py +1 -1
  68. deepdoctection/mapper/cats.py +117 -76
  69. deepdoctection/mapper/cocostruct.py +35 -17
  70. deepdoctection/mapper/d2struct.py +56 -29
  71. deepdoctection/mapper/hfstruct.py +32 -19
  72. deepdoctection/mapper/laylmstruct.py +221 -185
  73. deepdoctection/mapper/maputils.py +71 -35
  74. deepdoctection/mapper/match.py +76 -62
  75. deepdoctection/mapper/misc.py +68 -44
  76. deepdoctection/mapper/pascalstruct.py +13 -12
  77. deepdoctection/mapper/prodigystruct.py +33 -19
  78. deepdoctection/mapper/pubstruct.py +42 -32
  79. deepdoctection/mapper/tpstruct.py +39 -19
  80. deepdoctection/mapper/xfundstruct.py +20 -13
  81. deepdoctection/pipe/__init__.py +1 -2
  82. deepdoctection/pipe/anngen.py +104 -62
  83. deepdoctection/pipe/base.py +226 -107
  84. deepdoctection/pipe/common.py +206 -123
  85. deepdoctection/pipe/concurrency.py +74 -47
  86. deepdoctection/pipe/doctectionpipe.py +108 -47
  87. deepdoctection/pipe/language.py +41 -24
  88. deepdoctection/pipe/layout.py +45 -18
  89. deepdoctection/pipe/lm.py +146 -78
  90. deepdoctection/pipe/order.py +196 -113
  91. deepdoctection/pipe/refine.py +111 -63
  92. deepdoctection/pipe/registry.py +1 -1
  93. deepdoctection/pipe/segment.py +213 -142
  94. deepdoctection/pipe/sub_layout.py +76 -46
  95. deepdoctection/pipe/text.py +52 -33
  96. deepdoctection/pipe/transform.py +8 -6
  97. deepdoctection/train/d2_frcnn_train.py +87 -69
  98. deepdoctection/train/hf_detr_train.py +72 -40
  99. deepdoctection/train/hf_layoutlm_train.py +85 -46
  100. deepdoctection/train/tp_frcnn_train.py +56 -28
  101. deepdoctection/utils/concurrency.py +59 -16
  102. deepdoctection/utils/context.py +40 -19
  103. deepdoctection/utils/develop.py +25 -17
  104. deepdoctection/utils/env_info.py +85 -36
  105. deepdoctection/utils/error.py +16 -10
  106. deepdoctection/utils/file_utils.py +246 -62
  107. deepdoctection/utils/fs.py +162 -43
  108. deepdoctection/utils/identifier.py +29 -16
  109. deepdoctection/utils/logger.py +49 -32
  110. deepdoctection/utils/metacfg.py +83 -21
  111. deepdoctection/utils/pdf_utils.py +119 -62
  112. deepdoctection/utils/settings.py +24 -10
  113. deepdoctection/utils/tqdm.py +10 -5
  114. deepdoctection/utils/transform.py +182 -46
  115. deepdoctection/utils/utils.py +61 -28
  116. deepdoctection/utils/viz.py +150 -104
  117. deepdoctection-0.43.dist-info/METADATA +376 -0
  118. deepdoctection-0.43.dist-info/RECORD +149 -0
  119. deepdoctection/analyzer/_config.py +0 -146
  120. deepdoctection-0.42.1.dist-info/METADATA +0 -431
  121. deepdoctection-0.42.1.dist-info/RECORD +0 -148
  122. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/WHEEL +0 -0
  123. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/licenses/LICENSE +0 -0
  124. {deepdoctection-0.42.1.dist-info → deepdoctection-0.43.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,376 @@
1
+ Metadata-Version: 2.4
2
+ Name: deepdoctection
3
+ Version: 0.43
4
+ Summary: Repository for Document AI
5
+ Home-page: https://github.com/deepdoctection/deepdoctection
6
+ Author: Dr. Janis Meyer
7
+ License: Apache License 2.0
8
+ Classifier: Development Status :: 4 - Beta
9
+ Classifier: License :: OSI Approved :: Apache Software License
10
+ Classifier: Natural Language :: English
11
+ Classifier: Operating System :: POSIX :: Linux
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.9
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: catalogue==2.0.10
21
+ Requires-Dist: huggingface_hub>=0.26.0
22
+ Requires-Dist: importlib-metadata>=5.0.0
23
+ Requires-Dist: jsonlines==3.1.0
24
+ Requires-Dist: lazy-imports==0.3.1
25
+ Requires-Dist: mock==4.0.3
26
+ Requires-Dist: networkx>=2.7.1
27
+ Requires-Dist: numpy<2.0,>=1.21
28
+ Requires-Dist: packaging>=20.0
29
+ Requires-Dist: Pillow>=10.0.0
30
+ Requires-Dist: pypdf>=3.16.0
31
+ Requires-Dist: pypdfium2>=4.30.0
32
+ Requires-Dist: pyyaml>=6.0.1
33
+ Requires-Dist: pyzmq>=16
34
+ Requires-Dist: scipy>=1.13.1
35
+ Requires-Dist: termcolor>=1.1
36
+ Requires-Dist: tabulate>=0.7.7
37
+ Requires-Dist: tqdm>=4.64.0
38
+ Provides-Extra: tf
39
+ Requires-Dist: catalogue==2.0.10; extra == "tf"
40
+ Requires-Dist: huggingface_hub>=0.26.0; extra == "tf"
41
+ Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
42
+ Requires-Dist: jsonlines==3.1.0; extra == "tf"
43
+ Requires-Dist: lazy-imports==0.3.1; extra == "tf"
44
+ Requires-Dist: mock==4.0.3; extra == "tf"
45
+ Requires-Dist: networkx>=2.7.1; extra == "tf"
46
+ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
47
+ Requires-Dist: packaging>=20.0; extra == "tf"
48
+ Requires-Dist: Pillow>=10.0.0; extra == "tf"
49
+ Requires-Dist: pypdf>=3.16.0; extra == "tf"
50
+ Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
51
+ Requires-Dist: pyyaml>=6.0.1; extra == "tf"
52
+ Requires-Dist: pyzmq>=16; extra == "tf"
53
+ Requires-Dist: scipy>=1.13.1; extra == "tf"
54
+ Requires-Dist: termcolor>=1.1; extra == "tf"
55
+ Requires-Dist: tabulate>=0.7.7; extra == "tf"
56
+ Requires-Dist: tqdm>=4.64.0; extra == "tf"
57
+ Requires-Dist: tensorpack==0.11; extra == "tf"
58
+ Requires-Dist: protobuf==3.20.1; extra == "tf"
59
+ Requires-Dist: tensorflow-addons>=0.17.1; extra == "tf"
60
+ Requires-Dist: tf2onnx>=1.9.2; extra == "tf"
61
+ Requires-Dist: python-doctr==0.9.0; extra == "tf"
62
+ Requires-Dist: pycocotools>=2.0.2; extra == "tf"
63
+ Requires-Dist: boto3==1.34.102; extra == "tf"
64
+ Requires-Dist: pdfplumber>=0.11.0; extra == "tf"
65
+ Requires-Dist: fasttext-wheel; extra == "tf"
66
+ Requires-Dist: jdeskew>=0.2.2; extra == "tf"
67
+ Requires-Dist: apted==1.0.3; extra == "tf"
68
+ Requires-Dist: distance==0.1.3; extra == "tf"
69
+ Requires-Dist: lxml>=4.9.1; extra == "tf"
70
+ Provides-Extra: pt
71
+ Requires-Dist: catalogue==2.0.10; extra == "pt"
72
+ Requires-Dist: huggingface_hub>=0.26.0; extra == "pt"
73
+ Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
74
+ Requires-Dist: jsonlines==3.1.0; extra == "pt"
75
+ Requires-Dist: lazy-imports==0.3.1; extra == "pt"
76
+ Requires-Dist: mock==4.0.3; extra == "pt"
77
+ Requires-Dist: networkx>=2.7.1; extra == "pt"
78
+ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
79
+ Requires-Dist: packaging>=20.0; extra == "pt"
80
+ Requires-Dist: Pillow>=10.0.0; extra == "pt"
81
+ Requires-Dist: pypdf>=3.16.0; extra == "pt"
82
+ Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
83
+ Requires-Dist: pyyaml>=6.0.1; extra == "pt"
84
+ Requires-Dist: pyzmq>=16; extra == "pt"
85
+ Requires-Dist: scipy>=1.13.1; extra == "pt"
86
+ Requires-Dist: termcolor>=1.1; extra == "pt"
87
+ Requires-Dist: tabulate>=0.7.7; extra == "pt"
88
+ Requires-Dist: tqdm>=4.64.0; extra == "pt"
89
+ Requires-Dist: timm>=0.9.16; extra == "pt"
90
+ Requires-Dist: transformers>=4.48.0; extra == "pt"
91
+ Requires-Dist: accelerate>=0.29.1; extra == "pt"
92
+ Requires-Dist: python-doctr==0.9.0; extra == "pt"
93
+ Requires-Dist: pycocotools>=2.0.2; extra == "pt"
94
+ Requires-Dist: boto3==1.34.102; extra == "pt"
95
+ Requires-Dist: pdfplumber>=0.11.0; extra == "pt"
96
+ Requires-Dist: fasttext-wheel; extra == "pt"
97
+ Requires-Dist: jdeskew>=0.2.2; extra == "pt"
98
+ Requires-Dist: apted==1.0.3; extra == "pt"
99
+ Requires-Dist: distance==0.1.3; extra == "pt"
100
+ Requires-Dist: lxml>=4.9.1; extra == "pt"
101
+ Provides-Extra: docs
102
+ Requires-Dist: tensorpack==0.11; extra == "docs"
103
+ Requires-Dist: boto3==1.34.102; extra == "docs"
104
+ Requires-Dist: transformers>=4.48.0; extra == "docs"
105
+ Requires-Dist: accelerate>=0.29.1; extra == "docs"
106
+ Requires-Dist: pdfplumber>=0.11.0; extra == "docs"
107
+ Requires-Dist: lxml>=4.9.1; extra == "docs"
108
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "docs"
109
+ Requires-Dist: jdeskew>=0.2.2; extra == "docs"
110
+ Requires-Dist: jinja2; extra == "docs"
111
+ Requires-Dist: mkdocs-material; extra == "docs"
112
+ Requires-Dist: mkdocstrings-python; extra == "docs"
113
+ Requires-Dist: griffe==0.25.0; extra == "docs"
114
+ Provides-Extra: dev
115
+ Requires-Dist: python-dotenv==1.0.0; extra == "dev"
116
+ Requires-Dist: click; extra == "dev"
117
+ Requires-Dist: black==23.7.0; extra == "dev"
118
+ Requires-Dist: isort==5.13.2; extra == "dev"
119
+ Requires-Dist: pylint==2.17.4; extra == "dev"
120
+ Requires-Dist: mypy==1.4.1; extra == "dev"
121
+ Requires-Dist: wandb; extra == "dev"
122
+ Requires-Dist: types-PyYAML>=6.0.12.12; extra == "dev"
123
+ Requires-Dist: types-termcolor>=1.1.3; extra == "dev"
124
+ Requires-Dist: types-tabulate>=0.9.0.3; extra == "dev"
125
+ Requires-Dist: types-tqdm>=4.66.0.5; extra == "dev"
126
+ Requires-Dist: lxml-stubs>=0.5.1; extra == "dev"
127
+ Requires-Dist: types-Pillow>=10.2.0.20240406; extra == "dev"
128
+ Requires-Dist: types-urllib3>=1.26.25.14; extra == "dev"
129
+ Provides-Extra: test
130
+ Requires-Dist: pytest==8.0.2; extra == "test"
131
+ Requires-Dist: pytest-cov; extra == "test"
132
+ Dynamic: author
133
+ Dynamic: classifier
134
+ Dynamic: description
135
+ Dynamic: description-content-type
136
+ Dynamic: home-page
137
+ Dynamic: license
138
+ Dynamic: license-file
139
+ Dynamic: provides-extra
140
+ Dynamic: requires-dist
141
+ Dynamic: requires-python
142
+ Dynamic: summary
143
+
144
+ <p align="center">
145
+ <img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_logo.png" alt="Deep Doctection Logo" width="60%">
146
+ </p>
147
+
148
+ ![GitHub Repo stars](https://img.shields.io/github/stars/deepdoctection/deepdoctection)
149
+ ![PyPI - Version](https://img.shields.io/pypi/v/deepdoctection)
150
+ ![PyPI - License](https://img.shields.io/pypi/l/deepdoctection)
151
+
152
+
153
+ ------------------------------------------------------------------------------------------------------------------------
154
+ # NEW
155
+
156
+ Version `v.0.43` includes a significant redesign of the Analyzer's default configuration. Key changes include:
157
+
158
+ * More powerful models for Document Layout Analysis and OCR.
159
+ * Expanded functionality.
160
+ * Fewer dependencies.
161
+
162
+ ------------------------------------------------------------------------------------------------------------------------
163
+
164
+ <p align="center">
165
+ <h1 align="center">
166
+ A Package for Document Understanding
167
+ </h1>
168
+ </p>
169
+
170
+
171
+
172
+ **deep**doctection is a Python library that orchestrates Scan and PDF document layout analysis and extraction for RAG.
173
+ It also provides a framework for training, evaluating and inferencing Document AI models.
174
+
175
+ Check the demo of a document layout analysis pipeline with OCR on 🤗
176
+ [**Hugging Face spaces**](https://huggingface.co/spaces/deepdoctection/deepdoctection).
177
+
178
+ # Overview
179
+
180
+ - Document layout analysis and table recognition in PyTorch with
181
+ [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) and
182
+ [**Transformers**](https://github.com/huggingface/transformers)
183
+ or Tensorflow and [**Tensorpack**](https://github.com/tensorpack),
184
+ - OCR with support of [**Tesseract**](https://github.com/tesseract-ocr/tesseract), [**DocTr**](https://github.com/mindee/doctr) and
185
+ [**AWS Textract**](https://aws.amazon.com/textract/),
186
+ - Document and token classification with the [**LayoutLM**](https://github.com/microsoft/unilm) family,
187
+ [**LiLT**](https://github.com/jpWang/LiLT) and selected
188
+ [**Bert**](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)-style models, including features like sliding windows.
189
+ - Text mining for native PDFs with [**pdfplumber**](https://github.com/jsvine/pdfplumber),
190
+ - Language detection with [**fastText**](https://github.com/facebookresearch/fastText),
191
+ - Deskewing and rotating images with [**jdeskew**](https://github.com/phamquiluan/jdeskew).
192
+ - Fine-tuning and evaluation tools.
193
+ - Lots of [tutorials](https://github.com/deepdoctection/notebooks)
194
+
195
+ Have a look at the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb)
196
+ for an easy start.
197
+
198
+ Check the [**release notes**](https://github.com/deepdoctection/deepdoctection/releases) for recent updates.
199
+
200
+ # Example
201
+
202
+ ```python
203
+ import deepdoctection as dd
204
+ from IPython.core.display import HTML
205
+ from matplotlib import pyplot as plt
206
+
207
+ analyzer = dd.get_dd_analyzer() # instantiate the built-in analyzer similar to the Hugging Face space demo
208
+
209
+ df = analyzer.analyze(path = "/path/to/your/doc.pdf") # setting up pipeline
210
+ df.reset_state() # Trigger some initialization
211
+
212
+ doc = iter(df)
213
+ page = next(doc)
214
+
215
+ image = page.viz(show_figures=True, show_residual_layouts=True)
216
+ plt.figure(figsize = (25,17))
217
+ plt.axis('off')
218
+ plt.imshow(image)
219
+ ```
220
+
221
+ <p align="center">
222
+ <img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_rm_sample.png"
223
+ alt="sample" width="40%">
224
+ </p>
225
+
226
+ ```
227
+ HTML(page.tables[0].html)
228
+ ```
229
+
230
+ <p align="center">
231
+ <img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_rm_table.png"
232
+ alt="table" width="40%">
233
+ </p>
234
+
235
+ ```
236
+ print(page.text)
237
+ ```
238
+
239
+ <p align="center">
240
+ <img src="https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/dd_rm_text.png"
241
+ alt="text" width="40%">
242
+ </p>
243
+
244
+
245
+
246
+ ## Requirements
247
+
248
+ ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection_220525.png)
249
+
250
+ - Linux or macOS. Windows is not supported but there is a [Dockerfile](./docker/pytorch-cpu-jupyter/Dockerfile) available.
251
+ - Python >= 3.9
252
+ - 1.13 \<= PyTorch **or** 2.11 \<= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
253
+ Tensorflow support will be stopped from Python 3.11 onwards.
254
+ - To fine-tune models, a GPU is recommended.
255
+
256
+ | Task | PyTorch | Torchscript | Tensorflow |
257
+ |---------------------------------------------|:-------:|----------------|:------------:|
258
+ | Layout detection via Detectron2/Tensorpack | ✅ | ✅ (CPU only) | ✅ (GPU only) |
259
+ | Table recognition via Detectron2/Tensorpack | ✅ | ✅ (CPU only) | ✅ (GPU only) |
260
+ | Table transformer via Transformers | ✅ | ❌ | ❌ |
261
+ | Deformable-Detr | ✅ | ❌ | ❌ |
262
+ | DocTr | ✅ | ❌ | ✅ |
263
+ | LayoutLM (v1, v2, v3, XLM) via Transformers | ✅ | ❌ | ❌ |
264
+
265
+ ## Installation
266
+
267
+ We recommend using a virtual environment.
268
+
269
+ #### Get started installation
270
+
271
+ For a simple setup which is enough to parse documents with the default setting, install the following:
272
+
273
+ **PyTorch**
274
+
275
+ ```
276
+ pip install transformers
277
+ pip install python-doctr
278
+ pip install deepdoctection
279
+ ```
280
+
281
+ **TensorFlow**
282
+
283
+ ```
284
+ pip install tensorpack
285
+ pip install python-doctr
286
+ pip install deepdoctection
287
+ ```
288
+
289
+ Both setups are sufficient to run the [**introduction notebook**](https://github.com/deepdoctection/notebooks/blob/main/Get_Started.ipynb).
290
+
291
+ #### Full installation
292
+
293
+ The following installation will give you ALL models available within the Deep Learning framework as well as all models
294
+ that are independent of Tensorflow/PyTorch.
295
+
296
+ **PyTorch**
297
+
298
+ First install **Detectron2** separately as it is not distributed via PyPI. Check the instructions
299
+ [here](https://detectron2.readthedocs.io/en/latest/tutorials/install.html) or try:
300
+
301
+ ```
302
+ pip install detectron2@git+https://github.com/deepdoctection/detectron2.git
303
+ ```
304
+
305
+ Then install **deep**doctection with all its dependencies:
306
+
307
+ ```
308
+ pip install deepdoctection[pt]
309
+ ```
310
+
311
+ **Tensorflow**
312
+
313
+ ```
314
+ pip install deepdoctection[tf]
315
+ ```
316
+
317
+
318
+ For further information, please consult the [**full installation instructions**](https://deepdoctection.readthedocs.io/en/latest/install/).
319
+
320
+
321
+ ### Installation from source
322
+
323
+ Download the repository or clone via
324
+
325
+ ```
326
+ git clone https://github.com/deepdoctection/deepdoctection.git
327
+ ```
328
+
329
+ **PyTorch**
330
+
331
+ ```
332
+ cd deepdoctection
333
+ pip install ".[pt]" # or "pip install -e .[pt]"
334
+ ```
335
+
336
+ **Tensorflow**
337
+
338
+ ```
339
+ cd deepdoctection
340
+ pip install ".[tf]" # or "pip install -e .[tf]"
341
+ ```
342
+
343
+
344
+
345
+ ### Running a Docker container from Docker hub
346
+
347
+ Pre-existing Docker images can be downloaded from the
348
+ [Docker hub](https://hub.docker.com/r/deepdoctection/deepdoctection).
349
+
350
+ ```
351
+ docker pull deepdoctection/deepdoctection:<release_tag>
352
+ ```
353
+
354
+ Use the Docker compose file `./docker/pytorch-gpu/docker-compose.yaml`.
355
+ In the `.env` file provided, specify the host directory where **deep**doctection's cache should be stored.
356
+ Additionally, specify a working directory to mount files to be processed into the container.
357
+
358
+ ```
359
+ docker compose up -d
360
+ ```
361
+
362
+ will start the container. There is no endpoint exposed, though.
363
+
364
+ ## Credits
365
+
366
+ We thank all libraries that provide high-quality code and pre-trained models. Without them, it would have been impossible
367
+ to develop this framework.
368
+
369
+
370
+ ## If you like **deep**doctection ...
371
+
372
+ ...you can easily support the project by making it more visible. Leaving a star or a recommendation will help.
373
+
374
+ ## License
375
+
376
+ Distributed under the Apache 2.0 License. Check [LICENSE](https://github.com/deepdoctection/deepdoctection/blob/master/LICENSE) for additional information.
@@ -0,0 +1,149 @@
1
+ deepdoctection/__init__.py,sha256=T4BXZotL855uGwIHhore8lZAfSinpIcrpeIvrVsSCyc,12910
2
+ deepdoctection/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
+ deepdoctection/analyzer/__init__.py,sha256=wg0BcFwdCeREwzZfa--Yx8HUJ9LPv5z5PmLwtkZdPH8,772
4
+ deepdoctection/analyzer/config.py,sha256=CgH4etvZL0JdeIHAXMFdDro2VvVsF2itgQar_Ml94pw,41185
5
+ deepdoctection/analyzer/dd.py,sha256=2BGvZpl9o9khcaOV52-DPHMrs0DsqUO8cpdqFVHHzDQ,5176
6
+ deepdoctection/analyzer/factory.py,sha256=DI0S38KAG2sIROrSximsWJsMbem91a9zXaeWsDNvkGg,37574
7
+ deepdoctection/configs/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
8
+ deepdoctection/configs/conf_dd_one.yaml,sha256=DHqAIKH3jRam54QO7qib2zutmpyFA8TqdV5UvIV191A,3688
9
+ deepdoctection/configs/conf_tesseract.yaml,sha256=oF6szDyoi15FHvq7yFUNIEjfA_jNLhGxoowiRsz_zY4,35
10
+ deepdoctection/configs/profiles.jsonl,sha256=zhMpsJWdfeSj2oq2J0BbiKhHnE7PIq47PA8-I1Th0pA,30266
11
+ deepdoctection/dataflow/__init__.py,sha256=pY4lhjTes2BU-0AdIIRMnRqo9Sv6TopVE_SNfLmpgnc,828
12
+ deepdoctection/dataflow/base.py,sha256=ZLRijyHI1J7tBfnE-q7eqUieYMMERjtK-c1oK40dBkk,6556
13
+ deepdoctection/dataflow/common.py,sha256=DKD_pRZBCt2vO3oNZcOvdoC3jThabTNcNbTS16mpVR0,10351
14
+ deepdoctection/dataflow/custom.py,sha256=xlw1Op4J3a8PNIlzY5stAY6olpBTN8KMhj1KQ7gf8tA,6792
15
+ deepdoctection/dataflow/custom_serialize.py,sha256=zWDx1_mkPpeot9VN-4P1C2sFtK6vYUaFoSs6UiiLMZA,23234
16
+ deepdoctection/dataflow/parallel_map.py,sha256=Xhem9lvNDKVd_x02Ih9qB4J6bEnxNbc8uHIro8mX9UU,15783
17
+ deepdoctection/dataflow/serialize.py,sha256=G5kfkFHyhy3E3AusTvTizBi0EVDU2YZov9a-LmXPjy4,4592
18
+ deepdoctection/dataflow/stats.py,sha256=AN5cbagveaDGqCXaYj6iWITpn-a2eO_AumA-vQNQ_XE,9764
19
+ deepdoctection/datapoint/__init__.py,sha256=DOhcN81MhyPUuFw9F4pyEDyZseeD9qxb8NdB_Q_81GA,1599
20
+ deepdoctection/datapoint/annotation.py,sha256=f32BNmzUGJoNMeGst2RGC2jmjJpzzjxyBRKFG8FCubY,23092
21
+ deepdoctection/datapoint/box.py,sha256=QAS8sK2Ge4_ysW6zOYkLlzNwhSyw_mhYcYsxscClEno,31453
22
+ deepdoctection/datapoint/convert.py,sha256=6ENXX3tBdY8ogb2NBPxsOsQMGnQux8ol5nrUfWS5tYE,7352
23
+ deepdoctection/datapoint/image.py,sha256=nDaWUtdD5j6l_iXW9d2PoIyXBC8M3_idoEIXm7JWGyQ,35139
24
+ deepdoctection/datapoint/view.py,sha256=5TYmKpNNZwJb-NrUXv08H3_zSfHKDHhg6LnEZjBqVns,56622
25
+ deepdoctection/datasets/__init__.py,sha256=4ifjIwWCPYiS31GzUlVDScrkNOrb1eo5xHlRXNyg_58,994
26
+ deepdoctection/datasets/adapter.py,sha256=VSLM_980aHi4TpgOxfxiBHiF_fUXyh348PXet6zTo-4,7779
27
+ deepdoctection/datasets/base.py,sha256=HTIquJir2BZRTLl1HSQM0ICfvjIaWAjJeyz3BEHgdb0,23175
28
+ deepdoctection/datasets/dataflow_builder.py,sha256=0vwkItr0wVbKPtTXoS6uJLO9QQNWbS0Ri7CySuywWxU,4186
29
+ deepdoctection/datasets/info.py,sha256=DLRYq3cHp3L34CcSXPUJ8j8wguJp2aVdoH-AhODNLBA,20814
30
+ deepdoctection/datasets/registry.py,sha256=qYRVycNYFeAzWB7jENGYzokgyzIEvTRb49he2UmPUe8,3451
31
+ deepdoctection/datasets/save.py,sha256=uIRmp3c6o4XDubs7Ay0Sf6zh3gOMFArv3qEn-hq3sBQ,3364
32
+ deepdoctection/datasets/instances/__init__.py,sha256=HIEyl1gZ_IsXda2x3NP8uDROJT8FKwfhrO4xRc_olIk,1428
33
+ deepdoctection/datasets/instances/doclaynet.py,sha256=dc1O7zj4iKrZXbEEALdKKC-1_19Nz4Ln-QoYDeziT7M,12429
34
+ deepdoctection/datasets/instances/fintabnet.py,sha256=ejWH4GnQOdwRkbQoEfAX8IxXhXHQCDHFHMx9Lrl_KIQ,11970
35
+ deepdoctection/datasets/instances/funsd.py,sha256=cDd8ThEwTPy8CarLQMzJykGsfUyGNhSxWaMZ9QvpImc,7276
36
+ deepdoctection/datasets/instances/iiitar13k.py,sha256=iMvPkSX4gKElYb90oTZ36oFxutAth_Tezh6f-wui06s,7014
37
+ deepdoctection/datasets/instances/layouttest.py,sha256=T-ri1ylmhUaF3xW9DV7IRBynWa5kKBK_MQAx42ERf6M,4624
38
+ deepdoctection/datasets/instances/publaynet.py,sha256=SpOsJM3tCwXlKcY8RQ4eKSsGktF9i0B3w-zsuHhlGWk,5540
39
+ deepdoctection/datasets/instances/pubtables1m.py,sha256=-B5i1s_OyfpNuz5qf_CNP4hxEdDgv-vz-4cpMYAvBCc,12610
40
+ deepdoctection/datasets/instances/pubtabnet.py,sha256=ljllMQ-y_2Jvu4p29AsOPin9RHogVl84dNcsVqDtk50,8687
41
+ deepdoctection/datasets/instances/rvlcdip.py,sha256=DGiWjC1iDZPqMo8P6-GOxIRAdrQOmyUCTLpRNKIlsJM,6847
42
+ deepdoctection/datasets/instances/xfund.py,sha256=1QKsmyZJIbpZj6vdtHJrfRaA3NFKrP_yXbl8EJO-YNU,9143
43
+ deepdoctection/datasets/instances/xsl/__init__.py,sha256=TX_P6tqDOF1LK1mi9ruAl7x0mtv1Asm8cYWCz3Pe2dk,646
44
+ deepdoctection/datasets/instances/xsl/pascal_voc.xsl,sha256=DlzFV2P8NtQKXVe96i-mIcPWmL6tsW7NQjgCuz2pCL4,1952
45
+ deepdoctection/eval/__init__.py,sha256=deGj63ejU9f3nthBU6GI25QIQidKWJmIW4q8fpn12bU,920
46
+ deepdoctection/eval/accmetric.py,sha256=TlOFUU9y9BBjJKVsRMyoVKpLZl6AflNsZ4thqSEie4k,19957
47
+ deepdoctection/eval/base.py,sha256=mYVvzD_wVPwsrBqcl1O4Vqqhg1yGtlG6hkuMjVZvt-k,5290
48
+ deepdoctection/eval/cocometric.py,sha256=H-BsLeV9S93tG6jzUN-3FCPXYiUUoTAYuznE5SvS9Bc,11070
49
+ deepdoctection/eval/eval.py,sha256=UUL-wk39RONLMOOyH3WjjpHunZJiQluXZFqir8eaDtY,19808
50
+ deepdoctection/eval/registry.py,sha256=us6EGN_tAia1Mk1mwWQwDeE-xqxcuopztdi8n-ieGbg,1100
51
+ deepdoctection/eval/tedsmetric.py,sha256=EcNeJynsmxyl5bOH3bjy2wE647ONf0SF5OZyGbVu35Q,9963
52
+ deepdoctection/eval/tp_eval_callback.py,sha256=lqrOn2tdaRiF_Vr_9CwBr2ryatcWu3mQKya8YZ2pA9A,5261
53
+ deepdoctection/extern/__init__.py,sha256=jG2qe5_X7fJFnJlx04Lf1KUTXKKKYtCkKR7WQ7looUk,991
54
+ deepdoctection/extern/base.py,sha256=vs4EO9vkfyTW-nVM0dnlqsmnPPeHIXI2wW5b4Wpiz-Y,31547
55
+ deepdoctection/extern/d2detect.py,sha256=I0oEkprr5iVpKpM3a3nknAU-sXwNoDQdp_B1gzzODsk,22374
56
+ deepdoctection/extern/deskew.py,sha256=L_jU0rXh03qzwaT79EIqE_zYMUVeFwWDbsGbtahuL2k,3124
57
+ deepdoctection/extern/doctrocr.py,sha256=d468jZpR4WBvZPt5Vze70dh6tZacBM1HO0pcy8ynOFM,25358
58
+ deepdoctection/extern/fastlang.py,sha256=4D9A-_hTXUcvXG6IJJknX34LrD71v08XtNdWgvXD7fE,4736
59
+ deepdoctection/extern/hfdetr.py,sha256=N3eLNI5BsQS9_7YZyBeWndSgUydJij7ugZA9p4V1xaQ,14316
60
+ deepdoctection/extern/hflayoutlm.py,sha256=3mZZ3byn00jSrLWO2vZFas9j4VrhbYQNmF1mwPG2ElQ,59642
61
+ deepdoctection/extern/hflm.py,sha256=y-9brzmT2NYtFoNcWHABNg2ZZQXSOP9CyqtT1OoeV9U,9754
62
+ deepdoctection/extern/model.py,sha256=-GbnuhLFq7jpBOvtpJe6IhGXxQdqwiM8epEd7IRELoU,18234
63
+ deepdoctection/extern/pdftext.py,sha256=ljzPQn3yYAlS6MoZqzixD-fO2GlHwu1aMiOQ6qMIzbg,7513
64
+ deepdoctection/extern/tessocr.py,sha256=SuPmngsJg38riL4b09z6_FIzJH6H3RIwoighG2GPMYM,17457
65
+ deepdoctection/extern/texocr.py,sha256=93vGj0TX2gENMFV6_FDk3et1sDecrNeuozv5EfOR5nk,5931
66
+ deepdoctection/extern/tpdetect.py,sha256=Kr00n80V_OfE-EGfpjiVw1eAQ2n2tuT-hSco-dLSR9E,8516
67
+ deepdoctection/extern/pt/__init__.py,sha256=3Cu0ZHjbYsJomru7-RQXEHihEQLegZrmLetlHiqS58I,742
68
+ deepdoctection/extern/pt/nms.py,sha256=2lSpEH8cI_QXdz5xL_OaitqsGoHhp5xvDssK5Yo8q4Q,2218
69
+ deepdoctection/extern/pt/ptutils.py,sha256=AmovwBx6WGhSE45Sxt3WYQ3Nu1ZF44dJ5WcFiH7KVsE,2132
70
+ deepdoctection/extern/tp/__init__.py,sha256=8QMkcA7tChCr1QXiA0551lZS2jTsECBrrL2YUanpFAk,706
71
+ deepdoctection/extern/tp/tfutils.py,sha256=paX5nOO2L8G4ze1AmpdizCDezMxF3yqNMXFvwFWh42A,4056
72
+ deepdoctection/extern/tp/tpcompat.py,sha256=u6qV5bhr4UUPCP_Bz3I1Z0b5ZdFEShN84JR6fFq6tJI,6249
73
+ deepdoctection/extern/tp/tpfrcnn/__init__.py,sha256=OzDaR5A8HGz9a4VwjLiR9rN1Nf1cSebv8DVEMxStFOw,703
74
+ deepdoctection/extern/tp/tpfrcnn/common.py,sha256=fCxwi2u752ZlI_DtIkLC_x9j9tyo1nnirAi2PmnziD4,3830
75
+ deepdoctection/extern/tp/tpfrcnn/predict.py,sha256=957dnhCByS-FZH13efFWADhodaV4lKto-ikLPetfvEQ,4338
76
+ deepdoctection/extern/tp/tpfrcnn/preproc.py,sha256=oHN9keBurjdNQqXmsb5BgURB5nl-eEp0KHvO1DPRQL4,12009
77
+ deepdoctection/extern/tp/tpfrcnn/config/__init__.py,sha256=RhJiXId6vUSw_Pi49SPwj0jrf61VxxptXoGeBKtT42M,705
78
+ deepdoctection/extern/tp/tpfrcnn/config/config.py,sha256=Xh3TBYWvPhoOhPRjncvv9FJ75T_4IAuzBEuPv751DFg,11531
79
+ deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py,sha256=RhJiXId6vUSw_Pi49SPwj0jrf61VxxptXoGeBKtT42M,705
80
+ deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py,sha256=H7xoWhRwCh-vlHAL5hCEolKBJ8Y2xe9duZuBuLs0ZwQ,9835
81
+ deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py,sha256=4jgWyja-_V44zJVfK4ySmknhnhqfb9f6ruVwbh387aE,13752
82
+ deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py,sha256=UvZ8_34dNjCvxsTxCJvrlpqUpQb9gWxgwRoIKgedIog,7361
83
+ deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py,sha256=plovKReX6rFjnL_ravLUUCZ49ZFni87FlRJGK0fXqco,5777
84
+ deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py,sha256=Ejd0Z2uUrdAfRjXQoS-lBVPukLlw8geP0yXcF61-nk4,11486
85
+ deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py,sha256=KsL08NNy4PEvBu53HV6bMio58oqIfVrcoqpti27pZOI,18166
86
+ deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py,sha256=B-rImWGWLNNe4UPJfhTpi4f1LUMCW8YJAbwoJFiG__o,4966
87
+ deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py,sha256=F7NGrvKyPZRxnl96zoFyezNzymFJvQghMjGslsc7iFg,9028
88
+ deepdoctection/extern/tp/tpfrcnn/utils/__init__.py,sha256=kiPlXxHlTGN9eI7YE9BgwteOQ_nCYCuqqSLO5JfkCTQ,695
89
+ deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py,sha256=aBLqPg_ApaiimtBRaOsLKTZZFIBh87vVtqjLPMaX9fQ,2379
90
+ deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py,sha256=O-q1GQiOEd1lN1MQDsJvHwD2OmBO-qHNeqJ1Qnec93g,3539
91
+ deepdoctection/mapper/__init__.py,sha256=H2fPGQ1Es0Osyvd9pSNjtyKDpPZuzsyqk2DPyAp1ckI,1099
92
+ deepdoctection/mapper/cats.py,sha256=YEnf5uOvyf_UFcEtN5ddJxF7LGwkwdPWjBE14QvSPV4,17320
93
+ deepdoctection/mapper/cocostruct.py,sha256=BbykSMXklsr6YJ4HRDYEABL1NUxndZvhKPr683aIG_A,6287
94
+ deepdoctection/mapper/d2struct.py,sha256=XiIuQAcC-ekn97RHz2hALcD02Mpdze7Lrfm4vPEB9Iw,11481
95
+ deepdoctection/mapper/hfstruct.py,sha256=15eOUwQ_f3rflZJdnQzIaN7tpj8dhDKDRlAykOtiDsk,5727
96
+ deepdoctection/mapper/laylmstruct.py,sha256=_10260AtRcF2xdkALz2JatiAKMcNIwNMbJgO__hOPN8,42094
97
+ deepdoctection/mapper/maputils.py,sha256=21Oyt4I8IV5jSgtplBP-opPTKk3idgJnA3s8ICPvMvc,8977
98
+ deepdoctection/mapper/match.py,sha256=Q_Dq95IpO9o0gRKk-Jg7ua0eiZ2rMHUhIhwXygT2aGU,10202
99
+ deepdoctection/mapper/misc.py,sha256=LYSORlUR7sn0Qf-wgpTyVNwGgnpuKN9ln7TAiFrbBrQ,7366
100
+ deepdoctection/mapper/pascalstruct.py,sha256=PviZjhTk4p5HDUTlF8qhWPyraKD0uh51f2hoNqA1Bbg,3838
101
+ deepdoctection/mapper/prodigystruct.py,sha256=OWzPUbNDrqwFipH8YWI5eSxwMdA7qYczaFdsHNrE_4c,7001
102
+ deepdoctection/mapper/pubstruct.py,sha256=UTyfUmzMSuf2BXtdYwHjK7ngsIwAxSZjwTxDtz6DySg,23416
103
+ deepdoctection/mapper/tpstruct.py,sha256=dxtEVHYVnkH-zjjbHzkFrPgS9eheys6E-CMlsjaOnxo,5468
104
+ deepdoctection/mapper/xfundstruct.py,sha256=XLUZ-yBMWtKFQ40vxHl6p8EZZvl68JdwJlV00A93Zy8,9108
105
+ deepdoctection/pipe/__init__.py,sha256=E3cYAVWOvMzIN7jbKFyqLjFXahcFGcAGkb1uChM_XCY,1034
106
+ deepdoctection/pipe/anngen.py,sha256=Hfi7C6-iOv7t8tjFoz4FuIhcz6yMZx52f5SG9bsVnLg,16365
107
+ deepdoctection/pipe/base.py,sha256=oszB_DepcFtORvDdGTZZPWMhk01C68RUWXHjeX7SF3M,18163
108
+ deepdoctection/pipe/common.py,sha256=OcsqHr_c66Yqt98hFeKwaa0mciWMCauw0HZ3YnHx8MU,24586
109
+ deepdoctection/pipe/concurrency.py,sha256=_EKZi4eCeF3mVHytZL_fMwyqa25C2aR9g8vrIFB8iR4,9780
110
+ deepdoctection/pipe/doctectionpipe.py,sha256=ik5F92F3klI5Nve_AnyIRj-ApMoKHSR2SjcWWnI0d2g,14063
111
+ deepdoctection/pipe/language.py,sha256=T5g5_2GIsbTltAmn_PFymMUMoik8_b0uJNx8f5dT9MM,5898
112
+ deepdoctection/pipe/layout.py,sha256=oAldMtwyZee1IqpuflKKvmeL2Z_nXFiqwFMS4VYv5eI,6391
113
+ deepdoctection/pipe/lm.py,sha256=nYI2bm0sc9d3JMlIPyNyd4XxXFRBIHRUYfMImuek6b4,19793
114
+ deepdoctection/pipe/order.py,sha256=9OarsHKwVqT1bTDIn7XGeGLgpetEJW3uLjuJQDdhjG4,40684
115
+ deepdoctection/pipe/refine.py,sha256=SrMcAWXRO5tJpqaZCEz9RzvjPyiQiE8fZ9TXBcaBKck,23310
116
+ deepdoctection/pipe/registry.py,sha256=uT5fnHjffoNGk2JPuD2-pMYtO3Iko7-wrwVZVCWLtok,906
117
+ deepdoctection/pipe/segment.py,sha256=rHhEWr5zZ1ppj-gMa-q-UCr1AYTWpUW7oA1umwebqBI,61302
118
+ deepdoctection/pipe/sub_layout.py,sha256=Wh4_uW-6CISe0xwD1AbJX1uk_4ygiUlQHV95gnl7280,14135
119
+ deepdoctection/pipe/text.py,sha256=4fYLCXoE-wFz0atAwbXiy-bjiJuKjNx3i3IHa54YW-0,11009
120
+ deepdoctection/pipe/transform.py,sha256=X1ZUvb6N9YBdJm4XOI7Fe4TZH1OJgJnmOi4DFK-B75U,4797
121
+ deepdoctection/train/__init__.py,sha256=YFTRAZF1F7cEAKTdAIi1BLyYb6rSRcwq09Ui5Lu8d6E,1071
122
+ deepdoctection/train/d2_frcnn_train.py,sha256=edmyNTBRMM_TuL_1D6G2TSY9CBqNndIuyKree_KAso0,15508
123
+ deepdoctection/train/hf_detr_train.py,sha256=El-VHggdBObttFQwFIfQs5xm7aaxpC5IzNUJ1gF4Z6E,13278
124
+ deepdoctection/train/hf_layoutlm_train.py,sha256=bNL5OCLKytshG6kaTJDLTQOcvWKwEsYVmnj8zPd7uio,23634
125
+ deepdoctection/train/tp_frcnn_train.py,sha256=Tltb-v2JD5oPuHCZGA9B5DM4ZaidoBITlH93QX-KPKI,13570
126
+ deepdoctection/utils/__init__.py,sha256=brBceRWeov9WXMiJTjyJOF2rHMP8trGGRRjhMdZ61nI,2371
127
+ deepdoctection/utils/concurrency.py,sha256=9ly81D5i2ZFzKfXMIUSmhT42eMs4QttsRhWXdkIk6Aw,5832
128
+ deepdoctection/utils/context.py,sha256=GXgIGQ10JwosE6FQSPdlJoCXyeM_cahEcbQ2mxGNofI,4538
129
+ deepdoctection/utils/develop.py,sha256=4myrqBDypM6tQ2a2Jo3Q20RuE_W2czykpXBwgXPrxNw,3568
130
+ deepdoctection/utils/env_info.py,sha256=yyRyjQT3xz73yEcqauPM6GdCpvWwYGAH-_KUHfXBrtM,19855
131
+ deepdoctection/utils/error.py,sha256=sIry8F5MZ0yLvKfAwVz90IorKWVvjoRqcC0L8qq8mLk,2480
132
+ deepdoctection/utils/file_utils.py,sha256=D4cua4i3Q-4ZeSRXXWEL4z7hp0M1qh9rrVSfS5t-Hzo,25643
133
+ deepdoctection/utils/fs.py,sha256=KTS9FJzZk9le_vmIPr9IisJw0AyTfjkyX1KoWQy4DNs,12729
134
+ deepdoctection/utils/identifier.py,sha256=Jt12MeZf7eC1qciY5Fp_AYUGxYVcjsy7xNBUvJil7dU,2270
135
+ deepdoctection/utils/logger.py,sha256=ddQ0xBStluf8OvoRlEB8YkqyRR-ZYgyJYLClTmJJMAU,10290
136
+ deepdoctection/utils/metacfg.py,sha256=5M390--ZMoyJEt5oZOwFMGt2i8OF_ayeb0NVmUO_3OQ,7235
137
+ deepdoctection/utils/mocks.py,sha256=IkN3-IzAl4eX0ibgKIHg8IY7ykVw6BnpF6XnxKnKaZI,2389
138
+ deepdoctection/utils/pdf_utils.py,sha256=BrxTuY9j0COyIRkJchJ0tt2h6ZsA2an6z-H8E8QwgUQ,13490
139
+ deepdoctection/utils/settings.py,sha256=OrFEe9Mll3UuDhjyS-cTCv_q1ZSr30Jpl9nQxk__t2I,12824
140
+ deepdoctection/utils/tqdm.py,sha256=kx3Ivf0x85S0ZmEaN5mImu0V6isOgygOU8iyr2U99XU,1850
141
+ deepdoctection/utils/transform.py,sha256=jgeCyQWLN9q79jCGW7jysyKUKcJ1AVMk8OslF-3fbag,16095
142
+ deepdoctection/utils/types.py,sha256=ti4WdtIJSg3TGK_YPkkoY9PYGMnR2tTX6Xfik8U1pNk,2986
143
+ deepdoctection/utils/utils.py,sha256=NBUb1qbx8Jm-AvYN1Sdbk0huXhbAKxZ-ZtOcMespsMM,7064
144
+ deepdoctection/utils/viz.py,sha256=bujRIujvX317rPz4jBrj0yd3WP8wPjDUiI5GUrw9MzQ,27339
145
+ deepdoctection-0.43.dist-info/licenses/LICENSE,sha256=GQ0rUvuGdrMNEI3iHK5UQx6dIMU1QwAuyXsxUHn5MEQ,11351
146
+ deepdoctection-0.43.dist-info/METADATA,sha256=F08hSp-kyzwW1tsTWyRMtWG3TaJsA8_LgueZ7irkBqA,13404
147
+ deepdoctection-0.43.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
148
+ deepdoctection-0.43.dist-info/top_level.txt,sha256=hs2DdoOL9h4mnHhmO82BT4pz4QATIoOZ20PZmlnxFI8,15
149
+ deepdoctection-0.43.dist-info/RECORD,,
@@ -1,146 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # File: config.py
3
-
4
- # Copyright 2024 Dr. Janis Meyer. All rights reserved.
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Pipeline configuration for deepdoctection analyzer. Do not change the defaults in this file. """
19
-
20
- from ..datapoint.view import IMAGE_DEFAULTS
21
- from ..utils.metacfg import AttrDict
22
- from ..utils.settings import CellType, LayoutType
23
-
24
- cfg = AttrDict()
25
-
26
-
27
- cfg.LANGUAGE = None
28
- cfg.LIB = None
29
- cfg.DEVICE = None
30
- cfg.USE_ROTATOR = False
31
- cfg.USE_LAYOUT = True
32
- cfg.USE_TABLE_SEGMENTATION = True
33
-
34
- cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
35
- cfg.TF.LAYOUT.FILTER = None
36
-
37
- cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
38
- cfg.TF.CELL.FILTER = None
39
-
40
- cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
41
- cfg.TF.ITEM.FILTER = None
42
-
43
- cfg.PT.ENFORCE_WEIGHTS.LAYOUT = True
44
- cfg.PT.LAYOUT.WEIGHTS = "layout/d2_model_0829999_layout_inf_only.pt"
45
- cfg.PT.LAYOUT.WEIGHTS_TS = "layout/d2_model_0829999_layout_inf_only.ts"
46
- cfg.PT.LAYOUT.FILTER = None
47
- cfg.PT.LAYOUT.PAD.TOP = 60
48
- cfg.PT.LAYOUT.PAD.RIGHT = 60
49
- cfg.PT.LAYOUT.PAD.BOTTOM = 60
50
- cfg.PT.LAYOUT.PAD.LEFT = 60
51
-
52
- cfg.PT.ENFORCE_WEIGHTS.ITEM = True
53
- cfg.PT.ITEM.WEIGHTS = "item/d2_model_1639999_item_inf_only.pt"
54
- cfg.PT.ITEM.WEIGHTS_TS = "item/d2_model_1639999_item_inf_only.ts"
55
- cfg.PT.ITEM.FILTER = None
56
- cfg.PT.ITEM.PAD.TOP = 60
57
- cfg.PT.ITEM.PAD.RIGHT = 60
58
- cfg.PT.ITEM.PAD.BOTTOM = 60
59
- cfg.PT.ITEM.PAD.LEFT = 60
60
-
61
- cfg.PT.ENFORCE_WEIGHTS.CELL = True
62
- cfg.PT.CELL.WEIGHTS = "cell/d2_model_1849999_cell_inf_only.pt"
63
- cfg.PT.CELL.WEIGHTS_TS = "cell/d2_model_1849999_cell_inf_only.ts"
64
- cfg.PT.CELL.FILTER = None
65
-
66
- cfg.USE_LAYOUT_NMS = False
67
- cfg.LAYOUT_NMS_PAIRS.COMBINATIONS = None
68
- cfg.LAYOUT_NMS_PAIRS.THRESHOLDS = None
69
- cfg.LAYOUT_NMS_PAIRS.PRIORITY = None
70
-
71
- cfg.SEGMENTATION.ASSIGNMENT_RULE = "ioa"
72
- cfg.SEGMENTATION.THRESHOLD_ROWS = 0.4
73
- cfg.SEGMENTATION.THRESHOLD_COLS = 0.4
74
- cfg.SEGMENTATION.FULL_TABLE_TILING = True
75
- cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS = 0.001
76
- cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS = 0.001
77
- cfg.SEGMENTATION.TABLE_NAME = LayoutType.TABLE
78
- cfg.SEGMENTATION.PUBTABLES_CELL_NAMES = [
79
- CellType.SPANNING,
80
- CellType.ROW_HEADER,
81
- CellType.COLUMN_HEADER,
82
- CellType.PROJECTED_ROW_HEADER,
83
- LayoutType.CELL,
84
- ]
85
- cfg.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES = [
86
- CellType.SPANNING,
87
- CellType.ROW_HEADER,
88
- CellType.COLUMN_HEADER,
89
- CellType.PROJECTED_ROW_HEADER,
90
- ]
91
- cfg.SEGMENTATION.PUBTABLES_ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
92
- cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
93
- cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
94
- cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
95
- cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
96
- cfg.SEGMENTATION.PUBTABLES_ITEM_HEADER_CELL_NAMES = [CellType.COLUMN_HEADER, CellType.ROW_HEADER]
97
- cfg.SEGMENTATION.PUBTABLES_ITEM_HEADER_THRESHOLDS = [0.6, 0.0001]
98
- cfg.SEGMENTATION.STRETCH_RULE = "equal"
99
-
100
- cfg.USE_TABLE_REFINEMENT = True
101
- cfg.USE_PDF_MINER = False
102
-
103
- cfg.PDF_MINER.X_TOLERANCE = 3
104
- cfg.PDF_MINER.Y_TOLERANCE = 3
105
-
106
- cfg.USE_OCR = True
107
-
108
- cfg.OCR.USE_TESSERACT = True
109
- cfg.OCR.USE_DOCTR = False
110
- cfg.OCR.USE_TEXTRACT = False
111
- cfg.OCR.CONFIG.TESSERACT = "dd/conf_tesseract.yaml"
112
-
113
- cfg.OCR.WEIGHTS.DOCTR_WORD.TF = "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip"
114
- cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
115
- cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
116
- cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
117
-
118
- cfg.TEXT_CONTAINER = IMAGE_DEFAULTS["text_container"]
119
- cfg.WORD_MATCHING.PARENTAL_CATEGORIES = [
120
- LayoutType.TEXT,
121
- LayoutType.TITLE,
122
- LayoutType.LIST,
123
- LayoutType.CELL,
124
- CellType.COLUMN_HEADER,
125
- CellType.PROJECTED_ROW_HEADER,
126
- CellType.SPANNING,
127
- CellType.ROW_HEADER,
128
- ]
129
- cfg.WORD_MATCHING.RULE = "ioa"
130
- cfg.WORD_MATCHING.THRESHOLD = 0.6
131
- cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
132
-
133
- cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS["text_block_categories"]
134
- cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS["floating_text_block_categories"]
135
- cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = False
136
- cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
137
- cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
138
- cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
139
- cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
140
-
141
- cfg.USE_LAYOUT_LINK = False
142
- cfg.USE_LINE_MATCHER = False
143
- cfg.LAYOUT_LINK.PARENTAL_CATEGORIES = []
144
- cfg.LAYOUT_LINK.CHILD_CATEGORIES = []
145
-
146
- cfg.freeze()