deepdoctection 0.33__tar.gz → 0.35__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (155)
  1. {deepdoctection-0.33 → deepdoctection-0.35}/PKG-INFO +20 -11
  2. {deepdoctection-0.33 → deepdoctection-0.35}/README.md +10 -7
  3. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/__init__.py +11 -12
  4. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/analyzer/__init__.py +1 -0
  5. deepdoctection-0.35/deepdoctection/analyzer/_config.py +150 -0
  6. deepdoctection-0.35/deepdoctection/analyzer/dd.py +154 -0
  7. deepdoctection-0.35/deepdoctection/analyzer/factory.py +522 -0
  8. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/configs/conf_dd_one.yaml +1 -0
  9. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/annotation.py +41 -3
  10. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/convert.py +6 -4
  11. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/image.py +132 -46
  12. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/view.py +2 -1
  13. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/base.py +1 -1
  14. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/fintabnet.py +1 -1
  15. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/xfund.py +29 -7
  16. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/eval.py +7 -1
  17. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/model.py +2 -1
  18. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/pdftext.py +96 -5
  19. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tessocr.py +1 -0
  20. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/cats.py +11 -13
  21. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/cocostruct.py +6 -2
  22. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/d2struct.py +2 -1
  23. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/laylmstruct.py +1 -1
  24. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/match.py +31 -0
  25. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/misc.py +1 -1
  26. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/prodigystruct.py +1 -1
  27. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/anngen.py +27 -0
  28. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/base.py +23 -0
  29. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/common.py +123 -38
  30. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/segment.py +1 -1
  31. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/sub_layout.py +1 -1
  32. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/env_info.py +31 -2
  33. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/file_utils.py +19 -0
  34. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/fs.py +27 -4
  35. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/metacfg.py +12 -0
  36. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/pdf_utils.py +114 -6
  37. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/settings.py +3 -0
  38. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/PKG-INFO +20 -11
  39. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/SOURCES.txt +2 -0
  40. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/requires.txt +9 -3
  41. {deepdoctection-0.33 → deepdoctection-0.35}/setup.cfg +7 -0
  42. {deepdoctection-0.33 → deepdoctection-0.35}/setup.py +6 -2
  43. deepdoctection-0.33/deepdoctection/analyzer/dd.py +0 -470
  44. {deepdoctection-0.33 → deepdoctection-0.35}/LICENSE +0 -0
  45. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/configs/__init__.py +0 -0
  46. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  47. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/__init__.py +0 -0
  48. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/base.py +0 -0
  49. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/common.py +0 -0
  50. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/custom.py +0 -0
  51. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/custom_serialize.py +0 -0
  52. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/parallel_map.py +0 -0
  53. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/serialize.py +0 -0
  54. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/dataflow/stats.py +0 -0
  55. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/__init__.py +0 -0
  56. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datapoint/box.py +0 -0
  57. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/__init__.py +0 -0
  58. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/adapter.py +0 -0
  59. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/dataflow_builder.py +0 -0
  60. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/info.py +0 -0
  61. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/__init__.py +0 -0
  62. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/doclaynet.py +0 -0
  63. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/funsd.py +0 -0
  64. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
  65. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/layouttest.py +0 -0
  66. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/publaynet.py +0 -0
  67. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
  68. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
  69. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
  70. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  71. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  72. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/registry.py +0 -0
  73. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/datasets/save.py +0 -0
  74. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/__init__.py +0 -0
  75. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/accmetric.py +0 -0
  76. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/base.py +0 -0
  77. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/cocometric.py +0 -0
  78. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/registry.py +0 -0
  79. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/tedsmetric.py +0 -0
  80. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/eval/tp_eval_callback.py +0 -0
  81. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/__init__.py +0 -0
  82. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/base.py +0 -0
  83. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/d2detect.py +0 -0
  84. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/deskew.py +0 -0
  85. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/doctrocr.py +0 -0
  86. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/fastlang.py +0 -0
  87. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/hfdetr.py +0 -0
  88. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/hflayoutlm.py +0 -0
  89. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/hflm.py +0 -0
  90. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/pt/__init__.py +0 -0
  91. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/pt/nms.py +0 -0
  92. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/pt/ptutils.py +0 -0
  93. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/texocr.py +0 -0
  94. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/__init__.py +0 -0
  95. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tfutils.py +0 -0
  96. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpcompat.py +0 -0
  97. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  98. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
  99. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  100. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
  101. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  102. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
  103. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
  104. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
  105. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
  106. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
  107. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
  108. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
  109. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
  110. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
  111. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
  112. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  113. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
  114. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  115. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/extern/tpdetect.py +0 -0
  116. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/__init__.py +0 -0
  117. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/hfstruct.py +0 -0
  118. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/maputils.py +0 -0
  119. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/pascalstruct.py +0 -0
  120. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/pubstruct.py +0 -0
  121. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/tpstruct.py +0 -0
  122. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/mapper/xfundstruct.py +0 -0
  123. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/__init__.py +0 -0
  124. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/concurrency.py +0 -0
  125. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/doctectionpipe.py +0 -0
  126. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/language.py +0 -0
  127. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/layout.py +0 -0
  128. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/lm.py +0 -0
  129. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/order.py +0 -0
  130. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/refine.py +0 -0
  131. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/registry.py +0 -0
  132. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/text.py +0 -0
  133. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/pipe/transform.py +0 -0
  134. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/py.typed +0 -0
  135. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/__init__.py +0 -0
  136. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/d2_frcnn_train.py +0 -0
  137. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/hf_detr_train.py +0 -0
  138. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/hf_layoutlm_train.py +0 -0
  139. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/train/tp_frcnn_train.py +0 -0
  140. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/__init__.py +0 -0
  141. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/concurrency.py +0 -0
  142. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/context.py +0 -0
  143. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/develop.py +0 -0
  144. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/error.py +0 -0
  145. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/identifier.py +0 -0
  146. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/logger.py +0 -0
  147. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/mocks.py +0 -0
  148. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/tqdm.py +0 -0
  149. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/transform.py +0 -0
  150. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/types.py +0 -0
  151. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/utils.py +0 -0
  152. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection/utils/viz.py +0 -0
  153. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/dependency_links.txt +0 -0
  154. {deepdoctection-0.33 → deepdoctection-0.35}/deepdoctection.egg-info/top_level.txt +0 -0
  155. {deepdoctection-0.33 → deepdoctection-0.35}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.33
3
+ Version: 0.35
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
19
  Requires-Dist: catalogue==2.0.10
20
- Requires-Dist: huggingface_hub>=0.12.0
20
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0
21
21
  Requires-Dist: importlib-metadata>=5.0.0
22
22
  Requires-Dist: jsonlines==3.1.0
23
23
  Requires-Dist: lazy-imports==0.3.1
@@ -27,14 +27,16 @@ Requires-Dist: numpy<2.0,>=1.21
27
27
  Requires-Dist: packaging>=20.0
28
28
  Requires-Dist: Pillow>=10.0.0
29
29
  Requires-Dist: pypdf>=3.16.0
30
+ Requires-Dist: pypdfium2>=4.30.0
30
31
  Requires-Dist: pyyaml>=6.0.1
31
32
  Requires-Dist: pyzmq>=16
33
+ Requires-Dist: scipy>=1.13.1
32
34
  Requires-Dist: termcolor>=1.1
33
35
  Requires-Dist: tabulate>=0.7.7
34
36
  Requires-Dist: tqdm==4.64.0
35
37
  Provides-Extra: tf
36
38
  Requires-Dist: catalogue==2.0.10; extra == "tf"
37
- Requires-Dist: huggingface_hub>=0.12.0; extra == "tf"
39
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
38
40
  Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
39
41
  Requires-Dist: jsonlines==3.1.0; extra == "tf"
40
42
  Requires-Dist: lazy-imports==0.3.1; extra == "tf"
@@ -44,8 +46,10 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
44
46
  Requires-Dist: packaging>=20.0; extra == "tf"
45
47
  Requires-Dist: Pillow>=10.0.0; extra == "tf"
46
48
  Requires-Dist: pypdf>=3.16.0; extra == "tf"
49
+ Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
47
50
  Requires-Dist: pyyaml>=6.0.1; extra == "tf"
48
51
  Requires-Dist: pyzmq>=16; extra == "tf"
52
+ Requires-Dist: scipy>=1.13.1; extra == "tf"
49
53
  Requires-Dist: termcolor>=1.1; extra == "tf"
50
54
  Requires-Dist: tabulate>=0.7.7; extra == "tf"
51
55
  Requires-Dist: tqdm==4.64.0; extra == "tf"
@@ -64,7 +68,7 @@ Requires-Dist: distance==0.1.3; extra == "tf"
64
68
  Requires-Dist: lxml>=4.9.1; extra == "tf"
65
69
  Provides-Extra: pt
66
70
  Requires-Dist: catalogue==2.0.10; extra == "pt"
67
- Requires-Dist: huggingface_hub>=0.12.0; extra == "pt"
71
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
68
72
  Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
69
73
  Requires-Dist: jsonlines==3.1.0; extra == "pt"
70
74
  Requires-Dist: lazy-imports==0.3.1; extra == "pt"
@@ -74,8 +78,10 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
74
78
  Requires-Dist: packaging>=20.0; extra == "pt"
75
79
  Requires-Dist: Pillow>=10.0.0; extra == "pt"
76
80
  Requires-Dist: pypdf>=3.16.0; extra == "pt"
81
+ Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
77
82
  Requires-Dist: pyyaml>=6.0.1; extra == "pt"
78
83
  Requires-Dist: pyzmq>=16; extra == "pt"
84
+ Requires-Dist: scipy>=1.13.1; extra == "pt"
79
85
  Requires-Dist: termcolor>=1.1; extra == "pt"
80
86
  Requires-Dist: tabulate>=0.7.7; extra == "pt"
81
87
  Requires-Dist: tqdm==4.64.0; extra == "pt"
@@ -169,9 +175,9 @@ pipelines. Its core function does not depend on any specific deep learning libra
169
175
  - Document layout analysis and table recognition now runs with
170
176
  [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
171
177
  anymore for basic inference.
172
- - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
178
+ - More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
173
179
  (not contained in the built-in Analyzer).
174
- - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
180
+ - Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
175
181
  [**transformers**](https://github.com/huggingface/transformers).
176
182
  We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
177
183
  that seem to look promising, especially if you want to train a model on non-english data. The training script for
@@ -260,7 +266,7 @@ documentation.
260
266
 
261
267
  ## Requirements
262
268
 
263
- ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection.png)
269
+ ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection_081124.png)
264
270
 
265
271
  Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
266
272
  separately.
@@ -269,13 +275,16 @@ separately.
269
275
  - Python >= 3.9
270
276
  - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
271
277
  In general, if you want to train or fine-tune models, a GPU is required.
272
- - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
273
- images.
278
+
274
279
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
275
280
  and [PyTorch](https://pytorch.org/get-started/locally/).
276
281
  - [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
277
282
  engine has to be installed separately.
278
283
 
284
+
285
+ - For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
286
+ documents into images. For release `v.0.35.0` this dependency will be optional.
287
+
279
288
  The following overview shows the availability of the models in conjunction with the DL framework.
280
289
 
281
290
  | Task | PyTorch | Torchscript | Tensorflow |
@@ -393,8 +402,8 @@ to develop this framework.
393
402
  ## Problems
394
403
 
395
404
  We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
396
- repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 4
397
- to 6 weeks.
405
+ repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
406
+ to 12 weeks.
398
407
 
399
408
  ## If you like **deep**doctection ...
400
409
 
@@ -45,9 +45,9 @@ pipelines. Its core function does not depend on any specific deep learning libra
45
45
  - Document layout analysis and table recognition now runs with
46
46
  [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
47
47
  anymore for basic inference.
48
- - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
48
+ - More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
49
49
  (not contained in the built-in Analyzer).
50
- - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
50
+ - Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
51
51
  [**transformers**](https://github.com/huggingface/transformers).
52
52
  We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
53
53
  that seem to look promising, especially if you want to train a model on non-english data. The training script for
@@ -136,7 +136,7 @@ documentation.
136
136
 
137
137
  ## Requirements
138
138
 
139
- ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection.png)
139
+ ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection_081124.png)
140
140
 
141
141
  Everything in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
142
142
  separately.
@@ -145,13 +145,16 @@ separately.
145
145
  - Python >= 3.9
146
146
  - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
147
147
  In general, if you want to train or fine-tune models, a GPU is required.
148
- - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
149
- images.
148
+
150
149
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
151
150
  and [PyTorch](https://pytorch.org/get-started/locally/).
152
151
  - [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
153
152
  engine has to be installed separately.
154
153
 
154
+
155
+ - For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
156
+ documents into images. For release `v.0.35.0` this dependency will be optional.
157
+
155
158
  The following overview shows the availability of the models in conjunction with the DL framework.
156
159
 
157
160
  | Task | PyTorch | Torchscript | Tensorflow |
@@ -269,8 +272,8 @@ to develop this framework.
269
272
  ## Problems
270
273
 
271
274
  We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
272
- repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 4
273
- to 6 weeks.
275
+ repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
276
+ to 12 weeks.
274
277
 
275
278
  ## If you like **deep**doctection ...
276
279
 
@@ -15,30 +15,22 @@ if importlib.util.find_spec("dotenv") is not None:
15
15
 
16
16
 
17
17
  # pylint: disable=wrong-import-position
18
- import os
19
18
  import sys
20
19
  from typing import TYPE_CHECKING
21
20
 
22
- from .utils.env_info import collect_env_info
21
+ from .utils.env_info import auto_select_pdf_render_framework, collect_env_info
23
22
  from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
24
23
  from .utils.logger import LoggingRecord, logger
25
24
 
26
25
  # pylint: enable=wrong-import-position
27
26
 
28
- __version__ = 0.33
27
+ __version__ = 0.35
29
28
 
30
29
  _IMPORT_STRUCTURE = {
31
30
  "analyzer": [
32
- "maybe_copy_config_to_cache",
33
31
  "config_sanity_checks",
34
- "build_detector",
35
- "build_padder",
36
- "build_service",
37
- "build_sub_image_service",
38
- "build_ocr",
39
- "build_doctr_word",
40
32
  "get_dd_analyzer",
41
- "build_analyzer",
33
+ "ServiceFactory"
42
34
  ],
43
35
  "configs": [],
44
36
  "dataflow": [
@@ -76,6 +68,7 @@ _IMPORT_STRUCTURE = {
76
68
  ],
77
69
  "datapoint": [
78
70
  "ann_from_dict",
71
+ "AnnotationMap",
79
72
  "Annotation",
80
73
  "CategoryAnnotation",
81
74
  "ImageAnnotation",
@@ -198,6 +191,7 @@ _IMPORT_STRUCTURE = {
198
191
  "print_model_infos",
199
192
  "ModelDownloadManager",
200
193
  "PdfPlumberTextDetector",
194
+ "Pdfmium2TextDetector",
201
195
  "TesseractOcrDetector",
202
196
  "TesseractRotationTransformer",
203
197
  "TextractOcrDetector",
@@ -237,6 +231,7 @@ _IMPORT_STRUCTURE = {
237
231
  "LabelSummarizer",
238
232
  "curry",
239
233
  "match_anns_by_intersection",
234
+ "match_anns_by_distance",
240
235
  "to_image",
241
236
  "maybe_load_image",
242
237
  "maybe_remove_image",
@@ -265,6 +260,8 @@ _IMPORT_STRUCTURE = {
265
260
  "DetectResultGenerator",
266
261
  "SubImageLayoutService",
267
262
  "ImageCroppingService",
263
+ "IntersectionMatcher",
264
+ "NeighbourMatcher",
268
265
  "MatchingService",
269
266
  "PageParsingService",
270
267
  "AnnotationNmsService",
@@ -302,6 +299,7 @@ _IMPORT_STRUCTURE = {
302
299
  "timed_operation",
303
300
  "collect_env_info",
304
301
  "auto_select_viz_library",
302
+ "auto_select_pdf_render_framework",
305
303
  "get_tensorflow_requirement",
306
304
  "tf_addons_available",
307
305
  "get_tf_addons_requirements",
@@ -364,6 +362,7 @@ _IMPORT_STRUCTURE = {
364
362
  "get_configs_dir_path",
365
363
  "get_weights_dir_path",
366
364
  "get_dataset_dir_path",
365
+ "maybe_copy_config_to_cache",
367
366
  "is_uuid_like",
368
367
  "get_uuid_from_str",
369
368
  "get_uuid",
@@ -424,7 +423,7 @@ _IMPORT_STRUCTURE = {
424
423
  # Setting some environment variables so that standard functions can be invoked with available hardware
425
424
  env_info = collect_env_info()
426
425
  logger.debug(LoggingRecord(msg=env_info))
427
-
426
+ auto_select_pdf_render_framework()
428
427
 
429
428
  # Direct imports for type-checking
430
429
  if TYPE_CHECKING:
@@ -20,3 +20,4 @@ Package for pre-built pipelines
20
20
  """
21
21
 
22
22
  from .dd import *
23
+ from .factory import *
@@ -0,0 +1,150 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: _config.py
3
+
4
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Pipeline configuration for deepdoctection analyzer. Do not change the defaults in this file. """
19
+
20
+ from ..utils.metacfg import AttrDict
21
+ from ..utils.settings import CellType, LayoutType
22
+
23
+ cfg = AttrDict()
24
+
25
+ cfg.LANGUAGE = None
26
+ cfg.LIB = None
27
+ cfg.DEVICE = None
28
+ cfg.USE_ROTATOR = False
29
+ cfg.USE_LAYOUT = True
30
+ cfg.USE_TABLE_SEGMENTATION = True
31
+
32
+ cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
33
+ cfg.TF.LAYOUT.FILTER = None
34
+
35
+
36
+ cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
37
+ cfg.TF.CELL.FILTER = None
38
+
39
+
40
+ cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
41
+ cfg.TF.ITEM.FILTER = None
42
+
43
+ cfg.PT.LAYOUT.WEIGHTS = "layout/d2_model_0829999_layout_inf_only.pt"
44
+ cfg.PT.LAYOUT.WEIGHTS_TS = "layout/d2_model_0829999_layout_inf_only.ts"
45
+ cfg.PT.LAYOUT.FILTER = None
46
+ cfg.PT.LAYOUT.PAD.TOP = 60
47
+ cfg.PT.LAYOUT.PAD.RIGHT = 60
48
+ cfg.PT.LAYOUT.PAD.BOTTOM = 60
49
+ cfg.PT.LAYOUT.PAD.LEFT = 60
50
+
51
+ cfg.PT.ITEM.WEIGHTS = "item/d2_model_1639999_item_inf_only.pt"
52
+ cfg.PT.ITEM.WEIGHTS_TS = "item/d2_model_1639999_item_inf_only.ts"
53
+ cfg.PT.ITEM.FILTER = None
54
+ cfg.PT.ITEM.PAD.TOP = 60
55
+ cfg.PT.ITEM.PAD.RIGHT = 60
56
+ cfg.PT.ITEM.PAD.BOTTOM = 60
57
+ cfg.PT.ITEM.PAD.LEFT = 60
58
+
59
+ cfg.PT.CELL.WEIGHTS = "cell/d2_model_1849999_cell_inf_only.pt"
60
+ cfg.PT.CELL.WEIGHTS_TS = "cell/d2_model_1849999_cell_inf_only.ts"
61
+ cfg.PT.CELL.FILTER = None
62
+
63
+ cfg.USE_LAYOUT_NMS = False
64
+ cfg.LAYOUT_NMS_PAIRS.COMBINATIONS = None
65
+ cfg.LAYOUT_NMS_PAIRS.THRESHOLDS = None
66
+ cfg.LAYOUT_NMS_PAIRS.PRIORITY = None
67
+
68
+ cfg.SEGMENTATION.ASSIGNMENT_RULE = "ioa"
69
+ cfg.SEGMENTATION.THRESHOLD_ROWS = 0.4
70
+ cfg.SEGMENTATION.THRESHOLD_COLS = 0.4
71
+ cfg.SEGMENTATION.FULL_TABLE_TILING = True
72
+ cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS = 0.001
73
+ cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS = 0.001
74
+ cfg.SEGMENTATION.CELL_CATEGORY_ID = 12
75
+ cfg.SEGMENTATION.TABLE_NAME = LayoutType.TABLE
76
+ cfg.SEGMENTATION.PUBTABLES_CELL_NAMES = [
77
+ CellType.SPANNING,
78
+ CellType.ROW_HEADER,
79
+ CellType.COLUMN_HEADER,
80
+ CellType.PROJECTED_ROW_HEADER,
81
+ LayoutType.CELL,
82
+ ]
83
+ cfg.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES = [
84
+ CellType.SPANNING,
85
+ CellType.ROW_HEADER,
86
+ CellType.COLUMN_HEADER,
87
+ CellType.PROJECTED_ROW_HEADER,
88
+ ]
89
+ cfg.SEGMENTATION.PUBTABLES_ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
90
+ cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
91
+ cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
92
+ cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
93
+ cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
94
+
95
+ cfg.SEGMENTATION.STRETCH_RULE = "equal"
96
+
97
+ cfg.USE_TABLE_REFINEMENT = True
98
+ cfg.USE_PDF_MINER = False
99
+
100
+ cfg.PDF_MINER.X_TOLERANCE = 3
101
+ cfg.PDF_MINER.Y_TOLERANCE = 3
102
+
103
+ cfg.USE_OCR = True
104
+
105
+ cfg.OCR.USE_TESSERACT = True
106
+ cfg.OCR.USE_DOCTR = False
107
+ cfg.OCR.USE_TEXTRACT = False
108
+ cfg.OCR.CONFIG.TESSERACT = "dd/conf_tesseract.yaml"
109
+
110
+ cfg.OCR.WEIGHTS.DOCTR_WORD.TF = "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip"
111
+ cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
112
+ cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
113
+ cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
114
+
115
+ cfg.TEXT_CONTAINER = LayoutType.WORD
116
+ cfg.WORD_MATCHING.PARENTAL_CATEGORIES = [
117
+ LayoutType.TEXT,
118
+ LayoutType.TITLE,
119
+ LayoutType.LIST,
120
+ LayoutType.CELL,
121
+ CellType.COLUMN_HEADER,
122
+ CellType.PROJECTED_ROW_HEADER,
123
+ CellType.SPANNING,
124
+ CellType.ROW_HEADER,
125
+ ]
126
+ cfg.WORD_MATCHING.RULE = "ioa"
127
+ cfg.WORD_MATCHING.THRESHOLD = 0.6
128
+ cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
129
+
130
+ cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = [
131
+ LayoutType.TEXT,
132
+ LayoutType.TITLE,
133
+ LayoutType.LIST,
134
+ LayoutType.CELL,
135
+ CellType.COLUMN_HEADER,
136
+ CellType.PROJECTED_ROW_HEADER,
137
+ CellType.SPANNING,
138
+ CellType.ROW_HEADER,
139
+ ]
140
+ cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = [
141
+ LayoutType.TEXT,
142
+ LayoutType.TITLE,
143
+ LayoutType.LIST,
144
+ ]
145
+ cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = False
146
+ cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
147
+ cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
148
+ cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
149
+ cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
150
+ cfg.freeze()
@@ -0,0 +1,154 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: dd.py
3
+
4
+ # Copyright 2021 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Module for **deep**doctection analyzer.
20
+
21
+ -factory build_analyzer for a given config
22
+
23
+ -user factory with a reduced config setting
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import os
29
+ from typing import Optional
30
+
31
+ from ..extern.pt.ptutils import get_torch_device
32
+ from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
33
+ from ..pipe.doctectionpipe import DoctectionPipe
34
+ from ..utils.env_info import ENV_VARS_TRUE
35
+ from ..utils.error import DependencyError
36
+ from ..utils.file_utils import tensorpack_available
37
+ from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
38
+ from ..utils.logger import LoggingRecord, logger
39
+ from ..utils.metacfg import set_config_by_yaml
40
+ from ..utils.types import PathLikeOrStr
41
+ from ._config import cfg
42
+ from .factory import ServiceFactory
43
+
44
+ __all__ = [
45
+ "config_sanity_checks",
46
+ "get_dd_analyzer",
47
+ ]
48
+
49
# Package-relative paths of the default YAML files; get_dd_analyzer hands them to
# maybe_copy_config_to_cache together with the "dd" sub-directory of the configs dir.
_DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
_TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"

# Selectable options grouped by pipeline component.
# NOTE(review): nothing in this module reads this table as far as visible here -- presumably it is
# consumed by a front end offering these choices; confirm before changing entries.
_MODEL_CHOICES = {
    "layout": [
        "layout/d2_model_0829999_layout_inf_only.pt",
        "xrf_layout/model_final_inf_only.pt",
        "microsoft/table-transformer-detection/pytorch_model.bin",
    ],
    "segmentation": [
        "item/model-1620000_inf_only.data-00000-of-00001",
        "xrf_item/model_final_inf_only.pt",
        "microsoft/table-transformer-structure-recognition/pytorch_model.bin",
        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin",
    ],
    "ocr": ["Tesseract", "DocTr", "Textract"],
    "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
    "doctr_recognition": [
        "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
        "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
    ],
    "llm": ["gpt-3.5-turbo", "gpt-4"],
    # Keyed by the entries of "segmentation" (same order). The values look like the matching
    # cell-detection weights, with None where no companion model is listed -- TODO confirm.
    "segmentation_choices": {
        "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
        "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
        "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
        "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
    },
}
77
+
78
+
79
def config_sanity_checks() -> None:
    """
    Validate the combination of text extraction switches in the global `cfg`.

    Two rules are enforced:

    - PDF mining together with OCR via DocTr is rejected.
    - If OCR is switched on, exactly one of Tesseract, DocTr and Textract must be selected.

    :raises ValueError: If one of the rules above is violated.
    """
    if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
        raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")

    # Without OCR there is no engine selection to validate.
    if not cfg.USE_OCR:
        return

    ocr_cfg = cfg.OCR
    # The flags are added up, so the total is the number of engines switched on.
    selected_engines = ocr_cfg.USE_TESSERACT + ocr_cfg.USE_DOCTR + ocr_cfg.USE_TEXTRACT
    if selected_engines != 1:
        raise ValueError(
            "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
            "and set the other two to False. Only one OCR system can be activated."
        )
89
+
90
+
91
def get_dd_analyzer(
    reset_config_file: bool = True,
    config_overwrite: Optional[list[str]] = None,
    path_config_file: Optional[PathLikeOrStr] = None,
) -> DoctectionPipe:
    """
    Factory function for creating the built-in **deep**doctection analyzer.

    The Standard Analyzer is a pipeline that comprises the following analysis components:

    - Document layout analysis

    - Table segmentation

    - Text extraction/OCR

    - Reading order

    We refer to the various notebooks and docs for running an analyzer and changing the configs.

    The deep learning framework is chosen from the environment: TensorFlow if `DD_USE_TF` is set to a true value,
    PyTorch in every other case.

    :param reset_config_file: This will copy the `.yaml` file with default variables to the `.cache` and therefore
                              resetting all configurations if set to `True`.
    :param config_overwrite: Passing a list of string arguments and values to overwrite the `.yaml` configuration with
                             highest priority, e.g. ["USE_TABLE_SEGMENTATION=False",
                                                     "USE_OCR=False",
                                                     "PT.LAYOUT.WEIGHTS=my_fancy_pytorch_model"]
    :param path_config_file: Path to a custom config file. Can be outside of the .cache directory.
    :return: A DoctectionPipe instance with given configs
    :raises ValueError: If the resulting configuration does not pass `config_sanity_checks`.
    """
    config_overwrite = [] if config_overwrite is None else config_overwrite

    # Only DD_USE_TF is consulted. The selection is strictly two-way, so there is no
    # "no framework selected" state to report here.
    if os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE:
        lib = "TF"
        device = get_tf_device()
    else:
        lib = "PT"
        device = get_torch_device()

    # Both default config files are handed to the cache helper even if a custom config file is
    # given; only the analyzer config is passed the reset flag.
    dd_one_config_path = maybe_copy_config_to_cache(
        get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
    )
    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)

    # Set up of the configuration and logging
    file_cfg = set_config_by_yaml(path_config_file or dd_one_config_path)
    cfg.freeze(freezed=False)
    cfg.overwrite_config(file_cfg)

    # NOTE(review): unfreezing a second time -- presumably overwrite_config can leave cfg frozen; confirm.
    cfg.freeze(freezed=False)
    cfg.LANGUAGE = None
    cfg.LIB = lib
    cfg.DEVICE = device
    cfg.freeze()

    # Explicit overwrites are applied last so that they win over the values from the yaml file.
    if config_overwrite:
        cfg.update_args(config_overwrite)

    config_sanity_checks()
    logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict()))  # type: ignore

    # will silent all TP logging while building the tower
    if tensorpack_available():
        disable_tp_layer_logging()

    return ServiceFactory.build_analyzer(cfg)