deepdoctection 0.34__tar.gz → 0.36__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of deepdoctection might be problematic. Click here for more details.

Files changed (155) hide show
  1. {deepdoctection-0.34 → deepdoctection-0.36}/PKG-INFO +23 -13
  2. {deepdoctection-0.34 → deepdoctection-0.36}/README.md +16 -9
  3. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/__init__.py +7 -14
  4. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/analyzer/__init__.py +1 -0
  5. deepdoctection-0.36/deepdoctection/analyzer/_config.py +142 -0
  6. deepdoctection-0.36/deepdoctection/analyzer/dd.py +154 -0
  7. deepdoctection-0.36/deepdoctection/analyzer/factory.py +718 -0
  8. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/configs/conf_dd_one.yaml +5 -0
  9. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/annotation.py +1 -1
  10. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/convert.py +6 -4
  11. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/image.py +16 -6
  12. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/view.py +91 -15
  13. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/cocometric.py +59 -13
  14. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/pdftext.py +96 -5
  15. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tessocr.py +1 -0
  16. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/match.py +4 -2
  17. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/env_info.py +30 -1
  18. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/file_utils.py +19 -0
  19. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/metacfg.py +12 -0
  20. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/pdf_utils.py +86 -3
  21. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/utils.py +39 -0
  22. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/viz.py +16 -13
  23. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/PKG-INFO +23 -13
  24. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/SOURCES.txt +2 -0
  25. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/requires.txt +6 -3
  26. {deepdoctection-0.34 → deepdoctection-0.36}/setup.cfg +4 -0
  27. {deepdoctection-0.34 → deepdoctection-0.36}/setup.py +4 -2
  28. deepdoctection-0.34/deepdoctection/analyzer/dd.py +0 -478
  29. {deepdoctection-0.34 → deepdoctection-0.36}/LICENSE +0 -0
  30. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/configs/__init__.py +0 -0
  31. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/configs/conf_tesseract.yaml +0 -0
  32. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/__init__.py +0 -0
  33. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/base.py +0 -0
  34. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/common.py +0 -0
  35. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/custom.py +0 -0
  36. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/custom_serialize.py +0 -0
  37. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/parallel_map.py +0 -0
  38. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/serialize.py +0 -0
  39. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/dataflow/stats.py +0 -0
  40. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/__init__.py +0 -0
  41. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datapoint/box.py +0 -0
  42. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/__init__.py +0 -0
  43. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/adapter.py +0 -0
  44. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/base.py +0 -0
  45. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/dataflow_builder.py +0 -0
  46. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/info.py +0 -0
  47. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/__init__.py +0 -0
  48. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/doclaynet.py +0 -0
  49. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/fintabnet.py +0 -0
  50. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/funsd.py +0 -0
  51. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/iiitar13k.py +0 -0
  52. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/layouttest.py +0 -0
  53. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/publaynet.py +0 -0
  54. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/pubtables1m.py +0 -0
  55. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/pubtabnet.py +0 -0
  56. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/rvlcdip.py +0 -0
  57. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/xfund.py +0 -0
  58. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/xsl/__init__.py +0 -0
  59. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/instances/xsl/pascal_voc.xsl +0 -0
  60. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/registry.py +0 -0
  61. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/datasets/save.py +0 -0
  62. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/__init__.py +0 -0
  63. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/accmetric.py +0 -0
  64. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/base.py +0 -0
  65. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/eval.py +0 -0
  66. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/registry.py +0 -0
  67. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/tedsmetric.py +0 -0
  68. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/eval/tp_eval_callback.py +0 -0
  69. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/__init__.py +0 -0
  70. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/base.py +0 -0
  71. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/d2detect.py +0 -0
  72. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/deskew.py +0 -0
  73. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/doctrocr.py +0 -0
  74. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/fastlang.py +0 -0
  75. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/hfdetr.py +0 -0
  76. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/hflayoutlm.py +0 -0
  77. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/hflm.py +0 -0
  78. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/model.py +0 -0
  79. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/pt/__init__.py +0 -0
  80. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/pt/nms.py +0 -0
  81. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/pt/ptutils.py +0 -0
  82. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/texocr.py +0 -0
  83. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/__init__.py +0 -0
  84. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tfutils.py +0 -0
  85. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpcompat.py +0 -0
  86. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/__init__.py +0 -0
  87. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/common.py +0 -0
  88. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/config/__init__.py +0 -0
  89. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/config/config.py +0 -0
  90. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/__init__.py +0 -0
  91. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/backbone.py +0 -0
  92. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/generalized_rcnn.py +0 -0
  93. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_box.py +0 -0
  94. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_cascade.py +0 -0
  95. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_fpn.py +0 -0
  96. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_frcnn.py +0 -0
  97. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_mrcnn.py +0 -0
  98. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/modeling/model_rpn.py +0 -0
  99. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/predict.py +0 -0
  100. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/preproc.py +0 -0
  101. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/utils/__init__.py +0 -0
  102. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/utils/box_ops.py +0 -0
  103. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tp/tpfrcnn/utils/np_box_ops.py +0 -0
  104. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/extern/tpdetect.py +0 -0
  105. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/__init__.py +0 -0
  106. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/cats.py +0 -0
  107. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/cocostruct.py +0 -0
  108. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/d2struct.py +0 -0
  109. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/hfstruct.py +0 -0
  110. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/laylmstruct.py +0 -0
  111. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/maputils.py +0 -0
  112. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/misc.py +0 -0
  113. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/pascalstruct.py +0 -0
  114. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/prodigystruct.py +0 -0
  115. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/pubstruct.py +0 -0
  116. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/tpstruct.py +0 -0
  117. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/mapper/xfundstruct.py +0 -0
  118. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/__init__.py +0 -0
  119. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/anngen.py +0 -0
  120. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/base.py +0 -0
  121. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/common.py +0 -0
  122. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/concurrency.py +0 -0
  123. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/doctectionpipe.py +0 -0
  124. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/language.py +0 -0
  125. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/layout.py +0 -0
  126. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/lm.py +0 -0
  127. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/order.py +0 -0
  128. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/refine.py +0 -0
  129. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/registry.py +0 -0
  130. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/segment.py +0 -0
  131. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/sub_layout.py +0 -0
  132. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/text.py +0 -0
  133. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/pipe/transform.py +0 -0
  134. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/py.typed +0 -0
  135. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/__init__.py +0 -0
  136. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/d2_frcnn_train.py +0 -0
  137. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/hf_detr_train.py +0 -0
  138. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/hf_layoutlm_train.py +0 -0
  139. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/train/tp_frcnn_train.py +0 -0
  140. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/__init__.py +0 -0
  141. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/concurrency.py +0 -0
  142. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/context.py +0 -0
  143. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/develop.py +0 -0
  144. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/error.py +0 -0
  145. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/fs.py +0 -0
  146. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/identifier.py +0 -0
  147. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/logger.py +0 -0
  148. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/mocks.py +0 -0
  149. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/settings.py +0 -0
  150. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/tqdm.py +0 -0
  151. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/transform.py +0 -0
  152. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection/utils/types.py +0 -0
  153. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/dependency_links.txt +0 -0
  154. {deepdoctection-0.34 → deepdoctection-0.36}/deepdoctection.egg-info/top_level.txt +0 -0
  155. {deepdoctection-0.34 → deepdoctection-0.36}/tests/test_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: deepdoctection
3
- Version: 0.34
3
+ Version: 0.36
4
4
  Summary: Repository for Document AI
5
5
  Home-page: https://github.com/deepdoctection/deepdoctection
6
6
  Author: Dr. Janis Meyer
@@ -17,7 +17,7 @@ Requires-Python: >=3.9
17
17
  Description-Content-Type: text/markdown
18
18
  License-File: LICENSE
19
19
  Requires-Dist: catalogue==2.0.10
20
- Requires-Dist: huggingface_hub>=0.12.0
20
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0
21
21
  Requires-Dist: importlib-metadata>=5.0.0
22
22
  Requires-Dist: jsonlines==3.1.0
23
23
  Requires-Dist: lazy-imports==0.3.1
@@ -27,6 +27,7 @@ Requires-Dist: numpy<2.0,>=1.21
27
27
  Requires-Dist: packaging>=20.0
28
28
  Requires-Dist: Pillow>=10.0.0
29
29
  Requires-Dist: pypdf>=3.16.0
30
+ Requires-Dist: pypdfium2>=4.30.0
30
31
  Requires-Dist: pyyaml>=6.0.1
31
32
  Requires-Dist: pyzmq>=16
32
33
  Requires-Dist: scipy>=1.13.1
@@ -35,7 +36,7 @@ Requires-Dist: tabulate>=0.7.7
35
36
  Requires-Dist: tqdm==4.64.0
36
37
  Provides-Extra: tf
37
38
  Requires-Dist: catalogue==2.0.10; extra == "tf"
38
- Requires-Dist: huggingface_hub>=0.12.0; extra == "tf"
39
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "tf"
39
40
  Requires-Dist: importlib-metadata>=5.0.0; extra == "tf"
40
41
  Requires-Dist: jsonlines==3.1.0; extra == "tf"
41
42
  Requires-Dist: lazy-imports==0.3.1; extra == "tf"
@@ -45,6 +46,7 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "tf"
45
46
  Requires-Dist: packaging>=20.0; extra == "tf"
46
47
  Requires-Dist: Pillow>=10.0.0; extra == "tf"
47
48
  Requires-Dist: pypdf>=3.16.0; extra == "tf"
49
+ Requires-Dist: pypdfium2>=4.30.0; extra == "tf"
48
50
  Requires-Dist: pyyaml>=6.0.1; extra == "tf"
49
51
  Requires-Dist: pyzmq>=16; extra == "tf"
50
52
  Requires-Dist: scipy>=1.13.1; extra == "tf"
@@ -66,7 +68,7 @@ Requires-Dist: distance==0.1.3; extra == "tf"
66
68
  Requires-Dist: lxml>=4.9.1; extra == "tf"
67
69
  Provides-Extra: pt
68
70
  Requires-Dist: catalogue==2.0.10; extra == "pt"
69
- Requires-Dist: huggingface_hub>=0.12.0; extra == "pt"
71
+ Requires-Dist: huggingface_hub<0.26,>=0.12.0; extra == "pt"
70
72
  Requires-Dist: importlib-metadata>=5.0.0; extra == "pt"
71
73
  Requires-Dist: jsonlines==3.1.0; extra == "pt"
72
74
  Requires-Dist: lazy-imports==0.3.1; extra == "pt"
@@ -76,6 +78,7 @@ Requires-Dist: numpy<2.0,>=1.21; extra == "pt"
76
78
  Requires-Dist: packaging>=20.0; extra == "pt"
77
79
  Requires-Dist: Pillow>=10.0.0; extra == "pt"
78
80
  Requires-Dist: pypdf>=3.16.0; extra == "pt"
81
+ Requires-Dist: pypdfium2>=4.30.0; extra == "pt"
79
82
  Requires-Dist: pyyaml>=6.0.1; extra == "pt"
80
83
  Requires-Dist: pyzmq>=16; extra == "pt"
81
84
  Requires-Dist: scipy>=1.13.1; extra == "pt"
@@ -172,13 +175,17 @@ pipelines. Its core function does not depend on any specific deep learning libra
172
175
  - Document layout analysis and table recognition now runs with
173
176
  [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
174
177
  anymore for basic inference.
175
- - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
176
- (not contained in the built-in Analyzer).
177
- - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
178
+ - More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
179
+ - Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
178
180
  [**transformers**](https://github.com/huggingface/transformers).
179
181
  We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
180
182
  that look promising, especially if you want to train a model on non-English data. The training script for
181
- LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
183
+ LayoutLM can be used for LiLT as well.
184
+ - [**new**] There are two notebooks available that show how to write a
185
+ [custom predictor](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) based on
186
+ a third party library that has not been supported yet and how to use
187
+ [advanced configuration](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) to
188
+ get links between layout segments e.g. captions and tables or figures.
182
189
 
183
190
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
184
191
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -263,7 +270,7 @@ documentation.
263
270
 
264
271
  ## Requirements
265
272
 
266
- ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection.png)
273
+ ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection_081124.png)
267
274
 
268
275
  All items in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
269
276
  separately.
@@ -272,13 +279,16 @@ separately.
272
279
  - Python >= 3.9
273
280
  - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
274
281
  In general, if you want to train or fine-tune models, a GPU is required.
275
- - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
276
- images.
282
+
277
283
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
278
284
  and [PyTorch](https://pytorch.org/get-started/locally/).
279
285
  - [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
280
286
  engine has to be installed separately.
281
287
 
288
+
289
+ - For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
290
+ documents into images. For release `v.0.35.0` this dependency will be optional.
291
+
282
292
  The following overview shows the availability of the models in conjunction with the DL framework.
283
293
 
284
294
  | Task | PyTorch | Torchscript | Tensorflow |
@@ -396,8 +406,8 @@ to develop this framework.
396
406
  ## Problems
397
407
 
398
408
  We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
399
- repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 4
400
- to 6 weeks.
409
+ repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
410
+ to 12 weeks.
401
411
 
402
412
  ## If you like **deep**doctection ...
403
413
 
@@ -45,13 +45,17 @@ pipelines. Its core function does not depend on any specific deep learning libra
45
45
  - Document layout analysis and table recognition now runs with
46
46
  [**Torchscript**](https://pytorch.org/docs/stable/jit.html) (CPU) as well and [**Detectron2**](https://github.com/facebookresearch/detectron2/tree/main/detectron2) is not required
47
47
  anymore for basic inference.
48
- - [**new**] More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
49
- (not contained in the built-in Analyzer).
50
- - [**new**] Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
48
+ - More angle predictors for determining the rotation of a document based on [**Tesseract**](https://github.com/tesseract-ocr/tesseract) and [**DocTr**](https://github.com/mindee/doctr)
49
+ - Token classification with [**LiLT**](https://github.com/jpWang/LiLT) via
51
50
  [**transformers**](https://github.com/huggingface/transformers).
52
51
  We have added a model wrapper for token classification with LiLT and added some LiLT models to the model catalog
53
52
  that look promising, especially if you want to train a model on non-English data. The training script for
54
- LayoutLM can be used for LiLT as well and we will be providing a notebook on how to train a model on a custom dataset soon.
53
+ LayoutLM can be used for LiLT as well.
54
+ - [**new**] There are two notebooks available that show how to write a
55
+ [custom predictor](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) based on
56
+ a third party library that has not been supported yet and how to use
57
+ [advanced configuration](https://github.com/deepdoctection/notebooks/blob/main/Doclaynet_Analyzer_Config.ipynb) to
58
+ get links between layout segments e.g. captions and tables or figures.
55
59
 
56
60
  **deep**doctection provides on top of that methods for pre-processing inputs to models like cropping or resizing and to
57
61
  post-process results, like validating duplicate outputs, relating words to detected layout segments or ordering words
@@ -136,7 +140,7 @@ documentation.
136
140
 
137
141
  ## Requirements
138
142
 
139
- ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection.png)
143
+ ![requirements](https://github.com/deepdoctection/deepdoctection/raw/master/docs/tutorials/_imgs/requirements_deepdoctection_081124.png)
140
144
 
141
145
  All items in the overview listed below the **deep**doctection layer are necessary requirements and have to be installed
142
146
  separately.
@@ -145,13 +149,16 @@ separately.
145
149
  - Python >= 3.9
146
150
  - 1.13 <= PyTorch **or** 2.11 <= Tensorflow < 2.16. (For lower Tensorflow versions the code will only run on a GPU).
147
151
  In general, if you want to train or fine-tune models, a GPU is required.
148
- - **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF documents into
149
- images.
152
+
150
153
  - With respect to the Deep Learning framework, you must decide between [Tensorflow](https://www.tensorflow.org/install?hl=en)
151
154
  and [PyTorch](https://pytorch.org/get-started/locally/).
152
155
  - [Tesseract](https://github.com/tesseract-ocr/tesseract) OCR engine will be used through a Python wrapper. The core
153
156
  engine has to be installed separately.
154
157
 
158
+
159
+ - For release `v.0.34.0` and below **deep**doctection uses Python wrappers for [Poppler](https://poppler.freedesktop.org/) to convert PDF
160
+ documents into images. For release `v.0.35.0` this dependency will be optional.
161
+
155
162
  The following overview shows the availability of the models in conjunction with the DL framework.
156
163
 
157
164
  | Task | PyTorch | Torchscript | Tensorflow |
@@ -269,8 +276,8 @@ to develop this framework.
269
276
  ## Problems
270
277
 
271
278
  We try hard to eliminate bugs. We also know that the code is not free of issues. We welcome all issues relevant to this
272
- repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 4
273
- to 6 weeks.
279
+ repo and try to address them as quickly as possible. Bug fixes or enhancements will be deployed in a new release every 10
280
+ to 12 weeks.
274
281
 
275
282
  ## If you like **deep**doctection ...
276
283
 
@@ -18,26 +18,16 @@ if importlib.util.find_spec("dotenv") is not None:
18
18
  import sys
19
19
  from typing import TYPE_CHECKING
20
20
 
21
- from .utils.env_info import collect_env_info
21
+ from .utils.env_info import auto_select_pdf_render_framework, collect_env_info
22
22
  from .utils.file_utils import _LazyModule, get_tf_version, pytorch_available, tf_available
23
23
  from .utils.logger import LoggingRecord, logger
24
24
 
25
25
  # pylint: enable=wrong-import-position
26
26
 
27
- __version__ = 0.34
27
+ __version__ = 0.36
28
28
 
29
29
  _IMPORT_STRUCTURE = {
30
- "analyzer": [
31
- "config_sanity_checks",
32
- "build_detector",
33
- "build_padder",
34
- "build_service",
35
- "build_sub_image_service",
36
- "build_ocr",
37
- "build_doctr_word",
38
- "get_dd_analyzer",
39
- "build_analyzer",
40
- ],
30
+ "analyzer": ["config_sanity_checks", "get_dd_analyzer", "ServiceFactory"],
41
31
  "configs": [],
42
32
  "dataflow": [
43
33
  "DataFlowTerminated",
@@ -197,6 +187,7 @@ _IMPORT_STRUCTURE = {
197
187
  "print_model_infos",
198
188
  "ModelDownloadManager",
199
189
  "PdfPlumberTextDetector",
190
+ "Pdfmium2TextDetector",
200
191
  "TesseractOcrDetector",
201
192
  "TesseractRotationTransformer",
202
193
  "TextractOcrDetector",
@@ -304,6 +295,7 @@ _IMPORT_STRUCTURE = {
304
295
  "timed_operation",
305
296
  "collect_env_info",
306
297
  "auto_select_viz_library",
298
+ "auto_select_pdf_render_framework",
307
299
  "get_tensorflow_requirement",
308
300
  "tf_addons_available",
309
301
  "get_tf_addons_requirements",
@@ -383,6 +375,7 @@ _IMPORT_STRUCTURE = {
383
375
  "get_pdf_file_writer",
384
376
  "PDFStreamer",
385
377
  "pdf_to_np_array",
378
+ "split_pdf",
386
379
  "ObjectTypes",
387
380
  "TypeOrStr",
388
381
  "object_types_registry",
@@ -427,7 +420,7 @@ _IMPORT_STRUCTURE = {
427
420
  # Setting some environment variables so that standard functions can be invoked with available hardware
428
421
  env_info = collect_env_info()
429
422
  logger.debug(LoggingRecord(msg=env_info))
430
-
423
+ auto_select_pdf_render_framework()
431
424
 
432
425
  # Direct imports for type-checking
433
426
  if TYPE_CHECKING:
@@ -20,3 +20,4 @@ Package for pre-built pipelines
20
20
  """
21
21
 
22
22
  from .dd import *
23
+ from .factory import *
@@ -0,0 +1,142 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: _config.py
3
+
4
+ # Copyright 2024 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Pipeline configuration for deepdoctection analyzer. Do not change the defaults in this file. """
19
+
20
+ from ..datapoint.view import IMAGE_DEFAULTS
21
+ from ..utils.metacfg import AttrDict
22
+ from ..utils.settings import CellType, LayoutType
23
+
24
+ cfg = AttrDict()
25
+
26
+
27
+ cfg.LANGUAGE = None
28
+ cfg.LIB = None
29
+ cfg.DEVICE = None
30
+ cfg.USE_ROTATOR = False
31
+ cfg.USE_LAYOUT = True
32
+ cfg.USE_TABLE_SEGMENTATION = True
33
+
34
+ cfg.TF.LAYOUT.WEIGHTS = "layout/model-800000_inf_only.data-00000-of-00001"
35
+ cfg.TF.LAYOUT.FILTER = None
36
+
37
+ cfg.TF.CELL.WEIGHTS = "cell/model-1800000_inf_only.data-00000-of-00001"
38
+ cfg.TF.CELL.FILTER = None
39
+
40
+ cfg.TF.ITEM.WEIGHTS = "item/model-1620000_inf_only.data-00000-of-00001"
41
+ cfg.TF.ITEM.FILTER = None
42
+
43
+ cfg.PT.LAYOUT.WEIGHTS = "layout/d2_model_0829999_layout_inf_only.pt"
44
+ cfg.PT.LAYOUT.WEIGHTS_TS = "layout/d2_model_0829999_layout_inf_only.ts"
45
+ cfg.PT.LAYOUT.FILTER = None
46
+ cfg.PT.LAYOUT.PAD.TOP = 60
47
+ cfg.PT.LAYOUT.PAD.RIGHT = 60
48
+ cfg.PT.LAYOUT.PAD.BOTTOM = 60
49
+ cfg.PT.LAYOUT.PAD.LEFT = 60
50
+
51
+ cfg.PT.ITEM.WEIGHTS = "item/d2_model_1639999_item_inf_only.pt"
52
+ cfg.PT.ITEM.WEIGHTS_TS = "item/d2_model_1639999_item_inf_only.ts"
53
+ cfg.PT.ITEM.FILTER = None
54
+ cfg.PT.ITEM.PAD.TOP = 60
55
+ cfg.PT.ITEM.PAD.RIGHT = 60
56
+ cfg.PT.ITEM.PAD.BOTTOM = 60
57
+ cfg.PT.ITEM.PAD.LEFT = 60
58
+
59
+ cfg.PT.CELL.WEIGHTS = "cell/d2_model_1849999_cell_inf_only.pt"
60
+ cfg.PT.CELL.WEIGHTS_TS = "cell/d2_model_1849999_cell_inf_only.ts"
61
+ cfg.PT.CELL.FILTER = None
62
+
63
+ cfg.USE_LAYOUT_NMS = False
64
+ cfg.LAYOUT_NMS_PAIRS.COMBINATIONS = None
65
+ cfg.LAYOUT_NMS_PAIRS.THRESHOLDS = None
66
+ cfg.LAYOUT_NMS_PAIRS.PRIORITY = None
67
+
68
+ cfg.SEGMENTATION.ASSIGNMENT_RULE = "ioa"
69
+ cfg.SEGMENTATION.THRESHOLD_ROWS = 0.4
70
+ cfg.SEGMENTATION.THRESHOLD_COLS = 0.4
71
+ cfg.SEGMENTATION.FULL_TABLE_TILING = True
72
+ cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_ROWS = 0.001
73
+ cfg.SEGMENTATION.REMOVE_IOU_THRESHOLD_COLS = 0.001
74
+ cfg.SEGMENTATION.CELL_CATEGORY_ID = 12
75
+ cfg.SEGMENTATION.TABLE_NAME = LayoutType.TABLE
76
+ cfg.SEGMENTATION.PUBTABLES_CELL_NAMES = [
77
+ CellType.SPANNING,
78
+ CellType.ROW_HEADER,
79
+ CellType.COLUMN_HEADER,
80
+ CellType.PROJECTED_ROW_HEADER,
81
+ LayoutType.CELL,
82
+ ]
83
+ cfg.SEGMENTATION.PUBTABLES_SPANNING_CELL_NAMES = [
84
+ CellType.SPANNING,
85
+ CellType.ROW_HEADER,
86
+ CellType.COLUMN_HEADER,
87
+ CellType.PROJECTED_ROW_HEADER,
88
+ ]
89
+ cfg.SEGMENTATION.PUBTABLES_ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
90
+ cfg.SEGMENTATION.PUBTABLES_SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
91
+ cfg.SEGMENTATION.CELL_NAMES = [CellType.HEADER, CellType.BODY, LayoutType.CELL]
92
+ cfg.SEGMENTATION.ITEM_NAMES = [LayoutType.ROW, LayoutType.COLUMN]
93
+ cfg.SEGMENTATION.SUB_ITEM_NAMES = [CellType.ROW_NUMBER, CellType.COLUMN_NUMBER]
94
+
95
+ cfg.SEGMENTATION.STRETCH_RULE = "equal"
96
+
97
+ cfg.USE_TABLE_REFINEMENT = True
98
+ cfg.USE_PDF_MINER = False
99
+
100
+ cfg.PDF_MINER.X_TOLERANCE = 3
101
+ cfg.PDF_MINER.Y_TOLERANCE = 3
102
+
103
+ cfg.USE_OCR = True
104
+
105
+ cfg.OCR.USE_TESSERACT = True
106
+ cfg.OCR.USE_DOCTR = False
107
+ cfg.OCR.USE_TEXTRACT = False
108
+ cfg.OCR.CONFIG.TESSERACT = "dd/conf_tesseract.yaml"
109
+
110
+ cfg.OCR.WEIGHTS.DOCTR_WORD.TF = "doctr/db_resnet50/tf/db_resnet50-adcafc63.zip"
111
+ cfg.OCR.WEIGHTS.DOCTR_WORD.PT = "doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"
112
+ cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.TF = "doctr/crnn_vgg16_bn/tf/crnn_vgg16_bn-76b7f2c6.zip"
113
+ cfg.OCR.WEIGHTS.DOCTR_RECOGNITION.PT = "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt"
114
+
115
+ cfg.TEXT_CONTAINER = IMAGE_DEFAULTS["text_container"]
116
+ cfg.WORD_MATCHING.PARENTAL_CATEGORIES = [
117
+ LayoutType.TEXT,
118
+ LayoutType.TITLE,
119
+ LayoutType.LIST,
120
+ LayoutType.CELL,
121
+ CellType.COLUMN_HEADER,
122
+ CellType.PROJECTED_ROW_HEADER,
123
+ CellType.SPANNING,
124
+ CellType.ROW_HEADER,
125
+ ]
126
+ cfg.WORD_MATCHING.RULE = "ioa"
127
+ cfg.WORD_MATCHING.THRESHOLD = 0.6
128
+ cfg.WORD_MATCHING.MAX_PARENT_ONLY = True
129
+
130
+ cfg.TEXT_ORDERING.TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS["text_block_categories"]
131
+ cfg.TEXT_ORDERING.FLOATING_TEXT_BLOCK_CATEGORIES = IMAGE_DEFAULTS["floating_text_block_categories"]
132
+ cfg.TEXT_ORDERING.INCLUDE_RESIDUAL_TEXT_CONTAINER = False
133
+ cfg.TEXT_ORDERING.STARTING_POINT_TOLERANCE = 0.005
134
+ cfg.TEXT_ORDERING.BROKEN_LINE_TOLERANCE = 0.003
135
+ cfg.TEXT_ORDERING.HEIGHT_TOLERANCE = 2.0
136
+ cfg.TEXT_ORDERING.PARAGRAPH_BREAK = 0.035
137
+
138
+ cfg.USE_LAYOUT_LINK = False
139
+ cfg.LAYOUT_LINK.PARENTAL_CATEGORIES = []
140
+ cfg.LAYOUT_LINK.CHILD_CATEGORIES = []
141
+
142
+ cfg.freeze()
@@ -0,0 +1,154 @@
1
+ # -*- coding: utf-8 -*-
2
+ # File: dd.py
3
+
4
+ # Copyright 2021 Dr. Janis Meyer. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """
19
+ Module for **deep**doctection analyzer.
20
+
21
+ -factory build_analyzer for a given config
22
+
23
+ -user factory with a reduced config setting
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ import os
29
+ from typing import Optional
30
+
31
+ from ..extern.pt.ptutils import get_torch_device
32
+ from ..extern.tp.tfutils import disable_tp_layer_logging, get_tf_device
33
+ from ..pipe.doctectionpipe import DoctectionPipe
34
+ from ..utils.env_info import ENV_VARS_TRUE
35
+ from ..utils.error import DependencyError
36
+ from ..utils.file_utils import tensorpack_available
37
+ from ..utils.fs import get_configs_dir_path, get_package_path, maybe_copy_config_to_cache
38
+ from ..utils.logger import LoggingRecord, logger
39
+ from ..utils.metacfg import set_config_by_yaml
40
+ from ..utils.types import PathLikeOrStr
41
+ from ._config import cfg
42
+ from .factory import ServiceFactory
43
+
44
+ __all__ = [
45
+ "config_sanity_checks",
46
+ "get_dd_analyzer",
47
+ ]
48
+
49
# Package-relative locations of the default analyzer and Tesseract configuration files
_DD_ONE = "deepdoctection/configs/conf_dd_one.yaml"
_TESSERACT = "deepdoctection/configs/conf_tesseract.yaml"

# Maps every selectable segmentation weight to its companion entry; `None` where no companion is listed.
# The "segmentation" choices below are derived from these keys so that both stay in sync.
_SEGMENTATION_CHOICES = {
    "item/model-1620000_inf_only.data-00000-of-00001": "cell/model-1800000_inf_only.data-00000-of-00001",
    "xrf_item/model_final_inf_only.pt": "xrf_cell/model_final_inf_only.pt",
    "microsoft/table-transformer-structure-recognition/pytorch_model.bin": None,
    "deepdoctection/tatr_tab_struct_v2/pytorch_model.bin": None,
}

# Selectable values, grouped by pipeline component
_MODEL_CHOICES = {
    "layout": [
        "layout/d2_model_0829999_layout_inf_only.pt",
        "xrf_layout/model_final_inf_only.pt",
        "microsoft/table-transformer-detection/pytorch_model.bin",
    ],
    "segmentation": list(_SEGMENTATION_CHOICES),
    "ocr": ["Tesseract", "DocTr", "Textract"],
    "doctr_word": ["doctr/db_resnet50/pt/db_resnet50-ac60cadc.pt"],
    "doctr_recognition": [
        "doctr/crnn_vgg16_bn/pt/crnn_vgg16_bn-9762b0b0.pt",
        "doctr/crnn_vgg16_bn/pt/pytorch_model.bin",
    ],
    "llm": ["gpt-3.5-turbo", "gpt-4"],
    "segmentation_choices": _SEGMENTATION_CHOICES,
}
77
+
78
+
79
def config_sanity_checks() -> None:
    """
    Validate the module level `cfg` before an analyzer is built.

    Two combinations are rejected:

    - `USE_PDF_MINER`, `USE_OCR` and `OCR.USE_DOCTR` all being enabled at the same time
    - `USE_OCR` being enabled without exactly one of `OCR.USE_TESSERACT`, `OCR.USE_DOCTR`, `OCR.USE_TEXTRACT`
      being selected

    :raises ValueError: if one of the combinations above is configured
    """
    if cfg.USE_PDF_MINER and cfg.USE_OCR and cfg.OCR.USE_DOCTR:
        raise ValueError("Configuration USE_PDF_MINER= True and USE_OCR=True and USE_DOCTR=True is not allowed")

    # Without OCR the engine switches are irrelevant and are not inspected
    if not cfg.USE_OCR:
        return

    ocr_cfg = cfg.OCR
    # The switches are summed, so the total equals the number of enabled engines
    enabled_engines = ocr_cfg.USE_TESSERACT + ocr_cfg.USE_DOCTR + ocr_cfg.USE_TEXTRACT
    if enabled_engines != 1:
        raise ValueError(
            "Choose either OCR.USE_TESSERACT=True or OCR.USE_DOCTR=True or OCR.USE_TEXTRACT=True "
            "and set the other two to False. Only one OCR system can be activated."
        )
89
+
90
+
91
def get_dd_analyzer(
    reset_config_file: bool = True,
    config_overwrite: Optional[list[str]] = None,
    path_config_file: Optional[PathLikeOrStr] = None,
) -> DoctectionPipe:
    """
    Factory function for creating the built-in **deep**doctection analyzer.

    The Standard Analyzer is a pipeline that comprises the following analysis components:

    - Document layout analysis

    - Table segmentation

    - Text extraction/OCR

    - Reading order

    We refer to the various notebooks and docs for running an analyzer and changing the configs.

    The deep learning framework is selected through the environment variable `DD_USE_TF`: if it holds one of the
    values in `ENV_VARS_TRUE` Tensorflow is used, in every other case (including the variable not being set)
    PyTorch is used.

    :param reset_config_file: This will copy the `.yaml` file with default variables to the `.cache` and therefore
                              resetting all configurations if set to `True`.
    :param config_overwrite: Passing a list of string arguments and values to overwrite the `.yaml` configuration with
                             highest priority, e.g. ["USE_TABLE_SEGMENTATION=False",
                                                     "USE_OCR=False",
                                                     "TF.LAYOUT.WEIGHTS=my_fancy_pytorch_model"]
    :param path_config_file: Path to a custom config file. Can be outside of the .cache directory.
    :return: A DoctectionPipe instance with given configs
    :raises ValueError: if the final configuration does not pass `config_sanity_checks`
    """
    config_overwrite = [] if config_overwrite is None else config_overwrite

    # `lib` can only ever be "TF" or "PT", hence there is no third case to handle
    use_tf = os.environ.get("DD_USE_TF", "0") in ENV_VARS_TRUE
    lib = "TF" if use_tf else "PT"
    device = get_tf_device() if use_tf else get_torch_device()

    # Both default configs are copied, even if a custom `path_config_file` is passed
    dd_one_config_path = maybe_copy_config_to_cache(
        get_package_path(), get_configs_dir_path() / "dd", _DD_ONE, reset_config_file
    )
    maybe_copy_config_to_cache(get_package_path(), get_configs_dir_path() / "dd", _TESSERACT)

    # Set up of the configuration and logging
    file_cfg = set_config_by_yaml(path_config_file if path_config_file else dd_one_config_path)
    cfg.freeze(freezed=False)
    cfg.overwrite_config(file_cfg)

    # NOTE(review): unfreezing a second time presumably guards against `overwrite_config` taking over the frozen
    # state of `file_cfg` - confirm before removing
    cfg.freeze(freezed=False)
    cfg.LANGUAGE = None
    cfg.LIB = lib
    cfg.DEVICE = device
    cfg.freeze()

    # Explicit overwrites are applied last and therefore win over the values of the `.yaml` file
    if config_overwrite:
        cfg.update_args(config_overwrite)

    config_sanity_checks()
    logger.info(LoggingRecord(f"Config: \n {str(cfg)}", cfg.to_dict()))  # type: ignore

    # will silent all TP logging while building the tower
    if tensorpack_available():
        disable_tp_layer_logging()

    return ServiceFactory.build_analyzer(cfg)